criteria",
107 | )
108 | quality: float = Field(
109 | 0.0,
110 | description="A score (0-100) of the response quality - this includes the usefulness and clarity of the output",
111 | )
112 |
113 | # Hallucination metrics
114 | hallucination_score: float = Field(
115 | 0.0,
116 | description="A score (0-100) representing the presence of hallucinations (lower is better)",
117 | )
118 | false_claims: list = Field(
119 | [],
120 | description="List of identified false claims or hallucinations in the response",
121 | )
122 |
123 | # Tools
124 | failed_tool_calls: int = Field(
125 | 0,
126 | description="The number of failed tool calls, or tool calls that encountered an error",
127 | )
128 |
129 |
130 | @dataclass
131 | class Score:
132 | """Used to score the result of an LLM tool call."""
133 |
134 | score: ScoreModel
135 | model: str
136 | duration: float
137 | tool_analysis: dict
138 | redundant_tool_calls: int
139 | tool_calls: int
140 | trace: dict | None = None
141 |
142 | def __getattribute__(self, name: str) -> Any:
143 | if name == "score":
144 | return object.__getattribute__(self, name)
145 | if hasattr(self.score, name):
146 | return getattr(self.score, name)
147 | return object.__getattribute__(self, name)
148 |
149 | def to_dataframe(self) -> pd.DataFrame:
150 | """Convert results to a pandas DataFrame for analysis."""
151 | record = {
152 | "model": self.model,
153 | "duration": self.duration,
154 | "tool_use": self.score.tool_use,
155 | "tool_calls": self.tool_calls,
156 | "accuracy": self.score.accuracy,
157 |             "completeness": self.score.completeness,
158 | "quality": self.score.quality,
159 | "hallucination_score": self.score.hallucination_score,
160 | "redundant_tool_calls": self.redundant_tool_calls,
161 | "false_claims_count": len(self.score.false_claims),
162 | "trace": self.trace,
163 | }
164 |         return pd.DataFrame([record])  # wrap in a list to build a single-row frame
165 |
166 | def save_trace(self, path):
167 |         """Save the trace to disk as JSON, tagged with the model name."""
168 |         trace = dict(self.trace or {})
169 | trace["model"] = self.model
170 | with open(path, "w") as f:
171 | f.write(json.dumps(trace))
172 |
173 |
174 | class Results(BaseModel):
175 | """Collection of scores from multiple model evaluations."""
176 |
177 | scores: List[Score] = Field([], description="A list of scores for each model")
178 | duration: float = Field(0.0, description="Total duration of all tests")
179 |
180 | def to_dataframe(self) -> pd.DataFrame:
181 | """Convert results to a pandas DataFrame for analysis."""
182 | records = []
183 | for score in self.scores:
184 | records.append(score.to_dataframe())
185 | return pd.concat(records)
186 |
187 |
188 | @dataclass
189 | class Test:
190 | """Configuration for a model evaluation test."""
191 |
192 | name: str
193 | prompt: str
194 | check: str
195 | models: List[str]
196 | expected_tools: List[str]
197 | ignore_tools: List[str]
198 | profile: Optional[str]
199 | vars: Dict[str, Any]
200 | task: Optional[str]
201 | task_run: Optional[str]
202 |
203 | def __init__(
204 | self,
205 | name: str,
206 | prompt: str,
207 | check: str = "",
208 | models: List[str] | None = None,
209 | expected_tools: List[str] | None = None,
210 | ignore_tools: Optional[List[str]] = None,
211 | profile: Optional[str] = None,
212 | vars: Optional[Dict[str, Any]] = None,
213 | task: Optional[str] = None,
214 | task_run: Optional[str] = None,
215 | ):
216 | self.name = name
217 | self.prompt = prompt
218 | self.check = check
219 | self.models = models or []
220 | self.expected_tools = expected_tools or []
221 | self.profile = profile
222 | self.ignore_tools = ignore_tools or []
223 | self.vars = vars or {}
224 | self.task = task
225 | self.task_run = task_run
226 |
227 | @staticmethod
228 | def from_dict(data: dict) -> "Test":
229 | """Parse a dict into a test"""
230 | return Test(
231 | data.get("name", ""),
232 | data.get("prompt", ""),
233 | data.get("check", ""),
234 | data.get("models", []),
235 | data.get("expected-tools", []),
236 | ignore_tools=data.get("ignored-tools", data.get("ignore-tools", [])),
237 | vars=data.get("vars", {}),
238 | profile=data.get("profile"),
239 | task=data.get("task"),
240 | task_run=data.get("task-run"),
241 | )
242 |
243 | @staticmethod
244 | def load(path: str) -> "Test":
245 | """Load a test configuration from a TOML file."""
246 | import tomllib
247 | import os
248 |
249 | with open(path) as f:
250 | s = f.read()
251 | data = tomllib.loads(s)
252 |
253 | if "import" in data:
254 | imports = data["import"]
255 | if isinstance(imports, str):
256 | imports = [imports]
257 |
258 | t = None
259 | for imp in imports:
260 | if t is None:
261 | t = Test.load(os.path.join(os.path.dirname(path), imp))
262 |
263 | # Update test attributes with any overrides from current file
264 | t.name = data.get("name", t.name)
265 | t.prompt = data.get("prompt", t.prompt)
266 | t.check = data.get("check", t.check)
267 | t.profile = data.get("profile", t.profile)
268 | t.models = data.get("models", t.models)
269 | t.expected_tools.extend(data.get("expected-tools", []))
270 | t.ignore_tools.extend(
271 | data.get("ignored-tools", data.get("ignore-tools", []))
272 | )
273 | t.vars.update(**data.get("vars", {}))
274 | t.task = t.task or data.get("task")
275 | t.task_run = t.task_run or data.get("task-run")
276 | return t
277 |
278 | if "name" not in data:
279 | data["name"] = path
280 |
281 | return Test.from_dict(data)
282 |
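For reference, a minimal sketch of how a test configuration could be written and loaded through Test.load, based on the keys read by Test.from_dict above (the config contents, file handling, and tool name are hypothetical):

    import os
    import tempfile

    from mcpx_eval import Test

    # Hypothetical TOML config; keys mirror Test.from_dict: name, prompt, check,
    # models, expected-tools, ignore-tools, vars, profile, task, and task-run.
    # An "import" key (handled by Test.load) can pull in a base config file.
    CONFIG = """
    name = "weather-test"
    prompt = "What is the weather in {{city}}?"
    check = "The response should mention the current temperature"
    models = ["claude-3-5-sonnet-latest"]
    expected-tools = ["get-weather"]

    [vars]
    city = "Paris"
    """

    with tempfile.NamedTemporaryFile("w", suffix=".toml", delete=False) as f:
        f.write(CONFIG)
        path = f.name

    test = Test.load(path)
    print(test.name, test.models, test.expected_tools, test.vars)
    os.unlink(path)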
--------------------------------------------------------------------------------
/tests/test_mcpx.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from unittest.mock import Mock, patch, AsyncMock, MagicMock
3 | import asyncio
4 | from datetime import datetime, timedelta
5 | import json
6 |
7 | from mcpx_eval import Judge, Test, Model, Score, Results, Database
8 | from mcpx_eval.models import ScoreModel
9 | from mcpx_eval.judge import ToolAnalysis
10 |
11 | class TestJudge(unittest.TestCase):
12 | def setUp(self):
13 | self.judge = Judge(
14 | models=["test-model"],
15 | judge_model="test-judge",
16 | ignore_tools=["ignored-tool"]
17 | )
18 |
19 | def test_add_model(self):
20 | """Test adding models to the judge"""
21 | judge = Judge()
22 |
23 | # Test adding string model
24 | judge.add_model("gpt-4")
25 | self.assertEqual(len(judge.models), 1)
26 | self.assertEqual(judge.models[0].name, "gpt-4")
27 |
28 | # Test adding Model instance
29 | model = Model(name="anthropic:claude-3")
30 | judge.add_model(model)
31 | self.assertEqual(len(judge.models), 2)
32 | self.assertEqual(judge.models[1].name, "claude-3")
33 | self.assertEqual(judge.models[1].provider, "anthropic")
34 |
35 | # Test adding model with profile
36 | judge.add_model("mistral", profile="custom")
37 | self.assertEqual(len(judge.models), 3)
38 | self.assertEqual(judge.models[2].name, "mistral")
39 | self.assertEqual(judge.models[2].profile, "custom")
40 |
41 | class TestToolAnalysis(unittest.TestCase):
42 | def test_analyze_message_unique_tools(self):
43 | """Test analyzing unique tool calls"""
44 | from mcpx_eval.judge import ToolAnalysis
45 |
46 | tool_analysis = ToolAnalysis()
47 |
48 | # Test first unique tool call
49 | msg1 = {
50 | "tool": {
51 | "name": "test_tool",
52 | "input": {"param": "value1"}
53 | }
54 | }
55 | tool_analysis.analyze_message(msg1, 0)
56 |
57 | self.assertEqual(tool_analysis.total_tool_calls, 1)
58 | self.assertEqual(tool_analysis.redundant_tool_calls, 0)
59 | self.assertEqual(
60 | tool_analysis.tool_analysis["tool_0"]["redundancy"],
61 | "unique"
62 | )
63 |
64 | # Test second unique tool call
65 | msg2 = {
66 | "tool": {
67 | "name": "test_tool",
68 | "input": {"param": "value2"}
69 | }
70 | }
71 | tool_analysis.analyze_message(msg2, 1)
72 |
73 | self.assertEqual(tool_analysis.total_tool_calls, 2)
74 | self.assertEqual(tool_analysis.redundant_tool_calls, 0)
75 |
76 | def test_analyze_message_redundant_tools(self):
77 | """Test analyzing redundant tool calls"""
78 | from mcpx_eval.judge import ToolAnalysis
79 |
80 | tool_analysis = ToolAnalysis()
81 |
82 | # Add first tool call
83 | msg1 = {
84 | "tool": {
85 | "name": "test_tool",
86 | "input": {"param": "value1"}
87 | }
88 | }
89 | tool_analysis.analyze_message(msg1, 0)
90 |
91 | # Add redundant tool call
92 | msg2 = {
93 | "tool": {
94 | "name": "test_tool",
95 | "input": {"param": "value1"}
96 | }
97 | }
98 | tool_analysis.analyze_message(msg2, 1)
99 |
100 | self.assertEqual(tool_analysis.total_tool_calls, 2)
101 | self.assertEqual(tool_analysis.redundant_tool_calls, 1)
102 | self.assertEqual(
103 | tool_analysis.tool_analysis["tool_1"]["redundancy"],
104 | "redundant"
105 | )
106 |
107 | class TestModelApiConfig(unittest.TestCase):
108 | @patch.dict('os.environ', {
109 | 'OPENAI_HOST': 'https://custom-openai.com',
110 | 'GPT-4_HOST': 'https://custom-gpt4.com'
111 | })
112 | def test_get_host_url(self):
113 | """Test getting host URLs for different providers"""
114 | from mcpx_eval.judge import ModelApiConfig
115 |
116 | # Test OpenAI default
117 | url = ModelApiConfig.get_host_url("gpt-3.5-turbo", "openai")
118 | self.assertEqual(url, "https://custom-openai.com/v1")
119 |
120 | # Test model-specific override
121 | url = ModelApiConfig.get_host_url("gpt-4", "openai")
122 | self.assertEqual(url, "https://custom-gpt4.com/v1")
123 |
124 | # Test Ollama default
125 | url = ModelApiConfig.get_host_url("llama2", "ollama")
126 | self.assertEqual(url, "http://127.0.0.1:11434/v1")
127 |
129 |
130 | class AsyncIteratorMock:
131 | def __init__(self, items):
132 | self.items = items
133 | self.index = 0
134 |
135 |     def __aiter__(self):  # __aiter__ must return the async iterator itself, not a coroutine
136 | return self
137 |
138 | async def __anext__(self):
139 | try:
140 | item = self.items[self.index]
141 | except IndexError:
142 | raise StopAsyncIteration
143 | self.index += 1
144 | return item
145 |
146 | class MockPart:
147 | def __init__(self, **kwargs):
148 | self.__dict__.update(kwargs)
149 |
150 | class MockResponse:
151 | def __init__(self, **kwargs):
152 | for key, value in kwargs.items():
153 | setattr(self, key, value)
154 |
155 | class TestJudgeEvaluation(unittest.IsolatedAsyncioTestCase):
156 | @patch('mcpx_eval.judge.Chat')
157 | @patch('mcpx_eval.judge.mcp_run')
158 | async def test_evaluate_model_success(self, mock_mcp_run, mock_chat):
159 | """Test successful model evaluation"""
160 | # Setup mock mcp_run.Client with proper tools attribute
161 | mock_tools = MagicMock()
162 | mock_tools.keys.return_value = ["test_tool"]
163 | mock_client = MagicMock()
164 | mock_client.tools = mock_tools
165 | mock_mcp_run.Client = Mock(return_value=mock_client)
166 | mock_mcp_run.ClientConfig = Mock()
167 |
168 | # Setup mock chat instance
169 | mock_chat_instance = MagicMock()
170 | mock_chat_instance.client = mock_client
171 |
172 | # Setup response parts
173 | model_response_parts = [
174 | MockPart(
175 | part_kind="text",
176 | content="Test response"
177 | ),
178 | MockPart(
179 | part_kind="tool-call",
180 | tool_name="test_tool",
181 | tool_call_id="123",
182 | args={"param": "value"},
183 | args_as_dict=lambda: {"param": "value"}
184 | )
185 | ]
186 | request_parts = [
187 | MockPart(
188 | part_kind="tool-return",
189 | tool_name="test_tool",
190 | tool_call_id="123",
191 | content="Tool result"
192 | )
193 | ]
194 |
195 | async def mock_iter(prompt):
196 | yield MockResponse(model_response=MockResponse(parts=model_response_parts))
197 | yield MockResponse(request=MockResponse(parts=request_parts))
198 | yield MockResponse(data=MockPart(data="Final result"))
199 |
200 | mock_chat_instance.iter = mock_iter
201 | mock_chat.return_value = mock_chat_instance
202 |
203 | judge = Judge()
204 | model = Model(name="test-model")
205 | tool_analysis = ToolAnalysis()
206 |
207 | result = await judge.evaluate_model(model, "Test prompt", tool_analysis)
208 |
209 | self.assertIsNotNone(result)
210 | self.assertEqual(len(result["messages"]), 4) # text, tool-call, tool-return, final_result
211 | self.assertEqual(result["messages"][0]["kind"], "text")
212 | self.assertEqual(result["messages"][1]["kind"], "tool-call")
213 | self.assertEqual(result["messages"][2]["kind"], "tool-return")
214 | self.assertEqual(result["messages"][3]["kind"], "final_result")
215 |
216 | @patch('mcpx_eval.judge.Chat')
217 | @patch('mcpx_eval.judge.mcp_run')
218 | async def test_evaluate_model_failure(self, mock_mcp_run, mock_chat):
219 | """Test model evaluation with error"""
220 | # Setup mock mcp_run.Client
221 | mock_client = Mock()
222 | mock_mcp_run.Client = Mock(return_value=mock_client)
223 | mock_mcp_run.ClientConfig = Mock()
224 |
225 | mock_chat_instance = Mock()
226 |
227 | async def mock_iter(prompt):
228 | raise Exception("Test error")
229 | yield # Needed to make it a generator
230 |
231 | mock_chat_instance.iter = mock_iter
232 | mock_chat.return_value = mock_chat_instance
233 |
234 | judge = Judge()
235 | model = Model(name="test-model")
236 | tool_analysis = ToolAnalysis()
237 |
238 | result = await judge.evaluate_model(model, "Test prompt", tool_analysis)
239 |
240 | self.assertIsNone(result)
241 |
242 | class TestDatabase(unittest.TestCase):
243 | def setUp(self):
244 | self.db = Database(":memory:") # Use in-memory SQLite for testing
245 |
246 | def test_save_and_retrieve_results(self):
247 | """Test saving and retrieving test results"""
248 | # Create test data
249 | test_name = "test1"
250 | score_data = ScoreModel(
251 | tool_use=80,
252 | accuracy=90,
253 | completeness=85,
254 | quality=88,
255 | hallucination_score=5,
256 | false_claims=["claim1"],
257 | llm_output="test output",
258 | description="test description"
259 | )
260 |
261 | score = Score(
262 | score=score_data,
263 | model="test-model",
264 | duration=1.5,
265 | tool_analysis={"tool_1": {"name": "test_tool", "redundancy": "unique"}},
266 | redundant_tool_calls=0,
267 | tool_calls=1
268 | )
269 |
270 | results = Results(scores=[score], duration=1.5)
271 |
272 | # Save results
273 | self.db.save_results(test_name, results)
274 |
275 | # Retrieve and verify results
276 | retrieved = self.db.average_results(test_name)
277 |
278 | self.assertEqual(len(retrieved.scores), 1)
279 | self.assertEqual(retrieved.scores[0].model, "test-model")
280 | self.assertEqual(retrieved.scores[0].duration, 1.5)
281 | self.assertEqual(retrieved.scores[0].tool_calls, 1)
282 | self.assertEqual(retrieved.scores[0].redundant_tool_calls, 0)
283 | self.assertEqual(retrieved.scores[0].accuracy, 90)
284 |
285 | if __name__ == '__main__':
286 | unittest.main()
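These tests use only the standard library test runner; a small sketch of invoking them programmatically, equivalent to running python -m unittest (assuming the tests/ directory shown in this listing):

    import unittest

    # Discover and run the mcpx-eval tests with verbose output.
    suite = unittest.defaultTestLoader.discover("tests", pattern="test_mcpx.py")
    unittest.TextTestRunner(verbosity=2).run(suite)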
--------------------------------------------------------------------------------
/mcpx_eval/__main__.py:
--------------------------------------------------------------------------------
1 | from . import Judge, Test, Database
2 | from .html import visualize_json
3 | import asyncio
4 | import logging
5 | import pandas as pd
6 | from tempfile import NamedTemporaryFile
7 | import webbrowser
8 | import os
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
13 | def print_result(result):
14 | # Print model header
15 | print(f"\n{result.model}")
16 | print("=" * len(result.model))
17 |
18 | # Create a DataFrame for the metrics
19 | metrics_df = pd.DataFrame(
20 | {
21 | "Metric": [
22 | "Duration (s)",
23 | "Tool Calls",
24 | "Redundant Calls",
25 | "Failed Calls",
26 | "Tool Use %",
27 | "Accuracy %",
28 | "Completeness %",
29 | "Quality %",
30 | "Hallucination Score",
31 | ],
32 | "Value": [
33 | f"{result.duration:.2f}",
34 | result.tool_calls,
35 | result.redundant_tool_calls,
36 | result.failed_tool_calls,
37 | f"{result.tool_use:.1f}",
38 | f"{result.accuracy:.1f}",
39 | f"{result.completeness:.1f}",
40 | f"{result.quality:.1f}",
41 | f"{result.hallucination_score:.1f}",
42 | ],
43 | }
44 | )
45 |
46 | # Print metrics table
47 | print("\nMetrics:")
48 | print(metrics_df.to_string(index=False))
49 |
50 | # Print output and description
51 | print("\nOutput:")
52 | print(result.llm_output)
53 | print("\nDescription:")
54 | print(result.description)
55 |
56 | # Print false claims if any
57 | if result.false_claims and len(result.false_claims) > 0:
58 | print("\nFalse Claims Detected:")
59 | for claim in result.false_claims:
60 | print(f" - {claim}")
61 |
62 | # Print tool analysis if any
63 | if result.tool_analysis and len(result.tool_analysis) > 0:
64 | print("\nTool Analysis:")
65 | tool_data = []
66 | for tool_id, analysis in result.tool_analysis.items():
67 | if isinstance(analysis, list):
68 | for a in analysis:
69 | tool_data.append(
70 | {
71 | "Tool ID": tool_id,
72 | "Name": a["name"],
73 | "Redundancy": a["redundancy"],
74 | }
75 | )
76 | else:
77 | tool_data.append(
78 | {
79 | "Tool ID": tool_id,
80 | "Name": analysis["name"],
81 | "Redundancy": analysis["redundancy"],
82 | }
83 | )
84 |
85 | if tool_data:
86 | tool_df = pd.DataFrame(tool_data)
87 | print(tool_df.to_string(index=False))
88 |
89 |
90 | def summary(args):
91 | db = Database(args.db)
92 | res = db.average_results(args.name)
93 | if not res.scores:
94 | return # Database class now handles empty results messaging
95 |
96 | print(f"\nTest Summary: {args.name}")
97 | print("=" * (14 + len(args.name)))
98 | print(f"Number of results: {len(res.scores)}\n")
99 |
100 | for result in res.scores:
101 | print_result(result)
102 |
103 |
104 | def json_summary(args):
105 | """Generate a JSON summary of test data"""
106 | import json
107 |
108 | db = Database(args.db)
109 | summary = db.generate_json_summary()
110 |
111 | # Filter to specific test if requested
112 | if args.name:
113 | if args.name in summary["tests"]:
114 | filtered_summary = {
115 | "tests": {args.name: summary["tests"][args.name]},
116 | "total": {
117 | "models": {},
118 | "metrics": summary["tests"][args.name]["metrics"],
119 | "test_count": 1,
120 | "model_count": summary["tests"][args.name]["model_count"],
121 | },
122 | "generated_at": summary["generated_at"],
123 | }
124 | # Include only models that participated in this test
125 | for model_name, model_data in summary["total"]["models"].items():
126 | if model_name in summary["tests"][args.name]["models"]:
127 | filtered_summary["total"]["models"][model_name] = {
128 | **model_data,
129 | "test_count": 1,
130 | }
131 | summary = filtered_summary
132 | else:
133 | print(f"Warning: Test '{args.name}' not found in results")
134 |
135 | # Format JSON with indentation for readability
136 | formatted_json = json.dumps(summary, indent=2)
137 |
138 | # Output to file or stdout
139 |     if args.json:
140 |         with open(args.json, "w") as f:
141 |             f.write(formatted_json)
142 |         print(f"JSON summary saved to {args.json}")
143 |         print(
144 |             "To visualize these results, re-run the gen command with --html or --show"
145 |         )
146 | elif not args.html and not args.show:
147 | print(formatted_json)
148 |
149 | # If visualization is requested, create and open it
150 | output_path = args.html
151 | html = visualize_json(summary, output_path)
152 | # Also save a copy to the specified location if provided
153 | if output_path:
154 | with open(output_path, "w") as f:
155 | f.write(html)
156 | print(f"Saved to {output_path}")
157 | temp_path = os.path.abspath(output_path)
158 | if args.show:
159 | if output_path is None:
160 | # Write to temporary file and open in browser
161 | with NamedTemporaryFile(suffix=".html", delete=False, mode="w") as f:
162 | f.write(html)
163 | temp_path = f.name
164 |
165 | print("Opening browser...")
166 | webbrowser.open(f"file://{temp_path}")
167 |
168 |
169 | async def run():
170 | from argparse import ArgumentParser
171 |
172 | parser = ArgumentParser(
173 | "mcpx-eval", description="Open-ended LLM tool use evaluator for mcp.run tools"
174 | )
175 | subparsers = parser.add_subparsers(dest="command", help="Command to run")
176 | parser.add_argument("--db", default=None, help="SQLite3 database path")
177 |
178 | # Main test command (default)
179 | test_parser = subparsers.add_parser("test", help="Run evaluation tests")
180 | test_parser.add_argument("--name", default="", help="Test name")
181 | test_parser.add_argument(
182 | "--model",
183 | "-m",
184 | default=[],
185 | help="Model to include in test",
186 | action="append",
187 | )
188 | test_parser.add_argument(
189 | "--judge-model",
190 | default="claude-3-5-sonnet-latest",
191 | help="Model to use for Judge",
192 | )
193 | test_parser.add_argument(
194 | "--ignore-tool",
195 | "-x",
196 | default=[],
197 | help="Ignore tool",
198 | action="append",
199 | )
200 | test_parser.add_argument(
201 | "--tool",
202 | "-t",
203 | default=[],
204 | help="Expected tool",
205 | action="append",
206 | )
207 | test_parser.add_argument(
208 | "--profile",
209 | "-p",
210 | default=None,
211 | help="Profile to use for judge model",
212 | )
213 |
214 | test_parser.add_argument("--prompt", help="Test prompt")
215 | test_parser.add_argument("--check", help="Test check")
216 | test_parser.add_argument("--config", help="Test config file")
217 | test_parser.add_argument(
218 | "--iter",
219 | "-i",
220 | default=1,
221 | type=int,
222 | help="Number of times to run the test for each model",
223 | )
224 | test_parser.add_argument(
225 | "--no-save",
226 | default=False,
227 | action="store_true",
228 | help="Don't save results in db",
229 | )
230 | test_parser.add_argument(
231 | "--task",
232 | default=None,
233 | help="Name of task from mcp.run to get prompt from",
234 | )
235 | test_parser.add_argument(
236 | "--task-run",
237 | default=None,
238 | help="Name of a specific task run",
239 | )
240 | test_parser.add_argument(
241 | "--var",
242 | default=[],
243 | help="Template variable",
244 | action="append",
245 | )
246 |
247 | # Summary command
248 | summary_parser = subparsers.add_parser("summary", help="Show test results summary")
249 | summary_parser.add_argument("name", help="Test name to summarize")
250 |
251 | # JSON summary command
252 | gen_parser = subparsers.add_parser(
253 | "gen", help="Generate JSON summary of all test data"
254 | )
255 | gen_parser.add_argument(
256 | "--name",
257 | "-n",
258 | help="Filter results to a specific test name",
259 | )
260 | gen_parser.add_argument(
261 | "--json",
262 | help="Output JSON file path (default: print to stdout)",
263 | )
264 | gen_parser.add_argument(
265 | "--show",
266 | "-s",
267 | action="store_true",
268 | help="Create an interactive HTML visualization of the JSON data",
269 | )
270 | gen_parser.add_argument(
271 | "--html",
272 | help="Output path for HTML visualization (optional)",
273 | )
274 |
275 | # Global options
276 | parser.add_argument(
277 | "--log",
278 | default=None,
279 | choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
280 | help="Set the logging level.",
281 | )
282 | parser.add_argument(
283 | "--verbose", default=False, action="store_true", help="Enable verbose logging"
284 | )
285 |
286 | args = parser.parse_args()
287 |
288 | # Setup logging
289 | level = args.log or "INFO"
290 | log_level = getattr(logging, level, None)
291 | if not isinstance(log_level, int):
292 | raise ValueError("Invalid log level: %s" % level)
293 | logging.basicConfig(level=log_level)
294 |
295 | if not args.verbose:
296 | for handler in logging.root.handlers:
297 | handler.addFilter(logging.Filter("mcpx_eval"))
298 |
299 | # Handle command routing
300 |     command = args.command  # None when no subcommand was given
301 | 
303 |
304 | # Summary command
305 | if command == "summary":
306 | summary(args)
307 | return
308 |
309 | # gen command
310 | elif command == "gen":
311 | json_summary(args)
312 | return
313 |
314 | # Test command (default)
315 | elif command == "test":
316 | test = None
317 | name = args.name or args.task
318 |
319 | vars = {}
320 | for line in args.var:
321 |             s = line.split("=", 1)  # split on the first "=" so values may contain "="
322 | vars[s[0]] = s[1]
323 |
324 | if hasattr(args, "config") and args.config is not None:
325 | test = Test.load(args.config)
326 | for model in args.model:
327 | test.models.append(model)
328 | if args.name is None or args.name == "":
329 | if test.name is not None:
330 | name = test.name
331 | test.vars.update(**vars)
332 | test.expected_tools.extend(args.tool)
333 | test.ignore_tools.extend(args.ignore_tool)
334 | test.task = args.task or test.task
335 | test.prompt = args.prompt or test.prompt
336 | test.check = args.check or test.check
337 | test.name = args.name or test.name
338 | test.task_run = args.task_run or test.task_run
339 | else:
340 | test = Test(
341 | name=name,
342 | prompt=args.prompt or "",
343 | check=args.check or "",
344 | models=args.model,
345 | profile=args.profile,
346 | expected_tools=args.tool,
347 | ignore_tools=args.ignore_tool,
348 | vars=vars,
349 | task=args.task,
350 | task_run=args.task_run,
351 | )
352 |
353 | iterations = args.iter
354 | logger.info(
355 | f"Running {test.name}: task={test.task is not None}, models=[{', '.join(test.models)}] ({iterations} iteration{'s' if iterations > 1 else ''})"
356 | )
357 | db = None
358 | if args.db is not None:
359 | db = Database(args.db)
360 |
361 | judge = Judge(
362 | models=test.models,
363 | profile=args.profile,
364 | db=db,
365 | judge_model=args.judge_model,
366 | ignore_tools=test.ignore_tools,
367 | )
368 | judge.db.save_test(test)
369 |
370 | total_duration = 0
371 |
372 | for i in range(iterations):
373 | if iterations > 1:
374 | logger.info(f"Iteration {i + 1}/{iterations}")
375 |
376 |             # Save this iteration's results to the DB unless --no-save was given
377 | res = await judge.run_test(test, save=not args.no_save)
378 | total_duration += res.duration
379 | logger.debug(f"Result: {res.scores}")
380 | if not args.no_save:
381 | logger.info("Results saved to db")
382 |
383 | if iterations > 1:
384 | logger.info(f"Iteration {i + 1} finished in {res.duration}s")
385 |
386 | logger.info(f"{test.name} finished in {total_duration}s total")
387 |
388 | if iterations > 1:
389 | print(f"\nShowing results from iteration {iterations} of {iterations}.")
390 | print(f"Use 'mcpx-eval summary {test.name}' to see aggregated results.\n")
391 |
392 | for result in res.scores:
393 | if result is None:
394 | continue
395 | print_result(result)
396 | else:
397 | parser.print_help()
398 |
399 |
400 | def main():
401 | asyncio.run(run())
402 |
403 |
404 | if __name__ == "__main__":
405 | main()
406 |
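Tying the argument definitions above together, a sketch of a few example invocations (the config file, test name, and model names are illustrative; --db is defined on the top-level parser, so it is passed before the subcommand):

    import subprocess
    import sys

    # Run a test config against an extra model, saving results to eval.db (illustrative values).
    subprocess.run(
        [sys.executable, "-m", "mcpx_eval", "--db", "eval.db", "test",
         "--config", "weather.toml", "-m", "claude-3-5-sonnet-latest",
         "--var", "city=Paris", "--iter", "2"],
        check=True,
    )

    # Print averaged results for the test.
    subprocess.run(
        [sys.executable, "-m", "mcpx_eval", "--db", "eval.db", "summary", "weather-test"],
        check=True,
    )

    # Generate the JSON summary and an HTML scoreboard.
    subprocess.run(
        [sys.executable, "-m", "mcpx_eval", "--db", "eval.db", "gen",
         "--name", "weather-test", "--html", "report.html"],
        check=True,
    )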
--------------------------------------------------------------------------------
/mcpx_eval/database.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import json
3 | import pandas as pd
4 | from datetime import datetime
5 | from .models import Score, Results, Test, ScoreModel
6 |
7 |
8 | class Database:
9 | conn: sqlite3.Connection
10 |
11 | def __init__(self, path: str | None = "eval.db"):
12 | if path is None:
13 | path = "eval.db"
14 | self.conn = sqlite3.connect(path)
15 |
16 | self.conn.executescript(
17 | """
18 | CREATE TABLE IF NOT EXISTS tests (
19 | id INTEGER PRIMARY KEY,
20 | name TEXT NOT NULL,
21 | prompt TEXT NOT NULL,
22 | prompt_check TEXT NOT NULL,
23 | UNIQUE(name)
24 | );
25 | CREATE TABLE IF NOT EXISTS eval_results (
26 | id INTEGER PRIMARY KEY,
27 | t TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
28 | test_name TEXT NOT NULL,
29 | model TEXT NOT NULL,
30 | duration REAL NOT NULL,
31 | output TEXT NOT NULL,
32 | description TEXT NOT NULL,
33 | accuracy REAL NOT NULL,
34 | tool_use REAL NOT NULL,
35 | tool_calls INT NOT NULL,
36 | redundant_tool_calls INT NOT NULL DEFAULT 0,
37 | failed_tool_calls INT NOT NULL DEFAULT 0,
38 | completeness REAL NOT NULL DEFAULT 0.0,
39 | quality REAL NOT NULL,
40 | hallucination_score REAL NOT NULL DEFAULT 0.0,
41 | false_claims TEXT NOT NULL DEFAULT '[]',
42 | tool_analysis TEXT NOT NULL DEFAULT '{}',
43 | FOREIGN KEY(test_name) REFERENCES tests(name)
44 | );
45 | """
46 | )
47 | self.conn.commit()
48 |
49 | def save_score(self, name: str, score: Score, commit=True):
50 | if name == "":
51 | return
52 |
53 | # Convert score to DataFrame for efficient insertion
54 | df = pd.DataFrame(
55 | [
56 | {
57 | "test_name": name,
58 | "model": score.model,
59 | "duration": score.duration,
60 | "output": score.llm_output,
61 | "description": score.description,
62 | "accuracy": score.accuracy,
63 | "tool_use": score.tool_use,
64 | "tool_calls": score.tool_calls,
65 | "redundant_tool_calls": score.redundant_tool_calls,
66 | "failed_tool_calls": score.failed_tool_calls,
67 | "completeness": score.completeness,
68 | "quality": score.quality,
69 | "hallucination_score": score.hallucination_score,
70 | "false_claims": json.dumps(score.false_claims),
71 | "tool_analysis": json.dumps(score.tool_analysis),
72 | }
73 | ]
74 | )
75 |
76 | df.to_sql("eval_results", self.conn, if_exists="append", index=False)
77 | if commit:
78 | self.conn.commit()
79 |
80 | def save_test(self, test: "Test"):
81 | self.conn.execute(
82 | """
83 | INSERT OR IGNORE INTO tests (name, prompt, prompt_check) VALUES (?, ?, ?);
84 | """,
85 | (test.name, test.prompt, test.check),
86 | )
87 | self.conn.commit()
88 |
89 | def save_results(self, name: str, results: Results):
90 | if not results.scores:
91 | return
92 |
93 | # Convert all scores to DataFrame at once
94 | records = [
95 | {
96 | "test_name": name,
97 | "model": score.model,
98 | "duration": score.duration,
99 | "output": score.llm_output,
100 | "description": score.description,
101 | "accuracy": score.accuracy,
102 | "tool_use": score.tool_use,
103 | "tool_calls": score.tool_calls,
104 | "redundant_tool_calls": score.redundant_tool_calls,
105 | "failed_tool_calls": score.failed_tool_calls,
106 | "completeness": score.completeness,
107 | "quality": score.quality,
108 | "hallucination_score": score.hallucination_score,
109 | "false_claims": json.dumps(score.false_claims),
110 | "tool_analysis": json.dumps(score.tool_analysis),
111 | }
112 | for score in results.scores
113 | ]
114 |
115 | df = pd.DataFrame(records)
116 | df.to_sql("eval_results", self.conn, if_exists="append", index=False)
117 | self.conn.commit()
118 |
119 | def average_results(self, name: str) -> Results:
120 | # Read results into a pandas DataFrame
121 | df = pd.read_sql_query(
122 | """
123 | SELECT *
124 | FROM eval_results
125 | WHERE test_name = ?
126 | """,
127 | self.conn,
128 | params=(name,),
129 | )
130 |
131 | if df.empty:
132 | print(f"No results found in database for test: {name}")
133 | print("Available tests:")
134 | tests = pd.read_sql_query(
135 | "SELECT DISTINCT test_name FROM eval_results", self.conn
136 | )
137 | if tests.empty:
138 | print(" No tests have been run yet")
139 | else:
140 | for test in tests["test_name"]:
141 | print(f" - {test}")
142 | return Results(scores=[])
143 |
144 | # Convert false_claims and tool_analysis from JSON strings
145 | df["false_claims"] = df["false_claims"].apply(json.loads)
146 | df["tool_analysis"] = df["tool_analysis"].apply(json.loads)
147 |
148 | # Group by model and aggregate
149 | grouped = (
150 | df.groupby("model")
151 | .agg(
152 | {
153 | "duration": "mean",
154 | "output": "first", # take first output as example
155 | "description": "first", # take first description as example
156 | "accuracy": "mean",
157 | "tool_use": "mean",
158 | "tool_calls": "mean",
159 | "redundant_tool_calls": "mean",
160 | "completeness": "mean",
161 | "quality": "mean",
162 | "hallucination_score": "mean",
163 | "false_claims": "sum", # combine all false claims
164 | "tool_analysis": "first", # take first tool analysis
165 | }
166 | )
167 | .reset_index()
168 | )
169 |
170 | # Convert back to Score objects
171 | scores = [
172 | Score(
173 | model=row["model"],
174 | duration=row["duration"],
175 | score=ScoreModel(
176 | llm_output=row["output"],
177 | description=row["description"],
178 | accuracy=row["accuracy"],
179 | tool_use=row["tool_use"],
180 | completeness=row["completeness"],
181 | quality=row["quality"],
182 | hallucination_score=row["hallucination_score"],
183 | false_claims=row["false_claims"],
184 | ),
185 | tool_analysis=row["tool_analysis"],
186 | redundant_tool_calls=int(row["redundant_tool_calls"]),
187 | tool_calls=int(row["tool_calls"]),
188 | )
189 | for _, row in grouped.iterrows()
190 | ]
191 |
192 | return Results(scores=scores)
193 |
194 | def get_test_stats(self, test_name: str | None = None) -> pd.DataFrame:
195 | """Get detailed statistics for tests.
196 |
197 | Args:
198 | test_name: Optional test name to filter results
199 |
200 | Returns:
201 | DataFrame with test statistics including:
202 | - Number of runs per model
203 |             - Mean scores (accuracy, tool use, completeness, quality, hallucination)
204 |             - Mean/min/max durations
205 | """
206 | query = """
207 | SELECT
208 | test_name,
209 | model,
210 | COUNT(*) as runs,
211 | AVG(duration) as mean_duration,
212 | MIN(duration) as min_duration,
213 | MAX(duration) as max_duration,
214 | AVG(accuracy) as mean_accuracy,
215 | AVG(tool_use) as mean_tool_use,
216 | AVG(tool_calls) as mean_tool_calls,
217 | AVG(redundant_tool_calls) as mean_redundant_calls,
218 | AVG(completeness) as mean_completeness,
219 | AVG(quality) as mean_quality,
220 | AVG(hallucination_score) as mean_hallucination
221 | FROM eval_results
222 | """
223 |
224 | if test_name:
225 | query += " WHERE test_name = ?"
226 | params = (test_name,)
227 | else:
228 | params = ()
229 |
230 | query += " GROUP BY test_name, model"
231 |
232 | return pd.read_sql_query(query, self.conn, params=params)
233 |
234 | def generate_json_summary(self):
235 | # Read results into a pandas DataFrame
236 | df = pd.read_sql_query(
237 | """
238 | SELECT
239 | test_name,
240 | model,
241 | AVG(accuracy) as accuracy,
242 | AVG(tool_use) as tool_use,
243 | AVG(tool_calls) as tool_calls,
244 | AVG(redundant_tool_calls) as redundant_tool_calls,
245 | AVG(failed_tool_calls) as failed_tool_calls,
246 | AVG(completeness) as completeness,
247 | AVG(quality) as quality,
248 | AVG(hallucination_score) as hallucination_score,
249 | AVG(duration) as duration,
250 | COUNT(*) as runs
251 | FROM eval_results
252 | GROUP BY test_name, model
253 | """,
254 | self.conn,
255 | )
256 |
257 |         # Helper to build styled HTML tables with pandas (defined but not used below)
258 | def style_table(df):
259 | return (
260 | df.style.format(
261 | {
262 | "accuracy": "{:.3f}%",
263 | "tool_use": "{:.3f}%",
264 | "completeness": "{:.3f}%",
265 | "quality": "{:.3f}%",
266 | "hallucination_score": "{:.3f}%",
267 | "tool_calls": "{:.1f}",
268 | "redundant_tool_calls": "{:.1f}",
269 | "runs": "{:.0f}",
270 | "duration": "{:.3f}",
271 | }
272 | )
273 | .background_gradient(
274 | subset=[
275 | "accuracy",
276 | "tool_use",
277 | "completeness",
278 | "quality",
279 | ],
280 | cmap="RdYlGn",
281 | )
282 | .background_gradient(subset=["hallucination_score"], cmap="RdYlGn_r")
283 | .set_properties(**{"text-align": "center"})
284 | .to_html()
285 | )
286 |
287 | # Generate summary structure
288 | summary = {
289 | "tests": {},
290 | "total": {
291 | "models": {},
292 | "metrics": {},
293 | "test_count": len(df["test_name"].unique()),
294 | "model_count": len(df["model"].unique()),
295 | },
296 | }
297 |
298 | # Calculate total metrics with formatted precision
299 | total_metrics = df.agg(
300 | {
301 | "accuracy": lambda x: round(x.mean(), 3),
302 | "tool_use": lambda x: round(x.mean(), 3),
303 | "tool_calls": lambda x: round(x.sum(), 1),
304 | "redundant_tool_calls": lambda x: round(x.sum(), 1),
305 | "completeness": lambda x: round(x.mean(), 3),
306 | "quality": lambda x: round(x.mean(), 3),
307 | "hallucination_score": lambda x: round(x.mean(), 3),
308 | }
309 | )
310 | summary["total"]["metrics"] = total_metrics.to_dict()
311 |
312 | # Process each test
313 | for test_name in df["test_name"].unique():
314 | test_df = df[df["test_name"] == test_name]
315 | test_df = test_df.sort_values("quality", ascending=False)
316 |
317 | # Calculate test metrics with formatted precision
318 | test_metrics = test_df.agg(
319 | {
320 | "accuracy": lambda x: round(x.mean(), 3),
321 | "tool_use": lambda x: round(x.mean(), 3),
322 | "tool_calls": lambda x: round(x.sum(), 1),
323 | "redundant_tool_calls": lambda x: round(x.sum(), 1),
324 | "completeness": lambda x: round(x.mean(), 3),
325 | "quality": lambda x: round(x.mean(), 3),
326 | "hallucination_score": lambda x: round(x.mean(), 3),
327 | }
328 | )
329 |
330 | # Round tool calls in test metrics to 1 decimal place
331 | if "tool_calls" in test_metrics:
332 | test_metrics["tool_calls"] = round(test_metrics["tool_calls"], 1)
333 | if "redundant_tool_calls" in test_metrics:
334 | test_metrics["redundant_tool_calls"] = round(
335 | test_metrics["redundant_tool_calls"], 1
336 | )
337 |
338 | summary["tests"][test_name] = {
339 | "models": {
340 | row["model"]: {
341 | "accuracy": row["accuracy"],
342 | "tool_use": row["tool_use"],
343 | "tool_calls": row["tool_calls"],
344 | "redundant_tool_calls": row["redundant_tool_calls"],
345 | "failed_tool_calls": row["failed_tool_calls"],
346 | "completeness": row["completeness"],
347 | "quality": row["quality"],
348 | "hallucination_score": row["hallucination_score"],
349 | "runs": row["runs"],
350 | "duration": row["duration"],
351 | }
352 | for _, row in test_df.iterrows()
353 | },
354 | "metrics": test_metrics.to_dict(),
355 | "model_count": len(test_df["model"].unique()),
356 | }
357 |
358 | # Update total models data
359 | for model in test_df["model"].unique():
360 | model_data = test_df[test_df["model"] == model].iloc[0]
361 | if model not in summary["total"]["models"]:
362 | summary["total"]["models"][model] = {
363 | "accuracy": 0.0,
364 | "tool_use": 0.0,
365 | "tool_calls": 0,
366 | "redundant_tool_calls": 0,
367 | "completeness": 0.0,
368 | "quality": 0.0,
369 | "hallucination_score": 0.0,
370 | "test_count": 0,
371 | "duration": 0.0,
372 | }
373 |
374 | summary["total"]["models"][model]["test_count"] += 1
375 | for metric in [
376 | "accuracy",
377 | "tool_use",
378 | "completeness",
379 | "quality",
380 | "hallucination_score",
381 | "duration",
382 | ]:
383 | summary["total"]["models"][model][metric] += model_data[metric]
384 | summary["total"]["models"][model]["tool_calls"] += model_data[
385 | "tool_calls"
386 | ]
387 | summary["total"]["models"][model]["redundant_tool_calls"] += model_data[
388 | "redundant_tool_calls"
389 | ]
390 |
391 | # Calculate averages for total model metrics
392 | for model in summary["total"]["models"]:
393 | test_count = summary["total"]["models"][model]["test_count"]
394 | if test_count > 0:
395 | for metric in [
396 | "accuracy",
397 | "tool_use",
398 | "completeness",
399 | "quality",
400 | "hallucination_score",
401 | "duration",
402 | ]:
403 | summary["total"]["models"][model][metric] /= test_count
404 |
405 | # Add timestamp
406 | summary["generated_at"] = datetime.now().isoformat()
407 |
408 | return summary
409 |
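A minimal sketch of reading results back out of the SQLite store with the methods above (the database path and test name are hypothetical):

    from mcpx_eval import Database

    db = Database("eval.db")  # Database() with no path also defaults to "eval.db"

    # Per-model aggregate statistics as a pandas DataFrame.
    stats = db.get_test_stats("weather-test")  # hypothetical test name
    print(stats.to_string(index=False))

    # Averaged Score objects, as used by the `summary` command.
    averaged = db.average_results("weather-test")
    for score in averaged.scores:
        print(score.model, round(score.accuracy, 1), score.tool_calls)

    # Cross-test summary structure used for the HTML scoreboard.
    summary = db.generate_json_summary()
    print(sorted(summary["tests"].keys()))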
--------------------------------------------------------------------------------
/mcpx_eval/judge.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import List, Dict, Any, Optional
3 | from datetime import datetime, timedelta
4 | import json
5 | import traceback
6 | import os
7 |
8 | from mcpx_py import Chat, mcp_run, openai_compatible_model
9 | import pystache
10 |
11 | from .models import ScoreModel, Score, Results, Test, Model
12 | from .database import Database
13 | from .constants import SYSTEM_PROMPT, TEST_PROMPT
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 |
18 | def is_int(x):
19 | if x is None:
20 | return False
21 | try:
22 | int(x)
23 | return True
24 | except ValueError:
25 | return False
26 |
27 |
28 | def task_run_index(
29 | client: mcp_run.Client, task: str, index: int = -1
30 | ) -> mcp_run.TaskRun | None:
31 | a = list(client.list_task_runs(task))
32 | a.reverse()
33 | try:
34 | return a[index]
35 | except IndexError:
36 | return None
37 |
38 |
39 | class ModelApiConfig:
40 | """Helper class to manage model API configurations."""
41 |
42 | @staticmethod
43 | def get_host_url(model_name: str, provider: str) -> str:
44 | """Get the appropriate API host URL for a given model and provider."""
45 | if provider in ["ollama", "llama"]:
46 | host = os.environ.get(
47 | f"{model_name.upper()}_HOST",
48 | os.environ.get(
49 | "LLAMA_HOST",
50 | os.environ.get("OLLAMA_HOST", "http://127.0.0.1:11434"),
51 | ),
52 | )
53 | return f"{host}/v1" if not host.endswith("/v1") else host
54 | elif provider == "openai":
55 | host = os.environ.get(
56 | f"{model_name.upper()}_HOST",
57 | os.environ.get("OPENAI_HOST", "https://api.openai.com"),
58 | )
59 | return f"{host}/v1" if not host.endswith("/v1") else host
60 | return ""
61 |
62 | @staticmethod
63 | def get_model_config(model: Model) -> str:
64 | """Get the appropriate model configuration for API calls."""
65 | if model.provider in ["ollama", "llama", "openai"]:
66 | host = ModelApiConfig.get_host_url(model.name, model.provider)
67 | return openai_compatible_model(host, model.name)
68 | return model.name
69 |
70 |
71 | class ToolAnalysis:
72 | """Helper class to analyze tool usage patterns."""
73 |
74 | def __init__(self):
75 | self.tool_analysis: Dict[str, Any] = {}
76 | self.redundant_tool_calls = 0
77 | self.seen_tool_patterns = set()
78 | self.total_tool_calls = 0
79 |
80 | def analyze_message(self, msg: Dict[str, Any], index: int) -> None:
81 | """Analyze a single message for tool usage patterns."""
82 | if not msg.get("tool"):
83 | return
84 |
85 | tool_name = msg["tool"]["name"]
86 | tool_input = msg["tool"]["input"]
87 | self.total_tool_calls += 1
88 |
89 | # Create pattern string for redundancy detection
90 | tool_pattern = f"{tool_name}:{str(tool_input)}"
91 |
92 | # Check for redundancy
93 | redundancy_status = (
94 | "redundant" if tool_pattern in self.seen_tool_patterns else "unique"
95 | )
96 | if redundancy_status == "redundant":
97 | self.redundant_tool_calls += 1
98 | else:
99 | self.seen_tool_patterns.add(tool_pattern)
100 |
101 | # Store tool analysis
102 | self.tool_analysis[f"tool_{index}"] = {
103 | "name": tool_name,
104 | "input": tool_input,
105 | "redundancy": redundancy_status,
106 | }
107 |
108 |
109 | def format_judge_prompt(prompt, results, check, expected_tools):
110 | if check is None or check == "":
111 |         check = "Make sure the output matches the requirements of the prompt"
112 |     # NOTE: the markup delimiting the sections of this template was stripped from
113 |     # this listing; the plain-text labels below stand in for the original tags.
114 |     return f"""
115 | Current date and time: {datetime.now().isoformat()}
116 | 
117 | Prompt:
118 | {prompt}
119 | 
120 | Output:
121 | {results}
122 | Check: {check}
123 | Expected tools: {", ".join(expected_tools)}
124 | """
125 |
126 |
127 | class Judge:
128 | """Evaluates model performance on given tests."""
129 |
130 | model: Model
131 | models: List[Model]
132 | ignore_tools: List[str]
133 | db: Database
134 | profile: Optional[str]
135 | retries: int
136 |
137 | def __init__(
138 | self,
139 | models: Optional[List[Model | str]] = None,
140 | db: Optional[Database] = None,
141 | profile: Optional[str] = None,
142 | judge_model: str = "claude-3-5-sonnet-latest",
143 | ignore_tools: Optional[List[str]] = None,
144 | retries: Optional[int] = None,
145 | ):
146 | self.retries = retries or 10
147 | self.profile = profile or mcp_run.ProfileSlug("~", "default")
148 | self.ignore_tools = ignore_tools or []
149 | self.db = db or Database()
150 | self.models = []
151 | self.model = Model(name=judge_model)
152 | if models is not None:
153 | for model in models:
154 | self.add_model(model)
155 |
156 | def add_model(
157 | self,
158 | model: Model | str,
159 | profile: Optional[str] = None,
160 | ) -> None:
161 | """Add a model to the evaluation list."""
162 | if isinstance(model, str):
163 | model = Model(name=model)
164 | if profile is not None:
165 | model.profile = profile
166 | self.models.append(model)
167 |
168 | async def run_test(self, test: Test, save: bool = True) -> Results:
169 | """Run a specific test configuration."""
170 | profile = test.profile
171 | if profile is None:
172 | profile = self.profile or mcp_run.ProfileSlug("~", "default")
173 | else:
174 | profile = mcp_run.ProfileSlug.parse(profile)
175 |
176 | if test.task is not None:
177 | client = mcp_run.Client(config=mcp_run.ClientConfig(profile=profile))
178 | tasks = client.tasks
179 | if test.task not in tasks:
180 | raise Exception(f"Invalid task, {test.task} not found in {profile}")
181 | test.prompt = tasks[test.task].prompt
182 |
183 | results = await self.run(
184 | pystache.render(test.prompt, test.vars),
185 | test.check,
186 | test.expected_tools,
187 | test.task,
188 | test.task_run,
189 | )
190 |
191 | if save:
192 | self.db.save_results(test.name, results)
193 | return results
194 |
195 | async def evaluate_model(
196 | self,
197 | model: Model,
198 | prompt: str,
199 | tool_analysis: ToolAnalysis,
200 |     ) -> Optional[Dict[str, Any]]:
201 | """Evaluate a single model's performance."""
202 | result = {"messages": [], "tools-available": []}
203 |
204 | try:
205 | model_config = ModelApiConfig.get_model_config(model)
206 | chat = Chat(
207 | client=mcp_run.Client(
208 | config=mcp_run.ClientConfig(profile=model.profile)
209 | ),
210 | model=model_config,
211 | ignore_tools=self.ignore_tools,
212 | system_prompt=TEST_PROMPT,
213 | retries=5,
214 | )
215 |
216 | # Get available tools, handling both real and mock objects
217 | try:
218 | result["tools-available"] = list(chat.client.tools.keys())
219 | except (TypeError, AttributeError):
220 | # If tools is a mock object, get the return value directly
221 | result["tools-available"] = chat.client.tools.keys()
222 |
223 | async for node in chat.iter(prompt):
224 | if hasattr(node, "model_response"):
225 | for part in node.model_response.parts:
226 | if part.part_kind == "text":
227 | logger.info(part.content)
228 | result["messages"].append(
229 | {"kind": part.part_kind, "text": part.content}
230 | )
231 | elif part.part_kind == "tool-call":
232 | logger.info(
233 | f"Tool {part.tool_name}({part.tool_call_id}): {part.args}"
234 | )
235 | result["messages"].append(
236 | {
237 | "kind": part.part_kind,
238 | "tool": {
239 | "name": part.tool_name,
240 | "input": part.args_as_dict(),
241 | },
242 | "tool_call_id": part.tool_call_id,
243 | }
244 | )
245 | tool_analysis.analyze_message(
246 | result["messages"][-1], len(result["messages"]) - 1
247 | )
248 |
249 | elif hasattr(node, "request"):
250 | for part in node.request.parts:
251 | if part.part_kind == "text":
252 | result["messages"].append(
253 | {"kind": part.part_kind, "text": part.content}
254 | )
255 | elif part.part_kind == "tool-return":
256 | logger.info(
257 | f"Tool returned {part.tool_name}({part.tool_call_id})"
258 | )
259 | logger.debug(
260 | f"Tool result {part.tool_name}({part.tool_call_id}):\n{part.content}"
261 | )
262 | result["messages"].append(
263 | {
264 | "kind": part.part_kind,
265 | "tool_name": part.tool_name,
266 | "content": part.content,
267 | "tool_call_id": part.tool_call_id,
268 | }
269 | )
270 | elif hasattr(node, "data"):
271 | logger.debug(f"Final result: {node.data.data}")
272 | result["messages"].append(
273 | {"kind": "final_result", "text": node.data.data}
274 | )
275 |
276 | except KeyboardInterrupt:
277 | return None
278 | except Exception:
279 | logger.error(f"{model.slug} failed: {traceback.format_exc()}")
280 | return None
281 |
282 | return result
283 |
284 | async def _evaluate_task_run(
285 | self,
286 | client: mcp_run.Client,
287 | run: mcp_run.TaskRun,
288 | check: str,
289 | expected_tools: List[str],
290 |         model_config: str,
291 | ) -> Score:
292 | logger.info(f"Analyzing task run {run.name}")
293 | prompt = run.results_list[0]["exchange"]["content"]
294 | agent = Chat(
295 | client=client,
296 | model=model_config,
297 | ignore_tools=self.ignore_tools,
298 | result_type=ScoreModel,
299 | system_prompt=SYSTEM_PROMPT,
300 | result_retries=self.retries,
301 | )
302 |
303 | res = await agent.send_message(
304 | format_judge_prompt(prompt, run.results_list, check, expected_tools)
305 | )
306 |
307 | tool_analysis = ToolAnalysis()
308 |
309 | for i, event in enumerate(run.results_list):
310 | if event["msg"] == "call tool request":
311 | tool_analysis.analyze_message(
312 | {
313 | "tool": {
314 | "name": event["params"]["name"],
315 | "input": event["params"]["arguments"],
316 | }
317 | },
318 | i,
319 | )
320 |
321 | duration = (run.modified_at - run.created_at).total_seconds()
322 | return Score(
323 | score=res.data,
324 | model=run._task.provider["settings"]["model"] + "-" + run.name,
325 | duration=duration,
326 | tool_analysis=tool_analysis.tool_analysis,
327 | redundant_tool_calls=tool_analysis.redundant_tool_calls,
328 | tool_calls=tool_analysis.total_tool_calls,
329 | trace=run.results_list,
330 | )
331 |
332 | async def run(
333 | self,
334 | prompt: str,
335 | check: str,
336 | expected_tools: List[str],
337 | task: str | None = None,
338 | task_run: str | None = None,
339 | vars: dict | None = None,
340 | ) -> Results:
341 | """Run evaluation across all models."""
342 | scores = []
343 | total_duration = timedelta(seconds=0)
344 |
345 | model_config = ModelApiConfig.get_model_config(self.model)
346 | if task is not None:
347 | client = mcp_run.Client(config=mcp_run.ClientConfig(profile=self.profile))
348 |             if task_run is not None and task_run.lower() == "all":
349 | for run in client.list_task_runs(task):
350 | scores.append(
351 | await self._evaluate_task_run(
352 | client, run, check, expected_tools, model_config
353 | )
354 | )
355 | elif is_int(task_run) or task_run == "latest":
356 | if task_run.lower() == "latest":
357 | task_run = -1
358 | task_run = int(task_run or -1)
359 | run = task_run_index(client, task, index=task_run)
360 | if run is not None:
361 | scores.append(
362 | await self._evaluate_task_run(
363 | client, run, check, expected_tools, model_config
364 | )
365 | )
366 | else:
367 | logger.error(f"Unable to load {task_run} for task {task}")
368 | elif task_run is not None and task_run.lower() != "new":
369 | found = False
370 | for run in client.list_task_runs(task):
371 | if run.name == task_run:
372 | scores.append(
373 | await self._evaluate_task_run(
374 | client, run, check, expected_tools, model_config
375 | )
376 | )
377 | found = True
378 | if not found:
379 | logger.error(f"Unable to load {task_run} for task {task}")
380 | elif len(self.models) == 0:
381 | logger.info("No task run specified, this will execute a new task run")
382 | run = client.tasks[task].run(vars or {})
383 | run.wait()
384 | run = task_run_index(client, task, index=-1)
385 | if run is not None:
386 | scores.append(
387 | await self._evaluate_task_run(
388 | client, run, check, expected_tools, model_config
389 | )
390 | )
391 | else:
392 | logger.error(f"Unable to load {task_run} for task {task}")
393 |
394 | for model in self.models:
395 | start = datetime.now()
396 | tool_analysis = ToolAnalysis()
397 |
398 | logger.info(f"Evaluating model {model.slug}")
399 | result = await self.evaluate_model(model, prompt, tool_analysis)
400 |
401 | if result is None:
402 | continue
403 |
404 | duration = datetime.now() - start
405 | duration_seconds = duration.total_seconds()
406 | total_duration += duration
407 |
408 | result["duration_in_seconds"] = f"{duration_seconds}s"
409 | result["number_of_tools_used"] = str(tool_analysis.total_tool_calls)
410 |
411 | logger.info(
412 | f"Analyzing results of {model.slug} with profile={self.profile}"
413 | )
414 | agent = Chat(
415 | client=mcp_run.Client(
416 | config=mcp_run.ClientConfig(profile=self.profile)
417 | ),
418 | model=model_config,
419 | ignore_tools=self.ignore_tools,
420 | result_type=ScoreModel,
421 | system_prompt=SYSTEM_PROMPT,
422 | result_retries=self.retries,
423 | )
424 |
425 | res = await agent.send_message(
426 | format_judge_prompt(prompt, result, check, expected_tools)
427 | )
428 | scores.append(
429 | Score(
430 | score=res.data,
431 | model=model.slug,
432 | duration=duration_seconds,
433 | tool_analysis=tool_analysis.tool_analysis,
434 | redundant_tool_calls=tool_analysis.redundant_tool_calls,
435 | tool_calls=tool_analysis.total_tool_calls,
436 | trace=result,
437 | )
438 | )
439 |
440 | return Results(scores=scores, duration=total_duration.total_seconds())
441 |
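For context, a rough sketch of driving the Judge programmatically, mirroring what the test subcommand in __main__.py does (the config file and model fallback are hypothetical):

    import asyncio

    from mcpx_eval import Database, Judge, Test


    async def main():
        test = Test.load("weather.toml")  # hypothetical config file
        judge = Judge(
            models=test.models or ["claude-3-5-sonnet-latest"],
            db=Database("eval.db"),
            judge_model="claude-3-5-sonnet-latest",
            ignore_tools=test.ignore_tools,
        )
        judge.db.save_test(test)
        results = await judge.run_test(test, save=True)
        for score in results.scores:
            print(
                f"{score.model}: accuracy={score.accuracy:.1f} "
                f"quality={score.quality:.1f} tool_calls={score.tool_calls}"
            )


    if __name__ == "__main__":
        asyncio.run(main())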
--------------------------------------------------------------------------------
/mcpx_eval/html.py:
--------------------------------------------------------------------------------
1 | def visualize_json(data, output_path=None):
2 | """Create an interactive HTML visualization of JSON data"""
3 | import json
4 | from datetime import datetime
5 | import matplotlib.pyplot as plt
6 | import io
7 | import base64
8 |
9 | def create_performance_graph(data):
10 | """Create a matplotlib graph of model performance"""
11 | if not data.get("total", {}).get("models"):
12 | return ""
13 |
14 | models = data["total"]["models"]
15 | model_names = list(models.keys())
16 | metrics = {
17 | "accuracy": [models[m]["accuracy"] for m in model_names],
18 | "tool_use": [models[m]["tool_use"] for m in model_names],
19 | "completeness": [models[m]["completeness"] for m in model_names],
20 | "quality": [models[m]["quality"] for m in model_names],
21 | "hallucination": [models[m]["hallucination_score"] for m in model_names],
22 | }
23 |
24 | # Sort by quality score
25 | sorted_indices = sorted(
26 | range(len(metrics["quality"])),
27 | key=lambda k: metrics["quality"][k],
28 | reverse=True,
29 | )
30 | model_names = [model_names[i] for i in sorted_indices]
31 | for metric in metrics:
32 | metrics[metric] = [metrics[metric][i] for i in sorted_indices]
33 |
34 | plt.figure(figsize=(15, 8))
35 | x = range(len(model_names))
36 | width = 0.15 # Narrower bars to fit all metrics
37 |
38 | # Plot each metric with offset positions
39 | plt.bar(
40 | [i - width * 2 for i in x],
41 | metrics["accuracy"],
42 | width,
43 | label="Accuracy",
44 | color="skyblue",
45 | )
46 | plt.bar(
47 | [i - width for i in x],
48 | metrics["tool_use"],
49 | width,
50 | label="Tool Use",
51 | color="lightgreen",
52 | )
53 | plt.bar(
54 | [i for i in x],
55 | metrics["completeness"],
56 | width,
57 | label="Completeness",
58 | color="orange",
59 | )
60 | plt.bar(
61 | [i + width for i in x],
62 | metrics["quality"],
63 | width,
64 | label="Quality",
65 | color="purple",
66 | )
67 | plt.bar(
68 | [i + width * 2 for i in x],
69 | metrics["hallucination"],
70 | width,
71 | label="Hallucination",
72 | color="red",
73 | )
74 |
75 | plt.xlabel("Models", fontsize=12)
76 | plt.ylabel("Score (%)", fontsize=12)
77 | plt.xticks(x, model_names, rotation=45, ha="right", fontsize=14)
78 | plt.legend(loc="upper right", title="Metrics", fontsize=10)
79 |
80 | plt.grid(True, alpha=0.3)
81 | plt.tight_layout()
82 |
83 | # Convert plot to base64 string
84 | buf = io.BytesIO()
85 | plt.savefig(buf, format="png", dpi=300, bbox_inches="tight")
86 | plt.close()
87 | buf.seek(0)
88 | return base64.b64encode(buf.getvalue()).decode("utf-8")
89 |
90 |     # Create HTML content with comparison tables and JSON viewer
91 |     # [The HTML/CSS template string assembled here (file lines 91-616) was
92 |     #  stripped from this listing during extraction and is not reconstructed.
93 |     #  Recoverable structure of the generated page: title "mcpx-eval Scoreboard";
94 |     #  heading "mcpx-eval Open-Ended Tool Calling Scoreboard"; a "Generated on"
95 |     #  timestamp built with datetime.now().strftime("%Y-%m-%d %H:%M:%S"); an
96 |     #  "Overview" section embedding the base64 performance graph from
97 |     #  create_performance_graph(data); and a "Model Rankings (All Tests)" table
98 |     #  with columns Rank, Model, Combined Score, Accuracy, Tool Use, Completeness,
99 |     #  Quality, Hallucination, Duration (s), Tool Calls, Redundant Calls, and
100 |     #  Failed Calls. The assembled string is bound to `html` below.]
618 |     return html
619 | 
--------------------------------------------------------------------------------