├── answers-generated └── .placeholder ├── answers └── 1-capital-italy.txt ├── prompts └── 1-capital-italy.txt ├── static ├── favicon.ico └── benchmark-report.png ├── frontend ├── postcss.config.js ├── src │ ├── App.jsx │ ├── main.jsx │ ├── components │ │ ├── ui │ │ │ ├── DarkModeToggle.jsx │ │ │ ├── Button.jsx │ │ │ └── Modal.jsx │ │ ├── ErrorBoundary.jsx │ │ ├── Charts │ │ │ ├── ResponseTimeChart.jsx │ │ │ ├── AccuracyChart.jsx │ │ │ └── ModelComparisonChart.jsx │ │ ├── SummaryTable.jsx │ │ ├── Dashboard.jsx │ │ ├── QuestionDetails.jsx │ │ ├── FilterPanel.jsx │ │ ├── ModelAnswersAccordion.jsx │ │ └── ResultsMatrix.jsx │ ├── hooks │ │ ├── useDarkMode.js │ │ ├── useApi.js │ │ └── useFilters.js │ ├── styles │ │ └── globals.css │ ├── utils │ │ ├── heatmapUtils.js │ │ └── dataProcessing.js │ └── test │ │ ├── Dashboard.test.jsx │ │ ├── App.test.jsx │ │ ├── useApi.test.js │ │ └── useFilters.test.js ├── index.html ├── setup.sh ├── package.json ├── tailwind.config.js ├── vite.config.js └── README.md ├── .gitignore ├── .env.example ├── LICENSE ├── pyproject.toml ├── AGENTS.md ├── start-dashboard.sh ├── reporting.py ├── tests ├── conftest.py └── unit │ ├── test_validation.py │ └── test_file_utils.py ├── shared.py ├── api_client.py ├── report_template.html ├── file_utils.py ├── api_server.py ├── server.py ├── README.md ├── validation.py └── main.py /answers-generated/.placeholder: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /answers/1-capital-italy.txt: -------------------------------------------------------------------------------- 1 | Torino -------------------------------------------------------------------------------- /prompts/1-capital-italy.txt: -------------------------------------------------------------------------------- 1 | scrivi solo il nome della la prima capitale d'Italia -------------------------------------------------------------------------------- /static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grigio/llm-eval-simple/HEAD/static/favicon.ico -------------------------------------------------------------------------------- /static/benchmark-report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grigio/llm-eval-simple/HEAD/static/benchmark-report.png -------------------------------------------------------------------------------- /frontend/postcss.config.js: -------------------------------------------------------------------------------- 1 | export default { 2 | plugins: { 3 | tailwindcss: {}, 4 | autoprefixer: {}, 5 | }, 6 | } -------------------------------------------------------------------------------- /frontend/src/App.jsx: -------------------------------------------------------------------------------- 1 | 2 | import { Dashboard } from './components/Dashboard'; 3 | import { ErrorBoundary } from './components/ErrorBoundary'; 4 | 5 | function App() { 6 | return ( 7 | 8 | 9 | 10 | ); 11 | } 12 | 13 | export default App; -------------------------------------------------------------------------------- /frontend/src/main.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import ReactDOM from 'react-dom/client' 3 | import App from './App.jsx' 4 | import './styles/globals.css' 5 | 6 | 
ReactDOM.createRoot(document.getElementById('root')).render( 7 |   <React.StrictMode> 8 |     <App /> 9 |   </React.StrictMode>, 10 | ) -------------------------------------------------------------------------------- /frontend/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | LLM Eval Dashboard 8 | 9 | 10 |
11 | 12 | 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv 11 | .env 12 | prompts/* 13 | answers/* 14 | answers-generated/* 15 | 16 | # Node.js dependencies and build files 17 | frontend/node_modules/ 18 | frontend/dist/ 19 | frontend/.vite/ 20 | frontend/.env.local 21 | frontend/.env.development.local 22 | frontend/.env.test.local 23 | frontend/.env.production.local 24 | 25 | # Coverage reports 26 | coverage.xml 27 | htmlcov/ 28 | .coverage 29 | 30 | # IDE and editor files 31 | .vscode/ 32 | .idea/ 33 | *.swp 34 | *.swo 35 | *~ -------------------------------------------------------------------------------- /frontend/src/components/ui/DarkModeToggle.jsx: -------------------------------------------------------------------------------- 1 | import { Moon, Sun } from 'lucide-react'; 2 | import { useDarkMode } from '../../hooks/useDarkMode'; 3 | 4 | export const DarkModeToggle = () => { 5 | const { isDark, toggle } = useDarkMode(); 6 | 7 | return ( 8 | 19 | ); 20 | }; -------------------------------------------------------------------------------- /frontend/src/hooks/useDarkMode.js: -------------------------------------------------------------------------------- 1 | import { useState, useEffect } from 'react'; 2 | 3 | export const useDarkMode = () => { 4 | const [isDark, setIsDark] = useState(() => { 5 | // Check localStorage and system preference 6 | if (typeof window !== 'undefined') { 7 | const saved = localStorage.getItem('darkMode'); 8 | if (saved !== null) return saved === 'true'; 9 | return window.matchMedia('(prefers-color-scheme: dark)').matches; 10 | } 11 | return false; 12 | }); 13 | 14 | useEffect(() => { 15 | const root = document.documentElement; 16 | if (isDark) { 17 | root.classList.add('dark'); 18 | } else { 19 | root.classList.remove('dark'); 20 | } 21 | localStorage.setItem('darkMode', isDark); 22 | }, [isDark]); 23 | 24 | const toggle = () => setIsDark(!isDark); 25 | 26 | return { isDark, toggle }; 27 | }; -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | ## Local LLM 2 | ENDPOINT_URL=http://localhost:9292/v1/chat/completions 3 | API_KEY=fakey 4 | # You need to install them via llama-server or ollama 5 | MODEL_NAMES=gemma-3-12b-it-Q4_K_M,Qwen3-4B-IQ4_NL,Qwen3-Coder-30B-A3B-Instruct,gpt-oss-20b-mxfp4 6 | MODEL_EVALUATOR=gpt-oss-20b-mxfp4 7 | THROTTLING_SECS=0.1 8 | 9 | # or ------ OPENROUTER EXAMPLE 10 | # Different providers could have different models name 11 | # ENDPOINT_URL=https://openrouter.ai/api/v1/chat/completions 12 | # API_KEY=sk-or-v1-your-api-key 13 | # MODEL_NAMES=qwen/qwen3-4b:free,qwen/qwen3-30b-a3b:free,google/gemma-3-12b-it:free,openai/gpt-oss-20b:free,qwen/qwen3-coder:free 14 | # MODEL_EVALUATOR=openai/gpt-oss-20b:free 15 | # # THROTTLING_SECS is important or you get: Error for xyz.txt with openai/gpt-oss-20b:free: 429 Client Error: Too Many Requests for url: https://openrouter.ai/api/v1/chat/completions 16 | # THROTTLING_SECS=37.0 17 | -------------------------------------------------------------------------------- /frontend/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Setup script 
for the React frontend 4 | 5 | echo "🚀 Setting up LLM Eval Frontend..." 6 | 7 | # Check if we're in the frontend directory 8 | if [ ! -f "package.json" ]; then 9 | echo "❌ Error: package.json not found. Please run this script from the frontend directory." 10 | exit 1 11 | fi 12 | 13 | # Install dependencies 14 | echo "📦 Installing dependencies..." 15 | npm install 16 | 17 | # Build for production 18 | echo "🔨 Building for production..." 19 | npm run build 20 | 21 | echo "✅ Frontend setup complete!" 22 | echo "" 23 | echo "🎯 Next steps:" 24 | echo "1. Start the backend server: python api_server.py" 25 | echo "2. Visit http://localhost:8001 to see the dashboard" 26 | echo "" 27 | echo "🔧 For development:" 28 | echo "1. Run 'npm run dev' in this directory" 29 | echo "2. Start the backend server in another terminal" 30 | echo "3. Visit http://localhost:3000 for hot reload development" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Luigi Maselli 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "llm-eval-frontend", 3 | "version": "1.0.2", 4 | "type": "module", 5 | "scripts": { 6 | "dev": "vite", 7 | "build": "vite build", 8 | "preview": "vite preview", 9 | "test": "vitest", 10 | "test:ui": "vitest --ui", 11 | "test:run": "vitest run", 12 | "test:coverage": "vitest run --coverage", 13 | "test:watch": "vitest --watch" 14 | }, 15 | "dependencies": { 16 | "chart.js": "^4.4.0", 17 | "clsx": "^2.0.0", 18 | "lucide-react": "^0.294.0", 19 | "react": "^18.2.0", 20 | "react-chartjs-2": "^5.2.0", 21 | "react-dom": "^18.2.0", 22 | "react-router-dom": "^7.9.5" 23 | }, 24 | "devDependencies": { 25 | "@testing-library/jest-dom": "^6.9.1", 26 | "@testing-library/react": "^16.3.0", 27 | "@testing-library/user-event": "^14.6.1", 28 | "@types/react": "^18.2.43", 29 | "@types/react-dom": "^18.2.17", 30 | "@vitejs/plugin-react": "^4.2.1", 31 | "autoprefixer": "^10.4.16", 32 | "jsdom": "^27.1.0", 33 | "postcss": "^8.4.32", 34 | "tailwindcss": "^3.3.6", 35 | "vite": "^5.0.8", 36 | "vitest": "^4.0.7" 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /frontend/tailwind.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('tailwindcss').Config} */ 2 | export default { 3 | content: [ 4 | "./index.html", 5 | "./src/**/*.{js,ts,jsx,tsx}", 6 | ], 7 | darkMode: 'class', 8 | theme: { 9 | extend: { 10 | colors: { 11 | primary: { 12 | 50: '#eff6ff', 13 | 500: '#3b82f6', 14 | 600: '#2563eb', 15 | 700: '#1d4ed8', 16 | }, 17 | success: { 18 | 50: '#f0fdf4', 19 | 500: '#22c55e', 20 | 600: '#16a34a', 21 | }, 22 | error: { 23 | 50: '#fef2f2', 24 | 500: '#ef4444', 25 | 600: '#dc2626', 26 | } 27 | }, 28 | animation: { 29 | 'fade-in': 'fadeIn 0.5s ease-in-out', 30 | 'slide-up': 'slideUp 0.3s ease-out', 31 | }, 32 | keyframes: { 33 | fadeIn: { 34 | '0%': { opacity: '0' }, 35 | '100%': { opacity: '1' }, 36 | }, 37 | slideUp: { 38 | '0%': { transform: 'translateY(10px)', opacity: '0' }, 39 | '100%': { transform: 'translateY(0)', opacity: '1' }, 40 | } 41 | } 42 | }, 43 | }, 44 | plugins: [], 45 | } -------------------------------------------------------------------------------- /frontend/src/components/ui/Button.jsx: -------------------------------------------------------------------------------- 1 | import clsx from 'clsx'; 2 | 3 | export const Button = ({ 4 | children, 5 | variant = 'primary', 6 | size = 'md', 7 | className, 8 | ...props 9 | }) => { 10 | const baseClasses = 'font-medium rounded-lg transition-colors focus:outline-none focus:ring-2 focus:ring-offset-2'; 11 | 12 | const variants = { 13 | primary: 'bg-primary-600 text-white hover:bg-primary-700 focus:ring-primary-500', 14 | secondary: 'bg-gray-200 text-gray-900 hover:bg-gray-300 focus:ring-gray-500 dark:bg-gray-700 dark:text-gray-100 dark:hover:bg-gray-600', 15 | success: 'bg-success-600 text-white hover:bg-success-700 focus:ring-success-500', 16 | danger: 'bg-error-600 text-white hover:bg-error-700 focus:ring-error-500', 17 | ghost: 'hover:bg-gray-100 dark:hover:bg-gray-700 text-gray-700 dark:text-gray-300' 18 | }; 19 | 20 | const sizes = { 21 | sm: 'px-3 py-1.5 text-sm', 22 | md: 'px-4 py-2 text-base', 23 | lg: 'px-6 py-3 text-lg' 24 | }; 25 | 26 | return ( 27 | 38 | ); 39 | }; 
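A minimal usage sketch for the `Button` component above — `variant` and `size` take the values declared in `Button.jsx`, extra props such as `onClick` pass through to the rendered element, and the import path assumes a caller under `src/`; the handlers and labels here are purely illustrative:

```jsx
import { Button } from './components/ui/Button';

// Illustrative only: a primary action next to a small ghost action.
export const ExampleActions = ({ onRun, onReset }) => (
  <div className="flex gap-2">
    <Button variant="primary" size="md" onClick={onRun}>
      Run evaluation
    </Button>
    <Button variant="ghost" size="sm" onClick={onReset}>
      Reset
    </Button>
  </div>
);
```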
-------------------------------------------------------------------------------- /frontend/vite.config.js: -------------------------------------------------------------------------------- 1 | /// 2 | import { defineConfig } from 'vite' 3 | import react from '@vitejs/plugin-react' 4 | import fs from 'fs' 5 | import path from 'path' 6 | 7 | export default defineConfig({ 8 | plugins: [ 9 | react(), 10 | { 11 | name: 'configure-server', 12 | configureServer(server) { 13 | server.middlewares.use('/api/json', (req, res, next) => { 14 | if (req.method === 'GET' && req.url) { 15 | const filename = req.url.substring(1); // Remove leading slash 16 | const jsonPath = path.resolve(__dirname, '../answers-generated', filename); 17 | 18 | try { 19 | if (fs.existsSync(jsonPath) && filename.endsWith('.json')) { 20 | const content = fs.readFileSync(jsonPath, 'utf-8'); 21 | res.setHeader('Content-Type', 'application/json'); 22 | res.end(content); 23 | return; 24 | } 25 | } catch (error) { 26 | console.error('Error serving JSON file:', error); 27 | } 28 | } 29 | next(); 30 | }); 31 | } 32 | } 33 | ], 34 | test: { 35 | globals: true, 36 | environment: 'jsdom', 37 | css: true 38 | }, 39 | server: { 40 | port: 3000, 41 | proxy: { 42 | '/api': { 43 | target: 'http://localhost:4000', 44 | changeOrigin: true 45 | } 46 | } 47 | } 48 | }) -------------------------------------------------------------------------------- /frontend/src/hooks/useApi.js: -------------------------------------------------------------------------------- 1 | import { useState, useEffect } from 'react'; 2 | 3 | export const useApi = (url) => { 4 | const [data, setData] = useState(null); 5 | const [loading, setLoading] = useState(true); 6 | const [error, setError] = useState(null); 7 | 8 | const fetchData = async () => { 9 | try { 10 | setLoading(true); 11 | const response = await fetch(url); 12 | if (!response.ok) { 13 | throw new Error(`HTTP error! status: ${response.status}`); 14 | } 15 | const result = await response.json(); 16 | setData(result); 17 | } catch (err) { 18 | setError(err.message); 19 | } finally { 20 | setLoading(false); 21 | } 22 | }; 23 | 24 | useEffect(() => { 25 | if (url) { 26 | fetchData(); 27 | } 28 | }, [url]); 29 | 30 | return { data, loading, error, refetch: fetchData }; 31 | }; 32 | 33 | export const useJsonFile = (filename) => { 34 | const [data, setData] = useState(null); 35 | const [loading, setLoading] = useState(true); 36 | const [error, setError] = useState(null); 37 | 38 | const fetchJsonFile = async () => { 39 | if (!filename) { 40 | setLoading(false); 41 | return; 42 | } 43 | 44 | try { 45 | setLoading(true); 46 | const response = await fetch(`/api/json/${filename}`); 47 | if (!response.ok) { 48 | throw new Error(`HTTP error! 
status: ${response.status}`); 49 | } 50 | const result = await response.json(); 51 | setData(result); 52 | } catch (err) { 53 | setError(err.message); 54 | } finally { 55 | setLoading(false); 56 | } 57 | }; 58 | 59 | useEffect(() => { 60 | fetchJsonFile(); 61 | }, [filename]); 62 | 63 | return { data, loading, error, refetch: fetchJsonFile }; 64 | }; -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "llm-eval-simple" 3 | version = "0.2.4" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.13" 7 | dependencies = [ 8 | "pydantic>=2.12.4", 9 | "python-dotenv>=1.1.1", 10 | "requests>=2.32.5", 11 | "tabulate>=0.9.0", 12 | ] 13 | 14 | [dependency-groups] 15 | dev = [ 16 | "pytest>=8.4.2", 17 | "pytest-asyncio>=1.2.0", 18 | "pytest-cov>=7.0.0", 19 | "pytest-mock>=3.15.1", 20 | ] 21 | 22 | [tool.pytest.ini_options] 23 | minversion = "6.0" 24 | addopts = [ 25 | "--strict-markers", 26 | "--strict-config", 27 | "--verbose", 28 | "--tb=short", 29 | "--cov=.", 30 | "--cov-report=term-missing", 31 | "--cov-report=html:htmlcov", 32 | "--cov-report=xml", 33 | "--cov-fail-under=80", 34 | ] 35 | testpaths = ["tests"] 36 | python_files = ["test_*.py", "*_test.py"] 37 | python_classes = ["Test*"] 38 | python_functions = ["test_*"] 39 | pythonpath = ["."] 40 | markers = [ 41 | "unit: mark test as a unit test", 42 | "integration: mark test as an integration test", 43 | "slow: mark test as slow running", 44 | ] 45 | filterwarnings = [ 46 | "error", 47 | "ignore::UserWarning", 48 | "ignore::DeprecationWarning", 49 | ] 50 | 51 | [tool.coverage.run] 52 | source = ["."] 53 | omit = [ 54 | "tests/*", 55 | "*/venv/*", 56 | "*/.venv/*", 57 | "*/site-packages/*", 58 | "*/dist/*", 59 | "*/build/*", 60 | ] 61 | 62 | [tool.coverage.report] 63 | exclude_lines = [ 64 | "pragma: no cover", 65 | "def __repr__", 66 | "if self.debug:", 67 | "if settings.DEBUG", 68 | "raise AssertionError", 69 | "raise NotImplementedError", 70 | "if 0:", 71 | "if __name__ == .__main__.:", 72 | "class .*\\bProtocol\\):", 73 | "@(abc\\.)?abstractmethod", 74 | ] 75 | -------------------------------------------------------------------------------- /frontend/src/components/ErrorBoundary.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | 3 | export class ErrorBoundary extends React.Component { 4 | constructor(props) { 5 | super(props); 6 | this.state = { hasError: false, error: null, errorInfo: null }; 7 | } 8 | 9 | static getDerivedStateFromError() { 10 | return { hasError: true }; 11 | } 12 | 13 | componentDidCatch(error, errorInfo) { 14 | this.setState({ 15 | error: error, 16 | errorInfo: errorInfo 17 | }); 18 | 19 | console.error('Error caught by boundary:', error, errorInfo); 20 | } 21 | 22 | render() { 23 | if (this.state.hasError) { 24 | return ( 25 |
26 |
27 |

28 | Something went wrong 29 |

30 |
31 | 32 | Error details 33 | 34 |
35 |                 {this.state.error && this.state.error.toString()}
36 |                 
37 | {this.state.errorInfo && this.state.errorInfo.componentStack} 38 |
39 |
40 | 46 |
47 |
48 | ); 49 | } 50 | 51 | return this.props.children; 52 | } 53 | } -------------------------------------------------------------------------------- /frontend/src/styles/globals.css: -------------------------------------------------------------------------------- 1 | @tailwind base; 2 | @tailwind components; 3 | @tailwind utilities; 4 | 5 | @layer base { 6 | :root { 7 | --background: 0 0% 100%; 8 | --foreground: 222.2 84% 4.9%; 9 | } 10 | 11 | .dark { 12 | --background: 222.2 84% 4.9%; 13 | --foreground: 210 40% 98%; 14 | } 15 | } 16 | 17 | @layer components { 18 | .btn { 19 | @apply px-4 py-2 rounded-lg font-medium transition-colors focus:outline-none focus:ring-2 focus:ring-offset-2; 20 | } 21 | 22 | .btn-primary { 23 | @apply bg-primary-600 text-white hover:bg-primary-700 focus:ring-primary-500; 24 | } 25 | 26 | .btn-secondary { 27 | @apply bg-gray-200 text-gray-900 hover:bg-gray-300 focus:ring-gray-500 dark:bg-gray-700 dark:text-gray-100 dark:hover:bg-gray-600; 28 | } 29 | 30 | .card { 31 | @apply bg-white dark:bg-gray-800 rounded-lg shadow-lg border border-gray-200 dark:border-gray-700; 32 | } 33 | 34 | .table-cell { 35 | @apply px-4 py-3 text-sm border-b border-gray-200 dark:border-gray-700; 36 | } 37 | 38 | .progress-bar { 39 | @apply w-full bg-gray-200 dark:bg-gray-700 rounded-full h-2; 40 | } 41 | 42 | .progress-fill { 43 | @apply bg-success-500 h-2 rounded-full transition-all duration-500; 44 | } 45 | } 46 | 47 | /* Custom scrollbar styles */ 48 | ::-webkit-scrollbar { 49 | width: 8px; 50 | height: 8px; 51 | } 52 | 53 | ::-webkit-scrollbar-track { 54 | background: #f1f1f1; 55 | border-radius: 4px; 56 | } 57 | 58 | ::-webkit-scrollbar-thumb { 59 | background: #c1c1c1; 60 | border-radius: 4px; 61 | } 62 | 63 | ::-webkit-scrollbar-thumb:hover { 64 | background: #a8a8a8; 65 | } 66 | 67 | /* Dark mode scrollbar styles */ 68 | .dark ::-webkit-scrollbar-track { 69 | background: #374151; 70 | } 71 | 72 | .dark ::-webkit-scrollbar-thumb { 73 | background: #6b7280; 74 | } 75 | 76 | .dark ::-webkit-scrollbar-thumb:hover { 77 | background: #9ca3af; 78 | } -------------------------------------------------------------------------------- /AGENTS.md: -------------------------------------------------------------------------------- 1 | # AGENTS.md 2 | 3 | ## Commands 4 | - **Run main script**: `uv run python main.py` 5 | - **Run specific actions**: `uv run python main.py --actions answer,evaluate,serve --pattern "prompts/*"` 6 | - **Start web server**: `uv run python main.py --actions serve` 7 | - **Start API server**: `uv run python api_server.py` (runs on port 4000) 8 | - **Start web UI**: `cd frontend && npm run dev` (runs on port 3000) 9 | - **Install dependencies**: `uv sync` 10 | - **Add dependency**: `uv add ` 11 | - **Remove dependency**: `uv remove ` 12 | 13 | ## Testing Commands 14 | ### Backend Testing 15 | - **Run all backend tests**: `uv run pytest` 16 | - **Run tests with coverage**: `uv run pytest --cov=. 
--cov-report=html` 17 | - **Run specific test file**: `uv run pytest tests/unit/test_main.py` 18 | - **Run tests with verbose output**: `uv run pytest -v` 19 | - **Run only unit tests**: `uv run pytest -m unit` 20 | - **Run only integration tests**: `uv run pytest -m integration` 21 | 22 | ### Frontend Testing 23 | - **Run all frontend tests**: `cd frontend && npm run test` 24 | - **Run tests once**: `cd frontend && npm run test:run` 25 | - **Run tests with coverage**: `cd frontend && npm run test:coverage` 26 | - **Run tests with UI**: `cd frontend && npm run test:ui` 27 | - **Run tests in watch mode**: `cd frontend && npm run test:watch` 28 | 29 | ## Code Style 30 | - **Package Manager**: Use `uv` (not pip) 31 | - **Execution**: Use `uv run` for all Python commands 32 | - **Dependencies**: Define in `pyproject.toml`, use `uv.lock` for reproducibility 33 | - **Python Version**: Requires Python 3.13+ 34 | - **Structure**: Use dataclasses for config, separate functions clearly 35 | - **Error Handling**: Use try/except for API requests, return None/False on failure 36 | - **Type Hints**: Use typing module (List, Dict, Any) 37 | - **Imports**: Standard library first, then third-party 38 | - **File Encoding**: Always UTF-8 39 | - **Constants**: Define at module level in UPPER_CASE 40 | - **Virtual Environment**: `uv` manages automatically - no manual venv needed 41 | - **Python Execution**: Always use `uv run python` 42 | - **Language**: All code and comments must be in English -------------------------------------------------------------------------------- /frontend/src/components/Charts/ResponseTimeChart.jsx: -------------------------------------------------------------------------------- 1 | import { Bar } from 'react-chartjs-2'; 2 | import { Chart as ChartJS, CategoryScale, LinearScale, BarElement, Title, Tooltip, Legend } from 'chart.js'; 3 | import { calculateModelSummary } from '../../utils/dataProcessing'; 4 | 5 | ChartJS.register(CategoryScale, LinearScale, BarElement, Title, Tooltip, Legend); 6 | 7 | export const ResponseTimeChart = ({ results }) => { 8 | const modelSummary = calculateModelSummary(results); 9 | 10 | const chartData = { 11 | labels: Object.keys(modelSummary), 12 | datasets: [ 13 | { 14 | label: 'Avg Response Time (s)', 15 | data: Object.values(modelSummary).map(stats => { 16 | const avgTime = stats.total > 0 ? stats.totalTime / stats.total : 0; 17 | return parseFloat(avgTime.toFixed(2)); 18 | }), 19 | backgroundColor: 'rgba(59, 130, 246, 0.8)', 20 | borderColor: 'rgba(59, 130, 246, 1)', 21 | borderWidth: 1, 22 | }, 23 | ], 24 | }; 25 | 26 | const options = { 27 | responsive: true, 28 | maintainAspectRatio: false, 29 | plugins: { 30 | legend: { 31 | display: false, 32 | }, 33 | title: { 34 | display: true, 35 | text: 'Average Response Time Comparison', 36 | color: 'rgb(55, 65, 81)', 37 | font: { 38 | size: 16, 39 | weight: 'bold', 40 | }, 41 | }, 42 | tooltip: { 43 | callbacks: { 44 | label: (context) => { 45 | return `Avg Time: ${context.parsed.y}s`; 46 | }, 47 | }, 48 | }, 49 | }, 50 | scales: { 51 | y: { 52 | beginAtZero: true, 53 | ticks: { 54 | callback: (value) => `${value}s`, 55 | }, 56 | grid: { 57 | color: 'rgba(0, 0, 0, 0.1)', 58 | }, 59 | }, 60 | x: { 61 | grid: { 62 | display: false, 63 | }, 64 | }, 65 | }, 66 | }; 67 | 68 | return ( 69 |
70 |
71 | 72 |
73 |
74 | ); 75 | }; -------------------------------------------------------------------------------- /frontend/src/components/Charts/AccuracyChart.jsx: -------------------------------------------------------------------------------- 1 | import { Bar } from 'react-chartjs-2'; 2 | import { Chart as ChartJS, CategoryScale, LinearScale, BarElement, Title, Tooltip, Legend } from 'chart.js'; 3 | import { calculateModelSummary } from '../../utils/dataProcessing'; 4 | 5 | ChartJS.register(CategoryScale, LinearScale, BarElement, Title, Tooltip, Legend); 6 | 7 | export const AccuracyChart = ({ results }) => { 8 | const modelSummary = calculateModelSummary(results); 9 | 10 | const chartData = { 11 | labels: Object.keys(modelSummary), 12 | datasets: [ 13 | { 14 | label: 'Accuracy %', 15 | data: Object.values(modelSummary).map(stats => { 16 | const accuracy = stats.total > 0 ? (stats.correct / stats.total) * 100 : 0; 17 | return parseFloat(accuracy.toFixed(1)); 18 | }), 19 | backgroundColor: 'rgba(34, 197, 94, 0.8)', 20 | borderColor: 'rgba(34, 197, 94, 1)', 21 | borderWidth: 1, 22 | }, 23 | ], 24 | }; 25 | 26 | const options = { 27 | responsive: true, 28 | maintainAspectRatio: false, 29 | plugins: { 30 | legend: { 31 | display: false, 32 | }, 33 | title: { 34 | display: true, 35 | text: 'Model Accuracy Comparison', 36 | color: 'rgb(55, 65, 81)', 37 | font: { 38 | size: 16, 39 | weight: 'bold', 40 | }, 41 | }, 42 | tooltip: { 43 | callbacks: { 44 | label: (context) => { 45 | return `Accuracy: ${context.parsed.y}%`; 46 | }, 47 | }, 48 | }, 49 | }, 50 | scales: { 51 | y: { 52 | beginAtZero: true, 53 | max: 100, 54 | ticks: { 55 | callback: (value) => `${value}%`, 56 | }, 57 | grid: { 58 | color: 'rgba(0, 0, 0, 0.1)', 59 | }, 60 | }, 61 | x: { 62 | grid: { 63 | display: false, 64 | }, 65 | }, 66 | }, 67 | }; 68 | 69 | return ( 70 |
71 |
72 | 73 |
74 |
75 | ); 76 | }; -------------------------------------------------------------------------------- /frontend/src/utils/heatmapUtils.js: -------------------------------------------------------------------------------- 1 | export const getHeatmapColor = (correct, responseTime, minTime, maxTime, isDarkMode) => { 2 | // Normalize response time between 0 and 1 3 | const normalizedTime = maxTime > minTime 4 | ? (responseTime - minTime) / (maxTime - minTime) 5 | : 0.5; 6 | 7 | // Color intensity based on correctness and speed 8 | // Fast results = darker colors, Slow results = lighter colors 9 | // Correct + Fast = Dark Green 10 | // Correct + Slow = Light Green 11 | // Incorrect + Fast = Dark Red 12 | // Incorrect + Slow = Light Red 13 | 14 | if (correct) { 15 | // Green colors 16 | if (normalizedTime < 0.5) { 17 | // Fast and correct - dark green 18 | return { 19 | bg: isDarkMode ? 'rgb(22, 163, 74)' : 'rgb(34, 197, 94)', // green-600/500 20 | hover: isDarkMode ? 'rgb(21, 128, 61)' : 'rgb(22, 163, 74)', // green-700/600 (darker on hover) 21 | text: 'white', 22 | intensity: 'high' 23 | }; 24 | } else { 25 | // Slow and correct - light green 26 | return { 27 | bg: isDarkMode ? 'rgb(134, 239, 172)' : 'rgb(187, 247, 208)', // green-300/200 28 | hover: isDarkMode ? 'rgb(74, 222, 128)' : 'rgb(134, 239, 172)', // green-400/300 (darker on hover) 29 | text: isDarkMode ? 'rgb(21, 128, 61)' : 'rgb(21, 128, 61)', // green-800 30 | intensity: 'low' 31 | }; 32 | } 33 | } else { 34 | // Red colors 35 | if (normalizedTime < 0.5) { 36 | // Fast but wrong - dark red 37 | return { 38 | bg: isDarkMode ? 'rgb(185, 28, 28)' : 'rgb(220, 38, 38)', // red-700/600 39 | hover: isDarkMode ? 'rgb(153, 27, 27)' : 'rgb(185, 28, 28)', // red-800/700 (darker on hover) 40 | text: 'white', 41 | intensity: 'high' 42 | }; 43 | } else { 44 | // Slow and wrong - light red 45 | return { 46 | bg: isDarkMode ? 'rgb(252, 165, 165)' : 'rgb(254, 202, 202)', // red-300/200 47 | hover: isDarkMode ? 'rgb(248, 113, 113)' : 'rgb(252, 165, 165)', // red-400/300 (darker on hover) 48 | text: isDarkMode ? 
'rgb(127, 29, 29)' : 'rgb(127, 29, 29)', // red-800 49 | intensity: 'low' 50 | }; 51 | } 52 | } 53 | }; 54 | 55 | export const getResponseTimeStats = (results) => { 56 | if (!results || results.length === 0) { 57 | return { min: 0, max: 1, avg: 0 }; 58 | } 59 | 60 | const times = results 61 | .map(r => r.response_time) 62 | .filter(time => typeof time === 'number' && time > 0); 63 | 64 | if (times.length === 0) { 65 | return { min: 0, max: 1, avg: 0 }; 66 | } 67 | 68 | const min = Math.min(...times); 69 | const max = Math.max(...times); 70 | const avg = times.reduce((sum, time) => sum + time, 0) / times.length; 71 | 72 | return { min, max, avg }; 73 | }; -------------------------------------------------------------------------------- /frontend/src/components/ui/Modal.jsx: -------------------------------------------------------------------------------- 1 | import { X, ChevronLeft, ChevronRight } from 'lucide-react'; 2 | import { useEffect } from 'react'; 3 | 4 | export const Modal = ({ isOpen, onClose, title, children, showNavigation = false, onPrev = () => {}, onNext = () => {}, navLabel = null }) => { 5 | useEffect(() => { 6 | const handleKeyDown = (e) => { 7 | if (e.key === 'Escape' && isOpen) { 8 | onClose(); 9 | } else if (showNavigation && isOpen) { 10 | if (e.key === 'ArrowLeft') { 11 | onPrev(); 12 | } else if (e.key === 'ArrowRight') { 13 | onNext(); 14 | } 15 | } 16 | }; 17 | 18 | document.addEventListener('keydown', handleKeyDown); 19 | return () => document.removeEventListener('keydown', handleKeyDown); 20 | }, [isOpen, onClose, showNavigation, onPrev, onNext]); 21 | 22 | return isOpen ? ( 23 |
24 |
25 |
26 |
27 | 35 | 43 | {navLabel && ( 44 | 45 | {navLabel} 46 | 47 | )} 48 |

49 | {title} 50 |

51 |
52 | 58 |
59 |
60 | {children} 61 |
62 |
63 |
64 | ) : null; 65 | }; -------------------------------------------------------------------------------- /frontend/src/components/Charts/ModelComparisonChart.jsx: -------------------------------------------------------------------------------- 1 | import { Line } from 'react-chartjs-2'; 2 | import { Chart as ChartJS, CategoryScale, LinearScale, PointElement, LineElement, Title, Tooltip, Legend } from 'chart.js'; 3 | import { getUniquePromptsAndModels } from '../../utils/dataProcessing'; 4 | 5 | ChartJS.register(CategoryScale, LinearScale, PointElement, LineElement, Title, Tooltip, Legend); 6 | 7 | export const ModelComparisonChart = ({ results }) => { 8 | const { prompts, models } = getUniquePromptsAndModels(results); 9 | 10 | const datasets = models.map((model, index) => { 11 | const colors = [ 12 | 'rgb(34, 197, 94)', 13 | 'rgb(59, 130, 246)', 14 | 'rgb(239, 68, 68)', 15 | 'rgb(245, 158, 11)', 16 | 'rgb(139, 92, 246)', 17 | 'rgb(236, 72, 153)', 18 | ]; 19 | 20 | const modelResults = prompts.map(prompt => { 21 | const result = results.find(r => r.model === model && r.file === prompt); 22 | return result ? (result.correct ? 1 : 0) : null; 23 | }); 24 | 25 | return { 26 | label: model, 27 | data: modelResults, 28 | borderColor: colors[index % colors.length], 29 | backgroundColor: colors[index % colors.length] + '20', 30 | tension: 0.1, 31 | pointRadius: 4, 32 | pointHoverRadius: 6, 33 | }; 34 | }); 35 | 36 | const chartData = { 37 | labels: prompts, 38 | datasets, 39 | }; 40 | 41 | const options = { 42 | responsive: true, 43 | maintainAspectRatio: false, 44 | interaction: { 45 | mode: 'index', 46 | intersect: false, 47 | }, 48 | plugins: { 49 | legend: { 50 | position: 'top', 51 | }, 52 | title: { 53 | display: true, 54 | text: 'Model Performance Across Prompts', 55 | color: 'rgb(55, 65, 81)', 56 | font: { 57 | size: 16, 58 | weight: 'bold', 59 | }, 60 | }, 61 | tooltip: { 62 | callbacks: { 63 | label: (context) => { 64 | const value = context.parsed.y; 65 | return `${context.dataset.label}: ${value === 1 ? 'Correct' : value === 0 ? 'Incorrect' : 'No data'}`; 66 | }, 67 | }, 68 | }, 69 | }, 70 | scales: { 71 | y: { 72 | beginAtZero: true, 73 | max: 1.2, 74 | ticks: { 75 | stepSize: 1, 76 | callback: (value) => { 77 | if (value === 1) return 'Correct'; 78 | if (value === 0) return 'Incorrect'; 79 | return ''; 80 | }, 81 | }, 82 | grid: { 83 | color: 'rgba(0, 0, 0, 0.1)', 84 | }, 85 | }, 86 | x: { 87 | grid: { 88 | display: false, 89 | }, 90 | ticks: { 91 | maxRotation: 45, 92 | minRotation: 45, 93 | }, 94 | }, 95 | }, 96 | }; 97 | 98 | return ( 99 |
100 |
101 | 102 |
103 |
104 | ); 105 | }; -------------------------------------------------------------------------------- /frontend/src/hooks/useFilters.js: -------------------------------------------------------------------------------- 1 | import { useState, useMemo, useEffect } from 'react'; 2 | 3 | export const useFilters = (data) => { 4 | const [selectedModels, setSelectedModels] = useState([]); 5 | const [selectedFiles, setSelectedFiles] = useState([]); 6 | const [showCorrectOnly, setShowCorrectOnly] = useState(false); 7 | const [showIncorrectOnly, setShowIncorrectOnly] = useState(false); 8 | 9 | // Extract unique models from data 10 | const modelOptions = useMemo(() => { 11 | if (!data || !data.results) return []; 12 | const models = [...new Set(data.results.map(item => item.model))]; 13 | return models; 14 | }, [data]); 15 | 16 | // Extract unique files from data 17 | const fileOptions = useMemo(() => { 18 | if (!data || !data.results) return []; 19 | const files = [...new Set(data.results.map(item => item.file))]; 20 | return files; 21 | }, [data]); 22 | 23 | // Initialize selections on first load 24 | const [initialized, setInitialized] = useState(false); 25 | 26 | useEffect(() => { 27 | if (data && data.results && !initialized) { 28 | setSelectedModels(modelOptions); 29 | setSelectedFiles(fileOptions); 30 | setInitialized(true); 31 | } 32 | }, [data, modelOptions, fileOptions, initialized]); 33 | 34 | const handleModelChange = (models) => { 35 | setSelectedModels(models); 36 | }; 37 | 38 | const handleFileChange = (files) => { 39 | setSelectedFiles(files); 40 | }; 41 | 42 | const handleCorrectOnlyChange = (show) => { 43 | setShowCorrectOnly(show); 44 | if (show) { 45 | setShowIncorrectOnly(false); 46 | } 47 | }; 48 | 49 | const handleIncorrectOnlyChange = (show) => { 50 | setShowIncorrectOnly(show); 51 | if (show) { 52 | setShowCorrectOnly(false); 53 | } 54 | }; 55 | 56 | const clearFilters = () => { 57 | setSelectedModels(modelOptions); 58 | setSelectedFiles(fileOptions); 59 | setShowCorrectOnly(false); 60 | setShowIncorrectOnly(false); 61 | }; 62 | 63 | // Apply filters to data 64 | const filteredData = useMemo(() => { 65 | if (!data || !data.results) return []; 66 | 67 | return data.results.filter(item => { 68 | // Model filter 69 | if (selectedModels.length > 0 && !selectedModels.includes(item.model)) { 70 | return false; 71 | } 72 | 73 | // File filter 74 | if (selectedFiles.length > 0 && !selectedFiles.includes(item.file)) { 75 | return false; 76 | } 77 | 78 | // Correctness filter 79 | if (showCorrectOnly && !item.correct) return false; 80 | if (showIncorrectOnly && item.correct) return false; 81 | 82 | return true; 83 | }); 84 | }, [data, selectedModels, selectedFiles, showCorrectOnly, showIncorrectOnly]); 85 | 86 | return { 87 | selectedModels, 88 | selectedFiles, 89 | showCorrectOnly, 90 | showIncorrectOnly, 91 | modelOptions, 92 | fileOptions, 93 | handleModelChange, 94 | handleFileChange, 95 | handleCorrectOnlyChange, 96 | handleIncorrectOnlyChange, 97 | clearFilters, 98 | filteredData 99 | }; 100 | }; -------------------------------------------------------------------------------- /start-dashboard.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Start script for LLM Eval Dashboard 4 | 5 | echo "🚀 Starting LLM Eval Dashboard..." 6 | 7 | # Check if we're in the right directory 8 | if [ ! -f "api_server.py" ]; then 9 | echo "❌ Error: api_server.py not found. Please run from project root." 
10 | exit 1 11 | fi 12 | 13 | # Kill any existing processes on ports 4000 and 3000 14 | echo "🔄 Cleaning up existing processes..." 15 | pkill -f "python api_server.py" 2>/dev/null || true 16 | pkill -f "vite" 2>/dev/null || true 17 | 18 | # More aggressive port cleanup 19 | echo "🔧 Ensuring ports are free..." 20 | lsof -ti:4000 | xargs kill -9 2>/dev/null || true 21 | lsof -ti:3000 | xargs kill -9 2>/dev/null || true 22 | 23 | # Wait for sockets to be released 24 | echo "⏳ Waiting for sockets to be released..." 25 | sleep 3 26 | 27 | # Double-check ports are really free 28 | for port in 4000 3000; do 29 | if lsof -i :$port >/dev/null 2>&1; then 30 | echo "⚠️ Port $port still in use, forcing cleanup..." 31 | lsof -ti:$port | xargs kill -9 2>/dev/null || true 32 | sleep 2 33 | fi 34 | done 35 | 36 | # Start API server with retry mechanism 37 | echo "🔧 Starting API server on port 4000..." 38 | for attempt in {1..3}; do 39 | uv run python api_server.py & 40 | API_PID=$! 41 | 42 | # Verify the process actually started 43 | sleep 2 44 | if kill -0 $API_PID 2>/dev/null; then 45 | echo "✅ API server process started (attempt $attempt)" 46 | break 47 | else 48 | echo "⚠️ API server failed to start (attempt $attempt)" 49 | if [ $attempt -eq 3 ]; then 50 | echo "❌ API server failed to start after 3 attempts" 51 | exit 1 52 | fi 53 | echo "🔄 Waiting before retry..." 54 | sleep 2 55 | fi 56 | done 57 | 58 | # Wait for API server to start and verify it's responding 59 | echo "⏳ Waiting for API server to be ready..." 60 | for i in {1..15}; do 61 | if curl -s http://localhost:4000/api/results > /dev/null 2>&1; then 62 | echo "✅ API server is ready!" 63 | break 64 | fi 65 | if [ $i -eq 15 ]; then 66 | echo "❌ API server failed to start after 15 seconds" 67 | echo "🔍 Checking what's using port 4000:" 68 | lsof -i :4000 || echo "Port 4000 appears to be free" 69 | kill $API_PID 2>/dev/null || true 70 | exit 1 71 | fi 72 | sleep 1 73 | done 74 | 75 | # Start frontend 76 | echo "🎨 Starting frontend on port 3000..." 77 | cd frontend 78 | npm run dev & 79 | FRONTEND_PID=$! 80 | 81 | echo "" 82 | echo "✅ Dashboard started successfully!" 83 | echo "" 84 | echo "📊 Frontend: http://localhost:3000" 85 | echo "🔌 API: http://localhost:4000/api/results" 86 | echo "" 87 | echo "Press Ctrl+C to stop both servers" 88 | echo "" 89 | 90 | # Function to cleanup on exit 91 | cleanup() { 92 | echo "" 93 | echo "🛑 Stopping servers..." 94 | kill $API_PID 2>/dev/null || true 95 | kill $FRONTEND_PID 2>/dev/null || true 96 | echo "✅ Servers stopped" 97 | exit 0 98 | } 99 | 100 | # Set up signal handlers 101 | trap cleanup SIGINT SIGTERM 102 | 103 | # Wait for processes 104 | wait -------------------------------------------------------------------------------- /frontend/src/components/SummaryTable.jsx: -------------------------------------------------------------------------------- 1 | import { calculateModelSummary, formatAccuracy, formatResponseTime } from '../utils/dataProcessing'; 2 | 3 | export const SummaryTable = ({ results }) => { 4 | const modelSummary = calculateModelSummary(results); 5 | 6 | return ( 7 |
8 |
9 |

10 | Model Performance Summary 11 |

12 |
13 | 14 | 15 | 16 | 19 | 22 | 25 | 26 | 27 | 28 | {Object.entries(modelSummary).map(([model, stats]) => { 29 | const total = stats.total; 30 | const correct = stats.correct; 31 | const totalTime = stats.totalTime; 32 | const accuracy = total > 0 ? (correct / total) * 100 : 0; 33 | const avgTime = total > 0 ? totalTime / total : 0; 34 | 35 | return ( 36 | 40 | 45 | 60 | 65 | 66 | ); 67 | })} 68 | 69 |
17 | Model 18 | 20 | Correct 21 | 23 | Avg Response Time 24 |
41 | 42 | {model} 43 | 44 | 46 |
47 | 48 | {correct}/{total} ({formatAccuracy(correct, total)}) 49 | 50 |
51 |
52 |
56 |
57 |
58 |
59 |
61 | 62 | {formatResponseTime(avgTime)} 63 | 64 |
70 |
71 |
72 |
73 | ); 74 | }; -------------------------------------------------------------------------------- /reporting.py: -------------------------------------------------------------------------------- 1 | """Reporting and formatting functions for LLM evaluation results.""" 2 | 3 | from typing import Any, Dict, List, Tuple 4 | from tabulate import tabulate 5 | 6 | from shared import ( 7 | calculate_model_summary, 8 | create_progress_bar, 9 | format_accuracy, 10 | format_response_time, 11 | get_unique_prompts_and_models 12 | ) 13 | 14 | 15 | def create_detailed_table_data(results: List[Dict[str, Any]]) -> List[List[str]]: 16 | """Create data for the detailed results table.""" 17 | return [ 18 | [r["model"], r["file"], "correct" if r["correct"] else "wrong", format_response_time(r["response_time"]), r.get("note", "")] 19 | for r in results 20 | ] 21 | 22 | 23 | def create_matrix_table_data(results: List[Dict[str, Any]]) -> Tuple[List[str], List[List[str]]]: 24 | """Create data for the matrix table (prompts as columns, models as rows).""" 25 | prompts, models = get_unique_prompts_and_models(results) 26 | 27 | header = ["Model"] + prompts 28 | table_data = [] 29 | 30 | for model in models: 31 | row = [model] 32 | for prompt in prompts: 33 | found = False 34 | for r in results: 35 | if r["model"] == model and r["file"] == prompt: 36 | row.append("correct" if r["correct"] else "wrong") 37 | found = True 38 | break 39 | if not found: 40 | row.append("unavailable") 41 | table_data.append(row) 42 | 43 | return header, table_data 44 | 45 | 46 | def create_summary_table_data(results: List[Dict[str, Any]]) -> List[List[str]]: 47 | """Create data for the model performance summary table.""" 48 | model_summary = calculate_model_summary(results) 49 | summary_table = [] 50 | 51 | for model, stats in model_summary.items(): 52 | total = stats["total"] 53 | correct = stats["correct"] 54 | total_time = stats["total_time"] 55 | 56 | accuracy = (correct / total) * 100 if total > 0 else 0 57 | avg_time = total_time / total if total > 0 else 0 58 | 59 | bar = create_progress_bar(accuracy) 60 | 61 | summary_table.append([ 62 | model, 63 | f"{correct}/{total} ({format_accuracy(correct, total)}) [{bar}]", 64 | format_response_time(avg_time) 65 | ]) 66 | 67 | return summary_table 68 | 69 | 70 | def format_detailed_table(results: List[Dict[str, Any]]) -> str: 71 | """Format the detailed results table.""" 72 | table_data = create_detailed_table_data(results) 73 | return tabulate(table_data, headers=["Model", "File", "Correct", "Response Time", "Note"], tablefmt="fancy_grid") 74 | 75 | 76 | def format_matrix_table(results: List[Dict[str, Any]]) -> str: 77 | """Format the matrix table (prompts as columns, models as rows).""" 78 | header, table_data = create_matrix_table_data(results) 79 | return tabulate(table_data, headers=header, tablefmt="fancy_grid") 80 | 81 | 82 | def format_summary_table(results: List[Dict[str, Any]]) -> str: 83 | """Format the model performance summary table.""" 84 | table_data = create_summary_table_data(results) 85 | return tabulate(table_data, headers=["Model", "Correct", "Avg Response Time"], tablefmt="fancy_grid") 86 | 87 | 88 | def calculate_all_summary_data(results: List[Dict[str, Any]]) -> Dict[str, Any]: 89 | """Calculate all summary data for results.""" 90 | return { 91 | "detailed_table": create_detailed_table_data(results), 92 | "matrix_table": create_matrix_table_data(results), 93 | "summary_table": create_summary_table_data(results), 94 | "model_summary": calculate_model_summary(results), 95 | 
"unique_prompts_and_models": get_unique_prompts_and_models(results) 96 | } -------------------------------------------------------------------------------- /frontend/src/test/Dashboard.test.jsx: -------------------------------------------------------------------------------- 1 | import { render, screen } from '@testing-library/react' 2 | import { describe, it, expect, vi, beforeEach } from 'vitest' 3 | import { BrowserRouter } from 'react-router-dom' 4 | import { Dashboard } from '../components/Dashboard.jsx' 5 | import '@testing-library/jest-dom' 6 | 7 | // Mock localStorage 8 | const localStorageMock = { 9 | getItem: vi.fn(), 10 | setItem: vi.fn(), 11 | removeItem: vi.fn(), 12 | clear: vi.fn(), 13 | }; 14 | global.localStorage = localStorageMock 15 | 16 | // Mock the hooks 17 | vi.mock('../hooks/useApi.js', () => ({ 18 | useApi: () => ({ 19 | data: { 20 | results: [ 21 | { 22 | model: 'test-model', 23 | file: 'test.txt', 24 | correct: true, 25 | response_time: 1.5 26 | } 27 | ], 28 | summary: [ 29 | { 30 | model: 'test-model', 31 | total: 1, 32 | correct: 1, 33 | accuracy: 100.0, 34 | avg_response_time: 1.5 35 | } 36 | ], 37 | matrix: { 38 | prompts: ['test.txt'], 39 | models: ['test-model'], 40 | cells: [ 41 | { 42 | model: 'test-model', 43 | file: 'test.txt', 44 | correct: true, 45 | response_time: 1.5, 46 | is_fastest_correct: true 47 | } 48 | ] 49 | }, 50 | details: { 51 | 'test.txt': { 52 | prompt: 'Test prompt', 53 | expected: 'Expected answer', 54 | models: [ 55 | { 56 | model: 'test-model', 57 | file: 'test.txt', 58 | correct: true, 59 | response_time: 1.5, 60 | generated: 'Generated answer' 61 | } 62 | ] 63 | } 64 | }, 65 | metadata: { 66 | total_results: 1, 67 | models: ['test-model'], 68 | files: ['test.txt'] 69 | } 70 | }, 71 | loading: false, 72 | error: null 73 | }), 74 | useJsonFile: () => ({ 75 | data: null, 76 | loading: false, 77 | error: null 78 | }) 79 | })) 80 | 81 | vi.mock('../hooks/useFilters.js', () => ({ 82 | useFilters: () => ({ 83 | selectedModels: ['test-model'], 84 | selectedFiles: ['test.txt'], 85 | showCorrectOnly: false, 86 | showIncorrectOnly: false, 87 | modelOptions: ['test-model'], 88 | fileOptions: ['test.txt'], 89 | handleModelChange: vi.fn(), 90 | handleFileChange: vi.fn(), 91 | handleCorrectOnlyChange: vi.fn(), 92 | handleIncorrectOnlyChange: vi.fn(), 93 | clearFilters: vi.fn() 94 | }) 95 | })) 96 | 97 | const renderWithRouter = (component) => { 98 | return render( 99 | 100 | {component} 101 | 102 | ) 103 | } 104 | 105 | describe('Dashboard Component', () => { 106 | beforeEach(() => { 107 | vi.clearAllMocks() 108 | }) 109 | 110 | it('renders dashboard title', () => { 111 | renderWithRouter() 112 | 113 | expect(screen.getByText('LLM Eval Dashboard')).toBeInTheDocument() 114 | }) 115 | 116 | it('renders evaluation heatmap', () => { 117 | renderWithRouter() 118 | 119 | expect(screen.getByText('Evaluation Heatmap')).toBeInTheDocument() 120 | expect(screen.getByText('test.txt')).toBeInTheDocument() 121 | expect(screen.getAllByText('test-model')).toHaveLength(2) // Appears in both table and accordion 122 | }) 123 | 124 | it('renders model answers details', () => { 125 | renderWithRouter() 126 | 127 | expect(screen.getByText('Model Answers Details')).toBeInTheDocument() 128 | }) 129 | 130 | it('renders footer', () => { 131 | renderWithRouter() 132 | 133 | expect(screen.getByText('LLM Eval Simple - Interactive Dashboard')).toBeInTheDocument() 134 | }) 135 | 136 | // Note: Tests for loading, error, and no-data states require dynamic mocking 137 | // which 
is complex with vitest. These states are tested indirectly through the App tests. 138 | }) -------------------------------------------------------------------------------- /frontend/README.md: -------------------------------------------------------------------------------- 1 | # LLM Eval Frontend 2 | 3 | Modern React dashboard for LLM evaluation results with interactive charts and responsive design. 4 | 5 | ## Features 6 | 7 | - 🎨 **Modern UI** with Tailwind CSS and dark mode support 8 | - 📊 **Interactive Charts** using Chart.js (accuracy, response time, model comparison) 9 | - 📱 **Mobile Responsive** design that works on all devices 10 | - 🔍 **Advanced Filtering** by model, file, correctness, and response time 11 | - ⚡ **Real-time Updates** with API integration 12 | - 🌙 **Dark Mode** toggle with system preference detection 13 | - 📈 **Performance Metrics** with visual progress bars and color coding 14 | 15 | ## Quick Start 16 | 17 | ### Prerequisites 18 | 19 | - Node.js 16+ and npm 20 | - Python backend server running 21 | 22 | ### Setup 23 | 24 | ```bash 25 | # Navigate to frontend directory 26 | cd frontend 27 | 28 | # Run the setup script 29 | ./setup.sh 30 | 31 | # Or manually: 32 | npm install 33 | npm run build 34 | ``` 35 | 36 | ### Development 37 | 38 | ```bash 39 | # Start development server with hot reload 40 | npm run dev 41 | 42 | # In another terminal, start the backend 43 | cd .. 44 | python api_server.py 45 | 46 | # Visit http://localhost:3000 47 | ``` 48 | 49 | ### Production 50 | 51 | ```bash 52 | # Build for production 53 | npm run build 54 | 55 | # Start the enhanced backend server 56 | python api_server.py 57 | 58 | # Visit http://localhost:8001 59 | ``` 60 | 61 | ## Project Structure 62 | 63 | ``` 64 | frontend/ 65 | ├── src/ 66 | │ ├── components/ # React components 67 | │ │ ├── Charts/ # Chart.js components 68 | │ │ ├── ui/ # Reusable UI components 69 | │ │ ├── Dashboard.jsx # Main dashboard 70 | │ │ ├── SummaryTable.jsx 71 | │ │ ├── ResultsMatrix.jsx 72 | │ │ └── QuestionDetails.jsx 73 | │ ├── hooks/ # Custom React hooks 74 | │ ├── utils/ # Utility functions 75 | │ └── styles/ # Global styles 76 | ├── package.json 77 | ├── vite.config.js 78 | └── tailwind.config.js 79 | ``` 80 | 81 | ## Components 82 | 83 | ### Dashboard 84 | Main component that orchestrates all other components and handles data loading. 
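The full `Dashboard.jsx` is not reproduced here; as a simplified, illustrative sketch (not a copy of the real component), its data-loading pattern built on the `useApi` hook and the `/api/results` endpoint looks roughly like this:

```jsx
import { useApi } from '../hooks/useApi';
import { SummaryTable } from './SummaryTable';
import { AccuracyChart } from './Charts/AccuracyChart';

// Sketch only: fetch the results once and hand them to the display components.
export const Dashboard = () => {
  const { data, loading, error } = useApi('/api/results');

  if (loading) return <p>Loading results…</p>;
  if (error) return <p>Failed to load results: {error}</p>;
  if (!data || !data.results) return <p>No evaluation data yet.</p>;

  return (
    <main>
      <SummaryTable results={data.results} />
      <AccuracyChart results={data.results} />
    </main>
  );
};
```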
85 | 86 | ### Charts 87 | - **AccuracyChart**: Bar chart showing model accuracy percentages 88 | - **ResponseTimeChart**: Bar chart comparing average response times 89 | - **ModelComparisonChart**: Line chart showing performance across all prompts 90 | 91 | ### Data Display 92 | - **SummaryTable**: Model performance summary with progress bars 93 | - **ResultsMatrix**: Interactive grid with color-coded performance 94 | - **QuestionDetails**: Expandable details for each prompt 95 | 96 | ### UI Components 97 | - **DarkModeToggle**: Theme switcher with persistence 98 | - **Modal**: Reusable modal component 99 | - **Button**: Styled button with variants 100 | - **FilterPanel**: Advanced filtering options 101 | 102 | ## Features Explained 103 | 104 | ### Dark Mode 105 | - Automatic system preference detection 106 | - Manual toggle with localStorage persistence 107 | - Smooth transitions between themes 108 | 109 | ### Responsive Design 110 | - Mobile-first approach with Tailwind CSS 111 | - Adaptive layouts for different screen sizes 112 | - Touch-friendly interactions 113 | 114 | ### Interactive Charts 115 | - Hover tooltips with detailed information 116 | - Color-coded data visualization 117 | - Responsive sizing and animations 118 | 119 | ### Advanced Filtering 120 | - Filter by model, prompt file, correctness 121 | - Response time range filtering 122 | - Real-time filter application 123 | 124 | ## API Integration 125 | 126 | The frontend communicates with the backend via REST API: 127 | 128 | - `GET /api/results` - Fetches all evaluation results 129 | - Includes summary statistics, matrix data, and detailed information 130 | 131 | ## Development Notes 132 | 133 | - Uses Vite for fast development and building 134 | - React 18 with modern hooks and patterns 135 | - Chart.js for data visualization 136 | - Tailwind CSS for styling 137 | - Lucide React for icons 138 | 139 | ## Browser Support 140 | 141 | - Chrome/Edge 90+ 142 | - Firefox 88+ 143 | - Safari 14+ 144 | 145 | ## Contributing 146 | 147 | 1. Follow existing code patterns 148 | 2. Use TypeScript for new components (optional) 149 | 3. Test on mobile devices 150 | 4. Ensure dark mode compatibility -------------------------------------------------------------------------------- /frontend/src/utils/dataProcessing.js: -------------------------------------------------------------------------------- 1 | // Utility functions for data processing (ported from Python) 2 | 3 | export const calculateModelSummary = (results) => { 4 | const modelSummary = {}; 5 | for (const r of results) { 6 | const model = r.model; 7 | if (!modelSummary[model]) { 8 | modelSummary[model] = { total: 0, correct: 0, totalTime: 0 }; 9 | } 10 | 11 | modelSummary[model].total += 1; 12 | if (r.correct) { 13 | modelSummary[model].correct += 1; 14 | } 15 | modelSummary[model].totalTime += r.response_time; 16 | } 17 | 18 | return modelSummary; 19 | }; 20 | 21 | export const getUniquePromptsAndModels = (results) => { 22 | const prompts = [...new Set(results.map(r => r.file))].sort(); 23 | const models = [...new Set(results.map(r => r.model))].sort(); 24 | return { prompts, models }; 25 | }; 26 | 27 | export const formatResponseTime = (responseTime) => { 28 | return `${responseTime.toFixed(2)}s`; 29 | }; 30 | 31 | export const formatAccuracy = (correct, total) => { 32 | const accuracy = total > 0 ? 
(correct / total) * 100 : 0; 33 | return `${accuracy.toFixed(1)}%`; 34 | }; 35 | 36 | export const createProgressBar = (accuracy, length = 10) => { 37 | const barLength = Math.floor(accuracy / 10); 38 | return '█'.repeat(barLength) + '░'.repeat(length - barLength); 39 | }; 40 | 41 | export const normalizeTimeValue = (value, minVal, maxVal) => { 42 | const timeRange = maxVal !== minVal ? maxVal - minVal : 1; 43 | return (value - minVal) / timeRange; 44 | }; 45 | 46 | export const interpolateColor = (color1, color2, factor) => { 47 | const r = Math.round(color1[0] + (color2[0] - color1[0]) * factor); 48 | const g = Math.round(color1[1] + (color2[1] - color1[1]) * factor); 49 | const b = Math.round(color1[2] + (color2[2] - color1[2]) * factor); 50 | return `rgb(${r}, ${g}, ${b})`; 51 | }; 52 | 53 | export const findFastestCorrectPerPrompt = (results, prompts) => { 54 | const fastestCorrectPerPrompt = {}; 55 | for (const prompt of prompts) { 56 | let fastestTime = Infinity; 57 | let fastestModel = null; 58 | for (const r of results) { 59 | if (r.file === prompt && r.correct && r.response_time < fastestTime) { 60 | fastestTime = r.response_time; 61 | fastestModel = r.model; 62 | } 63 | } 64 | if (fastestModel) { 65 | fastestCorrectPerPrompt[prompt] = fastestModel; 66 | } 67 | } 68 | return fastestCorrectPerPrompt; 69 | }; 70 | 71 | export const groupResultsByFile = (results) => { 72 | const resultsByFile = {}; 73 | for (const r of results) { 74 | if (!resultsByFile[r.file]) { 75 | resultsByFile[r.file] = { 76 | prompt: r.prompt, 77 | expected: r.expected, 78 | models: [] 79 | }; 80 | } 81 | resultsByFile[r.file].models.push(r); 82 | } 83 | return resultsByFile; 84 | }; 85 | 86 | export const createCellDataDict = (results) => { 87 | const cellDataDict = {}; 88 | for (const r of results) { 89 | const cellId = `${r.model}-${r.file}`; 90 | cellDataDict[cellId] = { 91 | model: r.model, 92 | file: r.file, 93 | generated: r.generated, 94 | response_time: formatResponseTime(r.response_time), 95 | correct: r.correct 96 | }; 97 | } 98 | return cellDataDict; 99 | }; 100 | 101 | // Color constants (ported from Python) 102 | export const GOLD_RGB = [255, 215, 0]; 103 | export const GREEN_RGB = [0, 247, 0]; 104 | export const LIGHT_GREEN_RGB = [245, 255, 245]; 105 | 106 | export const getCellStyle = (result, fastestCorrectPerPrompt, minTime, maxTime, isDarkMode) => { 107 | try { 108 | if (!result) return ''; 109 | 110 | if (typeof result.response_time !== 'number' || isNaN(result.response_time)) { 111 | return 'background-color: #fef3c7;'; // Yellow for invalid data 112 | } 113 | 114 | const normalizedTime = normalizeTimeValue(result.response_time, minTime, maxTime); 115 | const isFastestCorrect = result.correct && 116 | fastestCorrectPerPrompt && 117 | fastestCorrectPerPrompt[result.file] === result.model; 118 | 119 | if (result.correct) { 120 | if (isFastestCorrect) { 121 | const color = interpolateColor(GOLD_RGB, GREEN_RGB, normalizedTime); 122 | return `background-color: ${color}; border: 2px solid #FFD700; box-shadow: 0 0 5px rgba(255, 215, 0, 0.5);`; 123 | } else { 124 | const color = interpolateColor(GREEN_RGB, LIGHT_GREEN_RGB, normalizedTime); 125 | return `background-color: ${color};`; 126 | } 127 | } else { 128 | const lightness = isDarkMode ? 
30 : 70; 129 | const adjustedLightness = lightness + (30 * normalizedTime); 130 | return `background-color: hsl(0, 100%, ${adjustedLightness}%);`; 131 | } 132 | } catch (error) { 133 | console.error('Error in getCellStyle:', error, { result, fastestCorrectPerPrompt, minTime, maxTime, isDarkMode }); 134 | return 'background-color: #fecaca;'; // Red for errors 135 | } 136 | }; -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Test configuration and fixtures.""" 2 | 3 | import pytest 4 | import tempfile 5 | import json 6 | import os 7 | from pathlib import Path 8 | from typing import Dict, Any, List 9 | 10 | from unittest.mock import Mock 11 | 12 | 13 | @pytest.fixture 14 | def temp_dir(): 15 | """Create a temporary directory for tests.""" 16 | with tempfile.TemporaryDirectory() as tmpdir: 17 | yield Path(tmpdir) 18 | 19 | 20 | @pytest.fixture 21 | def sample_prompt_file(temp_dir): 22 | """Create a sample prompt file.""" 23 | prompt_file = temp_dir / "test_prompt.txt" 24 | prompt_file.write_text("What is 2 + 2?", encoding='utf-8') 25 | return prompt_file 26 | 27 | 28 | @pytest.fixture 29 | def sample_answer_file(temp_dir): 30 | """Create a sample answer file.""" 31 | answer_file = temp_dir / "test_answer.txt" 32 | answer_file.write_text("4", encoding='utf-8') 33 | return answer_file 34 | 35 | 36 | @pytest.fixture 37 | def sample_json_file(temp_dir): 38 | """Create a sample JSON file.""" 39 | json_file = temp_dir / "test_data.json" 40 | data = [ 41 | { 42 | "model": "test-model", 43 | "file": "test.txt", 44 | "prompt": "Test prompt", 45 | "expected": "Expected answer", 46 | "generated": "Generated answer", 47 | "correct": True, 48 | "response_time": 1.5 49 | } 50 | ] 51 | json_file.write_text(json.dumps(data), encoding='utf-8') 52 | return json_file 53 | 54 | 55 | @pytest.fixture 56 | def sample_results(): 57 | """Sample evaluation results.""" 58 | return [ 59 | { 60 | "model": "model-a", 61 | "file": "test1.txt", 62 | "prompt": "What is 1+1?", 63 | "expected": "2", 64 | "generated": "2", 65 | "correct": True, 66 | "response_time": 1.0 67 | }, 68 | { 69 | "model": "model-b", 70 | "file": "test1.txt", 71 | "prompt": "What is 1+1?", 72 | "expected": "2", 73 | "generated": "3", 74 | "correct": False, 75 | "response_time": 2.0 76 | }, 77 | { 78 | "model": "model-a", 79 | "file": "test2.txt", 80 | "prompt": "What is 2+2?", 81 | "expected": "4", 82 | "generated": "4", 83 | "correct": True, 84 | "response_time": 1.5 85 | } 86 | ] 87 | 88 | 89 | @pytest.fixture 90 | def mock_api_response(): 91 | """Mock OpenAI-compatible API response.""" 92 | return { 93 | "choices": [ 94 | { 95 | "message": { 96 | "content": "Test response", 97 | "role": "assistant" 98 | }, 99 | "finish_reason": "stop", 100 | "index": 0 101 | } 102 | ], 103 | "usage": { 104 | "prompt_tokens": 10, 105 | "completion_tokens": 5, 106 | "total_tokens": 15 107 | }, 108 | "model": "test-model" 109 | } 110 | 111 | 112 | @pytest.fixture 113 | def mock_requests_response(mock_api_response): 114 | """Mock requests.Response object.""" 115 | mock_resp = Mock() 116 | mock_resp.json.return_value = mock_api_response 117 | mock_resp.raise_for_status.return_value = None 118 | mock_resp.status_code = 200 119 | return mock_resp 120 | 121 | 122 | @pytest.fixture 123 | def sample_config(): 124 | """Sample configuration for testing.""" 125 | return { 126 | "endpoint_url": "http://localhost:9292/v1/chat/completions", 127 
| "model_names": ["model-a", "model-b"], 128 | "model_evaluator": "evaluator-model", 129 | "pattern": "prompts/*", 130 | "actions": ["answer", "evaluate"], 131 | "api_key": "test-key", 132 | "throttling_secs": 0.1, 133 | "prompt_dir": "prompts", 134 | "answer_dir": "answers" 135 | } 136 | 137 | 138 | @pytest.fixture 139 | def large_content(): 140 | """Large content for testing size limits.""" 141 | return "x" * 100000 # 100KB 142 | 143 | 144 | @pytest.fixture 145 | def invalid_json_file(temp_dir): 146 | """Create an invalid JSON file.""" 147 | invalid_file = temp_dir / "invalid.json" 148 | invalid_file.write_text("{ invalid json content", encoding='utf-8') 149 | return invalid_file 150 | 151 | 152 | @pytest.fixture 153 | def empty_file(temp_dir): 154 | """Create an empty file.""" 155 | empty = temp_dir / "empty.txt" 156 | empty.write_text("", encoding='utf-8') 157 | return empty 158 | 159 | 160 | @pytest.fixture 161 | def oversized_file(temp_dir): 162 | """Create an oversized file for testing limits.""" 163 | oversized = temp_dir / "oversized.txt" 164 | # Create a file larger than the limit (assuming 10MB limit) 165 | oversized.write_text("x" * (11 * 1024 * 1024), encoding='utf-8') 166 | return oversized -------------------------------------------------------------------------------- /frontend/src/components/Dashboard.jsx: -------------------------------------------------------------------------------- 1 | import { useState, useEffect } from 'react'; 2 | import { useApi, useJsonFile } from '../hooks/useApi'; 3 | import { Loader2 } from 'lucide-react'; 4 | import { ResultsMatrix } from './ResultsMatrix'; 5 | import { ModelAnswersAccordion } from './ModelAnswersAccordion'; 6 | import { DarkModeToggle } from './ui/DarkModeToggle'; 7 | 8 | export const Dashboard = () => { 9 | const [renderFile, setRenderFile] = useState(null); 10 | 11 | // Parse URL parameters for render query 12 | useEffect(() => { 13 | const params = new URLSearchParams(window.location.search); 14 | const renderParam = params.get('render'); 15 | 16 | if (renderParam) { 17 | setRenderFile(renderParam); 18 | } else if (window.location.pathname === '/' && !window.location.search) { 19 | // Redirect to default JSON file if no parameters 20 | window.location.href = '/?render=report-evaluated.json'; 21 | } 22 | }, []); 23 | 24 | const { data: apiData, loading: apiLoading, error: apiError } = useApi('/api/results'); 25 | const { data: jsonData, loading: jsonLoading, error: jsonError } = useJsonFile(renderFile); 26 | 27 | const results = renderFile ? jsonData : (apiData?.results || []); 28 | const loading = renderFile ? jsonLoading : apiLoading; 29 | const error = renderFile ? jsonError : apiError; 30 | 31 | if (loading) { 32 | return ( 33 |
34 |
35 | 36 |

Loading evaluation results...

37 |
38 |
39 | ); 40 | } 41 | 42 | if (error) { 43 | return ( 44 |
45 |
46 |
47 |

48 | Error Loading Results 49 |

50 |

{error}

51 |
52 |
53 |
54 | ); 55 | } 56 | 57 | if (!results || results.length === 0) { 58 | return ( 59 |
60 |
61 |

62 | No Results Available 63 |

64 |

65 | Please run the evaluation first to generate results. 66 |

67 |
68 |
69 | ); 70 | } 71 | 72 | return ( 73 |
74 | {/* Header */} 75 |
76 |
77 |
78 |
79 |

80 | LLM Eval Dashboard 81 |

82 |

83 | Interactive model evaluation results 84 |

85 |
86 | 87 |
88 |
89 |
90 | 91 | {/* Main Content */} 92 |
93 |
94 | {renderFile && ( 95 |
96 |

97 | Rendering heatmap for: {renderFile} 98 |

99 |
100 | )} 101 | 102 | {/* Heatmap Results Display */} 103 | 104 | 105 | {/* Model Answers Accordion */} 106 | 107 |
108 |
109 | 110 | {/* Footer */} 111 |
112 |
113 |

114 | LLM Eval Simple - Interactive Dashboard 115 |

116 |
117 |
118 |
119 | ); 120 | }; -------------------------------------------------------------------------------- /frontend/src/components/QuestionDetails.jsx: -------------------------------------------------------------------------------- 1 | import { useState } from 'react'; 2 | import { ChevronDown, ChevronRight } from 'lucide-react'; 3 | import { groupResultsByFile } from '../utils/dataProcessing'; 4 | 5 | export const QuestionDetails = ({ results }) => { 6 | const [expandedFiles, setExpandedFiles] = useState(new Set()); 7 | const resultsByFile = groupResultsByFile(results); 8 | 9 | const toggleFile = (filename) => { 10 | const newExpanded = new Set(expandedFiles); 11 | if (newExpanded.has(filename)) { 12 | newExpanded.delete(filename); 13 | } else { 14 | newExpanded.add(filename); 15 | } 16 | setExpandedFiles(newExpanded); 17 | }; 18 | 19 | return ( 20 |
21 |
22 |

23 | Question Details 24 |

25 |
26 | {Object.entries(resultsByFile).map(([file, data]) => ( 27 |
31 | 44 | 45 | {expandedFiles.has(file) && ( 46 |
47 |
48 |

49 | Prompt: 50 |

51 |
 52 |                       {data.prompt}
 53 |                     
54 |
55 | 56 |
57 | {data.models.map((modelResult, index) => ( 58 |
66 |

67 | {modelResult.model} 68 |

69 |
70 |
71 | 72 | Expected Answer: 73 | 74 |
 75 |                               {data.expected}
 76 |                             
77 |
78 |
79 | 80 | Generated Answer: 81 | 82 |
 83 |                               {modelResult.generated}
 84 |                             
85 |
86 |
87 |
88 | Response Time: {modelResult.response_time.toFixed(2)}s 89 |
90 | {modelResult.note && ( 91 |
92 | 93 | Note: 94 | 95 |

96 | {modelResult.note} 97 |

98 |
99 | )} 100 |
101 | ))} 102 |
103 |
104 | )} 105 |
106 | ))} 107 |
108 |
109 |
110 | ); 111 | }; -------------------------------------------------------------------------------- /frontend/src/test/App.test.jsx: -------------------------------------------------------------------------------- 1 | import { render, screen } from '@testing-library/react' 2 | import { describe, it, expect, vi, beforeEach } from 'vitest' 3 | import { BrowserRouter } from 'react-router-dom' 4 | import '@testing-library/jest-dom' 5 | 6 | // Mock the hooks before importing App 7 | vi.mock('../hooks/useApi.js', () => ({ 8 | useApi: vi.fn(), 9 | useJsonFile: vi.fn() 10 | })) 11 | 12 | vi.mock('../hooks/useDarkMode.js', () => ({ 13 | useDarkMode: vi.fn() 14 | })) 15 | 16 | vi.mock('../hooks/useFilters.js', () => ({ 17 | useFilters: () => ({ 18 | selectedModels: ['test-model'], 19 | selectedFiles: ['test.txt'], 20 | showCorrectOnly: false, 21 | showIncorrectOnly: false, 22 | modelOptions: ['test-model'], 23 | fileOptions: ['test.txt'], 24 | handleModelChange: vi.fn(), 25 | handleFileChange: vi.fn(), 26 | handleCorrectOnlyChange: vi.fn(), 27 | handleIncorrectOnlyChange: vi.fn(), 28 | clearFilters: vi.fn() 29 | }) 30 | })) 31 | 32 | // Mock window.location 33 | Object.defineProperty(window, 'location', { 34 | value: { 35 | search: '', 36 | pathname: '/', 37 | href: '' 38 | }, 39 | writable: true 40 | }) 41 | 42 | // Mock localStorage 43 | const localStorageMock = { 44 | getItem: vi.fn(), 45 | setItem: vi.fn(), 46 | removeItem: vi.fn(), 47 | clear: vi.fn() 48 | } 49 | Object.defineProperty(window, 'localStorage', { 50 | value: localStorageMock 51 | }) 52 | 53 | // Import App after mocking 54 | import App from '../App.jsx' 55 | import { useApi, useJsonFile } from '../hooks/useApi.js' 56 | import { useDarkMode } from '../hooks/useDarkMode.js' 57 | 58 | const renderWithRouter = (component) => { 59 | return render( 60 | 61 | {component} 62 | 63 | ) 64 | } 65 | 66 | describe('App Component', () => { 67 | beforeEach(() => { 68 | // Clear all mocks before each test 69 | vi.clearAllMocks() 70 | 71 | // Reset default mock implementations 72 | useApi.mockReturnValue({ 73 | data: { 74 | results: [ 75 | { 76 | model: 'test-model', 77 | file: 'test.txt', 78 | correct: true, 79 | response_time: 1.5 80 | } 81 | ] 82 | }, 83 | loading: false, 84 | error: null 85 | }) 86 | 87 | useJsonFile.mockReturnValue({ 88 | data: null, 89 | loading: false, 90 | error: null 91 | }) 92 | 93 | useDarkMode.mockReturnValue({ 94 | isDark: false, 95 | toggleDarkMode: vi.fn() 96 | }) 97 | 98 | // Reset window.location 99 | window.location.search = '' 100 | window.location.pathname = '/' 101 | window.location.href = '' 102 | }) 103 | 104 | it('renders without crashing', () => { 105 | renderWithRouter() 106 | 107 | // Check if main components are rendered 108 | expect(screen.getByText('LLM Eval Dashboard')).toBeInTheDocument() 109 | }) 110 | 111 | it('displays loading state initially', () => { 112 | // Override the mock to show loading state 113 | useApi.mockReturnValue({ 114 | data: null, 115 | loading: true, 116 | error: null 117 | }) 118 | 119 | renderWithRouter() 120 | 121 | expect(screen.getByText('Loading evaluation results...')).toBeInTheDocument() 122 | }) 123 | 124 | it('displays error state when API fails', () => { 125 | // Override the mock to show error state 126 | useApi.mockReturnValue({ 127 | data: null, 128 | loading: false, 129 | error: 'Failed to fetch data' 130 | }) 131 | 132 | renderWithRouter() 133 | 134 | expect(screen.getByText('Error Loading Results')).toBeInTheDocument() 135 | expect(screen.getByText('Failed to 
fetch data')).toBeInTheDocument() 136 | }) 137 | 138 | it('displays dashboard when data is loaded', () => { 139 | renderWithRouter() 140 | 141 | // Check if dashboard components are rendered 142 | expect(screen.getByText('LLM Eval Dashboard')).toBeInTheDocument() 143 | expect(screen.getByText('Interactive model evaluation results')).toBeInTheDocument() 144 | }) 145 | 146 | it('renders results matrix', () => { 147 | renderWithRouter() 148 | 149 | // Check if results matrix is rendered 150 | expect(screen.getByText('test.txt')).toBeInTheDocument() 151 | expect(screen.getAllByText('test-model')).toHaveLength(2) // One in table, one in accordion 152 | }) 153 | 154 | it('toggles dark mode', () => { 155 | const mockToggle = vi.fn() 156 | useDarkMode.mockReturnValue({ 157 | isDark: false, 158 | toggle: mockToggle 159 | }) 160 | 161 | renderWithRouter() 162 | 163 | // Find dark mode toggle button by aria-label 164 | const darkModeToggle = screen.getByLabelText('Toggle dark mode') 165 | expect(darkModeToggle).toBeInTheDocument() 166 | }) 167 | 168 | it('applies dark mode class when enabled', () => { 169 | useDarkMode.mockReturnValue({ 170 | isDark: true, 171 | toggle: vi.fn() 172 | }) 173 | 174 | renderWithRouter() 175 | 176 | // Check that sun icon is shown when dark mode is enabled 177 | const darkModeToggle = screen.getByLabelText('Toggle dark mode') 178 | expect(darkModeToggle).toBeInTheDocument() 179 | // The dark mode class application is handled by the hook's useEffect, 180 | // which doesn't run in the mocked test environment 181 | }) 182 | }) -------------------------------------------------------------------------------- /shared.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import Any, Dict, List, Tuple 4 | 5 | # Constants 6 | GENERATED_ANSWERS_DIR = "answers-generated" 7 | RAW_REPORT_PATH = os.path.join(GENERATED_ANSWERS_DIR, "report.json") 8 | EVALUATED_REPORT_PATH = os.path.join(GENERATED_ANSWERS_DIR, "report-evaluated.json") 9 | 10 | def get_evaluated_report_path(custom_path: str = None) -> str: 11 | """Get the evaluated report path, allowing custom override.""" 12 | return custom_path if custom_path else EVALUATED_REPORT_PATH 13 | HTML_REPORT_PATH = os.path.join(GENERATED_ANSWERS_DIR, "report-evaluated.html") 14 | 15 | # Server constants 16 | DEFAULT_SERVER_PORT = 4000 17 | TEMPLATE_PATH = "report_template.html" 18 | 19 | # Color constants for HTML output 20 | GOLD_COLOR = (255, 215, 0) 21 | GREEN_COLOR = (0, 247, 0) 22 | LIGHT_GREEN_COLOR = (245, 255, 245) 23 | 24 | # Response time constants 25 | RESPONSE_TIME_DECIMAL_PLACES = 2 26 | ACCURACY_DECIMAL_PLACES = 1 27 | PERCENTAGE_MULTIPLIER = 100 28 | BAR_LENGTH_DIVISOR = 10 29 | 30 | # Color constants 31 | RGB_MAX = 255 32 | HSL_LIGHTNESS_MIN = 70 33 | HSL_LIGHTNESS_RANGE = 30 34 | 35 | # HTML/CSS constants 36 | GOLD_RGB = (255, 215, 0) 37 | GREEN_RGB = (0, 247, 0) 38 | LIGHT_GREEN_RGB = (245, 255, 245) 39 | 40 | # Time constants 41 | INFINITE_TIME = float('inf') 42 | DEFAULT_PROGRESS_BAR_LENGTH = 10 43 | 44 | 45 | def calculate_model_summary(results: List[Dict[str, Any]]) -> Dict[str, Dict[str, float]]: 46 | """Calculate summary statistics for each model.""" 47 | model_summary = {} 48 | for r in results: 49 | model = r["model"] 50 | if model not in model_summary: 51 | model_summary[model] = {"total": 0, "correct": 0, "total_time": 0} 52 | 53 | model_summary[model]["total"] += 1 54 | if r["correct"]: 55 | model_summary[model]["correct"] += 1 56 | 
model_summary[model]["total_time"] += r["response_time"] 57 | 58 | return model_summary 59 | 60 | 61 | def get_unique_prompts_and_models(results: List[Dict[str, Any]]) -> Tuple[List[str], List[str]]: 62 | """Get sorted lists of unique prompts and models from results.""" 63 | prompts = sorted(list(set(r["file"] for r in results))) 64 | models = sorted(list(set(r["model"] for r in results))) 65 | return prompts, models 66 | 67 | 68 | def format_response_time(response_time: float) -> str: 69 | """Format response time with consistent decimal places.""" 70 | return f"{response_time:.{RESPONSE_TIME_DECIMAL_PLACES}f}s" 71 | 72 | 73 | def format_accuracy(correct: int, total: int) -> str: 74 | """Format accuracy as percentage with consistent decimal places.""" 75 | accuracy = (correct / total) * PERCENTAGE_MULTIPLIER if total > 0 else 0 76 | return f"{accuracy:.{ACCURACY_DECIMAL_PLACES}f}%" 77 | 78 | 79 | def create_progress_bar(accuracy: float, length: int = DEFAULT_PROGRESS_BAR_LENGTH) -> str: 80 | """Create a text-based progress bar.""" 81 | bar_length = int(accuracy / BAR_LENGTH_DIVISOR) 82 | return "█" * bar_length + "░" * (length - bar_length) 83 | 84 | 85 | def normalize_time_value(value: float, min_val: float, max_val: float) -> float: 86 | """Normalize a time value between 0 and 1.""" 87 | time_range = max_val - min_val if max_val != min_val else 1 88 | return (value - min_val) / time_range 89 | 90 | 91 | def interpolate_color(color1: tuple, color2: tuple, factor: float) -> tuple: 92 | """Interpolate between two RGB colors.""" 93 | r = int(color1[0] + (color2[0] - color1[0]) * factor) 94 | g = int(color1[1] + (color2[1] - color1[1]) * factor) 95 | b = int(color1[2] + (color2[2] - color1[2]) * factor) 96 | return (r, g, b) 97 | 98 | 99 | def find_fastest_correct_per_prompt(results: List[Dict[str, Any]], prompts: List[str]) -> Dict[str, str]: 100 | """Find the fastest correct model for each prompt.""" 101 | fastest_correct_per_prompt = {} 102 | for prompt in prompts: 103 | fastest_time = INFINITE_TIME 104 | fastest_model = None 105 | for r in results: 106 | if r["file"] == prompt and r["correct"] and r["response_time"] < fastest_time: 107 | fastest_time = r["response_time"] 108 | fastest_model = r["model"] 109 | if fastest_model: 110 | fastest_correct_per_prompt[prompt] = fastest_model 111 | return fastest_correct_per_prompt 112 | 113 | 114 | def group_results_by_file(results: List[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]: 115 | """Group results by file for detailed display.""" 116 | results_by_file = {} 117 | for r in results: 118 | if r['file'] not in results_by_file: 119 | results_by_file[r['file']] = { 120 | "prompt": r['prompt'], 121 | "expected": r['expected'], 122 | "models": [] 123 | } 124 | results_by_file[r['file']]['models'].append(r) 125 | return results_by_file 126 | 127 | 128 | def create_cell_data_dict(results: List[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]: 129 | """Create cell data dictionary for JavaScript interactions.""" 130 | cell_data_dict = {} 131 | for r in results: 132 | cell_id = f"{r['model']}-{r['file']}" 133 | cell_data_dict[cell_id] = { 134 | "model": r["model"], 135 | "file": r["file"], 136 | "generated": r["generated"], 137 | "response_time": format_response_time(r['response_time']), 138 | "correct": r["correct"], 139 | "note": r.get("note", "") 140 | } 141 | return cell_data_dict -------------------------------------------------------------------------------- /frontend/src/test/useApi.test.js: 
-------------------------------------------------------------------------------- 1 | import { renderHook, waitFor } from '@testing-library/react' 2 | import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest' 3 | import { useApi } from '../hooks/useApi.js' 4 | 5 | // Mock fetch 6 | global.fetch = vi.fn() 7 | 8 | // Mock window.matchMedia 9 | Object.defineProperty(window, 'matchMedia', { 10 | writable: true, 11 | value: vi.fn().mockImplementation(query => ({ 12 | matches: false, 13 | media: query, 14 | onchange: null, 15 | addListener: vi.fn(), 16 | removeListener: vi.fn(), 17 | addEventListener: vi.fn(), 18 | removeEventListener: vi.fn(), 19 | dispatchEvent: vi.fn(), 20 | })), 21 | }) 22 | 23 | describe('useApi Hook', () => { 24 | beforeEach(() => { 25 | // Clear fetch mock before each test 26 | global.fetch.mockClear() 27 | }) 28 | 29 | afterEach(() => { 30 | // Reset fetch mock after each test 31 | global.fetch.mockReset() 32 | }) 33 | 34 | it('should initialize with loading state', () => { 35 | global.fetch.mockResolvedValue({ 36 | ok: true, 37 | json: async () => ({ data: 'test' }) 38 | }) 39 | 40 | const { result } = renderHook(() => useApi('/api/test')) 41 | 42 | expect(result.current.loading).toBe(true) 43 | expect(result.current.data).toBe(null) 44 | expect(result.current.error).toBe(null) 45 | }) 46 | 47 | it('should fetch data successfully', async () => { 48 | const mockData = { results: [], summary: [] } 49 | global.fetch.mockResolvedValue({ 50 | ok: true, 51 | json: async () => mockData 52 | }) 53 | 54 | const { result } = renderHook(() => useApi('/api/results')) 55 | 56 | await waitFor(() => { 57 | expect(result.current.loading).toBe(false) 58 | }) 59 | 60 | expect(result.current.data).toEqual(mockData) 61 | expect(result.current.error).toBe(null) 62 | expect(global.fetch).toHaveBeenCalledWith('/api/results') 63 | }) 64 | 65 | it('should handle HTTP errors', async () => { 66 | global.fetch.mockResolvedValue({ 67 | ok: false, 68 | status: 404, 69 | statusText: 'Not Found' 70 | }) 71 | 72 | const { result } = renderHook(() => useApi('/api/results')) 73 | 74 | await waitFor(() => { 75 | expect(result.current.loading).toBe(false) 76 | }) 77 | 78 | expect(result.current.data).toBe(null) 79 | expect(result.current.error).toBe('HTTP error! 
status: 404') 80 | }) 81 | 82 | it('should handle network errors', async () => { 83 | const networkError = new Error('Network error') 84 | global.fetch.mockRejectedValue(networkError) 85 | 86 | const { result } = renderHook(() => useApi('/api/results')) 87 | 88 | await waitFor(() => { 89 | expect(result.current.loading).toBe(false) 90 | }) 91 | 92 | expect(result.current.data).toBe(null) 93 | expect(result.current.error).toBe(networkError.message) 94 | }) 95 | 96 | it('should not fetch if url is empty', () => { 97 | renderHook(() => useApi('')) 98 | 99 | expect(global.fetch).not.toHaveBeenCalled() 100 | }) 101 | 102 | it('should refetch data when refetch is called', async () => { 103 | const mockData = { results: [], summary: [] } 104 | global.fetch.mockResolvedValue({ 105 | ok: true, 106 | json: async () => mockData 107 | }) 108 | 109 | const { result } = renderHook(() => useApi('/api/results')) 110 | 111 | await waitFor(() => { 112 | expect(result.current.loading).toBe(false) 113 | }) 114 | 115 | // Reset fetch mock 116 | global.fetch.mockClear() 117 | 118 | // Call refetch 119 | result.current.refetch() 120 | 121 | expect(global.fetch).toHaveBeenCalledWith('/api/results') 122 | }) 123 | 124 | it('should handle JSON parsing errors', async () => { 125 | global.fetch.mockResolvedValue({ 126 | ok: true, 127 | json: async () => { 128 | throw new Error('Invalid JSON') 129 | } 130 | }) 131 | 132 | const { result } = renderHook(() => useApi('/api/results')) 133 | 134 | await waitFor(() => { 135 | expect(result.current.loading).toBe(false) 136 | }) 137 | 138 | expect(result.current.data).toBe(null) 139 | expect(result.current.error).toBe('Invalid JSON') 140 | }) 141 | 142 | it('should handle empty response', async () => { 143 | global.fetch.mockResolvedValue({ 144 | ok: true, 145 | json: async () => null 146 | }) 147 | 148 | const { result } = renderHook(() => useApi('/api/results')) 149 | 150 | await waitFor(() => { 151 | expect(result.current.loading).toBe(false) 152 | }) 153 | 154 | expect(result.current.data).toBe(null) 155 | expect(result.current.error).toBe(null) 156 | }) 157 | 158 | it('should update data when url changes', async () => { 159 | const mockData1 = { results: ['data1'] } 160 | const mockData2 = { results: ['data2'] } 161 | 162 | global.fetch 163 | .mockResolvedValueOnce({ 164 | ok: true, 165 | json: async () => mockData1 166 | }) 167 | .mockResolvedValueOnce({ 168 | ok: true, 169 | json: async () => mockData2 170 | }) 171 | 172 | const { result, rerender } = renderHook( 173 | ({ url }) => useApi(url), 174 | { initialProps: { url: '/api/results1' } } 175 | ) 176 | 177 | await waitFor(() => { 178 | expect(result.current.data).toEqual(mockData1) 179 | }) 180 | 181 | rerender({ url: '/api/results2' }) 182 | 183 | await waitFor(() => { 184 | expect(result.current.data).toEqual(mockData2) 185 | }) 186 | 187 | expect(global.fetch).toHaveBeenCalledTimes(2) 188 | expect(global.fetch).toHaveBeenNthCalledWith(1, '/api/results1') 189 | expect(global.fetch).toHaveBeenNthCalledWith(2, '/api/results2') 190 | }) 191 | }) -------------------------------------------------------------------------------- /frontend/src/components/FilterPanel.jsx: -------------------------------------------------------------------------------- 1 | import { useState } from 'react'; 2 | import { Filter, X } from 'lucide-react'; 3 | import { Button } from './ui/Button'; 4 | import { useFilters } from '../hooks/useFilters'; 5 | import { getUniquePromptsAndModels } from '../utils/dataProcessing'; 6 | 7 | export const 
FilterPanel = ({ results, onFiltersChange }) => { 8 | const { filters, updateFilter, clearFilters } = useFilters(); 9 | const [isOpen, setIsOpen] = useState(false); 10 | const { prompts, models } = getUniquePromptsAndModels(results); 11 | 12 | const handleFilterChange = (key, value) => { 13 | updateFilter(key, value); 14 | onFiltersChange({ ...filters, [key]: value }); // pass the updated value explicitly; the filters object from state is still stale in this closure 15 | }; 16 | 17 | const handleClear = () => { 18 | clearFilters(); 19 | onFiltersChange({}); 20 | }; 21 | 22 | return ( 23 |
24 |
25 |
26 |
27 | 28 | 29 | Filters 30 | 31 | {(filters.model || filters.file || filters.correctness !== undefined) && ( 32 | 33 | Active 34 | 35 | )} 36 |
37 | 44 |
45 | 46 | {isOpen && ( 47 |
48 | {/* Model Filter */} 49 |
50 | 53 | 63 |
64 | 65 | {/* File Filter */} 66 |
67 | 70 | 80 |
81 | 82 | {/* Correctness Filter */} 83 |
84 | 87 | 96 |
97 | 98 | {/* Max Response Time Filter */} 99 |
100 | 103 | handleFilterChange('maxResponseTime', e.target.value ? parseFloat(e.target.value) : null)} 107 | placeholder="No limit" 108 | min="0" 109 | step="0.1" 110 | className="w-full px-3 py-2 border border-gray-300 dark:border-gray-600 rounded-lg bg-white dark:bg-gray-700 text-gray-900 dark:text-white focus:ring-2 focus:ring-primary-500 focus:border-primary-500" 111 | /> 112 |
113 | 114 | {/* Clear Filters Button */} 115 | 122 |
123 | )} 124 |
125 |
126 | ); 127 | }; -------------------------------------------------------------------------------- /frontend/src/test/useFilters.test.js: -------------------------------------------------------------------------------- 1 | import { renderHook, act } from '@testing-library/react' 2 | import { describe, it, expect, beforeEach } from 'vitest' 3 | import { useFilters } from '../hooks/useFilters.js' 4 | 5 | describe('useFilters Hook', () => { 6 | const mockData = { 7 | results: [ 8 | { model: 'model-a', file: 'test1.txt', correct: true }, 9 | { model: 'model-a', file: 'test2.txt', correct: false }, 10 | { model: 'model-b', file: 'test1.txt', correct: true }, 11 | { model: 'model-b', file: 'test2.txt', correct: true } 12 | ], 13 | summary: [ 14 | { model: 'model-a', accuracy: 50 }, 15 | { model: 'model-b', accuracy: 100 } 16 | ], 17 | matrix: { 18 | prompts: ['test1.txt', 'test2.txt'], 19 | models: ['model-a', 'model-b'], 20 | cells: [] 21 | }, 22 | details: {}, 23 | metadata: { 24 | models: ['model-a', 'model-b'], 25 | files: ['test1.txt', 'test2.txt'] 26 | } 27 | } 28 | 29 | beforeEach(() => { 30 | // Reset any state before each test 31 | }) 32 | 33 | it('should initialize with default values', () => { 34 | const { result } = renderHook(() => useFilters(mockData)) 35 | 36 | expect(result.current.selectedModels).toEqual(['model-a', 'model-b']) 37 | expect(result.current.selectedFiles).toEqual(['test1.txt', 'test2.txt']) 38 | expect(result.current.showCorrectOnly).toBe(false) 39 | expect(result.current.showIncorrectOnly).toBe(false) 40 | expect(result.current.modelOptions).toEqual(['model-a', 'model-b']) 41 | expect(result.current.fileOptions).toEqual(['test1.txt', 'test2.txt']) 42 | }) 43 | 44 | it('should handle model selection changes', () => { 45 | const { result } = renderHook(() => useFilters(mockData)) 46 | 47 | act(() => { 48 | result.current.handleModelChange(['model-a']) 49 | }) 50 | 51 | expect(result.current.selectedModels).toEqual(['model-a']) 52 | }) 53 | 54 | it('should handle file selection changes', () => { 55 | const { result } = renderHook(() => useFilters(mockData)) 56 | 57 | act(() => { 58 | result.current.handleFileChange(['test1.txt']) 59 | }) 60 | 61 | expect(result.current.selectedFiles).toEqual(['test1.txt']) 62 | }) 63 | 64 | it('should handle correct only toggle', () => { 65 | const { result } = renderHook(() => useFilters(mockData)) 66 | 67 | act(() => { 68 | result.current.handleCorrectOnlyChange(true) 69 | }) 70 | 71 | expect(result.current.showCorrectOnly).toBe(true) 72 | expect(result.current.showIncorrectOnly).toBe(false) 73 | }) 74 | 75 | it('should handle incorrect only toggle', () => { 76 | const { result } = renderHook(() => useFilters(mockData)) 77 | 78 | act(() => { 79 | result.current.handleIncorrectOnlyChange(true) 80 | }) 81 | 82 | expect(result.current.showIncorrectOnly).toBe(true) 83 | expect(result.current.showCorrectOnly).toBe(false) 84 | }) 85 | 86 | it('should clear all filters', () => { 87 | const { result } = renderHook(() => useFilters(mockData)) 88 | 89 | // Set some filters first 90 | act(() => { 91 | result.current.handleModelChange(['model-a']) 92 | result.current.handleFileChange(['test1.txt']) 93 | result.current.handleCorrectOnlyChange(true) 94 | }) 95 | 96 | // Clear filters 97 | act(() => { 98 | result.current.clearFilters() 99 | }) 100 | 101 | expect(result.current.selectedModels).toEqual(['model-a', 'model-b']) 102 | expect(result.current.selectedFiles).toEqual(['test1.txt', 'test2.txt']) 103 | 
expect(result.current.showCorrectOnly).toBe(false) 104 | expect(result.current.showIncorrectOnly).toBe(false) 105 | }) 106 | 107 | it('should handle empty data gracefully', () => { 108 | const emptyData = { 109 | results: [], 110 | summary: [], 111 | matrix: { prompts: [], models: [], cells: [] }, 112 | details: {}, 113 | metadata: { models: [], files: [] } 114 | } 115 | 116 | const { result } = renderHook(() => useFilters(emptyData)) 117 | 118 | expect(result.current.selectedModels).toEqual([]) 119 | expect(result.current.selectedFiles).toEqual([]) 120 | expect(result.current.modelOptions).toEqual([]) 121 | expect(result.current.fileOptions).toEqual([]) 122 | }) 123 | 124 | it('should not allow both correct and incorrect only filters simultaneously', () => { 125 | const { result } = renderHook(() => useFilters(mockData)) 126 | 127 | // Enable correct only 128 | act(() => { 129 | result.current.handleCorrectOnlyChange(true) 130 | }) 131 | 132 | expect(result.current.showCorrectOnly).toBe(true) 133 | expect(result.current.showIncorrectOnly).toBe(false) 134 | 135 | // Try to enable incorrect only (should disable correct only) 136 | act(() => { 137 | result.current.handleIncorrectOnlyChange(true) 138 | }) 139 | 140 | expect(result.current.showCorrectOnly).toBe(false) 141 | expect(result.current.showIncorrectOnly).toBe(true) 142 | }) 143 | 144 | it('should extract unique models from data', () => { 145 | const dataWithDuplicates = { 146 | ...mockData, 147 | metadata: { 148 | models: ['model-a', 'model-b', 'model-a'], // duplicate 149 | files: ['test1.txt', 'test2.txt'] 150 | } 151 | } 152 | 153 | const { result } = renderHook(() => useFilters(dataWithDuplicates)) 154 | 155 | expect(result.current.modelOptions).toEqual(['model-a', 'model-b']) 156 | }) 157 | 158 | it('should extract unique files from data', () => { 159 | const dataWithDuplicates = { 160 | ...mockData, 161 | metadata: { 162 | models: ['model-a', 'model-b'], 163 | files: ['test1.txt', 'test2.txt', 'test1.txt'] // duplicate 164 | } 165 | } 166 | 167 | const { result } = renderHook(() => useFilters(dataWithDuplicates)) 168 | 169 | expect(result.current.fileOptions).toEqual(['test1.txt', 'test2.txt']) 170 | }) 171 | 172 | it('should handle null/undefined data', () => { 173 | const { result } = renderHook(() => useFilters(null)) 174 | 175 | expect(result.current.selectedModels).toEqual([]) 176 | expect(result.current.selectedFiles).toEqual([]) 177 | expect(result.current.modelOptions).toEqual([]) 178 | expect(result.current.fileOptions).toEqual([]) 179 | }) 180 | }) -------------------------------------------------------------------------------- /api_client.py: -------------------------------------------------------------------------------- 1 | """API interaction functions for LLM evaluation.""" 2 | 3 | import time 4 | from typing import Any, Dict 5 | 6 | import requests 7 | 8 | from shared import RESPONSE_TIME_DECIMAL_PLACES 9 | from validation import APIRequest, EvaluationRequest, APIResponseValidator 10 | 11 | 12 | def get_model_response(endpoint_url: str, model: str, prompt: str, api_key: str = None, 13 | system_prompt: str = None, throttling_secs: float = 0.1) -> Dict[str, Any]: 14 | """Gets a response from the specified model with input validation.""" 15 | try: 16 | # Validate inputs 17 | if not endpoint_url or not model or not prompt: 18 | raise ValueError("endpoint_url, model, and prompt are required") 19 | 20 | # Sanitize and validate prompt content 21 | prompt = APIResponseValidator.sanitize_content(prompt, 10000) 22 | if 
system_prompt: 23 | system_prompt = APIResponseValidator.sanitize_content(system_prompt, 2000) 24 | 25 | # Apply throttling 26 | time.sleep(throttling_secs) 27 | 28 | # Build messages 29 | messages = [] 30 | if system_prompt: 31 | messages.append({"role": "system", "content": system_prompt}) 32 | messages.append({"role": "user", "content": prompt}) 33 | 34 | # Validate request structure 35 | api_request = APIRequest( 36 | model=model, 37 | messages=messages, 38 | stream=False 39 | ) 40 | 41 | # Prepare request 42 | payload = api_request.dict() 43 | headers = {"Content-Type": "application/json"} 44 | 45 | if api_key: 46 | headers["Authorization"] = f"Bearer {api_key}" 47 | 48 | # Make request with timeout 49 | response = requests.post( 50 | endpoint_url, 51 | json=payload, 52 | headers=headers, 53 | timeout=700 # 700 second timeout 54 | ) 55 | response.raise_for_status() 56 | 57 | # Validate response structure 58 | response_data = response.json() 59 | APIResponseValidator.validate_openai_response(response_data) 60 | 61 | return response_data 62 | 63 | except requests.exceptions.Timeout: 64 | raise ValueError(f"Request timeout for model {model}") 65 | except requests.exceptions.ConnectionError: 66 | raise ValueError(f"Connection error for endpoint {endpoint_url}") 67 | except requests.exceptions.HTTPError as e: 68 | if hasattr(e, 'response') and e.response is not None: 69 | raise ValueError(f"HTTP error {e.response.status_code}: {e.response.text}") 70 | else: 71 | raise ValueError(f"HTTP error: {e}") 72 | except ValueError as e: 73 | raise ValueError(f"Validation error: {e}") 74 | except Exception as e: 75 | raise ValueError(f"Unexpected error: {e}") 76 | 77 | 78 | def evaluate_correctness(endpoint_url: str, evaluator_model: str, expected_answer: str, 79 | generated_answer: str, api_key: str = None, 80 | throttling_secs: float = 0.1) -> tuple[bool, str]: 81 | """Evaluates the correctness of a generated answer using an evaluator model.""" 82 | try: 83 | # Validate inputs 84 | if not expected_answer or not generated_answer: 85 | # Return False for empty answers instead of raising an error 86 | return False, "Both expected_answer and generated_answer are required" 87 | 88 | # Simple string comparison if no evaluator model 89 | if not evaluator_model: 90 | is_correct = generated_answer.lower().strip() == expected_answer.lower().strip() 91 | note = "Exact string matching evaluation" if not is_correct else "" 92 | return is_correct, note 93 | 94 | # Validate evaluation request 95 | eval_request = EvaluationRequest( 96 | expected_answer=expected_answer, 97 | generated_answer=generated_answer, 98 | evaluator_model=evaluator_model 99 | ) 100 | 101 | system_prompt = "You are an evaluator. Compare the expected answer with the generated answer. Ignore the tag content. The generated answers may vary slightly in wording but should preserve the original meaning. If the answers are equivalent in meaning, mark as correct. Respond with only 'CORRECT' or 'INCORRECT'. If the answer is INCORRECT, provide a brief explanation of why on a new line starting with 'NOTE:'." 
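        # Illustrative exchange (hedged example based on the conftest.py fixtures, not a captured run):
        #   user prompt -> "Expected Answer: 2\nGenerated Answer: 3"
        #   evaluator   -> "INCORRECT\nNOTE: the generated answer gives 3 instead of 2"
        # The parsing further below reads the verdict from the first reply line only and
        # scans the remaining lines for an optional 'NOTE:' explanation.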
102 | user_prompt = f"Expected Answer: {eval_request.expected_answer}\nGenerated Answer: {eval_request.generated_answer}" 103 | 104 | # Get evaluation from model 105 | eval_response = get_model_response( 106 | endpoint_url, 107 | evaluator_model, 108 | user_prompt, 109 | api_key, 110 | system_prompt, 111 | throttling_secs 112 | ) 113 | 114 | eval_result = eval_response.get('choices', [{}])[0].get('message', {}).get('content', '').strip() 115 | 116 | # Validate evaluation result 117 | if not eval_result: 118 | raise ValueError("Empty evaluation response") 119 | 120 | # Parse evaluation result and optional note 121 | lines = eval_result.split('\n') 122 | is_correct = False 123 | note = "" 124 | 125 | # Check first line for CORRECT/INCORRECT 126 | if "CORRECT" == lines[0]: 127 | is_correct = True 128 | elif "INCORRECT" == lines[0]: 129 | is_correct = False 130 | else: 131 | # Default to incorrect if response is ambiguous 132 | print(f"⚠️ Ambiguous evaluation response: '{eval_result}', marking as incorrect") 133 | is_correct = False 134 | 135 | # Extract note if present (starts with NOTE:) 136 | for line in lines[1:]: 137 | if line.strip().startswith('NOTE:'): 138 | note = line.strip()[5:].strip() # Remove 'NOTE:' prefix 139 | break 140 | 141 | return is_correct, note 142 | 143 | except ValueError as e: 144 | print(f"❌ Evaluation validation error: {e}") 145 | return False, f"Evaluation validation error: {e}" 146 | except Exception as e: 147 | print(f"❌ Evaluation error: {e}") 148 | return False, f"Evaluation error: {e}" -------------------------------------------------------------------------------- /report_template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | llm-eval-simple: Model Evaluation Report 8 | 77 | 78 | 79 |
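<!-- The __SUMMARY_TABLE__, __DETAILED_RESULTS_HEADER__, __DETAILED_RESULTS_BODY__,
     __QUESTIONS_DETAILS__ and __CELL_DATA__ placeholders in this template are filled in
     at request time by the ReportHandler in server.py. -->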

llm-eval-simple: Model Evaluation Report

80 | 81 |

Model Performance Summary

82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | __SUMMARY_TABLE__ 92 | 93 |
Model | Correct | Avg Response Time
94 | 95 |

Detailed Results

96 | 97 | 98 | 99 | 100 | __DETAILED_RESULTS_HEADER__ 101 | 102 | 103 | 104 | __DETAILED_RESULTS_BODY__ 105 | 106 |
Model
107 | 108 |
109 | __QUESTIONS_DETAILS__ 110 |
111 | 112 | 113 |
114 |
115 | × 116 |
117 |
118 |
119 | 120 | 166 | 167 | 168 | -------------------------------------------------------------------------------- /file_utils.py: -------------------------------------------------------------------------------- 1 | """File handling functions for LLM evaluation.""" 2 | 3 | import glob 4 | import json 5 | import os 6 | from typing import List, Dict, Any 7 | from pathlib import Path 8 | 9 | from shared import RAW_REPORT_PATH, EVALUATED_REPORT_PATH 10 | from validation import FileOperationValidator, validate_glob_pattern 11 | 12 | 13 | def get_prompt_files(pattern: str) -> List[str]: 14 | """Gets a sorted list of prompt files matching the pattern.""" 15 | try: 16 | # Validate pattern for security 17 | validated_pattern = validate_glob_pattern(pattern) 18 | 19 | files = glob.glob(validated_pattern) 20 | valid_files = [] 21 | 22 | for file_path in files: 23 | try: 24 | # Validate each file path 25 | validated_path = FileOperationValidator.validate_file_path(file_path) 26 | 27 | # Check if it's a file and has allowed extension 28 | if validated_path.is_file(): 29 | FileOperationValidator.validate_file_size(validated_path) 30 | valid_files.append(str(validated_path)) 31 | 32 | except ValueError: 33 | # Skip invalid files 34 | continue 35 | 36 | return sorted(valid_files) 37 | 38 | except ValueError as e: 39 | print(f"❌ Error in file pattern validation: {e}") 40 | return [] 41 | except Exception as e: 42 | print(f"❌ Unexpected error getting prompt files: {e}") 43 | return [] 44 | 45 | 46 | def read_file_content(file_path: str) -> str: 47 | """Read content from a file with UTF-8 encoding and validation.""" 48 | try: 49 | # Validate file path 50 | validated_path = FileOperationValidator.validate_file_path(file_path) 51 | 52 | # Validate file size 53 | FileOperationValidator.validate_file_size(validated_path) 54 | 55 | # Read content with size limit 56 | with open(validated_path, 'r', encoding='utf-8') as f: 57 | content = f.read() 58 | 59 | # Validate content length 60 | return FileOperationValidator.validate_content_length(content, 50000).strip() 61 | 62 | except UnicodeDecodeError: 63 | raise ValueError(f"File {file_path} is not valid UTF-8 text") 64 | except ValueError as e: 65 | raise ValueError(f"File validation error for {file_path}: {e}") 66 | except Exception as e: 67 | raise ValueError(f"Error reading file {file_path}: {e}") 68 | 69 | 70 | def write_json_report(data: List[Dict[str, Any]], file_path: str) -> None: 71 | """Write data to a JSON file with proper formatting and validation.""" 72 | try: 73 | # Validate file path 74 | validated_path = FileOperationValidator.validate_file_path(file_path) 75 | 76 | # Ensure parent directory exists 77 | validated_path.parent.mkdir(parents=True, exist_ok=True) 78 | 79 | # Validate data size 80 | json_str = json.dumps(data, indent=2) 81 | if len(json_str.encode('utf-8')) > 100 * 1024 * 1024: # 100MB limit 82 | raise ValueError("Data too large to write") 83 | 84 | # Write atomically to prevent corruption 85 | temp_path = validated_path.with_suffix('.tmp') 86 | with open(temp_path, 'w', encoding='utf-8') as f: 87 | json.dump(data, f, indent=2) 88 | 89 | # Atomic move 90 | temp_path.replace(validated_path) 91 | 92 | except ValueError as e: 93 | raise ValueError(f"Validation error writing {file_path}: {e}") 94 | except Exception as e: 95 | raise ValueError(f"Error writing file {file_path}: {e}") 96 | 97 | 98 | def read_json_report(file_path: str) -> List[Dict[str, Any]]: 99 | """Read data from a JSON report file with validation.""" 100 | try: 101 | # Validate 
file path 102 | validated_path = FileOperationValidator.validate_file_path(file_path) 103 | 104 | # Validate file size 105 | FileOperationValidator.validate_file_size(validated_path) 106 | 107 | # Read and parse JSON 108 | with open(validated_path, 'r', encoding='utf-8') as f: 109 | data = json.load(f) 110 | 111 | # Validate data structure 112 | if not isinstance(data, list): 113 | raise ValueError("JSON data must be a list") 114 | 115 | # Validate each item 116 | validated_data = [] 117 | for i, item in enumerate(data): 118 | if not isinstance(item, dict): 119 | raise ValueError(f"Item {i} must be a dictionary") 120 | validated_data.append(item) 121 | 122 | return validated_data 123 | 124 | except json.JSONDecodeError as e: 125 | raise ValueError(f"Invalid JSON in {file_path}: {e}") 126 | except ValueError as e: 127 | raise ValueError(f"Validation error reading {file_path}: {e}") 128 | except Exception as e: 129 | raise ValueError(f"Error reading file {file_path}: {e}") 130 | 131 | 132 | def ensure_directory_exists(directory: str) -> None: 133 | """Ensure that a directory exists, creating it if necessary.""" 134 | try: 135 | # Validate directory path 136 | validated_path = FileOperationValidator.validate_file_path(directory) 137 | 138 | # Create directory with parents 139 | validated_path.mkdir(parents=True, exist_ok=True) 140 | 141 | except ValueError as e: 142 | raise ValueError(f"Invalid directory path {directory}: {e}") 143 | except Exception as e: 144 | raise ValueError(f"Error creating directory {directory}: {e}") 145 | 146 | 147 | def save_raw_results(results: List[Dict[str, Any]]) -> None: 148 | """Save raw results to the standard report path.""" 149 | write_json_report(results, RAW_REPORT_PATH) 150 | 151 | 152 | def load_raw_results() -> List[Dict[str, Any]]: 153 | """Load raw results from the standard report path.""" 154 | return read_json_report(RAW_REPORT_PATH) 155 | 156 | 157 | def save_evaluated_results(results: List[Dict[str, Any]]) -> None: 158 | """Save evaluated results to the standard report path.""" 159 | write_json_report(results, EVALUATED_REPORT_PATH) 160 | 161 | 162 | def load_evaluated_results() -> List[Dict[str, Any]]: 163 | """Load evaluated results from the standard report path.""" 164 | return read_json_report(EVALUATED_REPORT_PATH) -------------------------------------------------------------------------------- /frontend/src/components/ModelAnswersAccordion.jsx: -------------------------------------------------------------------------------- 1 | import { useState } from 'react'; 2 | import { ChevronDown, ChevronRight } from 'lucide-react'; 3 | 4 | export const ModelAnswersAccordion = ({ results }) => { 5 | const [expandedModels, setExpandedModels] = useState(new Set()); 6 | 7 | if (!results || !Array.isArray(results) || results.length === 0) { 8 | return null; 9 | } 10 | 11 | // Group results by model 12 | const resultsByModel = {}; 13 | results.forEach(result => { 14 | if (!resultsByModel[result.model]) { 15 | resultsByModel[result.model] = []; 16 | } 17 | resultsByModel[result.model].push(result); 18 | }); 19 | 20 | // Calculate fastest correct answers count per model 21 | const calculateFastestCorrectCount = (modelResults, modelName) => { 22 | // Group all results by file to find fastest correct answer per file 23 | const fastestByFile = {}; 24 | results.forEach(result => { 25 | if (result.correct) { 26 | if (!fastestByFile[result.file] || result.response_time < fastestByFile[result.file].response_time) { 27 | fastestByFile[result.file] = result; 28 | } 29 
| } 30 | }); 31 | 32 | // Count how many files where this model is the fastest correct 33 | let fastestCorrectCount = 0; 34 | Object.values(fastestByFile).forEach(fastest => { 35 | if (fastest.model === modelName) { 36 | fastestCorrectCount++; 37 | } 38 | }); 39 | 40 | return fastestCorrectCount; 41 | }; 42 | 43 | const toggleModel = (modelName) => { 44 | const newExpanded = new Set(expandedModels); 45 | if (newExpanded.has(modelName)) { 46 | newExpanded.delete(modelName); 47 | } else { 48 | newExpanded.add(modelName); 49 | } 50 | setExpandedModels(newExpanded); 51 | }; 52 | 53 | return ( 54 |
55 |
56 |
57 |

58 | Model Answers Details 59 |

60 | {results[0]?.evaluator_model && ( 61 | 62 | (evaluated by {results[0].evaluator_model}) 63 | 64 | )} 65 |
66 |
67 | {Object.entries(resultsByModel).sort(([a], [b]) => a.localeCompare(b)).map(([modelName, modelResults]) => ( 68 |
69 | 101 | 102 | {expandedModels.has(modelName) && ( 103 |
104 |
105 | {modelResults.map((result, index) => ( 106 |
107 |
108 |
109 |
110 | 111 | {result.file} 112 | 113 | 118 | {result.correct ? 'Correct' : 'Incorrect'} 119 | 120 | 121 | {typeof result.response_time === 'number' ? result.response_time.toFixed(2) + 's' : 'N/A'} 122 | 123 |
124 | 125 |
126 |
127 |

128 | Prompt: 129 |

130 |
131 |                                   {result.prompt}
132 |                                 
133 |
134 | 135 |
136 |

137 | Expected: 138 |

139 |
140 |                                   {result.expected}
141 |                                 
142 |
143 | 144 |
145 |

146 | Generated: 147 |

148 |
149 |                                   {result.generated}
150 |                                 
151 |
152 |
153 |
154 |
155 |
156 | ))} 157 |
158 |
159 | )} 160 |
161 | ))} 162 |
163 |
164 |
165 | ); 166 | }; -------------------------------------------------------------------------------- /api_server.py: -------------------------------------------------------------------------------- 1 | import http.server 2 | import json 3 | import os 4 | from urllib.parse import urlparse, parse_qs 5 | 6 | from shared import ( 7 | DEFAULT_SERVER_PORT, 8 | EVALUATED_REPORT_PATH, 9 | TEMPLATE_PATH, 10 | calculate_model_summary, 11 | create_cell_data_dict, 12 | find_fastest_correct_per_prompt, 13 | get_unique_prompts_and_models, 14 | group_results_by_file, 15 | get_evaluated_report_path 16 | ) 17 | from validation import FileOperationValidator, APIResponseValidator 18 | 19 | SERVER_PORT = DEFAULT_SERVER_PORT 20 | 21 | class APIHandler(http.server.SimpleHTTPRequestHandler): 22 | def do_GET(self): 23 | try: 24 | parsed_path = urlparse(self.path) 25 | 26 | # Validate path for security 27 | self._validate_path(parsed_path.path) 28 | 29 | # API endpoint for results 30 | if parsed_path.path == '/api/results': 31 | self.handle_api_request() 32 | else: 33 | self.send_error(404, "API endpoint not found. Use /api/results for evaluation data.") 34 | except ValueError as e: 35 | self.send_error(400, f"Invalid request: {e}") 36 | except Exception as e: 37 | self.send_error(500, f"Internal server error: {e}") 38 | 39 | def _validate_path(self, path: str): 40 | """Validate request path for security.""" 41 | if not path or not isinstance(path, str): 42 | raise ValueError("Invalid path") 43 | 44 | # Check for path traversal 45 | if '..' in path or path.startswith('//'): 46 | raise ValueError("Path traversal detected") 47 | 48 | # Only allow specific endpoints 49 | allowed_endpoints = {'/api/results', '/'} 50 | if path not in allowed_endpoints and not path.startswith('/static/'): 51 | raise ValueError("Endpoint not allowed") 52 | 53 | def handle_api_request(self): 54 | """Handle API requests for evaluation results.""" 55 | try: 56 | # Allow custom report path via query parameter 57 | parsed_path = urlparse(self.path) 58 | query_params = parse_qs(parsed_path.query) 59 | custom_report = query_params.get('report', [None])[0] 60 | 61 | # Get the appropriate report path 62 | report_path_str = get_evaluated_report_path(custom_report) 63 | 64 | # Validate file path and size 65 | report_path = FileOperationValidator.validate_file_path(report_path_str) 66 | FileOperationValidator.validate_file_size(report_path) 67 | 68 | with open(report_path, 'r', encoding='utf-8') as f: 69 | results = json.load(f) 70 | 71 | # Validate results structure 72 | if not isinstance(results, list): 73 | raise ValueError("Invalid results format") 74 | 75 | # Sanitize and validate results 76 | sanitized_results = [] 77 | for result in results: 78 | if not isinstance(result, dict): 79 | continue 80 | 81 | # Validate required fields 82 | required_fields = {'model', 'file', 'correct', 'response_time'} 83 | if not required_fields.issubset(result.keys()): 84 | continue 85 | 86 | # Sanitize content fields 87 | if 'prompt' in result: 88 | result['prompt'] = APIResponseValidator.sanitize_content(result['prompt'], 5000) 89 | if 'expected' in result: 90 | result['expected'] = APIResponseValidator.sanitize_content(result['expected'], 5000) 91 | if 'generated' in result: 92 | result['generated'] = APIResponseValidator.sanitize_content(result['generated'], 5000) 93 | 94 | sanitized_results.append(result) 95 | 96 | # Prepare API response with all necessary data 97 | api_response = { 98 | 'results': sanitized_results, 99 | 'summary': 
self.prepare_summary_data(sanitized_results), 100 | 'matrix': self.prepare_matrix_data(sanitized_results), 101 | 'details': self.prepare_details_data(sanitized_results), 102 | 'metadata': { 103 | 'total_results': len(sanitized_results), 104 | 'models': list(set(r['model'] for r in sanitized_results)), 105 | 'files': list(set(r['file'] for r in sanitized_results)) 106 | } 107 | } 108 | 109 | # Validate response size 110 | response_json = json.dumps(api_response) 111 | if len(response_json.encode('utf-8')) > 50 * 1024 * 1024: # 50MB limit 112 | raise ValueError("Response too large") 113 | 114 | self.send_response(200) 115 | self.send_header('Content-type', 'application/json') 116 | self.send_header('Access-Control-Allow-Origin', '*') 117 | self.send_header('X-Content-Type-Options', 'nosniff') 118 | self.end_headers() 119 | self.wfile.write(response_json.encode('utf-8')) 120 | 121 | except FileNotFoundError: 122 | self.send_error(404, "Evaluation results not found. Please run the evaluation first.") 123 | except ValueError as e: 124 | self.send_error(400, f"Validation error: {e}") 125 | except Exception as e: 126 | self.send_error(500, f"Internal server error: {e}") 127 | 128 | def prepare_summary_data(self, results): 129 | """Prepare summary data for API response.""" 130 | model_summary = calculate_model_summary(results) 131 | summary_data = [] 132 | 133 | for model, stats in model_summary.items(): 134 | total = stats["total"] 135 | correct = stats["correct"] 136 | total_time = stats["total_time"] 137 | accuracy = (correct / total) * 100 if total > 0 else 0 138 | avg_time = total_time / total if total > 0 else 0 139 | 140 | summary_data.append({ 141 | 'model': model, 142 | 'total': total, 143 | 'correct': correct, 144 | 'accuracy': round(accuracy, 1), 145 | 'avg_response_time': round(avg_time, 2) 146 | }) 147 | 148 | return summary_data 149 | 150 | def prepare_matrix_data(self, results): 151 | """Prepare matrix data for API response.""" 152 | prompts, models = get_unique_prompts_and_models(results) 153 | fastest_correct = find_fastest_correct_per_prompt(results, prompts) 154 | 155 | matrix_data = { 156 | 'prompts': prompts, 157 | 'models': models, 158 | 'cells': [] 159 | } 160 | 161 | for result in results: 162 | cell_data = { 163 | 'model': result['model'], 164 | 'file': result['file'], 165 | 'correct': result['correct'], 166 | 'response_time': result['response_time'], 167 | 'is_fastest_correct': ( 168 | result['correct'] and 169 | result['file'] in fastest_correct and 170 | fastest_correct[result['file']] == result['model'] 171 | ) 172 | } 173 | matrix_data['cells'].append(cell_data) 174 | 175 | return matrix_data 176 | 177 | def prepare_details_data(self, results): 178 | """Prepare detailed question data for API response.""" 179 | return group_results_by_file(results) 180 | 181 | 182 | 183 | def run_server(): 184 | """Run the API server.""" 185 | import socketserver 186 | import socket 187 | 188 | print(f"🔌 Starting API server at http://localhost:{SERVER_PORT}") 189 | print(f"📊 API endpoint available at: http://localhost:{SERVER_PORT}/api/results") 190 | 191 | # Create server with socket reuse to handle TIME_WAIT state 192 | class ReusableTCPServer(socketserver.TCPServer): 193 | allow_reuse_address = True 194 | 195 | with ReusableTCPServer(("", SERVER_PORT), APIHandler) as httpd: 196 | httpd.serve_forever() 197 | 198 | if __name__ == "__main__": 199 | run_server() -------------------------------------------------------------------------------- /server.py: 
-------------------------------------------------------------------------------- 1 | import http.server 2 | import json 3 | import os 4 | import socketserver 5 | from urllib.parse import parse_qs, urlparse 6 | 7 | from shared import ( 8 | DEFAULT_SERVER_PORT, 9 | EVALUATED_REPORT_PATH, 10 | TEMPLATE_PATH, 11 | GOLD_RGB, 12 | GREEN_RGB, 13 | HSL_LIGHTNESS_MIN, 14 | HSL_LIGHTNESS_RANGE, 15 | LIGHT_GREEN_RGB, 16 | RGB_MAX, 17 | calculate_model_summary, 18 | create_cell_data_dict, 19 | find_fastest_correct_per_prompt, 20 | format_accuracy, 21 | format_response_time, 22 | get_unique_prompts_and_models, 23 | group_results_by_file, 24 | interpolate_color, 25 | normalize_time_value 26 | ) 27 | 28 | SERVER_PORT = DEFAULT_SERVER_PORT 29 | 30 | class ReportHandler(http.server.SimpleHTTPRequestHandler): 31 | def do_GET(self): 32 | if self.path == '/': 33 | self.send_response(301) 34 | self.send_header('Location', '/?render=report-evaluated.json') 35 | self.end_headers() 36 | return 37 | 38 | parsed_path = urlparse(self.path) 39 | if parsed_path.path == '/': 40 | query_components = parse_qs(parsed_path.query) 41 | json_file = query_components.get("render", [None])[0] 42 | if json_file: 43 | json_path = os.path.join("answers-generated", json_file) 44 | else: 45 | json_path = EVALUATED_REPORT_PATH 46 | 47 | try: 48 | with open(json_path, 'r', encoding='utf-8') as f: 49 | results = json.load(f) 50 | 51 | with open(TEMPLATE_PATH, 'r', encoding='utf-8') as f: 52 | template = f.read() 53 | 54 | summary_table, detailed_results_header, detailed_results_body, questions_details, cell_data = self.format_results(results) 55 | 56 | html = template.replace("__SUMMARY_TABLE__", summary_table) 57 | html = html.replace("__DETAILED_RESULTS_HEADER__", detailed_results_header) 58 | html = html.replace("__DETAILED_RESULTS_BODY__", detailed_results_body) 59 | html = html.replace("__QUESTIONS_DETAILS__", questions_details) 60 | html = html.replace("__CELL_DATA__", cell_data) 61 | 62 | self.send_response(200) 63 | self.send_header("Content-type", "text/html") 64 | self.end_headers() 65 | self.wfile.write(html.encode('utf-8')) 66 | except FileNotFoundError: 67 | self.send_error(404, "Report file not found. Please run the evaluation first.") 68 | except Exception as e: 69 | self.send_error(500, f"An error occurred: {e}") 70 | else: 71 | super().do_GET() 72 | 73 | def format_results(self, results): 74 | # Calculate summary using shared function 75 | model_summary = calculate_model_summary(results) 76 | 77 | summary_table = "" 78 | for model, stats in model_summary.items(): 79 | total = stats["total"] 80 | correct = stats["correct"] 81 | total_time = stats["total_time"] 82 | accuracy = (correct / total) * 100 if total > 0 else 0 83 | avg_time = total_time / total if total > 0 else 0 84 | summary_table += f""" 85 | 86 | {model} 87 | 88 | {correct}/{total} ({accuracy:.1f}%) 89 |
90 |
91 |
92 | 93 | {avg_time:.2f}s 94 | 95 | """ 96 | 97 | # Detailed results grid 98 | prompts = sorted(list(set(r["file"] for r in results))) 99 | models = sorted(list(set(r["model"] for r in results))) 100 | 101 | all_response_times = [r["response_time"] for r in results] 102 | min_time = min(all_response_times) 103 | max_time = max(all_response_times) 104 | time_range = max_time - min_time if max_time != min_time else 1 105 | 106 | detailed_results_header = "" 107 | for prompt in prompts: 108 | detailed_results_header += f"{prompt}" 109 | 110 | # Find fastest correct test for each prompt file using shared function 111 | fastest_correct_per_prompt = find_fastest_correct_per_prompt(results, prompts) 112 | 113 | detailed_results_body = "" 114 | for model in models: 115 | detailed_results_body += f"{model}" 116 | for prompt in prompts: 117 | cell_style = "" 118 | response_time_text = "" 119 | is_fastest_correct = False 120 | 121 | for r in results: 122 | if r["model"] == model and r["file"] == prompt: 123 | normalized_time = (r["response_time"] - min_time) / time_range 124 | response_time_text = f'
{r["response_time"]:.2f}s
' 125 | 126 | # Check if this is the fastest correct test for this prompt 127 | is_fastest_correct = (r["correct"] and 128 | prompt in fastest_correct_per_prompt and 129 | fastest_correct_per_prompt[prompt] == model) 130 | 131 | if r["correct"]: 132 | if is_fastest_correct: 133 | # Highlight fastest correct with gold/yellow 134 | r_val, g_val, b_val = interpolate_color(GOLD_RGB, GREEN_RGB, normalized_time) 135 | cell_style = f' style="background-color: rgb({r_val}, {g_val}, {b_val}); border: 2px solid #FFD700; box-shadow: 0 0 5px rgba(255, 215, 0, 0.5);"' 136 | else: 137 | # Regular correct answers 138 | r_val, g_val, b_val = interpolate_color(GREEN_RGB, LIGHT_GREEN_RGB, normalized_time) 139 | cell_style = f' style="background-color: rgb({r_val}, {g_val}, {b_val});"' 140 | else: 141 | lightness = int(HSL_LIGHTNESS_MIN + HSL_LIGHTNESS_RANGE * normalized_time) 142 | cell_style = f' style="background-color: hsl(0, 100%, {lightness}%);"' 143 | break 144 | 145 | if is_fastest_correct: 146 | response_time_text = '
⭐ ' + response_time_text.split('>')[1] if '>' in response_time_text else response_time_text 147 | 148 | cell_id = f"{model}-{prompt}" 149 | detailed_results_body += f'{response_time_text}' 150 | detailed_results_body += "" 151 | 152 | # Questions details using shared function 153 | results_by_file = group_results_by_file(results) 154 | 155 | questions_details = "" 156 | for file, data in results_by_file.items(): 157 | questions_details += f""" 158 |
159 |
▶ {file}
160 |
161 |

Prompt:

162 |
{data['prompt']}
163 |

Expected Answer:

164 |
{data['expected']}
165 |
166 | """ 167 | for model_result in data['models']: 168 | correct_class = "correct" if model_result['correct'] else "incorrect" 169 | questions_details += f""" 170 |
171 |

{model_result['model']}

172 |

Generated Answer:

173 |
{model_result['generated']}
174 |

Response Time: {model_result['response_time']:.2f}s

175 |
176 | """ 177 | questions_details += """ 178 |
179 |
180 | """ 181 | 182 | # Cell data for JavaScript using shared function 183 | cell_data_dict = create_cell_data_dict(results) 184 | cell_data = json.dumps(json.dumps(cell_data_dict)) 185 | 186 | return summary_table, detailed_results_header, detailed_results_body, questions_details, cell_data 187 | 188 | if __name__ == "__main__": 189 | with socketserver.TCPServer(("", SERVER_PORT), ReportHandler) as httpd: 190 | print("serving at port", SERVER_PORT) 191 | httpd.serve_forever() 192 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LLM Eval Simple 2 | 3 | [![Python Version](https://img.shields.io/badge/python-3.13+-blue.svg)](https://www.python.org/downloads/) 4 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 5 | 6 | ![benchmark report](./static/benchmark-report.png) 7 | 8 | A simple tool for evaluating Large Language Models (LLMs) using a set of prompts and expected answers. It supports testing multiple models via an OpenAI-compatible API endpoint, measures response times, evaluates correctness (using an optional evaluator model or exact matching), and generates a summary report in tabular format. 9 | 10 | This script is useful for benchmarking LLM performance on custom datasets, such as accuracy on specific tasks or questions. 11 | 12 | ## Features 13 | 14 | - **Batch Testing**: Evaluate multiple LLM models simultaneously 15 | - **Flexible Evaluation**: Use AI evaluator models or exact string matching 16 | - **Performance Metrics**: Track response times and accuracy 17 | - **Rich Reporting**: Detailed tables and summary statistics 18 | - **Web Dashboard**: Interactive visualization of results 19 | - **Configurable**: Environment-based configuration for different setups 20 | 21 | ## Quick Start 22 | 23 | ```bash 24 | # Install dependencies 25 | uv sync 26 | 27 | # Copy environment configuration 28 | cp .env.example .env 29 | 30 | # Edit .env with your API endpoint and model names 31 | # Then run evaluation 32 | uv run python main.py 33 | 34 | # Start web dashboard 35 | ./start-dashboard.sh 36 | ``` 37 | 38 | ## Project Structure 39 | 40 | ``` 41 | llm-eval-simple/ 42 | ├── main.py # Main evaluation script 43 | ├── api_server.py # REST API server 44 | ├── server.py # Web server for dashboard 45 | ├── api_client.py # OpenAI-compatible API client 46 | ├── reporting.py # Result formatting and display 47 | ├── validation.py # Input validation utilities 48 | ├── file_utils.py # File operations and utilities 49 | ├── shared.py # Shared constants and utilities 50 | ├── prompts/ # Input prompt files 51 | ├── answers/ # Expected answer files 52 | ├── frontend/ # React dashboard application 53 | ├── tests/ # Backend test suite 54 | └── static/ # Static assets 55 | ``` 56 | 57 | ## Prerequisites 58 | 59 | - Python 3.13+. 60 | - [uv](https://github.com/astral-sh/uv) installed for dependency management (a fast alternative to pip and venv). 61 | - Access to an OpenAI-compatible API endpoint (e.g., local server or hosted service) for model inference. 62 | - Directories for prompts and answers (created automatically if missing). 63 | 64 | ## Installation 65 | 66 | 1. Clone the repository: 67 | 68 | ```bash 69 | git clone 70 | cd llm-eval-simple 71 | ``` 72 | 73 | 2. 
Install dependencies using uv: 74 | 75 | ```bash 76 | uv sync 77 | ``` 78 | 79 | This will create a virtual environment and install all required packages from `pyproject.toml` or `requirements.txt`. 80 | 81 | If you prefer not to use uv, you can manually install dependencies: 82 | 83 | ```bash 84 | python -m venv .venv 85 | source .venv/bin/activate # On Unix-based systems 86 | # or .venv\Scripts\activate on Windows 87 | pip install -r requirements.txt 88 | ``` 89 | 90 | Note: The script assumes uv for running, but you can adapt it for standard Python. 91 | 92 | ## Configuration 93 | 94 | 1. Create a `.env` file in the root directory based on `.env.example`: 95 | 96 | ```bash 97 | cp .env.example .env 98 | ``` 99 | 100 | Edit `.env` with your settings: 101 | - `ENDPOINT_URL`: Your OpenAI-compatible API endpoint (default: `http://localhost:9292/v1/chat/completions`). 102 | - `API_KEY`: Your API key for authentication with the OpenAI-compatible API (optional). 103 | - `MODEL_NAMES`: Comma-separated list of model names to test (e.g., `gemma-3-270m-it-Q4_K_M,Qwen3-8B-Q4_K_M`). 104 | - `MODEL_EVALUATOR`: Optional model name for evaluating correctness (if empty, uses exact matching). 105 | 106 | 2. Prepare your test data: 107 | - Place prompt files in the `prompts` directory (e.g., `1-math-question.txt`). 108 | - Place corresponding expected answer files in the `answers` directory with matching names (e.g., `1-math-question.txt`). 109 | - Files should contain plain text: prompts for input to the model, answers for comparison. 110 | - Use consistent naming and ensure files are UTF-8 encoded. 111 | 112 | ## Usage 113 | 114 | ### Running Evaluations 115 | 116 | Run the evaluation script with customizable actions: 117 | 118 | ```bash 119 | # Run all actions (answer, evaluate, render, serve) 120 | uv run python main.py 121 | 122 | # Run specific actions with pattern filtering 123 | uv run python main.py --actions answer,evaluate,serve --pattern "prompts/REASON*" 124 | 125 | # Available actions: 126 | # - answer: Generate model responses for prompts 127 | # - evaluate: Evaluate correctness of responses 128 | # - render: Display results in terminal 129 | # - serve: Start web dashboard 130 | ``` 131 | 132 | ### Starting the Dashboard 133 | 134 | For a web-based dashboard to view results, use the `start-dashboard.sh` script: 135 | 136 | ```bash 137 | ./start-dashboard.sh 138 | ``` 139 | 140 | This will start: 141 | - API server on port 4000 142 | - Web UI on port 3000 143 | 144 | Alternatively, start components manually: 145 | ```bash 146 | # Start API server 147 | uv run python api_server.py 148 | 149 | # Start web UI (in another terminal) 150 | cd frontend && npm run dev 151 | ``` 152 | 153 | - This will process all prompt files, test each model, evaluate results, and print detailed per-file results followed by a summary table. 154 | - Output includes: 155 | - Per-model testing logs. 156 | - Detailed table with model, file, correctness, and response time. 157 | - Summary table with accuracy percentage and average response time. 158 | 159 | Example output snippet: 160 | 161 | ```text 162 | ... 
163 | ├────────────────────────────────┼───────────────────────────────┼───────────┼─────────────────┤ 164 | │ gemma-3-27b-it-qat-q4_0-q3_k_m │ REASON-column-words.txt │ 𐄂 │ 14.53s │ 165 | ├────────────────────────────────┼───────────────────────────────┼───────────┼─────────────────┤ 166 | │ gemma-3-27b-it-qat-q4_0-q3_k_m │ REASON-ramarronero.txt │ 𐄂 │ 5.85s │ 167 | ├────────────────────────────────┼───────────────────────────────┼───────────┼─────────────────┤ 168 | │ gpt-oss-20b-mxfp4 │ 1-capital-italy.txt │ 🮱 │ 26.83s │ 169 | ├────────────────────────────────┼───────────────────────────────┼───────────┼─────────────────┤ 170 | │ gpt-oss-20b-mxfp4 │ BIGCONTEXT-kuleba.txt │ 🮱 │ 48.03s │ 171 | ├────────────────────────────────┼───────────────────────────────┼───────────┼─────────────────┤ 172 | │ gpt-oss-20b-mxfp4 │ CODING-typescript-rust.txt │ 🮱 │ 33.07s │ 173 | ├────────────────────────────────┼───────────────────────────────┼───────────┼─────────────────┤ 174 | │ gpt-oss-20b-mxfp4 │ EXTRACT-USDT-APY.txt │ 🮱 │ 133.22s │ 175 | ├────────────────────────────────┼───────────────────────────────┼───────────┼─────────────────┤ 176 | │ gpt-oss-20b-mxfp4 │ KNOWLEDGE-translate-pesca.txt │ 🮱 │ 18.67s │ 177 | ├────────────────────────────────┼───────────────────────────────┼───────────┼─────────────────┤ 178 | │ gpt-oss-20b-mxfp4 │ MATH-battery-discarge.txt │ 🮱 │ 29.25s │ 179 | ├────────────────────────────────┼───────────────────────────────┼───────────┼─────────────────┤ 180 | │ gpt-oss-20b-mxfp4 │ REASON-column-words.txt │ 🮱 │ 81.82s │ 181 | ├────────────────────────────────┼───────────────────────────────┼───────────┼─────────────────┤ 182 | │ gpt-oss-20b-mxfp4 │ REASON-ramarronero.txt │ 🮱 │ 16.90s │ 183 | ╘════════════════════════════════╧═══════════════════════════════╧═══════════╧═════════════════╛ 184 | 185 | Model Performance Summary 186 | ╒════════════════════════════════╤═══════════════════════════╤═════════════════════╕ 187 | │ Model │ Correct │ Avg Response Time │ 188 | ╞════════════════════════════════╪═══════════════════════════╪═════════════════════╡ 189 | │ Qwen3-4B-IQ4_NL │ 5/8 (62.5%) [██████░░░░] │ 87.92s │ 190 | ├────────────────────────────────┼───────────────────────────┼─────────────────────┤ 191 | │ gemma-3-27b-it-qat-q4_0-q3_k_m │ 6/8 (75.0%) [███████░░░] │ 112.57s │ 192 | ├────────────────────────────────┼───────────────────────────┼─────────────────────┤ 193 | │ gpt-oss-20b-mxfp4 │ 8/8 (100.0%) [██████████] │ 48.47s │ 194 | ╘════════════════════════════════╧═══════════════════════════╧═════════════════════╛ 195 | ``` 196 | 197 | ## Troubleshooting 198 | 199 | - **API Errors**: Ensure your endpoint is running and accessible. Check the URL and model names in `.env`. 200 | - **Evaluator Failures**: If using `MODEL_EVALUATOR`, it should return "CORRECT" or "INCORRECT". The script now handles variations like "not correct". 201 | - **No Matching Answers**: The script skips prompts without corresponding answer files. 202 | - **Dependencies**: If uv is not installed, download it from [astral-sh/uv](https://github.com/astral-sh/uv). 203 | - **Customization**: Modify `main.py` for advanced features, like adding more metrics or output formats. 204 | 205 | ## Testing 206 | 207 | The project includes comprehensive testing with 80%+ code coverage. 
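Backend unit tests follow one pattern throughout `tests/unit/`: build a small list of result dictionaries and assert on the output of the shared helpers. The sketch below is illustrative rather than copied from the suite; it assumes `calculate_model_summary` aggregates counts and cumulative response time per model in the shape that `server.py` and `api_server.py` read back (`total`, `correct`, `total_time`), and the sample values are made up.

```python
# Minimal sketch of a backend unit test (assumed helper behaviour, not the real suite).
from shared import calculate_model_summary


def test_calculate_model_summary_aggregates_per_model():
    # Two results for the same model: one correct, one incorrect.
    results = [
        {"model": "m1", "file": "a.txt", "correct": True, "response_time": 1.0},
        {"model": "m1", "file": "b.txt", "correct": False, "response_time": 3.0},
    ]

    summary = calculate_model_summary(results)

    # server.py derives accuracy and average time from these three fields.
    assert summary["m1"]["total"] == 2
    assert summary["m1"]["correct"] == 1
    assert summary["m1"]["total_time"] == 4.0
```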
208 | 209 | ### Backend Tests (pytest) 210 | 211 | ```bash 212 | # Run all tests with coverage 213 | uv run pytest 214 | 215 | # Run specific test file 216 | uv run pytest tests/unit/test_main.py 217 | 218 | # Run with coverage report 219 | uv run pytest --cov=. --cov-report=html 220 | ``` 221 | 222 | ### Frontend Tests (Vitest) 223 | 224 | ```bash 225 | cd frontend 226 | 227 | # Run all tests 228 | npm run test:run 229 | 230 | # Run with coverage 231 | npm run test:coverage 232 | 233 | # Run in watch mode 234 | npm run test 235 | ``` 236 | 237 | Test structure: `tests/unit/` for backend, `src/test/` for frontend. Both support unit and integration tests with proper mocking. 238 | 239 | ## Contributing 240 | 241 | We welcome contributions! Please: 242 | 243 | 1. Fork the repository 244 | 2. Create a feature branch (`git checkout -b feature/amazing-feature`) 245 | 3. Add tests for new functionality 246 | 4. Ensure all tests pass (`uv run pytest && cd frontend && npm run test:run`) 247 | 5. Follow existing code style and patterns 248 | 6. Submit a pull request 249 | 250 | For questions or issues, please open a GitHub issue. 251 | 252 | ## License 253 | 254 | MIT License. See [`LICENSE`](LICENSE) for details. 255 | -------------------------------------------------------------------------------- /validation.py: -------------------------------------------------------------------------------- 1 | """Input validation models and utilities for LLM evaluation.""" 2 | 3 | import os 4 | import re 5 | from pathlib import Path 6 | from typing import List, Dict, Any, Optional 7 | from urllib.parse import urlparse 8 | 9 | from pydantic import BaseModel, Field, validator, HttpUrl 10 | 11 | 12 | class APIRequest(BaseModel): 13 | """Model for API request validation.""" 14 | model: str = Field(..., min_length=1, max_length=100) 15 | messages: List[Dict[str, str]] = Field(..., min_items=1) 16 | stream: bool = Field(default=False) 17 | 18 | @validator('messages') 19 | def validate_messages(cls, v): 20 | """Validate message structure.""" 21 | if not v: 22 | raise ValueError('Messages list cannot be empty') 23 | 24 | for i, message in enumerate(v): 25 | if not isinstance(message, dict): 26 | raise ValueError(f'Message {i} must be a dictionary') 27 | 28 | required_keys = {'role', 'content'} 29 | if not required_keys.issubset(message.keys()): 30 | raise ValueError(f'Message {i} must contain {required_keys}') 31 | 32 | valid_roles = {'system', 'user', 'assistant'} 33 | if message['role'] not in valid_roles: 34 | raise ValueError(f'Message {i} role must be one of {valid_roles}') 35 | 36 | if not message['content'].strip(): 37 | raise ValueError(f'Message {i} content cannot be empty') 38 | 39 | return v 40 | 41 | 42 | class EvaluationRequest(BaseModel): 43 | """Model for evaluation request validation.""" 44 | expected_answer: str = Field(..., min_length=1) 45 | generated_answer: str = Field(..., min_length=1) 46 | evaluator_model: str = Field(..., min_length=1) 47 | 48 | @validator('expected_answer', 'generated_answer') 49 | def validate_answers(cls, v): 50 | """Validate answer content.""" 51 | if not v or not v.strip(): 52 | raise ValueError('Answer cannot be empty') 53 | if len(v) > 10000: # Reasonable limit 54 | raise ValueError('Answer too long (max 10000 characters)') 55 | return v.strip() 56 | 57 | 58 | class ConfigValidation(BaseModel): 59 | """Enhanced configuration validation model.""" 60 | endpoint_url: str = Field(..., min_length=1) 61 | model_names: List[str] = Field(..., min_items=1) 62 | model_evaluator: 
str = Field(..., min_length=1) 63 | pattern: str = Field(..., min_length=1) 64 | actions: List[str] = Field(..., min_items=1) 65 | api_key: Optional[str] = None 66 | throttling_secs: float = Field(..., ge=0) 67 | prompt_dir: str = Field(..., min_length=1) 68 | answer_dir: str = Field(..., min_length=1) 69 | 70 | @validator('endpoint_url') 71 | def validate_endpoint_url(cls, v): 72 | """Validate endpoint URL format.""" 73 | if not v: 74 | raise ValueError('Endpoint URL cannot be empty') 75 | 76 | try: 77 | parsed = urlparse(v) 78 | if parsed.scheme not in ('http', 'https'): 79 | raise ValueError('URL must use http or https protocol') 80 | if not parsed.netloc: 81 | raise ValueError('URL must have a valid host') 82 | return v.rstrip('/') 83 | except Exception: 84 | raise ValueError('Invalid URL format') 85 | 86 | @validator('model_names', 'model_evaluator') 87 | def validate_model_names(cls, v): 88 | """Validate model names.""" 89 | if isinstance(v, str): 90 | v = [v] 91 | 92 | for name in v: 93 | if not name or not name.strip(): 94 | raise ValueError('Model names cannot be empty') 95 | if len(name) > 100: 96 | raise ValueError('Model name too long (max 100 characters)') 97 | # Allow alphanumeric, hyphens, underscores, dots 98 | if not re.match(r'^[a-zA-Z0-9._-]+$', name): 99 | raise ValueError(f'Invalid model name: {name}') 100 | 101 | return v 102 | 103 | @validator('actions') 104 | def validate_actions(cls, v): 105 | """Validate actions list.""" 106 | valid_actions = {'answer', 'evaluate', 'render', 'serve'} 107 | for action in v: 108 | if action not in valid_actions: 109 | raise ValueError(f'Invalid action: {action}. Valid actions: {valid_actions}') 110 | return v 111 | 112 | @validator('pattern') 113 | def validate_pattern(cls, v): 114 | """Validate glob pattern.""" 115 | if not v or not v.strip(): 116 | raise ValueError('Pattern cannot be empty') 117 | 118 | # Basic security check for path traversal 119 | if '..' in v: 120 | raise ValueError('Pattern cannot contain path traversal') 121 | 122 | return v.strip() 123 | 124 | 125 | 126 | @validator('prompt_dir', 'answer_dir') 127 | def validate_directories(cls, v): 128 | """Validate directory paths.""" 129 | if not v or not v.strip(): 130 | raise ValueError('Directory cannot be empty') 131 | 132 | # Security check for path traversal 133 | if '..' 
in v: 134 | raise ValueError('Directory cannot contain path traversal') 135 | 136 | return v.strip() 137 | 138 | 139 | class FileOperationValidator: 140 | """Security utilities for file operations.""" 141 | 142 | MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB 143 | ALLOWED_EXTENSIONS = {'.txt', '.md', '.py', '.js', '.jsx', '.ts', '.tsx', '.json'} 144 | BASE_DIRECTORIES = {'prompts', 'answers', 'generated_answers'} 145 | 146 | @staticmethod 147 | def validate_file_path(file_path: str, base_dir: str = None) -> Path: 148 | """Validate file path for security.""" 149 | try: 150 | # Handle both relative and absolute paths 151 | if os.path.isabs(file_path): 152 | path = Path(file_path) 153 | else: 154 | # For relative paths, resolve from current directory 155 | path = Path(file_path).resolve() 156 | 157 | # Check for path traversal 158 | if base_dir: 159 | base_path = Path(base_dir).resolve() 160 | if not str(path).startswith(str(base_path)): 161 | raise ValueError(f'Path traversal detected: {file_path}') 162 | 163 | # Check file extension - allow if no extension (for directories) 164 | if path.suffix and path.suffix.lower() not in FileOperationValidator.ALLOWED_EXTENSIONS: 165 | raise ValueError(f'File type not allowed: {path.suffix}') 166 | 167 | return path 168 | except Exception as e: 169 | raise ValueError(f'Invalid file path: {e}') 170 | 171 | @staticmethod 172 | def validate_file_size(file_path: Path) -> bool: 173 | """Validate file size.""" 174 | if file_path.exists() and file_path.stat().st_size > FileOperationValidator.MAX_FILE_SIZE: 175 | raise ValueError(f'File too large: {file_path}') 176 | return True 177 | 178 | @staticmethod 179 | def validate_content_length(content: str, max_length: int = 50000) -> str: 180 | """Validate content length.""" 181 | if len(content) > max_length: 182 | raise ValueError(f'Content too long: {len(content)} > {max_length}') 183 | return content 184 | 185 | 186 | class APIResponseValidator: 187 | """Utilities for validating API responses.""" 188 | 189 | @staticmethod 190 | def validate_openai_response(response: Dict[str, Any]) -> Dict[str, Any]: 191 | """Validate OpenAI-compatible API response.""" 192 | if not isinstance(response, dict): 193 | raise ValueError('Response must be a dictionary') 194 | 195 | if 'choices' not in response: 196 | raise ValueError('Response missing required field: choices') 197 | 198 | choices = response['choices'] 199 | if not isinstance(choices, list) or not choices: 200 | raise ValueError('Choices must be a non-empty list') 201 | 202 | choice = choices[0] 203 | if not isinstance(choice, dict): 204 | raise ValueError('Choice must be a dictionary') 205 | 206 | if 'message' not in choice: 207 | raise ValueError('Choice missing required field: message') 208 | 209 | message = choice['message'] 210 | if not isinstance(message, dict): 211 | raise ValueError('Message must be a dictionary') 212 | 213 | if 'content' not in message: 214 | raise ValueError('Message missing required field: content') 215 | 216 | content = message['content'] 217 | if not isinstance(content, str): 218 | raise ValueError('Content must be a string') 219 | 220 | return response 221 | 222 | @staticmethod 223 | def sanitize_content(content: str, max_length: int = 10000) -> str: 224 | """Sanitize and validate content.""" 225 | if not isinstance(content, str): 226 | raise ValueError('Content must be a string') 227 | 228 | # Remove potential HTML/script tags for security 229 | content = re.sub(r']*>.*?', '', content, flags=re.IGNORECASE | re.DOTALL) 230 | content = 
re.sub(r'<[^>]+>', '', content) 231 | 232 | # Limit length 233 | if len(content) > max_length: 234 | content = content[:max_length] + '...' 235 | 236 | return content.strip() 237 | 238 | 239 | def validate_glob_pattern(pattern: str) -> str: 240 | """Validate glob pattern for security.""" 241 | if not pattern or not pattern.strip(): 242 | raise ValueError('Pattern cannot be empty') 243 | 244 | pattern = pattern.strip() 245 | 246 | # Security checks 247 | if '..' in pattern: 248 | raise ValueError('Pattern cannot contain path traversal') 249 | 250 | # Check for dangerous patterns (more specific checks) 251 | if pattern == '/*' or pattern == '/.*': 252 | raise ValueError('Dangerous pattern detected: absolute root patterns not allowed') 253 | if pattern.startswith('~') or '$HOME' in pattern: 254 | raise ValueError('Dangerous pattern detected: home directory patterns not allowed') 255 | 256 | return pattern 257 | 258 | 259 | def validate_model_list(model_names: str) -> List[str]: 260 | """Validate and parse model names list.""" 261 | if not model_names or not model_names.strip(): 262 | raise ValueError('Model names cannot be empty') 263 | 264 | models = [name.strip() for name in model_names.split(',') if name.strip()] 265 | 266 | if not models: 267 | raise ValueError('No valid model names found') 268 | 269 | for model in models: 270 | if not re.match(r'^[a-zA-Z0-9._-]+$', model): 271 | raise ValueError(f'Invalid model name: {model}') 272 | 273 | return models -------------------------------------------------------------------------------- /frontend/src/components/ResultsMatrix.jsx: -------------------------------------------------------------------------------- 1 | import { useState } from 'react'; 2 | import { Modal } from './ui/Modal'; 3 | import { getHeatmapColor, getResponseTimeStats } from '../utils/heatmapUtils'; 4 | import { useDarkMode } from '../hooks/useDarkMode'; 5 | 6 | export const ResultsMatrix = ({ results }) => { 7 | const [selectedCell, setSelectedCell] = useState(null); 8 | const { isDark } = useDarkMode(); 9 | 10 | // Add safety checks 11 | if (!results || !Array.isArray(results) || results.length === 0) { 12 | return ( 13 |
14 |
15 |

16 | Evaluation Heatmap 17 |

18 |

19 | No results available to display. 20 |

21 |
22 |
23 | ); 24 | } 25 | 26 | // Get unique models and files safely 27 | const models = [...new Set(results.map(r => r?.model).filter(Boolean))].sort(); 28 | const prompts = [...new Set(results.map(r => r?.file).filter(Boolean))].sort(); 29 | 30 | // Calculate response time stats for color normalization 31 | const { min: minTime, max: maxTime } = getResponseTimeStats(results); 32 | 33 | // Find the fastest correct result for each test (file) 34 | const fastestCorrectByFile = {}; 35 | results.forEach(result => { 36 | if (result.correct && typeof result.response_time === 'number') { 37 | const file = result.file; 38 | if (!fastestCorrectByFile[file] || result.response_time < fastestCorrectByFile[file].response_time) { 39 | fastestCorrectByFile[file] = result; 40 | } 41 | } 42 | }); 43 | 44 | const handleCellClick = (model, file) => { 45 | const result = results.find(r => r.model === model && r.file === file); 46 | if (result) { 47 | setSelectedCell({ 48 | model: result.model, 49 | file: result.file, 50 | prompt: result.prompt, 51 | generated: result.generated, 52 | expected: result.expected, 53 | response_time: result.response_time?.toFixed(2) + 's', 54 | correct: result.correct, 55 | evaluator_model: result.evaluator_model, 56 | note: result.note || '' 57 | }); 58 | } 59 | }; 60 | 61 | const navigateToModel = (direction) => { 62 | if (!selectedCell) return; 63 | 64 | const currentIndex = models.indexOf(selectedCell.model); 65 | let newIndex; 66 | 67 | if (direction === 'prev') { 68 | newIndex = currentIndex > 0 ? currentIndex - 1 : models.length - 1; 69 | } else { 70 | newIndex = currentIndex < models.length - 1 ? currentIndex + 1 : 0; 71 | } 72 | 73 | const nextModel = models[newIndex]; 74 | const result = results.find(r => r.model === nextModel && r.file === selectedCell.file); 75 | 76 | if (result) { 77 | setSelectedCell({ 78 | model: result.model, 79 | file: result.file, 80 | prompt: result.prompt, 81 | generated: result.generated, 82 | expected: result.expected, 83 | response_time: result.response_time?.toFixed(2) + 's', 84 | correct: result.correct, 85 | evaluator_model: result.evaluator_model, 86 | note: result.note || '' 87 | }); 88 | } 89 | }; 90 | 91 | const getHeatmapCell = (model, prompt) => { 92 | const result = results.find(r => r.model === model && r.file === prompt); 93 | 94 | if (!result || typeof result.response_time !== 'number') { 95 | return ( 96 | 100 | - 101 | 102 | ); 103 | } 104 | 105 | const colors = getHeatmapColor(result.correct, result.response_time, minTime, maxTime, isDark); 106 | const isFastestCorrect = fastestCorrectByFile[prompt] === result; 107 | 108 | return ( 109 | handleCellClick(model, prompt)} 117 | onMouseEnter={(e) => { 118 | e.currentTarget.style.backgroundColor = colors.hover; 119 | e.currentTarget.style.transform = 'scale(1.05)'; 120 | }} 121 | onMouseLeave={(e) => { 122 | e.currentTarget.style.backgroundColor = colors.bg; 123 | e.currentTarget.style.transform = 'scale(1)'; 124 | }} 125 | > 126 |
127 | {isFastestCorrect && ( 128 | 129 | )} 130 | {result.response_time.toFixed(2)}s 131 |
132 |
134 | 135 | ); 136 | }; 137 | 138 | return ( 139 | <> 140 |
141 |
142 |
143 |

144 | Evaluation Heatmap 145 |

146 |
147 |
148 |
149 | Fast & Correct 150 |
151 |
152 |
153 | Slow & Correct 154 |
155 |
156 |
157 | Fast & Wrong 158 |
159 |
160 |
161 | Slow & Wrong 162 |
163 |
164 |
165 | 166 |
167 | 168 | 169 | 170 | 173 | {prompts.map(prompt => ( 174 | 179 | ))} 180 | 181 | 182 | 183 | {models.map(model => ( 184 | 185 | 190 | {prompts.map(prompt => getHeatmapCell(model, prompt))} 191 | 192 | ))} 193 | 194 |
171 | Model 172 | 175 |
176 | {prompt} 177 |
178 |
186 |
187 | {model} 188 |
189 |
195 |
196 |
197 |
198 | 199 | setSelectedCell(null)} 202 | title={`${selectedCell?.model} - ${selectedCell?.file}`} 203 | showNavigation={!!selectedCell} 204 | onPrev={() => navigateToModel('prev')} 205 | onNext={() => navigateToModel('next')} 206 | navLabel={selectedCell ? `${models.indexOf(selectedCell.model) + 1} / ${models.length}` : null} 207 | > 208 | {selectedCell && ( 209 |
210 | 211 |
212 | 217 | {selectedCell.correct ? 'Correct' : 'Incorrect'} 218 | 219 | 220 | Response Time: {selectedCell.response_time} 221 | 222 | {selectedCell.evaluator_model && ( 223 | 224 | Evaluator: {selectedCell.evaluator_model} 225 | 226 | )} 227 |
228 | 229 |
230 |

231 | Prompt: 232 |

233 |
234 |                 {selectedCell.prompt}
235 |               
236 |
237 | 238 |
239 |

240 | Expected Answer: 241 |

242 |
243 |                 {selectedCell.expected}
244 |               
245 |
246 | 247 |
248 |

249 | Generated Answer: 250 |

251 |
252 |                 {selectedCell.generated}
253 |               
254 |
255 | 256 | {selectedCell.note && ( 257 |
258 |

259 | Evaluation Note: 260 |

261 |
262 |

263 | {selectedCell.note} 264 |

265 |
266 |
267 | )} 268 |
269 | )} 270 |
271 | 272 | ); 273 | }; -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | from dataclasses import dataclass, field 5 | from typing import List, Dict, Any 6 | import re 7 | from pathlib import Path 8 | 9 | from dotenv import load_dotenv 10 | 11 | from shared import GENERATED_ANSWERS_DIR, EVALUATED_REPORT_PATH, RAW_REPORT_PATH 12 | from api_client import get_model_response, evaluate_correctness 13 | from file_utils import get_prompt_files, read_file_content, save_raw_results, load_raw_results, save_evaluated_results, load_evaluated_results, ensure_directory_exists 14 | from reporting import format_detailed_table, format_matrix_table, format_summary_table 15 | from validation import ConfigValidation, FileOperationValidator, validate_glob_pattern, validate_model_list 16 | 17 | # --- Constants --- 18 | 19 | # Default configuration values 20 | DEFAULT_PROMPT_DIR = "prompts" 21 | DEFAULT_ANSWER_DIR = "answers" 22 | DEFAULT_ENDPOINT_URL = "http://localhost:9292/v1/chat/completions" 23 | DEFAULT_MODEL_EVALUATOR = "some-quite-powerful-model-8B" 24 | DEFAULT_PATTERN = "*" 25 | DEFAULT_ACTIONS = ["answer", "evaluate", "render", "serve"] 26 | DEFAULT_THROTTLING_SECS = 0.1 27 | 28 | # --- Configuration --- 29 | 30 | @dataclass 31 | class Config: 32 | """Holds the configuration for the evaluation script.""" 33 | prompt_dir: str = DEFAULT_PROMPT_DIR 34 | answer_dir: str = DEFAULT_ANSWER_DIR 35 | endpoint_url: str = DEFAULT_ENDPOINT_URL 36 | model_names: List[str] = field(default_factory=list) 37 | model_evaluator: str = DEFAULT_MODEL_EVALUATOR 38 | pattern: str = DEFAULT_PATTERN 39 | actions: List[str] = field(default_factory=list) 40 | api_key: str = None 41 | throttling_secs: float = DEFAULT_THROTTLING_SECS 42 | custom_report_json: str = None 43 | 44 | def validate(self) -> List[str]: 45 | """Validate configuration using Pydantic models.""" 46 | try: 47 | # Use Pydantic for comprehensive validation 48 | config_validation = ConfigValidation( 49 | endpoint_url=self.endpoint_url, 50 | model_names=self.model_names, 51 | model_evaluator=self.model_evaluator, 52 | pattern=self.pattern, 53 | actions=self.actions, 54 | api_key=self.api_key, 55 | throttling_secs=self.throttling_secs, 56 | prompt_dir=self.prompt_dir, 57 | answer_dir=self.answer_dir 58 | ) 59 | # If validation passes, return empty errors list 60 | return [] 61 | except Exception as e: 62 | return [str(e)] 63 | 64 | def load_config() -> Config: 65 | """Loads configuration from environment variables and command-line arguments.""" 66 | load_dotenv() 67 | 68 | parser = argparse.ArgumentParser(description="Test models on prompts with optional filtering.") 69 | parser.add_argument('--pattern', type=str, default="prompts/*", help="Glob pattern to filter prompt files (e.g., '*CODE*')") 70 | parser.add_argument('--actions', type=str, default="answer,evaluate,render,serve", help="Comma-separated list of actions to perform (answer,evaluate,render,serve)") 71 | parser.add_argument('--report-json', type=str, help="Path to custom JSON report file (overrides default)") 72 | args = parser.parse_args() 73 | 74 | try: 75 | # Validate and parse pattern 76 | pattern = validate_glob_pattern(args.pattern) 77 | 78 | # Validate and parse model names 79 | model_names_str = os.getenv("MODEL_NAMES", "gemma-3-270m-it-Q4_K_M,Qwen3-8B-Q4_K_M") 80 | model_names = validate_model_list(model_names_str) 81 | 
82 | # Parse and validate actions 83 | actions_str = args.actions 84 | actions = [action.strip() for action in actions_str.split(',') if action.strip()] 85 | 86 | # Parse throttling with validation 87 | throttling_str = os.getenv("THROTTLING_SECS", str(DEFAULT_THROTTLING_SECS)) 88 | try: 89 | throttling_secs = float(throttling_str) 90 | if throttling_secs < 0: 91 | raise ValueError("throttling_secs must be non-negative") 92 | except ValueError as e: 93 | raise ValueError(f"Invalid THROTTLING_SECS: {e}") 94 | 95 | config = Config( 96 | prompt_dir=DEFAULT_PROMPT_DIR, 97 | answer_dir=DEFAULT_ANSWER_DIR, 98 | endpoint_url=os.getenv("ENDPOINT_URL", DEFAULT_ENDPOINT_URL), 99 | model_names=model_names, 100 | model_evaluator=os.getenv("MODEL_EVALUATOR", DEFAULT_MODEL_EVALUATOR), 101 | pattern=pattern, 102 | actions=actions, 103 | api_key=os.getenv("API_KEY"), 104 | throttling_secs=throttling_secs 105 | ) 106 | 107 | # Store custom report path if provided 108 | if args.report_json: 109 | config.custom_report_json = args.report_json 110 | 111 | # Validate configuration using Pydantic 112 | errors = config.validate() 113 | if errors: 114 | print("❌ Configuration validation errors:") 115 | for error in errors: 116 | print(f" - {error}") 117 | exit(1) 118 | 119 | # Validate directories exist 120 | prompt_path = Path(config.prompt_dir) 121 | if not prompt_path.exists(): 122 | print(f"ℹ️ Info: Prompt directory '{config.prompt_dir}' does not exist - will be created if needed") 123 | 124 | answer_path = Path(config.answer_dir) 125 | if not answer_path.exists(): 126 | print(f"ℹ️ Info: Answer directory '{config.answer_dir}' does not exist - will be created if needed") 127 | 128 | return config 129 | 130 | except ValueError as e: 131 | print(f"❌ Configuration error: {e}") 132 | exit(1) 133 | except Exception as e: 134 | print(f"❌ Unexpected configuration error: {e}") 135 | exit(1) 136 | 137 | # --- Action Functions --- 138 | 139 | 140 | # --- Action Functions --- 141 | 142 | def answer_prompt(prompt_path: str, model_name: str, config: Config) -> Dict[str, Any]: 143 | """Processes a single prompt file and returns the generated answer.""" 144 | try: 145 | # Validate file paths for security 146 | validated_prompt_path = FileOperationValidator.validate_file_path(prompt_path, config.prompt_dir) 147 | base_name = os.path.basename(prompt_path) 148 | answer_path = os.path.join(config.answer_dir, base_name) 149 | validated_answer_path = FileOperationValidator.validate_file_path(answer_path, config.answer_dir) 150 | 151 | if not validated_answer_path.exists(): 152 | print(f"Skipping {base_name}: No matching answer file found.") 153 | return None 154 | 155 | # Validate file sizes 156 | FileOperationValidator.validate_file_size(validated_prompt_path) 157 | FileOperationValidator.validate_file_size(validated_answer_path) 158 | 159 | prompt = read_file_content(prompt_path) 160 | expected_answer = read_file_content(answer_path) 161 | 162 | # Validate content length 163 | prompt = FileOperationValidator.validate_content_length(prompt, 10000) 164 | expected_answer = FileOperationValidator.validate_content_length(expected_answer, 10000) 165 | 166 | except ValueError as e: 167 | print(f"Skipping {prompt_path}: {e}") 168 | return None 169 | except Exception as e: 170 | print(f"Error validating files for {prompt_path}: {e}") 171 | return None 172 | 173 | start_time = time.time() 174 | try: 175 | response_json = get_model_response(config.endpoint_url, model_name, prompt, config.api_key, throttling_secs=config.throttling_secs) 176 
| end_time = time.time() 177 | 178 | generated_answer = response_json.get('choices', [{}])[0].get('message', {}).get('content', '').strip() 179 | 180 | result = { 181 | "model": model_name, 182 | "file": base_name, 183 | "prompt": prompt, 184 | "response_time": end_time - start_time, 185 | "expected": expected_answer, 186 | "generated": generated_answer 187 | } 188 | 189 | print(f"Answered: {base_name} with {model_name}") 190 | return result 191 | 192 | except Exception as e: 193 | print(f"Error for {base_name} with {model_name}: {str(e)}") 194 | return None 195 | 196 | def answer(config: Config): 197 | """Generates answers for all prompts and models.""" 198 | print("--- Starting Answer Generation ---") 199 | prompt_files = get_prompt_files(config.pattern) 200 | results = [] 201 | 202 | for model_name in config.model_names: 203 | print(f"\n🔬 Testing model: {model_name}") 204 | for prompt_path in prompt_files: 205 | result = answer_prompt(prompt_path, model_name, config) 206 | if result: 207 | results.append(result) 208 | 209 | save_raw_results(results) 210 | print(f"\nGenerated {len(results)} answers. Report saved to {RAW_REPORT_PATH}") 211 | 212 | def evaluate(config: Config): 213 | """Evaluates the generated answers.""" 214 | print("\n--- Starting Evaluation ---") 215 | try: 216 | results = load_raw_results() 217 | except FileNotFoundError: 218 | print(f"Error: {RAW_REPORT_PATH} not found. Please run the 'answer' action first.") 219 | return 220 | 221 | evaluated_results = [] 222 | for result in results: 223 | is_correct, note = evaluate_correctness( 224 | config.endpoint_url, 225 | config.model_evaluator, 226 | result["expected"], 227 | result["generated"], 228 | config.api_key, 229 | config.throttling_secs 230 | ) 231 | result["correct"] = is_correct 232 | result["note"] = note 233 | result["evaluator_model"] = config.model_evaluator 234 | evaluated_results.append(result) 235 | print(f"Evaluated: {result['file']} for {result['model']} -> {'Correct' if is_correct else 'Incorrect'}") 236 | if note: 237 | print(f" Note: {note}") 238 | 239 | save_evaluated_results(evaluated_results) 240 | print(f"\nEvaluation complete. Report saved to {EVALUATED_REPORT_PATH}") 241 | 242 | def render(config: Config): 243 | """Renders the final report.""" 244 | print("\n--- Rendering Report ---") 245 | try: 246 | results = load_evaluated_results() 247 | except FileNotFoundError: 248 | print(f"Error: {EVALUATED_REPORT_PATH} not found. Please run the 'evaluate' action first.") 249 | return 250 | 251 | print_summary(results) 252 | 253 | def serve(config: Config): 254 | """Starts a web server to display the report.""" 255 | print("\n--- Starting Web Server ---") 256 | # Note: this will block the terminal until the server is stopped (e.g. 
with Ctrl+C) 257 | os.system("uv run python server.py") 258 | 259 | 260 | # --- Reporting --- 261 | 262 | def print_summary(results: List[Dict[str, Any]]): 263 | """Prints the detailed and summary tables of the results.""" 264 | if not results: 265 | print("No results to display.") 266 | return 267 | 268 | # Detailed table 269 | print("\nDetailed Results") 270 | print(format_detailed_table(results)) 271 | 272 | # Matrix table: prompts as columns, models as rows 273 | print("\nMatrix Results") 274 | print(format_matrix_table(results)) 275 | 276 | # Summary table 277 | print("\nModel Performance Summary") 278 | print(format_summary_table(results)) 279 | 280 | # --- Main Execution --- 281 | 282 | def main(): 283 | """Main function to run the model evaluation.""" 284 | config = load_config() 285 | ensure_directory_exists(GENERATED_ANSWERS_DIR) 286 | 287 | if "answer" in config.actions: 288 | answer(config) 289 | if "evaluate" in config.actions: 290 | evaluate(config) 291 | if "render" in config.actions: 292 | render(config) 293 | if "serve" in config.actions: 294 | serve(config) 295 | 296 | if __name__ == "__main__": 297 | main() 298 | -------------------------------------------------------------------------------- /tests/unit/test_validation.py: -------------------------------------------------------------------------------- 1 | """Unit tests for validation module.""" 2 | 3 | import pytest 4 | from pathlib import Path 5 | from pydantic import ValidationError 6 | 7 | from validation import ( 8 | APIRequest, 9 | EvaluationRequest, 10 | ConfigValidation, 11 | FileOperationValidator, 12 | APIResponseValidator, 13 | validate_glob_pattern, 14 | validate_model_list 15 | ) 16 | 17 | 18 | class TestAPIRequest: 19 | """Test APIRequest validation.""" 20 | 21 | def test_valid_api_request(self): 22 | """Test valid API request creation.""" 23 | messages = [{"role": "user", "content": "Hello"}] 24 | request = APIRequest(model="test-model", messages=messages) 25 | 26 | assert request.model == "test-model" 27 | assert request.messages == messages 28 | assert request.stream is False 29 | 30 | def test_empty_messages_validation(self): 31 | """Test validation of empty messages.""" 32 | with pytest.raises(ValidationError, match="List should have at least 1 item"): 33 | APIRequest(model="test", messages=[]) 34 | 35 | def test_invalid_message_structure(self): 36 | """Test validation of invalid message structure.""" 37 | invalid_messages = [{"invalid": "message"}] 38 | 39 | with pytest.raises(ValidationError, match="Message 0 must contain"): 40 | APIRequest(model="test", messages=invalid_messages) 41 | 42 | def test_invalid_role(self): 43 | """Test validation of invalid role.""" 44 | invalid_messages = [{"role": "invalid", "content": "Hello"}] 45 | 46 | with pytest.raises(ValidationError, match="Message 0 role must be"): 47 | APIRequest(model="test", messages=invalid_messages) 48 | 49 | def test_empty_content(self): 50 | """Test validation of empty content.""" 51 | invalid_messages = [{"role": "user", "content": ""}] 52 | 53 | with pytest.raises(ValidationError, match="Message 0 content cannot be empty"): 54 | APIRequest(model="test", messages=invalid_messages) 55 | 56 | 57 | class TestEvaluationRequest: 58 | """Test EvaluationRequest validation.""" 59 | 60 | def test_valid_evaluation_request(self): 61 | """Test valid evaluation request.""" 62 | request = EvaluationRequest( 63 | expected_answer="Expected", 64 | generated_answer="Generated", 65 | evaluator_model="evaluator" 66 | ) 67 | 68 | assert 
request.expected_answer == "Expected" 69 | assert request.generated_answer == "Generated" 70 | assert request.evaluator_model == "evaluator" 71 | 72 | def test_empty_expected_answer(self): 73 | """Test validation of empty expected answer.""" 74 | with pytest.raises(ValidationError, match="String should have at least 1 character"): 75 | EvaluationRequest( 76 | expected_answer="", 77 | generated_answer="Generated", 78 | evaluator_model="evaluator" 79 | ) 80 | 81 | def test_too_long_answer(self): 82 | """Test validation of too long answer.""" 83 | long_answer = "x" * 10001 84 | 85 | with pytest.raises(ValidationError, match="Answer too long"): 86 | EvaluationRequest( 87 | expected_answer=long_answer, 88 | generated_answer="Generated", 89 | evaluator_model="evaluator" 90 | ) 91 | 92 | 93 | class TestConfigValidation: 94 | """Test ConfigValidation.""" 95 | 96 | def test_valid_config(self, sample_config): 97 | """Test valid configuration.""" 98 | config = ConfigValidation(**sample_config) 99 | 100 | assert config.endpoint_url == sample_config["endpoint_url"] 101 | assert config.model_names == sample_config["model_names"] 102 | assert config.throttling_secs == sample_config["throttling_secs"] 103 | 104 | def test_invalid_url(self, sample_config): 105 | """Test invalid URL validation.""" 106 | sample_config["endpoint_url"] = "invalid-url" 107 | 108 | with pytest.raises(ValidationError, match="Invalid URL format"): 109 | ConfigValidation(**sample_config) 110 | 111 | def test_invalid_model_names(self, sample_config): 112 | """Test invalid model names.""" 113 | sample_config["model_names"] = ["invalid name!"] 114 | 115 | with pytest.raises(ValidationError, match="Invalid model name"): 116 | ConfigValidation(**sample_config) 117 | 118 | def test_invalid_actions(self, sample_config): 119 | """Test invalid actions.""" 120 | sample_config["actions"] = ["invalid_action"] 121 | 122 | with pytest.raises(ValidationError, match="Invalid action"): 123 | ConfigValidation(**sample_config) 124 | 125 | def test_negative_throttling(self, sample_config): 126 | """Test negative throttling validation.""" 127 | sample_config["throttling_secs"] = -1.0 128 | 129 | with pytest.raises(ValidationError, match="Input should be greater than or equal to 0"): 130 | ConfigValidation(**sample_config) 131 | 132 | def test_path_traversal_in_pattern(self, sample_config): 133 | """Test path traversal in pattern.""" 134 | sample_config["pattern"] = "../../../etc/passwd" 135 | 136 | with pytest.raises(ValidationError, match="Pattern cannot contain path traversal"): 137 | ConfigValidation(**sample_config) 138 | 139 | 140 | 141 | 142 | class TestFileOperationValidator: 143 | """Test FileOperationValidator.""" 144 | 145 | def test_valid_file_path(self, sample_json_file): 146 | """Test valid file path validation.""" 147 | validated_path = FileOperationValidator.validate_file_path(str(sample_json_file)) 148 | 149 | assert validated_path == sample_json_file.resolve() 150 | 151 | def test_invalid_file_extension(self, temp_dir): 152 | """Test invalid file extension.""" 153 | invalid_file = temp_dir / "test.exe" 154 | invalid_file.write_text("content") 155 | 156 | with pytest.raises(ValueError, match="File type not allowed"): 157 | FileOperationValidator.validate_file_path(str(invalid_file)) 158 | 159 | def test_path_traversal_protection(self, temp_dir): 160 | """Test path traversal protection.""" 161 | with pytest.raises(ValueError, match="Path traversal detected"): 162 | FileOperationValidator.validate_file_path("../../../etc/passwd", 
str(temp_dir)) 163 | 164 | def test_file_size_validation(self, temp_dir): 165 | """Test file size validation.""" 166 | # Create a file larger than the limit 167 | large_file = temp_dir / "large.txt" 168 | large_file.write_text("x" * (11 * 1024 * 1024)) # 11MB 169 | 170 | with pytest.raises(ValueError, match="File too large"): 171 | FileOperationValidator.validate_file_size(large_file) 172 | 173 | def test_content_length_validation(self): 174 | """Test content length validation.""" 175 | long_content = "x" * 60000 # Exceeds default 50000 limit 176 | 177 | with pytest.raises(ValueError, match="Content too long"): 178 | FileOperationValidator.validate_content_length(long_content) 179 | 180 | def test_allowed_extensions(self): 181 | """Test allowed file extensions.""" 182 | allowed = {'.txt', '.md', '.py', '.js', '.jsx', '.ts', '.tsx', '.json'} 183 | assert FileOperationValidator.ALLOWED_EXTENSIONS == allowed 184 | 185 | 186 | class TestAPIResponseValidator: 187 | """Test APIResponseValidator.""" 188 | 189 | def test_valid_openai_response(self, mock_api_response): 190 | """Test valid OpenAI response validation.""" 191 | validated = APIResponseValidator.validate_openai_response(mock_api_response) 192 | 193 | assert validated == mock_api_response 194 | 195 | def test_missing_choices(self): 196 | """Test response missing choices.""" 197 | invalid_response = {"data": "test"} 198 | 199 | with pytest.raises(ValueError, match="Response missing required field: choices"): 200 | APIResponseValidator.validate_openai_response(invalid_response) 201 | 202 | def test_empty_choices(self): 203 | """Test empty choices list.""" 204 | invalid_response = {"choices": []} 205 | 206 | with pytest.raises(ValueError, match="Choices must be a non-empty list"): 207 | APIResponseValidator.validate_openai_response(invalid_response) 208 | 209 | def test_missing_message(self): 210 | """Test choice missing message.""" 211 | invalid_response = {"choices": [{"data": "test"}]} 212 | 213 | with pytest.raises(ValueError, match="Choice missing required field: message"): 214 | APIResponseValidator.validate_openai_response(invalid_response) 215 | 216 | def test_missing_content(self): 217 | """Test message missing content.""" 218 | invalid_response = {"choices": [{"message": {"role": "assistant"}}]} 219 | 220 | with pytest.raises(ValueError, match="Message missing required field: content"): 221 | APIResponseValidator.validate_openai_response(invalid_response) 222 | 223 | def test_sanitize_content(self): 224 | """Test content sanitization.""" 225 | content_with_script = "Hello" 226 | sanitized = APIResponseValidator.sanitize_content(content_with_script) 227 | 228 | assert "