97 | Rendering heatmap for: {renderFile} 98 |
├── answers-generated
│   └── .placeholder
├── answers
│   └── 1-capital-italy.txt
├── prompts
│   └── 1-capital-italy.txt
├── static
│   ├── favicon.ico
│   └── benchmark-report.png
├── frontend
│   ├── postcss.config.js
│   ├── src
│   │   ├── App.jsx
│   │   ├── main.jsx
│   │   ├── components
│   │   │   ├── ui
│   │   │   │   ├── DarkModeToggle.jsx
│   │   │   │   ├── Button.jsx
│   │   │   │   └── Modal.jsx
│   │   │   ├── ErrorBoundary.jsx
│   │   │   ├── Charts
│   │   │   │   ├── ResponseTimeChart.jsx
│   │   │   │   ├── AccuracyChart.jsx
│   │   │   │   └── ModelComparisonChart.jsx
│   │   │   ├── SummaryTable.jsx
│   │   │   ├── Dashboard.jsx
│   │   │   ├── QuestionDetails.jsx
│   │   │   ├── FilterPanel.jsx
│   │   │   ├── ModelAnswersAccordion.jsx
│   │   │   └── ResultsMatrix.jsx
│   │   ├── hooks
│   │   │   ├── useDarkMode.js
│   │   │   ├── useApi.js
│   │   │   └── useFilters.js
│   │   ├── styles
│   │   │   └── globals.css
│   │   ├── utils
│   │   │   ├── heatmapUtils.js
│   │   │   └── dataProcessing.js
│   │   └── test
│   │       ├── Dashboard.test.jsx
│   │       ├── App.test.jsx
│   │       ├── useApi.test.js
│   │       └── useFilters.test.js
│   ├── index.html
│   ├── setup.sh
│   ├── package.json
│   ├── tailwind.config.js
│   ├── vite.config.js
│   └── README.md
├── .gitignore
├── .env.example
├── LICENSE
├── pyproject.toml
├── AGENTS.md
├── start-dashboard.sh
├── reporting.py
├── tests
│   ├── conftest.py
│   └── unit
│       ├── test_validation.py
│       └── test_file_utils.py
├── shared.py
├── api_client.py
├── report_template.html
├── file_utils.py
├── api_server.py
├── server.py
├── README.md
├── validation.py
└── main.py
/answers-generated/.placeholder:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/answers/1-capital-italy.txt:
--------------------------------------------------------------------------------
1 | Torino
--------------------------------------------------------------------------------
/prompts/1-capital-italy.txt:
--------------------------------------------------------------------------------
1 | scrivi solo il nome della la prima capitale d'Italia
--------------------------------------------------------------------------------
/static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/grigio/llm-eval-simple/HEAD/static/favicon.ico
--------------------------------------------------------------------------------
/static/benchmark-report.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/grigio/llm-eval-simple/HEAD/static/benchmark-report.png
--------------------------------------------------------------------------------
/frontend/postcss.config.js:
--------------------------------------------------------------------------------
1 | export default {
2 | plugins: {
3 | tailwindcss: {},
4 | autoprefixer: {},
5 | },
6 | }
--------------------------------------------------------------------------------
/frontend/src/App.jsx:
--------------------------------------------------------------------------------
1 |
2 | import { Dashboard } from './components/Dashboard';
3 | import { ErrorBoundary } from './components/ErrorBoundary';
4 |
5 | function App() {
6 | return (
7 |
35 | {this.state.error && this.state.error.toString()}
36 |
37 | {this.state.errorInfo.componentStack}
38 |
39 | | 17 | Model 18 | | 19 |20 | Correct 21 | | 22 |23 | Avg Response Time 24 | | 25 |
|---|---|---|
| 41 | 42 | {model} 43 | 44 | | 45 |
46 |
47 |
48 | {correct}/{total} ({formatAccuracy(correct, total)})
49 |
50 |
59 |
51 |
58 |
52 |
56 |
57 | |
60 | 61 | 62 | {formatResponseTime(avgTime)} 63 | 64 | | 65 |
Loading evaluation results...
37 |{error}
51 |65 | Please run the evaluation first to generate results. 66 |
67 |83 | Interactive model evaluation results 84 |
85 |97 | Rendering heatmap for: {renderFile} 98 |
99 |
52 | {data.prompt}
53 |
54 |
75 | {data.expected}
76 |
77 |
83 | {modelResult.generated}
84 |
85 | 96 | {modelResult.note} 97 |
98 || Model | 86 |Correct | 87 |Avg Response Time | 88 |
|---|
| Model | 100 | __DETAILED_RESULTS_HEADER__ 101 |
|---|
131 | {result.prompt}
132 |
133 |
140 | {result.expected}
141 |
142 |
149 | {result.generated}
150 |
151 | Prompt:
162 |{data['prompt']}
163 | Expected Answer:
164 |{data['expected']}
165 | Generated Answer:
173 |{model_result['generated']}
174 | Response Time: {model_result['response_time']:.2f}s
175 |19 | No results available to display. 20 |
21 || 171 | Model 172 | | 173 | {prompts.map(prompt => ( 174 |
175 |
176 | {prompt}
177 |
178 | |
179 | ))}
180 |
|---|---|
|
186 |
187 | {model}
188 |
189 | |
190 | {prompts.map(prompt => getHeatmapCell(model, prompt))}
191 |
234 | {selectedCell.prompt}
235 |
236 |
243 | {selectedCell.expected}
244 |
245 |
252 | {selectedCell.generated}
253 |
254 | 263 | {selectedCell.note} 264 |
265 |