├── .env.example
├── test_cases
    ├── modal-dialog
    │   ├── prompt.md
    │   ├── examples
    │   │   ├── pass-native.html
    │   │   ├── fail-popover.html
    │   │   ├── fail-aria.html
    │   │   ├── pass-aria.html
    │   │   ├── fail-has-css-transition.html
    │   │   └── fail-has-js-delay.html
    │   └── test.js
    ├── shopping-home-page
    │   ├── prompt.md
    │   ├── examples
    │   │   ├── fail.html
    │   │   └── pass.html
    │   └── test.js
    ├── disclosure-widget
    │   ├── prompt.md
    │   ├── examples
    │   │   ├── good-details-summary.html
    │   │   ├── bad-missing-all-semantics.html
    │   │   ├── bad-hidden-content-in-reading-order.html
    │   │   ├── good-button.html
    │   │   └── good-hidden-from-everyone.html
    │   └── test.js
    └── readme.md
├── requirements.txt
├── setup.cfg
├── scripts
    └── install_node_deps.sh
├── config
    └── models.yaml.example
├── a11y_llm_tests
    ├── __init__.py
    ├── utils.py
    ├── cache.py
    ├── node_bridge.py
    ├── metrics.py
    ├── schema.py
    ├── generator.py
    ├── cli.py
    └── report.py
├── node_runner
    ├── package.json
    ├── package-lock.json
    └── runner.js
├── SECURITY.md
├── CODE_OF_CONDUCT.md
├── tests
    ├── test_metrics.py
    ├── test_generator_prompts.py
    ├── test_cli_sampling.py
    └── test_examples.py
├── .github
    └── workflows
    │   └── build.yml
├── LICENSE
├── SUPPORT.md
├── README.md
└── .gitignore


/.env.example:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY=
2 | ANTHROPIC_API_KEY=
3 | GOOGLE_API_KEY=
4 | 


--------------------------------------------------------------------------------
/test_cases/modal-dialog/prompt.md:
--------------------------------------------------------------------------------
1 | Create an example of a modal dialog component. It is closed by default, and the button to open it has a `trigger` class.


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | litellm
 2 | pydantic>=2
 3 | rich
 4 | jinja2
 5 | aiofiles
 6 | httpx
 7 | python-dotenv
 8 | orjson
 9 | typer
10 | beautifulsoup4
11 | lxml
12 | pytest
13 | pyyaml
14 | 


--------------------------------------------------------------------------------
/test_cases/shopping-home-page/prompt.md:
--------------------------------------------------------------------------------
1 | Generate an HTML file for the home page of a shopping website. The home page has several sections of content for popular items, deals, and blog posts.
2 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [metadata]
 2 | name = a11y-llm-tests
 3 | version = 0.1.0
 4 | 
 5 | [options]
 6 | packages = find:
 7 | python_requires = >=3.10
 8 | 
 9 | [tool:pytest]
10 | pythonpath = .
11 | addopts = -ra
12 | 


--------------------------------------------------------------------------------
/test_cases/disclosure-widget/prompt.md:
--------------------------------------------------------------------------------
1 | - Generate an HTML file that demonstrates an expand/collapse widget.
2 | - Wrap each widget with a div that has an `example` class.
3 | - Give the container for controlled content a `details` class.
4 | 


--------------------------------------------------------------------------------
/scripts/install_node_deps.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -euo pipefail
 3 | cd "$(dirname "$0")/../node_runner"
 4 | if [ ! -f package.json ]; then
 5 |   echo "package.json missing" >&2
 6 |   exit 1
 7 | fi
 8 | npm install
 9 | npx playwright install chromium
10 | echo "Node dependencies installed." 
11 | 


--------------------------------------------------------------------------------
/config/models.yaml.example:
--------------------------------------------------------------------------------
 1 | defaults:
 2 |   #temperature: 0.2
 3 |   #max_tokens: 1200
 4 |   #system_prompt: |
 5 |   #  You are generating a single standalone HTML document.
 6 |   #custom_instructions_markdown: custom-instructions.md
 7 | models:
 8 |   - name: azure_ai/gpt-5-mini
 9 |     display_name: GPT-5 Mini
10 |   - name: claude-sonnet-4-20250514
11 |     display_name: Claud Sonnet 4
12 | 


--------------------------------------------------------------------------------
/a11y_llm_tests/__init__.py:
--------------------------------------------------------------------------------
 1 | """a11y_llm_tests
 2 | 
 3 | Toolkit to evaluate how well various LLMs generate accessible HTML.
 4 | 
 5 | Primary entrypoints:
 6 |  - cli.py (Typer CLI)
 7 |  - generator.py (LLM generation + caching)
 8 |  - node_bridge.py (Playwright + axe-core invocation)
 9 |  - report.py (HTML report rendering)
10 | """
11 | 
12 | __all__ = [
13 |     "generator",
14 |     "node_bridge",
15 |     "report",
16 | ]
17 | 


--------------------------------------------------------------------------------
/a11y_llm_tests/utils.py:
--------------------------------------------------------------------------------
 1 | """Miscellaneous utility helpers (placeholder)."""
 2 | 
 3 | def ensure_single_html(doc: str) -> str:
 4 |     """Return only the first <html>...</html> segment if multiple exist."""
 5 |     lower = doc.lower()
 6 |     if "<html" in lower and "</html>" in lower:
 7 |         start = lower.index("<html")
 8 |         end = lower.index("</html>") + len("</html>")
 9 |         return doc[start:end]
10 |     return doc
11 | 


--------------------------------------------------------------------------------
/node_runner/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "a11y-llm-tests-runner",
 3 |   "version": "0.1.0",
 4 |   "private": true,
 5 |   "type": "commonjs",
 6 |   "description": "Playwright + axe-core runner for a11y-llm-tests",
 7 |   "main": "runner.js",
 8 |   "scripts": {
 9 |     "start": "node runner.js"
10 |   },
11 |   "dependencies": {
12 |     "axe-core": "^4.10.0",
13 |     "deepmerge": "^4.3.1",
14 |     "playwright": "^1.48.0"
15 |   },
16 |   "engines": {
17 |     "node": ">=18"
18 |   }
19 | }
20 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | <!-- BEGIN MICROSOFT SECURITY.MD V1.0.0 BLOCK -->
 2 | 
 3 | ## Security
 4 | 
 5 | Microsoft takes the security of our software products and services seriously, which
 6 | includes all source code repositories in our GitHub organizations.
 7 | 
 8 | **Please do not report security vulnerabilities through public GitHub issues.**
 9 | 
10 | For security reporting information, locations, contact information, and policies,
11 | please review the latest guidance for Microsoft repositories at
12 | [https://aka.ms/SECURITY.md](https://aka.ms/SECURITY.md).
13 | 
14 | <!-- END MICROSOFT SECURITY.MD BLOCK -->


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Microsoft Open Source Code of Conduct
 2 | 
 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 4 | 
 5 | Resources:
 6 | 
 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | - Employees can reach out at [aka.ms/opensource/moderation-support](https://aka.ms/opensource/moderation-support)
11 | 


--------------------------------------------------------------------------------
/a11y_llm_tests/cache.py:
--------------------------------------------------------------------------------
 1 | """Cache utilities for generation artifacts.
 2 | 
 3 | Currently minimal: provides helper to compose cache keys that account for model,
 4 | prompt hash, and optional seed, ensuring sampled generations can coexist.
 5 | """
 6 | 
 7 | from pathlib import Path
 8 | from typing import Optional
 9 | 
10 | CACHE_ROOT = Path('.cache')
11 | CACHE_ROOT.mkdir(exist_ok=True)
12 | 
13 | def generation_cache_key(model: str, prompt_hash: str, seed: Optional[int] = None) -> str:
14 | 	"""Return a filename-safe cache key for a generation.
15 | 
16 | 	Example: modelabc_deadbeef or modelabc_deadbeef_s42
17 | 	"""
18 | 	if seed is None:
19 | 		return f"{model}_{prompt_hash}"
20 | 	return f"{model}_{prompt_hash}_s{seed}"
21 | 
22 | __all__ = ["generation_cache_key", "CACHE_ROOT"]
23 | 


--------------------------------------------------------------------------------
/test_cases/disclosure-widget/examples/good-details-summary.html:
--------------------------------------------------------------------------------
 1 | <html lang="en">
 2 | <head>
 3 |     <meta charset="UTF-8">
 4 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 5 |     <title>Details/Summary Disclosure</title>
 6 |     <script id="a11y-assertions" type="application/json">{
 7 |         "assertions": {
 8 |             "All examples have a valid semantics": "pass",
 9 |             "Collapsed content is hidden from assistive technology": "pass"
10 |         }
11 |     }</script>
12 | </head>
13 | <body>
14 |     <h1>Details/Summary Disclosure</h1>
15 | 
16 |     <div class="example">
17 |         <details class="details">
18 |             <summary>Show Details</summary>
19 |             <p>This is the hidden content that can be disclosed.</p>
20 |         </details>
21 |     </div>
22 | </body>
23 | </html>


--------------------------------------------------------------------------------
/test_cases/disclosure-widget/examples/bad-missing-all-semantics.html:
--------------------------------------------------------------------------------
 1 | <html lang="en">
 2 | <head>
 3 |     <meta charset="UTF-8">
 4 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 5 |     <title>Missing all semantics</title>
 6 |     <script id="a11y-assertions" type="application/json">{
 7 |         "assertions": {
 8 |             "All examples have a valid semantics": "fail",
 9 |             "Collapsed content is hidden from assistive technology": "pass"
10 |         }
11 |     }</script>
12 | </head>
13 | <body>
14 |     <h1>Missing all semantics</h1>
15 |     <div class="example">
16 |         <div onclick="toggleContent()">Show Details</div>
17 |         <div class="details" style="display: none;">
18 |             <p>This is the hidden content that can be disclosed.</p>
19 |         </div>
20 |     </div>
21 | </body>
22 | </html>


--------------------------------------------------------------------------------
/test_cases/disclosure-widget/examples/bad-hidden-content-in-reading-order.html:
--------------------------------------------------------------------------------
 1 | <html lang="en">
 2 | <head>
 3 |     <meta charset="UTF-8">
 4 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 5 |     <title>Hidden content is still in reading order</title>
 6 |     <script id="a11y-assertions" type="application/json">{
 7 |         "assertions": {
 8 |             "All examples have a valid semantics": "pass",
 9 |             "Collapsed content is hidden from assistive technology": "fail"
10 |         }
11 |     }</script>
12 | </head>
13 | <body>
14 |     <h1>Hidden content is still in reading order</h1>
15 |     <div class="example">
16 |         <button aria-expanded="false">Show Details</button>
17 |         <div class="details" style="opacity: 0">This is the hidden content that can be disclosed.</div>
18 |     </div>
19 | </body>
20 | </html>


--------------------------------------------------------------------------------
/test_cases/disclosure-widget/examples/good-button.html:
--------------------------------------------------------------------------------
 1 | <html lang="en">
 2 | <head>
 3 |     <meta charset="UTF-8">
 4 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 5 |     <title>Basic Disclosure using button and aria-expanded</title>
 6 |     <script id="a11y-assertions" type="application/json">{
 7 |         "assertions": {
 8 |             "All examples have a valid semantics": "pass",
 9 |             "Collapsed content is hidden from assistive technology": "pass"
10 |         }
11 |     }</script>
12 | </head>
13 | <body>
14 |     <h1>Basic Disclosure using button and aria-expanded</h1>
15 | 
16 |     <div class="example">
17 |         <button aria-expanded="false" aria-controls="content">Show Details</button>
18 |         <div class="details" style="display: none;">
19 |             <p>This is the hidden content that can be disclosed.</p>
20 |         </div>
21 |     </div>
22 | </body>
23 | </html>


--------------------------------------------------------------------------------
/test_cases/disclosure-widget/examples/good-hidden-from-everyone.html:
--------------------------------------------------------------------------------
 1 | <html lang="en">
 2 | <head>
 3 |     <meta charset="UTF-8">
 4 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 5 |     <title>Hidden content is hidden from everyone (opacity and aria-hidden)</title>
 6 |     <script id="a11y-assertions" type="application/json">{
 7 |         "assertions": {
 8 |             "All examples have a valid semantics": "pass",
 9 |             "Collapsed content is hidden from assistive technology": "pass"
10 |         }
11 |     }</script>
12 | </head>
13 | <body>
14 |     <h1>Hidden content is hidden from everyone (opacity and aria-hidden)</h1>
15 | 
16 |     <div class="example">
17 |         <button aria-expanded="false">Show Details</button>
18 |         <div class="details" style="opacity: 0;" aria-hidden="true">This is the hidden content that can be disclosed.</div>
19 |     </div>
20 | </body>
21 | </html>


--------------------------------------------------------------------------------
/tests/test_metrics.py:
--------------------------------------------------------------------------------
 1 | from a11y_llm_tests.metrics import compute_pass_at_k, format_pass_at_k
 2 | 
 3 | 
 4 | def test_pass_at_k_basic_cases():
 5 |     # All fail
 6 |     assert compute_pass_at_k(0, 5, [1, 2, 5]) == {1: 0.0, 2: 0.0, 5: 0.0}
 7 |     # All pass
 8 |     assert compute_pass_at_k(5, 5, [1, 2, 5]) == {1: 1.0, 2: 1.0, 5: 1.0}
 9 |     # Example: n=5, c=1
10 |     r = compute_pass_at_k(1, 5, [1, 2])
11 |     # pass@1 = c/n = 0.2
12 |     assert abs(r[1] - 0.2) < 1e-9
13 |     # pass@2 = 1 - ( (4C2)/(5C2) ) = 1 - (6/10) = 0.4
14 |     assert abs(r[2] - 0.4) < 1e-9
15 | 
16 | 
17 | def test_pass_at_k_edge_values():
18 |     # k larger than n
19 |     r = compute_pass_at_k(1, 3, [5])
20 |     # k treated as n => probability that at least one passes = 1 when c>0
21 |     assert r[5] == 1.0
22 |     # zero samples
23 |     r0 = compute_pass_at_k(0, 0, [1, 5])
24 |     assert r0 == {1: 0.0, 5: 0.0}
25 | 
26 | 
27 | def test_format_pass_at_k():
28 |     formatted = format_pass_at_k({5: 1.0, 1: 0.2})
29 |     # Keys become strings and sorted
30 |     assert list(formatted.keys()) == ["1", "5"]
31 |     assert formatted["1"] == 0.2
32 |     assert formatted["5"] == 1.0
33 | 


--------------------------------------------------------------------------------
/test_cases/shopping-home-page/examples/fail.html:
--------------------------------------------------------------------------------
 1 | <html lang="en">
 2 | <head>
 3 |     <meta charset="UTF-8">
 4 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 5 |     <title>Example fail</title>
 6 |     <script id="a11y-assertions" type="application/json">{
 7 |         "assertions": {
 8 |             "Has an h1": "fail",
 9 |             "Has single h1": "fail",
10 |             "Has at least one h2": "fail",
11 |             "Has a single banner": "fail",
12 |             "Has a single maincontent": "fail",
13 |             "Has a single navigation": "fail",
14 |             "Has a single footer": "fail"
15 |         }
16 |     }</script>
17 | </head>
18 |     <body>
19 |         <div>Header content
20 |             <div><a href="">nav link 1</a><a href="">nav link 2</a></div>
21 |         </div>
22 |         <div>
23 |             <div>page title</div>
24 |             <div>content</div>
25 |             <div>section title</div>
26 |             <div>content</div>
27 |             <div>section title</div>
28 |             <div>content</div>
29 |         </div>
30 |         <div>
31 |             Footer
32 |         </div>
33 |     </body>
34 | </html>


--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 3 | 
 4 | name: Build
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [ "main" ]
 9 |   pull_request:
10 |     branches: [ "main" ]
11 | 
12 | jobs:
13 |   build:
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |     - uses: actions/checkout@v5
17 |     - name: Set up Python 3.11
18 |       uses: actions/setup-python@v5
19 |       with:
20 |         python-version: "3.11"
21 |     - name: Set up Node.js
22 |       uses: actions/setup-node@v4
23 |       with:
24 |         node-version: "22.x"
25 |     - name: Disable AppArmor
26 |       run: echo 0 | sudo tee /proc/sys/kernel/apparmor_restrict_unprivileged_userns
27 |     - name: Install dependencies
28 |       run: |
29 |         python -m pip install --upgrade pip
30 |         pip install flake8 pytest
31 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
32 |         bash scripts/install_node_deps.sh
33 |     - name: Test with pytest
34 |       run: |
35 |         python -m pytest -s
36 | 


--------------------------------------------------------------------------------
/tests/test_generator_prompts.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from a11y_llm_tests import generator
 4 | 
 5 | 
 6 | @pytest.fixture(autouse=True)
 7 | def reset_prompts():
 8 |     generator.configure_prompts(None, None)
 9 |     yield
10 |     generator.configure_prompts(None, None)
11 | 
12 | 
13 | def test_compute_prompt_hash_changes_with_system_prompt():
14 |     baseline = generator.compute_prompt_hash("Prompt body")
15 |     generator.configure_prompts("Revised system prompt", None)
16 |     changed = generator.compute_prompt_hash("Prompt body")
17 |     assert baseline != changed
18 | 
19 | 
20 | def test_hash_changes_with_custom_instructions():
21 |     generator.configure_prompts(None, "Alpha instructions")
22 |     first = generator.compute_prompt_hash("Prompt body")
23 |     generator.configure_prompts(None, "Beta instructions")
24 |     second = generator.compute_prompt_hash("Prompt body")
25 |     assert first != second
26 | 
27 | 
28 | def test_effective_system_prompt_includes_custom_instructions():
29 |     generator.configure_prompts("Base prompt", "### Custom\n- Item")
30 |     effective = generator.get_effective_system_prompt()
31 |     assert "Base prompt" in effective
32 |     assert "### Custom" in effective
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |     MIT License
 2 | 
 3 |     Copyright (c) Microsoft Corporation.
 4 | 
 5 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     of this software and associated documentation files (the "Software"), to deal
 7 |     in the Software without restriction, including without limitation the rights
 8 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     copies of the Software, and to permit persons to whom the Software is
10 |     furnished to do so, subject to the following conditions:
11 | 
12 |     The above copyright notice and this permission notice shall be included in all
13 |     copies or substantial portions of the Software.
14 | 
15 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     SOFTWARE
22 | 


--------------------------------------------------------------------------------
/test_cases/shopping-home-page/examples/pass.html:
--------------------------------------------------------------------------------
 1 | <html lang="en">
 2 | <head>
 3 |     <meta charset="UTF-8">
 4 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 5 |     <title>Example pass</title>
 6 |     <script id="a11y-assertions" type="application/json">{
 7 |         "assertions": {
 8 |             "Has an h1": "pass",
 9 |             "Has single h1": "pass",
10 |             "Has at least one h2": "pass",
11 |             "Has a single banner": "pass",
12 |             "Has a single maincontent": "pass",
13 |             "Has a single navigation": "pass",
14 |             "Has a single footer": "pass"
15 |         }
16 |     }</script>
17 | </head>
18 |     <body>
19 |         <header>Header content
20 |             <nav>
21 |                 <ul>
22 |                     <li><a href="">nav link 1</a></li>
23 |                     <li><a href="">nav link 2</a></li>
24 |                 </ul>
25 |             </nav>
26 |         </header>
27 |         <main>
28 |             <h1>page title</h1>
29 |             <div>content</div>
30 |             <h2>section title</h2>
31 |             <div>content</div>
32 |             <h2>section title</h2>
33 |             <div>content</div>
34 |         </main>
35 |         <footer>
36 |             Footer
37 |         </footer>
38 |     </body>
39 | </html>


--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
 1 | # TODO: The maintainer of this repo has not yet edited this file
 2 | 
 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
 4 | 
 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help.
 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
 8 | 
 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10 | 
11 | # Support
12 | 
13 | ## How to file issues and get help  
14 | 
15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 
16 | issues before filing new issues to avoid duplicates.  For new issues, file your bug or 
17 | feature request as a new Issue.
18 | 
19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 
20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22 | 
23 | ## Microsoft Support Policy  
24 | 
25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
26 | 


--------------------------------------------------------------------------------
/test_cases/shopping-home-page/test.js:
--------------------------------------------------------------------------------
 1 | // New harness signature with dependency injection
 2 | module.exports.run = async ({ page, assert }) => {
 3 |   await assert("Has an h1", async () => {
 4 |     const h1 = await page.$$eval("h1", els => els.length);
 5 |     const roleH1 = await page.$$eval("[role='heading'][aria-level='1']", els => els.length);
 6 |     return (h1 + roleH1) >= 1;
 7 |   });
 8 | 
 9 |   await assert("Has single h1", async () => {
10 |     const h1 = await page.getByRole('heading', { level: 1 });
11 |     const count = await h1.count();
12 |     return count === 1;
13 |   }, {type: 'BP'});
14 | 
15 |   await assert("Has at least one h2", async () => {
16 |     const h2 = await page.getByRole('heading', { level: 2 });
17 |     const count = await h2.count();
18 |     return count >= 1;
19 |   });
20 | 
21 |   await assert("Has a single banner", async () => {
22 |     const banner = await page.getByRole('banner');
23 |     return (await banner.count()) === 1;
24 |   });
25 | 
26 |   await assert("Has a single maincontent", async () => {
27 |     const main = await page.getByRole('main');
28 |     return (await main.count()) === 1;
29 |   });
30 | 
31 |   await assert("Has a single navigation", async () => {
32 |     const nav = await page.getByRole('navigation');
33 |     return (await nav.count()) === 1;
34 |   });
35 | 
36 |   await assert("Has a single footer", async () => {
37 |     let footer = await page.getByRole('contentinfo');
38 |     return (await footer.count()) === 1;
39 |   });
40 | 
41 |   return {}; // assertions collected via injected assert
42 | };
43 | 


--------------------------------------------------------------------------------
/test_cases/disclosure-widget/test.js:
--------------------------------------------------------------------------------
 1 | // New harness signature: module.exports.run = async ({ page, assert }) => { ... }
 2 | module.exports.run = async ({ page, assert }) => {
 3 |   const examples = await page.$$(".example");
 4 | 
 5 |   const hasValidSemantics = async (example) => {
 6 |     if (await example.$("button[aria-expanded], [role='button'][aria-expanded]")) {
 7 |       return true;
 8 |     }
 9 |     if (await example.$("details summary")) {
10 |       return true;
11 |     }
12 |     return false;
13 |   }
14 | 
15 |   await assert("All examples have a valid semantics", async () => {
16 |     for (const example of examples) {
17 |       if (! await hasValidSemantics(example)) {
18 |         return false;
19 |       }
20 |     }
21 |     return true;
22 |   });
23 | 
24 |   await assert("Collapsed content is hidden from assistive technology", async () => {
25 |     for (const example of examples) {
26 |       if (!await example.$("button[aria-expanded=false], [role='button'][aria-expanded=false]")) {
27 |         // Only check button implementations
28 |         continue;
29 |       }
30 |       
31 |       let isHidden = await example.$eval(".details", el => {
32 |         // Use axe-core's isVisible util to determine if hidden from sighted users but available to AT
33 |         let isVisuallyHidden = !window.axe.commons.dom.isVisible(el, false, true);
34 |         let isScreenReaderHidden = !window.axe.commons.dom.isVisible(el, true, true);
35 |         return isVisuallyHidden && isScreenReaderHidden;
36 |       });
37 | 
38 |       if (!isHidden) {
39 |         return false;
40 |       }
41 |     }
42 |     return true;
43 |   });
44 | };
45 | 
46 | 


--------------------------------------------------------------------------------
/test_cases/readme.md:
--------------------------------------------------------------------------------
 1 | # Example Test Structure Documentation
 2 | 
 3 | This document explains the new structure for test cases and their examples.
 4 | 
 5 | ## Directory Structure
 6 | 
 7 | Each test case should have the following structure:
 8 | 
 9 | ```
10 | test_cases/
11 | └── <test-case-name>/
12 |     ├── prompt.md          # Prompt to generate code for the test case
13 |     ├── test.js            # JavaScript test assertions
14 |     └── examples/          # Directory containing example HTML files and expectations
15 |         ├── example1.html  # HTML example file with embedded json expectations for assertions
16 |         ├── example2.html  # Another HTML example file with embedded json expectations for assertions
17 | ```
18 | 
19 | ## Embedded JSON expectations for assertions
20 | 
21 | Each HTML example file should have a script tag in the `<head>` that defines which assertions should pass or fail for that specific example.
22 | 
23 | ### Format
24 | 
25 | ```html
26 | <script id="a11y-assertions" type="application/json">{
27 |     "assertions": {
28 |         "assertion name": "value",
29 |     }
30 | }</script>
31 | ```
32 | 
33 | ### Example
34 | 
35 | ```html
36 | <script id="a11y-assertions" type="application/json">{
37 |     "assertions": {
38 |         "All examples have a valid semantics": "pass",
39 |         "Collapsed content is hidden from assistive technology": "pass"
40 |     }
41 | }</script>
42 | ```
43 | 
44 | ## Benefits
45 | 
46 | 1. **Performance**: Each HTML file is only processed once by the node runner, regardless of how many assertions it contains.
47 | 2. **Flexibility**: You can have multiple example files for each test case, each with different assertion expectations.
48 | 3. **Granular Testing**: Individual assertions can be tested separately, making it easier to identify specific failures.
49 | 4. **Clear Expectations**: The JSON make it explicit which assertions should pass or fail for each example.
50 | 


--------------------------------------------------------------------------------
/a11y_llm_tests/node_bridge.py:
--------------------------------------------------------------------------------
 1 | """Bridge for invoking the Node-based Playwright + axe-core runner.
 2 | 
 3 | The API is intentionally small: ``run`` executes a single HTML +
 4 | test.js pair and returns a JSON-compatible dict produced by the Node script.
 5 | """
 6 | from __future__ import annotations
 7 | 
 8 | import subprocess
 9 | import tempfile
10 | import json
11 | import os
12 | import pathlib
13 | import time
14 | from typing import Optional, Dict, Any
15 | 
16 | _NODE_DIR = pathlib.Path(__file__).resolve().parent.parent / "node_runner"
17 | PLAYWRIGHT_RUNNER = _NODE_DIR / "runner.js"
18 | 
19 | 
20 | def run(html: str, test_js_path: str, screenshot_path: Optional[str]) -> Dict[str, Any]:
21 |     if not PLAYWRIGHT_RUNNER.exists():
22 |         return {"error": f"Runner script not found: {PLAYWRIGHT_RUNNER}", "duration_s": 0.0, "engine": "playwright"}
23 |     with tempfile.TemporaryDirectory() as td:
24 |         html_path = os.path.join(td, "gen.html")
25 |         out_json = os.path.join(td, "out.json")
26 |         with open(html_path, "w", encoding="utf-8") as f:
27 |             f.write(html)
28 |         args = [
29 |             "node",
30 |             str(PLAYWRIGHT_RUNNER),
31 |             html_path,
32 |             test_js_path,
33 |             out_json,
34 |             screenshot_path or "",
35 |         ]
36 |         start = time.time()
37 |         proc = subprocess.run(args, capture_output=True, text=True)
38 |         duration = time.time() - start
39 |         if proc.returncode != 0:
40 |             return {"error": f"Node runner failed: {proc.stderr}", "duration_s": duration, "engine": "playwright"}
41 |         try:
42 |             with open(out_json, "r", encoding="utf-8") as jf:
43 |                 data = json.load(jf)
44 |         except Exception as e:
45 |             return {"error": f"Failed reading JSON output: {e}", "duration_s": duration, "engine": "playwright"}
46 |         data["duration_s"] = duration
47 |         data.setdefault("engine", "playwright")
48 |         return data
49 | 


--------------------------------------------------------------------------------
/a11y_llm_tests/metrics.py:
--------------------------------------------------------------------------------
 1 | """Metrics utilities for evaluating multiple sampled generations (pass@k)."""
 2 | from __future__ import annotations
 3 | from math import comb
 4 | from typing import Iterable, Dict, List
 5 | 
 6 | 
 7 | def compute_pass_at_k(c: int, n: int, ks: Iterable[int]) -> Dict[int, float]:
 8 |     """Compute pass@k for given counts.
 9 | 
10 |     pass@k = 1 - ((n-c choose k) / (n choose k)) for 0 < c < n and k <= n.
11 |     Handles edge cases:
12 |       - If c == 0 -> 0.0
13 |       - If c == n -> 1.0
14 |       - If k > n -> treat k as n (probability reduces to c>0 ? 1 : 0)
15 |       - If k <= 0 -> 0.0
16 |     Parameters:
17 |       c: number of passing samples
18 |       n: total number of samples
19 |       ks: iterable of k values
20 |     Returns:
21 |       Dict mapping each k to probability (float)
22 |     Raises:
23 |       ValueError if counts invalid.
24 |     """
25 |     if n < 0 or c < 0 or c > n:
26 |         raise ValueError("Require 0 <= c <= n and n >= 0")
27 |     if n == 0:
28 |         return {int(k): 0.0 for k in ks}
29 | 
30 |     result: Dict[int, float] = {}
31 |     for k in ks:
32 |         k_int = int(k)
33 |         if k_int <= 0:
34 |             result[k_int] = 0.0
35 |             continue
36 |         k_eff = k_int if k_int <= n else n
37 |         if c == 0:
38 |             result[k_int] = 0.0
39 |             continue
40 |         if c == n:
41 |             result[k_int] = 1.0
42 |             continue
43 |         if k_eff == 0:
44 |             result[k_int] = 0.0
45 |             continue
46 |         numerator = comb(n - c, k_eff) if (n - c) >= k_eff else 0
47 |         denominator = comb(n, k_eff)
48 |         result[k_int] = 1.0 - (numerator / denominator)
49 |     return result
50 | 
51 | 
52 | def format_pass_at_k(pass_at_k: Dict[int, float]) -> Dict[str, float]:
53 |     """Convert int keys to strings for JSON serialization stability."""
54 |     return {str(k): float(v) for k, v in sorted(pass_at_k.items(), key=lambda x: x[0])}
55 | 
56 | 
57 | __all__ = ["compute_pass_at_k", "format_pass_at_k"]
58 | 


--------------------------------------------------------------------------------
/test_cases/modal-dialog/examples/pass-native.html:
--------------------------------------------------------------------------------
 1 | 
 2 | <!doctype html>
 3 | <html lang="en-us">
 4 |     <head>
 5 |         <meta charset="utf-8">
 6 |         <title>
 7 |             Pass - Native - Modal Dialog example
 8 |         </title>
 9 |         <meta name="viewport" content="width=device-width, initial-scale=1.0, viewport-fit=cover">
10 |         <script id="a11y-assertions" type="application/json">{
11 |             "assertions": {
12 |                 "Each dialog has a dialog role": "pass",
13 |                 "Each dialog can be closed by escape key": "pass",
14 |                 "Each modal dialog traps keyboard focus": "pass",
15 |                 "Each modal dialog takes focus when opened": "pass",
16 |                 "Focus is not lost when each dialog closes": "pass",
17 |                 "Each modal dialog hides content behind it while open": "pass"
18 |             }
19 |         }</script>
20 |     </head>
21 |     <body>
22 | 		<main>
23 |             <h2>Pass - Native - Modal Dialog example</h2>
24 |             <div class=test-case>
25 |                 <button id=t1 class="trigger">Invoke example</button>
26 |                 <dialog id=d1>
27 |                     <p>Test content</p>
28 |                     <form method=dialog>
29 |                         <button>OK</button>
30 |                     </form>
31 |                 </dialog>
32 |                 <script>
33 |                     const t1 = document.getElementById('t1');
34 |                     const d1= document.getElementById('d1');
35 | 
36 |                     t1.addEventListener('click', ()=> {
37 |                         d1.showModal();
38 |                     })
39 |                 </script>
40 |                 <style>
41 |                     #d1 {
42 |                     box-shadow: 0 0 1000px 1000px rgba(0,0,0,.625);
43 |                     }
44 |                 </style>
45 |             </div>
46 | 
47 |             <div class=test-case>
48 |                 <button id=t2 class="trigger">Invoke example 2</button>
49 |                 <dialog id=d2>
50 |                     <p>Test content 2</p>
51 |                     <form method=dialog>
52 |                         <button>OK</button>
53 |                     </form>
54 |                 </dialog>
55 |                 <script>
56 |                     const t2 = document.getElementById('t2');
57 |                     const d2 = document.getElementById('d2');
58 | 
59 |                     t2.addEventListener('click', ()=> {
60 |                         d2.showModal();
61 |                     })
62 |                 </script>
63 |                 <style>
64 |                     #d2 {
65 |                     box-shadow: 0 0 1000px 1000px rgba(0,0,0,.625);
66 |                     }
67 |                 </style>
68 |             </div>
69 |         </main>
70 |     </body>
71 | </html>
72 | 


--------------------------------------------------------------------------------
/node_runner/package-lock.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "a11y-llm-tests-runner",
 3 |   "version": "0.1.0",
 4 |   "lockfileVersion": 3,
 5 |   "requires": true,
 6 |   "packages": {
 7 |     "": {
 8 |       "name": "a11y-llm-tests-runner",
 9 |       "version": "0.1.0",
10 |       "dependencies": {
11 |         "axe-core": "^4.10.0",
12 |         "deepmerge": "^4.3.1",
13 |         "playwright": "^1.48.0"
14 |       },
15 |       "engines": {
16 |         "node": ">=18"
17 |       }
18 |     },
19 |     "node_modules/axe-core": {
20 |       "version": "4.10.3",
21 |       "resolved": "https://registry.npmjs.org/axe-core/-/axe-core-4.10.3.tgz",
22 |       "integrity": "sha512-Xm7bpRXnDSX2YE2YFfBk2FnF0ep6tmG7xPh8iHee8MIcrgq762Nkce856dYtJYLkuIoYZvGfTs/PbZhideTcEg==",
23 |       "license": "MPL-2.0",
24 |       "engines": {
25 |         "node": ">=4"
26 |       }
27 |     },
28 |     "node_modules/deepmerge": {
29 |       "version": "4.3.1",
30 |       "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz",
31 |       "integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==",
32 |       "license": "MIT",
33 |       "engines": {
34 |         "node": ">=0.10.0"
35 |       }
36 |     },
37 |     "node_modules/fsevents": {
38 |       "version": "2.3.2",
39 |       "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
40 |       "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
41 |       "hasInstallScript": true,
42 |       "license": "MIT",
43 |       "optional": true,
44 |       "os": [
45 |         "darwin"
46 |       ],
47 |       "engines": {
48 |         "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
49 |       }
50 |     },
51 |     "node_modules/playwright": {
52 |       "version": "1.56.0",
53 |       "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.56.0.tgz",
54 |       "integrity": "sha512-X5Q1b8lOdWIE4KAoHpW3SE8HvUB+ZZsUoN64ZhjnN8dOb1UpujxBtENGiZFE+9F/yhzJwYa+ca3u43FeLbboHA==",
55 |       "license": "Apache-2.0",
56 |       "dependencies": {
57 |         "playwright-core": "1.56.0"
58 |       },
59 |       "bin": {
60 |         "playwright": "cli.js"
61 |       },
62 |       "engines": {
63 |         "node": ">=18"
64 |       },
65 |       "optionalDependencies": {
66 |         "fsevents": "2.3.2"
67 |       }
68 |     },
69 |     "node_modules/playwright-core": {
70 |       "version": "1.56.0",
71 |       "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.56.0.tgz",
72 |       "integrity": "sha512-1SXl7pMfemAMSDn5rkPeZljxOCYAmQnYLBTExuh6E8USHXGSX3dx6lYZN/xPpTz1vimXmPA9CDnILvmJaB8aSQ==",
73 |       "license": "Apache-2.0",
74 |       "bin": {
75 |         "playwright-core": "cli.js"
76 |       },
77 |       "engines": {
78 |         "node": ">=18"
79 |       }
80 |     }
81 |   }
82 | }
83 | 


--------------------------------------------------------------------------------
/test_cases/modal-dialog/examples/fail-popover.html:
--------------------------------------------------------------------------------
 1 | 
 2 | <!doctype html>
 3 | <html lang="en-us">
 4 | 	<head>
 5 |         <meta charset="utf-8">
 6 |         <title>
 7 |             Fail - custom - modal dialog example
 8 |         </title>
 9 |         <meta name="viewport" content="width=device-width, initial-scale=1.0, viewport-fit=cover">
10 |         <script id="a11y-assertions" type="application/json">{
11 |             "assertions": {
12 |                 "Each dialog has a dialog role": "fail",
13 |                 "Each dialog can be closed by escape key": "fail",
14 |                 "Each modal dialog traps keyboard focus": "fail",
15 |                 "Each modal dialog takes focus when opened": "fail",
16 |                 "Focus is not lost when each dialog closes": "fail",
17 |                 "Each modal dialog hides content behind it while open": "fail"
18 |             }
19 |         }</script>
20 | 	</head>
21 | 	<body>
22 | 		<main>
23 |             <h1>Fail - popover - modal dialog example</h1>
24 | 
25 |             <p>The following are examples of failing modal dialogs that are implemented as popovers.</p>
26 | 
27 |             <h2>Example 1</h2>
28 |             <p>
29 |                 This example has role of dialog but does not trap keyboard or screen reader focus.
30 |                 Content behind the dialog is still available to keyboard and screen reader users.
31 |             </p>
32 |             <div class=test-case>
33 |                 <button popovertarget=d1 class="trigger">Invoke popover 1</button>
34 | 
35 |                 <div role=dialog popover id=d1>
36 |                     <h2>popover 1</h2>
37 |                     <p>Look ma, no JS!  But is it good enough?</p>
38 |                     <button popovertarget=d1>close</button>
39 |                 </div>
40 | 
41 |                 <style>
42 |                     [popover]::backdrop {
43 |                     background: rgba(0,0,0,.75);
44 |                     }
45 |                 </style>
46 |             </div>
47 | 
48 |             <h2>Example 2</h2>
49 |             <p>
50 |                 This example does not have a role of dialog and does not trap keyboard or screen reader focus.
51 |                 Content behind the dialog is still available to keyboard and screen reader users.
52 |             </p>
53 |             <div class=test-case>
54 |                 <button popovertarget=d2 class="trigger">Invoke popover 2</button>
55 | 
56 |                 <div popover id=d2>
57 |                     <h2>popover 2</h2>
58 |                     <p>Look ma, no JS!  But is it good enough?</p>
59 |                     <button popovertarget=d2>close</button>
60 |                 </div>
61 | 
62 |                 <style>
63 |                     [popover]::backdrop {
64 |                     background: rgba(0,0,0,.75);
65 |                     }
66 |                 </style>
67 |             </div>
68 | 		</main>
69 | 	</body>
70 | </html>
71 | 


--------------------------------------------------------------------------------
/a11y_llm_tests/schema.py:
--------------------------------------------------------------------------------
  1 | from pydantic import BaseModel
  2 | from typing import List, Optional, Dict, Any
  3 | from datetime import datetime
  4 | 
  5 | 
  6 | class AssertionResult(BaseModel):
  7 |     name: str
  8 |     status: str  # pass|fail
  9 |     message: Optional[str] = None
 10 |     type: str = "R"  # R = Requirement (default), BP = Best Practice
 11 | 
 12 |     def model_post_init(self, __context):  # type: ignore[override]
 13 |         # Normalize and validate type for backward compatibility
 14 |         t = (self.type or "R").upper()
 15 |         if t not in {"R", "BP"}:
 16 |             t = "R"
 17 |         object.__setattr__(self, "type", t)
 18 | 
 19 | 
 20 | class TestFunctionResult(BaseModel):
 21 |     status: str  # pass|fail|error|timeout
 22 |     assertions: List[AssertionResult] = []
 23 |     error: Optional[str] = None
 24 |     duration_ms: Optional[int] = None
 25 |     total_assertion_failures: int = 0
 26 |     total_assertion_bp_failures: int = 0
 27 | 
 28 | 
 29 | class AxeNode(BaseModel):
 30 |     html: Optional[str]
 31 |     target: List[str] = []
 32 | 
 33 | 
 34 | class AxeFailure(BaseModel):
 35 |     id: str
 36 |     impact: Optional[str]
 37 |     description: str
 38 |     helpUrl: Optional[str]
 39 |     nodes: List[AxeNode] = []
 40 |     tags: List[str] = []
 41 | 
 42 | 
 43 | class AxeResult(BaseModel):
 44 |     failure_count: int  # WCAG failures only (affects pass/fail)
 45 |     failures: List[AxeFailure] = []  # WCAG failures only
 46 |     best_practice_count: int = 0  # Best practice failures (informational)
 47 |     best_practice_failures: List[AxeFailure] = []  # Best practice failures
 48 | 
 49 | 
 50 | class GenerationMeta(BaseModel):
 51 |     latency_s: float
 52 |     prompt_hash: str
 53 |     cached: bool
 54 |     tokens_in: Optional[int] = None
 55 |     tokens_out: Optional[int] = None
 56 |     total_tokens: Optional[int] = None
 57 |     cost_usd: Optional[float] = None
 58 |     # Added for sampling diversity / metadata
 59 |     seed: Optional[int] = None
 60 |     temperature: Optional[float] = None
 61 |     system_prompt: Optional[str] = None
 62 |     custom_instructions: Optional[str] = None
 63 |     effective_system_prompt: Optional[str] = None
 64 | 
 65 | 
 66 | class ResultRecord(BaseModel):
 67 |     test_name: str
 68 |     model_name: str
 69 |     timestamp: datetime
 70 |     generation_html_path: str
 71 |     screenshot_path: Optional[str]
 72 |     test_function: TestFunctionResult
 73 |     axe: Optional[AxeResult]
 74 |     result: str # PASS|FAIL|ERROR
 75 |     generation: GenerationMeta
 76 |     # Index of the sample for (test_name, model_name). 0-based. None for legacy single-sample runs.
 77 |     sample_index: Optional[int] = None
 78 | 
 79 | 
 80 | class RunSummary(BaseModel):
 81 |     run_id: str
 82 |     created_at: datetime
 83 |     results: List[ResultRecord]
 84 |     models: List[str]
 85 |     tests: List[str]
 86 | 
 87 | 
 88 | class AggregateStats(BaseModel):
 89 |     per_model: Dict[str, Dict[str, Any]]
 90 | 
 91 | 
 92 | class AggregateRecord(BaseModel):
 93 |     """Aggregate statistics for a (test_name, model_name) pair across multiple samples."""
 94 |     test_name: str
 95 |     model_name: str
 96 |     n_samples: int
 97 |     n_pass: int
 98 |     pass_at_k: Dict[str, float]  # JSON-friendly string keys
 99 |     k_values: List[int]
100 |     computed_at: datetime
101 | 
102 | 


--------------------------------------------------------------------------------
/test_cases/modal-dialog/examples/fail-aria.html:
--------------------------------------------------------------------------------
 1 | 
 2 | <!doctype html>
 3 | <html lang="en-us">
 4 | 	<head>
 5 | 		<meta charset="utf-8">
 6 | 		<title>
 7 | 			Fail - ARIA - Modal Dialog Examples
 8 | 		</title>
 9 |         <meta name="viewport" content="width=device-width, initial-scale=1.0, viewport-fit=cover">
10 |         <script id="a11y-assertions" type="application/json">{
11 |             "assertions": {
12 |                 "Each dialog has a dialog role": "fail",
13 |                 "Each dialog can be closed by escape key": "fail",
14 |                 "Each modal dialog traps keyboard focus": "fail",
15 |                 "Each modal dialog takes focus when opened": "fail",
16 |                 "Focus is not lost when each dialog closes": "fail",
17 |                 "Each modal dialog hides content behind it while open": "fail"
18 |             }
19 |         }</script>
20 | 	</head>
21 | 	<body>
22 | 		<main>
23 |             <h1>Fail - ARIA - Modal Dialog Examples</h1>
24 | 
25 |             <p>
26 |                 This is a failing example of an ARIA modal dialog. It has an incorrect role, does not hide content behind the dialog from
27 |                 keyboard and screen reader users, and does not manage keyboard focus.
28 |             </p>
29 | 				
30 |             <button id=ba class="trigger">Invoke ARIA dialog</button>
31 | 
32 |             <div class=dc>
33 |                 <div role=modal aria-modal=true class=dialog data-code-snippet="name-role-value" data-component="dialog">
34 |                     <p>ARIA can be used to create modal <a href="https://w3c.github.io/aria/#dialog">dialogs</a>.</p>
35 |                     <p>There's extra work to make them properly. The native HTML
36 |                         <a href="https://html.spec.whatwg.org/multipage/interactive-elements.html#the-dialog-element">dialog element</a> 
37 |                         handles a lot of this for us automatically.</p>
38 |                     <p>
39 |                         It's generally better to use the native HTML dialog unless
40 |                         you have specific use cases where a custom dialog might perform better.
41 |                         But even then, you may be able to still use native HTML features
42 |                         over ARIA... just sayin...
43 |                     </p>
44 |                     <button>close</button>
45 |                 </div>
46 |             </div>
47 | 
48 |     
49 |             <style>
50 |                 .dialog {
51 |                     background: white;
52 |                     border: 2px solid;
53 |                     padding: 1em;
54 |                     box-shadow: 0 0 1000px 1000px rgba(0,0,0,.625);
55 |                     position: fixed;
56 |                     inset: 0;
57 |                     margin: auto;
58 |                     max-width: 500px;
59 |                     max-height: 50vh;
60 |                     overflow: auto;
61 |                 }
62 |                 .dc {
63 |                     display: none;
64 |                 }
65 |                 .dc.open {
66 |                     display: block;
67 |                 }
68 |             </style>
69 |             <script>
70 |                 // Generic dialog functionality for multiple instances
71 |                 const dialogButtons = document.querySelectorAll('#ba');
72 |                 const dialogContainers = document.querySelectorAll('.dc');
73 |                 const dialogs = document.querySelectorAll('.dialog');
74 | 
75 |                 // Set up each dialog button
76 |                 dialogButtons.forEach((button, index) => {
77 |                     const dialogContainer = dialogContainers[index];
78 |                     const dialog = dialogs[index];
79 | 
80 |                     button.addEventListener('click', () => {
81 |                         // Close any other open dialogs first
82 |                         dialogContainers.forEach(container => container.classList.remove('open'));
83 |                         
84 |                         // Open this dialog
85 |                         dialogContainer.classList.add('open');
86 |                     });
87 | 
88 |                     // Close button functionality for this dialog
89 |                     const closeButton = dialog.querySelector('button');
90 |                     closeButton.addEventListener('click', () => {
91 |                         dialogContainer.classList.remove('open');
92 |                     });
93 |                 });
94 |             </script>
95 | 		</main>
96 | 	</body>
97 | </html>
98 | 


--------------------------------------------------------------------------------
/node_runner/runner.js:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env node
  2 | // Playwright + axe-core executor (mirrors runner.js for Puppeteer).
  3 | // NOTE: Initially Chromium-only; future work may add firefox/webkit via arg/env.
  4 | 
  5 | const fs = require("fs");
  6 | const path = require("path");
  7 | const { chromium } = require("playwright");
  8 | const axeSource = require("axe-core").source;
  9 | const merge = require('deepmerge')
 10 | 
 11 | async function main() {
 12 |   const [,, htmlPath, testJsPath, outJsonPath, screenshotPath] = process.argv;
 13 |   if (!htmlPath || !testJsPath || !outJsonPath) {
 14 |     console.error("Usage: node playwright_runner.js <htmlPath> <testJsPath> <outJsonPath> [screenshotPath]");
 15 |     process.exit(2);
 16 |   }
 17 |   const html = fs.readFileSync(htmlPath, "utf-8");
 18 |   let testFn;
 19 |   try {
 20 |     testFn = require(path.resolve(testJsPath));
 21 |   } catch (e) {
 22 |     console.error("Failed loading test file:", e);
 23 |     testFn = {};
 24 |   }
 25 |   let launchOptions = { headless: true };
 26 |   if (process.env.A11Y_LLM_EVAL_DEBUG === "1") {
 27 |     launchOptions = { headless: false, slowMo: 1000 };
 28 |   }
 29 | 
 30 |   const browser = await chromium.launch(launchOptions);
 31 |   const context = await browser.newContext({ viewport: { width: 1280, height: 800 } });
 32 |   const page = await context.newPage();
 33 |   const consoleLogs = [];
 34 |   page.on("console", msg => consoleLogs.push(msg.text()));
 35 | 
 36 |   const start = Date.now();
 37 |   let testFunctionResult = { status: "error", assertions: [] };
 38 |   let axeResult = null;
 39 |   let errorMsg = null;
 40 | 
 41 |   async function loadHTML() {
 42 |     await page.reload();
 43 |     await page.setContent(html, { waitUntil: "load" });
 44 |     await page.addScriptTag({ content: axeSource });
 45 |     await page.evaluate(() => { window.axe.setup();});
 46 |   }
 47 | 
 48 |   async function runAxeOnPage(page) {
 49 |     return await page.evaluate(async () => {
 50 |       return await window.axe.run();
 51 |     });
 52 |   }
 53 | 
 54 |   const utils = { reload: loadHTML, runAxeOnPage, merge };
 55 | 
 56 |   try {
 57 |     await loadHTML();
 58 | 
 59 |     if (!testFn.run || typeof testFn.run !== 'function') {
 60 |       testFunctionResult = { status: 'error', assertions: [], error: 'No run export (expected module.exports.run = async ({ page, assert }) => {...})' };
 61 |     } else {
 62 |       const collected = [];
 63 |       const assert = async (name, fn, opts = {}) => {
 64 |         const { type = 'R' } = opts;
 65 |         let normalizedType = (type || 'R').toUpperCase();
 66 |         if (!['R','BP'].includes(normalizedType)) normalizedType = 'R';
 67 |         try {
 68 |           const r = await fn();
 69 |           // Allow boolean or object { pass, message }
 70 |           let passVal = r;
 71 |           let message;
 72 |           if (r && typeof r === 'object' && 'pass' in r) {
 73 |             passVal = r.pass;
 74 |             message = r.message;
 75 |           }
 76 |           collected.push({ name, status: passVal ? 'pass' : 'fail', message, type: normalizedType });
 77 |         } catch (e) {
 78 |           collected.push({ name, status: 'fail', message: e.message, type: normalizedType });
 79 |         }
 80 |       };
 81 | 
 82 |       const runStart = Date.now();
 83 |       try {
 84 |         await testFn.run({ page, assert, utils });
 85 |       } catch (e) {
 86 |         errorMsg = e.stack || e.message;
 87 |       }
 88 |       const duration_ms = Date.now() - runStart;
 89 | 
 90 |       // Normalize & determine status based only on requirement failures
 91 |       const hasAssertionFailure = collected.some(a => a.type === 'R' && a.status === 'fail');
 92 |       const totalAssertionFailures = collected.filter(a => a.type === 'R' && a.status === 'fail').length;
 93 |       const totalAssertionBpFailures = collected.filter(a => a.type === 'BP' && a.status === 'fail').length;
 94 |       testFunctionResult = {
 95 |         status: hasAssertionFailure ? 'fail' : 'pass',
 96 |         assertions: collected,
 97 |         duration_ms,
 98 |         total_assertion_failures: totalAssertionFailures,
 99 |         total_assertion_bp_failures: totalAssertionBpFailures
100 |       };
101 |     }
102 | 
103 |     const processAxeResults = (results) => {
104 |       // Separate WCAG violations from best practice violations
105 |       const wcagViolations = [];
106 |       const bestPracticeViolations = [];
107 |       
108 |       let wcagCount = 0;
109 |       let bestPracticeCount = 0;
110 |       results.violations.forEach(v => {
111 |         const mappedViolation = {
112 |             id: v.id,
113 |             impact: v.impact,
114 |             description: v.description,
115 |             helpUrl: v.helpUrl,
116 |             nodes: v.nodes.map(n => ({ html: n.html, target: n.target })),
117 |             tags: v.tags
118 |         };
119 |         if (v.tags.includes('best-practice')) {
120 |           bestPracticeViolations.push(mappedViolation);
121 |           bestPracticeCount += mappedViolation.nodes.length;
122 |         } else {
123 |           wcagViolations.push(mappedViolation);
124 |           wcagCount += mappedViolation.nodes.length;
125 |         }
126 |       });
127 |       return {
128 |         failure_count: wcagCount,
129 |         failures: wcagViolations,
130 |         best_practice_count: bestPracticeCount,
131 |         best_practice_failures: bestPracticeViolations
132 |       };
133 |     }
134 | 
135 |     if (screenshotPath) {
136 |       try {
137 |         await page.screenshot({ path: screenshotPath, fullPage: true });
138 |       } catch (e) {
139 |         console.error('Screenshot failed:', e.message);
140 |       }
141 |     }
142 | 
143 |     axeResult = await runAxeOnPage(page);
144 | 
145 |     if (testFn.runAxe && typeof testFn.runAxe === 'function') {
146 |       const axeCustomResult = await testFn.runAxe({ page, utils});
147 |       if (axeCustomResult && typeof axeCustomResult === 'object') {
148 |         axeResult = merge(axeResult || {}, axeCustomResult);
149 |       }
150 |     }
151 | 
152 |     axeResult = processAxeResults(axeResult);
153 |   } catch (e) {
154 |     errorMsg = e.stack || e.message;
155 |     if (testFunctionResult.status === "error") {
156 |       testFunctionResult.error = errorMsg;
157 |     }
158 |   } finally {
159 |     await browser.close();
160 |   }
161 | 
162 |   const out = {
163 |     engine: 'playwright',
164 |     browser: 'chromium',
165 |     testFunctionResult,
166 |     axeResult,
167 |     consoleLogs,
168 |     error: errorMsg,
169 |     total_duration_ms: Date.now() - start
170 |   };
171 |   fs.writeFileSync(outJsonPath, JSON.stringify(out, null, 2), "utf-8");
172 | }
173 | 
174 | main().catch(e => {
175 |   console.error(e);
176 |   process.exit(1);
177 | });
178 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # A11y LLM Evaluation Harness and Dataset
  2 | 
  3 | This is a research project to evaluate how well various LLM models generate accessible HTML content.
  4 | 
  5 | ## Problem
  6 | LLMs currently generate code with accessibility bugs, resulting in blockers for people with disabilities and costly re-work and fixes downstream. 
  7 | 
  8 | ## Goal
  9 | Create a public test suite which can be used to benchmark how well various LLMs generates accessible HTML code. Eventually, it could also be used to help train models to generate more accessible code by default.
 10 | 
 11 | ## Methdology
 12 | - Each test case contains a prompt to generate an HTML page to demonstrate a specific pattern or component.
 13 | - This page is rendered in a real browser using Playwright (Chromium). Tests are executed against this rendered page.
 14 | - The HTML is evaluated against axe-core, one of the most popular automated accessibility testing engines.
 15 | - The HTML is also evaluated against a manually defined set of assertions, customized for the specific test case. This allows for more robust testing than just using axe-core.
 16 | - Tests only pass if zero axe-core failures are found AND all *requirement* assertions pass. Best Practice (BP) assertion failures do not fail the test but are tracked separately.
 17 | 
 18 | ## Features
 19 | - Python orchestrator (generation, execution, reporting)
 20 | - Node.js Playwright + axe-core evaluation
 21 | - Per-test prompts & injected JS assertions
 22 | - HTML report summarizing performance
 23 | - Token + cost tracking (tokens in/out/total, per-generation cost, aggregated per model)
 24 | - Multi-sample generation with pass@k metrics (probability at least one passing generation in k draws)
 25 | 
 26 | ## Sampling & pass@k Metrics
 27 | You can request multiple independent generations ("samples") per (test, model). This enables computation of pass@k metrics similar to code evaluation benchmarks.
 28 | 
 29 | ### CLI Usage
 30 | 
 31 | Step 1: Send prompts to the LLMs and generate HTML
 32 | ```bash
 33 | python -m a11y_llm_tests.cli run \
 34 |   --models-file config/models.yaml \
 35 |   --out runs \
 36 |   --samples 20 \
 37 | ```
 38 | 
 39 | Step 2: Run the eval and generate the report
 40 | ```bash
 41 | python -m a11y_llm_tests.cli evaluate \
 42 |   <path to run directory>
 43 |   --k 1,5,10
 44 | ```
 45 | 
 46 | Artifacts:
 47 | - Each sample's HTML: `runs/<ts>/raw/<test>/<model>__s<idx>.html` (single-sample keeps legacy `<model>.html`)
 48 | - Screenshots with analogous naming
 49 | - `results.json` now includes per-sample records + an `aggregates` array with pass@k stats.
 50 | - Report includes an aggregate pass@k table and grouped per-sample cards.
 51 | 
 52 | Tips:
 53 | - Increase `temperature` (or other diversity params) to reduce sample correlation.
 54 | - Use `--disable-cache` if you want fresh generations even when prompt/model/seed repeat.
 55 | 
 56 | 
 57 | ## Quick Start
 58 | ```bash
 59 | python3 -m venv .venv
 60 | source .venv/bin/activate
 61 | pip install --upgrade pip
 62 | pip install -r requirements.txt
 63 | 
 64 | # Node deps
 65 | bash scripts/install_node_deps.sh
 66 | 
 67 | # Copy env and set keys
 68 | cp .env.example .env
 69 | export OPENAI_API_KEY=... # etc. or put in .env and use dotenv
 70 | 
 71 | # Copy model config and set API keys
 72 | cp config/models.yaml.example config/models.yaml
 73 | 
 74 | # Run all tests against configured models
 75 | python -m a11y_llm_tests.cli run --models-file config/models.yaml --out runs
 76 | ```
 77 | 
 78 | ## Adding a Test Case
 79 | Create a new folder under `test_cases/`:
 80 | ```
 81 | test_cases/
 82 |   form-labels/
 83 |     prompt.md
 84 |     test.js
 85 |     example-fail/
 86 |     example-pass/
 87 | ```
 88 | 
 89 | `prompt.md` contains ONLY the user-facing instruction for the model.
 90 | 
 91 | `test.js` must export:
 92 | 
 93 | ```js
 94 | module.exports.run = async ({ page, assert }) => {
 95 |   await assert("Has an h1", async () => {
 96 |     const count = await page.$$eval('h1', els => els.length);
 97 |     return count >= 1; // truthy => pass, falsy => fail
 98 |   });
 99 |   await assert("Sequential heading levels", async () => {
100 |     // Return object form to include custom message
101 |     const ok = await page.$$eval('h1 + h2', els => els.length) > 0;
102 |     return { pass: ok, message: ok ? undefined : 'h2 does not follow h1' };
103 |   }, { type: 'BP' });
104 |   return {}; // assertions collected automatically
105 | };
106 | ```
107 | 
108 | The runner injects an `assert(name, fn, opts?)` helper:
109 | 
110 | | Parameter | Description |
111 | |-----------|-------------|
112 | | `name` | Human-readable assertion label |
113 | | `fn` | Async/Sync function returning boolean OR `{ pass, message? }` |
114 | | `opts.type` | `'R'` (Requirement, default) or `'BP'` (Best Practice) |
115 | 
116 | Return shape from `run` can be empty.
117 | 
118 | ### Assertion Types
119 | 
120 | Each assertion may now include a `type` field:
121 | 
122 | | Type | Meaning | Affects Test Pass/Fail | Aggregated Separately |
123 | |------|---------|------------------------|-----------------------|
124 | | `R`  | Requirement (default) | Yes (any failing R => test fails) | Requirement Pass Rate |
125 | | `BP` | Best Practice | No (ignored for pass/fail) | Best Practice Pass Rate |
126 | 
127 | If `type` is omitted it defaults to `R` for backward compatibility. The HTML report shows both Requirement Pass Rate (percentage of tests whose requirement assertions passed) and Best Practice Pass Rate (percentage of tests containing BP assertions where all BP assertions passed).
128 | 
129 | Example assertion objects returned from `run`:
130 | 
131 | ```js
132 | return {
133 |   assertions: [
134 |     { name: 'has main landmark', status: 'pass', type: 'R' },
135 |     { name: 'images have alt text', status: 'fail', type: 'BP', message: '1 of 5 images missing alt' }
136 |   ]
137 | };
138 | ```
139 | 
140 | ## Report
141 | Generated at `runs/<timestamp>/report.html` with:
142 | - Summary stats per model
143 | - Detailed per model/test breakdown
144 | - Axe violations
145 | - Assertions & statuses
146 | - Pass@k aggregate table and per-sample cards when multiple samples are collected
147 | 
148 | ## Contributing
149 | 
150 | This project welcomes contributions and suggestions.  Most contributions require you to agree to a
151 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
152 | the rights to use your contribution. For details, visit [Contributor License Agreements](https://cla.opensource.microsoft.com).
153 | 
154 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
155 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
156 | provided by the bot. You will only need to do this once across all repos using our CLA.
157 | 
158 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
159 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
160 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
161 | 
162 | ## Trademarks
163 | 
164 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
165 | trademarks or logos is subject to and must follow
166 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/legal/intellectualproperty/trademarks/usage/general).
167 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
168 | Any use of third-party trademarks or logos are subject to those third-party's policies.
169 | 


--------------------------------------------------------------------------------
/test_cases/modal-dialog/examples/pass-aria.html:
--------------------------------------------------------------------------------
  1 | 
  2 | <!doctype html>
  3 | <html lang="en-us">
  4 | 	<head>
  5 | 		<meta charset="utf-8">
  6 | 		<title>
  7 | 			Pass - ARIA - Modal Dialog Examples
  8 | 		</title>
  9 |         <meta name="viewport" content="width=device-width, initial-scale=1.0, viewport-fit=cover">
 10 |         <script id="a11y-assertions" type="application/json">{
 11 |             "assertions": {
 12 |                 "Each dialog has a dialog role": "pass",
 13 |                 "Each dialog can be closed by escape key": "pass",
 14 |                 "Each modal dialog traps keyboard focus": "pass",
 15 |                 "Each modal dialog takes focus when opened": "pass",
 16 |                 "Focus is not lost when each dialog closes": "pass",
 17 |                 "Each modal dialog hides content behind it while open": "pass"
 18 |             }
 19 |         }</script>
 20 | 	</head>
 21 | 	<body>
 22 | 		<main>
 23 |             <div id="maincontent">
 24 |                  <h1>Pass - ARIA - Modal Dialog Examples</h1>
 25 | 				
 26 |                 <button id=ba class="trigger">Invoke ARIA dialog</button>
 27 |             </div>
 28 |            
 29 |             <div class=dc>
 30 |                 <div tabindex=0 data-bumper></div>
 31 |                 <div role=dialog aria-modal=true class=dialog data-code-snippet="name-role-value" data-component="dialog">
 32 |                     <p>ARIA can be used to create modal <a href="https://w3c.github.io/aria/#dialog">dialogs</a>.</p>
 33 |                     <p>There's extra work to make them properly. The native HTML
 34 |                         <a href="https://html.spec.whatwg.org/multipage/interactive-elements.html#the-dialog-element">dialog element</a> 
 35 |                         handles a lot of this for us automatically.</p>
 36 |                     <p>
 37 |                         It's generally better to use the native HTML dialog unless
 38 |                         you have specific use cases where a custom dialog might perform better.
 39 |                         But even then, you may be able to still use native HTML features
 40 |                         over ARIA... just sayin...
 41 |                     </p>
 42 |                     <button>close</button>
 43 |                 </div>
 44 |                 <div tabindex=0 data-bumper></div>
 45 |             </div>
 46 | 
 47 |     
 48 |             <style>
 49 |                 .dialog {
 50 |                     background: white;
 51 |                     border: 2px solid;
 52 |                     padding: 1em;
 53 |                     box-shadow: 0 0 1000px 1000px rgba(0,0,0,.625);
 54 |                     position: fixed;
 55 |                     inset: 0;
 56 |                     margin: auto;
 57 |                     max-width: 500px;
 58 |                     max-height: 50vh;
 59 |                     overflow: auto;
 60 |                 }
 61 |                 .dc {
 62 |                     display: none;
 63 |                 }
 64 |                 .dc.open {
 65 |                     display: block;
 66 |                 }
 67 |             </style>
 68 |             <script>
 69 |                 // Generic dialog functionality for multiple instances
 70 |                 const dialogButtons = document.querySelectorAll('#ba');
 71 |                 const dialogContainers = document.querySelectorAll('.dc');
 72 |                 const dialogs = document.querySelectorAll('.dialog');
 73 |                 const mainContent = document.getElementById('maincontent');
 74 |                 let dialogOpen = false;
 75 | 
 76 |                 // Set up each dialog button
 77 |                 dialogButtons.forEach((button, index) => {
 78 |                     const dialogContainer = dialogContainers[index];
 79 |                     const dialog = dialogs[index];
 80 |                     const open = () => {
 81 |                         dialogContainer.classList.add('open');
 82 |                         mainContent.setAttribute('aria-hidden', 'true');
 83 |                         dialogOpen = true;
 84 |                     };
 85 |                     const close = () => {
 86 |                         dialogContainer.classList.remove('open');
 87 |                         mainContent.removeAttribute('aria-hidden');
 88 |                         dialogOpen = false;
 89 |                         button.focus();
 90 |                     };
 91 | 
 92 |                     dialog.addEventListener('keydown', (event) => {
 93 |                         if (event.key === 'Escape') {
 94 |                             close();
 95 |                         }
 96 |                     });
 97 | 
 98 |                     button.addEventListener('click', () => {
 99 |                         // Close any other open dialogs first
100 |                         dialogContainers.forEach(container => container.classList.remove('open'));
101 |                         
102 |                         // Open this dialog
103 |                         open();
104 |                         
105 |                         // Find first focusable element within the dialog
106 |                         const focusableElements = dialog.querySelectorAll(
107 |                             'button, [href], input:not([type=hidden]), select, textarea, [tabindex]'
108 |                             );
109 |                         
110 |                         if (focusableElements.length > 0) {
111 |                             focusableElements[0].focus();
112 |                         }
113 |                     });
114 | 
115 |                     // Close button functionality for this dialog
116 |                     const closeButton = dialog.querySelector('button');
117 |                     closeButton.addEventListener('click', () => {
118 |                         close();
119 |                     });
120 |                 });
121 | 
122 |                 // Focus trap for bumper elements
123 |                 const bumpers = document.querySelectorAll('[data-bumper]');
124 |                 
125 |                 bumpers.forEach((bumper) => {
126 |                     bumper.addEventListener('focus', () => {
127 |                         // Find which dialog container this bumper belongs to
128 |                         const dialogContainer = bumper.parentElement;
129 |                         const dialog = dialogContainer.querySelector('.dialog');
130 |                         
131 |                         if (dialog) {
132 |                             const focusableElements = dialog.querySelectorAll(
133 |                                 'button, [href], input, select, textarea, [tabindex]:not([tabindex="-1"])'
134 |                             );
135 |                             
136 |                             if (focusableElements.length > 0) {
137 |                                 // Check if this is the first or last bumper in this container
138 |                                 const containerBumpers = dialogContainer.querySelectorAll('[data-bumper]');
139 |                                 const bumperIndex = Array.from(containerBumpers).indexOf(bumper);
140 |                                 
141 |                                 if (bumperIndex === 0) {
142 |                                 // First bumper - move to last focusable element
143 |                                 focusableElements[focusableElements.length - 1].focus();
144 |                                 } else {
145 |                                 // Last bumper - move to first focusable element
146 |                                 focusableElements[0].focus();
147 |                                 }
148 |                             }
149 |                         }
150 |                     });
151 |                 });
152 | 
153 |                 function checkARIADialogFocusEscape() {
154 |                     const activeElement = document.activeElement;
155 |                     
156 |                     // Check if dta dialog is open and focus has escaped
157 |                     if (dialogOpen) {
158 |                         const dta = document.querySelector('[role=dialog]');
159 |                         if (!dta.closest('.dc').contains(activeElement)) {
160 |                             const focusableElements = dta.querySelectorAll(
161 |                                 'button, [href], input, select, textarea, [tabindex]:not([tabindex="-1"])'
162 |                             );
163 |                             if (focusableElements.length > 0) {
164 |                                 focusableElements[0].focus();
165 |                             }
166 |                         }
167 |                     }
168 |                 }
169 |                 // Check for focus escape on focus events for ARIA dialog
170 |                 document.addEventListener('focusin', checkARIADialogFocusEscape);
171 |             </script>
172 | 		</main>
173 | 	</body>
174 | </html>
175 | 


--------------------------------------------------------------------------------
/tests/test_cli_sampling.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from pathlib import Path
  3 | from typer.testing import CliRunner
  4 | from a11y_llm_tests.cli import app
  5 | 
  6 | # We'll monkeypatch generator and node_bridge to avoid real API calls
  7 | 
  8 | class DummyResp:
  9 |     def __init__(self, content):
 10 |         self.choices = [type("c", (), {"message": type("m", (), {"content": content})()})]
 11 |         self.usage = {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30}
 12 |         self.response_cost = 0.001
 13 | 
 14 | 
 15 | def fake_generate_html_with_meta(model, prompt, iteration, temperature=None, seed=None, disable_cache=False):
 16 |     # Generate deterministic pass/fail pattern by seed (even seed -> pass, odd -> fail)
 17 |     status_comment = f"<!-- seed:{seed} -->" if seed is not None else ""
 18 |     html = f"<html><body><h1>Test {model}</h1>{status_comment}</body></html>"
 19 |     return html, {
 20 |         "cached": False,
 21 |         "latency_s": 0.01,
 22 |         "prompt_hash": "deadbeef",
 23 |         "tokens_in": 5,
 24 |         "tokens_out": 7,
 25 |         "total_tokens": 12,
 26 |         "cost_usd": 0.0005,
 27 |         "seed": seed,
 28 |         "temperature": temperature,
 29 |     }
 30 | 
 31 | 
 32 | def fake_run(html, test_js_path, screenshot_path):
 33 |     # Extract seed from comment to decide pass/fail
 34 |     import re
 35 |     m = re.search(r"seed:(\d+)", html)
 36 |     seed = int(m.group(1)) if m else 0
 37 |     status = "pass" if seed % 2 == 0 else "fail"
 38 |     return {
 39 |         "testFunctionResult": {
 40 |             "status": status,
 41 |             "assertions": [
 42 |                 {"name": "dummy", "status": status, "message": None, "type": "R"},
 43 |             ],
 44 |             "duration_ms": 5,
 45 |         },
 46 |         "axeResult": {
 47 |             "violation_count": 0,
 48 |             "violations": [],
 49 |         },
 50 |     }
 51 | 
 52 | 
 53 | def test_cli_sampling_multi(monkeypatch, tmp_path):
 54 |     monkeypatch.setattr("a11y_llm_tests.generator.generate_html_with_meta", fake_generate_html_with_meta)
 55 |     monkeypatch.setattr("a11y_llm_tests.node_bridge.run", fake_run)
 56 | 
 57 |     # Create a minimal test case directory
 58 |     tc_dir = tmp_path / "test_cases" / "sample-case"
 59 |     tc_dir.mkdir(parents=True)
 60 |     (tc_dir / "prompt.md").write_text("Generate a page", encoding="utf-8")
 61 |     (tc_dir / "test.js").write_text("module.exports=()=>{}", encoding="utf-8")
 62 | 
 63 |     # Provide models config
 64 |     config_dir = tmp_path / "config"
 65 |     config_dir.mkdir()
 66 |     (config_dir / "models.yaml").write_text("""models:\n  - name: test-model\n""", encoding="utf-8")
 67 | 
 68 |     runner = CliRunner()
 69 |     # Generation phase only
 70 |     gen_result = runner.invoke(app, [
 71 |         "run",
 72 |         "--models-file", str(config_dir / "models.yaml"),
 73 |         "--out", str(tmp_path / "runs"),
 74 |         "--test-cases-dir", str(tmp_path / "test_cases"),
 75 |         "--samples", "4",
 76 |         "--k", "1,2,4",
 77 |         "--base-seed", "100",
 78 |     ])
 79 |     assert gen_result.exit_code == 0, gen_result.output
 80 |     runs_dir = tmp_path / "runs"
 81 |     run_subdirs = sorted(p for p in runs_dir.iterdir() if p.is_dir())
 82 |     assert run_subdirs, "No run directory created"
 83 |     latest = run_subdirs[-1]
 84 |     # Ensure aggregates are empty pre-evaluation
 85 |     pre_data = json.loads((latest / "results.json").read_text(encoding="utf-8"))
 86 |     assert pre_data["aggregates"] == []
 87 |     # Evaluation phase
 88 |     eval_result = runner.invoke(app, [
 89 |         "evaluate",
 90 |         str(latest),
 91 |         "--test-cases-dir", str(tmp_path / "test_cases"),
 92 |         "--k", "1,2,4",
 93 |         "--no-generate-report",
 94 |     ])
 95 |     assert eval_result.exit_code == 0, eval_result.output
 96 |     data = json.loads((latest / "results.json").read_text(encoding="utf-8"))
 97 |     aggs = data["aggregates"]
 98 |     assert len(aggs) == 1
 99 |     agg = aggs[0]
100 |     assert agg["n_samples"] == 4
101 |     assert agg["n_pass"] == 2  # Seeds 100,101,102,103 -> pass,fail,pass,fail
102 |     p1 = agg["pass_at_k"]["1"]
103 |     p2 = agg["pass_at_k"]["2"]
104 |     assert abs(p1 - 0.5) < 1e-6
105 |     assert 0.82 < p2 < 0.85
106 |     assert agg["pass_at_k"]["4"] == 1.0
107 |     sample_indices = sorted(r["sample_index"] for r in data["results"])
108 |     assert sample_indices == [0, 1, 2, 3]
109 | 
110 | 
111 | def test_cli_sampling_single(monkeypatch, tmp_path):
112 |     monkeypatch.setattr("a11y_llm_tests.generator.generate_html_with_meta", fake_generate_html_with_meta)
113 |     monkeypatch.setattr("a11y_llm_tests.node_bridge.run", fake_run)
114 | 
115 |     tc_dir = tmp_path / "test_cases" / "single"
116 |     tc_dir.mkdir(parents=True)
117 |     (tc_dir / "prompt.md").write_text("Prompt", encoding="utf-8")
118 |     (tc_dir / "test.js").write_text("module.exports=()=>{}", encoding="utf-8")
119 | 
120 |     config_dir = tmp_path / "config"
121 |     config_dir.mkdir(exist_ok=True)
122 |     (config_dir / "models.yaml").write_text("""models:\n  - name: m1\n""", encoding="utf-8")
123 | 
124 |     runner = CliRunner()
125 |     gen_result = runner.invoke(app, [
126 |         "run",
127 |         "--models-file", str(config_dir / "models.yaml"),
128 |         "--out", str(tmp_path / "runs"),
129 |         "--test-cases-dir", str(tmp_path / "test_cases"),
130 |         "--samples", "1",
131 |         "--k", "1,5",
132 |         "--base-seed", "5",
133 |     ])
134 |     assert gen_result.exit_code == 0, gen_result.output
135 |     runs_dir = tmp_path / "runs"
136 |     run_subdirs = sorted(p for p in runs_dir.iterdir() if p.is_dir())
137 |     latest = run_subdirs[-1]
138 |     pre_data = json.loads((latest / "results.json").read_text(encoding="utf-8"))
139 |     assert pre_data["aggregates"] == []
140 |     eval_result = runner.invoke(app, [
141 |         "evaluate",
142 |         str(latest),
143 |         "--test-cases-dir", str(tmp_path / "test_cases"),
144 |         "--k", "1,5",
145 |         "--no-generate-report",
146 |     ])
147 |     assert eval_result.exit_code == 0, eval_result.output
148 |     data = json.loads((latest / "results.json").read_text(encoding="utf-8"))
149 |     agg = data["aggregates"][0]
150 |     assert agg["n_samples"] == 1
151 |     assert agg["n_pass"] == 0  # Seed=5 -> fail (odd)
152 |     assert agg["pass_at_k"]["1"] == 0.0
153 | 
154 | 
155 | def test_bp_failure_not_affect_requirement_pass(monkeypatch, tmp_path):
156 |     # Requirement passes, BP fails => overall should pass
157 |     def gen_html(model, prompt, iteration, temperature=None, seed=None, disable_cache=False):
158 |         return "<html><body><h1>Page</h1></body></html>", {
159 |             "cached": False,
160 |             "latency_s": 0.01,
161 |             "prompt_hash": "hash",
162 |             "cost_usd": 0.0001,
163 |             "seed": 1,
164 |             "temperature": temperature,
165 |         }
166 | 
167 |     def run(html, test_js_path, screenshot_path):
168 |         return {
169 |             "testFunctionResult": {
170 |                 "status": "pass",  # legacy status (will be recomputed logic wise in runner normally)
171 |                 "assertions": [
172 |                     {"name": "req-1", "status": "pass", "type": "R"},
173 |                     {"name": "bp-1", "status": "fail", "type": "BP"},
174 |                 ],
175 |                 "duration_ms": 3,
176 |             },
177 |             "axeResult": {"violation_count": 0, "violations": []},
178 |         }
179 | 
180 |     monkeypatch.setattr("a11y_llm_tests.generator.generate_html_with_meta", gen_html)
181 |     monkeypatch.setattr("a11y_llm_tests.node_bridge.run", run)
182 | 
183 |     tc_dir = tmp_path / "test_cases" / "bp-case"
184 |     tc_dir.mkdir(parents=True)
185 |     (tc_dir / "prompt.md").write_text("Prompt", encoding="utf-8")
186 |     (tc_dir / "test.js").write_text("module.exports=()=>{}", encoding="utf-8")
187 | 
188 |     config_dir = tmp_path / "config"
189 |     config_dir.mkdir(exist_ok=True)
190 |     (config_dir / "models.yaml").write_text("""models:\n  - name: modelX\n""", encoding="utf-8")
191 | 
192 |     runner_cli = CliRunner()
193 |     gen_result = runner_cli.invoke(app, [
194 |         "run",
195 |         "--models-file", str(config_dir / "models.yaml"),
196 |         "--out", str(tmp_path / "runs"),
197 |         "--test-cases-dir", str(tmp_path / "test_cases"),
198 |         "--samples", "1",
199 |         "--k", "1",
200 |     ])
201 |     assert gen_result.exit_code == 0, gen_result.output
202 |     runs_dir = tmp_path / "runs"
203 |     run_subdirs = sorted(p for p in runs_dir.iterdir() if p.is_dir())
204 |     latest = run_subdirs[-1]
205 |     # Evaluate
206 |     eval_result = runner_cli.invoke(app, [
207 |         "evaluate",
208 |         str(latest),
209 |         "--test-cases-dir", str(tmp_path / "test_cases"),
210 |         "--k", "1",
211 |         "--no-generate-report",
212 |     ])
213 |     assert eval_result.exit_code == 0, eval_result.output
214 |     data = json.loads((latest / "results.json").read_text(encoding="utf-8"))
215 |     assert data["results"][0]["result"] == "PASS"


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | __pycache__/
  2 | .pytest_cache/
  3 | .mypy_cache/
  4 | .cache/
  5 | .venv/
  6 | node_modules/
  7 | runs/*
  8 | !runs/.gitkeep
  9 | *.pyc
 10 | *.pyo
 11 | *.env
 12 | .DS_Store
 13 | config/models.yaml
 14 | 
 15 | 
 16 | ## Ignore Visual Studio temporary files, build results, and
 17 | ## files generated by popular Visual Studio add-ons.
 18 | ##
 19 | ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
 20 | 
 21 | # User-specific files
 22 | *.rsuser
 23 | *.suo
 24 | *.user
 25 | *.userosscache
 26 | *.sln.docstates
 27 | *.env
 28 | 
 29 | # User-specific files (MonoDevelop/Xamarin Studio)
 30 | *.userprefs
 31 | 
 32 | # Mono auto generated files
 33 | mono_crash.*
 34 | 
 35 | # Build results
 36 | [Dd]ebug/
 37 | [Dd]ebugPublic/
 38 | [Rr]elease/
 39 | [Rr]eleases/
 40 | x64/
 41 | x86/
 42 | [Ww][Ii][Nn]32/
 43 | [Aa][Rr][Mm]/
 44 | [Aa][Rr][Mm]64/
 45 | [Aa][Rr][Mm]64[Ee][Cc]/
 46 | bld/
 47 | [Oo]bj/
 48 | [Oo]ut/
 49 | [Ll]og/
 50 | [Ll]ogs/
 51 | 
 52 | # Build results on 'Bin' directories
 53 | **/[Bb]in/*
 54 | # Uncomment if you have tasks that rely on *.refresh files to move binaries
 55 | # (https://github.com/github/gitignore/pull/3736)
 56 | #!**/[Bb]in/*.refresh
 57 | 
 58 | # Visual Studio 2015/2017 cache/options directory
 59 | .vs/
 60 | # Uncomment if you have tasks that create the project's static files in wwwroot
 61 | #wwwroot/
 62 | 
 63 | # Visual Studio 2017 auto generated files
 64 | Generated\ Files/
 65 | 
 66 | # MSTest test Results
 67 | [Tt]est[Rr]esult*/
 68 | [Bb]uild[Ll]og.*
 69 | *.trx
 70 | 
 71 | # NUnit
 72 | *.VisualState.xml
 73 | TestResult.xml
 74 | nunit-*.xml
 75 | 
 76 | # Approval Tests result files
 77 | *.received.*
 78 | 
 79 | # Build Results of an ATL Project
 80 | [Dd]ebugPS/
 81 | [Rr]eleasePS/
 82 | dlldata.c
 83 | 
 84 | # Benchmark Results
 85 | BenchmarkDotNet.Artifacts/
 86 | 
 87 | # .NET Core
 88 | project.lock.json
 89 | project.fragment.lock.json
 90 | artifacts/
 91 | 
 92 | # ASP.NET Scaffolding
 93 | ScaffoldingReadMe.txt
 94 | 
 95 | # StyleCop
 96 | StyleCopReport.xml
 97 | 
 98 | # Files built by Visual Studio
 99 | *_i.c
100 | *_p.c
101 | *_h.h
102 | *.ilk
103 | *.meta
104 | *.obj
105 | *.idb
106 | *.iobj
107 | *.pch
108 | *.pdb
109 | *.ipdb
110 | *.pgc
111 | *.pgd
112 | *.rsp
113 | # but not Directory.Build.rsp, as it configures directory-level build defaults
114 | !Directory.Build.rsp
115 | *.sbr
116 | *.tlb
117 | *.tli
118 | *.tlh
119 | *.tmp
120 | *.tmp_proj
121 | *_wpftmp.csproj
122 | *.log
123 | *.tlog
124 | *.vspscc
125 | *.vssscc
126 | .builds
127 | *.pidb
128 | *.svclog
129 | *.scc
130 | 
131 | # Chutzpah Test files
132 | _Chutzpah*
133 | 
134 | # Visual C++ cache files
135 | ipch/
136 | *.aps
137 | *.ncb
138 | *.opendb
139 | *.opensdf
140 | *.sdf
141 | *.cachefile
142 | *.VC.db
143 | *.VC.VC.opendb
144 | 
145 | # Visual Studio profiler
146 | *.psess
147 | *.vsp
148 | *.vspx
149 | *.sap
150 | 
151 | # Visual Studio Trace Files
152 | *.e2e
153 | 
154 | # TFS 2012 Local Workspace
155 | $tf/
156 | 
157 | # Guidance Automation Toolkit
158 | *.gpState
159 | 
160 | # ReSharper is a .NET coding add-in
161 | _ReSharper*/
162 | *.[Rr]e[Ss]harper
163 | *.DotSettings.user
164 | 
165 | # TeamCity is a build add-in
166 | _TeamCity*
167 | 
168 | # DotCover is a Code Coverage Tool
169 | *.dotCover
170 | 
171 | # AxoCover is a Code Coverage Tool
172 | .axoCover/*
173 | !.axoCover/settings.json
174 | 
175 | # Coverlet is a free, cross platform Code Coverage Tool
176 | coverage*.json
177 | coverage*.xml
178 | coverage*.info
179 | 
180 | # Visual Studio code coverage results
181 | *.coverage
182 | *.coveragexml
183 | 
184 | # NCrunch
185 | _NCrunch_*
186 | .NCrunch_*
187 | .*crunch*.local.xml
188 | nCrunchTemp_*
189 | 
190 | # MightyMoose
191 | *.mm.*
192 | AutoTest.Net/
193 | 
194 | # Web workbench (sass)
195 | .sass-cache/
196 | 
197 | # Installshield output folder
198 | [Ee]xpress/
199 | 
200 | # DocProject is a documentation generator add-in
201 | DocProject/buildhelp/
202 | DocProject/Help/*.HxT
203 | DocProject/Help/*.HxC
204 | DocProject/Help/*.hhc
205 | DocProject/Help/*.hhk
206 | DocProject/Help/*.hhp
207 | DocProject/Help/Html2
208 | DocProject/Help/html
209 | 
210 | # Click-Once directory
211 | publish/
212 | 
213 | # Publish Web Output
214 | *.[Pp]ublish.xml
215 | *.azurePubxml
216 | # Note: Comment the next line if you want to checkin your web deploy settings,
217 | # but database connection strings (with potential passwords) will be unencrypted
218 | *.pubxml
219 | *.publishproj
220 | 
221 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
222 | # checkin your Azure Web App publish settings, but sensitive information contained
223 | # in these scripts will be unencrypted
224 | PublishScripts/
225 | 
226 | # NuGet Packages
227 | *.nupkg
228 | # NuGet Symbol Packages
229 | *.snupkg
230 | # The packages folder can be ignored because of Package Restore
231 | **/[Pp]ackages/*
232 | # except build/, which is used as an MSBuild target.
233 | !**/[Pp]ackages/build/
234 | # Uncomment if necessary however generally it will be regenerated when needed
235 | #!**/[Pp]ackages/repositories.config
236 | # NuGet v3's project.json files produces more ignorable files
237 | *.nuget.props
238 | *.nuget.targets
239 | 
240 | # Microsoft Azure Build Output
241 | csx/
242 | *.build.csdef
243 | 
244 | # Microsoft Azure Emulator
245 | ecf/
246 | rcf/
247 | 
248 | # Windows Store app package directories and files
249 | AppPackages/
250 | BundleArtifacts/
251 | Package.StoreAssociation.xml
252 | _pkginfo.txt
253 | *.appx
254 | *.appxbundle
255 | *.appxupload
256 | 
257 | # Visual Studio cache files
258 | # files ending in .cache can be ignored
259 | *.[Cc]ache
260 | # but keep track of directories ending in .cache
261 | !?*.[Cc]ache/
262 | 
263 | # Others
264 | ClientBin/
265 | ~$*
266 | *~
267 | *.dbmdl
268 | *.dbproj.schemaview
269 | *.jfm
270 | *.pfx
271 | *.publishsettings
272 | orleans.codegen.cs
273 | 
274 | # Including strong name files can present a security risk
275 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
276 | #*.snk
277 | 
278 | # Since there are multiple workflows, uncomment next line to ignore bower_components
279 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
280 | #bower_components/
281 | 
282 | # RIA/Silverlight projects
283 | Generated_Code/
284 | 
285 | # Backup & report files from converting an old project file
286 | # to a newer Visual Studio version. Backup files are not needed,
287 | # because we have git ;-)
288 | _UpgradeReport_Files/
289 | Backup*/
290 | UpgradeLog*.XML
291 | UpgradeLog*.htm
292 | ServiceFabricBackup/
293 | *.rptproj.bak
294 | 
295 | # SQL Server files
296 | *.mdf
297 | *.ldf
298 | *.ndf
299 | 
300 | # Business Intelligence projects
301 | *.rdl.data
302 | *.bim.layout
303 | *.bim_*.settings
304 | *.rptproj.rsuser
305 | *- [Bb]ackup.rdl
306 | *- [Bb]ackup ([0-9]).rdl
307 | *- [Bb]ackup ([0-9][0-9]).rdl
308 | 
309 | # Microsoft Fakes
310 | FakesAssemblies/
311 | 
312 | # GhostDoc plugin setting file
313 | *.GhostDoc.xml
314 | 
315 | # Node.js Tools for Visual Studio
316 | .ntvs_analysis.dat
317 | node_modules/
318 | 
319 | # Visual Studio 6 build log
320 | *.plg
321 | 
322 | # Visual Studio 6 workspace options file
323 | *.opt
324 | 
325 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
326 | *.vbw
327 | 
328 | # Visual Studio 6 auto-generated project file (contains which files were open etc.)
329 | *.vbp
330 | 
331 | # Visual Studio 6 workspace and project file (working project files containing files to include in project)
332 | *.dsw
333 | *.dsp
334 | 
335 | # Visual Studio 6 technical files
336 | *.ncb
337 | *.aps
338 | 
339 | # Visual Studio LightSwitch build output
340 | **/*.HTMLClient/GeneratedArtifacts
341 | **/*.DesktopClient/GeneratedArtifacts
342 | **/*.DesktopClient/ModelManifest.xml
343 | **/*.Server/GeneratedArtifacts
344 | **/*.Server/ModelManifest.xml
345 | _Pvt_Extensions
346 | 
347 | # Paket dependency manager
348 | **/.paket/paket.exe
349 | paket-files/
350 | 
351 | # FAKE - F# Make
352 | **/.fake/
353 | 
354 | # CodeRush personal settings
355 | **/.cr/personal
356 | 
357 | # Python Tools for Visual Studio (PTVS)
358 | **/__pycache__/
359 | *.pyc
360 | 
361 | # Cake - Uncomment if you are using it
362 | #tools/**
363 | #!tools/packages.config
364 | 
365 | # Tabs Studio
366 | *.tss
367 | 
368 | # Telerik's JustMock configuration file
369 | *.jmconfig
370 | 
371 | # BizTalk build output
372 | *.btp.cs
373 | *.btm.cs
374 | *.odx.cs
375 | *.xsd.cs
376 | 
377 | # OpenCover UI analysis results
378 | OpenCover/
379 | 
380 | # Azure Stream Analytics local run output
381 | ASALocalRun/
382 | 
383 | # MSBuild Binary and Structured Log
384 | *.binlog
385 | MSBuild_Logs/
386 | 
387 | # AWS SAM Build and Temporary Artifacts folder
388 | .aws-sam
389 | 
390 | # NVidia Nsight GPU debugger configuration file
391 | *.nvuser
392 | 
393 | # MFractors (Xamarin productivity tool) working folder
394 | **/.mfractor/
395 | 
396 | # Local History for Visual Studio
397 | **/.localhistory/
398 | 
399 | # Visual Studio History (VSHistory) files
400 | .vshistory/
401 | 
402 | # BeatPulse healthcheck temp database
403 | healthchecksdb
404 | 
405 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
406 | MigrationBackup/
407 | 
408 | # Ionide (cross platform F# VS Code tools) working folder
409 | **/.ionide/
410 | 
411 | # Fody - auto-generated XML schema
412 | FodyWeavers.xsd
413 | 
414 | # VS Code files for those working on multiple tools
415 | .vscode/*
416 | !.vscode/settings.json
417 | !.vscode/tasks.json
418 | !.vscode/launch.json
419 | !.vscode/extensions.json
420 | !.vscode/*.code-snippets
421 | 
422 | # Local History for Visual Studio Code
423 | .history/
424 | 
425 | # Built Visual Studio Code Extensions
426 | *.vsix
427 | 
428 | # Windows Installer files from build outputs
429 | *.cab
430 | *.msi
431 | *.msix
432 | *.msm
433 | *.msp
434 | 


--------------------------------------------------------------------------------
/tests/test_examples.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from pathlib import Path
  3 | import json
  4 | import pytest
  5 | import re
  6 | 
  7 | from a11y_llm_tests import node_bridge
  8 | 
  9 | TEST_CASES_ROOT = Path("test_cases")
 10 | SCREENSHOT_ROOT = Path("runs") / "pytest_screenshots"
 11 | 
 12 | def _collect_example_html():
 13 |     """Yield tuples: (test_case_name, html_path, yaml_path, test_js_path)."""
 14 |     for case_dir in TEST_CASES_ROOT.iterdir():
 15 |         if not case_dir.is_dir():
 16 |             continue
 17 |         test_js = case_dir / "test.js"
 18 |         if not test_js.exists():
 19 |             continue
 20 |         examples_dir = case_dir / "examples"
 21 |         if not examples_dir.exists():
 22 |             continue
 23 |         # Only consider HTML files that include json
 24 |         for html_file in examples_dir.glob("*.html"):
 25 |             fm, _ = parse_html_with_expectations(html_file)
 26 |             if fm is None:
 27 |                 # Skip HTML files without json expectations (they are invalid in the new format)
 28 |                 continue
 29 |             yield (
 30 |                 case_dir.name,
 31 |                 html_file,
 32 |                 None,
 33 |                 test_js,
 34 |             )
 35 | 
 36 | # EXAMPLES will be computed after helper functions are defined
 37 | 
 38 | 
 39 | def _collect_assertion_names_from_testjs(test_js_path: Path):
 40 |     """Parse a test.js file and return a set of assertion names used with assert("name", ...).
 41 | 
 42 |     This uses a simple regex to find string literals passed as the first argument to an
 43 |     `assert(...)` call. It's intentionally permissive and assumes tests call `assert` with
 44 |     a literal string as the first argument (the common pattern in our harnesses).
 45 |     """
 46 |     content = test_js_path.read_text(encoding="utf-8")
 47 |     # Match assert(  'name'  , or assert ( `name` , or assert("name",
 48 |     pattern = re.compile(r"\bassert\s*\(\s*([\'\"`])(.+?)\1", re.DOTALL)
 49 |     names = {m.group(2) for m in pattern.finditer(content)}
 50 |     return names
 51 | 
 52 | 
 53 | # Json parser for merged HTML + json files
 54 | SCRIPT_RE = re.compile(r"<script[^>]+id=[\"']a11y-assertions[\"'][^>]*type=[\"']application/json[\"'][^>]*>(.*?)</script>", re.DOTALL | re.IGNORECASE)
 55 | 
 56 | 
 57 | def parse_html_with_expectations(path: Path):
 58 |     """Return (assertions_dict_or_None, html_text).
 59 | 
 60 |     This looks for a <script id="a11y-assertions" type="application/json">...json...</script>
 61 |     inside the HTML. If found, returns (parsed_json_dict, full_html_text). Otherwise (None, full_text).
 62 |     """
 63 |     text = path.read_text(encoding="utf-8")
 64 |     m = SCRIPT_RE.search(text)
 65 |     if not m:
 66 |         return None, text
 67 |     json_text = m.group(1)
 68 |     try:
 69 |         data = json.loads(json_text) or {}
 70 |     except Exception:
 71 |         return None, text
 72 |     return data, text
 73 | 
 74 | 
 75 | def _collect_assertions_for_case(case_dir: Path):
 76 |     """Return a mapping of example path -> dict(assertion name -> expected value) for examples in a case.
 77 | 
 78 |     This reads assertions embedded as JSON in the HTML examples using the
 79 |     <script id="a11y-assertions" type="application/json"> container. The returned
 80 |     mapping keys are Path objects referring to the example file and the values
 81 |     are dicts mapping assertion name -> normalized expectation ('pass'/'fail'/etc.).
 82 |     """
 83 |     out = {}
 84 |     examples_dir = case_dir / "examples"
 85 |     if not examples_dir.exists():
 86 |         return out
 87 | 
 88 |     # Include embedded JSON in HTML examples
 89 |     for html_file in examples_dir.glob("*.html"):
 90 |         fm, _ = parse_html_with_expectations(html_file)
 91 |         if not fm:
 92 |             continue
 93 |         assertion_expectations = fm.get("assertions", {}) if isinstance(fm, dict) else {}
 94 |         norm_map = {}
 95 |         for k, v in (assertion_expectations.items() if isinstance(assertion_expectations, dict) else []):
 96 |             if isinstance(v, bool):
 97 |                 norm = 'pass' if v else 'fail'
 98 |             elif isinstance(v, str):
 99 |                 norm = v.strip().lower()
100 |             elif v is None:
101 |                 norm = 'none'
102 |             else:
103 |                 norm = str(v).strip().lower()
104 |             norm_map[k] = norm
105 |         out[html_file] = norm_map
106 |     return out
107 | 
108 | 
109 | EXAMPLES = list(_collect_example_html())
110 | 
111 | @pytest.mark.parametrize(
112 |     "case_name,html_path,yaml_path,test_js_path",
113 |     EXAMPLES,
114 |     ids=lambda p: getattr(p, "stem", p) if isinstance(p, Path) else p,
115 | )
116 | def test_example_html(case_name, html_path, yaml_path, test_js_path, tmp_path):
117 |     # Load expectations from embedded JSON in the HTML
118 |     fm, _ = parse_html_with_expectations(html_path)
119 |     assert fm is not None, f"Example {html_path} must include embedded JSON assertions in a <script id=\"a11y-assertions\"> element"
120 |     expectations = fm
121 |     assertion_expectations = expectations.get("assertions", {}) if isinstance(expectations, dict) else {}
122 |     html = html_path.read_text(encoding="utf-8")
123 |     # Make sure screenshot dir exists
124 |     SCREENSHOT_ROOT.mkdir(parents=True, exist_ok=True)
125 |     screenshot_file = SCREENSHOT_ROOT / f"{case_name}__{html_path.stem}.png"
126 | 
127 |     result = node_bridge.run(html, str(test_js_path), str(screenshot_file))
128 | 
129 |     if "error" in result and result["error"]:
130 |         print(f"Error running test case {case_name} with HTML {html_path}: {result['error']}")
131 | 
132 |     tf = result.get("testFunctionResult", {})
133 |     status = tf.get("status")
134 |     assertions = tf.get("assertions", [])  # Assertions may now include type ('R' or 'BP')
135 |     axe = result.get("axeResult") or result.get("axe_result") or result.get("axe")
136 | 
137 |     # Build a map of assertion name to result
138 |     assertion_results = {}
139 |     for assertion in assertions:
140 |         assertion_name = assertion.get("name")
141 |         assertion_status = assertion.get("status")
142 |         if assertion_name:
143 |             assertion_results[assertion_name] = assertion_status
144 | 
145 |     # Test each assertion individually
146 |     failed_assertions = []
147 |     for assertion_name, expected_result in assertion_expectations.items():
148 |         actual_result = assertion_results.get(assertion_name)
149 |         if actual_result != expected_result:
150 |             failed_assertions.append({
151 |                 "assertion": assertion_name,
152 |                 "expected": expected_result,
153 |                 "actual": actual_result
154 |             })
155 | 
156 |     debug_info = {
157 |         "case": case_name,
158 |         "html_example": str(html_path),
159 |         "embedded_assertions": expectations,
160 |         "overall_status": status,
161 |         "assertions": assertions,
162 |         "assertion_results": assertion_results,
163 |         "assertion_expectations": assertion_expectations,
164 |         "failed_assertions": failed_assertions,
165 |         "axe_violation_count": axe.get("violation_count") if isinstance(axe, dict) else None,
166 |         "error": tf.get("error"),
167 |     }
168 |     
169 |     assert len(failed_assertions) == 0, f"Assertion mismatches found. Details:\n{json.dumps(debug_info, indent=2)}"
170 | 
171 | 
172 | def test_assertion_coverage():
173 |     """Ensure that for each test case, the combined example YAMLs cover every assertion
174 |     declared in the test.js file. Examples are allowed to define only a subset of assertions;
175 |     this test enforces that collectively they reach 100% coverage of assertion names.
176 |     """
177 |     for case_dir in TEST_CASES_ROOT.iterdir():
178 |         if not case_dir.is_dir():
179 |             continue
180 |         test_js = case_dir / "test.js"
181 |         if not test_js.exists():
182 |             # Skip directories that aren't test cases
183 |             continue
184 | 
185 |         # Gather all assertion names declared in test.js
186 |         declared = _collect_assertion_names_from_testjs(test_js)
187 |         # Gather all assertion names mentioned in example embedded assertions for this case
188 |         assertions_map = _collect_assertions_for_case(case_dir)
189 | 
190 |         # For missing assertions entirely (no expectation present in any example)
191 |         names_union = set().union(*(set(d.keys()) for d in assertions_map.values())) if assertions_map else set()
192 |         missing_entirely = declared - names_union
193 |         if missing_entirely:
194 |             coverage_lines = []
195 |             for p, names in assertions_map.items():
196 |                 coverage_lines.append(f"{p}: {sorted(names.keys())}")
197 | 
198 |             msg = (
199 |                 f"Test case '{case_dir.name}' is missing expectations for {len(missing_entirely)} assertion(s):\n"
200 |                 f"  Missing: {sorted(missing_entirely)}\n"
201 |                 f"  Declared in test.js: {sorted(declared)}\n"
202 |                 f"  Example coverage:\n    " + "\n    ".join(coverage_lines)
203 |             )
204 |             pytest.fail(msg)
205 | 
206 |         # For each declared assertion, ensure examples collectively include both 'pass' and 'fail'
207 |         missing_variants = {}
208 |         for assertion in declared:
209 |             seen = set()
210 |             for d in assertions_map.values():
211 |                 val = d.get(assertion)
212 |                 if val:
213 |                     seen.add(val)
214 | 
215 |             want = {"pass", "fail"}
216 |             lacking = want - seen
217 |             if lacking:
218 |                 missing_variants[assertion] = {
219 |                     "seen": sorted(seen),
220 |                     "missing": sorted(lacking),
221 |                 }
222 | 
223 |         if missing_variants:
224 |             coverage_lines = []
225 |             for p, names in assertions_map.items():
226 |                 coverage_lines.append(f"{p}: {sorted(names.items())}")
227 | 
228 |             msg = (
229 |                 f"Test case '{case_dir.name}' does not include both pass and fail expectations for all assertions:\n"
230 |                 f"  Details: {json.dumps(missing_variants, indent=2)}\n"
231 |                 f"  Example coverage (assertion -> expectation):\n    " + "\n    ".join(coverage_lines)
232 |             )
233 |             pytest.fail(msg)
234 | 


--------------------------------------------------------------------------------
/test_cases/modal-dialog/examples/fail-has-css-transition.html:
--------------------------------------------------------------------------------
  1 | <html lang="en">
  2 | <head>
  3 |     <meta charset="UTF-8">
  4 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
  5 |     <title>Fail - Has CSS Transition</title>
  6 | <script id="a11y-assertions" type="application/json">{
  7 | "assertions": {
  8 |     "Each dialog has a dialog role": "pass",
  9 |     "Each dialog can be closed by escape key": "pass",
 10 |     "Each modal dialog traps keyboard focus": "pass",
 11 |     "Each modal dialog takes focus when opened": "fail",
 12 |     "Focus is not lost when each dialog closes": "pass",
 13 |     "Each modal dialog hides content behind it while open": "fail"
 14 | }
 15 | }</script>
 16 |     <style>
 17 |         /* Basic Reset & Body Styling */
 18 |         body {
 19 |             font-family: sans-serif;
 20 |             margin: 0;
 21 |             padding: 50px; /* Space for the trigger button */
 22 |             display: flex;
 23 |             justify-content: center;
 24 |             align-items: flex-start; /* Position content at the top */
 25 |             min-height: 100vh;
 26 |             background-color: #f4f7f6;
 27 |             color: #333;
 28 |         }
 29 | 
 30 |         /* Trigger Button Styles */
 31 |         .trigger {
 32 |             padding: 12px 25px;
 33 |             font-size: 1.1em;
 34 |             cursor: pointer;
 35 |             background-color: #007bff;
 36 |             color: white;
 37 |             border: none;
 38 |             border-radius: 6px;
 39 |             box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
 40 |             transition: background-color 0.2s ease, transform 0.2s ease;
 41 |         }
 42 | 
 43 |         .trigger:hover {
 44 |             background-color: #0056b3;
 45 |             transform: translateY(-1px);
 46 |         }
 47 | 
 48 |         /* Modal Styles */
 49 |         .modal {
 50 |             position: fixed;
 51 |             top: 0;
 52 |             left: 0;
 53 |             width: 100%;
 54 |             height: 100%;
 55 |             background-color: rgba(0, 0, 0, 0.6);
 56 |             display: flex;
 57 |             justify-content: center;
 58 |             align-items: center;
 59 |             z-index: 1000;
 60 | 
 61 |             /* Hidden by default */
 62 |             visibility: hidden;
 63 |             opacity: 0;
 64 |             pointer-events: none; /* Allows clicks to pass through when hidden */
 65 |             transition: visibility 0.3s ease, opacity 0.3s ease;
 66 |         }
 67 | 
 68 |         .modal--is-open {
 69 |             visibility: visible;
 70 |             opacity: 1;
 71 |             pointer-events: auto; /* Re-enable pointer events when open */
 72 |         }
 73 | 
 74 |         .modal-content {
 75 |             background-color: white;
 76 |             padding: 30px;
 77 |             border-radius: 10px;
 78 |             box-shadow: 0 8px 25px rgba(0, 0, 0, 0.3);
 79 |             width: 90%;
 80 |             max-width: 550px;
 81 |             position: relative;
 82 |             transform: translateY(-30px); /* Initial state for animation */
 83 |             transition: transform 0.3s cubic-bezier(0.34, 1.56, 0.64, 1), opacity 0.3s ease;
 84 |             opacity: 0; /* Also fade in content */
 85 |         }
 86 | 
 87 |         .modal--is-open .modal-content {
 88 |             transform: translateY(0); /* End state for animation */
 89 |             opacity: 1;
 90 |         }
 91 | 
 92 |         .modal-header {
 93 |             display: flex;
 94 |             justify-content: space-between;
 95 |             align-items: center;
 96 |             margin-bottom: 20px;
 97 |             padding-bottom: 15px;
 98 |             border-bottom: 1px solid #eee;
 99 |         }
100 | 
101 |         .modal-header h2 {
102 |             margin: 0;
103 |             font-size: 1.8em;
104 |             color: #2c3e50;
105 |         }
106 | 
107 |         .modal-close-button {
108 |             background: none;
109 |             border: none;
110 |             font-size: 2.2em;
111 |             line-height: 1;
112 |             cursor: pointer;
113 |             color: #aaa;
114 |             padding: 0;
115 |             transition: color 0.2s ease;
116 |         }
117 | 
118 |         .modal-close-button:hover,
119 |         .modal-close-button:focus {
120 |             color: #666;
121 |             outline: none; /* Remove default focus outline */
122 |         }
123 | 
124 |         .modal-body {
125 |             line-height: 1.7;
126 |             color: #555;
127 |             margin-bottom: 25px;
128 |         }
129 | 
130 |         .modal-body p {
131 |             margin-bottom: 1em;
132 |         }
133 | 
134 |         .modal-footer {
135 |             text-align: right;
136 |             padding-top: 20px;
137 |             border-top: 1px solid #eee;
138 |         }
139 | 
140 |         .modal-footer button {
141 |             padding: 10px 20px;
142 |             border: none;
143 |             border-radius: 5px;
144 |             cursor: pointer;
145 |             font-size: 1em;
146 |             transition: background-color 0.2s ease, transform 0.1s ease;
147 |         }
148 | 
149 |         .modal-footer button + button {
150 |             margin-left: 10px;
151 |         }
152 | 
153 |         .modal-footer .cancel-button {
154 |             background-color: #dc3545;
155 |             color: white;
156 |         }
157 | 
158 |         .modal-footer .confirm-button {
159 |             background-color: #28a745;
160 |             color: white;
161 |         }
162 | 
163 |         .modal-footer .cancel-button:hover {
164 |             background-color: #c82333;
165 |             transform: translateY(-1px);
166 |         }
167 | 
168 |         .modal-footer .confirm-button:hover {
169 |             background-color: #218838;
170 |             transform: translateY(-1px);
171 |         }
172 |     </style>
173 | </head>
174 | <body>
175 | 
176 |     <h1>Fail - Has CSS Transition</h1>
177 |     <p>
178 |       The following is a failing example of a modal dialog component. It passes most of the requirements but
179 |       does not hide content behind it from screen reader users and does not take focus when opened. This example has slight CSS transitions when opening/closing.
180 |       This delay does not fail the test, but is included to simulate real-world scenarios where animations or other JS operations may delay dialog visibility.
181 |     </p>
182 | 
183 |     <button class="trigger" id="openModalButton">Open Dialog</button>
184 | 
185 |     <div id="myModal" class="modal" role="dialog" aria-labelledby="modalTitle" aria-modal="true">
186 |         <div class="modal-content">
187 |             <div class="modal-header">
188 |                 <h2 id="modalTitle">Modal Dialog Title</h2>
189 |                 <button class="modal-close-button" aria-label="Close dialog">&times;</button>
190 |             </div>
191 |             <div class="modal-body">
192 |                 <p>This is an example of a simple modal dialog component.</p>
193 |                 <p>It's hidden by default and triggered by a button with the <code>.trigger</code> class.</p>
194 |                 <p>You can close it by:</p>
195 |                 <ul>
196 |                     <li>Clicking the 'X' button.</li>
197 |                     <li>Clicking the 'Cancel' button.</li>
198 |                     <li>Pressing the 'Escape' key.</li>
199 |                     <li>Clicking on the dimmed area outside the dialog box.</li>
200 |                 </ul>
201 |             </div>
202 |             <div class="modal-footer">
203 |                 <button class="cancel-button">Cancel</button>
204 |                 <button class="confirm-button">Confirm Action</button>
205 |             </div>
206 |         </div>
207 |     </div>
208 | 
209 |     <script>
210 |         document.addEventListener('DOMContentLoaded', () => {
211 |             const triggerButton = document.getElementById('openModalButton');
212 |             const modal = document.getElementById('myModal');
213 |             const closeButton = modal.querySelector('.modal-close-button');
214 |             const cancelButton = modal.querySelector('.cancel-button');
215 |             const confirmButton = modal.querySelector('.confirm-button'); // Optional: for actual action
216 | 
217 |             let previouslyFocusedElement = null; // To store the element that had focus before the modal opened
218 | 
219 |             function openModal() {
220 |                 previouslyFocusedElement = document.activeElement; // Store currently focused element
221 |                 modal.classList.add('modal--is-open');
222 |                 closeButton.focus(); // Focus on a manageable element inside the modal for accessibility
223 |                 document.body.style.overflow = 'hidden'; // Prevent scrolling on the body
224 |             }
225 | 
226 |             function closeModal() {
227 |                 modal.classList.remove('modal--is-open');
228 |                 document.body.style.overflow = ''; // Restore scrolling on the body
229 |                 if (previouslyFocusedElement) {
230 |                     previouslyFocusedElement.focus(); // Return focus to the element that opened the modal
231 |                     previouslyFocusedElement = null;
232 |                 }
233 |             }
234 | 
235 |             // Open modal when trigger button is clicked
236 |             triggerButton.addEventListener('click', openModal);
237 | 
238 |             // Close modal when close button is clicked
239 |             closeButton.addEventListener('click', closeModal);
240 | 
241 |             // Close modal when cancel button is clicked
242 |             cancelButton.addEventListener('click', closeModal);
243 | 
244 |             // Optional: Close modal when confirm button is clicked
245 |             // confirmButton.addEventListener('click', closeModal);
246 | 
247 |             // Close modal when clicking outside the modal content
248 |             modal.addEventListener('click', (event) => {
249 |                 // If the click target is the modal backdrop itself (not content inside)
250 |                 if (event.target === modal) {
251 |                     closeModal();
252 |                 }
253 |             });
254 | 
255 |             // Close modal when Escape key is pressed
256 |             document.addEventListener('keydown', (event) => {
257 |                 if (event.key === 'Escape' && modal.classList.contains('modal--is-open')) {
258 |                     closeModal();
259 |                 }
260 |             });
261 |         });
262 |     </script>
263 | 
264 | </body>
265 | </html>


--------------------------------------------------------------------------------
/a11y_llm_tests/generator.py:
--------------------------------------------------------------------------------
  1 | """LLM HTML generation & caching layer."""
  2 | from __future__ import annotations
  3 | import hashlib, time, random
  4 | from pathlib import Path
  5 | from typing import Tuple, Dict, Any, Optional
  6 | import json
  7 | import litellm
  8 | 
  9 | CACHE_DIR = Path(".cache/generations")
 10 | CACHE_DIR.mkdir(parents=True, exist_ok=True)
 11 | 
 12 | # Retry policy for litellm calls
 13 | RETRY_MAX_ATTEMPTS = 5
 14 | RETRY_BASE_DELAY = 1.0  # seconds
 15 | RETRY_MAX_DELAY = 60.0  # seconds
 16 | 
 17 | DEFAULT_SYSTEM_PROMPT = (
 18 |     "You are generating a single standalone HTML document. "
 19 |     "Do NOT wrap output in markdown fences. Include <head> and <body>. "
 20 |     "Do NOT explain the code, just output it."
 21 | )
 22 | 
 23 | _PROMPT_JOINER = "\n|:|\n"
 24 | _configured_system_prompt: str = DEFAULT_SYSTEM_PROMPT
 25 | _custom_instructions: Optional[str] = None
 26 | 
 27 | 
 28 | def configure_prompts(system_prompt: Optional[str] = None, custom_instructions: Optional[str] = None) -> None:
 29 |     """Configure the base system prompt and optional custom instructions."""
 30 |     global _configured_system_prompt, _custom_instructions
 31 |     base = (system_prompt or "").strip()
 32 |     _configured_system_prompt = base or DEFAULT_SYSTEM_PROMPT
 33 |     if custom_instructions is None:
 34 |         _custom_instructions = None
 35 |     else:
 36 |         text = custom_instructions.rstrip("\n")
 37 |         _custom_instructions = text if text.strip() else None
 38 | 
 39 | 
 40 | def get_base_system_prompt() -> str:
 41 |     return _configured_system_prompt
 42 | 
 43 | 
 44 | def get_custom_instructions() -> Optional[str]:
 45 |     return _custom_instructions
 46 | 
 47 | 
 48 | def get_effective_system_prompt() -> str:
 49 |     if _custom_instructions:
 50 |         return f"{_configured_system_prompt}\n\n{_custom_instructions}".strip()
 51 |     return _configured_system_prompt
 52 | 
 53 | 
 54 | def prompt_hash(text: str) -> str:
 55 |     return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
 56 | 
 57 | 
 58 | def compute_prompt_hash(user_prompt: str) -> str:
 59 |     combined = _PROMPT_JOINER.join([
 60 |         _configured_system_prompt,
 61 |         _custom_instructions or "",
 62 |         user_prompt,
 63 |     ])
 64 |     return prompt_hash(combined)
 65 | 
 66 | 
 67 | def clean_generation(raw: str) -> str:
 68 |     # Strip markdown fences if present
 69 |     if "```" in raw:
 70 |         parts = []
 71 |         inside = False
 72 |         for line in raw.splitlines():
 73 |             if line.strip().startswith("```"):
 74 |                 inside = not inside
 75 |                 continue
 76 |             if inside:
 77 |                 parts.append(line)
 78 |         if parts:
 79 |             raw = "\n".join(parts)
 80 |     # Keep only first <html>...</html>
 81 |     lower = raw.lower()
 82 |     if "<html" in lower and "</html>" in lower:
 83 |         start = lower.index("<html")
 84 |         end = lower.index("</html>") + len("</html>")
 85 |         raw = raw[start:end]
 86 |     return raw.strip()
 87 | 
 88 | 
 89 | def _meta_path(cache_file: Path) -> Path:
 90 |     return cache_file.with_suffix(cache_file.suffix + ".meta.json")
 91 | 
 92 | 
 93 | def generate_html_with_meta(
 94 |     model: str,
 95 |     user_prompt: str,
 96 |     iteration: int,
 97 |     temperature: Optional[float] = None,
 98 |     seed: Optional[int] = None,
 99 |     disable_cache: bool = False,
100 | ) -> Tuple[str, Dict[str, Any]]:
101 |     """Generate (or load cached) HTML plus metadata including token usage & cost.
102 | 
103 |     Returns:
104 |         html (str): The generated HTML document.
105 |         meta (dict): {
106 |             'cached': bool,
107 |             'latency_s': float,
108 |             'prompt_hash': str,
109 |             'tokens_in': int|None,
110 |             'tokens_out': int|None,
111 |             'total_tokens': int|None,
112 |             'cost_usd': float|None,
113 |         }
114 |     """
115 |     base_system_prompt = get_base_system_prompt()
116 |     custom_instructions = get_custom_instructions()
117 |     effective_system_prompt = get_effective_system_prompt()
118 |     h = compute_prompt_hash(user_prompt)
119 |     # Incorporate seed into cache identity for sampling diversity
120 |     seed_part = f"_s{seed}" if seed is not None else ""
121 |     iteration_part = f"_i{iteration}"
122 |     cache_file = CACHE_DIR / f"{model}_{h}{seed_part}{iteration_part}.html"
123 |     meta_file = _meta_path(cache_file)
124 |     if not disable_cache and cache_file.exists():
125 |         html = cache_file.read_text(encoding="utf-8")
126 |         meta: Dict[str, Any] = {
127 |             "cached": True,
128 |             "latency_s": 0.0,
129 |             "prompt_hash": h,
130 |             "tokens_in": None,
131 |             "tokens_out": None,
132 |             "total_tokens": None,
133 |             "cost_usd": None,
134 |             "seed": seed,
135 |             "temperature": temperature,
136 |             "system_prompt": base_system_prompt,
137 |             "custom_instructions": custom_instructions,
138 |             "effective_system_prompt": effective_system_prompt,
139 |         }
140 |         if meta_file.exists():
141 |             try:
142 |                 loaded = json.loads(meta_file.read_text(encoding="utf-8"))
143 |                 meta.update({
144 |                     k: loaded.get(k) for k in [
145 |                         "tokens_in", "tokens_out", "total_tokens", "cost_usd",
146 |                         "system_prompt", "custom_instructions", "effective_system_prompt",
147 |                     ]
148 |                 })
149 |             except Exception:
150 |                 pass  # ignore malformed meta
151 |         return html, meta
152 | 
153 |     start = time.time()
154 |     litellm.drop_params = True
155 |     print(f"Generating HTML with model={model}, temp={temperature}, seed={seed}...")
156 | 
157 |     resp = None
158 |     last_exc: Optional[BaseException] = None
159 |     for attempt in range(1, RETRY_MAX_ATTEMPTS + 1):
160 |         try:
161 |             resp = litellm.completion(
162 |                 model=model,
163 |                 messages=[
164 |                     {"role": "system", "content": effective_system_prompt},
165 |                     {"role": "user", "content": user_prompt},
166 |                 ],
167 |                 temperature=temperature,
168 |                 seed=seed,
169 |             )
170 |             # Basic validation: ensure we have choices and text
171 |             if resp and getattr(resp, "choices", None) and len(resp.choices) > 0:
172 |                 break
173 |             # Treat missing choices as transient error
174 |             last_exc = RuntimeError("litellm returned no choices")
175 |         except Exception as e:
176 |             last_exc = e
177 | 
178 |         # If we're here, we will retry unless this was the last attempt
179 |         if attempt == RETRY_MAX_ATTEMPTS:
180 |             break
181 |         # Exponential backoff with jitter
182 |         delay = min(RETRY_BASE_DELAY * (2 ** (attempt - 1)), RETRY_MAX_DELAY)
183 |         jitter = random.uniform(0, delay * 0.1)
184 |         sleep_for = delay + jitter
185 |         print(f"litellm call failed (attempt {attempt}/{RETRY_MAX_ATTEMPTS}): {last_exc}; retrying in {sleep_for:.1f}s...")
186 |         time.sleep(sleep_for)
187 | 
188 |     elapsed = time.time() - start
189 | 
190 |     if resp is None or not getattr(resp, "choices", None):
191 |         # Raise the last exception or a generic one so callers can handle it
192 |         if last_exc:
193 |             raise last_exc
194 |         raise RuntimeError("litellm.completion failed with no response")
195 | 
196 |     # Extract tokens & cost defensively
197 |     usage = getattr(resp, "usage", None) or getattr(resp, "_hidden_params", {}).get("usage") or {}
198 |     tokens_in = usage.get("prompt_tokens") if isinstance(usage, dict) else None
199 |     tokens_out = usage.get("completion_tokens") if isinstance(usage, dict) else None
200 |     total_tokens = usage.get("total_tokens") if isinstance(usage, dict) else None
201 | 
202 |     cost_usd = None
203 |     # liteLLM often attaches response_cost either directly or hidden
204 |     cost_usd = getattr(resp, "response_cost", None)
205 |     if cost_usd is None:
206 |         hidden = getattr(resp, "_hidden_params", {})
207 |         if isinstance(hidden, dict):
208 |             cost_usd = hidden.get("response_cost")
209 | 
210 |     raw = resp.choices[0].message.content
211 |     html = clean_generation(raw)
212 |     cache_file.parent.mkdir(exist_ok=True, parents=True)
213 |     cache_file.write_text(html, encoding="utf-8")
214 |     # Write meta file for future cache hits
215 |     meta_payload = {
216 |         "model": model,
217 |         "prompt_hash": h,
218 |         "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
219 |         "tokens_in": tokens_in,
220 |         "tokens_out": tokens_out,
221 |         "total_tokens": total_tokens,
222 |         "cost_usd": cost_usd,
223 |         "seed": seed,
224 |         "temperature": temperature,
225 |         "system_prompt": base_system_prompt,
226 |         "custom_instructions": custom_instructions,
227 |         "effective_system_prompt": effective_system_prompt,
228 |     }
229 |     try:
230 |         meta_file.write_text(json.dumps(meta_payload, indent=2), encoding="utf-8")
231 |     except Exception:
232 |         pass
233 | 
234 |     meta = {
235 |         "cached": False,
236 |         "latency_s": elapsed,
237 |         "prompt_hash": h,
238 |         "tokens_in": tokens_in,
239 |         "tokens_out": tokens_out,
240 |         "total_tokens": total_tokens,
241 |         "cost_usd": cost_usd,
242 |         "seed": seed,
243 |         "temperature": temperature,
244 |         "system_prompt": base_system_prompt,
245 |         "custom_instructions": custom_instructions,
246 |         "effective_system_prompt": effective_system_prompt,
247 |     }
248 |     return html, meta
249 | 
250 | 
251 | def generate_html(model: str, user_prompt: str, temperature: float = None, seed: Optional[int] = None, disable_cache: bool = False) -> Tuple[str, bool, float]:
252 |     """Backward-compatible shim. Prefer generate_html_with_meta.
253 | 
254 |     Returns legacy tuple (html, cached, latency_s)."""
255 |     html, meta = generate_html_with_meta(
256 |         model,
257 |         user_prompt,
258 |         iteration=0,
259 |         temperature=temperature,
260 |         seed=seed,
261 |         disable_cache=disable_cache,
262 |     )
263 |     return html, meta["cached"], meta["latency_s"]
264 | 


--------------------------------------------------------------------------------
/test_cases/modal-dialog/test.js:
--------------------------------------------------------------------------------
  1 | /* Function to dismiss the dialog by clicking a button with common dismissal names, pressing Escape, or refreshing the page */
  2 | const dismissDialog = async (page, reload = true) => {
  3 |     if (!await dialogIsOpen(page)) {
  4 |         return;
  5 |     }
  6 | 
  7 |     if (await dialogIsOpen(page)) {
  8 |         // Try pressing escape on the dialog
  9 |         await page.getByRole('dialog').press('Escape');
 10 |     }
 11 | 
 12 |     if (await dialogIsOpen(page)) {
 13 |         // Fallback: press Escape on body
 14 |         await page.keyboard.press('Escape');
 15 |     }
 16 | 
 17 |     if (await dialogIsOpen(page)) {
 18 |         // Fallback: by clicking outside the dialog
 19 |         await page.locator('body').click({position: {x: 0, y: 0}});
 20 |     }
 21 | 
 22 |     const closeButton = await page.getByRole('button', { name: /\b(close|okay|ok|dismiss|exit|cancel|submit|apply|x)\b/iu });
 23 |     if (await closeButton.count() > 0) {
 24 |         await closeButton.first().click();
 25 |     }
 26 | 
 27 |     const closeControl = await page.getByRole('*', { name: /\b(close|okay|ok|dismiss|exit|cancel|submit|apply|x)\b/iu });
 28 |     if (await closeButton.count() > 0) {
 29 |         await closeButton.first().click();
 30 |     }
 31 | 
 32 |     if (reload && await dialogIsOpen(page)) {
 33 |         // If still open, refresh the page to reset state
 34 |         await utils.reload();
 35 |     }
 36 | }
 37 | 
 38 | const waitForAnimationEnd = async (locator) => {
 39 |   return locator.evaluate((element) => 
 40 |     Promise.all(
 41 |         element
 42 |             .getAnimations({ subtree: true })
 43 |             .map((animation) => animation.finished)
 44 |         )
 45 |     )
 46 | }
 47 | 
 48 | const dialogIsOpen = async (page) => {
 49 |     // Some JS frameworks delay the addition/removal of the dialog to the DOM until after animations complete.
 50 |     await page.waitForTimeout(50);
 51 | 
 52 |     // Now wait for any animations to end
 53 |     const body = await page.locator('body');
 54 |     await waitForAnimationEnd(body);
 55 | 
 56 |     // Now, check for dialog presence
 57 |     const dialog = await page.getByRole('dialog');
 58 |     return await dialog.count() > 0;
 59 | }
 60 | 
 61 | /* Function to check if focus is inside the dialog
 62 | *  Checks if the activeElement is contained within the dialog or is the body (which can happen if focus is sent to the browser chrome).
 63 | */
 64 | const focusIsInDialog = async (page) => {
 65 |     return await page.evaluate((obj) => obj.dialog.contains(document.activeElement) || document.activeElement.tagName === 'BODY', 
 66 |         { dialog: await page.getByRole('dialog').elementHandle()});
 67 | }
 68 | 
 69 | const tryToEscapeDialog = async (page, key, iterations) => {
 70 |     // Tab forward many times to see if we can escape the dialog.
 71 |     let foundElements = [];
 72 |     for (let i = 0; i < iterations; i++) {
 73 |         await page.keyboard.press(key);
 74 |         let focusedElement = await page.evaluate(() => document.activeElement);
 75 |         if (foundElements.includes(focusedElement)) {
 76 |             // We have cycled through all focusable elements, so stop.
 77 |             return false;
 78 |         }
 79 |         foundElements.push(focusedElement);
 80 |         if (!await focusIsInDialog(page)) {
 81 |             // Focus escaped the dialog, so fail this iteration.
 82 |             return true;
 83 |         }
 84 |     }
 85 | }
 86 | 
 87 | const getTriggers = async (page) => {
 88 |     return await page.locator('.trigger').filter({ visible: true });
 89 | }
 90 | 
 91 | module.exports.run = async ({ page, assert, utils }) => {
 92 |     /* Loop through all dialog triggers, open the dialog, and assert that a dialog role is present */
 93 |     await assert("Each dialog has a dialog role", async () => {
 94 |         await utils.reload(); // Ensure clean state before starting
 95 |         await dismissDialog(page, false); // Ensure no dialog is open
 96 |         const triggers = await getTriggers(page);
 97 |         const totalTriggers = await triggers.count();
 98 |         let totalDialogs = 0;
 99 |         for (const trigger of await triggers.all()) {
100 |             await trigger.click();
101 |             if (await dialogIsOpen(page)) {
102 |                 totalDialogs += 1;
103 |             }
104 |             await dismissDialog(page);
105 |         }
106 |         return totalDialogs === totalTriggers;
107 |     });
108 | 
109 |     await assert("Each dialog can be closed by escape key", async () => {
110 |         await utils.reload(); // Ensure clean state before starting
111 |         await dismissDialog(page, false); // Ensure no dialog is open
112 |         const triggers = await getTriggers(page);
113 |         const totalTriggers = await triggers.count();
114 |         let totalSuccess = 0;
115 |         for (const trigger of await triggers.all()) {
116 |             await dismissDialog(page);
117 |             await trigger.click(); 
118 |             if (!await dialogIsOpen(page)) {
119 |                 throw new Error("Unable to test because no dialog was found");
120 |             }
121 | 
122 |             await page.getByRole('dialog').press('Escape');
123 |             if (!(await dialogIsOpen(page))) {
124 |                 totalSuccess += 1;
125 |             }
126 |         }
127 |         return totalSuccess === totalTriggers;
128 |     }, {type: 'BP'});
129 | 
130 |     await assert("Each modal dialog traps keyboard focus", async () => {
131 |         await utils.reload(); // Ensure clean state before starting
132 |         await dismissDialog(page, false); // Ensure no dialog is open
133 |         const triggers = await getTriggers(page);
134 |         const totalTriggers = await triggers.count();
135 |         let totalSuccess = 0;
136 |         for (const trigger of await triggers.all()) {
137 |             await dismissDialog(page);
138 |             await trigger.click(); 
139 |             if (!await dialogIsOpen(page)) {
140 |                 throw new Error("Unable to test because no dialog was found");
141 |             }
142 | 
143 |             if (await tryToEscapeDialog(page, 'Tab', 20)) {
144 |                 await dismissDialog(page);
145 |                 continue;
146 |             }
147 | 
148 |             if (await tryToEscapeDialog(page, 'Shift+Tab', 20)) {
149 |                 await dismissDialog(page);
150 |                 continue;
151 |             }
152 | 
153 |             totalSuccess += 1;
154 |         }
155 |         return totalSuccess === totalTriggers;
156 |     });
157 | 
158 |     await assert("Each modal dialog takes focus when opened", async () => {
159 |         await utils.reload(); // Ensure clean state before starting
160 |         await dismissDialog(page, false); // Ensure no dialog is open
161 |         const triggers = await getTriggers(page);
162 |         const totalTriggers = await triggers.count();
163 |         let totalSuccess = 0;
164 |         for (const trigger of await triggers.all()) {
165 |             await dismissDialog(page);
166 |             await trigger.click();
167 |             if (!await dialogIsOpen(page)) {
168 |                 throw new Error("Unable to test because no dialog was found");
169 |             }
170 | 
171 |             if (!(await focusIsInDialog(page))) {
172 |                 // Focus is not in the dialog, so fail this iteration.
173 |                 continue;
174 |             }
175 | 
176 |             const bodyIsFocused = await page.evaluate(() => document.activeElement.tagName === 'BODY');
177 |             if (bodyIsFocused) {
178 |                 // Focus is on body, meaning that focus was lost, so fail this iteration.
179 |                 // focusIsInDialog would have returned true if focus was on the Body element.
180 |                 continue;
181 |             }
182 | 
183 |             totalSuccess += 1;
184 |         }
185 |         return totalSuccess === totalTriggers;
186 |     });
187 | 
188 |     await assert("Focus is not lost when each dialog closes", async () => {
189 |         await utils.reload(); // Ensure clean state before starting
190 |         await dismissDialog(page, false); // Ensure no dialog is open
191 |         const triggers = await getTriggers(page);
192 |         const totalTriggers = await triggers.count();
193 |         let totalSuccess = 0;
194 |         for (const trigger of await triggers.all()) {
195 |             await dismissDialog(page);
196 |             await trigger.click();
197 |             if (!await dialogIsOpen(page)) {
198 |                 throw new Error("Unable to test because no dialog was found");
199 |             }
200 | 
201 |             await dismissDialog(page, false);
202 | 
203 |             const bodyIsFocused = await page.evaluate(() => document.activeElement.tagName === 'BODY');
204 |             if (bodyIsFocused) {
205 |                 // Focus is on body, meaning that focus was lost, so fail this iteration.
206 |                 // focusIsInDialog would have returned true if focus was on the Body element.
207 |                 // Note: this does not cover the scenario where the modal dialog triggers automatically on page load before the user can interact with the page. In this situation, focus should return to the body.
208 |                 continue;
209 |             }
210 | 
211 |             totalSuccess += 1;
212 |         }
213 |         return totalSuccess === totalTriggers;
214 |     });
215 | 
216 |     await assert("Each modal dialog hides content behind it while open", async () => {
217 |         await utils.reload(); // Ensure clean state before starting
218 |         await dismissDialog(page, false); // Ensure no dialog is open
219 |         const triggers = await getTriggers(page);
220 |         const totalTriggers = await triggers.count();
221 |         let totalSuccess = 0;
222 | 
223 |         for (const trigger of await triggers.all()) {
224 |             await dismissDialog(page);
225 |             await trigger.click();
226 |             if (!await dialogIsOpen(page)) {
227 |                 throw new Error("Unable to test because no dialog was found");
228 |             }
229 | 
230 |             // Determine if native modal dialog is opened, which always hides background content.
231 |             let isNativeModal = await page.evaluate(el => {
232 |                 return !!document.querySelector(':modal')
233 |             });
234 | 
235 |             if (!isNativeModal) {
236 |                 // If not a native modal dialog, check if content behind the dialog is hidden from screen reader users.
237 |                 let isScreenReaderHidden = await trigger.evaluate(el => {
238 |                     // Use axe-core's util to determine hidden from screen reader users.
239 |                     let vEl = window.axe.utils.getNodeFromTree(el)
240 |                     return !window.axe.commons.dom.isVisibleToScreenReaders(vEl);
241 |                 });
242 |             
243 |                 if (!isScreenReaderHidden) {
244 |                     // Trigger is still visible to screen reader users, so fail this iteration.
245 |                     continue;
246 |                 }
247 |             }
248 |             
249 |             totalSuccess += 1;
250 |         }
251 |         return totalSuccess === totalTriggers;
252 |     });
253 | 
254 |   return {}; // assertions collected via injected assert
255 | };
256 | 
257 | module.exports.runAxe = async ({ page, utils }) => {
258 |     await utils.reload(); // Ensure clean state before starting
259 |     await dismissDialog(page, false); // Ensure no dialog is open
260 | 
261 |     const triggers = await getTriggers(page);
262 |     let axeResult = {};
263 | 
264 |     for (const trigger of await triggers.all()) {
265 |         await dismissDialog(page);
266 |         await trigger.click();
267 |         await dialogIsOpen(page);
268 |         axeResult = utils.merge(axeResult, await utils.runAxeOnPage(page));
269 |     }
270 |     
271 |     return axeResult;
272 | };


--------------------------------------------------------------------------------
/test_cases/modal-dialog/examples/fail-has-js-delay.html:
--------------------------------------------------------------------------------
  1 | <html lang="en">
  2 | <head>
  3 |   <meta charset="utf-8" />
  4 |   <meta name="viewport" content="width=device-width,initial-scale=1" />
  5 |   <title>Fail - Has JS Delay</title>
  6 | <script id="a11y-assertions" type="application/json">{
  7 | "assertions": {
  8 |     "Each dialog has a dialog role": "pass",
  9 |     "Each dialog can be closed by escape key": "pass",
 10 |     "Each modal dialog traps keyboard focus": "pass",
 11 |     "Each modal dialog takes focus when opened": "pass",
 12 |     "Focus is not lost when each dialog closes": "pass",
 13 |     "Each modal dialog hides content behind it while open": "fail"
 14 | }
 15 | }</script>
 16 |   <style>
 17 |     :root{
 18 |       --overlay-bg: rgba(0,0,0,0.5);
 19 |       --modal-bg: #fff;
 20 |       --radius: 8px;
 21 |       --max-width: 680px;
 22 |       --gap: 24px;
 23 |       --transition: 5s cubic-bezier(.2,.9,.3,1);
 24 |     }
 25 | 
 26 |     body.no-scroll {
 27 |       overflow: hidden;
 28 |     }
 29 | 
 30 |     /* Trigger button */
 31 |     .trigger{
 32 |       display: inline-block;
 33 |       padding: 10px 16px;
 34 |       border-radius: 6px;
 35 |       background: #0366d6;
 36 |       color: #fff;
 37 |       font-weight: 600;
 38 |       border: none;
 39 |       cursor: pointer;
 40 |       box-shadow: 0 2px 6px rgba(0,0,0,0.12);
 41 |     }
 42 |     .trigger:focus {
 43 |       outline: 3px solid rgba(3,102,214,0.25);
 44 |       outline-offset: 2px;
 45 |     }
 46 | 
 47 |     /* Overlay / modal container */
 48 |     .modal-overlay {
 49 |       --visible: 0;
 50 |       position: fixed;
 51 |       inset: 0;
 52 |       display: grid;
 53 |       place-items: center;
 54 |       background: rgba(0,0,0,0); /* initially transparent */
 55 |       pointer-events: none;
 56 |       opacity: 0;
 57 |       transition: opacity var(--transition), background var(--transition);
 58 |     }
 59 |     .modal-overlay.open {
 60 |       pointer-events: auto;
 61 |       opacity: 1;
 62 |       background: var(--overlay-bg);
 63 |     }
 64 | 
 65 |     /* Modal panel */
 66 |     .modal {
 67 |       width: calc(100% - 48px);
 68 |       max-width: var(--max-width);
 69 |       background: var(--modal-bg);
 70 |       border-radius: var(--radius);
 71 |       box-shadow: 0 10px 30px rgba(0,0,0,0.25);
 72 |       transform: translateY(12px) scale(.98);
 73 |       opacity: 0;
 74 |       transition: transform calc(var(--transition) * 1.1), opacity var(--transition);
 75 |       pointer-events: auto;
 76 |       max-height: calc(100vh - 80px);
 77 |       display: flex;
 78 |       flex-direction: column;
 79 |       overflow: hidden;
 80 |     }
 81 |     .modal-overlay.open .modal {
 82 |       transform: translateY(0) scale(1);
 83 |       opacity: 1;
 84 |     }
 85 | 
 86 |     .modal-header {
 87 |       display: flex;
 88 |       align-items: center;
 89 |       justify-content: space-between;
 90 |       padding: 18px 20px;
 91 |       border-bottom: 1px solid #eee;
 92 |     }
 93 |     .modal-title {
 94 |       margin: 0;
 95 |       font-size: 1.125rem;
 96 |       font-weight: 700;
 97 |       color: #111;
 98 |     }
 99 |     .modal-close {
100 |       background: transparent;
101 |       border: none;
102 |       font-size: 18px;
103 |       line-height: 1;
104 |       padding: 6px;
105 |       border-radius: 6px;
106 |       cursor: pointer;
107 |     }
108 |     .modal-close:focus {
109 |       outline: 3px solid rgba(3,102,214,0.25);
110 |       outline-offset: 2px;
111 |     }
112 | 
113 |     .modal-body {
114 |       padding: 18px 20px;
115 |       overflow: auto;
116 |       color: #333;
117 |       font-size: 0.97rem;
118 |       line-height: 1.45;
119 |     }
120 | 
121 |     .modal-footer {
122 |       display: flex;
123 |       gap: 8px;
124 |       padding: 14px 20px;
125 |       border-top: 1px solid #eee;
126 |       justify-content: flex-end;
127 |       background: rgba(255,255,255,0.6);
128 |     }
129 | 
130 |     .btn {
131 |       padding: 8px 12px;
132 |       border-radius: 6px;
133 |       border: 1px solid #ddd;
134 |       background: #f6f6f8;
135 |       cursor: pointer;
136 |     }
137 |     .btn.primary {
138 |       background: #0366d6;
139 |       color: #fff;
140 |       border-color: rgba(0,0,0,0.06);
141 |     }
142 | 
143 |     /* Visually hide when closed for assistive tech */
144 |     .modal-overlay[aria-hidden="true"] {
145 |       display: none;
146 |     }
147 | 
148 |     /* Small tweaks for demonstration */
149 |     main {
150 |       padding: 40px;
151 |       font-family: system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial;
152 |     }
153 |   </style>
154 | </head>
155 | <body>
156 |   <main>
157 |     <h1>Fail - Has JS Delay</h1>
158 |     <p>
159 |       The following is a failing example of a modal dialog component. It passes most of the requirements but
160 |       does not hide content behind it from screen reader users. This example has a slight JS delays when opening/closing.
161 |       This delay does not fail the test, but is included to simulate real-world scenarios where animations or other JS operations may delay dialog visibility.
162 |     </p>
163 | 
164 |     <p>Click the button to open the modal. The button has the class "trigger".</p>
165 | 
166 |     <button class="trigger" type="button" aria-haspopup="dialog" id="openModal">Open Modal</button>
167 |   </main>
168 | 
169 |   <!-- Modal structure (closed by default) -->
170 |   <div class="modal-overlay" id="modalOverlay" role="presentation" aria-hidden="true">
171 |     <div
172 |       class="modal"
173 |       role="dialog"
174 |       aria-modal="true"
175 |       aria-labelledby="modalTitle"
176 |       aria-describedby="modalDesc"
177 |       tabindex="-1"
178 |       id="modal"
179 |     >
180 |       <div class="modal-header">
181 |         <h2 id="modalTitle" class="modal-title">Example Modal</h2>
182 |         <button class="modal-close" aria-label="Close modal" id="closeButton">&times;</button>
183 |       </div>
184 |       <div class="modal-body" id="modalDesc">
185 |         <p>
186 |           This is a simple accessible modal dialog demo. It is closed by default. The
187 |           open button uses the class <code>trigger</code>.
188 |         </p>
189 |         <p>
190 |           The modal traps focus while open, closes when you press Escape, and closes when you click the overlay.
191 |         </p>
192 |         <label style="display:block; margin-top:12px;">
193 |           Example input:
194 |           <input type="text" placeholder="Focus me" style="width:100%; margin-top:6px; padding:8px;">
195 |         </label>
196 |       </div>
197 |       <div class="modal-footer">
198 |         <button class="btn" type="button" id="cancelBtn">Cancel</button>
199 |         <button class="btn primary" type="button" id="confirmBtn">Confirm</button>
200 |       </div>
201 |     </div>
202 |   </div>
203 | 
204 |   <script>
205 |     (function () {
206 |       var openTriggers = document.querySelectorAll('.trigger');
207 |       var overlay = document.getElementById('modalOverlay');
208 |       var modal = document.getElementById('modal');
209 |       var closeButton = document.getElementById('closeButton');
210 |       var cancelBtn = document.getElementById('cancelBtn');
211 |       var confirmBtn = document.getElementById('confirmBtn');
212 | 
213 |       var lastFocusedElement = null;
214 | 
215 |       // Focusable selector
216 |       var focusableSelectors = [
217 |         'a[href]',
218 |         'area[href]',
219 |         'input:not([disabled]):not([type="hidden"])',
220 |         'select:not([disabled])',
221 |         'textarea:not([disabled])',
222 |         'button:not([disabled])',
223 |         'iframe',
224 |         'object',
225 |         'embed',
226 |         '[contenteditable]',
227 |         '[tabindex]:not([tabindex="-1"])'
228 |       ].join(',');
229 | 
230 |       function getFocusableElements(container) {
231 |         return Array.prototype.slice.call(container.querySelectorAll(focusableSelectors))
232 |           .filter(function (el) {
233 |             // visible
234 |             return el.offsetWidth || el.offsetHeight || el.getClientRects().length;
235 |           });
236 |       }
237 | 
238 |       function openModal() {
239 |         lastFocusedElement = document.activeElement;
240 |         overlay.classList.add('open');
241 |         overlay.setAttribute('aria-hidden', 'false');
242 |         document.body.classList.add('no-scroll');
243 | 
244 |         // Wait a tick so transitions apply then focus
245 |         window.setTimeout(function () {
246 |           modal.focus();
247 |         }, 10);
248 | 
249 |         // trap focus
250 |         document.addEventListener('focus', enforceFocus, true);
251 |         document.addEventListener('keydown', handleKeyDown);
252 |       }
253 | 
254 |       function closeModal() {
255 |         overlay.classList.remove('open');
256 |         overlay.setAttribute('aria-hidden', 'true');
257 |         document.body.classList.remove('no-scroll');
258 | 
259 |         document.removeEventListener('focus', enforceFocus, true);
260 |         document.removeEventListener('keydown', handleKeyDown);
261 | 
262 |         if (lastFocusedElement && typeof lastFocusedElement.focus === 'function') {
263 |           lastFocusedElement.focus();
264 |         }
265 |       }
266 | 
267 |       function enforceFocus(event) {
268 |         if (!modal.contains(event.target)) {
269 |           // redirect focus to first focusable inside modal
270 |           var focusable = getFocusableElements(modal);
271 |           if (focusable.length) {
272 |             focusable[0].focus();
273 |           } else {
274 |             modal.focus();
275 |           }
276 |           event.stopPropagation();
277 |           event.preventDefault();
278 |         }
279 |       }
280 | 
281 |       function handleKeyDown(e) {
282 |         if (e.key === 'Escape' || e.key === 'Esc') {
283 |           e.preventDefault();
284 |           closeModal();
285 |           return;
286 |         }
287 |         if (e.key === 'Tab') {
288 |           // focus trap: keep cycling inside modal
289 |           var focusable = getFocusableElements(modal);
290 |           if (focusable.length === 0) {
291 |             // no focusable elements; keep focus on modal
292 |             e.preventDefault();
293 |             modal.focus();
294 |             return;
295 |           }
296 |           var first = focusable[0];
297 |           var last = focusable[focusable.length - 1];
298 |           if (e.shiftKey) {
299 |             if (document.activeElement === first || document.activeElement === modal) {
300 |               e.preventDefault();
301 |               last.focus();
302 |             }
303 |           } else {
304 |             if (document.activeElement === last) {
305 |               e.preventDefault();
306 |               first.focus();
307 |             }
308 |           }
309 |         }
310 |       }
311 | 
312 |       // Open handlers for all triggers
313 |       openTriggers.forEach(function (btn) {
314 |         btn.addEventListener('click', function (e) {
315 |           e.preventDefault();
316 |           openModal();
317 |         });
318 |       });
319 | 
320 |       // Close handlers
321 |       closeButton.addEventListener('click', function () {
322 |         closeModal();
323 |       });
324 |       cancelBtn.addEventListener('click', function () {
325 |         closeModal();
326 |       });
327 |       confirmBtn.addEventListener('click', function () {
328 |         // Example action
329 |         alert('Confirmed!');
330 |         closeModal();
331 |       });
332 | 
333 |       // Overlay click closes when clicking outside the modal content
334 |       overlay.addEventListener('mousedown', function (e) {
335 |         // Only close when target is the overlay itself, not children
336 |         if (e.target === overlay) {
337 |           // on mousedown we don't want the focus enforcement to interfere
338 |           // wait a tick before closing
339 |           setTimeout(closeModal, 0);
340 |         }
341 |       });
342 | 
343 |       // Prevent clicks inside modal from being interpreted as overlay clicks
344 |       modal.addEventListener('mousedown', function (e) {
345 |         e.stopPropagation();
346 |       });
347 | 
348 |       // Ensure modal hidden to assistive tech on initial load
349 |       overlay.setAttribute('aria-hidden', 'true');
350 | 
351 |       // Allow opening the modal by pressing Enter/Space on trigger if focused
352 |       // (standard button already does this; no extra code required)
353 |     })();
354 |   </script>
355 | </body>
356 | </html>


--------------------------------------------------------------------------------
/a11y_llm_tests/cli.py:
--------------------------------------------------------------------------------
  1 | """Typer CLI for running evaluations and generating reports."""
  2 | import json
  3 | import multiprocessing
  4 | from datetime import datetime
  5 | from pathlib import Path
  6 | import typer
  7 | import yaml
  8 | from typing import List
  9 | 
 10 | from . import generator, node_bridge
 11 | from .schema import (
 12 |     ResultRecord,
 13 |     TestFunctionResult,
 14 |     AxeResult,
 15 |     GenerationMeta,
 16 |     AggregateRecord,
 17 | )
 18 | from .metrics import compute_pass_at_k, format_pass_at_k
 19 | 
 20 | # importing os module for environment variables
 21 | import os
 22 | # importing necessary functions from dotenv library
 23 | from dotenv import load_dotenv, dotenv_values 
 24 | # loading variables from .env file
 25 | load_dotenv() 
 26 | 
 27 | app = typer.Typer(add_completion=False)
 28 | 
 29 | 
 30 | def _evaluate_worker(args_tuple):
 31 |     """Top-level worker for multiprocessing to ensure picklability on spawn-based systems (macOS, Windows)."""
 32 |     html_path, test_js_path, screenshot_path, test_name, model, sample_index, gen_meta, prompt_text = args_tuple
 33 |     html = Path(html_path).read_text(encoding="utf-8")
 34 |     sp = Path(screenshot_path)
 35 |     sp.parent.mkdir(parents=True, exist_ok=True)
 36 |     node_res = node_bridge.run(html, test_js_path, screenshot_path)
 37 |     tf = node_res.get("testFunctionResult", {})
 38 |     assertions_raw = tf.get("assertions", [])
 39 |     norm_assertions = []
 40 |     for a in assertions_raw:
 41 |         if not isinstance(a, dict):
 42 |             continue
 43 |         atype = (a.get("type") or "R").upper()
 44 |         if atype not in {"R", "BP"}:
 45 |             atype = "R"
 46 |         norm_assertions.append({
 47 |             "name": a.get("name", "unknown"),
 48 |             "status": a.get("status", "fail"),
 49 |             "message": a.get("message"),
 50 |             "type": atype,
 51 |         })
 52 |     test_result = TestFunctionResult(
 53 |         status=tf.get("status", "error"),
 54 |         assertions=norm_assertions,
 55 |         error=tf.get("error"),
 56 |         duration_ms=tf.get("duration_ms"),
 57 |         total_assertion_failures=tf.get("total_assertion_failures", 0),
 58 |         total_assertion_bp_failures=tf.get("total_assertion_bp_failures", 0)
 59 |     )
 60 |     axe_data = node_res.get("axeResult") or node_res.get("axe_result") or node_res.get("axe")
 61 |     axe_obj = None
 62 |     if axe_data and isinstance(axe_data, dict):
 63 |         axe_obj = AxeResult(
 64 |             failure_count=axe_data.get("failure_count", 0),
 65 |             failures=axe_data.get("failures", []),
 66 |             best_practice_count=axe_data.get("best_practice_count", 0),
 67 |             best_practice_failures=axe_data.get("best_practice_failures", []),
 68 |         )
 69 |     result_pass = bool(axe_obj) and (test_result.status == "pass" and axe_obj.failure_count == 0)
 70 |     rec = ResultRecord(
 71 |         test_name=test_name,
 72 |         model_name=model,
 73 |         timestamp=datetime.utcnow(),
 74 |         generation_html_path=html_path,
 75 |         screenshot_path=screenshot_path,
 76 |         test_function=test_result,
 77 |         axe=axe_obj,
 78 |         result="PASS" if result_pass else "FAIL",
 79 |         generation=GenerationMeta(
 80 |             latency_s=gen_meta.get("latency_s", 0.0),
 81 |             prompt_hash=gen_meta.get("prompt_hash", generator.compute_prompt_hash(prompt_text)),
 82 |             cached=gen_meta.get("cached", False),
 83 |             tokens_in=gen_meta.get("tokens_in"),
 84 |             tokens_out=gen_meta.get("tokens_out"),
 85 |             total_tokens=gen_meta.get("total_tokens"),
 86 |             cost_usd=gen_meta.get("cost_usd"),
 87 |             seed=gen_meta.get("seed"),
 88 |             temperature=gen_meta.get("temperature"),
 89 |             system_prompt=gen_meta.get("system_prompt", generator.get_base_system_prompt()),
 90 |             custom_instructions=gen_meta.get("custom_instructions", generator.get_custom_instructions()),
 91 |             effective_system_prompt=gen_meta.get("effective_system_prompt", generator.get_effective_system_prompt()),
 92 |         ),
 93 |         sample_index=sample_index,
 94 |     )
 95 |     return json.loads(rec.model_dump_json()), test_name, model, result_pass
 96 | 
 97 | 
 98 | def _generate_worker(task):
 99 |     """Top-level generation worker for multiprocessing; receives a tuple of parameters."""
100 |     test_name, model, sample_index, prompt_text, seed, temperature, disable_cache = task
101 |     html, meta = generator.generate_html_with_meta(
102 |         model,
103 |         prompt_text,
104 |         sample_index,
105 |         temperature=temperature,
106 |         seed=seed,
107 |         disable_cache=disable_cache,
108 |     )
109 |     return test_name, model, sample_index, prompt_text, meta, html
110 | 
111 | 
112 | @app.command()
113 | def run(
114 |     models_file: str = typer.Option("config/models.yaml", help="Models config YAML"),
115 |     out: str = typer.Option("runs", help="Output directory"),
116 |     samples: int = typer.Option(1, min=1, help="Number of samples per (test,model)."),
117 |     k: str = typer.Option("1,5,10", help="Comma-separated k values for pass@k metrics (stored for later evaluation)."),
118 |     base_seed: int = typer.Option(None, help="Base seed for reproducibility; each sample adds its index."),
119 |     temperature: float = typer.Option(None, help="Override model temperature (if supported)."),
120 |     disable_cache: bool = typer.Option(False, help="Disable generation cache (always re-generate)."),
121 |     test_cases_dir: str = typer.Option("test_cases", help="Directory containing test case folders."),
122 |     processes: int = typer.Option(None, "--processes", "-p", help="Parallel processes for generation (defaults CPU count; use 1 to disable)."),
123 | ):
124 |     """Generate HTML samples ONLY (no evaluation). A later 'evaluate' command will run tests & build report."""
125 |     run_id = datetime.utcnow().strftime("%Y-%m-%d_%H-%M-%S")
126 |     out_dir = Path(out) / run_id
127 |     (out_dir / "raw").mkdir(parents=True, exist_ok=True)
128 |     # Prepare screenshots directory (will be populated during evaluation phase)
129 |     (out_dir / "screenshots").mkdir(parents=True, exist_ok=True)
130 | 
131 |     models_cfg = yaml.safe_load(open(models_file))
132 |     defaults_cfg = models_cfg.get("defaults") or {}
133 |     config_dir = Path(models_file).resolve().parent
134 |     system_prompt_override = defaults_cfg.get("system_prompt")
135 |     instructions_cfg = defaults_cfg.get("custom_instructions_markdown")
136 |     custom_instructions_text = None
137 |     custom_instructions_path = None
138 |     if instructions_cfg:
139 |         instructions_path = Path(instructions_cfg)
140 |         if not instructions_path.is_absolute():
141 |             instructions_path = config_dir / instructions_path
142 |         instructions_path = instructions_path.resolve()
143 |         if not instructions_path.exists():
144 |             typer.secho(f"Custom instructions file not found: {instructions_path}", err=True)
145 |             raise typer.Exit(code=1)
146 |         try:
147 |             custom_instructions_text = instructions_path.read_text(encoding="utf-8")
148 |         except OSError as exc:
149 |             typer.secho(f"Failed to read custom instructions file '{instructions_path}': {exc}", err=True)
150 |             raise typer.Exit(code=1)
151 |         custom_instructions_path = str(instructions_path)
152 |     generator.configure_prompts(system_prompt_override, custom_instructions_text)
153 |     model_names = [m["name"] for m in models_cfg.get("models", [])]
154 |     models_info = []
155 |     for m in models_cfg.get("models", []):
156 |         name = m.get("name")
157 |         display_name = m.get("display_name") or (name.split('/')[-1] if isinstance(name, str) else name)
158 |         models_info.append({"name": name, "display_name": display_name})
159 |     tcd = Path(test_cases_dir)
160 |     test_dirs = [p for p in tcd.iterdir() if p.is_dir() and (p / "prompt.md").exists()]
161 |     # Build generation tasks
162 |     results = []  # stub pending evaluation records
163 |     prompts_map = {}
164 |     gen_tasks = []  # (test_name, model, sample_index, prompt, seed)
165 |     for td in test_dirs:
166 |         prompt_text = (td / "prompt.md").read_text(encoding="utf-8")
167 |         prompts_map[td.name] = prompt_text
168 |         for model in model_names:
169 |             for sample_index in range(samples):
170 |                 seed = (base_seed + sample_index) if base_seed is not None else None
171 |                 gen_tasks.append((td.name, model, sample_index, prompt_text, seed, temperature, disable_cache))
172 | 
173 |     gen_tasks.sort(key=lambda t: (t[0], t[1], t[2]))
174 | 
175 |     if gen_tasks:
176 |         pool_size = None
177 |         if processes is None:
178 |             pool_size = min(multiprocessing.cpu_count(), len(gen_tasks))
179 |         else:
180 |             pool_size = max(1, processes)
181 |         if pool_size == 1:
182 |             gen_results_iter = map(_generate_worker, gen_tasks)
183 |         else:
184 |             typer.echo(f"Generating with {pool_size} processes...")
185 |             with multiprocessing.Pool(processes=pool_size) as pool:
186 |                 gen_results_iter = pool.map(_generate_worker, gen_tasks)
187 |         for test_name, model, sample_index, prompt_text, meta, html in gen_results_iter:
188 |             raw_path = out_dir / "raw" / test_name
189 |             html_file = raw_path / f"{model}__s{sample_index}.html" if samples > 1 else raw_path / f"{model}.html"
190 |             html_file.parent.mkdir(exist_ok=True, parents=True)
191 |             html_file.write_text(html, encoding="utf-8")
192 |             rec = ResultRecord(
193 |                 test_name=test_name,
194 |                 model_name=model,
195 |                 timestamp=datetime.utcnow(),
196 |                 generation_html_path=str(html_file),
197 |                 screenshot_path=None,
198 |                 test_function=TestFunctionResult(status="PENDING", assertions=[], error=None, duration_ms=None),
199 |                 axe=None,
200 |                 result="PENDING",
201 |                 generation=GenerationMeta(
202 |                     latency_s=meta.get("latency_s", 0.0),
203 |                     prompt_hash=meta.get("prompt_hash", generator.compute_prompt_hash(prompt_text)),
204 |                     cached=meta.get("cached", False),
205 |                     tokens_in=meta.get("tokens_in"),
206 |                     tokens_out=meta.get("tokens_out"),
207 |                     total_tokens=meta.get("total_tokens"),
208 |                     cost_usd=meta.get("cost_usd"),
209 |                     seed=meta.get("seed"),
210 |                     temperature=meta.get("temperature"),
211 |                     system_prompt=meta.get("system_prompt", generator.get_base_system_prompt()),
212 |                     custom_instructions=meta.get("custom_instructions", generator.get_custom_instructions()),
213 |                     effective_system_prompt=meta.get("effective_system_prompt", generator.get_effective_system_prompt()),
214 |                 ),
215 |                 sample_index=sample_index,
216 |             )
217 |             results.append(json.loads(rec.model_dump_json()))
218 | 
219 |     run_json = {
220 |         "run_id": run_id,
221 |         "models": model_names,
222 |         "tests": [d.name for d in test_dirs],
223 |         "prompts": prompts_map,
224 |         "results": results,
225 |         "aggregates": [],  # will be populated after evaluation
226 |         "meta": {
227 |             "sampling": {
228 |                 "samples_per_case": samples,
229 |                 "k_values": [int(x.strip()) for x in k.split(",") if x.strip().isdigit()],  # stored but not yet computed
230 |                 "temperature": temperature,
231 |                 "base_seed": base_seed,
232 |                 "disable_cache": disable_cache,
233 |                 "processes_generation": (processes if processes is not None else min(multiprocessing.cpu_count(), len(gen_tasks))) if gen_tasks else None,
234 |             },
235 |             "prompting": {
236 |                 "system_prompt": generator.get_base_system_prompt(),
237 |                 "effective_system_prompt": generator.get_effective_system_prompt(),
238 |                 "custom_instructions": generator.get_custom_instructions(),
239 |                 "custom_instructions_path": custom_instructions_path,
240 |             },
241 |             "models_info": models_info,
242 |             "status": "GENERATED_ONLY",
243 |         },
244 |     }
245 |     (out_dir / "results.json").write_text(json.dumps(run_json, indent=2), encoding="utf-8")
246 |     latest_link = Path(out) / "latest"
247 |     try:
248 |         if latest_link.exists() or latest_link.is_symlink():
249 |             latest_link.unlink()
250 |         latest_link.symlink_to(out_dir)
251 |     except OSError:
252 |         pass
253 |     typer.echo(f"Generation complete. Run directory ready for evaluation: {out_dir}")
254 | 
255 | 
256 | @app.command()
257 | def evaluate(
258 |     run_dir: str = typer.Argument(..., help="Existing run directory produced by 'run' command"),
259 |     test_cases_dir: str = typer.Option("test_cases", help="Directory containing test case folders."),
260 |     k: str = typer.Option("1,5,10", help="Comma-separated k values for pass@k metrics."),
261 |     generate_report: bool = typer.Option(True, help="Generate HTML report (index.html) after evaluation."),
262 |     processes: int = typer.Option(None, "--processes", "-p", help="Number of parallel processes for evaluation (defaults to CPU count; use 1 to disable)."),
263 | ):
264 |     """Evaluate previously generated HTML samples without requiring models config: run accessibility tests, compute aggregates, optionally render report."""
265 |     rd = Path(run_dir)
266 |     if not rd.exists():
267 |         typer.secho(f"Run directory not found: {rd}", err=True)
268 |         raise typer.Exit(code=1)
269 |     results_json_path = rd / "results.json"
270 |     prior_data = {}
271 |     if results_json_path.exists():
272 |         try:
273 |             prior_data = json.loads(results_json_path.read_text(encoding="utf-8"))
274 |         except Exception:
275 |             typer.secho("Warning: Failed to parse existing results.json; proceeding without prior metadata.", err=True)
276 |     # derive model list and display names from prior data
277 |     model_names = prior_data.get("models") or []
278 |     meta_block = (prior_data.get("meta") or {})
279 |     stored_models_info = meta_block.get("models_info") or []
280 |     display_lookup = {m.get("name"): m.get("display_name") for m in stored_models_info if m.get("name")}
281 |     # prompts
282 |     tcd = Path(test_cases_dir)
283 |     test_dirs = [p for p in tcd.iterdir() if p.is_dir() and (p / "prompt.md").exists()]
284 |     prompts_map = {td.name: (td / "prompt.md").read_text(encoding="utf-8") for td in test_dirs}
285 |     k_values = [int(x.strip()) for x in k.split(",") if x.strip().isdigit()]
286 |     if not k_values:
287 |         k_values = [1]
288 | 
289 |     # Map generation meta by triple for reuse
290 |     gen_meta_map = {}
291 |     for r in prior_data.get("results", []) if prior_data else []:
292 |         key = (r.get("test_name"), r.get("model_name"), r.get("sample_index"))
293 |         gen_meta_map[key] = r.get("generation")
294 | 
295 |     # Build evaluation task list
296 |     tasks = []  # each entry: (html_path, test_js_path, screenshot_path, test_name, model, sample_index, gen_meta, prompt_text)
297 |     for td in test_dirs:
298 |         test_name = td.name
299 |         test_js = td / "test.js"
300 |         raw_dir = rd / "raw" / test_name
301 |         if not raw_dir.exists():
302 |             typer.secho(f"Skipping missing raw dir for test '{test_name}'", err=True)
303 |             continue
304 |         html_files = sorted(raw_dir.glob("**/*.html"))
305 |         for hf in html_files:
306 |             fname = hf.name
307 |             if "__s" in fname:
308 |                 model_part, sample_part = fname.split("__s", 1)
309 |                 sample_index_str = sample_part[:-5] if sample_part.endswith(".html") else sample_part
310 |                 try:
311 |                     sample_index = int(sample_index_str)
312 |                 except ValueError:
313 |                     sample_index = None
314 |                 model = model_part
315 |             else:
316 |                 model = fname[:-5]
317 |                 sample_index = None
318 |             screenshot_name = f"{test_name}__{model}__s{sample_index}.png" if sample_index is not None else f"{test_name}__{model}.png"
319 |             screenshot_path = rd / "screenshots" / screenshot_name
320 |             gen_meta = gen_meta_map.get((test_name, model, sample_index)) or {}
321 |             tasks.append((str(hf), str(test_js), str(screenshot_path), test_name, model, sample_index, gen_meta, prompts_map.get(test_name, "")))
322 | 
323 |     # Sort tasks for deterministic ordering
324 |     tasks.sort(key=lambda t: (t[3], t[4], t[5] if t[5] is not None else -1))
325 | 
326 | 
327 |     all_results = []
328 |     pass_map = {}  # (test_name, model) -> list[bool]
329 |     if not tasks:
330 |         typer.secho("No evaluation tasks found.", err=True)
331 |     else:
332 |         pool_size = None
333 |         if processes is None:
334 |             # Default: use CPU count but cap at len(tasks)
335 |             pool_size = min(multiprocessing.cpu_count(), len(tasks))
336 |         else:
337 |             pool_size = max(1, processes)
338 |         if pool_size == 1:
339 |             for t in tasks:
340 |                 res, test_name, model, passed = _evaluate_worker(t)
341 |                 all_results.append(res)
342 |                 pass_map.setdefault((test_name, model), []).append(passed)
343 |         else:
344 |             typer.echo(f"Evaluating with {pool_size} processes...")
345 |             with multiprocessing.Pool(processes=pool_size) as pool:
346 |                 for res, test_name, model, passed in pool.map(_evaluate_worker, tasks):
347 |                     all_results.append(res)
348 |                     pass_map.setdefault((test_name, model), []).append(passed)
349 | 
350 |     aggregates: List[dict] = []
351 |     for (test_name, model), statuses in pass_map.items():
352 |         c = sum(1 for x in statuses if x)
353 |         n = len(statuses)
354 |         pass_at = compute_pass_at_k(c, n, k_values)
355 |         agg = AggregateRecord(
356 |             test_name=test_name,
357 |             model_name=model,
358 |             n_samples=n,
359 |             n_pass=c,
360 |             pass_at_k=format_pass_at_k(pass_at),
361 |             k_values=k_values,
362 |             computed_at=datetime.utcnow(),
363 |         )
364 |         aggregates.append(json.loads(agg.model_dump_json()))
365 | 
366 |     updated_json = {
367 |         "run_id": prior_data.get("run_id") or rd.name,
368 |         "models": model_names,
369 |         "tests": [d.name for d in test_dirs],
370 |         "prompts": prompts_map,
371 |         "results": all_results,
372 |         "aggregates": aggregates,
373 |         "meta": {
374 |             **(prior_data.get("meta") or {}),
375 |             "sampling": {
376 |                 **((prior_data.get("meta") or {}).get("sampling") or {}),
377 |                 "k_values": k_values,
378 |                 "processes": (processes if processes is not None else min(multiprocessing.cpu_count(), len(tasks))) if tasks else None,
379 |             },
380 |             "status": "EVALUATED",
381 |         },
382 |     }
383 |     results_json_path.write_text(json.dumps(updated_json, indent=2), encoding="utf-8")
384 |     if generate_report:
385 |         from .report import render_report
386 |         # Synthesize minimal models_cfg for backward compatibility with report renderer
387 |         synthesized_models_cfg = {
388 |             "models": [
389 |                 {"name": name, "display_name": display_lookup.get(name) or (str(name).split('/')[-1])}
390 |                 for name in model_names
391 |             ]
392 |         }
393 |         render_report(results_json_path, rd / "index.html", synthesized_models_cfg)
394 |         typer.echo(f"Evaluation complete. Report generated: {rd}/index.html")
395 |     else:
396 |         typer.echo("Evaluation complete. Report generation skipped.")
397 | 
398 | 
399 | @app.command()
400 | def report(
401 |     run_dir: str,
402 |     models_file: str = typer.Option("config/models.yaml", help="Models config YAML")
403 |     ):
404 |     """Regenerate HTML report for an existing run directory."""
405 |     models_cfg = yaml.safe_load(open(models_file))
406 |     rd = Path(run_dir)
407 |     from .report import render_report
408 |     render_report(rd / "results.json", rd / "index.html", models_cfg)
409 |     typer.echo("Report regenerated.")
410 | 
411 | 
412 | def main():  # pragma: no cover
413 |     app()
414 | 
415 | 
416 | if __name__ == "__main__":  # pragma: no cover
417 |     main()
418 | 


--------------------------------------------------------------------------------
/a11y_llm_tests/report.py:
--------------------------------------------------------------------------------
  1 | """HTML reporting for evaluation runs."""
  2 | from pathlib import Path
  3 | import orjson
  4 | from jinja2 import Template
  5 | from collections import OrderedDict
  6 | # importing os module for environment variables
  7 | import os
  8 | 
  9 | TEMPLATE = """<!DOCTYPE html>
 10 | <html lang=\"en\">
 11 | <head>
 12 | <meta charset=\"UTF-8\" />
 13 | <title>{{ site_name }}</title>
 14 | <base href=".">
 15 | <style>
 16 | :root {
 17 |   --bg-surface: #111827;
 18 |   --surface: #1f2937;
 19 |   --surface-muted: #283446;
 20 |   --border-subtle: #334155;
 21 |   --border-strong: #475569;
 22 |   --text-primary: #f9fafb;
 23 |   --text-secondary: #cbd5f5;
 24 |   --text-on-accent: #ffffff;
 25 |   --accent: #006aec;
 26 |   --accent-strong: #3b82f6;
 27 |   --pass: #0a8336;
 28 |   --fail: #a80000;
 29 |   --warn: #fbbf24;
 30 |   --badge-radius: 999px;
 31 |   --card-shadow: 0 12px 30px rgba(8, 47, 73, 0.35);
 32 |   --focus-ring: 0 0 0 3px rgba(29, 78, 216, 0.35);
 33 |   --link-text: #659fff;
 34 | }
 35 | * { box-sizing: border-box; }
 36 | body {
 37 |   font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
 38 |   font-size: 16px;
 39 |   background: var(--bg-surface);
 40 |   color: var(--text-primary);
 41 |   margin: 0;
 42 |   line-height: 1.6;
 43 | }
 44 | a { color: var(--link-text); }
 45 | a:hover { color: var(--accent-strong); }
 46 | header, main, footer {
 47 |   max-width: 1200px;
 48 |   margin: 0 auto;
 49 |   padding: 0 1.5rem;
 50 | }
 51 | header { padding-top: 2.5rem; padding-bottom: 1.5rem; }
 52 | header h1 { margin: 0; font-size: 2.25rem; letter-spacing: -0.015em; }
 53 | p, ul, li { color: var(--text-secondary); }
 54 | main { padding-bottom: 3rem; }
 55 | section + section { margin-top: 2rem; }
 56 | table {
 57 |   width: 100%;
 58 |   border-collapse: separate;
 59 |   border-spacing: 0;
 60 |   border: 1px solid var(--border-subtle);
 61 |   background: var(--surface);
 62 |   border-radius: 0.75rem;
 63 |   overflow: hidden;
 64 |   box-shadow: 0 6px 20px rgba(15, 23, 42, 0.1);
 65 |   margin-bottom: 1.5rem;
 66 | }
 67 | caption {
 68 |   text-align: left;
 69 |   padding: 1rem 1.25rem 0.5rem;
 70 |   --heatmap-low: #2d3748; /* fallback low value */
 71 |   --heatmap-high: #16a34a; /* fallback high value */
 72 |   font-weight: 700;
 73 |   font-size: 1.05rem;
 74 |   color: var(--text-primary);
 75 | }
 76 | thead th {
 77 |   background: var(--surface-muted);
 78 |   color: var(--text-primary);
 79 |   padding: 0.75rem 1rem;
 80 |   text-align: left;
 81 |   font-size: 0.95rem;
 82 |   border-bottom: 1px solid var(--border-subtle);
 83 | }
 84 | tbody td, tbody th {
 85 |   padding: 0.75rem 1rem;
 86 |   border-bottom: 1px solid var(--border-subtle);
 87 |   color: var(--text-secondary);
 88 |   vertical-align: top;
 89 | }
 90 | tbody tr:last-child td, tbody tr:last-child th { border-bottom: none; }
 91 | tbody tr:nth-child(even) { background: rgba(15, 23, 42, 0.04); }
 92 | .badge-pass, .badge-fail {
 93 |   display: inline-flex;
 94 |   align-items: center;
 95 |   font-weight: 600;
 96 |   font-size: 0.85rem;
 97 |   padding: 0.15rem 0.6rem;
 98 |   border-radius: var(--badge-radius);
 99 |   letter-spacing: 0.01em;
100 | }
101 | .badge-pass { background: var(--pass); color: var(--text-on-accent); }
102 | .badge-fail { background: var(--fail); color: var(--text-on-accent); }
103 | code { font-size: 0.85rem; background: var(--surface-muted); padding: 0.1rem 0.35rem; border-radius: 0.35rem; }
104 | details {
105 |   border: 1px solid var(--border-subtle);
106 |   border-radius: 0.75rem;
107 |   padding: 0.75rem 1rem;
108 |   margin-bottom: 1rem;
109 |   background: var(--surface);
110 |   box-shadow: 0 4px 16px rgba(15, 23, 42, 0.08);
111 | }
112 | details[open] {
113 |   border-color: var(--accent-strong);
114 |   box-shadow: 0 12px 24px rgba(37, 99, 235, 0.12);
115 | }
116 | summary { cursor: pointer;}
117 | details summary h2,
118 | details summary h3,
119 | details summary h4,
120 | details summary h5 {
121 |   display: inline-flex;
122 |   margin: 0;
123 |   font-size: 1.05rem;
124 |   color: var(--text-primary);
125 | }
126 | details summary:focus-visible {
127 |   outline: none;
128 |   box-shadow: var(--focus-ring);
129 |   border-radius: 0.5rem;
130 | }
131 | tbody th { text-align: left; }
132 | .filters {
133 |   display: flex;
134 |   flex-wrap: wrap;
135 |   gap: 0.75rem;
136 |   align-items: flex-end;
137 |   padding: 1rem;
138 |   background: var(--surface);
139 |   border: 1px solid var(--border-subtle);
140 |   border-radius: 0.75rem;
141 |   box-shadow: 0 6px 20px rgba(15, 23, 42, 0.1);
142 |   margin-bottom: 1rem;
143 | }
144 | .filters label {
145 |   display: flex;
146 |   flex-direction: column;
147 |   font-weight: 600;
148 |   font-size: 0.95rem;
149 |   color: var(--text-primary);
150 | }
151 | .filters select {
152 |   margin-top: 0.35rem;
153 |   padding: 0.4rem 0.6rem;
154 |   font-size: 0.95rem;
155 |   border-radius: 0.5rem;
156 |   border: 1px solid var(--border-strong);
157 |   background: var(--surface-muted);
158 |   color: var(--text-primary);
159 | }
160 | .filters button {
161 |   padding: 0.45rem 0.9rem;
162 |   font-size: 0.95rem;
163 |   border-radius: 0.5rem;
164 |   border: none;
165 |   background: var(--accent);
166 |   color: var(--text-on-accent);
167 |   font-weight: 600;
168 |   transition: transform 0.15s ease, box-shadow 0.15s ease;
169 | }
170 | .filters button:hover {
171 |   transform: translateY(-1px);
172 |   box-shadow: 0 10px 24px rgba(37, 99, 235, 0.22);
173 | }
174 | .filters button:focus-visible {
175 |   outline: none;
176 |   box-shadow: var(--focus-ring);
177 | }
178 | .filters-summary { margin: 1rem 0 0.5rem; font-size: 0.95rem; color: var(--text-primary); font-weight: 600; }
179 | .samples {
180 |   display: grid;
181 |   grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
182 |   gap: 1.25rem;
183 |   margin-top: 1rem;
184 | }
185 | .sample-card {
186 |   border-radius: 0.9rem;
187 |   border: 1px solid var(--border-subtle);
188 |   padding: 1rem;
189 |   background: var(--surface);
190 |   box-shadow: var(--card-shadow);
191 |   transition: transform 0.2s ease, box-shadow 0.2s ease;
192 | }
193 | .sample-card:hover {
194 |   transform: translateY(-4px);
195 |   box-shadow: 0 16px 32px rgba(15, 23, 42, 0.16);
196 | }
197 | .sample-card h4 {
198 |   margin-top: 0;
199 |   margin-bottom: 0.25rem;
200 |   font-size: 1.05rem;
201 |   color: var(--text-primary);
202 | }
203 | .sample-card p { margin: 0.35rem 0; color: var(--text-secondary); }
204 | .pass-rate-bar {
205 |   height: 12px;
206 |   background: var(--surface-muted);
207 |   position: relative;
208 |   border-radius: 999px;
209 |   overflow: hidden;
210 |   margin: 1rem 0;
211 | }
212 | .pass-rate-bar span {
213 |   position: absolute;
214 |   inset: 0;
215 |   background: var(--pass);
216 |   transition: width 0.4s ease;
217 | }
218 | figure { margin: 0.75rem 0 0; }
219 | figure img {
220 |   width: 100%;
221 |   border-radius: 0.75rem;
222 |   border: 1px solid var(--border-subtle);
223 |   box-shadow: 0 10px 24px rgba(15, 23, 42, 0.2);
224 | }
225 | .prompt-block {
226 |   white-space: pre-wrap;
227 |   background: var(--surface-muted);
228 |   padding: 0.85rem 1rem;
229 |   border-radius: 0.75rem;
230 |   border: 1px solid var(--border-subtle);
231 |   font-size: 0.92rem;
232 |   color: var(--text-primary);
233 |   box-shadow: inset 0 1px 3px rgba(15, 23, 42, 0.12);
234 | }
235 | .agg-table { margin-top: 1rem; }
236 | /* Heatmap cells for pass@k tables */
237 | .pass-at-k-cell {
238 |   transition: background-color 0.25s ease, color 0.25s ease;
239 |   text-align: center;
240 |   font-weight: 600;
241 | }
242 | footer {
243 |   padding: 2rem 1.5rem 3rem;
244 |   color: var(--text-secondary);
245 | }
246 | footer a { color: var(--link-text); }
247 | details ul { margin: 0.5rem 0 0.25rem; padding-left: 1.15rem; }
248 | details li { margin-bottom: 0.35rem; }
249 | @media (max-width: 768px) {
250 |   header, main, footer { padding: 0 1rem; }
251 |   header h1 { font-size: 1.75rem; }
252 |   table, details { box-shadow: none; }
253 |   .filters { padding: 0.75rem; }
254 | }
255 | </style>
256 | </head>
257 | <body>
258 | <header>
259 | <h1>{{ site_name }}</h1>
260 | </header>
261 | <main id=\"main\">
262 | <section>
263 | {% set total_samples = tests|length * n_samples %}
264 | <p>All Models were tested against {{ tests|length }} test cases. Each test case was tested {{ n_samples }} times. This results in <strong>{{ total_samples }}</strong> total samples being evaluated per model.</p>
265 | <table>
266 | <thead>
267 | <tr><th>Model</th><th>Rank</th><th>WCAG Pass Rate*</th><th>Avg Total WCAG Failures</th><th>Avg Axe WCAG Failures</th><th>Avg Assertion WCAG Failures</th><th>Avg Best Practice Failures</th></tr>
268 | </thead>
269 | <tbody>
270 | {% for model, stats in summary.items() %}
271 | <tr>
272 |   <th>{{ model_display_names.get(model, model) }}</th>
273 |   <td>{{ loop.index }}</td>
274 |   <td class="pass-at-k-cell" data-pass="{{ '%.4f'|format(stats.pass_rate) }}">{{ "%.0f%%"|format(stats.pass_rate * 100) }}</td>
275 |   <td>{{ "%.2f"|format(stats.avg_failures) }}</td>
276 |   <td>{{ "%.2f"|format(stats.avg_axe_failures) }}</td>
277 |   <td>{{ "%.2f"|format(stats.avg_assertion_failures) }}</td>
278 |   <td>{{ "%.2f"|format(stats.avg_bp_failures) }}</td>
279 | </tr>
280 | {% endfor %}
281 | </tbody>
282 | </table>
283 | <p>* These tests do not comprehensively test all WCAG requirements, only a subset of the most common issues. WCAG failures may still exist even for passing tests.</p>
284 | {% if aggregates_by_test %}
285 | <details>
286 |   <summary><h2>Pass@k Aggregates</h2></summary>
287 |   <p>Pass@k is a formula that determines the likelihood that if you pick random k samples from the set, then at least one of them would pass. For example, pass@10=.50 means that there is a 50 percent likelihood that at least 1 of the 10 randomly selected samples from the set would pass.</p>
288 |   <p>Pass@K is a metric used to evaluate the performance of models when multiple samples are generated per test case.</p>
289 |   {% for test_name, info in aggregates_by_test.items() %}
290 |     <table class="agg-table">
291 |       <caption>{{ test_name }}</caption>
292 |       <thead>
293 |         <tr>
294 |           <th>Model</th>
295 |           <th>Samples</th>
296 |           <th>Passes</th>
297 |           {% for k in info.ks %}
298 |             <th>pass@{{ k }}</th>
299 |           {% endfor %}
300 |         </tr>
301 |       </thead>
302 |       <tbody>
303 |         {% for a in info.rows %}
304 |         <tr>
305 |           <td>{{ model_display_names.get(a.model_name, a.model_name) }}</td>
306 |           <td>{{ a.n_samples }}</td>
307 |           <td>{{ a.n_pass }}</td>
308 |           {% for k in info.ks %}
309 |             {% set v = a.pass_at_k.get(k) %}
310 |             <td class="pass-at-k-cell" data-pass-at-k="{{ k }}" data-pass="{% if v is not none %}{{ '%.4f'|format(v) }}{% else %}{% endif %}">{% if v is not none %}{{ '%.0f%%'|format(v * 100) }}{% else %}-{% endif %}</td>
311 |           {% endfor %}
312 |         </tr>
313 |         {% endfor %}
314 |       </tbody>
315 |     </table>
316 |   {% endfor %}
317 | </details>
318 | {% endif %}
319 | </section>
320 | <details open>
321 | <summary><h2>Methodology</h2></summary>
322 |   <p>This report shows how well various LLMs generate accessible HTML.</p>
323 |   <ul>
324 |     <li>Each test uses a prompt to generate HTML. The generated HTML is then tested for accessibility.</li>
325 |     <li>The prompts intentionally do not include specific accessibility instructions. The goal is to see if the LLMs produce accessible HTML by default.</li>
326 |     <li>The resulting HTML is rendered in a browser via Playwright (Chromium). This allows the HTML's JavaScript and CSS to execute, which can impact accessibility.</li>
327 |     <li>The rendered HTML is evaluated using <a href="https://github.com/dequelabs/axe-core">axe-core</a> to identify common accessibility issues.</li>
328 |     <li>A custom test script (JavaScript) is executed against the rendered page to check for accessibility requirements that are specific to the test case and not covered by axe-core. These tests look for <a href="https://www.w3.org/WAI/WCAG22/quickref/">WCAG 2.2</a> failures and best practices. Best practices do not impact pass/fail results.</li>
329 |     <li>Each test case is run multiple times (samples) to evaluate the consistency and reliability of the LLM's output.</li>
330 |     <li>Default temperatures / settings are used for all models.</li>
331 |   </ul>
332 |   {% set system_prompt = prompting_meta.get('system_prompt') %}
333 |   {% set effective_system_prompt = prompting_meta.get('effective_system_prompt') %}
334 |   {% set display_system_prompt = effective_system_prompt or system_prompt %}
335 |   {% set custom_instructions = prompting_meta.get('custom_instructions') %}
336 |   {% set custom_instructions_path = prompting_meta.get('custom_instructions_path') %}
337 |   {% if display_system_prompt %}
338 |   <details>
339 |     <summary><h3>System Prompt</h3></summary>
340 |     <pre class="prompt-block">{{ display_system_prompt|e }}</pre>
341 |     {% if effective_system_prompt and system_prompt and effective_system_prompt != system_prompt %}
342 |     <p><small>The effective system prompt shown includes custom instructions.</small></p>
343 |     {% endif %}
344 |   </details>
345 |   {% endif %}
346 |   {% if custom_instructions %}
347 |   <details>
348 |     <summary><h3>Custom Instructions</h3></summary>
349 |     <pre class="prompt-block">{{ custom_instructions }}</pre>
350 |     {% if custom_instructions_path %}
351 |     <p><small>Source: {{ custom_instructions_path }}</small></p>
352 |     {% endif %}
353 |   </details>
354 |   {% endif %}
355 |   <p>All tests are automatic and deterministic (no human intervention). Only a fraction of accessibility requirements in WCAG can be covered in this way. Many requirements still need a human to evaluate. As such, these tests are not comprehensive. Even if a test passes, it may still fail WCAG and contain serious accessibility issues.</p>
356 |   <p>Please leave feedback, review the source code, and contribute test cases, assertions, and other improvements at the <a href="https://github.com/microsoft/a11y-llm-eval">GitHub Project</a>.</p>
357 | </details>
358 | <details>
359 |   <summary><h2>Glossary</h2></summary>
360 |   <h3>Column Definitions</h3>
361 |   <ul>
362 |     <li><strong>Rank</strong>: The position of the model when sorted by WCAG Pass Rate (lower is better).</li>
363 |     <li><strong>WCAG Pass Rate</strong>: The percentage of samples that passed all WCAG tests, including both axe-core WCAG checks and custom WCAG assertions. This does not include best practices.</li>
364 |     <li><strong>Avg Total WCAG Failures</strong>: The average number of total WCAG failures (axe-core + assertions) per sample for the model. This does not include best practices.</li>
365 |     <li><strong>Avg Axe WCAG Failures</strong>: The average number of axe-core detected WCAG failures per sample for the model. This does not include best practices.</li>
366 |     <li><strong>Avg Assertion WCAG Failures</strong>: The average number of custom WCAG assertion failures per sample for the model. This does not include best practices.</li>
367 |     <li><strong>Avg Best Practice Failures</strong>: The average number of best practice accessibility issues (informational only) per sample for the model. This includes axe-core best practices and best practice assertions.</li>
368 |   </ul>
369 | 
370 |   <h3>Other Glossary Terms</h3>
371 |   <ul>
372 |     <li><strong>Assertion</strong>: A specific accessibility check defined in the test script. Each assertion checks for a particular accessibility requirement or best practice for the specific test case which is not already tested by axe.</li>
373 |     <li><strong>Axe-core</strong>: An open-source accessibility testing engine developed by Deque Systems. It is widely used for automated accessibility testing of web applications. <a href="https://github.com/dequelabs/axe-core">Axe-core</a></li>
374 |     <li><strong>Pass@k</strong>: A metric that estimates the likelihood of at least one sample passing a test when k samples are randomly selected.</li>
375 |     <li><strong>WCAG</strong>: <a href=https://www.w3.org/WAI/WCAG22/quickref/">Web Content Accessibility Guidelines</a>, a set of guidelines for making web content more accessible to people with disabilities.</li>
376 |     <li><strong>Test Case</strong>: A specific scenario designed to evaluate the accessibility of generated HTML content. Each test case includes a prompt, expected accessibility requirements, and a test script.</li>
377 |   </ul>
378 | </details>
379 | <section>
380 | <h2 id="details-h2">Detailed Results</h2>
381 | <div class="filters" role="region" aria-label="Detailed results filters">
382 |   <label>
383 |     Model
384 |     <select id="model-filter">
385 |       <option value="">All models</option>
386 |       {% for model, name in model_display_names|dictsort(by='value') %}
387 |         <option value="{{ model }}">{{ name }}</option>
388 |       {% endfor %}
389 |     </select>
390 |   </label>
391 |   <label>
392 |     Result
393 |     <select id="result-filter">
394 |       <option value="">All results</option>
395 |       <option value="PASS">Pass</option>
396 |       <option value="FAIL">Fail</option>
397 |     </select>
398 |   </label>
399 |   <button type="button" id="reset-filters">Reset</button>
400 | </div>
401 | <p id="filter-count" class="filters-summary" aria-live="polite" aria-atomic="true"></p>
402 | <p id="no-results-message" hidden>No samples match the current filters.</p>
403 | {% for test_name, test_data in grouped_results.items() %}
404 | <section>
405 |   <details>
406 |     <summary><h3>{{ test_name }}</h3></summary>
407 |     {% if test_data.prompt %}
408 |     <details>
409 |       <summary>Prompt</summary>
410 |       <pre class="prompt-block">{{ test_data.prompt|e }}</pre>
411 |     </details>
412 |     {% endif %}
413 |     {% for group in test_data.models %}
414 |     {% set agg = group.aggregate %}
415 |     <details data-model-group="{{ group.model_name }}">
416 |       <summary>
417 |         <h4>
418 |           {{ model_display_names.get(group.model_name, group.model_name) }}
419 |           {% if agg and agg.n_samples %}
420 |             &nbsp;—&nbsp;{{ '%.0f%%'|format((agg.n_pass / agg.n_samples) * 100) }}
421 |           {% endif %}
422 |         </h4>
423 |       </summary>
424 |       {% if agg %}
425 |       <p>Samples: {{ agg.n_samples }} | Passes: {{ agg.n_pass }}</p>
426 |       <table>
427 |         <thead><tr>{% for k,v in agg.pass_at_k.items() %}<th>pass@{{ k }}</th>{% endfor %}</tr></thead>
428 |         <tbody><tr>{% for k,v in agg.pass_at_k.items() %}
429 |           <td class="pass-at-k-cell" data-pass-at-k="{{ k }}" data-pass="{{ '%.4f'|format(v) }}">{{ '%.0f%%'|format(v * 100) }}</td>
430 |         {% endfor %}</tr></tbody>
431 |       </table>
432 |       {% set _percent = (100.0 * (agg.n_pass / agg.n_samples)) if agg.n_samples else 0 %}
433 |       <div class="pass-rate-bar" role="img" aria-label="Pass ratio - {{ _percent }} percent"><span style="width: {{ _percent }}%"></span></div>
434 |       {% endif %}
435 |       <div class="samples">
436 |       {% for r in group.samples %}
437 |         <div class="sample-card" data-model="{{ r.model_name }}" data-result="{{ r.result }}">
438 |           {# Trim the first two path segments (e.g., 'runs/<run_id>/...') #}
439 |           {% set _parts = r.generation_html_path.split('/') %}
440 |           {% set _trimmed = '/'.join(_parts[2:]) %}
441 |           <h4><a href="{{ _trimmed }}">Sample {{ r.sample_index if r.sample_index is not none else loop.index0 }} ({{ model_display_names.get(r.model_name, r.model_name) }})</a></h4>
442 |           <p><span class="badge-{{ 'pass' if r.result=='PASS' else 'fail' }}">{{ r.result }}</span> | Latency {{ '%.2f'|format(r.generation.latency_s) }}s{% if r.generation.cached %} cached{% endif %}</p>
443 |           <p>Axe WCAG: {{ r.axe.failure_count if r.axe else 'n/a' }}{% if r.axe and r.axe.best_practice_count > 0 %} | BP: {{ r.axe.best_practice_count }}{% endif %}{% if r.generation.cost_usd is not none %} | ${{ '%.4f'|format(r.generation.cost_usd) }}{% endif %}</p>
444 |           {% if r.screenshot_path %}
445 |             {# Trim the first two path segments (e.g., 'runs/<run_id>/...') #}
446 |             {% set _parts = r.screenshot_path.split('/') %}
447 |             {% set _trimmed = '/'.join(_parts[2:]) %}
448 |             <figure>
449 |               <img src="{{ _trimmed }}" alt="Screenshot sample {{ r.sample_index }} for {{ r.test_name }} / {{ model_display_names.get(r.model_name, r.model_name) }}" style="max-width:320px;">
450 |             </figure>
451 |           {% endif %}
452 |           <details>
453 |             <summary>
454 |               Assertions
455 |               {% if r.test_function.status == "fail" %}
456 |                 <span role="img" aria-label="Fail">❌</span>
457 |               {% elif r.test_function.status == "pass" %}
458 |                 <span role="img" aria-label="Pass">✅</span>
459 |               {% endif %}
460 |             </summary>
461 |             <ul>
462 |               {% for a in r.test_function.assertions %}
463 |               <li>
464 |                 {% if a.status == "fail" %}
465 |                   <span role="img" aria-label="Fail">❌</span>:
466 |                 {% elif a.status == "pass" %}
467 |                   <span role="img" aria-label="Pass">✅</span>:
468 |                 {% endif %}
469 |                 {{ a.name }} ({{ a.type if a.type else 'R' }}): {{ a.status }}
470 |                 {% if a.message %} - {{ a.message }}{% endif %}
471 |               </li>
472 |               {% endfor %}
473 |             </ul>
474 |           </details>
475 |           {% if r.axe %}
476 |           {% if r.axe.failure_count > 0 %}
477 |           <details>
478 |             <summary>Axe WCAG Failures ({{ r.axe.failure_count }}) <span role="img" aria-label="Fail">❌</span></summary>
479 |             <ul>
480 |               {% for v in r.axe.failures %}
481 |               <li>({{ v.nodes|length }}x) - <strong>{{ v.id }}</strong> ({{ v.impact }}): {{ v.description }}</li>
482 |               {% endfor %}
483 |             </ul>
484 |           </details>
485 |           {% endif %}
486 |           {% if r.axe.best_practice_count > 0 %}
487 |           <details>
488 |             <summary>Axe Best Practice Issues ({{ r.axe.best_practice_count }}) <span role="img" aria-label="Warning">⚠️</span></summary>
489 |             <ul>
490 |               {% for v in r.axe.best_practice_failures %}
491 |               <li><strong>{{ v.id }}</strong> ({{ v.impact }}): {{ v.description }} <em>(Best Practice - does not affect pass/fail)</em></li>
492 |               {% endfor %}
493 |             </ul>
494 |           </details>
495 |           {% endif %}
496 |           {% endif %}
497 |         </div>
498 |       {% endfor %}
499 |       </div>
500 |     </details>
501 |     {% endfor %}
502 |   </details>
503 | </section>
504 | {% endfor %}
505 | </section>
506 | <section hidden>
507 |   <details>
508 |     <summary><h2>Costs</h2></summary>
509 |     <table>
510 |       <caption>Costs per model</caption>
511 |       <thead>
512 |         <tr><th>Model</th><th>Total Cost ($)</th><th>Avg Cost/Test ($)</th></tr>
513 |       </thead>
514 |       <tbody>
515 |         {% for model, stats in summary.items() %}
516 |         <tr>
517 |           <th>{{ model_display_names.get(model, model) }}</th>
518 |           <td>{{ "%.4f"|format(stats.total_cost) }}</td>
519 |           <td>{{ "%.4f"|format(stats.avg_cost) }}</td>
520 |         </tr>
521 |         {% endfor %}
522 |       </tbody>
523 |     </table>
524 |   </details>
525 | </section>
526 | </main>
527 | <footer>
528 | <p>GitHub Project: <a href="https://github.com/microsoft/a11y-llm-eval">a11y-llm-eval</a>. Run ID: {{ run_id }}</p>
529 | {{ footer_content|safe }}
530 | </footer>
531 | <script>
532 | document.addEventListener('DOMContentLoaded', function () {
533 |   const modelFilter = document.getElementById('model-filter');
534 |   const resultFilter = document.getElementById('result-filter');
535 |   const resetButton = document.getElementById('reset-filters');
536 |   const modelSections = Array.from(document.querySelectorAll('[data-model-group]'));
537 |   const allCards = Array.from(document.querySelectorAll('.sample-card'));
538 |   const noResultsMessage = document.getElementById('no-results-message');
539 |   const countEl = document.getElementById('filter-count');
540 |   const totalCardCount = allCards.length;
541 | 
542 |   function applyFilters() {
543 |     const modelValue = modelFilter ? modelFilter.value : '';
544 |     const resultValue = resultFilter ? resultFilter.value : '';
545 |     let anyVisible = false;
546 |     let visibleCardCount = 0;
547 | 
548 |     modelSections.forEach(function (section) {
549 |       const cards = Array.from(section.querySelectorAll('.sample-card'));
550 |       const sectionModel = section.getAttribute('data-model-group');
551 |       let sectionHasVisibleCard = false;
552 | 
553 |       cards.forEach(function (card) {
554 |         const cardModel = card.getAttribute('data-model');
555 |         const cardResult = card.getAttribute('data-result');
556 |         const matchesModel = !modelValue || cardModel === modelValue;
557 |         const matchesResult = !resultValue || cardResult === resultValue;
558 |         const shouldShowCard = matchesModel && matchesResult;
559 | 
560 |         card.style.display = shouldShowCard ? '' : 'none';
561 |         if (shouldShowCard) {
562 |           sectionHasVisibleCard = true;
563 |           visibleCardCount += 1;
564 |         }
565 |       });
566 | 
567 |       const sectionMatchesModel = !modelValue || sectionModel === modelValue;
568 |       const shouldShowSection = sectionMatchesModel && sectionHasVisibleCard;
569 |       section.style.display = shouldShowSection ? '' : 'none';
570 |       section.toggleAttribute('hidden', !shouldShowSection);
571 | 
572 |       if (shouldShowSection) {
573 |         anyVisible = true;
574 |       }
575 |     });
576 | 
577 |     if (noResultsMessage) {
578 |       noResultsMessage.hidden = anyVisible;
579 |     }
580 |     if (countEl) {
581 |       const message = anyVisible ? `Showing ${visibleCardCount} of ${totalCardCount} samples` : `Showing 0 of ${totalCardCount} samples`;
582 |       countEl.textContent = message;
583 |     }
584 |   }
585 | 
586 |   if (modelFilter) {
587 |     modelFilter.addEventListener('change', applyFilters);
588 |   }
589 |   if (resultFilter) {
590 |     resultFilter.addEventListener('change', applyFilters);
591 |   }
592 |   if (resetButton) {
593 |     resetButton.addEventListener('click', function () {
594 |       if (modelFilter) {
595 |         modelFilter.value = '';
596 |       }
597 |       if (resultFilter) {
598 |         resultFilter.value = '';
599 |       }
600 |       applyFilters();
601 |     });
602 |   }
603 | 
604 |   applyFilters();
605 | });
606 | </script>
607 | <script>
608 | // Heatmap coloring for pass@k cells: maps 0.0-1.0 to a green-ish scale and ensures text contrast
609 | document.addEventListener('DOMContentLoaded', function () {
610 |   function lerp(a, b, t) { return a + (b - a) * t; }
611 |   function hexToRgb(hex) {
612 |     const h = hex.replace('#','');
613 |     return [parseInt(h.substring(0,2),16), parseInt(h.substring(2,4),16), parseInt(h.substring(4,6),16)];
614 |   }
615 |   function rgbToHex(r,g,b){
616 |     return '#' + [r,g,b].map(x=>{ const s = Math.round(x).toString(16); return s.length===1 ? '0'+s : s; }).join('');
617 |   }
618 |   function luminance(r,g,b){
619 |     const rs = r/255, gs = g/255, bs = b/255;
620 |     const a = [rs,gs,bs].map(c=> c <= 0.03928 ? c/12.92 : Math.pow((c+0.055)/1.055, 2.4));
621 |     return 0.2126*a[0] + 0.7152*a[1] + 0.0722*a[2];
622 |   }
623 |   function contrastRatio(l1, l2){
624 |     const hi = Math.max(l1,l2), lo = Math.min(l1,l2);
625 |     return (hi + 0.05) / (lo + 0.05);
626 |   }
627 | 
628 |   // Define heatmap endpoints
629 |   const lowColor = getComputedStyle(document.documentElement).getPropertyValue('--heatmap-low').trim() || '#2d3748';
630 |   const highColor = getComputedStyle(document.documentElement).getPropertyValue('--heatmap-high').trim() || '#16a34a';
631 |   const lowRgb = hexToRgb(lowColor.replace(/\s/g,''));
632 |   const highRgb = hexToRgb(highColor.replace(/\s/g,''));
633 | 
634 |   const cells = Array.from(document.querySelectorAll('.pass-at-k-cell'));
635 |   cells.forEach(function(cell){
636 |     const v = cell.getAttribute('data-pass');
637 |     if (!v) return;
638 |     const t = Math.min(1, Math.max(0, parseFloat(v)));
639 |     const r = lerp(lowRgb[0], highRgb[0], t);
640 |     const g = lerp(lowRgb[1], highRgb[1], t);
641 |     const b = lerp(lowRgb[2], highRgb[2], t);
642 |     const bg = rgbToHex(r,g,b);
643 |     cell.style.backgroundColor = bg;
644 |     // compute readable foreground (black or white) based on WCAG contrast
645 |     const Lbg = luminance(r,g,b);
646 |     const Lwhite = luminance(255,255,255);
647 |     const Lblack = luminance(0,0,0);
648 |     const contrastWithWhite = contrastRatio(Lwhite, Lbg);
649 |     const contrastWithBlack = contrastRatio(Lblack, Lbg);
650 |     const fg = contrastWithWhite >= contrastWithBlack ? '#ffffff' : '#000000';
651 |     cell.style.color = fg;
652 |   });
653 | });
654 | </script>
655 | </body>
656 | </html>
657 | """
658 | 
659 | def render_report(run_json_path: Path, out_html: Path, models_cfg: dict):
660 |   data = orjson.loads(run_json_path.read_bytes())
661 |   meta_block = data.get("meta") or {}
662 |   sampling_meta = meta_block.get("sampling") or {}
663 |   prompting_meta = meta_block.get("prompting") or {}
664 |   from collections import defaultdict
665 | 
666 |   results = data.get("results", [])
667 | 
668 |   # Build display name mapping with precedence:
669 |   # 1. Stored meta.models_info
670 |   # 2. Provided models_cfg
671 |   # 3. Fallback to last path segment of model name
672 |   model_display_names = {}
673 |   for m in (meta_block.get("models_info") or []):
674 |     name = m.get("name")
675 |     if not name:
676 |       continue
677 |     model_display_names[name] = m.get("display_name") or name.split('/')[-1]
678 |     model_display_names[name.split('/')[-1]] = m.get("display_name") or name.split('/')[-1]
679 |   for m in (models_cfg.get("models") or []):
680 |     name = m.get("name")
681 |     if not name:
682 |       continue
683 |     display = m.get("display_name") or model_display_names.get(name) or name.split('/')[-1]
684 |     model_display_names[name] = display
685 |   # Ensure any model appearing only in results has a mapping
686 |   for r in results:
687 |     n = r.get("model_name")
688 |     if n and n not in model_display_names:
689 |       model_display_names[n] = n.split('/')[-1]
690 |   
691 |   per_model = defaultdict(lambda: {
692 |     "axe_failures": [],
693 |     "total_test_function_passes": 0,
694 |     "bp_passes": 0,
695 |     "total": 0,
696 |     "bp_total": 0,
697 |     "costs": [],
698 |     "axe_bp_failures": [],
699 |     "axe_bp_passes": 0,
700 |     "axe_bp_total": 0,
701 |     "total_axe_failures": 0,
702 |     "total_failures": 0,
703 |     "total_passes": 0,
704 |     "total_assertion_bp_failures": 0,
705 |     "total_assertion_failures": 0,
706 |   })
707 | 
708 |   for r in results:
709 |     model = r.get("model_name")
710 |     if not model:
711 |       continue
712 |     per_model[model]["total"] += 1
713 |     if r.get("result") == "PASS":
714 |       per_model[model]["total_passes"] += 1
715 |     # Determine test function pass count
716 |     if r.get("test_function", {}).get("status") == "pass":
717 |       per_model[model]["total_test_function_passes"] += 1
718 |     # Track best-practice assertions pass rate separately
719 |     assertions = r.get("test_function", {}).get("assertions", [])
720 |     bp_assertions = [a for a in assertions if (a.get("type") or "R").upper() == "BP"]
721 |     if bp_assertions:
722 |       per_model[model]["bp_total"] += 1 # treat per-test BP status aggregate: pass if all BP pass
723 |       if all(a.get("status") == "pass" for a in bp_assertions):
724 |         per_model[model]["bp_passes"] += 1
725 |     per_model[model]["total_assertion_bp_failures"] += r.get("test_function", {}).get("total_assertion_bp_failures", 0)
726 |     per_model[model]["total_assertion_failures"] += r.get("test_function", {}).get("total_assertion_failures", 0)
727 |     # Track axe failures (WCAG only now) and best practice failures
728 |     axe = r.get("axe") or {}
729 |     fc = axe.get("failure_count")
730 |     if fc is not None:
731 |       per_model[model]["axe_failures"].append(axe.get("failures", []))
732 |     per_model[model]["total_axe_failures"] += (fc or 0)
733 |     # Track axe best practice failures separately
734 |     bp_fc = axe.get("best_practice_count", 0)
735 |     per_model[model]["axe_bp_failures"].append(axe.get("best_practice_failures", []))
736 |     per_model[model]["axe_bp_total"] += bp_fc
737 |     if bp_fc == 0:
738 |       per_model[model]["axe_bp_passes"] += 1
739 |     gen = r.get("generation", {})
740 |     cost = gen.get("cost_usd")
741 |     if cost is not None:
742 |       try:
743 |         per_model[model]["costs"].append(float(cost))
744 |       except (TypeError, ValueError):
745 |         pass
746 |   # create summary
747 |   summary = {}
748 |   for m, s in per_model.items():
749 |     avg_axe_failures = s["total_axe_failures"] / s["total"] if s["total"] else 0.0
750 |     total_cost = sum(s["costs"]) if s["costs"] else 0.0
751 |     avg_cost = (total_cost / s["total"]) if s["total"] else 0.0
752 |     total_bp_failures = s["total_assertion_bp_failures"] + s["axe_bp_total"]
753 |     total_axe_failures = s["total_axe_failures"]
754 |     total_assertion_failures = s["total_assertion_failures"]
755 |     total_assertion_bp_failures = s["total_assertion_bp_failures"]
756 |     avg_assertion_failures = (total_assertion_failures / s["total"]) if s["total"] else 0.0
757 |     avg_bp_failures = (total_bp_failures / s["total"]) if s["total"] else 0.0
758 |     total_failures = total_assertion_failures + total_axe_failures
759 |     avg_failures = (total_failures / s["total"]) if s["total"] else 0.0
760 |     summary[m] = {
761 |       "avg_axe_failures": avg_axe_failures,
762 |       "pass_rate": s["total_passes"] / s["total"] if s["total"] else 0,
763 |       "total_cost": total_cost,
764 |       "avg_cost": avg_cost,
765 |       "total_assertion_failures": total_assertion_failures,
766 |       "total_assertion_bp_failures": total_assertion_bp_failures,
767 |       "avg_assertion_failures": avg_assertion_failures,
768 |       "avg_bp_failures": avg_bp_failures,
769 |       "total_failures": total_failures,
770 |       "avg_failures": avg_failures,
771 |     }
772 | 
773 |   # Group samples by (test_name, model_name)
774 |   grouped = {}
775 |   for r in results:
776 |     key = (r.get("test_name"), r.get("model_name"))
777 |     grouped.setdefault(key, []).append(r)
778 |   # Sort samples by sample_index if present
779 |   grouped_results = OrderedDict()
780 |   agg_index = {}
781 |   # Enhance aggregates with display_model_name (provider prefix stripped)
782 |   for a in (data.get("aggregates") or []):
783 |     agg_index[(a.get("test_name"), a.get("model_name"))] = a
784 | 
785 |   prompts_map = (data.get("prompts") or {})
786 |   for (test_name, model_name), samples in sorted(grouped.items()):
787 |     samples_sorted = sorted(samples, key=lambda x: (x.get("sample_index") is None, x.get("sample_index") or 0))
788 |     test_entry = grouped_results.setdefault(test_name, {"prompt": prompts_map.get(test_name), "models": []})
789 |     test_entry["models"].append({
790 |       "model_name": model_name,
791 |       "samples": samples_sorted,
792 |       "aggregate": agg_index.get((test_name, model_name)),
793 |     })
794 | 
795 |   summary = OrderedDict(sorted(summary.items(), key=lambda item: (-item[1]["pass_rate"], item[1]["avg_failures"])) )
796 |   # Build aggregates_by_test: for each test, list all models and their aggregates (ensures unique table per test)
797 |   aggregates_by_test = OrderedDict()
798 |   tests_in_order = list(grouped_results.keys())
799 |   for test_name in tests_in_order:
800 |     models_info = grouped_results.get(test_name, {}).get('models', [])
801 |     rows = []
802 |     ks_set = []
803 |     for m in models_info:
804 |       model_name = m.get('model_name')
805 |       agg = agg_index.get((test_name, model_name))
806 |       if agg:
807 |         pass_at_k = agg.get('pass_at_k') or {}
808 |         # preserve order of keys as they appear; avoid duplicates
809 |         for k in pass_at_k.keys():
810 |           if k not in ks_set:
811 |             ks_set.append(k)
812 |         rows.append({
813 |           'model_name': model_name,
814 |           'n_samples': agg.get('n_samples', 0),
815 |           'n_pass': agg.get('n_pass', 0),
816 |           'pass_at_k': pass_at_k,
817 |         })
818 |       else:
819 |         rows.append({
820 |           'model_name': model_name,
821 |           'n_samples': 0,
822 |           'n_pass': 0,
823 |           'pass_at_k': {},
824 |         })
825 |     # try to sort ks numerically when possible
826 |     try:
827 |       ks_sorted = sorted(ks_set, key=lambda x: int(x))
828 |     except Exception:
829 |       ks_sorted = sorted(ks_set)
830 |     aggregates_by_test[test_name] = {'rows': rows, 'ks': ks_sorted}
831 | 
832 |   html = Template(TEMPLATE).render(
833 |     run_id=data.get("run_id", "unknown"),
834 |     models=data.get("models", []),
835 |     model_display_names=model_display_names,
836 |     tests=data.get("tests", []),
837 |     summary=summary,
838 |     results=results,
839 |     aggregates=data.get("aggregates", []),
840 |     aggregates_by_test=aggregates_by_test,
841 |     grouped_results=grouped_results,
842 |     site_name=os.getenv("SITE_NAME", "A11y LLM Eval"),
843 |     footer_content=os.getenv("FOOTER_CONTENT", ""),
844 |     n_samples=sampling_meta.get("samples_per_case", 0),
845 |     prompting_meta=prompting_meta,
846 |   )
847 |   out_html.write_text(html, encoding="utf-8")
848 | 


--------------------------------------------------------------------------------