├── .env.example ├── test_cases ├── modal-dialog │ ├── prompt.md │ ├── examples │ │ ├── pass-native.html │ │ ├── fail-popover.html │ │ ├── fail-aria.html │ │ ├── pass-aria.html │ │ ├── fail-has-css-transition.html │ │ └── fail-has-js-delay.html │ └── test.js ├── shopping-home-page │ ├── prompt.md │ ├── examples │ │ ├── fail.html │ │ └── pass.html │ └── test.js ├── disclosure-widget │ ├── prompt.md │ ├── examples │ │ ├── good-details-summary.html │ │ ├── bad-missing-all-semantics.html │ │ ├── bad-hidden-content-in-reading-order.html │ │ ├── good-button.html │ │ └── good-hidden-from-everyone.html │ └── test.js └── readme.md ├── requirements.txt ├── setup.cfg ├── scripts └── install_node_deps.sh ├── config └── models.yaml.example ├── a11y_llm_tests ├── __init__.py ├── utils.py ├── cache.py ├── node_bridge.py ├── metrics.py ├── schema.py ├── generator.py ├── cli.py └── report.py ├── node_runner ├── package.json ├── package-lock.json └── runner.js ├── SECURITY.md ├── CODE_OF_CONDUCT.md ├── tests ├── test_metrics.py ├── test_generator_prompts.py ├── test_cli_sampling.py └── test_examples.py ├── .github └── workflows │ └── build.yml ├── LICENSE ├── SUPPORT.md ├── README.md └── .gitignore /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | ANTHROPIC_API_KEY= 3 | GOOGLE_API_KEY= 4 | -------------------------------------------------------------------------------- /test_cases/modal-dialog/prompt.md: -------------------------------------------------------------------------------- 1 | Create an example of a modal dialog component. It is closed by default, and the button to open it has a `trigger` class. -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | litellm 2 | pydantic>=2 3 | rich 4 | jinja2 5 | aiofiles 6 | httpx 7 | python-dotenv 8 | orjson 9 | typer 10 | beautifulsoup4 11 | lxml 12 | pytest 13 | pyyaml 14 | -------------------------------------------------------------------------------- /test_cases/shopping-home-page/prompt.md: -------------------------------------------------------------------------------- 1 | Generate an HTML file for the home page of a shopping website. The home page has several sections of content for popular items, deals, and blog posts. 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = a11y-llm-tests 3 | version = 0.1.0 4 | 5 | [options] 6 | packages = find: 7 | python_requires = >=3.10 8 | 9 | [tool:pytest] 10 | pythonpath = . 11 | addopts = -ra 12 | -------------------------------------------------------------------------------- /test_cases/disclosure-widget/prompt.md: -------------------------------------------------------------------------------- 1 | - Generate an HTML file that demonstrates an expand/collapse widget. 2 | - Wrap each widget with a div that has an `example` class. 3 | - Give the container for controlled content a `details` class. 4 | -------------------------------------------------------------------------------- /scripts/install_node_deps.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | cd "$(dirname "$0")/../node_runner" 4 | if [ ! 
-f package.json ]; then 5 | echo "package.json missing" >&2 6 | exit 1 7 | fi 8 | npm install 9 | npx playwright install chromium 10 | echo "Node dependencies installed." 11 | -------------------------------------------------------------------------------- /config/models.yaml.example: -------------------------------------------------------------------------------- 1 | defaults: 2 | #temperature: 0.2 3 | #max_tokens: 1200 4 | #system_prompt: | 5 | # You are generating a single standalone HTML document. 6 | #custom_instructions_markdown: custom-instructions.md 7 | models: 8 | - name: azure_ai/gpt-5-mini 9 | display_name: GPT-5 Mini 10 | - name: claude-sonnet-4-20250514 11 | display_name: Claud Sonnet 4 12 | -------------------------------------------------------------------------------- /a11y_llm_tests/__init__.py: -------------------------------------------------------------------------------- 1 | """a11y_llm_tests 2 | 3 | Toolkit to evaluate how well various LLMs generate accessible HTML. 4 | 5 | Primary entrypoints: 6 | - cli.py (Typer CLI) 7 | - generator.py (LLM generation + caching) 8 | - node_bridge.py (Playwright + axe-core invocation) 9 | - report.py (HTML report rendering) 10 | """ 11 | 12 | __all__ = [ 13 | "generator", 14 | "node_bridge", 15 | "report", 16 | ] 17 | -------------------------------------------------------------------------------- /a11y_llm_tests/utils.py: -------------------------------------------------------------------------------- 1 | """Miscellaneous utility helpers (placeholder).""" 2 | 3 | def ensure_single_html(doc: str) -> str: 4 | """Return only the first ... segment if multiple exist.""" 5 | lower = doc.lower() 6 | if "" in lower: 7 | start = lower.index("") + len("") 9 | return doc[start:end] 10 | return doc 11 | -------------------------------------------------------------------------------- /node_runner/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "a11y-llm-tests-runner", 3 | "version": "0.1.0", 4 | "private": true, 5 | "type": "commonjs", 6 | "description": "Playwright + axe-core runner for a11y-llm-tests", 7 | "main": "runner.js", 8 | "scripts": { 9 | "start": "node runner.js" 10 | }, 11 | "dependencies": { 12 | "axe-core": "^4.10.0", 13 | "deepmerge": "^4.3.1", 14 | "playwright": "^1.48.0" 15 | }, 16 | "engines": { 17 | "node": ">=18" 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which 6 | includes all source code repositories in our GitHub organizations. 7 | 8 | **Please do not report security vulnerabilities through public GitHub issues.** 9 | 10 | For security reporting information, locations, contact information, and policies, 11 | please review the latest guidance for Microsoft repositories at 12 | [https://aka.ms/SECURITY.md](https://aka.ms/SECURITY.md). 13 | 14 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | - Employees can reach out at [aka.ms/opensource/moderation-support](https://aka.ms/opensource/moderation-support) 11 | -------------------------------------------------------------------------------- /a11y_llm_tests/cache.py: -------------------------------------------------------------------------------- 1 | """Cache utilities for generation artifacts. 2 | 3 | Currently minimal: provides helper to compose cache keys that account for model, 4 | prompt hash, and optional seed, ensuring sampled generations can coexist. 5 | """ 6 | 7 | from pathlib import Path 8 | from typing import Optional 9 | 10 | CACHE_ROOT = Path('.cache') 11 | CACHE_ROOT.mkdir(exist_ok=True) 12 | 13 | def generation_cache_key(model: str, prompt_hash: str, seed: Optional[int] = None) -> str: 14 | """Return a filename-safe cache key for a generation. 15 | 16 | Example: modelabc_deadbeef or modelabc_deadbeef_s42 17 | """ 18 | if seed is None: 19 | return f"{model}_{prompt_hash}" 20 | return f"{model}_{prompt_hash}_s{seed}" 21 | 22 | __all__ = ["generation_cache_key", "CACHE_ROOT"] 23 | -------------------------------------------------------------------------------- /test_cases/disclosure-widget/examples/good-details-summary.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Details/Summary Disclosure 6 | 12 | 13 | 14 |

Details/Summary Disclosure

15 | 16 |
17 |
18 | Show Details 19 |

This is the hidden content that can be disclosed.

20 |
21 |
22 | 23 | -------------------------------------------------------------------------------- /test_cases/disclosure-widget/examples/bad-missing-all-semantics.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Missing all semantics 6 | 12 | 13 | 14 |

Missing all semantics

15 |
16 |
Show Details
17 | 20 |
21 | 22 | -------------------------------------------------------------------------------- /test_cases/disclosure-widget/examples/bad-hidden-content-in-reading-order.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Hidden content is still in reading order 6 | 12 | 13 | 14 |

Hidden content is still in reading order

15 |
16 | 17 |
This is the hidden content that can be disclosed.
18 |
19 | 20 | -------------------------------------------------------------------------------- /test_cases/disclosure-widget/examples/good-button.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Basic Disclosure using button and aria-expanded 6 | 12 | 13 | 14 |

Basic Disclosure using button and aria-expanded

15 | 16 |
17 | 18 | 21 |
22 | 23 | -------------------------------------------------------------------------------- /test_cases/disclosure-widget/examples/good-hidden-from-everyone.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Hidden content is hidden from everyone (opacity and aria-hidden) 6 | 12 | 13 | 14 |

Hidden content is hidden from everyone (opacity and aria-hidden)

15 | 16 |
17 | 18 | 19 |
20 | 21 | -------------------------------------------------------------------------------- /tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | from a11y_llm_tests.metrics import compute_pass_at_k, format_pass_at_k 2 | 3 | 4 | def test_pass_at_k_basic_cases(): 5 | # All fail 6 | assert compute_pass_at_k(0, 5, [1, 2, 5]) == {1: 0.0, 2: 0.0, 5: 0.0} 7 | # All pass 8 | assert compute_pass_at_k(5, 5, [1, 2, 5]) == {1: 1.0, 2: 1.0, 5: 1.0} 9 | # Example: n=5, c=1 10 | r = compute_pass_at_k(1, 5, [1, 2]) 11 | # pass@1 = c/n = 0.2 12 | assert abs(r[1] - 0.2) < 1e-9 13 | # pass@2 = 1 - ( (4C2)/(5C2) ) = 1 - (6/10) = 0.4 14 | assert abs(r[2] - 0.4) < 1e-9 15 | 16 | 17 | def test_pass_at_k_edge_values(): 18 | # k larger than n 19 | r = compute_pass_at_k(1, 3, [5]) 20 | # k treated as n => probability that at least one passes = 1 when c>0 21 | assert r[5] == 1.0 22 | # zero samples 23 | r0 = compute_pass_at_k(0, 0, [1, 5]) 24 | assert r0 == {1: 0.0, 5: 0.0} 25 | 26 | 27 | def test_format_pass_at_k(): 28 | formatted = format_pass_at_k({5: 1.0, 1: 0.2}) 29 | # Keys become strings and sorted 30 | assert list(formatted.keys()) == ["1", "5"] 31 | assert formatted["1"] == 0.2 32 | assert formatted["5"] == 1.0 33 | -------------------------------------------------------------------------------- /test_cases/shopping-home-page/examples/fail.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Example fail 6 | 17 | 18 | 19 |
Header content 20 |
nav link 1nav link 2
21 |
22 |
23 |
page title
24 |
content
25 |
section title
26 |
content
27 |
section title
28 |
content
29 |
30 |
31 | Footer 32 |
33 | 34 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Build 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v5 17 | - name: Set up Python 3.11 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: "3.11" 21 | - name: Set up Node.js 22 | uses: actions/setup-node@v4 23 | with: 24 | node-version: "22.x" 25 | - name: Disable AppArmor 26 | run: echo 0 | sudo tee /proc/sys/kernel/apparmor_restrict_unprivileged_userns 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install flake8 pytest 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | bash scripts/install_node_deps.sh 33 | - name: Test with pytest 34 | run: | 35 | python -m pytest -s 36 | -------------------------------------------------------------------------------- /tests/test_generator_prompts.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from a11y_llm_tests import generator 4 | 5 | 6 | @pytest.fixture(autouse=True) 7 | def reset_prompts(): 8 | generator.configure_prompts(None, None) 9 | yield 10 | generator.configure_prompts(None, None) 11 | 12 | 13 | def test_compute_prompt_hash_changes_with_system_prompt(): 14 | baseline = generator.compute_prompt_hash("Prompt body") 15 | generator.configure_prompts("Revised system prompt", None) 16 | changed = generator.compute_prompt_hash("Prompt body") 17 | assert baseline != changed 18 | 19 | 20 | def test_hash_changes_with_custom_instructions(): 21 | generator.configure_prompts(None, "Alpha instructions") 22 | first = generator.compute_prompt_hash("Prompt body") 23 | generator.configure_prompts(None, "Beta instructions") 24 | second = generator.compute_prompt_hash("Prompt body") 25 | assert first != second 26 | 27 | 28 | def test_effective_system_prompt_includes_custom_instructions(): 29 | generator.configure_prompts("Base prompt", "### Custom\n- Item") 30 | effective = generator.get_effective_system_prompt() 31 | assert "Base prompt" in effective 32 | assert "### Custom" in effective 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /test_cases/shopping-home-page/examples/pass.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Example pass 6 | 17 | 18 | 19 |
Header content 20 | 26 |
27 |
28 |

page title

29 |
content
30 |

section title

31 |
content
32 |

section title

33 |
content
34 |
35 |
36 | Footer 37 |
38 | 39 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /test_cases/shopping-home-page/test.js: -------------------------------------------------------------------------------- 1 | // New harness signature with dependency injection 2 | module.exports.run = async ({ page, assert }) => { 3 | await assert("Has an h1", async () => { 4 | const h1 = await page.$$eval("h1", els => els.length); 5 | const roleH1 = await page.$$eval("[role='heading'][aria-level='1']", els => els.length); 6 | return (h1 + roleH1) >= 1; 7 | }); 8 | 9 | await assert("Has single h1", async () => { 10 | const h1 = await page.getByRole('heading', { level: 1 }); 11 | const count = await h1.count(); 12 | return count === 1; 13 | }, {type: 'BP'}); 14 | 15 | await assert("Has at least one h2", async () => { 16 | const h2 = await page.getByRole('heading', { level: 2 }); 17 | const count = await h2.count(); 18 | return count >= 1; 19 | }); 20 | 21 | await assert("Has a single banner", async () => { 22 | const banner = await page.getByRole('banner'); 23 | return (await banner.count()) === 1; 24 | }); 25 | 26 | await assert("Has a single maincontent", async () => { 27 | const main = await page.getByRole('main'); 28 | return (await main.count()) === 1; 29 | }); 30 | 31 | await assert("Has a single navigation", async () => { 32 | const nav = await page.getByRole('navigation'); 33 | return (await nav.count()) === 1; 34 | }); 35 | 36 | await assert("Has a single footer", async () => { 37 | let footer = await page.getByRole('contentinfo'); 38 | return (await footer.count()) === 1; 39 | }); 40 | 41 | return {}; // assertions collected via injected assert 42 | }; 43 | -------------------------------------------------------------------------------- /test_cases/disclosure-widget/test.js: -------------------------------------------------------------------------------- 1 | // New harness signature: module.exports.run = async ({ page, assert }) => { ... 
} 2 | module.exports.run = async ({ page, assert }) => { 3 | const examples = await page.$$(".example"); 4 | 5 | const hasValidSemantics = async (example) => { 6 | if (await example.$("button[aria-expanded], [role='button'][aria-expanded]")) { 7 | return true; 8 | } 9 | if (await example.$("details summary")) { 10 | return true; 11 | } 12 | return false; 13 | } 14 | 15 | await assert("All examples have a valid semantics", async () => { 16 | for (const example of examples) { 17 | if (! await hasValidSemantics(example)) { 18 | return false; 19 | } 20 | } 21 | return true; 22 | }); 23 | 24 | await assert("Collapsed content is hidden from assistive technology", async () => { 25 | for (const example of examples) { 26 | if (!await example.$("button[aria-expanded=false], [role='button'][aria-expanded=false]")) { 27 | // Only check button implementations 28 | continue; 29 | } 30 | 31 | let isHidden = await example.$eval(".details", el => { 32 | // Use axe-core's isVisible util to determine if hidden from sighted users but available to AT 33 | let isVisuallyHidden = !window.axe.commons.dom.isVisible(el, false, true); 34 | let isScreenReaderHidden = !window.axe.commons.dom.isVisible(el, true, true); 35 | return isVisuallyHidden && isScreenReaderHidden; 36 | }); 37 | 38 | if (!isHidden) { 39 | return false; 40 | } 41 | } 42 | return true; 43 | }); 44 | }; 45 | 46 | -------------------------------------------------------------------------------- /test_cases/readme.md: -------------------------------------------------------------------------------- 1 | # Example Test Structure Documentation 2 | 3 | This document explains the new structure for test cases and their examples. 4 | 5 | ## Directory Structure 6 | 7 | Each test case should have the following structure: 8 | 9 | ``` 10 | test_cases/ 11 | └── / 12 | ├── prompt.md # Prompt to generate code for the test case 13 | ├── test.js # JavaScript test assertions 14 | └── examples/ # Directory containing example HTML files and expectations 15 | ├── example1.html # HTML example file with embedded json expectations for assertions 16 | ├── example2.html # Another HTML example file with embedded json expectations for assertions 17 | ``` 18 | 19 | ## Embedded JSON expectations for assertions 20 | 21 | Each HTML example file should have a script tag in the `` that defines which assertions should pass or fail for that specific example. 22 | 23 | ### Format 24 | 25 | ```html 26 | 31 | ``` 32 | 33 | ### Example 34 | 35 | ```html 36 | 42 | ``` 43 | 44 | ## Benefits 45 | 46 | 1. **Performance**: Each HTML file is only processed once by the node runner, regardless of how many assertions it contains. 47 | 2. **Flexibility**: You can have multiple example files for each test case, each with different assertion expectations. 48 | 3. **Granular Testing**: Individual assertions can be tested separately, making it easier to identify specific failures. 49 | 4. **Clear Expectations**: The JSON make it explicit which assertions should pass or fail for each example. 50 | -------------------------------------------------------------------------------- /a11y_llm_tests/node_bridge.py: -------------------------------------------------------------------------------- 1 | """Bridge for invoking the Node-based Playwright + axe-core runner. 2 | 3 | The API is intentionally small: ``run`` executes a single HTML + 4 | test.js pair and returns a JSON-compatible dict produced by the Node script. 
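Illustrative usage (paths and variable names here are examples only): ``run(html_string, "test_cases/modal-dialog/test.js", None)`` returns a dict whose keys mirror the runner's JSON output (``testFunctionResult``, ``axeResult``, ``consoleLogs``, ``error``), plus ``duration_s`` added by this bridge.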
5 | """ 6 | from __future__ import annotations 7 | 8 | import subprocess 9 | import tempfile 10 | import json 11 | import os 12 | import pathlib 13 | import time 14 | from typing import Optional, Dict, Any 15 | 16 | _NODE_DIR = pathlib.Path(__file__).resolve().parent.parent / "node_runner" 17 | PLAYWRIGHT_RUNNER = _NODE_DIR / "runner.js" 18 | 19 | 20 | def run(html: str, test_js_path: str, screenshot_path: Optional[str]) -> Dict[str, Any]: 21 | if not PLAYWRIGHT_RUNNER.exists(): 22 | return {"error": f"Runner script not found: {PLAYWRIGHT_RUNNER}", "duration_s": 0.0, "engine": "playwright"} 23 | with tempfile.TemporaryDirectory() as td: 24 | html_path = os.path.join(td, "gen.html") 25 | out_json = os.path.join(td, "out.json") 26 | with open(html_path, "w", encoding="utf-8") as f: 27 | f.write(html) 28 | args = [ 29 | "node", 30 | str(PLAYWRIGHT_RUNNER), 31 | html_path, 32 | test_js_path, 33 | out_json, 34 | screenshot_path or "", 35 | ] 36 | start = time.time() 37 | proc = subprocess.run(args, capture_output=True, text=True) 38 | duration = time.time() - start 39 | if proc.returncode != 0: 40 | return {"error": f"Node runner failed: {proc.stderr}", "duration_s": duration, "engine": "playwright"} 41 | try: 42 | with open(out_json, "r", encoding="utf-8") as jf: 43 | data = json.load(jf) 44 | except Exception as e: 45 | return {"error": f"Failed reading JSON output: {e}", "duration_s": duration, "engine": "playwright"} 46 | data["duration_s"] = duration 47 | data.setdefault("engine", "playwright") 48 | return data 49 | -------------------------------------------------------------------------------- /a11y_llm_tests/metrics.py: -------------------------------------------------------------------------------- 1 | """Metrics utilities for evaluating multiple sampled generations (pass@k).""" 2 | from __future__ import annotations 3 | from math import comb 4 | from typing import Iterable, Dict, List 5 | 6 | 7 | def compute_pass_at_k(c: int, n: int, ks: Iterable[int]) -> Dict[int, float]: 8 | """Compute pass@k for given counts. 9 | 10 | pass@k = 1 - ((n-c choose k) / (n choose k)) for 0 < c < n and k <= n. 11 | Handles edge cases: 12 | - If c == 0 -> 0.0 13 | - If c == n -> 1.0 14 | - If k > n -> treat k as n (probability reduces to c>0 ? 1 : 0) 15 | - If k <= 0 -> 0.0 16 | Parameters: 17 | c: number of passing samples 18 | n: total number of samples 19 | ks: iterable of k values 20 | Returns: 21 | Dict mapping each k to probability (float) 22 | Raises: 23 | ValueError if counts invalid. 
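Worked example (consistent with tests/test_metrics.py): for c=1 passing sample out of n=5, pass@1 = 1 - C(4,1)/C(5,1) = 1 - 4/5 = 0.2 and pass@2 = 1 - C(4,2)/C(5,2) = 1 - 6/10 = 0.4.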
24 | """ 25 | if n < 0 or c < 0 or c > n: 26 | raise ValueError("Require 0 <= c <= n and n >= 0") 27 | if n == 0: 28 | return {int(k): 0.0 for k in ks} 29 | 30 | result: Dict[int, float] = {} 31 | for k in ks: 32 | k_int = int(k) 33 | if k_int <= 0: 34 | result[k_int] = 0.0 35 | continue 36 | k_eff = k_int if k_int <= n else n 37 | if c == 0: 38 | result[k_int] = 0.0 39 | continue 40 | if c == n: 41 | result[k_int] = 1.0 42 | continue 43 | if k_eff == 0: 44 | result[k_int] = 0.0 45 | continue 46 | numerator = comb(n - c, k_eff) if (n - c) >= k_eff else 0 47 | denominator = comb(n, k_eff) 48 | result[k_int] = 1.0 - (numerator / denominator) 49 | return result 50 | 51 | 52 | def format_pass_at_k(pass_at_k: Dict[int, float]) -> Dict[str, float]: 53 | """Convert int keys to strings for JSON serialization stability.""" 54 | return {str(k): float(v) for k, v in sorted(pass_at_k.items(), key=lambda x: x[0])} 55 | 56 | 57 | __all__ = ["compute_pass_at_k", "format_pass_at_k"] 58 | -------------------------------------------------------------------------------- /test_cases/modal-dialog/examples/pass-native.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Pass - Native - Modal Dialog example 8 | 9 | 10 | 20 | 21 | 22 |
23 |

Pass - Native - Modal Dialog example

24 |
25 | 26 | 27 |

Test content

28 |
29 | 30 |
31 |
32 | 40 | 45 |
46 | 47 |
48 | 49 | 50 |

Test content 2

51 |
52 | 53 |
54 |
55 | 63 | 68 |
69 |
70 | 71 | 72 | -------------------------------------------------------------------------------- /node_runner/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "a11y-llm-tests-runner", 3 | "version": "0.1.0", 4 | "lockfileVersion": 3, 5 | "requires": true, 6 | "packages": { 7 | "": { 8 | "name": "a11y-llm-tests-runner", 9 | "version": "0.1.0", 10 | "dependencies": { 11 | "axe-core": "^4.10.0", 12 | "deepmerge": "^4.3.1", 13 | "playwright": "^1.48.0" 14 | }, 15 | "engines": { 16 | "node": ">=18" 17 | } 18 | }, 19 | "node_modules/axe-core": { 20 | "version": "4.10.3", 21 | "resolved": "https://registry.npmjs.org/axe-core/-/axe-core-4.10.3.tgz", 22 | "integrity": "sha512-Xm7bpRXnDSX2YE2YFfBk2FnF0ep6tmG7xPh8iHee8MIcrgq762Nkce856dYtJYLkuIoYZvGfTs/PbZhideTcEg==", 23 | "license": "MPL-2.0", 24 | "engines": { 25 | "node": ">=4" 26 | } 27 | }, 28 | "node_modules/deepmerge": { 29 | "version": "4.3.1", 30 | "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz", 31 | "integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==", 32 | "license": "MIT", 33 | "engines": { 34 | "node": ">=0.10.0" 35 | } 36 | }, 37 | "node_modules/fsevents": { 38 | "version": "2.3.2", 39 | "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", 40 | "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", 41 | "hasInstallScript": true, 42 | "license": "MIT", 43 | "optional": true, 44 | "os": [ 45 | "darwin" 46 | ], 47 | "engines": { 48 | "node": "^8.16.0 || ^10.6.0 || >=11.0.0" 49 | } 50 | }, 51 | "node_modules/playwright": { 52 | "version": "1.56.0", 53 | "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.56.0.tgz", 54 | "integrity": "sha512-X5Q1b8lOdWIE4KAoHpW3SE8HvUB+ZZsUoN64ZhjnN8dOb1UpujxBtENGiZFE+9F/yhzJwYa+ca3u43FeLbboHA==", 55 | "license": "Apache-2.0", 56 | "dependencies": { 57 | "playwright-core": "1.56.0" 58 | }, 59 | "bin": { 60 | "playwright": "cli.js" 61 | }, 62 | "engines": { 63 | "node": ">=18" 64 | }, 65 | "optionalDependencies": { 66 | "fsevents": "2.3.2" 67 | } 68 | }, 69 | "node_modules/playwright-core": { 70 | "version": "1.56.0", 71 | "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.56.0.tgz", 72 | "integrity": "sha512-1SXl7pMfemAMSDn5rkPeZljxOCYAmQnYLBTExuh6E8USHXGSX3dx6lYZN/xPpTz1vimXmPA9CDnILvmJaB8aSQ==", 73 | "license": "Apache-2.0", 74 | "bin": { 75 | "playwright-core": "cli.js" 76 | }, 77 | "engines": { 78 | "node": ">=18" 79 | } 80 | } 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /test_cases/modal-dialog/examples/fail-popover.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Fail - custom - modal dialog example 8 | 9 | 10 | 20 | 21 | 22 |
23 |

Fail - popover - modal dialog example

24 | 25 |

The following are examples of failing modal dialogs that are implemented as popovers.

26 | 27 |

Example 1

28 |

29 | This example has role of dialog but does not trap keyboard or screen reader focus. 30 | Content behind the dialog is still available to keyboard and screen reader users. 31 |

32 |
33 | 34 | 35 | 40 | 41 | 46 |
47 | 48 |

Example 2

49 |

50 | This example does not have a role of dialog and does not trap keyboard or screen reader focus. 51 | Content behind the dialog is still available to keyboard and screen reader users. 52 |

53 |
54 | 55 | 56 |
57 |

popover 2

58 |

Look ma, no JS! But is it good enough?

59 | 60 |
61 | 62 | 67 |
68 |
69 | 70 | 71 | -------------------------------------------------------------------------------- /a11y_llm_tests/schema.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import List, Optional, Dict, Any 3 | from datetime import datetime 4 | 5 | 6 | class AssertionResult(BaseModel): 7 | name: str 8 | status: str # pass|fail 9 | message: Optional[str] = None 10 | type: str = "R" # R = Requirement (default), BP = Best Practice 11 | 12 | def model_post_init(self, __context): # type: ignore[override] 13 | # Normalize and validate type for backward compatibility 14 | t = (self.type or "R").upper() 15 | if t not in {"R", "BP"}: 16 | t = "R" 17 | object.__setattr__(self, "type", t) 18 | 19 | 20 | class TestFunctionResult(BaseModel): 21 | status: str # pass|fail|error|timeout 22 | assertions: List[AssertionResult] = [] 23 | error: Optional[str] = None 24 | duration_ms: Optional[int] = None 25 | total_assertion_failures: int = 0 26 | total_assertion_bp_failures: int = 0 27 | 28 | 29 | class AxeNode(BaseModel): 30 | html: Optional[str] 31 | target: List[str] = [] 32 | 33 | 34 | class AxeFailure(BaseModel): 35 | id: str 36 | impact: Optional[str] 37 | description: str 38 | helpUrl: Optional[str] 39 | nodes: List[AxeNode] = [] 40 | tags: List[str] = [] 41 | 42 | 43 | class AxeResult(BaseModel): 44 | failure_count: int # WCAG failures only (affects pass/fail) 45 | failures: List[AxeFailure] = [] # WCAG failures only 46 | best_practice_count: int = 0 # Best practice failures (informational) 47 | best_practice_failures: List[AxeFailure] = [] # Best practice failures 48 | 49 | 50 | class GenerationMeta(BaseModel): 51 | latency_s: float 52 | prompt_hash: str 53 | cached: bool 54 | tokens_in: Optional[int] = None 55 | tokens_out: Optional[int] = None 56 | total_tokens: Optional[int] = None 57 | cost_usd: Optional[float] = None 58 | # Added for sampling diversity / metadata 59 | seed: Optional[int] = None 60 | temperature: Optional[float] = None 61 | system_prompt: Optional[str] = None 62 | custom_instructions: Optional[str] = None 63 | effective_system_prompt: Optional[str] = None 64 | 65 | 66 | class ResultRecord(BaseModel): 67 | test_name: str 68 | model_name: str 69 | timestamp: datetime 70 | generation_html_path: str 71 | screenshot_path: Optional[str] 72 | test_function: TestFunctionResult 73 | axe: Optional[AxeResult] 74 | result: str # PASS|FAIL|ERROR 75 | generation: GenerationMeta 76 | # Index of the sample for (test_name, model_name). 0-based. None for legacy single-sample runs. 77 | sample_index: Optional[int] = None 78 | 79 | 80 | class RunSummary(BaseModel): 81 | run_id: str 82 | created_at: datetime 83 | results: List[ResultRecord] 84 | models: List[str] 85 | tests: List[str] 86 | 87 | 88 | class AggregateStats(BaseModel): 89 | per_model: Dict[str, Dict[str, Any]] 90 | 91 | 92 | class AggregateRecord(BaseModel): 93 | """Aggregate statistics for a (test_name, model_name) pair across multiple samples.""" 94 | test_name: str 95 | model_name: str 96 | n_samples: int 97 | n_pass: int 98 | pass_at_k: Dict[str, float] # JSON-friendly string keys 99 | k_values: List[int] 100 | computed_at: datetime 101 | 102 | -------------------------------------------------------------------------------- /test_cases/modal-dialog/examples/fail-aria.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Fail - ARIA - Modal Dialog Examples 8 | 9 | 10 | 20 | 21 | 22 |
23 |

Fail - ARIA - Modal Dialog Examples

24 | 25 |

26 | This is a failing example of an ARIA modal dialog. It has an incorrect role, does not hide content behind the dialog from 27 | keyboard and screen reader users, and does not manage keyboard focus. 28 |

29 | 30 | 31 | 32 |
33 |
34 |

ARIA can be used to create modal dialogs.

35 |

There's extra work to make them properly. The native HTML 36 | dialog element 37 | handles a lot of this for us automatically.

38 |

39 | It's generally better to use the native HTML dialog unless 40 | you have specific use cases where a custom dialog might perform better. 41 | But even then, you may be able to still use native HTML features 42 | over ARIA... just sayin... 43 |

44 | 45 |
46 |
47 | 48 | 49 | 69 | 95 |
96 | 97 | 98 | -------------------------------------------------------------------------------- /node_runner/runner.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | // Playwright + axe-core executor (mirrors runner.js for Puppeteer). 3 | // NOTE: Initially Chromium-only; future work may add firefox/webkit via arg/env. 4 | 5 | const fs = require("fs"); 6 | const path = require("path"); 7 | const { chromium } = require("playwright"); 8 | const axeSource = require("axe-core").source; 9 | const merge = require('deepmerge') 10 | 11 | async function main() { 12 | const [,, htmlPath, testJsPath, outJsonPath, screenshotPath] = process.argv; 13 | if (!htmlPath || !testJsPath || !outJsonPath) { 14 | console.error("Usage: node playwright_runner.js [screenshotPath]"); 15 | process.exit(2); 16 | } 17 | const html = fs.readFileSync(htmlPath, "utf-8"); 18 | let testFn; 19 | try { 20 | testFn = require(path.resolve(testJsPath)); 21 | } catch (e) { 22 | console.error("Failed loading test file:", e); 23 | testFn = {}; 24 | } 25 | let launchOptions = { headless: true }; 26 | if (process.env.A11Y_LLM_EVAL_DEBUG === "1") { 27 | launchOptions = { headless: false, slowMo: 1000 }; 28 | } 29 | 30 | const browser = await chromium.launch(launchOptions); 31 | const context = await browser.newContext({ viewport: { width: 1280, height: 800 } }); 32 | const page = await context.newPage(); 33 | const consoleLogs = []; 34 | page.on("console", msg => consoleLogs.push(msg.text())); 35 | 36 | const start = Date.now(); 37 | let testFunctionResult = { status: "error", assertions: [] }; 38 | let axeResult = null; 39 | let errorMsg = null; 40 | 41 | async function loadHTML() { 42 | await page.reload(); 43 | await page.setContent(html, { waitUntil: "load" }); 44 | await page.addScriptTag({ content: axeSource }); 45 | await page.evaluate(() => { window.axe.setup();}); 46 | } 47 | 48 | async function runAxeOnPage(page) { 49 | return await page.evaluate(async () => { 50 | return await window.axe.run(); 51 | }); 52 | } 53 | 54 | const utils = { reload: loadHTML, runAxeOnPage, merge }; 55 | 56 | try { 57 | await loadHTML(); 58 | 59 | if (!testFn.run || typeof testFn.run !== 'function') { 60 | testFunctionResult = { status: 'error', assertions: [], error: 'No run export (expected module.exports.run = async ({ page, assert }) => {...})' }; 61 | } else { 62 | const collected = []; 63 | const assert = async (name, fn, opts = {}) => { 64 | const { type = 'R' } = opts; 65 | let normalizedType = (type || 'R').toUpperCase(); 66 | if (!['R','BP'].includes(normalizedType)) normalizedType = 'R'; 67 | try { 68 | const r = await fn(); 69 | // Allow boolean or object { pass, message } 70 | let passVal = r; 71 | let message; 72 | if (r && typeof r === 'object' && 'pass' in r) { 73 | passVal = r.pass; 74 | message = r.message; 75 | } 76 | collected.push({ name, status: passVal ? 
'pass' : 'fail', message, type: normalizedType }); 77 | } catch (e) { 78 | collected.push({ name, status: 'fail', message: e.message, type: normalizedType }); 79 | } 80 | }; 81 | 82 | const runStart = Date.now(); 83 | try { 84 | await testFn.run({ page, assert, utils }); 85 | } catch (e) { 86 | errorMsg = e.stack || e.message; 87 | } 88 | const duration_ms = Date.now() - runStart; 89 | 90 | // Normalize & determine status based only on requirement failures 91 | const hasAssertionFailure = collected.some(a => a.type === 'R' && a.status === 'fail'); 92 | const totalAssertionFailures = collected.filter(a => a.type === 'R' && a.status === 'fail').length; 93 | const totalAssertionBpFailures = collected.filter(a => a.type === 'BP' && a.status === 'fail').length; 94 | testFunctionResult = { 95 | status: hasAssertionFailure ? 'fail' : 'pass', 96 | assertions: collected, 97 | duration_ms, 98 | total_assertion_failures: totalAssertionFailures, 99 | total_assertion_bp_failures: totalAssertionBpFailures 100 | }; 101 | } 102 | 103 | const processAxeResults = (results) => { 104 | // Separate WCAG violations from best practice violations 105 | const wcagViolations = []; 106 | const bestPracticeViolations = []; 107 | 108 | let wcagCount = 0; 109 | let bestPracticeCount = 0; 110 | results.violations.forEach(v => { 111 | const mappedViolation = { 112 | id: v.id, 113 | impact: v.impact, 114 | description: v.description, 115 | helpUrl: v.helpUrl, 116 | nodes: v.nodes.map(n => ({ html: n.html, target: n.target })), 117 | tags: v.tags 118 | }; 119 | if (v.tags.includes('best-practice')) { 120 | bestPracticeViolations.push(mappedViolation); 121 | bestPracticeCount += mappedViolation.nodes.length; 122 | } else { 123 | wcagViolations.push(mappedViolation); 124 | wcagCount += mappedViolation.nodes.length; 125 | } 126 | }); 127 | return { 128 | failure_count: wcagCount, 129 | failures: wcagViolations, 130 | best_practice_count: bestPracticeCount, 131 | best_practice_failures: bestPracticeViolations 132 | }; 133 | } 134 | 135 | if (screenshotPath) { 136 | try { 137 | await page.screenshot({ path: screenshotPath, fullPage: true }); 138 | } catch (e) { 139 | console.error('Screenshot failed:', e.message); 140 | } 141 | } 142 | 143 | axeResult = await runAxeOnPage(page); 144 | 145 | if (testFn.runAxe && typeof testFn.runAxe === 'function') { 146 | const axeCustomResult = await testFn.runAxe({ page, utils}); 147 | if (axeCustomResult && typeof axeCustomResult === 'object') { 148 | axeResult = merge(axeResult || {}, axeCustomResult); 149 | } 150 | } 151 | 152 | axeResult = processAxeResults(axeResult); 153 | } catch (e) { 154 | errorMsg = e.stack || e.message; 155 | if (testFunctionResult.status === "error") { 156 | testFunctionResult.error = errorMsg; 157 | } 158 | } finally { 159 | await browser.close(); 160 | } 161 | 162 | const out = { 163 | engine: 'playwright', 164 | browser: 'chromium', 165 | testFunctionResult, 166 | axeResult, 167 | consoleLogs, 168 | error: errorMsg, 169 | total_duration_ms: Date.now() - start 170 | }; 171 | fs.writeFileSync(outJsonPath, JSON.stringify(out, null, 2), "utf-8"); 172 | } 173 | 174 | main().catch(e => { 175 | console.error(e); 176 | process.exit(1); 177 | }); 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A11y LLM Evaluation Harness and Dataset 2 | 3 | This is a research project to evaluate how well various LLM models generate accessible HTML 
content. 4 | 5 | ## Problem 6 | LLMs currently generate code with accessibility bugs, resulting in blockers for people with disabilities and costly re-work and fixes downstream. 7 | 8 | ## Goal 9 | Create a public test suite which can be used to benchmark how well various LLMs generate accessible HTML code. Eventually, it could also be used to help train models to generate more accessible code by default. 10 | 11 | ## Methodology 12 | - Each test case contains a prompt to generate an HTML page to demonstrate a specific pattern or component. 13 | - This page is rendered in a real browser using Playwright (Chromium). Tests are executed against this rendered page. 14 | - The HTML is evaluated against axe-core, one of the most popular automated accessibility testing engines. 15 | - The HTML is also evaluated against a manually defined set of assertions, customized for the specific test case. This allows for more robust testing than just using axe-core. 16 | - Tests only pass if zero axe-core failures are found AND all *requirement* assertions pass. Best Practice (BP) assertion failures do not fail the test but are tracked separately. 17 | 18 | ## Features 19 | - Python orchestrator (generation, execution, reporting) 20 | - Node.js Playwright + axe-core evaluation 21 | - Per-test prompts & injected JS assertions 22 | - HTML report summarizing performance 23 | - Token + cost tracking (tokens in/out/total, per-generation cost, aggregated per model) 24 | - Multi-sample generation with pass@k metrics (probability of at least one passing generation in k draws) 25 | 26 | ## Sampling & pass@k Metrics 27 | You can request multiple independent generations ("samples") per (test, model). This enables computation of pass@k metrics similar to code evaluation benchmarks. 28 | 29 | ### CLI Usage 30 | 31 | Step 1: Send prompts to the LLMs and generate HTML 32 | ```bash 33 | python -m a11y_llm_tests.cli run \ 34 | --models-file config/models.yaml \ 35 | --out runs \ 36 | --samples 20 37 | ``` 38 | 39 | Step 2: Run the eval and generate the report 40 | ```bash 41 | python -m a11y_llm_tests.cli evaluate \ 42 | 43 | --k 1,5,10 44 | ``` 45 | 46 | Artifacts: 47 | - Each sample's HTML: `runs//raw//__s.html` (single-sample keeps legacy `.html`) 48 | - Screenshots with analogous naming 49 | - `results.json` now includes per-sample records + an `aggregates` array with pass@k stats. 50 | - Report includes an aggregate pass@k table and grouped per-sample cards. 51 | 52 | Tips: 53 | - Increase `temperature` (or other diversity params) to reduce sample correlation. 54 | - Use `--disable-cache` if you want fresh generations even when prompt/model/seed repeat. 55 | 56 | 57 | ## Quick Start 58 | ```bash 59 | python3 -m venv .venv 60 | source .venv/bin/activate 61 | pip install --upgrade pip 62 | pip install -r requirements.txt 63 | 64 | # Node deps 65 | bash scripts/install_node_deps.sh 66 | 67 | # Copy env and set keys 68 | cp .env.example .env 69 | export OPENAI_API_KEY=... # etc. or put in .env and use dotenv 70 | 71 | # Copy model config and set API keys 72 | cp config/models.yaml.example config/models.yaml 73 | 74 | # Run all tests against configured models 75 | python -m a11y_llm_tests.cli run --models-file config/models.yaml --out runs 76 | ``` 77 | 78 | ## Adding a Test Case 79 | Create a new folder under `test_cases/`: 80 | ``` 81 | test_cases/ 82 | form-labels/ 83 | prompt.md 84 | test.js 85 | example-fail/ 86 | example-pass/ 87 | ``` 88 | 89 | `prompt.md` contains ONLY the user-facing instruction for the model. 
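For illustration only, the hypothetical `form-labels` case above might use a `prompt.md` as small as the following (the real prompts under `test_cases/` are similarly short and imperative):

```
Generate an HTML file containing a sign-up form with fields for the user's name and email address.
```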
90 | 91 | `test.js` must export: 92 | 93 | ```js 94 | module.exports.run = async ({ page, assert }) => { 95 | await assert("Has an h1", async () => { 96 | const count = await page.$$eval('h1', els => els.length); 97 | return count >= 1; // truthy => pass, falsy => fail 98 | }); 99 | await assert("Sequential heading levels", async () => { 100 | // Return object form to include custom message 101 | const ok = await page.$$eval('h1 + h2', els => els.length) > 0; 102 | return { pass: ok, message: ok ? undefined : 'h2 does not follow h1' }; 103 | }, { type: 'BP' }); 104 | return {}; // assertions collected automatically 105 | }; 106 | ``` 107 | 108 | The runner injects an `assert(name, fn, opts?)` helper: 109 | 110 | | Parameter | Description | 111 | |-----------|-------------| 112 | | `name` | Human-readable assertion label | 113 | | `fn` | Async/Sync function returning boolean OR `{ pass, message? }` | 114 | | `opts.type` | `'R'` (Requirement, default) or `'BP'` (Best Practice) | 115 | 116 | Return shape from `run` can be empty. 117 | 118 | ### Assertion Types 119 | 120 | Each assertion may now include a `type` field: 121 | 122 | | Type | Meaning | Affects Test Pass/Fail | Aggregated Separately | 123 | |------|---------|------------------------|-----------------------| 124 | | `R` | Requirement (default) | Yes (any failing R => test fails) | Requirement Pass Rate | 125 | | `BP` | Best Practice | No (ignored for pass/fail) | Best Practice Pass Rate | 126 | 127 | If `type` is omitted it defaults to `R` for backward compatibility. The HTML report shows both Requirement Pass Rate (percentage of tests whose requirement assertions passed) and Best Practice Pass Rate (percentage of tests containing BP assertions where all BP assertions passed). 128 | 129 | Example assertion objects returned from `run`: 130 | 131 | ```js 132 | return { 133 | assertions: [ 134 | { name: 'has main landmark', status: 'pass', type: 'R' }, 135 | { name: 'images have alt text', status: 'fail', type: 'BP', message: '1 of 5 images missing alt' } 136 | ] 137 | }; 138 | ``` 139 | 140 | ## Report 141 | Generated at `runs//report.html` with: 142 | - Summary stats per model 143 | - Detailed per model/test breakdown 144 | - Axe violations 145 | - Assertions & statuses 146 | - Pass@k aggregate table and per-sample cards when multiple samples are collected 147 | 148 | ## Contributing 149 | 150 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 151 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 152 | the rights to use your contribution. For details, visit [Contributor License Agreements](https://cla.opensource.microsoft.com). 153 | 154 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 155 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 156 | provided by the bot. You will only need to do this once across all repos using our CLA. 157 | 158 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 159 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 160 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 161 | 162 | ## Trademarks 163 | 164 | This project may contain trademarks or logos for projects, products, or services. 
Authorized use of Microsoft 165 | trademarks or logos is subject to and must follow 166 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/legal/intellectualproperty/trademarks/usage/general). 167 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 168 | Any use of third-party trademarks or logos are subject to those third-party's policies. 169 | -------------------------------------------------------------------------------- /test_cases/modal-dialog/examples/pass-aria.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Pass - ARIA - Modal Dialog Examples 8 | 9 | 10 | 20 | 21 | 22 |
23 |
24 |

Pass - ARIA - Modal Dialog Examples

25 | 26 | 27 |
28 | 29 |
30 |
31 | 44 |
45 |
46 | 47 | 48 | 68 | 172 |
173 | 174 | 175 | -------------------------------------------------------------------------------- /tests/test_cli_sampling.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from typer.testing import CliRunner 4 | from a11y_llm_tests.cli import app 5 | 6 | # We'll monkeypatch generator and node_bridge to avoid real API calls 7 | 8 | class DummyResp: 9 | def __init__(self, content): 10 | self.choices = [type("c", (), {"message": type("m", (), {"content": content})()})] 11 | self.usage = {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30} 12 | self.response_cost = 0.001 13 | 14 | 15 | def fake_generate_html_with_meta(model, prompt, iteration, temperature=None, seed=None, disable_cache=False): 16 | # Generate deterministic pass/fail pattern by seed (even seed -> pass, odd -> fail) 17 | status_comment = f"" if seed is not None else "" 18 | html = f"

Test {model}

{status_comment}" 19 | return html, { 20 | "cached": False, 21 | "latency_s": 0.01, 22 | "prompt_hash": "deadbeef", 23 | "tokens_in": 5, 24 | "tokens_out": 7, 25 | "total_tokens": 12, 26 | "cost_usd": 0.0005, 27 | "seed": seed, 28 | "temperature": temperature, 29 | } 30 | 31 | 32 | def fake_run(html, test_js_path, screenshot_path): 33 | # Extract seed from comment to decide pass/fail 34 | import re 35 | m = re.search(r"seed:(\d+)", html) 36 | seed = int(m.group(1)) if m else 0 37 | status = "pass" if seed % 2 == 0 else "fail" 38 | return { 39 | "testFunctionResult": { 40 | "status": status, 41 | "assertions": [ 42 | {"name": "dummy", "status": status, "message": None, "type": "R"}, 43 | ], 44 | "duration_ms": 5, 45 | }, 46 | "axeResult": { 47 | "violation_count": 0, 48 | "violations": [], 49 | }, 50 | } 51 | 52 | 53 | def test_cli_sampling_multi(monkeypatch, tmp_path): 54 | monkeypatch.setattr("a11y_llm_tests.generator.generate_html_with_meta", fake_generate_html_with_meta) 55 | monkeypatch.setattr("a11y_llm_tests.node_bridge.run", fake_run) 56 | 57 | # Create a minimal test case directory 58 | tc_dir = tmp_path / "test_cases" / "sample-case" 59 | tc_dir.mkdir(parents=True) 60 | (tc_dir / "prompt.md").write_text("Generate a page", encoding="utf-8") 61 | (tc_dir / "test.js").write_text("module.exports=()=>{}", encoding="utf-8") 62 | 63 | # Provide models config 64 | config_dir = tmp_path / "config" 65 | config_dir.mkdir() 66 | (config_dir / "models.yaml").write_text("""models:\n - name: test-model\n""", encoding="utf-8") 67 | 68 | runner = CliRunner() 69 | # Generation phase only 70 | gen_result = runner.invoke(app, [ 71 | "run", 72 | "--models-file", str(config_dir / "models.yaml"), 73 | "--out", str(tmp_path / "runs"), 74 | "--test-cases-dir", str(tmp_path / "test_cases"), 75 | "--samples", "4", 76 | "--k", "1,2,4", 77 | "--base-seed", "100", 78 | ]) 79 | assert gen_result.exit_code == 0, gen_result.output 80 | runs_dir = tmp_path / "runs" 81 | run_subdirs = sorted(p for p in runs_dir.iterdir() if p.is_dir()) 82 | assert run_subdirs, "No run directory created" 83 | latest = run_subdirs[-1] 84 | # Ensure aggregates are empty pre-evaluation 85 | pre_data = json.loads((latest / "results.json").read_text(encoding="utf-8")) 86 | assert pre_data["aggregates"] == [] 87 | # Evaluation phase 88 | eval_result = runner.invoke(app, [ 89 | "evaluate", 90 | str(latest), 91 | "--test-cases-dir", str(tmp_path / "test_cases"), 92 | "--k", "1,2,4", 93 | "--no-generate-report", 94 | ]) 95 | assert eval_result.exit_code == 0, eval_result.output 96 | data = json.loads((latest / "results.json").read_text(encoding="utf-8")) 97 | aggs = data["aggregates"] 98 | assert len(aggs) == 1 99 | agg = aggs[0] 100 | assert agg["n_samples"] == 4 101 | assert agg["n_pass"] == 2 # Seeds 100,101,102,103 -> pass,fail,pass,fail 102 | p1 = agg["pass_at_k"]["1"] 103 | p2 = agg["pass_at_k"]["2"] 104 | assert abs(p1 - 0.5) < 1e-6 105 | assert 0.82 < p2 < 0.85 106 | assert agg["pass_at_k"]["4"] == 1.0 107 | sample_indices = sorted(r["sample_index"] for r in data["results"]) 108 | assert sample_indices == [0, 1, 2, 3] 109 | 110 | 111 | def test_cli_sampling_single(monkeypatch, tmp_path): 112 | monkeypatch.setattr("a11y_llm_tests.generator.generate_html_with_meta", fake_generate_html_with_meta) 113 | monkeypatch.setattr("a11y_llm_tests.node_bridge.run", fake_run) 114 | 115 | tc_dir = tmp_path / "test_cases" / "single" 116 | tc_dir.mkdir(parents=True) 117 | (tc_dir / "prompt.md").write_text("Prompt", encoding="utf-8") 118 | (tc_dir / 
"test.js").write_text("module.exports=()=>{}", encoding="utf-8") 119 | 120 | config_dir = tmp_path / "config" 121 | config_dir.mkdir(exist_ok=True) 122 | (config_dir / "models.yaml").write_text("""models:\n - name: m1\n""", encoding="utf-8") 123 | 124 | runner = CliRunner() 125 | gen_result = runner.invoke(app, [ 126 | "run", 127 | "--models-file", str(config_dir / "models.yaml"), 128 | "--out", str(tmp_path / "runs"), 129 | "--test-cases-dir", str(tmp_path / "test_cases"), 130 | "--samples", "1", 131 | "--k", "1,5", 132 | "--base-seed", "5", 133 | ]) 134 | assert gen_result.exit_code == 0, gen_result.output 135 | runs_dir = tmp_path / "runs" 136 | run_subdirs = sorted(p for p in runs_dir.iterdir() if p.is_dir()) 137 | latest = run_subdirs[-1] 138 | pre_data = json.loads((latest / "results.json").read_text(encoding="utf-8")) 139 | assert pre_data["aggregates"] == [] 140 | eval_result = runner.invoke(app, [ 141 | "evaluate", 142 | str(latest), 143 | "--test-cases-dir", str(tmp_path / "test_cases"), 144 | "--k", "1,5", 145 | "--no-generate-report", 146 | ]) 147 | assert eval_result.exit_code == 0, eval_result.output 148 | data = json.loads((latest / "results.json").read_text(encoding="utf-8")) 149 | agg = data["aggregates"][0] 150 | assert agg["n_samples"] == 1 151 | assert agg["n_pass"] == 0 # Seed=5 -> fail (odd) 152 | assert agg["pass_at_k"]["1"] == 0.0 153 | 154 | 155 | def test_bp_failure_not_affect_requirement_pass(monkeypatch, tmp_path): 156 | # Requirement passes, BP fails => overall should pass 157 | def gen_html(model, prompt, iteration, temperature=None, seed=None, disable_cache=False): 158 | return "

Page

", { 159 | "cached": False, 160 | "latency_s": 0.01, 161 | "prompt_hash": "hash", 162 | "cost_usd": 0.0001, 163 | "seed": 1, 164 | "temperature": temperature, 165 | } 166 | 167 | def run(html, test_js_path, screenshot_path): 168 | return { 169 | "testFunctionResult": { 170 | "status": "pass", # legacy status (will be recomputed logic wise in runner normally) 171 | "assertions": [ 172 | {"name": "req-1", "status": "pass", "type": "R"}, 173 | {"name": "bp-1", "status": "fail", "type": "BP"}, 174 | ], 175 | "duration_ms": 3, 176 | }, 177 | "axeResult": {"violation_count": 0, "violations": []}, 178 | } 179 | 180 | monkeypatch.setattr("a11y_llm_tests.generator.generate_html_with_meta", gen_html) 181 | monkeypatch.setattr("a11y_llm_tests.node_bridge.run", run) 182 | 183 | tc_dir = tmp_path / "test_cases" / "bp-case" 184 | tc_dir.mkdir(parents=True) 185 | (tc_dir / "prompt.md").write_text("Prompt", encoding="utf-8") 186 | (tc_dir / "test.js").write_text("module.exports=()=>{}", encoding="utf-8") 187 | 188 | config_dir = tmp_path / "config" 189 | config_dir.mkdir(exist_ok=True) 190 | (config_dir / "models.yaml").write_text("""models:\n - name: modelX\n""", encoding="utf-8") 191 | 192 | runner_cli = CliRunner() 193 | gen_result = runner_cli.invoke(app, [ 194 | "run", 195 | "--models-file", str(config_dir / "models.yaml"), 196 | "--out", str(tmp_path / "runs"), 197 | "--test-cases-dir", str(tmp_path / "test_cases"), 198 | "--samples", "1", 199 | "--k", "1", 200 | ]) 201 | assert gen_result.exit_code == 0, gen_result.output 202 | runs_dir = tmp_path / "runs" 203 | run_subdirs = sorted(p for p in runs_dir.iterdir() if p.is_dir()) 204 | latest = run_subdirs[-1] 205 | # Evaluate 206 | eval_result = runner_cli.invoke(app, [ 207 | "evaluate", 208 | str(latest), 209 | "--test-cases-dir", str(tmp_path / "test_cases"), 210 | "--k", "1", 211 | "--no-generate-report", 212 | ]) 213 | assert eval_result.exit_code == 0, eval_result.output 214 | data = json.loads((latest / "results.json").read_text(encoding="utf-8")) 215 | assert data["results"][0]["result"] == "PASS" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .pytest_cache/ 3 | .mypy_cache/ 4 | .cache/ 5 | .venv/ 6 | node_modules/ 7 | runs/* 8 | !runs/.gitkeep 9 | *.pyc 10 | *.pyo 11 | *.env 12 | .DS_Store 13 | config/models.yaml 14 | 15 | 16 | ## Ignore Visual Studio temporary files, build results, and 17 | ## files generated by popular Visual Studio add-ons. 
18 | ## 19 | ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore 20 | 21 | # User-specific files 22 | *.rsuser 23 | *.suo 24 | *.user 25 | *.userosscache 26 | *.sln.docstates 27 | *.env 28 | 29 | # User-specific files (MonoDevelop/Xamarin Studio) 30 | *.userprefs 31 | 32 | # Mono auto generated files 33 | mono_crash.* 34 | 35 | # Build results 36 | [Dd]ebug/ 37 | [Dd]ebugPublic/ 38 | [Rr]elease/ 39 | [Rr]eleases/ 40 | x64/ 41 | x86/ 42 | [Ww][Ii][Nn]32/ 43 | [Aa][Rr][Mm]/ 44 | [Aa][Rr][Mm]64/ 45 | [Aa][Rr][Mm]64[Ee][Cc]/ 46 | bld/ 47 | [Oo]bj/ 48 | [Oo]ut/ 49 | [Ll]og/ 50 | [Ll]ogs/ 51 | 52 | # Build results on 'Bin' directories 53 | **/[Bb]in/* 54 | # Uncomment if you have tasks that rely on *.refresh files to move binaries 55 | # (https://github.com/github/gitignore/pull/3736) 56 | #!**/[Bb]in/*.refresh 57 | 58 | # Visual Studio 2015/2017 cache/options directory 59 | .vs/ 60 | # Uncomment if you have tasks that create the project's static files in wwwroot 61 | #wwwroot/ 62 | 63 | # Visual Studio 2017 auto generated files 64 | Generated\ Files/ 65 | 66 | # MSTest test Results 67 | [Tt]est[Rr]esult*/ 68 | [Bb]uild[Ll]og.* 69 | *.trx 70 | 71 | # NUnit 72 | *.VisualState.xml 73 | TestResult.xml 74 | nunit-*.xml 75 | 76 | # Approval Tests result files 77 | *.received.* 78 | 79 | # Build Results of an ATL Project 80 | [Dd]ebugPS/ 81 | [Rr]eleasePS/ 82 | dlldata.c 83 | 84 | # Benchmark Results 85 | BenchmarkDotNet.Artifacts/ 86 | 87 | # .NET Core 88 | project.lock.json 89 | project.fragment.lock.json 90 | artifacts/ 91 | 92 | # ASP.NET Scaffolding 93 | ScaffoldingReadMe.txt 94 | 95 | # StyleCop 96 | StyleCopReport.xml 97 | 98 | # Files built by Visual Studio 99 | *_i.c 100 | *_p.c 101 | *_h.h 102 | *.ilk 103 | *.meta 104 | *.obj 105 | *.idb 106 | *.iobj 107 | *.pch 108 | *.pdb 109 | *.ipdb 110 | *.pgc 111 | *.pgd 112 | *.rsp 113 | # but not Directory.Build.rsp, as it configures directory-level build defaults 114 | !Directory.Build.rsp 115 | *.sbr 116 | *.tlb 117 | *.tli 118 | *.tlh 119 | *.tmp 120 | *.tmp_proj 121 | *_wpftmp.csproj 122 | *.log 123 | *.tlog 124 | *.vspscc 125 | *.vssscc 126 | .builds 127 | *.pidb 128 | *.svclog 129 | *.scc 130 | 131 | # Chutzpah Test files 132 | _Chutzpah* 133 | 134 | # Visual C++ cache files 135 | ipch/ 136 | *.aps 137 | *.ncb 138 | *.opendb 139 | *.opensdf 140 | *.sdf 141 | *.cachefile 142 | *.VC.db 143 | *.VC.VC.opendb 144 | 145 | # Visual Studio profiler 146 | *.psess 147 | *.vsp 148 | *.vspx 149 | *.sap 150 | 151 | # Visual Studio Trace Files 152 | *.e2e 153 | 154 | # TFS 2012 Local Workspace 155 | $tf/ 156 | 157 | # Guidance Automation Toolkit 158 | *.gpState 159 | 160 | # ReSharper is a .NET coding add-in 161 | _ReSharper*/ 162 | *.[Rr]e[Ss]harper 163 | *.DotSettings.user 164 | 165 | # TeamCity is a build add-in 166 | _TeamCity* 167 | 168 | # DotCover is a Code Coverage Tool 169 | *.dotCover 170 | 171 | # AxoCover is a Code Coverage Tool 172 | .axoCover/* 173 | !.axoCover/settings.json 174 | 175 | # Coverlet is a free, cross platform Code Coverage Tool 176 | coverage*.json 177 | coverage*.xml 178 | coverage*.info 179 | 180 | # Visual Studio code coverage results 181 | *.coverage 182 | *.coveragexml 183 | 184 | # NCrunch 185 | _NCrunch_* 186 | .NCrunch_* 187 | .*crunch*.local.xml 188 | nCrunchTemp_* 189 | 190 | # MightyMoose 191 | *.mm.* 192 | AutoTest.Net/ 193 | 194 | # Web workbench (sass) 195 | .sass-cache/ 196 | 197 | # Installshield output folder 198 | [Ee]xpress/ 199 | 200 | # DocProject is a documentation 
generator add-in 201 | DocProject/buildhelp/ 202 | DocProject/Help/*.HxT 203 | DocProject/Help/*.HxC 204 | DocProject/Help/*.hhc 205 | DocProject/Help/*.hhk 206 | DocProject/Help/*.hhp 207 | DocProject/Help/Html2 208 | DocProject/Help/html 209 | 210 | # Click-Once directory 211 | publish/ 212 | 213 | # Publish Web Output 214 | *.[Pp]ublish.xml 215 | *.azurePubxml 216 | # Note: Comment the next line if you want to checkin your web deploy settings, 217 | # but database connection strings (with potential passwords) will be unencrypted 218 | *.pubxml 219 | *.publishproj 220 | 221 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 222 | # checkin your Azure Web App publish settings, but sensitive information contained 223 | # in these scripts will be unencrypted 224 | PublishScripts/ 225 | 226 | # NuGet Packages 227 | *.nupkg 228 | # NuGet Symbol Packages 229 | *.snupkg 230 | # The packages folder can be ignored because of Package Restore 231 | **/[Pp]ackages/* 232 | # except build/, which is used as an MSBuild target. 233 | !**/[Pp]ackages/build/ 234 | # Uncomment if necessary however generally it will be regenerated when needed 235 | #!**/[Pp]ackages/repositories.config 236 | # NuGet v3's project.json files produces more ignorable files 237 | *.nuget.props 238 | *.nuget.targets 239 | 240 | # Microsoft Azure Build Output 241 | csx/ 242 | *.build.csdef 243 | 244 | # Microsoft Azure Emulator 245 | ecf/ 246 | rcf/ 247 | 248 | # Windows Store app package directories and files 249 | AppPackages/ 250 | BundleArtifacts/ 251 | Package.StoreAssociation.xml 252 | _pkginfo.txt 253 | *.appx 254 | *.appxbundle 255 | *.appxupload 256 | 257 | # Visual Studio cache files 258 | # files ending in .cache can be ignored 259 | *.[Cc]ache 260 | # but keep track of directories ending in .cache 261 | !?*.[Cc]ache/ 262 | 263 | # Others 264 | ClientBin/ 265 | ~$* 266 | *~ 267 | *.dbmdl 268 | *.dbproj.schemaview 269 | *.jfm 270 | *.pfx 271 | *.publishsettings 272 | orleans.codegen.cs 273 | 274 | # Including strong name files can present a security risk 275 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 276 | #*.snk 277 | 278 | # Since there are multiple workflows, uncomment next line to ignore bower_components 279 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 280 | #bower_components/ 281 | 282 | # RIA/Silverlight projects 283 | Generated_Code/ 284 | 285 | # Backup & report files from converting an old project file 286 | # to a newer Visual Studio version. Backup files are not needed, 287 | # because we have git ;-) 288 | _UpgradeReport_Files/ 289 | Backup*/ 290 | UpgradeLog*.XML 291 | UpgradeLog*.htm 292 | ServiceFabricBackup/ 293 | *.rptproj.bak 294 | 295 | # SQL Server files 296 | *.mdf 297 | *.ldf 298 | *.ndf 299 | 300 | # Business Intelligence projects 301 | *.rdl.data 302 | *.bim.layout 303 | *.bim_*.settings 304 | *.rptproj.rsuser 305 | *- [Bb]ackup.rdl 306 | *- [Bb]ackup ([0-9]).rdl 307 | *- [Bb]ackup ([0-9][0-9]).rdl 308 | 309 | # Microsoft Fakes 310 | FakesAssemblies/ 311 | 312 | # GhostDoc plugin setting file 313 | *.GhostDoc.xml 314 | 315 | # Node.js Tools for Visual Studio 316 | .ntvs_analysis.dat 317 | node_modules/ 318 | 319 | # Visual Studio 6 build log 320 | *.plg 321 | 322 | # Visual Studio 6 workspace options file 323 | *.opt 324 | 325 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
326 | *.vbw 327 | 328 | # Visual Studio 6 auto-generated project file (contains which files were open etc.) 329 | *.vbp 330 | 331 | # Visual Studio 6 workspace and project file (working project files containing files to include in project) 332 | *.dsw 333 | *.dsp 334 | 335 | # Visual Studio 6 technical files 336 | *.ncb 337 | *.aps 338 | 339 | # Visual Studio LightSwitch build output 340 | **/*.HTMLClient/GeneratedArtifacts 341 | **/*.DesktopClient/GeneratedArtifacts 342 | **/*.DesktopClient/ModelManifest.xml 343 | **/*.Server/GeneratedArtifacts 344 | **/*.Server/ModelManifest.xml 345 | _Pvt_Extensions 346 | 347 | # Paket dependency manager 348 | **/.paket/paket.exe 349 | paket-files/ 350 | 351 | # FAKE - F# Make 352 | **/.fake/ 353 | 354 | # CodeRush personal settings 355 | **/.cr/personal 356 | 357 | # Python Tools for Visual Studio (PTVS) 358 | **/__pycache__/ 359 | *.pyc 360 | 361 | # Cake - Uncomment if you are using it 362 | #tools/** 363 | #!tools/packages.config 364 | 365 | # Tabs Studio 366 | *.tss 367 | 368 | # Telerik's JustMock configuration file 369 | *.jmconfig 370 | 371 | # BizTalk build output 372 | *.btp.cs 373 | *.btm.cs 374 | *.odx.cs 375 | *.xsd.cs 376 | 377 | # OpenCover UI analysis results 378 | OpenCover/ 379 | 380 | # Azure Stream Analytics local run output 381 | ASALocalRun/ 382 | 383 | # MSBuild Binary and Structured Log 384 | *.binlog 385 | MSBuild_Logs/ 386 | 387 | # AWS SAM Build and Temporary Artifacts folder 388 | .aws-sam 389 | 390 | # NVidia Nsight GPU debugger configuration file 391 | *.nvuser 392 | 393 | # MFractors (Xamarin productivity tool) working folder 394 | **/.mfractor/ 395 | 396 | # Local History for Visual Studio 397 | **/.localhistory/ 398 | 399 | # Visual Studio History (VSHistory) files 400 | .vshistory/ 401 | 402 | # BeatPulse healthcheck temp database 403 | healthchecksdb 404 | 405 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 406 | MigrationBackup/ 407 | 408 | # Ionide (cross platform F# VS Code tools) working folder 409 | **/.ionide/ 410 | 411 | # Fody - auto-generated XML schema 412 | FodyWeavers.xsd 413 | 414 | # VS Code files for those working on multiple tools 415 | .vscode/* 416 | !.vscode/settings.json 417 | !.vscode/tasks.json 418 | !.vscode/launch.json 419 | !.vscode/extensions.json 420 | !.vscode/*.code-snippets 421 | 422 | # Local History for Visual Studio Code 423 | .history/ 424 | 425 | # Built Visual Studio Code Extensions 426 | *.vsix 427 | 428 | # Windows Installer files from build outputs 429 | *.cab 430 | *.msi 431 | *.msix 432 | *.msm 433 | *.msp 434 | -------------------------------------------------------------------------------- /tests/test_examples.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import json 4 | import pytest 5 | import re 6 | 7 | from a11y_llm_tests import node_bridge 8 | 9 | TEST_CASES_ROOT = Path("test_cases") 10 | SCREENSHOT_ROOT = Path("runs") / "pytest_screenshots" 11 | 12 | def _collect_example_html(): 13 | """Yield tuples: (test_case_name, html_path, yaml_path, test_js_path).""" 14 | for case_dir in TEST_CASES_ROOT.iterdir(): 15 | if not case_dir.is_dir(): 16 | continue 17 | test_js = case_dir / "test.js" 18 | if not test_js.exists(): 19 | continue 20 | examples_dir = case_dir / "examples" 21 | if not examples_dir.exists(): 22 | continue 23 | # Only consider HTML files that include json 24 | for html_file in examples_dir.glob("*.html"): 25 | fm, _ = 
parse_html_with_expectations(html_file) 26 | if fm is None: 27 | # Skip HTML files without JSON expectations (they are invalid in the new format) 28 | continue 29 | yield ( 30 | case_dir.name, 31 | html_file, 32 | None, 33 | test_js, 34 | ) 35 | 36 | # EXAMPLES will be computed after helper functions are defined 37 | 38 | 39 | def _collect_assertion_names_from_testjs(test_js_path: Path): 40 | """Parse a test.js file and return a set of assertion names used with assert("name", ...). 41 | 42 | This uses a simple regex to find string literals passed as the first argument to an 43 | `assert(...)` call. It's intentionally permissive and assumes tests call `assert` with 44 | a literal string as the first argument (the common pattern in our harnesses). 45 | """ 46 | content = test_js_path.read_text(encoding="utf-8") 47 | # Match assert( 'name' , or assert ( `name` , or assert("name", 48 | pattern = re.compile(r"\bassert\s*\(\s*([\'\"`])(.+?)\1", re.DOTALL) 49 | names = {m.group(2) for m in pattern.finditer(content)} 50 | return names 51 | 52 | 53 | # JSON parser for merged HTML + JSON expectation files 54 | SCRIPT_RE = re.compile(r"<script[^>]+id=[\"']a11y-assertions[\"'][^>]*type=[\"']application/json[\"'][^>]*>(.*?)</script>", re.DOTALL | re.IGNORECASE) 55 | 56 | 57 | def parse_html_with_expectations(path: Path): 58 | """Return (assertions_dict_or_None, html_text). 59 | 60 | This looks for a <script id="a11y-assertions" type="application/json"> block 61 | inside the HTML. If found, returns (parsed_json_dict, full_html_text). Otherwise (None, full_text). 62 | """ 63 | text = path.read_text(encoding="utf-8") 64 | m = SCRIPT_RE.search(text) 65 | if not m: 66 | return None, text 67 | json_text = m.group(1) 68 | try: 69 | data = json.loads(json_text) or {} 70 | except Exception: 71 | return None, text 72 | return data, text 73 | 74 | 75 | def _collect_assertions_for_case(case_dir: Path): 76 | """Return a mapping of example path -> dict(assertion name -> expected value) for examples in a case. 77 | 78 | This reads assertions embedded as JSON in the HTML examples using the 79 | <script id="a11y-assertions" type="application/json"> block described above. 16 | 173 | 174 | 175 | 176 |
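As a rough illustration of this mechanism, the sketch below embeds a made-up expectations block and reads it back with parse_html_with_expectations (the JSON shape is an assumption for the sketch, not copied from the real example files; the function is assumed to be in scope from tests/test_examples.py):

# Hypothetical usage sketch; the embedded JSON shape below is an assumption.
import tempfile
from pathlib import Path

sample = """<!DOCTYPE html>
<html lang="en">
<head>
  <title>Example with embedded expectations</title>
  <script id="a11y-assertions" type="application/json">
    {"Each dialog has a dialog role": "pass"}
  </script>
</head>
<body><p>Example body</p></body>
</html>"""

with tempfile.NamedTemporaryFile("w", suffix=".html", delete=False, encoding="utf-8") as fh:
    fh.write(sample)

expectations, full_text = parse_html_with_expectations(Path(fh.name))
assert expectations == {"Each dialog has a dialog role": "pass"}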

Fail - Has CSS Transition

177 |

178 | The following is a failing example of a modal dialog component. It passes most of the requirements but 179 | does not hide content behind it from screen reader users and does not take focus when opened. This example has slight CSS transitions when opening/closing. 180 | This delay does not fail the test, but is included to simulate real-world scenarios where animations or other JS operations may delay dialog visibility. 181 |

182 | 183 | 184 | 185 | 208 | 209 | 263 | 264 | 265 | -------------------------------------------------------------------------------- /a11y_llm_tests/generator.py: -------------------------------------------------------------------------------- 1 | """LLM HTML generation & caching layer.""" 2 | from __future__ import annotations 3 | import hashlib, time, random 4 | from pathlib import Path 5 | from typing import Tuple, Dict, Any, Optional 6 | import json 7 | import litellm 8 | 9 | CACHE_DIR = Path(".cache/generations") 10 | CACHE_DIR.mkdir(parents=True, exist_ok=True) 11 | 12 | # Retry policy for litellm calls 13 | RETRY_MAX_ATTEMPTS = 5 14 | RETRY_BASE_DELAY = 1.0 # seconds 15 | RETRY_MAX_DELAY = 60.0 # seconds 16 | 17 | DEFAULT_SYSTEM_PROMPT = ( 18 | "You are generating a single standalone HTML document. " 19 | "Do NOT wrap output in markdown fences. Include and . " 20 | "Do NOT explain the code, just output it." 21 | ) 22 | 23 | _PROMPT_JOINER = "\n|:|\n" 24 | _configured_system_prompt: str = DEFAULT_SYSTEM_PROMPT 25 | _custom_instructions: Optional[str] = None 26 | 27 | 28 | def configure_prompts(system_prompt: Optional[str] = None, custom_instructions: Optional[str] = None) -> None: 29 | """Configure the base system prompt and optional custom instructions.""" 30 | global _configured_system_prompt, _custom_instructions 31 | base = (system_prompt or "").strip() 32 | _configured_system_prompt = base or DEFAULT_SYSTEM_PROMPT 33 | if custom_instructions is None: 34 | _custom_instructions = None 35 | else: 36 | text = custom_instructions.rstrip("\n") 37 | _custom_instructions = text if text.strip() else None 38 | 39 | 40 | def get_base_system_prompt() -> str: 41 | return _configured_system_prompt 42 | 43 | 44 | def get_custom_instructions() -> Optional[str]: 45 | return _custom_instructions 46 | 47 | 48 | def get_effective_system_prompt() -> str: 49 | if _custom_instructions: 50 | return f"{_configured_system_prompt}\n\n{_custom_instructions}".strip() 51 | return _configured_system_prompt 52 | 53 | 54 | def prompt_hash(text: str) -> str: 55 | return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16] 56 | 57 | 58 | def compute_prompt_hash(user_prompt: str) -> str: 59 | combined = _PROMPT_JOINER.join([ 60 | _configured_system_prompt, 61 | _custom_instructions or "", 62 | user_prompt, 63 | ]) 64 | return prompt_hash(combined) 65 | 66 | 67 | def clean_generation(raw: str) -> str: 68 | # Strip markdown fences if present 69 | if "```" in raw: 70 | parts = [] 71 | inside = False 72 | for line in raw.splitlines(): 73 | if line.strip().startswith("```"): 74 | inside = not inside 75 | continue 76 | if inside: 77 | parts.append(line) 78 | if parts: 79 | raw = "\n".join(parts) 80 | # Keep only first ... 81 | lower = raw.lower() 82 | if "" in lower: 83 | start = lower.index("") + len("") 85 | raw = raw[start:end] 86 | return raw.strip() 87 | 88 | 89 | def _meta_path(cache_file: Path) -> Path: 90 | return cache_file.with_suffix(cache_file.suffix + ".meta.json") 91 | 92 | 93 | def generate_html_with_meta( 94 | model: str, 95 | user_prompt: str, 96 | iteration: int, 97 | temperature: Optional[float] = None, 98 | seed: Optional[int] = None, 99 | disable_cache: bool = False, 100 | ) -> Tuple[str, Dict[str, Any]]: 101 | """Generate (or load cached) HTML plus metadata including token usage & cost. 102 | 103 | Returns: 104 | html (str): The generated HTML document. 
105 | meta (dict): { 106 | 'cached': bool, 107 | 'latency_s': float, 108 | 'prompt_hash': str, 109 | 'tokens_in': int|None, 110 | 'tokens_out': int|None, 111 | 'total_tokens': int|None, 112 | 'cost_usd': float|None, 113 | } 114 | """ 115 | base_system_prompt = get_base_system_prompt() 116 | custom_instructions = get_custom_instructions() 117 | effective_system_prompt = get_effective_system_prompt() 118 | h = compute_prompt_hash(user_prompt) 119 | # Incorporate seed into cache identity for sampling diversity 120 | seed_part = f"_s{seed}" if seed is not None else "" 121 | iteration_part = f"_i{iteration}" 122 | cache_file = CACHE_DIR / f"{model}_{h}{seed_part}{iteration_part}.html" 123 | meta_file = _meta_path(cache_file) 124 | if not disable_cache and cache_file.exists(): 125 | html = cache_file.read_text(encoding="utf-8") 126 | meta: Dict[str, Any] = { 127 | "cached": True, 128 | "latency_s": 0.0, 129 | "prompt_hash": h, 130 | "tokens_in": None, 131 | "tokens_out": None, 132 | "total_tokens": None, 133 | "cost_usd": None, 134 | "seed": seed, 135 | "temperature": temperature, 136 | "system_prompt": base_system_prompt, 137 | "custom_instructions": custom_instructions, 138 | "effective_system_prompt": effective_system_prompt, 139 | } 140 | if meta_file.exists(): 141 | try: 142 | loaded = json.loads(meta_file.read_text(encoding="utf-8")) 143 | meta.update({ 144 | k: loaded.get(k) for k in [ 145 | "tokens_in", "tokens_out", "total_tokens", "cost_usd", 146 | "system_prompt", "custom_instructions", "effective_system_prompt", 147 | ] 148 | }) 149 | except Exception: 150 | pass # ignore malformed meta 151 | return html, meta 152 | 153 | start = time.time() 154 | litellm.drop_params = True 155 | print(f"Generating HTML with model={model}, temp={temperature}, seed={seed}...") 156 | 157 | resp = None 158 | last_exc: Optional[BaseException] = None 159 | for attempt in range(1, RETRY_MAX_ATTEMPTS + 1): 160 | try: 161 | resp = litellm.completion( 162 | model=model, 163 | messages=[ 164 | {"role": "system", "content": effective_system_prompt}, 165 | {"role": "user", "content": user_prompt}, 166 | ], 167 | temperature=temperature, 168 | seed=seed, 169 | ) 170 | # Basic validation: ensure we have choices and text 171 | if resp and getattr(resp, "choices", None) and len(resp.choices) > 0: 172 | break 173 | # Treat missing choices as transient error 174 | last_exc = RuntimeError("litellm returned no choices") 175 | except Exception as e: 176 | last_exc = e 177 | 178 | # If we're here, we will retry unless this was the last attempt 179 | if attempt == RETRY_MAX_ATTEMPTS: 180 | break 181 | # Exponential backoff with jitter 182 | delay = min(RETRY_BASE_DELAY * (2 ** (attempt - 1)), RETRY_MAX_DELAY) 183 | jitter = random.uniform(0, delay * 0.1) 184 | sleep_for = delay + jitter 185 | print(f"litellm call failed (attempt {attempt}/{RETRY_MAX_ATTEMPTS}): {last_exc}; retrying in {sleep_for:.1f}s...") 186 | time.sleep(sleep_for) 187 | 188 | elapsed = time.time() - start 189 | 190 | if resp is None or not getattr(resp, "choices", None): 191 | # Raise the last exception or a generic one so callers can handle it 192 | if last_exc: 193 | raise last_exc 194 | raise RuntimeError("litellm.completion failed with no response") 195 | 196 | # Extract tokens & cost defensively 197 | usage = getattr(resp, "usage", None) or getattr(resp, "_hidden_params", {}).get("usage") or {} 198 | tokens_in = usage.get("prompt_tokens") if isinstance(usage, dict) else None 199 | tokens_out = usage.get("completion_tokens") if isinstance(usage, 
dict) else None 200 | total_tokens = usage.get("total_tokens") if isinstance(usage, dict) else None 201 | 202 | cost_usd = None 203 | # liteLLM often attaches response_cost either directly or hidden 204 | cost_usd = getattr(resp, "response_cost", None) 205 | if cost_usd is None: 206 | hidden = getattr(resp, "_hidden_params", {}) 207 | if isinstance(hidden, dict): 208 | cost_usd = hidden.get("response_cost") 209 | 210 | raw = resp.choices[0].message.content 211 | html = clean_generation(raw) 212 | cache_file.parent.mkdir(exist_ok=True, parents=True) 213 | cache_file.write_text(html, encoding="utf-8") 214 | # Write meta file for future cache hits 215 | meta_payload = { 216 | "model": model, 217 | "prompt_hash": h, 218 | "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), 219 | "tokens_in": tokens_in, 220 | "tokens_out": tokens_out, 221 | "total_tokens": total_tokens, 222 | "cost_usd": cost_usd, 223 | "seed": seed, 224 | "temperature": temperature, 225 | "system_prompt": base_system_prompt, 226 | "custom_instructions": custom_instructions, 227 | "effective_system_prompt": effective_system_prompt, 228 | } 229 | try: 230 | meta_file.write_text(json.dumps(meta_payload, indent=2), encoding="utf-8") 231 | except Exception: 232 | pass 233 | 234 | meta = { 235 | "cached": False, 236 | "latency_s": elapsed, 237 | "prompt_hash": h, 238 | "tokens_in": tokens_in, 239 | "tokens_out": tokens_out, 240 | "total_tokens": total_tokens, 241 | "cost_usd": cost_usd, 242 | "seed": seed, 243 | "temperature": temperature, 244 | "system_prompt": base_system_prompt, 245 | "custom_instructions": custom_instructions, 246 | "effective_system_prompt": effective_system_prompt, 247 | } 248 | return html, meta 249 | 250 | 251 | def generate_html(model: str, user_prompt: str, temperature: float = None, seed: Optional[int] = None, disable_cache: bool = False) -> Tuple[str, bool, float]: 252 | """Backward-compatible shim. Prefer generate_html_with_meta. 
253 | 254 | Returns legacy tuple (html, cached, latency_s).""" 255 | html, meta = generate_html_with_meta( 256 | model, 257 | user_prompt, 258 | iteration=0, 259 | temperature=temperature, 260 | seed=seed, 261 | disable_cache=disable_cache, 262 | ) 263 | return html, meta["cached"], meta["latency_s"] 264 | -------------------------------------------------------------------------------- /test_cases/modal-dialog/test.js: -------------------------------------------------------------------------------- 1 | /* Function to dismiss the dialog by clicking a button with common dismissal names, pressing Escape, or refreshing the page */ 2 | const dismissDialog = async (page, reload = true) => { 3 | if (!await dialogIsOpen(page)) { 4 | return; 5 | } 6 | 7 | if (await dialogIsOpen(page)) { 8 | // Try pressing escape on the dialog 9 | await page.getByRole('dialog').press('Escape'); 10 | } 11 | 12 | if (await dialogIsOpen(page)) { 13 | // Fallback: press Escape on body 14 | await page.keyboard.press('Escape'); 15 | } 16 | 17 | if (await dialogIsOpen(page)) { 18 | // Fallback: by clicking outside the dialog 19 | await page.locator('body').click({position: {x: 0, y: 0}}); 20 | } 21 | 22 | const closeButton = await page.getByRole('button', { name: /\b(close|okay|ok|dismiss|exit|cancel|submit|apply|x)\b/iu }); 23 | if (await closeButton.count() > 0) { 24 | await closeButton.first().click(); 25 | } 26 | 27 | const closeControl = await page.getByRole('*', { name: /\b(close|okay|ok|dismiss|exit|cancel|submit|apply|x)\b/iu }); 28 | if (await closeButton.count() > 0) { 29 | await closeButton.first().click(); 30 | } 31 | 32 | if (reload && await dialogIsOpen(page)) { 33 | // If still open, refresh the page to reset state 34 | await utils.reload(); 35 | } 36 | } 37 | 38 | const waitForAnimationEnd = async (locator) => { 39 | return locator.evaluate((element) => 40 | Promise.all( 41 | element 42 | .getAnimations({ subtree: true }) 43 | .map((animation) => animation.finished) 44 | ) 45 | ) 46 | } 47 | 48 | const dialogIsOpen = async (page) => { 49 | // Some JS frameworks delay the addition/removal of the dialog to the DOM until after animations complete. 50 | await page.waitForTimeout(50); 51 | 52 | // Now wait for any animations to end 53 | const body = await page.locator('body'); 54 | await waitForAnimationEnd(body); 55 | 56 | // Now, check for dialog presence 57 | const dialog = await page.getByRole('dialog'); 58 | return await dialog.count() > 0; 59 | } 60 | 61 | /* Function to check if focus is inside the dialog 62 | * Checks if the activeElement is contained within the dialog or is the body (which can happen if focus is sent to the browser chrome). 63 | */ 64 | const focusIsInDialog = async (page) => { 65 | return await page.evaluate((obj) => obj.dialog.contains(document.activeElement) || document.activeElement.tagName === 'BODY', 66 | { dialog: await page.getByRole('dialog').elementHandle()}); 67 | } 68 | 69 | const tryToEscapeDialog = async (page, key, iterations) => { 70 | // Tab forward many times to see if we can escape the dialog. 71 | let foundElements = []; 72 | for (let i = 0; i < iterations; i++) { 73 | await page.keyboard.press(key); 74 | let focusedElement = await page.evaluate(() => document.activeElement); 75 | if (foundElements.includes(focusedElement)) { 76 | // We have cycled through all focusable elements, so stop. 77 | return false; 78 | } 79 | foundElements.push(focusedElement); 80 | if (!await focusIsInDialog(page)) { 81 | // Focus escaped the dialog, so fail this iteration. 
82 | return true; 83 | } 84 | } 85 | } 86 | 87 | const getTriggers = async (page) => { 88 | return await page.locator('.trigger').filter({ visible: true }); 89 | } 90 | 91 | module.exports.run = async ({ page, assert, utils }) => { 92 | /* Loop through all dialog triggers, open the dialog, and assert that a dialog role is present */ 93 | await assert("Each dialog has a dialog role", async () => { 94 | await utils.reload(); // Ensure clean state before starting 95 | await dismissDialog(page, false); // Ensure no dialog is open 96 | const triggers = await getTriggers(page); 97 | const totalTriggers = await triggers.count(); 98 | let totalDialogs = 0; 99 | for (const trigger of await triggers.all()) { 100 | await trigger.click(); 101 | if (await dialogIsOpen(page)) { 102 | totalDialogs += 1; 103 | } 104 | await dismissDialog(page); 105 | } 106 | return totalDialogs === totalTriggers; 107 | }); 108 | 109 | await assert("Each dialog can be closed by escape key", async () => { 110 | await utils.reload(); // Ensure clean state before starting 111 | await dismissDialog(page, false); // Ensure no dialog is open 112 | const triggers = await getTriggers(page); 113 | const totalTriggers = await triggers.count(); 114 | let totalSuccess = 0; 115 | for (const trigger of await triggers.all()) { 116 | await dismissDialog(page); 117 | await trigger.click(); 118 | if (!await dialogIsOpen(page)) { 119 | throw new Error("Unable to test because no dialog was found"); 120 | } 121 | 122 | await page.getByRole('dialog').press('Escape'); 123 | if (!(await dialogIsOpen(page))) { 124 | totalSuccess += 1; 125 | } 126 | } 127 | return totalSuccess === totalTriggers; 128 | }, {type: 'BP'}); 129 | 130 | await assert("Each modal dialog traps keyboard focus", async () => { 131 | await utils.reload(); // Ensure clean state before starting 132 | await dismissDialog(page, false); // Ensure no dialog is open 133 | const triggers = await getTriggers(page); 134 | const totalTriggers = await triggers.count(); 135 | let totalSuccess = 0; 136 | for (const trigger of await triggers.all()) { 137 | await dismissDialog(page); 138 | await trigger.click(); 139 | if (!await dialogIsOpen(page)) { 140 | throw new Error("Unable to test because no dialog was found"); 141 | } 142 | 143 | if (await tryToEscapeDialog(page, 'Tab', 20)) { 144 | await dismissDialog(page); 145 | continue; 146 | } 147 | 148 | if (await tryToEscapeDialog(page, 'Shift+Tab', 20)) { 149 | await dismissDialog(page); 150 | continue; 151 | } 152 | 153 | totalSuccess += 1; 154 | } 155 | return totalSuccess === totalTriggers; 156 | }); 157 | 158 | await assert("Each modal dialog takes focus when opened", async () => { 159 | await utils.reload(); // Ensure clean state before starting 160 | await dismissDialog(page, false); // Ensure no dialog is open 161 | const triggers = await getTriggers(page); 162 | const totalTriggers = await triggers.count(); 163 | let totalSuccess = 0; 164 | for (const trigger of await triggers.all()) { 165 | await dismissDialog(page); 166 | await trigger.click(); 167 | if (!await dialogIsOpen(page)) { 168 | throw new Error("Unable to test because no dialog was found"); 169 | } 170 | 171 | if (!(await focusIsInDialog(page))) { 172 | // Focus is not in the dialog, so fail this iteration. 173 | continue; 174 | } 175 | 176 | const bodyIsFocused = await page.evaluate(() => document.activeElement.tagName === 'BODY'); 177 | if (bodyIsFocused) { 178 | // Focus is on body, meaning that focus was lost, so fail this iteration. 
179 | // focusIsInDialog would have returned true if focus was on the Body element. 180 | continue; 181 | } 182 | 183 | totalSuccess += 1; 184 | } 185 | return totalSuccess === totalTriggers; 186 | }); 187 | 188 | await assert("Focus is not lost when each dialog closes", async () => { 189 | await utils.reload(); // Ensure clean state before starting 190 | await dismissDialog(page, false); // Ensure no dialog is open 191 | const triggers = await getTriggers(page); 192 | const totalTriggers = await triggers.count(); 193 | let totalSuccess = 0; 194 | for (const trigger of await triggers.all()) { 195 | await dismissDialog(page); 196 | await trigger.click(); 197 | if (!await dialogIsOpen(page)) { 198 | throw new Error("Unable to test because no dialog was found"); 199 | } 200 | 201 | await dismissDialog(page, false); 202 | 203 | const bodyIsFocused = await page.evaluate(() => document.activeElement.tagName === 'BODY'); 204 | if (bodyIsFocused) { 205 | // Focus is on body, meaning that focus was lost, so fail this iteration. 206 | // focusIsInDialog would have returned true if focus was on the Body element. 207 | // Note: this does not cover the scenario where the modal dialog triggers automatically on page load before the user can interact with the page. In this situation, focus should return to the body. 208 | continue; 209 | } 210 | 211 | totalSuccess += 1; 212 | } 213 | return totalSuccess === totalTriggers; 214 | }); 215 | 216 | await assert("Each modal dialog hides content behind it while open", async () => { 217 | await utils.reload(); // Ensure clean state before starting 218 | await dismissDialog(page, false); // Ensure no dialog is open 219 | const triggers = await getTriggers(page); 220 | const totalTriggers = await triggers.count(); 221 | let totalSuccess = 0; 222 | 223 | for (const trigger of await triggers.all()) { 224 | await dismissDialog(page); 225 | await trigger.click(); 226 | if (!await dialogIsOpen(page)) { 227 | throw new Error("Unable to test because no dialog was found"); 228 | } 229 | 230 | // Determine if native modal dialog is opened, which always hides background content. 231 | let isNativeModal = await page.evaluate(el => { 232 | return !!document.querySelector(':modal') 233 | }); 234 | 235 | if (!isNativeModal) { 236 | // If not a native modal dialog, check if content behind the dialog is hidden from screen reader users. 237 | let isScreenReaderHidden = await trigger.evaluate(el => { 238 | // Use axe-core's util to determine hidden from screen reader users. 239 | let vEl = window.axe.utils.getNodeFromTree(el) 240 | return !window.axe.commons.dom.isVisibleToScreenReaders(vEl); 241 | }); 242 | 243 | if (!isScreenReaderHidden) { 244 | // Trigger is still visible to screen reader users, so fail this iteration. 
245 | continue; 246 | } 247 | } 248 | 249 | totalSuccess += 1; 250 | } 251 | return totalSuccess === totalTriggers; 252 | }); 253 | 254 | return {}; // assertions collected via injected assert 255 | }; 256 | 257 | module.exports.runAxe = async ({ page, utils }) => { 258 | await utils.reload(); // Ensure clean state before starting 259 | await dismissDialog(page, false); // Ensure no dialog is open 260 | 261 | const triggers = await getTriggers(page); 262 | let axeResult = {}; 263 | 264 | for (const trigger of await triggers.all()) { 265 | await dismissDialog(page); 266 | await trigger.click(); 267 | await dialogIsOpen(page); 268 | axeResult = utils.merge(axeResult, await utils.runAxeOnPage(page)); 269 | } 270 | 271 | return axeResult; 272 | }; -------------------------------------------------------------------------------- /test_cases/modal-dialog/examples/fail-has-js-delay.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Fail - Has JS Delay 6 | 16 | 154 | 155 | 156 |
157 |

Fail - Has JS Delay

158 |

159 | The following is a failing example of a modal dialog component. It passes most of the requirements but 160 | does not hide content behind it from screen reader users. This example has a slight JS delay when opening/closing. 161 | This delay does not fail the test, but is included to simulate real-world scenarios where animations or other JS operations may delay dialog visibility. 162 |

163 | 164 |

Click the button to open the modal. The button has the class "trigger".

165 | 166 | 167 |
168 | 169 | 170 | 203 | 204 | 355 | 356 | -------------------------------------------------------------------------------- /a11y_llm_tests/cli.py: -------------------------------------------------------------------------------- 1 | """Typer CLI for running evaluations and generating reports.""" 2 | import json 3 | import multiprocessing 4 | from datetime import datetime 5 | from pathlib import Path 6 | import typer 7 | import yaml 8 | from typing import List 9 | 10 | from . import generator, node_bridge 11 | from .schema import ( 12 | ResultRecord, 13 | TestFunctionResult, 14 | AxeResult, 15 | GenerationMeta, 16 | AggregateRecord, 17 | ) 18 | from .metrics import compute_pass_at_k, format_pass_at_k 19 | 20 | # importing os module for environment variables 21 | import os 22 | # importing necessary functions from dotenv library 23 | from dotenv import load_dotenv, dotenv_values 24 | # loading variables from .env file 25 | load_dotenv() 26 | 27 | app = typer.Typer(add_completion=False) 28 | 29 | 30 | def _evaluate_worker(args_tuple): 31 | """Top-level worker for multiprocessing to ensure picklability on spawn-based systems (macOS, Windows).""" 32 | html_path, test_js_path, screenshot_path, test_name, model, sample_index, gen_meta, prompt_text = args_tuple 33 | html = Path(html_path).read_text(encoding="utf-8") 34 | sp = Path(screenshot_path) 35 | sp.parent.mkdir(parents=True, exist_ok=True) 36 | node_res = node_bridge.run(html, test_js_path, screenshot_path) 37 | tf = node_res.get("testFunctionResult", {}) 38 | assertions_raw = tf.get("assertions", []) 39 | norm_assertions = [] 40 | for a in assertions_raw: 41 | if not isinstance(a, dict): 42 | continue 43 | atype = (a.get("type") or "R").upper() 44 | if atype not in {"R", "BP"}: 45 | atype = "R" 46 | norm_assertions.append({ 47 | "name": a.get("name", "unknown"), 48 | "status": a.get("status", "fail"), 49 | "message": a.get("message"), 50 | "type": atype, 51 | }) 52 | test_result = TestFunctionResult( 53 | status=tf.get("status", "error"), 54 | assertions=norm_assertions, 55 | error=tf.get("error"), 56 | duration_ms=tf.get("duration_ms"), 57 | total_assertion_failures=tf.get("total_assertion_failures", 0), 58 | total_assertion_bp_failures=tf.get("total_assertion_bp_failures", 0) 59 | ) 60 | axe_data = node_res.get("axeResult") or node_res.get("axe_result") or node_res.get("axe") 61 | axe_obj = None 62 | if axe_data and isinstance(axe_data, dict): 63 | axe_obj = AxeResult( 64 | failure_count=axe_data.get("failure_count", 0), 65 | failures=axe_data.get("failures", []), 66 | best_practice_count=axe_data.get("best_practice_count", 0), 67 | best_practice_failures=axe_data.get("best_practice_failures", []), 68 | ) 69 | result_pass = bool(axe_obj) and (test_result.status == "pass" and axe_obj.failure_count == 0) 70 | rec = ResultRecord( 71 | test_name=test_name, 72 | model_name=model, 73 | timestamp=datetime.utcnow(), 74 | generation_html_path=html_path, 75 | screenshot_path=screenshot_path, 76 | test_function=test_result, 77 | axe=axe_obj, 78 | result="PASS" if result_pass else "FAIL", 79 | generation=GenerationMeta( 80 | latency_s=gen_meta.get("latency_s", 0.0), 81 | prompt_hash=gen_meta.get("prompt_hash", generator.compute_prompt_hash(prompt_text)), 82 | cached=gen_meta.get("cached", False), 83 | tokens_in=gen_meta.get("tokens_in"), 84 | tokens_out=gen_meta.get("tokens_out"), 85 | total_tokens=gen_meta.get("total_tokens"), 86 | cost_usd=gen_meta.get("cost_usd"), 87 | seed=gen_meta.get("seed"), 88 | temperature=gen_meta.get("temperature"), 89 | 
system_prompt=gen_meta.get("system_prompt", generator.get_base_system_prompt()), 90 | custom_instructions=gen_meta.get("custom_instructions", generator.get_custom_instructions()), 91 | effective_system_prompt=gen_meta.get("effective_system_prompt", generator.get_effective_system_prompt()), 92 | ), 93 | sample_index=sample_index, 94 | ) 95 | return json.loads(rec.model_dump_json()), test_name, model, result_pass 96 | 97 | 98 | def _generate_worker(task): 99 | """Top-level generation worker for multiprocessing; receives a tuple of parameters.""" 100 | test_name, model, sample_index, prompt_text, seed, temperature, disable_cache = task 101 | html, meta = generator.generate_html_with_meta( 102 | model, 103 | prompt_text, 104 | sample_index, 105 | temperature=temperature, 106 | seed=seed, 107 | disable_cache=disable_cache, 108 | ) 109 | return test_name, model, sample_index, prompt_text, meta, html 110 | 111 | 112 | @app.command() 113 | def run( 114 | models_file: str = typer.Option("config/models.yaml", help="Models config YAML"), 115 | out: str = typer.Option("runs", help="Output directory"), 116 | samples: int = typer.Option(1, min=1, help="Number of samples per (test,model)."), 117 | k: str = typer.Option("1,5,10", help="Comma-separated k values for pass@k metrics (stored for later evaluation)."), 118 | base_seed: int = typer.Option(None, help="Base seed for reproducibility; each sample adds its index."), 119 | temperature: float = typer.Option(None, help="Override model temperature (if supported)."), 120 | disable_cache: bool = typer.Option(False, help="Disable generation cache (always re-generate)."), 121 | test_cases_dir: str = typer.Option("test_cases", help="Directory containing test case folders."), 122 | processes: int = typer.Option(None, "--processes", "-p", help="Parallel processes for generation (defaults CPU count; use 1 to disable)."), 123 | ): 124 | """Generate HTML samples ONLY (no evaluation). 
A later 'evaluate' command will run tests & build report.""" 125 | run_id = datetime.utcnow().strftime("%Y-%m-%d_%H-%M-%S") 126 | out_dir = Path(out) / run_id 127 | (out_dir / "raw").mkdir(parents=True, exist_ok=True) 128 | # Prepare screenshots directory (will be populated during evaluation phase) 129 | (out_dir / "screenshots").mkdir(parents=True, exist_ok=True) 130 | 131 | models_cfg = yaml.safe_load(open(models_file)) 132 | defaults_cfg = models_cfg.get("defaults") or {} 133 | config_dir = Path(models_file).resolve().parent 134 | system_prompt_override = defaults_cfg.get("system_prompt") 135 | instructions_cfg = defaults_cfg.get("custom_instructions_markdown") 136 | custom_instructions_text = None 137 | custom_instructions_path = None 138 | if instructions_cfg: 139 | instructions_path = Path(instructions_cfg) 140 | if not instructions_path.is_absolute(): 141 | instructions_path = config_dir / instructions_path 142 | instructions_path = instructions_path.resolve() 143 | if not instructions_path.exists(): 144 | typer.secho(f"Custom instructions file not found: {instructions_path}", err=True) 145 | raise typer.Exit(code=1) 146 | try: 147 | custom_instructions_text = instructions_path.read_text(encoding="utf-8") 148 | except OSError as exc: 149 | typer.secho(f"Failed to read custom instructions file '{instructions_path}': {exc}", err=True) 150 | raise typer.Exit(code=1) 151 | custom_instructions_path = str(instructions_path) 152 | generator.configure_prompts(system_prompt_override, custom_instructions_text) 153 | model_names = [m["name"] for m in models_cfg.get("models", [])] 154 | models_info = [] 155 | for m in models_cfg.get("models", []): 156 | name = m.get("name") 157 | display_name = m.get("display_name") or (name.split('/')[-1] if isinstance(name, str) else name) 158 | models_info.append({"name": name, "display_name": display_name}) 159 | tcd = Path(test_cases_dir) 160 | test_dirs = [p for p in tcd.iterdir() if p.is_dir() and (p / "prompt.md").exists()] 161 | # Build generation tasks 162 | results = [] # stub pending evaluation records 163 | prompts_map = {} 164 | gen_tasks = [] # (test_name, model, sample_index, prompt, seed) 165 | for td in test_dirs: 166 | prompt_text = (td / "prompt.md").read_text(encoding="utf-8") 167 | prompts_map[td.name] = prompt_text 168 | for model in model_names: 169 | for sample_index in range(samples): 170 | seed = (base_seed + sample_index) if base_seed is not None else None 171 | gen_tasks.append((td.name, model, sample_index, prompt_text, seed, temperature, disable_cache)) 172 | 173 | gen_tasks.sort(key=lambda t: (t[0], t[1], t[2])) 174 | 175 | if gen_tasks: 176 | pool_size = None 177 | if processes is None: 178 | pool_size = min(multiprocessing.cpu_count(), len(gen_tasks)) 179 | else: 180 | pool_size = max(1, processes) 181 | if pool_size == 1: 182 | gen_results_iter = map(_generate_worker, gen_tasks) 183 | else: 184 | typer.echo(f"Generating with {pool_size} processes...") 185 | with multiprocessing.Pool(processes=pool_size) as pool: 186 | gen_results_iter = pool.map(_generate_worker, gen_tasks) 187 | for test_name, model, sample_index, prompt_text, meta, html in gen_results_iter: 188 | raw_path = out_dir / "raw" / test_name 189 | html_file = raw_path / f"{model}__s{sample_index}.html" if samples > 1 else raw_path / f"{model}.html" 190 | html_file.parent.mkdir(exist_ok=True, parents=True) 191 | html_file.write_text(html, encoding="utf-8") 192 | rec = ResultRecord( 193 | test_name=test_name, 194 | model_name=model, 195 | timestamp=datetime.utcnow(), 
196 | generation_html_path=str(html_file), 197 | screenshot_path=None, 198 | test_function=TestFunctionResult(status="PENDING", assertions=[], error=None, duration_ms=None), 199 | axe=None, 200 | result="PENDING", 201 | generation=GenerationMeta( 202 | latency_s=meta.get("latency_s", 0.0), 203 | prompt_hash=meta.get("prompt_hash", generator.compute_prompt_hash(prompt_text)), 204 | cached=meta.get("cached", False), 205 | tokens_in=meta.get("tokens_in"), 206 | tokens_out=meta.get("tokens_out"), 207 | total_tokens=meta.get("total_tokens"), 208 | cost_usd=meta.get("cost_usd"), 209 | seed=meta.get("seed"), 210 | temperature=meta.get("temperature"), 211 | system_prompt=meta.get("system_prompt", generator.get_base_system_prompt()), 212 | custom_instructions=meta.get("custom_instructions", generator.get_custom_instructions()), 213 | effective_system_prompt=meta.get("effective_system_prompt", generator.get_effective_system_prompt()), 214 | ), 215 | sample_index=sample_index, 216 | ) 217 | results.append(json.loads(rec.model_dump_json())) 218 | 219 | run_json = { 220 | "run_id": run_id, 221 | "models": model_names, 222 | "tests": [d.name for d in test_dirs], 223 | "prompts": prompts_map, 224 | "results": results, 225 | "aggregates": [], # will be populated after evaluation 226 | "meta": { 227 | "sampling": { 228 | "samples_per_case": samples, 229 | "k_values": [int(x.strip()) for x in k.split(",") if x.strip().isdigit()], # stored but not yet computed 230 | "temperature": temperature, 231 | "base_seed": base_seed, 232 | "disable_cache": disable_cache, 233 | "processes_generation": (processes if processes is not None else min(multiprocessing.cpu_count(), len(gen_tasks))) if gen_tasks else None, 234 | }, 235 | "prompting": { 236 | "system_prompt": generator.get_base_system_prompt(), 237 | "effective_system_prompt": generator.get_effective_system_prompt(), 238 | "custom_instructions": generator.get_custom_instructions(), 239 | "custom_instructions_path": custom_instructions_path, 240 | }, 241 | "models_info": models_info, 242 | "status": "GENERATED_ONLY", 243 | }, 244 | } 245 | (out_dir / "results.json").write_text(json.dumps(run_json, indent=2), encoding="utf-8") 246 | latest_link = Path(out) / "latest" 247 | try: 248 | if latest_link.exists() or latest_link.is_symlink(): 249 | latest_link.unlink() 250 | latest_link.symlink_to(out_dir) 251 | except OSError: 252 | pass 253 | typer.echo(f"Generation complete. 
Run directory ready for evaluation: {out_dir}") 254 | 255 | 256 | @app.command() 257 | def evaluate( 258 | run_dir: str = typer.Argument(..., help="Existing run directory produced by 'run' command"), 259 | test_cases_dir: str = typer.Option("test_cases", help="Directory containing test case folders."), 260 | k: str = typer.Option("1,5,10", help="Comma-separated k values for pass@k metrics."), 261 | generate_report: bool = typer.Option(True, help="Generate HTML report (index.html) after evaluation."), 262 | processes: int = typer.Option(None, "--processes", "-p", help="Number of parallel processes for evaluation (defaults to CPU count; use 1 to disable)."), 263 | ): 264 | """Evaluate previously generated HTML samples without requiring models config: run accessibility tests, compute aggregates, optionally render report.""" 265 | rd = Path(run_dir) 266 | if not rd.exists(): 267 | typer.secho(f"Run directory not found: {rd}", err=True) 268 | raise typer.Exit(code=1) 269 | results_json_path = rd / "results.json" 270 | prior_data = {} 271 | if results_json_path.exists(): 272 | try: 273 | prior_data = json.loads(results_json_path.read_text(encoding="utf-8")) 274 | except Exception: 275 | typer.secho("Warning: Failed to parse existing results.json; proceeding without prior metadata.", err=True) 276 | # derive model list and display names from prior data 277 | model_names = prior_data.get("models") or [] 278 | meta_block = (prior_data.get("meta") or {}) 279 | stored_models_info = meta_block.get("models_info") or [] 280 | display_lookup = {m.get("name"): m.get("display_name") for m in stored_models_info if m.get("name")} 281 | # prompts 282 | tcd = Path(test_cases_dir) 283 | test_dirs = [p for p in tcd.iterdir() if p.is_dir() and (p / "prompt.md").exists()] 284 | prompts_map = {td.name: (td / "prompt.md").read_text(encoding="utf-8") for td in test_dirs} 285 | k_values = [int(x.strip()) for x in k.split(",") if x.strip().isdigit()] 286 | if not k_values: 287 | k_values = [1] 288 | 289 | # Map generation meta by triple for reuse 290 | gen_meta_map = {} 291 | for r in prior_data.get("results", []) if prior_data else []: 292 | key = (r.get("test_name"), r.get("model_name"), r.get("sample_index")) 293 | gen_meta_map[key] = r.get("generation") 294 | 295 | # Build evaluation task list 296 | tasks = [] # each entry: (html_path, test_js_path, screenshot_path, test_name, model, sample_index, gen_meta, prompt_text) 297 | for td in test_dirs: 298 | test_name = td.name 299 | test_js = td / "test.js" 300 | raw_dir = rd / "raw" / test_name 301 | if not raw_dir.exists(): 302 | typer.secho(f"Skipping missing raw dir for test '{test_name}'", err=True) 303 | continue 304 | html_files = sorted(raw_dir.glob("**/*.html")) 305 | for hf in html_files: 306 | fname = hf.name 307 | if "__s" in fname: 308 | model_part, sample_part = fname.split("__s", 1) 309 | sample_index_str = sample_part[:-5] if sample_part.endswith(".html") else sample_part 310 | try: 311 | sample_index = int(sample_index_str) 312 | except ValueError: 313 | sample_index = None 314 | model = model_part 315 | else: 316 | model = fname[:-5] 317 | sample_index = None 318 | screenshot_name = f"{test_name}__{model}__s{sample_index}.png" if sample_index is not None else f"{test_name}__{model}.png" 319 | screenshot_path = rd / "screenshots" / screenshot_name 320 | gen_meta = gen_meta_map.get((test_name, model, sample_index)) or {} 321 | tasks.append((str(hf), str(test_js), str(screenshot_path), test_name, model, sample_index, gen_meta, prompts_map.get(test_name, 
""))) 322 | 323 | # Sort tasks for deterministic ordering 324 | tasks.sort(key=lambda t: (t[3], t[4], t[5] if t[5] is not None else -1)) 325 | 326 | 327 | all_results = [] 328 | pass_map = {} # (test_name, model) -> list[bool] 329 | if not tasks: 330 | typer.secho("No evaluation tasks found.", err=True) 331 | else: 332 | pool_size = None 333 | if processes is None: 334 | # Default: use CPU count but cap at len(tasks) 335 | pool_size = min(multiprocessing.cpu_count(), len(tasks)) 336 | else: 337 | pool_size = max(1, processes) 338 | if pool_size == 1: 339 | for t in tasks: 340 | res, test_name, model, passed = _evaluate_worker(t) 341 | all_results.append(res) 342 | pass_map.setdefault((test_name, model), []).append(passed) 343 | else: 344 | typer.echo(f"Evaluating with {pool_size} processes...") 345 | with multiprocessing.Pool(processes=pool_size) as pool: 346 | for res, test_name, model, passed in pool.map(_evaluate_worker, tasks): 347 | all_results.append(res) 348 | pass_map.setdefault((test_name, model), []).append(passed) 349 | 350 | aggregates: List[dict] = [] 351 | for (test_name, model), statuses in pass_map.items(): 352 | c = sum(1 for x in statuses if x) 353 | n = len(statuses) 354 | pass_at = compute_pass_at_k(c, n, k_values) 355 | agg = AggregateRecord( 356 | test_name=test_name, 357 | model_name=model, 358 | n_samples=n, 359 | n_pass=c, 360 | pass_at_k=format_pass_at_k(pass_at), 361 | k_values=k_values, 362 | computed_at=datetime.utcnow(), 363 | ) 364 | aggregates.append(json.loads(agg.model_dump_json())) 365 | 366 | updated_json = { 367 | "run_id": prior_data.get("run_id") or rd.name, 368 | "models": model_names, 369 | "tests": [d.name for d in test_dirs], 370 | "prompts": prompts_map, 371 | "results": all_results, 372 | "aggregates": aggregates, 373 | "meta": { 374 | **(prior_data.get("meta") or {}), 375 | "sampling": { 376 | **((prior_data.get("meta") or {}).get("sampling") or {}), 377 | "k_values": k_values, 378 | "processes": (processes if processes is not None else min(multiprocessing.cpu_count(), len(tasks))) if tasks else None, 379 | }, 380 | "status": "EVALUATED", 381 | }, 382 | } 383 | results_json_path.write_text(json.dumps(updated_json, indent=2), encoding="utf-8") 384 | if generate_report: 385 | from .report import render_report 386 | # Synthesize minimal models_cfg for backward compatibility with report renderer 387 | synthesized_models_cfg = { 388 | "models": [ 389 | {"name": name, "display_name": display_lookup.get(name) or (str(name).split('/')[-1])} 390 | for name in model_names 391 | ] 392 | } 393 | render_report(results_json_path, rd / "index.html", synthesized_models_cfg) 394 | typer.echo(f"Evaluation complete. Report generated: {rd}/index.html") 395 | else: 396 | typer.echo("Evaluation complete. 
Report generation skipped.") 397 | 398 | 399 | @app.command() 400 | def report( 401 | run_dir: str, 402 | models_file: str = typer.Option("config/models.yaml", help="Models config YAML") 403 | ): 404 | """Regenerate HTML report for an existing run directory.""" 405 | models_cfg = yaml.safe_load(open(models_file)) 406 | rd = Path(run_dir) 407 | from .report import render_report 408 | render_report(rd / "results.json", rd / "index.html", models_cfg) 409 | typer.echo("Report regenerated.") 410 | 411 | 412 | def main(): # pragma: no cover 413 | app() 414 | 415 | 416 | if __name__ == "__main__": # pragma: no cover 417 | main() 418 | -------------------------------------------------------------------------------- /a11y_llm_tests/report.py: -------------------------------------------------------------------------------- 1 | """HTML reporting for evaluation runs.""" 2 | from pathlib import Path 3 | import orjson 4 | from jinja2 import Template 5 | from collections import OrderedDict 6 | # importing os module for environment variables 7 | import os 8 | 9 | TEMPLATE = """ 10 | 11 | 12 | 13 | {{ site_name }} 14 | 15 | 256 | 257 | 258 |
259 |

{{ site_name }}

260 |
261 |
262 |
263 | {% set total_samples = tests|length * n_samples %} 264 |

All models were tested against {{ tests|length }} test cases. Each test case was run {{ n_samples }} times, resulting in {{ total_samples }} total samples evaluated per model.

265 | 266 | 267 | 268 | 269 | 270 | {% for model, stats in summary.items() %} 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | {% endfor %} 281 | 282 |
ModelRankWCAG Pass Rate*Avg Total WCAG FailuresAvg Axe WCAG FailuresAvg Assertion WCAG FailuresAvg Best Practice Failures
{{ model_display_names.get(model, model) }}{{ loop.index }}{{ "%.0f%%"|format(stats.pass_rate * 100) }}{{ "%.2f"|format(stats.avg_failures) }}{{ "%.2f"|format(stats.avg_axe_failures) }}{{ "%.2f"|format(stats.avg_assertion_failures) }}{{ "%.2f"|format(stats.avg_bp_failures) }}
283 |

* These tests do not comprehensively test all WCAG requirements, only a subset of the most common issues. WCAG failures may still exist even for passing tests.

284 | {% if aggregates_by_test %} 285 |
286 |

Pass@k Aggregates

287 |

Pass@k is a formula that determines the likelihood that if you pick random k samples from the set, then at least one of them would pass. For example, pass@10=.50 means that there is a 50 percent likelihood that at least 1 of the 10 randomly selected samples from the set would pass.

288 |

Pass@k is a metric used to evaluate the performance of models when multiple samples are generated per test case.

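For concreteness, the usual unbiased estimator behind this metric is sketched below; metrics.compute_pass_at_k presumably implements something equivalent, but its exact code is not part of this excerpt, so treat this as a sketch:

from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    # Probability that at least one of k samples drawn without replacement
    # from n total samples (c of which pass) is a passing sample.
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

# Worked example: 10 samples, 4 passing -> pass@1 = 0.40, pass@5 ~= 0.98
print(pass_at_k(10, 4, 1), pass_at_k(10, 4, 5))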
289 | {% for test_name, info in aggregates_by_test.items() %} 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | {% for k in info.ks %} 298 | 299 | {% endfor %} 300 | 301 | 302 | 303 | {% for a in info.rows %} 304 | 305 | 306 | 307 | 308 | {% for k in info.ks %} 309 | {% set v = a.pass_at_k.get(k) %} 310 | 311 | {% endfor %} 312 | 313 | {% endfor %} 314 | 315 |
{{ test_name }}
ModelSamplesPassespass@{{ k }}
{{ model_display_names.get(a.model_name, a.model_name) }}{{ a.n_samples }}{{ a.n_pass }}{% if v is not none %}{{ '%.0f%%'|format(v * 100) }}{% else %}-{% endif %}
316 | {% endfor %} 317 |
318 | {% endif %} 319 |
320 |
321 |

Methodology

322 |

This report shows how well various LLMs generate accessible HTML.

323 |
    324 |
  • Each test uses a prompt to generate HTML. The generated HTML is then tested for accessibility.
  • 325 |
  • The prompts intentionally do not include specific accessibility instructions. The goal is to see if the LLMs produce accessible HTML by default.
  • 326 |
  • The resulting HTML is rendered in a browser via Playwright (Chromium). This allows the HTML's JavaScript and CSS to execute, which can impact accessibility.
  • 327 |
  • The rendered HTML is evaluated using axe-core to identify common accessibility issues.
  • 328 |
  • A custom test script (JavaScript) is executed against the rendered page to check for accessibility requirements that are specific to the test case and not covered by axe-core. These tests look for WCAG 2.2 failures and best practices. Best practices do not impact pass/fail results. A rough sketch of this render-and-audit flow appears after this list.
  • 329 |
  • Each test case is run multiple times (samples) to evaluate the consistency and reliability of the LLM's output.
  • 330 |
  • Default temperatures / settings are used for all models.
  • 331 |
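The generate, render, and audit steps listed above can be summarised in a rough Python sketch. This is illustrative only: the real pipeline drives a Node runner (node_runner/runner.js) through node_bridge, and the axe-core source handling and result keys shown here are assumptions.

# Illustrative sketch of the per-sample flow; the actual runner is Node-based.
from playwright.sync_api import sync_playwright

def audit_sample(html: str, axe_source: str) -> dict:
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.set_content(html, wait_until="load")   # let the generated document's JS/CSS run
        page.add_script_tag(content=axe_source)     # inject axe-core
        axe_results = page.evaluate("() => axe.run(document)")
        # The case's test.js assertions would also run against this page (omitted here).
        browser.close()
    # Only WCAG-tagged violations count toward pass/fail; best-practice rules stay informational.
    wcag = [v for v in axe_results["violations"]
            if any(tag.startswith("wcag") for tag in v["tags"])]
    return {"violation_count": len(wcag), "violations": wcag}

In the toolkit itself, node_bridge.run(html, test_js_path, screenshot_path) returns both the axe result and the test-function result, and cli._evaluate_worker folds them into a single PASS/FAIL record.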
332 | {% set system_prompt = prompting_meta.get('system_prompt') %} 333 | {% set effective_system_prompt = prompting_meta.get('effective_system_prompt') %} 334 | {% set display_system_prompt = effective_system_prompt or system_prompt %} 335 | {% set custom_instructions = prompting_meta.get('custom_instructions') %} 336 | {% set custom_instructions_path = prompting_meta.get('custom_instructions_path') %} 337 | {% if display_system_prompt %} 338 |
339 |

System Prompt

340 |
{{ display_system_prompt|e }}
341 | {% if effective_system_prompt and system_prompt and effective_system_prompt != system_prompt %} 342 |

The effective system prompt shown includes custom instructions.

343 | {% endif %} 344 |
345 | {% endif %} 346 | {% if custom_instructions %} 347 |
348 |

Custom Instructions

349 |
{{ custom_instructions }}
350 | {% if custom_instructions_path %} 351 |

Source: {{ custom_instructions_path }}

352 | {% endif %} 353 |
354 | {% endif %} 355 |

All tests are automatic and deterministic (no human intervention). Only a fraction of accessibility requirements in WCAG can be covered in this way. Many requirements still need a human to evaluate. As such, these tests are not comprehensive. Even if a test passes, it may still fail WCAG and contain serious accessibility issues.


Please leave feedback, review the source code, and contribute test cases, assertions, and other improvements at the GitHub Project.


Glossary


Column Definitions

  • Rank: The position of the model when sorted by WCAG Pass Rate (lower is better).
  • WCAG Pass Rate: The percentage of samples that passed all WCAG tests, including both axe-core WCAG checks and custom WCAG assertions. This does not include best practices.
  • Avg Total WCAG Failures: The average number of total WCAG failures (axe-core + assertions) per sample for the model. This does not include best practices.
  • Avg Axe WCAG Failures: The average number of axe-core detected WCAG failures per sample for the model. This does not include best practices.
  • Avg Assertion WCAG Failures: The average number of custom WCAG assertion failures per sample for the model. This does not include best practices.
  • Avg Best Practice Failures: The average number of best practice accessibility issues (informational only) per sample for the model. This includes axe-core best practices and best practice assertions.
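As a worked example of the failure columns: a model with 20 samples that accumulates 8 axe-core WCAG failures and 4 assertion WCAG failures across those samples has an Avg Total WCAG Failures of (8 + 4) / 20 = 0.6 and an Avg Axe WCAG Failures of 8 / 20 = 0.4, regardless of how many best practice issues were also reported.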

Other Glossary Terms

  • Assertion: A specific accessibility check defined in the test script. Each assertion checks for a particular accessibility requirement or best practice for the specific test case which is not already tested by axe.
  • Axe-core: An open-source accessibility testing engine developed by Deque Systems. It is widely used for automated accessibility testing of web applications.
  • Pass@k: A metric that estimates the likelihood of at least one sample passing a test when k samples are randomly selected.
  • WCAG: Web Content Accessibility Guidelines, a set of guidelines for making web content more accessible to people with disabilities.
  • Test Case: A specific scenario designed to evaluate the accessibility of generated HTML content. Each test case includes a prompt, expected accessibility requirements, and a test script.
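To make these terms concrete, this is roughly the shape of a single sample record as the report consumes it. The field names mirror the rendering code later in this file; the values, including the assertion name, are purely illustrative.

sample = {
    "test_name": "modal-dialog",
    "model_name": "azure_ai/gpt-5-mini",
    "sample_index": 0,
    "result": "PASS",
    "generation": {"latency_s": 3.21, "cached": False, "cost_usd": 0.0012},
    "axe": {
        "failure_count": 0,
        "failures": [],
        "best_practice_count": 1,
        "best_practice_failures": [
            {"id": "region", "impact": "moderate", "description": "..."},
        ],
    },
    "test_function": {
        "status": "pass",
        "assertions": [
            # type "R" marks a WCAG requirement, "BP" a best practice
            {"name": "dialog-has-accessible-name", "type": "R", "status": "pass", "message": None},
        ],
        "total_assertion_failures": 0,
        "total_assertion_bp_failures": 0,
    },
}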

Detailed Results

{% for test_name, test_data in grouped_results.items() %}

{{ test_name }}

{% if test_data.prompt %}
Prompt
{{ test_data.prompt|e }}
{% endif %}
{% for group in test_data.models %}
{% set agg = group.aggregate %}
{{ model_display_names.get(group.model_name, group.model_name) }}{% if agg and agg.n_samples %} — {{ '%.0f%%'|format((agg.n_pass / agg.n_samples) * 100) }}{% endif %}

{% if agg %}
Samples: {{ agg.n_samples }} | Passes: {{ agg.n_pass }}
{% for k, v in agg.pass_at_k.items() %}pass@{{ k }}: {{ '%.0f%%'|format(v * 100) }} {% endfor %}
{% set _percent = (100.0 * (agg.n_pass / agg.n_samples)) if agg.n_samples else 0 %}
{% endif %}
{% for r in group.samples %}
{# Trim the first two path segments (e.g., 'runs//...') #}
{% set _parts = r.generation_html_path.split('/') %}
{% set _trimmed = '/'.join(_parts[2:]) %}

Sample {{ r.sample_index if r.sample_index is not none else loop.index0 }} ({{ model_display_names.get(r.model_name, r.model_name) }})

{{ r.result }} | Latency {{ '%.2f'|format(r.generation.latency_s) }}s{% if r.generation.cached %} cached{% endif %}

Axe WCAG: {{ r.axe.failure_count if r.axe else 'n/a' }}{% if r.axe and r.axe.best_practice_count > 0 %} | BP: {{ r.axe.best_practice_count }}{% endif %}{% if r.generation.cost_usd is not none %} | ${{ '%.4f'|format(r.generation.cost_usd) }}{% endif %}

{% if r.screenshot_path %}
{# Trim the first two path segments (e.g., 'runs//...') #}
{% set _parts = r.screenshot_path.split('/') %}
{% set _trimmed = '/'.join(_parts[2:]) %}
Screenshot sample {{ r.sample_index }} for {{ r.test_name }} / {{ model_display_names.get(r.model_name, r.model_name) }}
{% endif %}

Assertions
{% for a in r.test_function.assertions %}
  • {{ a.name }} ({{ a.type if a.type else 'R' }}): {{ a.status }}{% if a.message %} - {{ a.message }}{% endif %}
{% endfor %}
{% if r.axe %}
{% if r.axe.failure_count > 0 %}
Axe WCAG Failures ({{ r.axe.failure_count }})
{% for v in r.axe.failures %}
  • ({{ v.nodes|length }}x) - {{ v.id }} ({{ v.impact }}): {{ v.description }}
{% endfor %}
{% endif %}
{% if r.axe.best_practice_count > 0 %}
Axe Best Practice Issues ({{ r.axe.best_practice_count }}) ⚠️
{% for v in r.axe.best_practice_failures %}
  • {{ v.id }} ({{ v.impact }}): {{ v.description }} (Best Practice - does not affect pass/fail)
{% endfor %}
{% endif %}
{% endif %}
{% endfor %}
{% endfor %}
{% endfor %}
527 |
528 |

GitHub Project: a11y-llm-eval. Run ID: {{ run_id }}

529 | {{ footer_content|safe }} 530 |
531 | 607 | 655 | 656 | 657 | """ 658 | 659 | def render_report(run_json_path: Path, out_html: Path, models_cfg: dict): 660 | data = orjson.loads(run_json_path.read_bytes()) 661 | meta_block = data.get("meta") or {} 662 | sampling_meta = meta_block.get("sampling") or {} 663 | prompting_meta = meta_block.get("prompting") or {} 664 | from collections import defaultdict 665 | 666 | results = data.get("results", []) 667 | 668 | # Build display name mapping with precedence: 669 | # 1. Stored meta.models_info 670 | # 2. Provided models_cfg 671 | # 3. Fallback to last path segment of model name 672 | model_display_names = {} 673 | for m in (meta_block.get("models_info") or []): 674 | name = m.get("name") 675 | if not name: 676 | continue 677 | model_display_names[name] = m.get("display_name") or name.split('/')[-1] 678 | model_display_names[name.split('/')[-1]] = m.get("display_name") or name.split('/')[-1] 679 | for m in (models_cfg.get("models") or []): 680 | name = m.get("name") 681 | if not name: 682 | continue 683 | display = m.get("display_name") or model_display_names.get(name) or name.split('/')[-1] 684 | model_display_names[name] = display 685 | # Ensure any model appearing only in results has a mapping 686 | for r in results: 687 | n = r.get("model_name") 688 | if n and n not in model_display_names: 689 | model_display_names[n] = n.split('/')[-1] 690 | 691 | per_model = defaultdict(lambda: { 692 | "axe_failures": [], 693 | "total_test_function_passes": 0, 694 | "bp_passes": 0, 695 | "total": 0, 696 | "bp_total": 0, 697 | "costs": [], 698 | "axe_bp_failures": [], 699 | "axe_bp_passes": 0, 700 | "axe_bp_total": 0, 701 | "total_axe_failures": 0, 702 | "total_failures": 0, 703 | "total_passes": 0, 704 | "total_assertion_bp_failures": 0, 705 | "total_assertion_failures": 0, 706 | }) 707 | 708 | for r in results: 709 | model = r.get("model_name") 710 | if not model: 711 | continue 712 | per_model[model]["total"] += 1 713 | if r.get("result") == "PASS": 714 | per_model[model]["total_passes"] += 1 715 | # Determine test function pass count 716 | if r.get("test_function", {}).get("status") == "pass": 717 | per_model[model]["total_test_function_passes"] += 1 718 | # Track best-practice assertions pass rate separately 719 | assertions = r.get("test_function", {}).get("assertions", []) 720 | bp_assertions = [a for a in assertions if (a.get("type") or "R").upper() == "BP"] 721 | if bp_assertions: 722 | per_model[model]["bp_total"] += 1 # treat per-test BP status aggregate: pass if all BP pass 723 | if all(a.get("status") == "pass" for a in bp_assertions): 724 | per_model[model]["bp_passes"] += 1 725 | per_model[model]["total_assertion_bp_failures"] += r.get("test_function", {}).get("total_assertion_bp_failures", 0) 726 | per_model[model]["total_assertion_failures"] += r.get("test_function", {}).get("total_assertion_failures", 0) 727 | # Track axe failures (WCAG only now) and best practice failures 728 | axe = r.get("axe") or {} 729 | fc = axe.get("failure_count") 730 | if fc is not None: 731 | per_model[model]["axe_failures"].append(axe.get("failures", [])) 732 | per_model[model]["total_axe_failures"] += (fc or 0) 733 | # Track axe best practice failures separately 734 | bp_fc = axe.get("best_practice_count", 0) 735 | per_model[model]["axe_bp_failures"].append(axe.get("best_practice_failures", [])) 736 | per_model[model]["axe_bp_total"] += bp_fc 737 | if bp_fc == 0: 738 | per_model[model]["axe_bp_passes"] += 1 739 | gen = r.get("generation", {}) 740 | cost = gen.get("cost_usd") 741 | if cost is not 
None: 742 | try: 743 | per_model[model]["costs"].append(float(cost)) 744 | except (TypeError, ValueError): 745 | pass 746 | # create summary 747 | summary = {} 748 | for m, s in per_model.items(): 749 | avg_axe_failures = s["total_axe_failures"] / s["total"] if s["total"] else 0.0 750 | total_cost = sum(s["costs"]) if s["costs"] else 0.0 751 | avg_cost = (total_cost / s["total"]) if s["total"] else 0.0 752 | total_bp_failures = s["total_assertion_bp_failures"] + s["axe_bp_total"] 753 | total_axe_failures = s["total_axe_failures"] 754 | total_assertion_failures = s["total_assertion_failures"] 755 | total_assertion_bp_failures = s["total_assertion_bp_failures"] 756 | avg_assertion_failures = (total_assertion_failures / s["total"]) if s["total"] else 0.0 757 | avg_bp_failures = (total_bp_failures / s["total"]) if s["total"] else 0.0 758 | total_failures = total_assertion_failures + total_axe_failures 759 | avg_failures = (total_failures / s["total"]) if s["total"] else 0.0 760 | summary[m] = { 761 | "avg_axe_failures": avg_axe_failures, 762 | "pass_rate": s["total_passes"] / s["total"] if s["total"] else 0, 763 | "total_cost": total_cost, 764 | "avg_cost": avg_cost, 765 | "total_assertion_failures": total_assertion_failures, 766 | "total_assertion_bp_failures": total_assertion_bp_failures, 767 | "avg_assertion_failures": avg_assertion_failures, 768 | "avg_bp_failures": avg_bp_failures, 769 | "total_failures": total_failures, 770 | "avg_failures": avg_failures, 771 | } 772 | 773 | # Group samples by (test_name, model_name) 774 | grouped = {} 775 | for r in results: 776 | key = (r.get("test_name"), r.get("model_name")) 777 | grouped.setdefault(key, []).append(r) 778 | # Sort samples by sample_index if present 779 | grouped_results = OrderedDict() 780 | agg_index = {} 781 | # Enhance aggregates with display_model_name (provider prefix stripped) 782 | for a in (data.get("aggregates") or []): 783 | agg_index[(a.get("test_name"), a.get("model_name"))] = a 784 | 785 | prompts_map = (data.get("prompts") or {}) 786 | for (test_name, model_name), samples in sorted(grouped.items()): 787 | samples_sorted = sorted(samples, key=lambda x: (x.get("sample_index") is None, x.get("sample_index") or 0)) 788 | test_entry = grouped_results.setdefault(test_name, {"prompt": prompts_map.get(test_name), "models": []}) 789 | test_entry["models"].append({ 790 | "model_name": model_name, 791 | "samples": samples_sorted, 792 | "aggregate": agg_index.get((test_name, model_name)), 793 | }) 794 | 795 | summary = OrderedDict(sorted(summary.items(), key=lambda item: (-item[1]["pass_rate"], item[1]["avg_failures"])) ) 796 | # Build aggregates_by_test: for each test, list all models and their aggregates (ensures unique table per test) 797 | aggregates_by_test = OrderedDict() 798 | tests_in_order = list(grouped_results.keys()) 799 | for test_name in tests_in_order: 800 | models_info = grouped_results.get(test_name, {}).get('models', []) 801 | rows = [] 802 | ks_set = [] 803 | for m in models_info: 804 | model_name = m.get('model_name') 805 | agg = agg_index.get((test_name, model_name)) 806 | if agg: 807 | pass_at_k = agg.get('pass_at_k') or {} 808 | # preserve order of keys as they appear; avoid duplicates 809 | for k in pass_at_k.keys(): 810 | if k not in ks_set: 811 | ks_set.append(k) 812 | rows.append({ 813 | 'model_name': model_name, 814 | 'n_samples': agg.get('n_samples', 0), 815 | 'n_pass': agg.get('n_pass', 0), 816 | 'pass_at_k': pass_at_k, 817 | }) 818 | else: 819 | rows.append({ 820 | 'model_name': model_name, 821 | 
'n_samples': 0, 822 | 'n_pass': 0, 823 | 'pass_at_k': {}, 824 | }) 825 | # try to sort ks numerically when possible 826 | try: 827 | ks_sorted = sorted(ks_set, key=lambda x: int(x)) 828 | except Exception: 829 | ks_sorted = sorted(ks_set) 830 | aggregates_by_test[test_name] = {'rows': rows, 'ks': ks_sorted} 831 | 832 | html = Template(TEMPLATE).render( 833 | run_id=data.get("run_id", "unknown"), 834 | models=data.get("models", []), 835 | model_display_names=model_display_names, 836 | tests=data.get("tests", []), 837 | summary=summary, 838 | results=results, 839 | aggregates=data.get("aggregates", []), 840 | aggregates_by_test=aggregates_by_test, 841 | grouped_results=grouped_results, 842 | site_name=os.getenv("SITE_NAME", "A11y LLM Eval"), 843 | footer_content=os.getenv("FOOTER_CONTENT", ""), 844 | n_samples=sampling_meta.get("samples_per_case", 0), 845 | prompting_meta=prompting_meta, 846 | ) 847 | out_html.write_text(html, encoding="utf-8") 848 | --------------------------------------------------------------------------------
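For reference, a minimal way to drive render_report by hand might look like the sketch below; the run directory and file names are hypothetical, and in practice the project's CLI presumably does this wiring.

from pathlib import Path
import yaml
from a11y_llm_tests.report import render_report

# models.yaml would be a copy of config/models.yaml.example
models_cfg = yaml.safe_load(Path("config/models.yaml").read_text())
render_report(
    run_json_path=Path("runs/example-run/run.json"),   # hypothetical run output
    out_html=Path("runs/example-run/report.html"),
    models_cfg=models_cfg,
)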