├── .env.example
├── test_cases
│   ├── modal-dialog
│   │   ├── prompt.md
│   │   ├── examples
│   │   │   ├── pass-native.html
│   │   │   ├── fail-popover.html
│   │   │   ├── fail-aria.html
│   │   │   ├── pass-aria.html
│   │   │   ├── fail-has-css-transition.html
│   │   │   └── fail-has-js-delay.html
│   │   └── test.js
│   ├── shopping-home-page
│   │   ├── prompt.md
│   │   ├── examples
│   │   │   ├── fail.html
│   │   │   └── pass.html
│   │   └── test.js
│   ├── disclosure-widget
│   │   ├── prompt.md
│   │   ├── examples
│   │   │   ├── good-details-summary.html
│   │   │   ├── bad-missing-all-semantics.html
│   │   │   ├── bad-hidden-content-in-reading-order.html
│   │   │   ├── good-button.html
│   │   │   └── good-hidden-from-everyone.html
│   │   └── test.js
│   └── readme.md
├── requirements.txt
├── setup.cfg
├── scripts
│   └── install_node_deps.sh
├── config
│   └── models.yaml.example
├── a11y_llm_tests
│   ├── __init__.py
│   ├── utils.py
│   ├── cache.py
│   ├── node_bridge.py
│   ├── metrics.py
│   ├── schema.py
│   ├── generator.py
│   ├── cli.py
│   └── report.py
├── node_runner
│   ├── package.json
│   ├── package-lock.json
│   └── runner.js
├── SECURITY.md
├── CODE_OF_CONDUCT.md
├── tests
│   ├── test_metrics.py
│   ├── test_generator_prompts.py
│   ├── test_cli_sampling.py
│   └── test_examples.py
├── .github
│   └── workflows
│       └── build.yml
├── LICENSE
├── SUPPORT.md
├── README.md
└── .gitignore

/.env.example:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY=
2 | ANTHROPIC_API_KEY=
3 | GOOGLE_API_KEY=
4 |
--------------------------------------------------------------------------------
/test_cases/modal-dialog/prompt.md:
--------------------------------------------------------------------------------
1 | Create an example of a modal dialog component. It is closed by default, and the button to open it has a `trigger` class.
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | litellm
2 | pydantic>=2
3 | rich
4 | jinja2
5 | aiofiles
6 | httpx
7 | python-dotenv
8 | orjson
9 | typer
10 | beautifulsoup4
11 | lxml
12 | pytest
13 | pyyaml
14 |
--------------------------------------------------------------------------------
/test_cases/shopping-home-page/prompt.md:
--------------------------------------------------------------------------------
1 | Generate an HTML file for the home page of a shopping website. The home page has several sections of content for popular items, deals, and blog posts.
2 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = a11y-llm-tests
3 | version = 0.1.0
4 |
5 | [options]
6 | packages = find:
7 | python_requires = >=3.10
8 |
9 | [tool:pytest]
10 | pythonpath = .
11 | addopts = -ra
12 |
--------------------------------------------------------------------------------
/test_cases/disclosure-widget/prompt.md:
--------------------------------------------------------------------------------
1 | - Generate an HTML file that demonstrates an expand/collapse widget.
2 | - Wrap each widget with a div that has an `example` class.
3 | - Give the container for controlled content a `details` class.
4 |
--------------------------------------------------------------------------------
/scripts/install_node_deps.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -euo pipefail
3 | cd "$(dirname "$0")/../node_runner"
4 | if [ ! -f package.json ]; then
5 |   echo "package.json missing" >&2
6 |   exit 1
7 | fi
8 | npm install
9 | npx playwright install chromium
10 | echo "Node dependencies installed."
11 |
--------------------------------------------------------------------------------
/config/models.yaml.example:
--------------------------------------------------------------------------------
1 | defaults:
2 |   #temperature: 0.2
3 |   #max_tokens: 1200
4 |   #system_prompt: |
5 |   #  You are generating a single standalone HTML document.
6 |   #custom_instructions_markdown: custom-instructions.md
7 | models:
8 |   - name: azure_ai/gpt-5-mini
9 |     display_name: GPT-5 Mini
10 |   - name: claude-sonnet-4-20250514
11 |     display_name: Claude Sonnet 4
12 |
--------------------------------------------------------------------------------
/a11y_llm_tests/__init__.py:
--------------------------------------------------------------------------------
1 | """a11y_llm_tests
2 |
3 | Toolkit to evaluate how well various LLMs generate accessible HTML.
4 |
5 | Primary entrypoints:
6 | - cli.py (Typer CLI)
7 | - generator.py (LLM generation + caching)
8 | - node_bridge.py (Playwright + axe-core invocation)
9 | - report.py (HTML report rendering)
10 | """
11 |
12 | __all__ = [
13 |     "generator",
14 |     "node_bridge",
15 |     "report",
16 | ]
17 |
--------------------------------------------------------------------------------
/a11y_llm_tests/utils.py:
--------------------------------------------------------------------------------
1 | """Miscellaneous utility helpers (placeholder)."""
2 |
3 | def ensure_single_html(doc: str) -> str:
4 |     """Return only the first <html>...</html> segment if multiple exist."""
5 |     lower = doc.lower()
6 |     if "<html" in lower and "</html>" in lower:
7 |         # Slice from the first opening <html ...> tag through its closing tag.
8 |         start = lower.index("<html")
9 |         end = lower.index("</html>") + len("</html>")
10 |         return doc[start:end]
11 |     return doc
12 |
--------------------------------------------------------------------------------
/node_runner/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "a11y-llm-tests-runner",
3 |   "version": "0.1.0",
4 |   "private": true,
5 |   "type": "commonjs",
6 |   "description": "Playwright + axe-core runner for a11y-llm-tests",
7 |   "main": "runner.js",
8 |   "scripts": {
9 |     "start": "node runner.js"
10 |   },
11 |   "dependencies": {
12 |     "axe-core": "^4.10.0",
13 |     "deepmerge": "^4.3.1",
14 |     "playwright": "^1.48.0"
15 |   },
16 |   "engines": {
17 |     "node": ">=18"
18 |   }
19 | }
20 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which
6 | includes all source code repositories in our GitHub organizations.
7 |
8 | **Please do not report security vulnerabilities through public GitHub issues.**
9 |
10 | For security reporting information, locations, contact information, and policies,
11 | please review the latest guidance for Microsoft repositories at
12 | [https://aka.ms/SECURITY.md](https://aka.ms/SECURITY.md).
13 |
14 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | - Employees can reach out at [aka.ms/opensource/moderation-support](https://aka.ms/opensource/moderation-support)
11 |
--------------------------------------------------------------------------------
/a11y_llm_tests/cache.py:
--------------------------------------------------------------------------------
1 | """Cache utilities for generation artifacts.
2 |
3 | Currently minimal: provides a helper to compose cache keys that account for model,
4 | prompt hash, and optional seed, ensuring sampled generations can coexist.
5 | """
6 |
7 | from pathlib import Path
8 | from typing import Optional
9 |
10 | CACHE_ROOT = Path('.cache')
11 | CACHE_ROOT.mkdir(exist_ok=True)
12 |
13 | def generation_cache_key(model: str, prompt_hash: str, seed: Optional[int] = None) -> str:
14 |     """Return a filename-safe cache key for a generation.
15 |
16 |     Example: modelabc_deadbeef or modelabc_deadbeef_s42
17 |     """
18 |     if seed is None:
19 |         return f"{model}_{prompt_hash}"
20 |     return f"{model}_{prompt_hash}_s{seed}"
21 |
22 | __all__ = ["generation_cache_key", "CACHE_ROOT"]
23 |
--------------------------------------------------------------------------------
/test_cases/disclosure-widget/examples/good-details-summary.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
This is the hidden content that can be disclosed.
20 |
--------------------------------------------------------------------------------
/test_cases/modal-dialog/examples/fail-popover.html:
--------------------------------------------------------------------------------
The following are examples of failing modal dialogs that are implemented as popovers.
26 | 27 |29 | This example has role of dialog but does not trap keyboard or screen reader focus. 30 | Content behind the dialog is still available to keyboard and screen reader users. 31 |
32 |Look ma, no JS! But is it good enough?
38 | 39 |50 | This example does not have a role of dialog and does not trap keyboard or screen reader focus. 51 | Content behind the dialog is still available to keyboard and screen reader users. 52 |
53 |Look ma, no JS! But is it good enough?
59 | 60 |26 | This is a failing example of an ARIA modal dialog. It has an incorrect role, does not hide content behind the dialog from 27 | keyboard and screen reader users, and does not manage keyboard focus. 28 |
29 | 30 | 31 | 32 |ARIA can be used to create modal dialogs.
35 |There's extra work to make them properly. The native HTML 36 | dialog element 37 | handles a lot of this for us automatically.
38 |39 | It's generally better to use the native HTML dialog unless 40 | you have specific use cases where a custom dialog might perform better. 41 | But even then, you may be able to still use native HTML features 42 | over ARIA... just sayin... 43 |
44 | 45 |ARIA can be used to create modal dialogs.
33 |There's extra work to make them properly. The native HTML 34 | dialog element 35 | handles a lot of this for us automatically.
36 |37 | It's generally better to use the native HTML dialog unless 38 | you have specific use cases where a custom dialog might perform better. 39 | But even then, you may be able to still use native HTML features 40 | over ARIA... just sayin... 41 |
42 | 43 |178 | The following is a failing example of a modal dialog component. It passes most of the requirements but 179 | does not hide content behind it from screen reader users and does not take focus when opened. This example has slight CSS transitions when opening/closing. 180 | This delay does not fail the test, but is included to simulate real-world scenarios where animations or other JS operations may delay dialog visibility. 181 |
--------------------------------------------------------------------------------
/test_cases/modal-dialog/examples/fail-has-css-transition.html:
--------------------------------------------------------------------------------
178 | The following is a failing example of a modal dialog component. It passes most of the requirements but
179 | does not hide content behind it from screen reader users and does not take focus when opened. This example has a slight CSS transition when opening/closing.
180 | The transition does not fail the test, but is included to simulate real-world scenarios where animations or other effects may delay dialog visibility.
181 | 182 | 183 | 184 | 185 |
163 | 164 |Click the button to open the modal. The button has the class "trigger".
165 | 166 | 167 |All Models were tested against {{ tests|length }} test cases. Each test case was tested {{ n_samples }} times. This results in {{ total_samples }} total samples being evaluated per model.
265 || Model | Rank | WCAG Pass Rate* | Avg Total WCAG Failures | Avg Axe WCAG Failures | Avg Assertion WCAG Failures | Avg Best Practice Failures |
|---|---|---|---|---|---|---|
| {{ model_display_names.get(model, model) }} | 273 |{{ loop.index }} | 274 |{{ "%.0f%%"|format(stats.pass_rate * 100) }} | 275 |{{ "%.2f"|format(stats.avg_failures) }} | 276 |{{ "%.2f"|format(stats.avg_axe_failures) }} | 277 |{{ "%.2f"|format(stats.avg_assertion_failures) }} | 278 |{{ "%.2f"|format(stats.avg_bp_failures) }} | 279 |
* These tests do not comprehensively test all WCAG requirements, only a subset of the most common issues. WCAG failures may still exist even for passing tests.
284 | {% if aggregates_by_test %} 285 |Pass@k is a formula that determines the likelihood that if you pick random k samples from the set, then at least one of them would pass. For example, pass@10=.50 means that there is a 50 percent likelihood that at least 1 of the 10 randomly selected samples from the set would pass.
288 |Pass@K is a metric used to evaluate the performance of models when multiple samples are generated per test case.
289 | {% for test_name, info in aggregates_by_test.items() %} 290 || Model | 295 |Samples | 296 |Passes | 297 | {% for k in info.ks %} 298 |pass@{{ k }} | 299 | {% endfor %} 300 |
|---|---|---|---|
| {{ model_display_names.get(a.model_name, a.model_name) }} | 306 |{{ a.n_samples }} | 307 |{{ a.n_pass }} | 308 | {% for k in info.ks %} 309 | {% set v = a.pass_at_k.get(k) %} 310 |{% if v is not none %}{{ '%.0f%%'|format(v * 100) }}{% else %}-{% endif %} | 311 | {% endfor %} 312 |
This report shows how well various LLMs generate accessible HTML.
323 |{{ display_system_prompt|e }}
341 | {% if effective_system_prompt and system_prompt and effective_system_prompt != system_prompt %}
342 | The effective system prompt shown includes custom instructions.
343 | {% endif %} 344 |{{ custom_instructions }}
350 | {% if custom_instructions_path %}
351 | Source: {{ custom_instructions_path }}
352 | {% endif %} 353 |All tests are automatic and deterministic (no human intervention). Only a fraction of accessibility requirements in WCAG can be covered in this way. Many requirements still need a human to evaluate. As such, these tests are not comprehensive. Even if a test passes, it may still fail WCAG and contain serious accessibility issues.
356 |Please leave feedback, review the source code, and contribute test cases, assertions, and other improvements at the GitHub Project.
357 |No samples match the current filters.
403 | {% for test_name, test_data in grouped_results.items() %} 404 |{{ test_data.prompt|e }}
411 | Samples: {{ agg.n_samples }} | Passes: {{ agg.n_pass }}
426 || pass@{{ k }} | {% endfor %}
|---|
| {{ '%.0f%%'|format(v * 100) }} | 430 | {% endfor %}
{{ r.result }} | Latency {{ '%.2f'|format(r.generation.latency_s) }}s{% if r.generation.cached %} cached{% endif %}
443 |Axe WCAG: {{ r.axe.failure_count if r.axe else 'n/a' }}{% if r.axe and r.axe.best_practice_count > 0 %} | BP: {{ r.axe.best_practice_count }}{% endif %}{% if r.generation.cost_usd is not none %} | ${{ '%.4f'|format(r.generation.cost_usd) }}{% endif %}
444 | {% if r.screenshot_path %} 445 | {# Trim the first two path segments (e.g., 'runs/| Model | Total Cost ($) | Avg Cost/Test ($) |
|---|---|---|
| {{ model_display_names.get(model, model) }} | 518 |{{ "%.4f"|format(stats.total_cost) }} | 519 |{{ "%.4f"|format(stats.avg_cost) }} | 520 |