├── .dockerignore ├── .gitattributes ├── .github └── workflows │ └── build.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── Dockerfile ├── LICENSE ├── README.md ├── build.sh ├── ci-build ├── Dockerfile ├── README.md ├── docker-build.sh ├── docker-run.sh └── inside-container.sh ├── entities ├── README.md ├── build.sh ├── entities-legacy.inc ├── entity-processor-json.py ├── entity-processor.py ├── entity-to-dtd.pl ├── json-entities-legacy.inc └── out │ ├── entities-dtd.url │ ├── entities.inc │ └── entities.json ├── lint.sh └── src ├── annotate_attributes.rs ├── boilerplate.rs ├── dom_utils.rs ├── interface_index.rs ├── io_utils.rs ├── main.rs ├── parser.rs ├── rcdom_with_line_numbers.rs ├── represents.rs └── tag_omission.rs /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !entities/out 3 | !quotes/out 4 | !build.sh 5 | !lint.sh 6 | !Cargo.lock 7 | !Cargo.toml 8 | !src 9 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: html-build CI 2 | on: 3 | pull_request: 4 | branches: ['main'] 5 | push: 6 | branches: ['main'] 7 | 8 | env: 9 | REGISTRY: ghcr.io 10 | IMAGE_NAME: ${{ github.repository }} 11 | 12 | 13 | jobs: 14 | build: 15 | name: Build 16 | runs-on: ubuntu-latest 17 | permissions: 18 | contents: read 19 | packages: write 20 | steps: 21 | - name: Checkout whatwg/html-build 22 | uses: actions/checkout@v3 23 | with: 24 | fetch-depth: 0 25 | - name: Shellcheck 26 | run: | 27 | shellcheck *.sh 28 | shellcheck ci-build/*.sh 29 | - name: Docker build 30 | run: ci-build/docker-build.sh 31 | - name: Checkout whatwg/html 32 | uses: actions/checkout@v3 33 | 
with: 34 | repository: whatwg/html 35 | path: html 36 | fetch-depth: 2 37 | - name: Test against whatwg/html 38 | run: | 39 | mkdir output 40 | bash ci-build/docker-run.sh "$GITHUB_WORKSPACE/html" output 41 | - name: Docker login 42 | if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} 43 | uses: docker/login-action@v2 44 | with: 45 | registry: ${{ env.REGISTRY }} 46 | username: ${{ github.actor }} 47 | password: ${{ secrets.GITHUB_TOKEN }} 48 | - name: Docker push 49 | if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} 50 | run: | 51 | docker tag "$REGISTRY/$IMAGE_NAME" "$REGISTRY/$IMAGE_NAME:$GITHUB_SHA" 52 | docker tag "$REGISTRY/$IMAGE_NAME" "$REGISTRY/$IMAGE_NAME:latest" 53 | docker push "$REGISTRY/$IMAGE_NAME:$GITHUB_SHA" 54 | docker push "$REGISTRY/$IMAGE_NAME:latest" 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .cache/ 2 | .temp/ 3 | html/ 4 | output/ 5 | mdn/.id-list 6 | mdn/developer.mozilla.org/ 7 | highlighter/ 8 | 9 | 10 | # Added by cargo 11 | 12 | /target 13 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.0.2" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "autocfg" 16 | version = "1.1.0" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" 19 | 20 | [[package]] 21 | name = "bitflags" 22 | version = "1.3.2" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 25 | 26 | [[package]] 27 | name = "bytes" 28 | version = "1.4.0" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" 31 | 32 | [[package]] 33 | name = "cc" 34 | version = "1.0.79" 35 | source = "registry+https://github.com/rust-lang/crates.io-index" 36 | checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" 37 | 38 | [[package]] 39 | name = "cfg-if" 40 | version = "1.0.0" 41 | source = "registry+https://github.com/rust-lang/crates.io-index" 42 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 43 | 44 | [[package]] 45 | name = "delegate" 46 | version = "0.12.0" 47 | source = "registry+https://github.com/rust-lang/crates.io-index" 48 | checksum = "4e018fccbeeb50ff26562ece792ed06659b9c2dae79ece77c4456bb10d9bf79b" 49 | dependencies = [ 50 | "proc-macro2", 51 | "quote", 52 | "syn 2.0.18", 53 | ] 54 | 55 | [[package]] 56 | name = "errno" 57 | version = "0.3.1" 58 | source = "registry+https://github.com/rust-lang/crates.io-index" 59 | checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" 60 | dependencies = [ 61 | "errno-dragonfly", 62 | "libc", 63 | "windows-sys", 64 | ] 65 | 66 | 
[[package]] 67 | name = "errno-dragonfly" 68 | version = "0.1.2" 69 | source = "registry+https://github.com/rust-lang/crates.io-index" 70 | checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" 71 | dependencies = [ 72 | "cc", 73 | "libc", 74 | ] 75 | 76 | [[package]] 77 | name = "fastrand" 78 | version = "1.9.0" 79 | source = "registry+https://github.com/rust-lang/crates.io-index" 80 | checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" 81 | dependencies = [ 82 | "instant", 83 | ] 84 | 85 | [[package]] 86 | name = "futf" 87 | version = "0.1.5" 88 | source = "registry+https://github.com/rust-lang/crates.io-index" 89 | checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" 90 | dependencies = [ 91 | "mac", 92 | "new_debug_unreachable", 93 | ] 94 | 95 | [[package]] 96 | name = "getrandom" 97 | version = "0.2.10" 98 | source = "registry+https://github.com/rust-lang/crates.io-index" 99 | checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" 100 | dependencies = [ 101 | "cfg-if", 102 | "libc", 103 | "wasi", 104 | ] 105 | 106 | [[package]] 107 | name = "hermit-abi" 108 | version = "0.2.6" 109 | source = "registry+https://github.com/rust-lang/crates.io-index" 110 | checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" 111 | dependencies = [ 112 | "libc", 113 | ] 114 | 115 | [[package]] 116 | name = "hermit-abi" 117 | version = "0.3.1" 118 | source = "registry+https://github.com/rust-lang/crates.io-index" 119 | checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" 120 | 121 | [[package]] 122 | name = "html-build" 123 | version = "0.0.0" 124 | dependencies = [ 125 | "delegate", 126 | "html5ever", 127 | "markup5ever_rcdom", 128 | "regex", 129 | "tempfile", 130 | "tokio", 131 | ] 132 | 133 | [[package]] 134 | name = "html5ever" 135 | version = "0.26.0" 136 | source = "registry+https://github.com/rust-lang/crates.io-index" 137 | 
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" 138 | dependencies = [ 139 | "log", 140 | "mac", 141 | "markup5ever", 142 | "proc-macro2", 143 | "quote", 144 | "syn 1.0.109", 145 | ] 146 | 147 | [[package]] 148 | name = "instant" 149 | version = "0.1.12" 150 | source = "registry+https://github.com/rust-lang/crates.io-index" 151 | checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" 152 | dependencies = [ 153 | "cfg-if", 154 | ] 155 | 156 | [[package]] 157 | name = "io-lifetimes" 158 | version = "1.0.11" 159 | source = "registry+https://github.com/rust-lang/crates.io-index" 160 | checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" 161 | dependencies = [ 162 | "hermit-abi 0.3.1", 163 | "libc", 164 | "windows-sys", 165 | ] 166 | 167 | [[package]] 168 | name = "libc" 169 | version = "0.2.146" 170 | source = "registry+https://github.com/rust-lang/crates.io-index" 171 | checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" 172 | 173 | [[package]] 174 | name = "linux-raw-sys" 175 | version = "0.3.8" 176 | source = "registry+https://github.com/rust-lang/crates.io-index" 177 | checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" 178 | 179 | [[package]] 180 | name = "lock_api" 181 | version = "0.4.10" 182 | source = "registry+https://github.com/rust-lang/crates.io-index" 183 | checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" 184 | dependencies = [ 185 | "autocfg", 186 | "scopeguard", 187 | ] 188 | 189 | [[package]] 190 | name = "log" 191 | version = "0.4.19" 192 | source = "registry+https://github.com/rust-lang/crates.io-index" 193 | checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" 194 | 195 | [[package]] 196 | name = "mac" 197 | version = "0.1.1" 198 | source = "registry+https://github.com/rust-lang/crates.io-index" 199 | checksum = 
"c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" 200 | 201 | [[package]] 202 | name = "markup5ever" 203 | version = "0.11.0" 204 | source = "registry+https://github.com/rust-lang/crates.io-index" 205 | checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" 206 | dependencies = [ 207 | "log", 208 | "phf", 209 | "phf_codegen", 210 | "string_cache", 211 | "string_cache_codegen", 212 | "tendril", 213 | ] 214 | 215 | [[package]] 216 | name = "markup5ever_rcdom" 217 | version = "0.2.0" 218 | source = "registry+https://github.com/rust-lang/crates.io-index" 219 | checksum = "b9521dd6750f8e80ee6c53d65e2e4656d7de37064f3a7a5d2d11d05df93839c2" 220 | dependencies = [ 221 | "html5ever", 222 | "markup5ever", 223 | "tendril", 224 | "xml5ever", 225 | ] 226 | 227 | [[package]] 228 | name = "memchr" 229 | version = "2.5.0" 230 | source = "registry+https://github.com/rust-lang/crates.io-index" 231 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" 232 | 233 | [[package]] 234 | name = "mio" 235 | version = "0.8.8" 236 | source = "registry+https://github.com/rust-lang/crates.io-index" 237 | checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" 238 | dependencies = [ 239 | "libc", 240 | "wasi", 241 | "windows-sys", 242 | ] 243 | 244 | [[package]] 245 | name = "new_debug_unreachable" 246 | version = "1.0.4" 247 | source = "registry+https://github.com/rust-lang/crates.io-index" 248 | checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" 249 | 250 | [[package]] 251 | name = "num_cpus" 252 | version = "1.15.0" 253 | source = "registry+https://github.com/rust-lang/crates.io-index" 254 | checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" 255 | dependencies = [ 256 | "hermit-abi 0.2.6", 257 | "libc", 258 | ] 259 | 260 | [[package]] 261 | name = "once_cell" 262 | version = "1.18.0" 263 | source = "registry+https://github.com/rust-lang/crates.io-index" 
264 | checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" 265 | 266 | [[package]] 267 | name = "parking_lot" 268 | version = "0.12.1" 269 | source = "registry+https://github.com/rust-lang/crates.io-index" 270 | checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" 271 | dependencies = [ 272 | "lock_api", 273 | "parking_lot_core", 274 | ] 275 | 276 | [[package]] 277 | name = "parking_lot_core" 278 | version = "0.9.8" 279 | source = "registry+https://github.com/rust-lang/crates.io-index" 280 | checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" 281 | dependencies = [ 282 | "cfg-if", 283 | "libc", 284 | "redox_syscall", 285 | "smallvec", 286 | "windows-targets", 287 | ] 288 | 289 | [[package]] 290 | name = "phf" 291 | version = "0.10.1" 292 | source = "registry+https://github.com/rust-lang/crates.io-index" 293 | checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" 294 | dependencies = [ 295 | "phf_shared", 296 | ] 297 | 298 | [[package]] 299 | name = "phf_codegen" 300 | version = "0.10.0" 301 | source = "registry+https://github.com/rust-lang/crates.io-index" 302 | checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" 303 | dependencies = [ 304 | "phf_generator", 305 | "phf_shared", 306 | ] 307 | 308 | [[package]] 309 | name = "phf_generator" 310 | version = "0.10.0" 311 | source = "registry+https://github.com/rust-lang/crates.io-index" 312 | checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" 313 | dependencies = [ 314 | "phf_shared", 315 | "rand", 316 | ] 317 | 318 | [[package]] 319 | name = "phf_shared" 320 | version = "0.10.0" 321 | source = "registry+https://github.com/rust-lang/crates.io-index" 322 | checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" 323 | dependencies = [ 324 | "siphasher", 325 | ] 326 | 327 | [[package]] 328 | name = "pin-project-lite" 329 | version = "0.2.9" 330 | 
source = "registry+https://github.com/rust-lang/crates.io-index" 331 | checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" 332 | 333 | [[package]] 334 | name = "ppv-lite86" 335 | version = "0.2.17" 336 | source = "registry+https://github.com/rust-lang/crates.io-index" 337 | checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" 338 | 339 | [[package]] 340 | name = "precomputed-hash" 341 | version = "0.1.1" 342 | source = "registry+https://github.com/rust-lang/crates.io-index" 343 | checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" 344 | 345 | [[package]] 346 | name = "proc-macro2" 347 | version = "1.0.60" 348 | source = "registry+https://github.com/rust-lang/crates.io-index" 349 | checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406" 350 | dependencies = [ 351 | "unicode-ident", 352 | ] 353 | 354 | [[package]] 355 | name = "quote" 356 | version = "1.0.28" 357 | source = "registry+https://github.com/rust-lang/crates.io-index" 358 | checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" 359 | dependencies = [ 360 | "proc-macro2", 361 | ] 362 | 363 | [[package]] 364 | name = "rand" 365 | version = "0.8.5" 366 | source = "registry+https://github.com/rust-lang/crates.io-index" 367 | checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 368 | dependencies = [ 369 | "libc", 370 | "rand_chacha", 371 | "rand_core", 372 | ] 373 | 374 | [[package]] 375 | name = "rand_chacha" 376 | version = "0.3.1" 377 | source = "registry+https://github.com/rust-lang/crates.io-index" 378 | checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" 379 | dependencies = [ 380 | "ppv-lite86", 381 | "rand_core", 382 | ] 383 | 384 | [[package]] 385 | name = "rand_core" 386 | version = "0.6.4" 387 | source = "registry+https://github.com/rust-lang/crates.io-index" 388 | checksum = 
"ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" 389 | dependencies = [ 390 | "getrandom", 391 | ] 392 | 393 | [[package]] 394 | name = "redox_syscall" 395 | version = "0.3.5" 396 | source = "registry+https://github.com/rust-lang/crates.io-index" 397 | checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" 398 | dependencies = [ 399 | "bitflags", 400 | ] 401 | 402 | [[package]] 403 | name = "regex" 404 | version = "1.8.4" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" 407 | dependencies = [ 408 | "aho-corasick", 409 | "memchr", 410 | "regex-syntax", 411 | ] 412 | 413 | [[package]] 414 | name = "regex-syntax" 415 | version = "0.7.2" 416 | source = "registry+https://github.com/rust-lang/crates.io-index" 417 | checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" 418 | 419 | [[package]] 420 | name = "rustix" 421 | version = "0.37.20" 422 | source = "registry+https://github.com/rust-lang/crates.io-index" 423 | checksum = "b96e891d04aa506a6d1f318d2771bcb1c7dfda84e126660ace067c9b474bb2c0" 424 | dependencies = [ 425 | "bitflags", 426 | "errno", 427 | "io-lifetimes", 428 | "libc", 429 | "linux-raw-sys", 430 | "windows-sys", 431 | ] 432 | 433 | [[package]] 434 | name = "scopeguard" 435 | version = "1.1.0" 436 | source = "registry+https://github.com/rust-lang/crates.io-index" 437 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" 438 | 439 | [[package]] 440 | name = "serde" 441 | version = "1.0.164" 442 | source = "registry+https://github.com/rust-lang/crates.io-index" 443 | checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" 444 | 445 | [[package]] 446 | name = "signal-hook-registry" 447 | version = "1.4.1" 448 | source = "registry+https://github.com/rust-lang/crates.io-index" 449 | checksum = 
"d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" 450 | dependencies = [ 451 | "libc", 452 | ] 453 | 454 | [[package]] 455 | name = "siphasher" 456 | version = "0.3.10" 457 | source = "registry+https://github.com/rust-lang/crates.io-index" 458 | checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" 459 | 460 | [[package]] 461 | name = "smallvec" 462 | version = "1.10.0" 463 | source = "registry+https://github.com/rust-lang/crates.io-index" 464 | checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" 465 | 466 | [[package]] 467 | name = "socket2" 468 | version = "0.4.9" 469 | source = "registry+https://github.com/rust-lang/crates.io-index" 470 | checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" 471 | dependencies = [ 472 | "libc", 473 | "winapi", 474 | ] 475 | 476 | [[package]] 477 | name = "string_cache" 478 | version = "0.8.7" 479 | source = "registry+https://github.com/rust-lang/crates.io-index" 480 | checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" 481 | dependencies = [ 482 | "new_debug_unreachable", 483 | "once_cell", 484 | "parking_lot", 485 | "phf_shared", 486 | "precomputed-hash", 487 | "serde", 488 | ] 489 | 490 | [[package]] 491 | name = "string_cache_codegen" 492 | version = "0.5.2" 493 | source = "registry+https://github.com/rust-lang/crates.io-index" 494 | checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" 495 | dependencies = [ 496 | "phf_generator", 497 | "phf_shared", 498 | "proc-macro2", 499 | "quote", 500 | ] 501 | 502 | [[package]] 503 | name = "syn" 504 | version = "1.0.109" 505 | source = "registry+https://github.com/rust-lang/crates.io-index" 506 | checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" 507 | dependencies = [ 508 | "proc-macro2", 509 | "quote", 510 | "unicode-ident", 511 | ] 512 | 513 | [[package]] 514 | name = "syn" 515 | version = "2.0.18" 516 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 517 | checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" 518 | dependencies = [ 519 | "proc-macro2", 520 | "quote", 521 | "unicode-ident", 522 | ] 523 | 524 | [[package]] 525 | name = "tempfile" 526 | version = "3.6.0" 527 | source = "registry+https://github.com/rust-lang/crates.io-index" 528 | checksum = "31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6" 529 | dependencies = [ 530 | "autocfg", 531 | "cfg-if", 532 | "fastrand", 533 | "redox_syscall", 534 | "rustix", 535 | "windows-sys", 536 | ] 537 | 538 | [[package]] 539 | name = "tendril" 540 | version = "0.4.3" 541 | source = "registry+https://github.com/rust-lang/crates.io-index" 542 | checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" 543 | dependencies = [ 544 | "futf", 545 | "mac", 546 | "utf-8", 547 | ] 548 | 549 | [[package]] 550 | name = "tokio" 551 | version = "1.28.2" 552 | source = "registry+https://github.com/rust-lang/crates.io-index" 553 | checksum = "94d7b1cfd2aa4011f2de74c2c4c63665e27a71006b0a192dcd2710272e73dfa2" 554 | dependencies = [ 555 | "autocfg", 556 | "bytes", 557 | "libc", 558 | "mio", 559 | "num_cpus", 560 | "parking_lot", 561 | "pin-project-lite", 562 | "signal-hook-registry", 563 | "socket2", 564 | "tokio-macros", 565 | "windows-sys", 566 | ] 567 | 568 | [[package]] 569 | name = "tokio-macros" 570 | version = "2.1.0" 571 | source = "registry+https://github.com/rust-lang/crates.io-index" 572 | checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" 573 | dependencies = [ 574 | "proc-macro2", 575 | "quote", 576 | "syn 2.0.18", 577 | ] 578 | 579 | [[package]] 580 | name = "unicode-ident" 581 | version = "1.0.9" 582 | source = "registry+https://github.com/rust-lang/crates.io-index" 583 | checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" 584 | 585 | [[package]] 586 | name = "utf-8" 587 | version = "0.7.6" 588 | source 
= "registry+https://github.com/rust-lang/crates.io-index" 589 | checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" 590 | 591 | [[package]] 592 | name = "wasi" 593 | version = "0.11.0+wasi-snapshot-preview1" 594 | source = "registry+https://github.com/rust-lang/crates.io-index" 595 | checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" 596 | 597 | [[package]] 598 | name = "winapi" 599 | version = "0.3.9" 600 | source = "registry+https://github.com/rust-lang/crates.io-index" 601 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 602 | dependencies = [ 603 | "winapi-i686-pc-windows-gnu", 604 | "winapi-x86_64-pc-windows-gnu", 605 | ] 606 | 607 | [[package]] 608 | name = "winapi-i686-pc-windows-gnu" 609 | version = "0.4.0" 610 | source = "registry+https://github.com/rust-lang/crates.io-index" 611 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 612 | 613 | [[package]] 614 | name = "winapi-x86_64-pc-windows-gnu" 615 | version = "0.4.0" 616 | source = "registry+https://github.com/rust-lang/crates.io-index" 617 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 618 | 619 | [[package]] 620 | name = "windows-sys" 621 | version = "0.48.0" 622 | source = "registry+https://github.com/rust-lang/crates.io-index" 623 | checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" 624 | dependencies = [ 625 | "windows-targets", 626 | ] 627 | 628 | [[package]] 629 | name = "windows-targets" 630 | version = "0.48.0" 631 | source = "registry+https://github.com/rust-lang/crates.io-index" 632 | checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" 633 | dependencies = [ 634 | "windows_aarch64_gnullvm", 635 | "windows_aarch64_msvc", 636 | "windows_i686_gnu", 637 | "windows_i686_msvc", 638 | "windows_x86_64_gnu", 639 | "windows_x86_64_gnullvm", 640 | "windows_x86_64_msvc", 641 | ] 642 | 643 | [[package]] 644 | 
name = "windows_aarch64_gnullvm" 645 | version = "0.48.0" 646 | source = "registry+https://github.com/rust-lang/crates.io-index" 647 | checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" 648 | 649 | [[package]] 650 | name = "windows_aarch64_msvc" 651 | version = "0.48.0" 652 | source = "registry+https://github.com/rust-lang/crates.io-index" 653 | checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" 654 | 655 | [[package]] 656 | name = "windows_i686_gnu" 657 | version = "0.48.0" 658 | source = "registry+https://github.com/rust-lang/crates.io-index" 659 | checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" 660 | 661 | [[package]] 662 | name = "windows_i686_msvc" 663 | version = "0.48.0" 664 | source = "registry+https://github.com/rust-lang/crates.io-index" 665 | checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" 666 | 667 | [[package]] 668 | name = "windows_x86_64_gnu" 669 | version = "0.48.0" 670 | source = "registry+https://github.com/rust-lang/crates.io-index" 671 | checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" 672 | 673 | [[package]] 674 | name = "windows_x86_64_gnullvm" 675 | version = "0.48.0" 676 | source = "registry+https://github.com/rust-lang/crates.io-index" 677 | checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" 678 | 679 | [[package]] 680 | name = "windows_x86_64_msvc" 681 | version = "0.48.0" 682 | source = "registry+https://github.com/rust-lang/crates.io-index" 683 | checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" 684 | 685 | [[package]] 686 | name = "xml5ever" 687 | version = "0.17.0" 688 | source = "registry+https://github.com/rust-lang/crates.io-index" 689 | checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650" 690 | dependencies = [ 691 | "log", 692 | "mac", 693 | "markup5ever", 694 | ] 695 | 
-------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "html-build" 3 | version = "0.0.0" 4 | publish = false 5 | edition = "2021" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | tokio = { version = "1", features = ["full"] } 11 | html5ever = "0.26.0" 12 | markup5ever_rcdom = "0.2.0" 13 | regex = "1" 14 | delegate = "0.12.0" 15 | 16 | [dev-dependencies] 17 | tempfile = "3" 18 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rust:1.73-slim as builder 2 | WORKDIR /whatwg/html-build 3 | COPY Cargo.lock Cargo.toml ./ 4 | COPY src ./src/ 5 | RUN cargo install --path . 6 | 7 | FROM debian:stable-slim 8 | RUN apt-get update && \ 9 | apt-get install --yes --no-install-recommends ca-certificates curl git python3 python3-pip pipx && \ 10 | rm -rf /var/lib/apt/lists/* 11 | 12 | COPY --from=builder /usr/local/cargo/bin/html-build /bin/html-build 13 | 14 | COPY --from=ghcr.io/whatwg/wattsi:latest /whatwg/wattsi/bin/wattsi /bin/wattsi 15 | 16 | ENV PIPX_HOME /opt/pipx 17 | ENV PIPX_BIN_DIR /usr/bin 18 | RUN pipx install bs-highlighter 19 | 20 | COPY . /whatwg/html-build/ 21 | 22 | ENV SKIP_BUILD_UPDATE_CHECK true 23 | ENTRYPOINT ["bash", "/whatwg/html-build/build.sh"] 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright © WHATWG (Apple, Google, Mozilla, Microsoft). 2 | 3 | This work is licensed under a Creative Commons Attribution 4.0 International 4 | License. 
To the extent portions of it are incorporated into source code, 5 | such portions in the source code are licensed under the BSD 3-Clause License instead. 6 | 7 | - - - - 8 | 9 | Creative Commons Attribution 4.0 International Public License 10 | 11 | By exercising the Licensed Rights (defined below), You accept and agree 12 | to be bound by the terms and conditions of this Creative Commons 13 | Attribution 4.0 International Public License ("Public License"). To the 14 | extent this Public License may be interpreted as a contract, You are 15 | granted the Licensed Rights in consideration of Your acceptance of 16 | these terms and conditions, and the Licensor grants You such rights in 17 | consideration of benefits the Licensor receives from making the 18 | Licensed Material available under these terms and conditions. 19 | 20 | 21 | Section 1 -- Definitions. 22 | 23 | a. Adapted Material means material subject to Copyright and Similar 24 | Rights that is derived from or based upon the Licensed Material 25 | and in which the Licensed Material is translated, altered, 26 | arranged, transformed, or otherwise modified in a manner requiring 27 | permission under the Copyright and Similar Rights held by the 28 | Licensor. For purposes of this Public License, where the Licensed 29 | Material is a musical work, performance, or sound recording, 30 | Adapted Material is always produced where the Licensed Material is 31 | synched in timed relation with a moving image. 32 | 33 | b. Adapter's License means the license You apply to Your Copyright 34 | and Similar Rights in Your contributions to Adapted Material in 35 | accordance with the terms and conditions of this Public License. 36 | 37 | c. Copyright and Similar Rights means copyright and/or similar rights 38 | closely related to copyright including, without limitation, 39 | performance, broadcast, sound recording, and Sui Generis Database 40 | Rights, without regard to how the rights are labeled or 41 | categorized. 
For purposes of this Public License, the rights 42 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 43 | Rights. 44 | 45 | d. Effective Technological Measures means those measures that, in the 46 | absence of proper authority, may not be circumvented under laws 47 | fulfilling obligations under Article 11 of the WIPO Copyright 48 | Treaty adopted on December 20, 1996, and/or similar international 49 | agreements. 50 | 51 | e. Exceptions and Limitations means fair use, fair dealing, and/or 52 | any other exception or limitation to Copyright and Similar Rights 53 | that applies to Your use of the Licensed Material. 54 | 55 | f. Licensed Material means the artistic or literary work, database, 56 | or other material to which the Licensor applied this Public 57 | License. 58 | 59 | g. Licensed Rights means the rights granted to You subject to the 60 | terms and conditions of this Public License, which are limited to 61 | all Copyright and Similar Rights that apply to Your use of the 62 | Licensed Material and that the Licensor has authority to license. 63 | 64 | h. Licensor means the individual(s) or entity(ies) granting rights 65 | under this Public License. 66 | 67 | i. Share means to provide material to the public by any means or 68 | process that requires permission under the Licensed Rights, such 69 | as reproduction, public display, public performance, distribution, 70 | dissemination, communication, or importation, and to make material 71 | available to the public including in ways that members of the 72 | public may access the material from a place and at a time 73 | individually chosen by them. 74 | 75 | j. Sui Generis Database Rights means rights other than copyright 76 | resulting from Directive 96/9/EC of the European Parliament and of 77 | the Council of 11 March 1996 on the legal protection of databases, 78 | as amended and/or succeeded, as well as other essentially 79 | equivalent rights anywhere in the world. 80 | 81 | k. 
You means the individual or entity exercising the Licensed Rights 82 | under this Public License. Your has a corresponding meaning. 83 | 84 | 85 | Section 2 -- Scope. 86 | 87 | a. License grant. 88 | 89 | 1. Subject to the terms and conditions of this Public License, 90 | the Licensor hereby grants You a worldwide, royalty-free, 91 | non-sublicensable, non-exclusive, irrevocable license to 92 | exercise the Licensed Rights in the Licensed Material to: 93 | 94 | a. reproduce and Share the Licensed Material, in whole or 95 | in part; and 96 | 97 | b. produce, reproduce, and Share Adapted Material. 98 | 99 | 2. Exceptions and Limitations. For the avoidance of doubt, where 100 | Exceptions and Limitations apply to Your use, this Public 101 | License does not apply, and You do not need to comply with 102 | its terms and conditions. 103 | 104 | 3. Term. The term of this Public License is specified in Section 105 | 6(a). 106 | 107 | 4. Media and formats; technical modifications allowed. The 108 | Licensor authorizes You to exercise the Licensed Rights in 109 | all media and formats whether now known or hereafter created, 110 | and to make technical modifications necessary to do so. The 111 | Licensor waives and/or agrees not to assert any right or 112 | authority to forbid You from making technical modifications 113 | necessary to exercise the Licensed Rights, including 114 | technical modifications necessary to circumvent Effective 115 | Technological Measures. For purposes of this Public License, 116 | simply making modifications authorized by this Section 2(a) 117 | (4) never produces Adapted Material. 118 | 119 | 5. Downstream recipients. 120 | 121 | a. Offer from the Licensor -- Licensed Material. Every 122 | recipient of the Licensed Material automatically 123 | receives an offer from the Licensor to exercise the 124 | Licensed Rights under the terms and conditions of this 125 | Public License. 126 | 127 | b. No downstream restrictions. 
You may not offer or impose 128 | any additional or different terms or conditions on, or 129 | apply any Effective Technological Measures to, the 130 | Licensed Material if doing so restricts exercise of the 131 | Licensed Rights by any recipient of the Licensed 132 | Material. 133 | 134 | 6. No endorsement. Nothing in this Public License constitutes or 135 | may be construed as permission to assert or imply that You 136 | are, or that Your use of the Licensed Material is, connected 137 | with, or sponsored, endorsed, or granted official status by, 138 | the Licensor or others designated to receive attribution as 139 | provided in Section 3(a)(1)(A)(i). 140 | 141 | b. Other rights. 142 | 143 | 1. Moral rights, such as the right of integrity, are not 144 | licensed under this Public License, nor are publicity, 145 | privacy, and/or other similar personality rights; however, to 146 | the extent possible, the Licensor waives and/or agrees not to 147 | assert any such rights held by the Licensor to the limited 148 | extent necessary to allow You to exercise the Licensed 149 | Rights, but not otherwise. 150 | 151 | 2. Patent and trademark rights are not licensed under this 152 | Public License. 153 | 154 | 3. To the extent possible, the Licensor waives any right to 155 | collect royalties from You for the exercise of the Licensed 156 | Rights, whether directly or through a collecting society 157 | under any voluntary or waivable statutory or compulsory 158 | licensing scheme. In all other cases the Licensor expressly 159 | reserves any right to collect such royalties. 160 | 161 | 162 | Section 3 -- License Conditions. 163 | 164 | Your exercise of the Licensed Rights is expressly made subject to the 165 | following conditions. 166 | 167 | a. Attribution. 168 | 169 | 1. If You Share the Licensed Material (including in modified 170 | form), You must: 171 | 172 | a. retain the following if it is supplied by the Licensor 173 | with the Licensed Material: 174 | 175 | i. 
identification of the creator(s) of the Licensed 176 | Material and any others designated to receive 177 | attribution, in any reasonable manner requested by 178 | the Licensor (including by pseudonym if 179 | designated); 180 | 181 | ii. a copyright notice; 182 | 183 | iii. a notice that refers to this Public License; 184 | 185 | iv. a notice that refers to the disclaimer of 186 | warranties; 187 | 188 | v. a URI or hyperlink to the Licensed Material to the 189 | extent reasonably practicable; 190 | 191 | b. indicate if You modified the Licensed Material and 192 | retain an indication of any previous modifications; and 193 | 194 | c. indicate the Licensed Material is licensed under this 195 | Public License, and include the text of, or the URI or 196 | hyperlink to, this Public License. 197 | 198 | 2. You may satisfy the conditions in Section 3(a)(1) in any 199 | reasonable manner based on the medium, means, and context in 200 | which You Share the Licensed Material. For example, it may be 201 | reasonable to satisfy the conditions by providing a URI or 202 | hyperlink to a resource that includes the required 203 | information. 204 | 205 | 3. If requested by the Licensor, You must remove any of the 206 | information required by Section 3(a)(1)(A) to the extent 207 | reasonably practicable. 208 | 209 | 4. If You Share Adapted Material You produce, the Adapter's 210 | License You apply must not prevent recipients of the Adapted 211 | Material from complying with this Public License. 212 | 213 | 214 | Section 4 -- Sui Generis Database Rights. 215 | 216 | Where the Licensed Rights include Sui Generis Database Rights that 217 | apply to Your use of the Licensed Material: 218 | 219 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 220 | to extract, reuse, reproduce, and Share all or a substantial 221 | portion of the contents of the database; 222 | 223 | b. 
if You include all or a substantial portion of the database 224 | contents in a database in which You have Sui Generis Database 225 | Rights, then the database in which You have Sui Generis Database 226 | Rights (but not its individual contents) is Adapted Material; and 227 | 228 | c. You must comply with the conditions in Section 3(a) if You Share 229 | all or a substantial portion of the contents of the database. 230 | 231 | For the avoidance of doubt, this Section 4 supplements and does not 232 | replace Your obligations under this Public License where the Licensed 233 | Rights include other Copyright and Similar Rights. 234 | 235 | 236 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 237 | 238 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 239 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 240 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 241 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 242 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 243 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 244 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 245 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 246 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 247 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 248 | 249 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 250 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 251 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 252 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 253 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 254 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 255 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 256 | DAMAGES. 
WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 257 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 258 | 259 | c. The disclaimer of warranties and limitation of liability provided 260 | above shall be interpreted in a manner that, to the extent 261 | possible, most closely approximates an absolute disclaimer and 262 | waiver of all liability. 263 | 264 | 265 | Section 6 -- Term and Termination. 266 | 267 | a. This Public License applies for the term of the Copyright and 268 | Similar Rights licensed here. However, if You fail to comply with 269 | this Public License, then Your rights under this Public License 270 | terminate automatically. 271 | 272 | b. Where Your right to use the Licensed Material has terminated under 273 | Section 6(a), it reinstates: 274 | 275 | 1. automatically as of the date the violation is cured, provided 276 | it is cured within 30 days of Your discovery of the 277 | violation; or 278 | 279 | 2. upon express reinstatement by the Licensor. 280 | 281 | For the avoidance of doubt, this Section 6(b) does not affect any 282 | right the Licensor may have to seek remedies for Your violations 283 | of this Public License. 284 | 285 | c. For the avoidance of doubt, the Licensor may also offer the 286 | Licensed Material under separate terms or conditions or stop 287 | distributing the Licensed Material at any time; however, doing so 288 | will not terminate this Public License. 289 | 290 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 291 | License. 292 | 293 | 294 | Section 7 -- Other Terms and Conditions. 295 | 296 | a. The Licensor shall not be bound by any additional or different 297 | terms or conditions communicated by You unless expressly agreed. 298 | 299 | b. Any arrangements, understandings, or agreements regarding the 300 | Licensed Material not stated herein are separate from and 301 | independent of the terms and conditions of this Public License. 302 | 303 | 304 | Section 8 -- Interpretation. 
305 | 306 | a. For the avoidance of doubt, this Public License does not, and 307 | shall not be interpreted to, reduce, limit, restrict, or impose 308 | conditions on any use of the Licensed Material that could lawfully 309 | be made without permission under this Public License. 310 | 311 | b. To the extent possible, if any provision of this Public License is 312 | deemed unenforceable, it shall be automatically reformed to the 313 | minimum extent necessary to make it enforceable. If the provision 314 | cannot be reformed, it shall be severed from this Public License 315 | without affecting the enforceability of the remaining terms and 316 | conditions. 317 | 318 | c. No term or condition of this Public License will be waived and no 319 | failure to comply consented to unless expressly agreed to by the 320 | Licensor. 321 | 322 | d. Nothing in this Public License constitutes or may be interpreted 323 | as a limitation upon, or waiver of, any privileges and immunities 324 | that apply to the Licensor or You, including from the legal 325 | processes of any jurisdiction or authority. 326 | 327 | - - - - 328 | 329 | BSD 3-Clause License 330 | 331 | Redistribution and use in source and binary forms, with or without 332 | modification, are permitted provided that the following conditions are met: 333 | 334 | 1. Redistributions of source code must retain the above copyright notice, this 335 | list of conditions and the following disclaimer. 336 | 337 | 2. Redistributions in binary form must reproduce the above copyright notice, 338 | this list of conditions and the following disclaimer in the documentation 339 | and/or other materials provided with the distribution. 340 | 341 | 3. Neither the name of the copyright holder nor the names of its 342 | contributors may be used to endorse or promote products derived from 343 | this software without specific prior written permission. 
344 | 345 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 346 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 347 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 348 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 349 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 350 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 351 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 352 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 353 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 354 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 355 | 356 | - - - - 357 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HTML Build Tools 2 | 3 | This repository contains the tools and instructions necessary for building the [HTML Standard](https://html.spec.whatwg.org/multipage/) from its [source](https://github.com/whatwg/html). 4 | 5 | ## Getting set up 6 | 7 | Make sure you have `git` installed on your system, and you are using a Bash shell. (On Windows, `cmd.exe` will not work, but the Git Bash shell that comes with [Git for Windows](https://git-for-windows.github.io/) works nicely.) 
8 | 9 | Then, clone this ([html-build](https://github.com/whatwg/html-build)) repo: 10 | 11 | ```bash 12 | git clone https://github.com/whatwg/html-build.git && cd html-build 13 | ``` 14 | 15 | ## Performing a build 16 | 17 | You have a decision to make as to how you want to do your builds: 18 | 19 | - Locally on your computer 20 | - Remotely, using the [build server](https://github.com/whatwg/build.whatwg.org) 21 | - Using a [Docker](https://www.docker.com/) container 22 | 23 | Local builds will be fastest, but require installing a lot of prerequisites. Using the build server is easiest, but slowest. Docker has speed close to a local build, and only requires Docker as a prerequisite. 24 | 25 | ### Building locally 26 | 27 | #### Prerequisites 28 | 29 | To build locally, you'll need the following commands installed on your system: 30 | 31 | - `curl`, `grep`, `perl`, `unzip`, `cargo` 32 | 33 | Optionally, for faster builds, you can install [Wattsi](https://github.com/whatwg/wattsi). If you don't bother with that, we will use the [build server](https://github.com/whatwg/build.whatwg.org), which requires an internet connection. 34 | 35 | If you're using a local install of Wattsi, then optionally, you can install Python 3.7+ with [pipx](https://pypa.github.io/pipx/), to enable syntax highlighting of `pre` contents. 36 | 37 | #### Running the build 38 | 39 | Run the `build.sh` script from inside your `html-build` working directory, like this: 40 | 41 | ```bash 42 | ./build.sh 43 | ``` 44 | 45 | The first time this runs, it will look up for the HTML source from a `../html` folder, if it exists. Otherwise, it may ask for your input on where to clone the HTML source from, or where on your system to find it if you've already done that. If you're working to submit a pull request to [whatwg/html](https://github.com/whatwg/html), be sure to give it the URL of your fork. 
46 | 47 | You may also set the environment variable `$HTML_SOURCE` to use a custom location for the HTML source. For example: 48 | 49 | ```bash 50 | HTML_SOURCE=~/hacks/dhtml ./build.sh 51 | ``` 52 | 53 | ### Building using the build server 54 | 55 | To use the build server, use the `--remote` flag: 56 | 57 | ```bash 58 | ./build.sh --remote 59 | ``` 60 | 61 | This will ZIP up most of the files in the `html/` directory, send them to the build server, get back another ZIP file with the output, and unzip those into the output folder. 62 | 63 | You will need `zip` and `unzip` commands available in your `$PATH`. 64 | 65 | ### Building using a Docker container 66 | 67 | The Dockerized version of the build allows you to run the build entirely inside a "container" (lightweight virtual machine). This includes tricky dependencies like a local copy of Wattsi and Python. 68 | 69 | To perform a Dockerized build, use the `--docker` flag: 70 | 71 | ```bash 72 | ./build.sh --docker 73 | ``` 74 | 75 | The first time you do this, Docker will download a bunch of stuff to set up the container properly, but subsequent runs will simply build the standard and be very fast. 76 | 77 | If you get permissions errors on Windows, you need to first [configure](https://docs.docker.com/docker-for-windows/#file-sharing) your `html-build/` and `html/` directories to be shareable with Docker. 78 | 79 | ## Output 80 | 81 | After you complete the build steps above, the build will run and generate the single-page version of the spec, the multipage version, and more. If all goes well, you should very soon have an `output/` directory containing important files like `index.html`, `multipage/`, and `dev/`. 82 | 83 | You can also use the `--serve` option to `build.sh` to automatically serve the results on `http://localhost:8080/` after building (as long as you have Python 3.7+ installed). 
84 | 85 | Now you're ready to edit the `html/source` file—and after you make your changes, you can run the `build.sh` script again to see the new output. 86 | 87 | ## Fast local iteration 88 | 89 | There are a number of options to disable certain parts of the build process to speed up local iteration. Run `./build.sh help` to see them all, or just use the `--fast` flag to get maximally-fast builds. 90 | 91 | ## A note on Git history 92 | 93 | Your clone doesn't need the HTML standard's complete revision history just for you to build the spec and contribute patches. So, if you use `build.sh` to create the clone, we don't start you out with a clone of the history. That makes your first build finish much faster. And if later you decide you do want to clone the complete history, you can still get it, by doing this: 94 | 95 | ```bash 96 | cd ./html && git fetch --unshallow 97 | ``` 98 | 99 | That said, if you really do want to *start out* with the complete history of the repo, then run the build script for the first time like this: 100 | 101 | ```bash 102 | HTML_GIT_CLONE_OPTIONS="" ./build.sh 103 | ``` 104 | 105 | That will clone the complete history for you. But be warned: It'll make your first build take *dramatically* longer to finish! 106 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | set -o nounset 4 | set -o pipefail 5 | 6 | # cd to the directory containing this script 7 | cd "$(dirname "$0")" 8 | DIR=$(pwd) 9 | 10 | # The latest required version of Wattsi. Update this if you change how ./build.sh invokes Wattsi; 11 | # it will cause a warning if Wattsi's self-reported version is lower. Note that there's no need to 12 | # update this on every revision of Wattsi; only do so when a warning is justified. 
declare -r WATTSI_LATEST=140

# Shared state variables throughout this script
LOCAL_WATTSI=true
WATTSI_RESULT=0
DO_UPDATE=true
DO_LINT=true
DO_HIGHLIGHT=true
SINGLE_PAGE_ONLY=false
USE_DOCKER=false
USE_SERVER=false
VERBOSE=false
QUIET=false
SERVE=false
HTML_SHA=""
HIGHLIGHT_SERVER_PID=""

# Can be set from the outside to customize the script, but the defaults are usually fine. (Only
# $HTML_SOURCE is documented.) $HTML_SOURCE will be determined inside the main function.
HTML_SOURCE=${HTML_SOURCE:-}
HTML_CACHE=${HTML_CACHE:-$DIR/.cache}
HTML_TEMP=${HTML_TEMP:-$DIR/.temp}
HTML_OUTPUT=${HTML_OUTPUT:-$DIR/output}
HTML_GIT_CLONE_OPTIONS=${HTML_GIT_CLONE_OPTIONS:-"--depth=2"}

# This is used by child scripts, and so we export it
export HTML_CACHE

# Used specifically when the Dockerfile calls this script
SKIP_BUILD_UPDATE_CHECK=${SKIP_BUILD_UPDATE_CHECK:-false}
SHA_OVERRIDE=${SHA_OVERRIDE:-}
BUILD_SHA_OVERRIDE=${BUILD_SHA_OVERRIDE:-}

# This needs to be coordinated with the bs-highlighter package
declare -r HIGHLIGHT_SERVER_URL="http://127.0.0.1:8080"

declare -r SERVE_PORT=8080

# Top-level driver: parses arguments, locates the HTML source, then dispatches to the Docker,
# build-server, or local build path.
function main {
  processCommandLineArgs "$@"

  # $SKIP_BUILD_UPDATE_CHECK is set inside the Dockerfile so that we don't check for updates both inside and outside
  # the Docker container.
  if [[ $DO_UPDATE == "true" && $SKIP_BUILD_UPDATE_CHECK != "true" ]]; then
    checkHTMLBuildIsUpToDate
  fi

  findHTMLSource

  clearDir "$HTML_OUTPUT"
  # Set these up so rsync will not complain about either being missing
  mkdir -p "$HTML_OUTPUT/commit-snapshots"
  mkdir -p "$HTML_OUTPUT/review-drafts"

  clearCacheIfNecessary

  local html_git_dir="$HTML_SOURCE/.git/"
  HTML_SHA=${SHA_OVERRIDE:-$(git --git-dir="$html_git_dir" rev-parse HEAD)}

  if [[ $USE_DOCKER == "true" ]]; then
    doDockerBuild
    exit 0
  fi

  if [[ $USE_SERVER == "true" ]]; then
    doServerBuild

    if [[ $SERVE == "true" ]]; then
      cd "$HTML_OUTPUT"
      python3 -m http.server "$SERVE_PORT"
    fi

    exit 0
  fi

  checkWattsi
  ensureHighlighterInstalled

  doLint

  updateRemoteDataFiles

  startHighlightServer

  processSource "source" "default"

  if [[ -e "$html_git_dir" ]]; then
    # This is based on https://github.com/whatwg/whatwg.org/pull/201 and should be kept synchronized
    # with that.
    local changed_files
    changed_files=$(git --git-dir="$html_git_dir" show --format="format:" --name-only HEAD)

    local changed
    for changed in $changed_files; do # Omit quotes around variable to split on whitespace
      if ! [[ "$changed" =~ ^review-drafts/.*.wattsi$ ]]; then
        continue
      fi
      processSource "$changed" "review"
    done
  else
    echo ""
    echo "Skipping review draft production as the .git directory is not present"
    echo "(This always happens if you use the --docker or --remote options.)"
  fi

  $QUIET || echo
  $QUIET || echo "Success!"

  if [[ $SERVE == "true" ]]; then
    stopHighlightServer
    cd "$HTML_OUTPUT"
    python3 -m http.server "$SERVE_PORT"
  fi
}

# Processes incoming command-line arguments
# Arguments: all arguments to this shell script
# Output:
# - If the clean or help commands are given, perform them
# - Otherwise, sets the $DO_UPDATE, $USE_DOCKER, $QUIET, and $VERBOSE variables appropriately
function processCommandLineArgs {
  local arg
  for arg in "$@"
  do
    case $arg in
      clean)
        clearDir "$HTML_CACHE"
        exit 0
        ;;
      --help|help)
        echo "Commands:"
        echo "  $0                  Build the HTML Standard."
        echo "  $0 clean            Remove downloaded dependencies and generated files (then stop)."
        echo "  $0 help             Show this usage statement."
        echo
        echo "Build options:"
        echo "  -d|--docker         Use Docker to build in a container."
        echo "  -r|--remote         Use the build server."
        echo "  -s|--serve          After building, serve the results on http://localhost:$SERVE_PORT."
        echo "  -n|--no-update      Don't update before building; just build."
        echo "  -l|--no-lint        Don't lint before building; just build."
        echo "  -h|--no-highlight   Don't syntax-highlight the output."
        echo "  -p|--single-page    Only build the single-page variant of the spec."
        echo "  -f|--fast           Alias for --no-update --no-lint --no-highlight --single-page."
        echo "  -q|--quiet          Don't emit any messages except errors/warnings."
        echo "  -v|--verbose        Show verbose output from every build step."
        exit 0
        ;;
      -n|--no-update|--no-updates)
        DO_UPDATE=false
        ;;
      -l|--no-lint)
        DO_LINT=false
        ;;
      -h|--no-highlight)
        DO_HIGHLIGHT=false
        ;;
      -p|--single-page)
        SINGLE_PAGE_ONLY=true
        ;;
      -f|--fast)
        DO_UPDATE=false
        DO_LINT=false
        DO_HIGHLIGHT=false
        SINGLE_PAGE_ONLY=true
        ;;
      -d|--docker)
        USE_DOCKER=true
        ;;
      -r|--remote)
        USE_SERVER=true
        ;;
      -q|--quiet)
        QUIET=true
        VERBOSE=false
        ;;
      -v|--verbose)
        VERBOSE=true
        QUIET=false
        set -vx
        ;;
      -s|--serve)
        SERVE=true
        ;;
      *)
        ;;
    esac
  done

  if [[ $USE_DOCKER == "true" && $USE_SERVER == "true" ]]; then
    echo "Error: --docker and --remote are mutually exclusive."
    exit 1
  fi
}

# Checks if the html-build repository is up to date
# Arguments: none
# Output: will tell the user and exit the script with code 1 if not up to date
function checkHTMLBuildIsUpToDate {
  $QUIET || echo "Checking if html-build is up to date..."

  # TODO: `git remote get-url origin` is nicer, but new in Git 2.7.
  local origin_url
  origin_url=$(git config --get remote.origin.url)

  local git_fetch_args=()
  if ! $VERBOSE ; then
    git_fetch_args+=( --quiet )
  fi
  git_fetch_args+=( "$origin_url" main)
  git fetch "${git_fetch_args[@]}"

  local new_commits
  new_commits=$(git rev-list --count HEAD..FETCH_HEAD)
  if [[ $new_commits != "0" ]]; then
    $QUIET || echo
    echo -n "Your local branch is $new_commits "
    [[ $new_commits == "1" ]] && echo -n "commit" || echo -n "commits"
    echo " behind $origin_url:"
    git --no-pager log --oneline HEAD..FETCH_HEAD
    echo
    echo "To update, run this command:"
    echo
    echo "  git pull --rebase origin main"
    echo
    echo "This check can be bypassed with the --no-update option."
    exit 1
  fi
}

# Tries to install the bs-highlighter Python package if necessary
# - Arguments: none
# - Output:
#   - Either bs-highlighter-server will be in the $PATH, or $DO_HIGHTLIGHT will be set to false and
#     a warning will be echoed.
function ensureHighlighterInstalled {
  # If we're not using local Wattsi then we won't use the local highlighter.
  if [[ $LOCAL_WATTSI == "true" && $DO_HIGHLIGHT == "true" ]]; then
    if hash pipx 2>/dev/null; then
      if ! hash bs-highlighter-server 2>/dev/null; then
        pipx install bs-highlighter
      fi
    else
      echo
      echo "Warning: could not find pipx in your PATH. Disabling syntax highlighting."
      echo
      DO_HIGHLIGHT="false"
    fi
  fi
}

# Runs the lint.sh script, if requested
# - Arguments: none
# - Output:
#   - Will echo any errors and exit the script with error code 1 if lint fails.
function doLint {
  if [[ $DO_LINT == "false" ]]; then
    return
  fi

  $QUIET || echo "Linting the source file..."
  ./lint.sh "$HTML_SOURCE/source" || {
    echo
    echo "There were lint errors. Stopping."
    exit 1
  }
}

# Finds the location of the HTML Standard, and stores it in the HTML_SOURCE variable.
# It either guesses based on directory structure, or interactively prompts the user.
# - Arguments: none
# - Output:
#   - Sets $HTML_SOURCE
function findHTMLSource {
  $QUIET || echo "Looking for the HTML source (set HTML_SOURCE to override)..."
  if [[ $HTML_SOURCE == "" ]]; then
    local parent_dir
    parent_dir=$(dirname "$DIR")

    if [[ -f "$parent_dir/html/source" ]]; then
      HTML_SOURCE="$parent_dir/html"
      $QUIET || echo "Found $HTML_SOURCE (alongside html-build)..."
    else
      if [[ -f "$DIR/html/source" ]]; then
        HTML_SOURCE="$DIR/html"
        $QUIET || echo "Found $HTML_SOURCE (inside html-build)..."
      else
        $QUIET || echo "Didn't find the HTML source on your system..."
        chooseRepo
      fi
    fi
  else
    if [[ -f "$HTML_SOURCE/source" ]]; then
      $QUIET || echo "Found $HTML_SOURCE (from HTML_SOURCE)..."
    else
      $QUIET || echo "Looked in the $HTML_SOURCE directory but didn't find HTML source there..."
      HTML_SOURCE=""
      chooseRepo
    fi
  fi

  export HTML_SOURCE
}

# Interactively prompts the user for where their HTML source file is.
# - Arguments: none
# - Output:
#   - Sets $HTML_SOURCE
function chooseRepo {
  echo
  echo "What HTML source would you like to build from?"
  echo
  echo "1) Use an existing clone on my local filesystem."
  echo "2) Create a clone from https://github.com/whatwg/html."
  echo "3) Create a clone from an existing fork, by GitHub username."
  echo "4) Create a clone from an existing fork, by custom URL."
  echo "5) Quit"
  echo

  local choice
  read -r -e -p "Choose 1-5: " choice
  if [[ $choice == "1" ]]; then
    read -r -e -p "Path to your existing clone: "
    HTML_SOURCE=$(echo "$REPLY" | xargs) # trims leading/trailing space
    if [[ $HTML_SOURCE = "" ]]; then
      chooseRepo
    fi
    confirmRepo
  elif [[ $choice == "2" ]]; then
    HTML_REPO="https://github.com/whatwg/html.git"
    confirmRepo
  elif [[ $choice == "3" ]]; then
    echo

    local gh_username
    read -r -e -p "GitHub username of fork owner: " gh_username
    gh_username=$(echo "$gh_username" | xargs) # trims leading/trailing space
    if [[ $gh_username == "" ]]; then
      chooseRepo
    fi
    echo
    echo "Does a fork already exist at https://github.com/$gh_username/html?"
    echo
    read -r -e -p "Y or N? " yn
    if [[ $yn == "y" || $yn == "Y" ]]; then
      HTML_REPO="https://github.com/$gh_username/html.git"
      confirmRepo
    else
      echo
      echo "Before proceeding, first go to https://github.com/whatwg/html and create a fork."
      exit
    fi
  elif [[ $choice == "4" ]]; then
    echo
    read -r -e -p "URL: "
    REPLY=$(echo "$REPLY" | xargs) # trims leading/trailing space
    if [[ $REPLY == "" ]]; then
      chooseRepo
    fi
    HTML_REPO=$REPLY
    confirmRepo
  elif [[ $choice == "5" || $choice == "q" || $choice == "Q" ]]; then
    echo
    echo "Can't build without a source repo to build from. Quitting..."
    exit
  else
    chooseRepo
  fi
}

# Confirms the currently-set HTML_SOURCE with the user, or clones HTML_REPO into HTML_SOURCE
# - Arguments: none
# - Output:
#   - $HTML_SOURCE will now point to a folder containing the HTML Standard
function confirmRepo {
  if [[ $HTML_SOURCE != "" ]]; then
    if [[ -f "$HTML_SOURCE/source" ]]; then
      echo
      echo "OK, build from the $HTML_SOURCE/source file?"
      echo

      local build_yn
      # Bug fix: this previously read into `yn` while the test below checks `build_yn`, so the
      # answer could never be "yes" and the prompt looped back to chooseRepo forever.
      read -r -e -p "Y or N? " build_yn
      if [[ $build_yn == "y" || $build_yn == "Y" ]]; then
        return
      else
        HTML_SOURCE=""
        chooseRepo
      fi
    else
      echo
      echo "$HTML_SOURCE/source file doesn't exist. Please choose another option."
      HTML_SOURCE=""
      chooseRepo
    fi
    return
  fi
  HTML_SOURCE=${HTML_SOURCE:-$DIR/html}
  echo
  echo "OK, clone from $HTML_REPO?"
  echo

  local clone_yn
  read -r -e -p "Y or N? " clone_yn

  # $HTML_GIT_CLONE_OPTIONS may legitimately be empty (the README documents
  # HTML_GIT_CLONE_OPTIONS="" to clone the full history) or contain several options; quoting it as
  # a single array element would pass git an empty/combined argument, so split it deliberately.
  local git_clone_args=()
  if [[ -n $HTML_GIT_CLONE_OPTIONS ]]; then
    # shellcheck disable=SC2206 # intentional word splitting of user-provided options
    git_clone_args+=( $HTML_GIT_CLONE_OPTIONS )
  fi
  $QUIET && git_clone_args+=( --quiet )
  $VERBOSE && git_clone_args+=( --verbose )
  git_clone_args+=( "$HTML_REPO" "$HTML_SOURCE" )
  if [[ $clone_yn == "y" || $clone_yn == "Y" ]]; then
    git clone "${git_clone_args[@]}"
  else
    HTML_SOURCE=""
    chooseRepo
  fi
}

# Gives the relative path to $2 from $1
# From http://stackoverflow.com/a/12498485
# - Arguments:
#   - $1: absolute path beginning with /
#   - $2: absolute path beginning with /
# - Output:
#   - Echoes the relative path
function relativePath {
  local source=$1
  local target=$2

  local commonPart=$source
  local result=""

  while [[ "${target#"$commonPart"}" == "${target}" ]]; do
    # no match, means that candidate common part is not correct
    # go up one level (reduce common part)
    commonPart=$(dirname "$commonPart")
    # and record that we went back, with correct / handling
    if [[ $result == "" ]]; then
      result=".."
    else
      result="../$result"
    fi
  done

  if [[ $commonPart == "/" ]]; then
    # special case for root (no common path)
    result="$result/"
  fi

  # since we now have identified the common part,
  # compute the non-common part
  local forwardPart="${target#"$commonPart"}"

  # and now stick all parts together
  if [[ $result != "" ]] && [[ $forwardPart != "" ]]; then
    result="$result$forwardPart"
  elif [[ $forwardPart != "" ]]; then
    # extra slash removal
    result="${forwardPart:1}"
  fi

  echo "$result"
}

# Performs the build using Docker, essentially running this script again inside the container.
# Arguments: none
# Output: A web server with the build output will be running inside the Docker container
function doDockerBuild {
  # Ensure ghcr.io/whatwg/wattsi:latest is up to date. Without this, the locally cached copy would
  # be used, i.e. once Wattsi was downloaded once, it would never update. Note that this is fast
  # (zero-transfer) if the locally cached copy is already up to date.
  local pull_args=()
  if $QUIET; then
    pull_args+=( --quiet )
  fi
  pull_args+=( ghcr.io/whatwg/wattsi:latest )
  docker pull "${pull_args[@]}"

  # Build the html-build image itself, tagged so that `docker run` below can find it.
  local build_args=( --tag whatwg-html )
  if $QUIET; then
    build_args+=( --quiet )
  fi
  docker build "${build_args[@]}" .

  # Assemble the `docker run` argument list. Everything placed before the image name (whatwg-html)
  # is an option for `docker run` itself; everything after it is forwarded to the container's
  # entry point, i.e. this script running inside the container.
  local run_args=()
  if $SERVE; then
    run_args+=( --publish "$SERVE_PORT:$SERVE_PORT" )
  fi
  run_args+=( whatwg-html )
  if $QUIET; then
    run_args+=( --quiet )
  fi
  if $VERBOSE; then
    run_args+=( --verbose )
  fi
  if ! $DO_UPDATE; then
    run_args+=( --no-update )
  fi
  if ! $DO_LINT; then
    run_args+=( --no-lint )
  fi
  if ! $DO_HIGHLIGHT; then
    run_args+=( --no-highlight )
  fi
  if $SINGLE_PAGE_ONLY; then
    run_args+=( --single-page )
  fi
  if $SERVE; then
    run_args+=( --serve )
  fi

  # Pass in the html-build SHA (since there's no .git directory inside the container)
  docker run --rm --interactive --tty \
    --env "BUILD_SHA_OVERRIDE=$(git rev-parse HEAD)" \
    --mount "type=bind,source=$HTML_SOURCE,destination=/whatwg/html-build/html,readonly=1" \
    --mount "type=bind,source=$HTML_CACHE,destination=/whatwg/html-build/.cache" \
    --mount "type=bind,source=$HTML_OUTPUT,destination=/whatwg/html-build/output" \
    "${run_args[@]}"
}

# Performs the build using the build server, zipping up the input, sending it to the server, and
514 | # Output: the $HTML_OUTPUT directory will contain the built files 515 | function doServerBuild { 516 | clearDir "$HTML_TEMP" 517 | 518 | local input_zip="build-server-input.zip" 519 | local build_server_output="build-server-output" 520 | local build_server_headers="build-server-headers.txt" 521 | 522 | # Keep include list in sync with `processSource` 523 | # 524 | # We use an allowlist (--include) instead of a blocklist (--exclude) to avoid accidentally 525 | # sending files that the user might not anticipate sending to a remote server, e.g. their 526 | # private-notes-on-current-pull-request.txt. 527 | # 528 | # The contents of fonts/, images/, and dev/ are not round-tripped to the server, but instead 529 | # copied below in this function. (We still send the directories to avoid the build script on the 530 | # server getting confused about their absence.) demos/ needs to be sent in full for inlining. 531 | local zip_args=( 532 | --recurse-paths "$HTML_TEMP/$input_zip" . \ 533 | --include ./source ./404.html ./link-fixup.js ./html-dfn.js ./styles.css \ 534 | ./fonts/ ./images/ ./dev/ ./demos/\* 535 | ) 536 | $QUIET && zip_args+=( --quiet ) 537 | (cd "$HTML_SOURCE" && zip "${zip_args[@]}") 538 | 539 | local query_params=() 540 | $QUIET && query_params+=( quiet ) 541 | $VERBOSE && query_params+=( verbose ) 542 | $DO_UPDATE || query_params+=( no-update ) 543 | $DO_LINT || query_params+=( no-lint ) 544 | $DO_HIGHLIGHT || query_params+=( no-highlight ) 545 | $SINGLE_PAGE_ONLY && query_params+=( single-page ) 546 | 547 | $QUIET || echo 548 | $QUIET || echo "Sending files to the build server..." 
549 | 550 | local query_string 551 | query_string=$(joinBy "\&" "${query_params[@]-''}") 552 | local curl_url="https://build.whatwg.org/html-build?${query_string}" 553 | local curl_args=( "$curl_url" \ 554 | --form "html=@$HTML_TEMP/$input_zip" \ 555 | --form "sha=$HTML_SHA" \ 556 | --dump-header "$HTML_TEMP/$build_server_headers" \ 557 | --output "$HTML_TEMP/$build_server_output" ) 558 | $QUIET && curl_args+=( --silent ) 559 | $VERBOSE && curl_args+=( --verbose ) 560 | curl "${curl_args[@]}" 561 | 562 | # Read exit code from the Exit-Code header and assume failure if not found 563 | local build_server_result=1 564 | local name value 565 | while IFS=":" read -r name value; do 566 | shopt -s nocasematch 567 | if [[ $name == "Exit-Code" ]]; then 568 | build_server_result=$(echo "$value" | tr -d ' \r\n') 569 | break 570 | fi 571 | shopt -u nocasematch 572 | done < "$HTML_TEMP/$build_server_headers" 573 | 574 | if [[ $build_server_result != "0" ]]; then 575 | cat "$HTML_TEMP/$build_server_output" 576 | exit "$build_server_result" 577 | else 578 | local unzip_args=() 579 | # Note: Don't use the -v flag; it doesn't work in combination with -d 580 | if [[ "$VERBOSE" == "false" ]]; then 581 | unzip_args+=( -qq ) 582 | fi 583 | unzip_args+=( "$HTML_TEMP/$build_server_output" -d "$HTML_OUTPUT" ) 584 | unzip "${unzip_args[@]}" 585 | cp -pR "$HTML_SOURCE/fonts" "$HTML_OUTPUT" 586 | cp -pR "$HTML_SOURCE/images" "$HTML_OUTPUT" 587 | 588 | if [[ "$SINGLE_PAGE_ONLY" == "false" ]]; then 589 | cp -pR "$HTML_SOURCE/dev" "$HTML_OUTPUT" 590 | fi 591 | 592 | $QUIET || echo 593 | $QUIET || echo "Build server output:" 594 | cat "$HTML_OUTPUT/output.txt" 595 | rm "$HTML_OUTPUT/output.txt" 596 | fi 597 | } 598 | 599 | # Clears the $HTML_CACHE directory if the build tools have been updated since last run. 
600 | # Arguments: none 601 | # Output: 602 | # - $HTML_CACHE will be usable (possibly empty) 603 | function clearCacheIfNecessary { 604 | if [[ -d "$HTML_CACHE" ]]; then 605 | local prev_build_sha 606 | prev_build_sha=$( cat "$HTML_CACHE/last-build-sha.txt" 2>/dev/null || echo ) 607 | 608 | local current_build_sha 609 | current_build_sha=${BUILD_SHA_OVERRIDE:-$(git rev-parse HEAD)} 610 | 611 | if [[ "$prev_build_sha" != "$current_build_sha" ]]; then 612 | $QUIET || echo "Build tools have been updated since last run; clearing the cache..." 613 | DO_UPDATE=true 614 | clearDir "$HTML_CACHE" 615 | echo "$current_build_sha" > "$HTML_CACHE/last-build-sha.txt" 616 | fi 617 | else 618 | mkdir -p "$HTML_CACHE" 619 | fi 620 | } 621 | 622 | # Updates the mdn-spec-links-html.json file, if either $DO_UPDATE is true 623 | # or it is not yet cached. 624 | # Arguments: none 625 | # Output: 626 | # - $HTML_CACHE will contain a usable mdn-spec-links-html.json file 627 | function updateRemoteDataFiles { 628 | if [[ $DO_UPDATE == "true" || ! -f "$HTML_CACHE/mdn-spec-links-html.json" ]]; then 629 | rm -f "$HTML_CACHE/mdn-spec-links-html.json" 630 | $QUIET || echo "Downloading mdn-spec-links/html.json..." 631 | 632 | local curl_args=( "https://raw.githubusercontent.com/w3c/mdn-spec-links/master/html.json" \ 633 | --output "$HTML_CACHE/mdn-spec-links-html.json" \ 634 | --retry 2 ) 635 | if ! $VERBOSE; then 636 | curl_args+=( --silent ) 637 | fi 638 | curl "${curl_args[@]}" 639 | fi 640 | } 641 | 642 | # Performs a build of the HTML source file into the resulting output 643 | # - Arguments: 644 | # - $1: the filename of the source file within HTML_SOURCE (e.g. "source") 645 | # - $2: the build type, either "default" or "review" 646 | # - Output: 647 | # - $HTML_OUTPUT will contain the built files 648 | function processSource { 649 | local source_location="$1" 650 | local build_type="$2" 651 | 652 | clearDir "$HTML_TEMP" 653 | 654 | $QUIET || echo "Pre-processing the source..." 
655 | cp -p entities/out/entities.inc "$HTML_CACHE" 656 | cp -p entities/out/entities-dtd.url "$HTML_CACHE" 657 | if hash html-build 2>/dev/null; then 658 | html-build <"$HTML_SOURCE/$source_location" >"$HTML_TEMP/source-whatwg-complete" 659 | else 660 | local cargo_args=( --release ) 661 | $VERBOSE && cargo_args+=( --verbose ) 662 | $QUIET && cargo_args+=( --quiet ) 663 | cargo run "${cargo_args[@]}" <"$HTML_SOURCE/$source_location" >"$HTML_TEMP/source-whatwg-complete" 664 | fi 665 | 666 | runWattsi "$HTML_TEMP/source-whatwg-complete" "$HTML_TEMP/wattsi-output" 667 | if [[ $WATTSI_RESULT == "0" ]]; then 668 | if [[ $LOCAL_WATTSI != "true" ]]; then 669 | "$QUIET" || grep -v '^$' "$HTML_TEMP/wattsi-output.txt" # trim blank lines 670 | fi 671 | else 672 | if [[ $LOCAL_WATTSI != "true" ]]; then 673 | "$QUIET" || grep -v '^$' "$HTML_TEMP/wattsi-output.txt" # trim blank lines 674 | fi 675 | if [[ $WATTSI_RESULT == "65" ]]; then 676 | echo 677 | echo "There were errors. Running again to show the original line numbers." 678 | echo 679 | runWattsi "$HTML_SOURCE/$source_location" "$HTML_TEMP/wattsi-raw-source-output" 680 | if [[ $LOCAL_WATTSI != "true" ]]; then 681 | grep -v '^$' "$HTML_TEMP/wattsi-output.txt" # trim blank lines 682 | fi 683 | fi 684 | echo 685 | echo "There were errors. Stopping." 
686 | exit "$WATTSI_RESULT" 687 | fi 688 | 689 | # Keep the list of files copied from $HTML_SOURCE in sync with `doServerBuild` 690 | 691 | if [[ $build_type == "default" ]]; then 692 | # Singlepage HTML 693 | mv "$HTML_TEMP/wattsi-output/index-html" "$HTML_OUTPUT/index.html" 694 | 695 | if [[ $SINGLE_PAGE_ONLY == "false" ]]; then 696 | # Singlepage Commit Snapshot 697 | local commit_dir="$HTML_OUTPUT/commit-snapshots/$HTML_SHA" 698 | mkdir -p "$commit_dir" 699 | mv "$HTML_TEMP/wattsi-output/index-snap" "$commit_dir/index.html" 700 | 701 | # Multipage HTML and Dev Edition 702 | mv "$HTML_TEMP/wattsi-output/multipage-html" "$HTML_OUTPUT/multipage" 703 | mv "$HTML_TEMP/wattsi-output/multipage-dev" "$HTML_OUTPUT/dev" 704 | 705 | cp -pR "$HTML_SOURCE/dev" "$HTML_OUTPUT" 706 | fi 707 | 708 | cp -p entities/out/entities.json "$HTML_OUTPUT" 709 | cp -p "$HTML_TEMP/wattsi-output/xrefs.json" "$HTML_OUTPUT" 710 | 711 | clearDir "$HTML_TEMP" 712 | 713 | echo "User-agent: * 714 | Disallow: /commit-snapshots/ 715 | Disallow: /review-drafts/" > "$HTML_OUTPUT/robots.txt" 716 | cp -p "$HTML_SOURCE/404.html" "$HTML_OUTPUT" 717 | cp -p "$HTML_SOURCE/link-fixup.js" "$HTML_OUTPUT" 718 | cp -p "$HTML_SOURCE/html-dfn.js" "$HTML_OUTPUT" 719 | cp -p "$HTML_SOURCE/styles.css" "$HTML_OUTPUT" 720 | cp -pR "$HTML_SOURCE/fonts" "$HTML_OUTPUT" 721 | cp -pR "$HTML_SOURCE/images" "$HTML_OUTPUT" 722 | cp -pR "$HTML_SOURCE/demos" "$HTML_OUTPUT" 723 | else 724 | # Singlepage Review Draft 725 | local year_month 726 | year_month=$(basename "$source_location" .wattsi) 727 | 728 | local new_dir="$HTML_OUTPUT/review-drafts/$year_month" 729 | mkdir -p "$new_dir" 730 | mv "$HTML_TEMP/wattsi-output/index-review" "$new_dir/index.html" 731 | fi 732 | } 733 | 734 | # Checks if Wattsi is available and up to date 735 | # - Arguments: none 736 | # - Output: 737 | # - Sets $LOCAL_WATTSI to true or false 738 | # - Echoes a warning if Wattsi is out of date according to $WATTSI_LATEST 739 | function checkWattsi { 740 
| if hash wattsi 2>/dev/null; then 741 | if [[ "$(wattsi --version | cut -d' ' -f2)" -lt "$WATTSI_LATEST" ]]; then 742 | echo 743 | echo "Warning: Your wattsi version is out of date. You should to rebuild an" 744 | echo "up-to-date wattsi binary from the wattsi sources." 745 | echo 746 | fi 747 | LOCAL_WATTSI=true 748 | else 749 | LOCAL_WATTSI=false 750 | fi 751 | } 752 | 753 | # Runs Wattsi on the given file, either locally or using the web service 754 | # - Arguments: 755 | # - $1: the file to run Wattsi on 756 | # - $2: the directory for Wattsi to write output to 757 | # - (the syntax-highlighter server URL is read from $HIGHLIGHT_SERVER_URL, not passed as $3) 758 | # - Output: 759 | # - Sets $WATTSI_RESULT to the exit code 760 | # - $HTML_TEMP/wattsi-output directory will contain the output from Wattsi on success 761 | # - $HTML_TEMP/wattsi-output.txt will contain the output from Wattsi, on both success and failure 762 | function runWattsi { 763 | local source_file="$1" 764 | local output_dir="$2" 765 | 766 | clearDir "$output_dir" 767 | 768 | if [[ "$LOCAL_WATTSI" == "true" ]]; then 769 | local wattsi_args=() 770 | $QUIET && wattsi_args+=( --quiet ) 771 | $SINGLE_PAGE_ONLY && wattsi_args+=( --single-page-only ) 772 | wattsi_args+=( "$source_file" "$HTML_SHA" "$output_dir" "$build_type" "$HTML_CACHE/mdn-spec-links-html.json" ) 773 | if [[ "$DO_HIGHLIGHT" == "true" ]]; then 774 | wattsi_args+=( "$HIGHLIGHT_SERVER_URL" ) 775 | fi 776 | 777 | WATTSI_RESULT="0" 778 | wattsi "${wattsi_args[@]}" || WATTSI_RESULT=$? 779 | else 780 | $QUIET || echo 781 | $QUIET || echo "Local wattsi not present; trying the build server..."
782 | 783 | 784 | local query_params=() 785 | $QUIET && query_params+=( quiet ) 786 | $SINGLE_PAGE_ONLY && query_params+=( single-page-only ) 787 | 788 | local query_string 789 | query_string=$(joinBy "\&" "${query_params[@]-''}") 790 | local curl_url="https://build.whatwg.org/wattsi?${query_string}" 791 | 792 | local curl_args=( "$curl_url" \ 793 | --form "source=@$source_file" \ 794 | --form "sha=$HTML_SHA" \ 795 | --form "build=$build_type" \ 796 | --form "mdn=@$HTML_CACHE/mdn-spec-links-html.json" \ 797 | --dump-header "$HTML_TEMP/wattsi-headers.txt" \ 798 | --output "$HTML_TEMP/wattsi-output.zip" ) 799 | $QUIET && curl_args+=( --silent ) 800 | $VERBOSE && curl_args+=( --verbose ) 801 | curl "${curl_args[@]}" 802 | 803 | # read exit code from the Exit-Code header and assume failure if not found 804 | WATTSI_RESULT="1" 805 | local name value 806 | while IFS=":" read -r name value; do 807 | shopt -s nocasematch 808 | if [[ $name == "Exit-Code" ]]; then 809 | WATTSI_RESULT=$(echo "$value" | tr -d ' \r\n') 810 | break 811 | fi 812 | shopt -u nocasematch 813 | done < "$HTML_TEMP/wattsi-headers.txt" 814 | 815 | if [[ $WATTSI_RESULT != "0" ]]; then 816 | mv "$HTML_TEMP/wattsi-output.zip" "$HTML_TEMP/wattsi-output.txt" 817 | else 818 | local unzip_args=() 819 | # Note: Don't use the -v flag; it doesn't work in combination with -d 820 | if ! 
$VERBOSE; then 821 | unzip_args+=( -qq ) 822 | fi 823 | unzip_args+=( "$HTML_TEMP/wattsi-output.zip" -d "$output_dir" ) 824 | unzip "${unzip_args[@]}" 825 | mv "$output_dir/output.txt" "$HTML_TEMP/wattsi-output.txt" 826 | fi 827 | fi 828 | } 829 | 830 | # Starts the syntax-highlighting Python server, when appropriate 831 | # Arguments: none 832 | # Output: if the server is necessary, then 833 | # - A server will be running in the background, at $HIGHLIGHT_SERVER_URL 834 | # - $HIGHLIGHT_SERVER_PID will be set for later use by stopHighlightServer 835 | function startHighlightServer { 836 | if [[ "$LOCAL_WATTSI" == "true" && "$DO_HIGHLIGHT" == "true" ]]; then 837 | local highlight_server_args=() 838 | $QUIET && highlight_server_args+=( --quiet ) 839 | bs-highlighter-server ${highlight_server_args[@]+"${highlight_server_args[@]}"} & 840 | HIGHLIGHT_SERVER_PID=$! 841 | 842 | trap stopHighlightServer EXIT 843 | fi 844 | } 845 | 846 | # Stops the syntax-highlighting Python server 847 | # Arguments: none 848 | # Output: the server will be stopped, if it is running. Failures to stop will be suppressed. 849 | function stopHighlightServer { 850 | if [[ $HIGHLIGHT_SERVER_PID != "" ]]; then 851 | kill "$HIGHLIGHT_SERVER_PID" 2>/dev/null || true 852 | 853 | # This suppresses a 'Terminated: 15 "$DIR/highlighter/server.py"' message 854 | wait "$HIGHLIGHT_SERVER_PID" 2>/dev/null || true 855 | fi 856 | } 857 | 858 | # Ensures the given directory exists, but is empty 859 | # Arguments: 860 | # - $1: the directory to clear 861 | # Output: the directory will be empty (but guaranteed to exist) 862 | function clearDir { 863 | # We use this implementation strategy, instead of `rm -rf`ing the directory, because deleting the 864 | # directory itself can run into permissions issues, e.g. if the directory is open in another 865 | # program, or in the Docker case where we have permission to write to the directory but not delete 866 | # it. 
867 | mkdir -p "$1" 868 | find "$1" -mindepth 1 -delete 869 | } 870 | 871 | # Joins parameters $2 onward with the separator given in $1 872 | # Arguments: 873 | # - $1: the separator string 874 | # - $2...: the strings to join 875 | # Output: echoes the joined string 876 | function joinBy { 877 | local d=${1-} f=${2-} 878 | if shift 2; then 879 | printf %s "$f" "${@/#/$d}" 880 | fi 881 | } 882 | 883 | main "$@" 884 | -------------------------------------------------------------------------------- /ci-build/Dockerfile: -------------------------------------------------------------------------------- 1 | # This Dockerfile is just used to run on Travis CI in an environment that can easily and repeatedly 2 | # install our build dependencies. 3 | FROM rust:1.73-slim as builder 4 | WORKDIR /whatwg/html-build 5 | COPY Cargo.lock Cargo.toml ./ 6 | COPY src ./src/ 7 | RUN cargo install --path . 8 | 9 | FROM debian:stable 10 | 11 | RUN apt-get update && \ 12 | apt-get install --yes --no-install-recommends \ 13 | ca-certificates curl rsync git \ 14 | default-jre \ 15 | python3 python3-pip pipx \ 16 | libbrotli1 libexpat1 libfontconfig1 libfreetype6 libpng16-16 \ 17 | fonts-dejavu fonts-droid-fallback fonts-liberation fonts-symbola fonts-unfonts-core 18 | 19 | # Dependency lines above are: 20 | # - General 21 | # - validator 22 | # - Highlighter 23 | # - Prince 24 | # - fonts, for when Prince renders to PDF 25 | 26 | COPY --from=builder /usr/local/cargo/bin/html-build /bin/html-build 27 | 28 | COPY --from=ghcr.io/whatwg/wattsi:latest /whatwg/wattsi/bin/wattsi /bin/wattsi 29 | 30 | ENV PIPX_HOME /opt/pipx 31 | ENV PIPX_BIN_DIR /usr/bin 32 | RUN pipx install bs-highlighter 33 | 34 | # The DockerHub container for the validator only contains the server version, so we get the .jar 35 | # from GitHub: 36 | ADD https://github.com/validator/validator/releases/download/latest/vnu.jar /whatwg/ 37 | 38 | # Trying to copy Prince from its DockerHub container like the others does not work; 
it has too many 39 | # shared library dependencies. Probably this is a job for Docker Compose... we should learn how that 40 | # works one day. 41 | # Prince also hasn't been updated for Debian 12 and is no longer installable from its deb file. 42 | ADD https://www.princexml.com/download/prince-15.1-linux-generic-x86_64.tar.gz /whatwg/prince.tar.gz 43 | RUN cd /whatwg && \ 44 | tar xvzf prince.tar.gz && \ 45 | ( cd prince-* && echo /usr | ./install.sh ) && \ 46 | echo '@font-face { font-family: serif; src: local("Symbola") }' >> /usr/lib/prince/style/fonts.css && \ 47 | rm -rf prince* && \ 48 | prince --version 49 | 50 | ADD . /whatwg/html-build 51 | 52 | ENTRYPOINT ["bash", "/whatwg/html-build/ci-build/inside-container.sh"] 53 | -------------------------------------------------------------------------------- /ci-build/README.md: -------------------------------------------------------------------------------- 1 | # HTML Standard CI Build 2 | 3 | This directory contains the infrastructure for building and running a Docker container, [whatwg/html-build](https://hub.docker.com/r/whatwg/html-build), which performs a "full" build of the HTML Standard, producing artifacts ready for deployment. 4 | 5 | The relevant entrypoints are: 6 | 7 | - `docker-build.sh` will build the Docker container 8 | - `docker-run.sh $INPUT $OUTPUT` will run the Docker container to do such a full build. 
9 | - `$INPUT` should contain a checkout of the [whatwg/html](https://github.com/whatwg/html) repository 10 | - `$OUTPUT` should be an empty directory 11 | -------------------------------------------------------------------------------- /ci-build/docker-build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | set -o nounset 4 | set -o pipefail 5 | shopt -s extglob 6 | 7 | TMP_DIR=$(mktemp -d) 8 | 9 | function main { 10 | local here 11 | here=$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd ) 12 | 13 | # We want the image to contain: 14 | # * All of the important stuff from the top-level (html-build) directory 15 | # * But, the Dockerfile from this (ci-build) directory 16 | # And in particular it should *not* contain the top-level Dockerfile, dotfiles, .git/, and 17 | # any html/ and output/ directories that might be hanging around from local testing. 18 | cp "$here/Dockerfile" "$TMP_DIR" 19 | cd "$here/.." 20 | cp -r !(.*|html|output|Dockerfile) "$TMP_DIR" 21 | cd "$TMP_DIR" 22 | trap cleanTemp EXIT 23 | 24 | local ghcr_repo="ghcr.io/whatwg/html-build" 25 | 26 | # Build the Docker image, using GHCR as a cache. (This will be fast if nothing has changed 27 | # in html-build or its dependencies). 28 | docker pull ghcr.io/whatwg/wattsi 29 | docker pull "$ghcr_repo" || true 30 | docker build --cache-from "$ghcr_repo" --tag "$ghcr_repo" . 
31 | } 32 | 33 | function cleanTemp { 34 | rm -rf "$TMP_DIR" 35 | } 36 | 37 | main "$@" 38 | -------------------------------------------------------------------------------- /ci-build/docker-run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | set -o nounset 4 | set -o pipefail 5 | shopt -s extglob 6 | 7 | HTML_SOURCE=$(realpath "$1") 8 | HTML_OUTPUT=$(realpath "$2") 9 | 10 | docker run --rm --mount "type=bind,source=$HTML_SOURCE,destination=/whatwg/html,readonly=1" \ 11 | --env "HTML_SOURCE=/whatwg/html" \ 12 | --mount "type=bind,source=$HTML_OUTPUT,destination=/whatwg/output" \ 13 | --env "HTML_OUTPUT=/whatwg/output" \ 14 | ghcr.io/whatwg/html-build 15 | -------------------------------------------------------------------------------- /ci-build/inside-container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | set -o nounset 4 | set -o pipefail 5 | cd "$(dirname "$0")/../.." 6 | 7 | PDF_SERVE_PORT=8080 8 | 9 | SKIP_BUILD_UPDATE_CHECK=true ./html-build/build.sh 10 | 11 | echo "" 12 | echo "Running conformance checker..." 13 | # the -Xmx1g argument sets the size of the Java heap space to 1 gigabyte 14 | java -Xmx1g -jar ./vnu.jar --skip-non-html "$HTML_OUTPUT" 15 | echo "" 16 | 17 | # The build output contains some relative links, which will end up pointing to 18 | # "https://0.0.0.0:$PDF_SERVE_PORT/" in the built PDF. That's undesirable; see 19 | # https://github.com/whatwg/html/issues/9097. Our hack is to replace such 20 | # relative links like so. Note: we can't just insert a <base> or use Prince's 21 | # --baseurl option, because that would cause Prince to crawl the actual live 22 | # files for subresources, missing any updates to them we made as part of this 23 | # change. 
24 | sed 's| href=/| href=https://html.spec.whatwg.org/|g' "$HTML_OUTPUT/index.html" > "$HTML_OUTPUT/print.html" 25 | 26 | # Serve the built output so that Prince can snapshot it 27 | # The nohup/sleep incantations are necessary because normal & does not work inside Docker: 28 | # https://stackoverflow.com/q/50211207/3191 29 | ( 30 | cd "$HTML_OUTPUT" 31 | nohup bash -c "python3 -m http.server $PDF_SERVE_PORT &" && sleep 4 32 | ) 33 | 34 | echo "" 35 | echo "Building PDF..." 36 | PATH=/whatwg/prince/bin:$PATH prince --verbose --output "$HTML_OUTPUT/print.pdf" "http://0.0.0.0:$PDF_SERVE_PORT/print.html" 37 | 38 | rm "$HTML_OUTPUT/print.html" 39 | -------------------------------------------------------------------------------- /entities/README.md: -------------------------------------------------------------------------------- 1 | # HTML Entities Generator 2 | 3 | This directory contains the tools for generating HTML's [named character references](https://html.spec.whatwg.org/#named-character-references). 4 | 5 | ## Prerequisites 6 | 7 | Before building, make sure you have the following commands installed on your system. 8 | 9 | - `curl`, `perl`, `python` 10 | 11 | ## Build 12 | 13 | Run the `build.sh` script, like this: 14 | ``` 15 | ./build.sh 16 | ``` 17 | 18 | ## Input 19 | 20 | - `entities-legacy.inc` 21 | - `json-entities-legacy.inc` 22 | - [`unicode.xml`](https://github.com/w3c/xml-entities/blob/gh-pages/unicode.xml) (downloaded by `build.sh`) 23 | 24 | ## Output 25 | 26 | - `entities-dtd.url` 27 | - `entities.inc` 28 | - `entities.json` 29 | 30 | Because the output is expected to change very rarely, if ever, it is checked in. The top-level `build.sh` script uses these files directly. 
31 | -------------------------------------------------------------------------------- /entities/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # cd to the directory containing this script 5 | cd "$( dirname "${BASH_SOURCE[0]}" )" 6 | 7 | VERBOSE=false 8 | QUIET=false 9 | ENTITIES_TEMP=${ENTITIES_TEMP:-.temp} 10 | ENTITIES_OUTPUT=${ENTITIES_OUTPUT:-out} 11 | 12 | for arg in "$@" 13 | do 14 | case $arg in 15 | -h|--help) 16 | echo "Usage: $0 [-h|--help]" 17 | echo "Usage: $0 [-q|--quiet] [-v|--verbose]" 18 | echo 19 | echo " -h|--help Show this usage statement." 20 | echo " -q|--quiet Don't emit any messages except errors/warnings." 21 | echo " -v|--verbose Show verbose output from every build step." 22 | exit 0 23 | ;; 24 | -q|--quiet) 25 | QUIET=true 26 | VERBOSE=false 27 | ;; 28 | -v|--verbose) 29 | VERBOSE=true 30 | QUIET=false 31 | set -vx 32 | ;; 33 | *) 34 | ;; 35 | esac 36 | done 37 | 38 | rm -rf $ENTITIES_TEMP && mkdir -p $ENTITIES_TEMP 39 | rm -rf $ENTITIES_OUTPUT && mkdir -p $ENTITIES_OUTPUT 40 | 41 | # Fetch unicode.xml 42 | $QUIET || echo "Downloading unicode.xml (can take a short time, depending on your bandwidth)..."; 43 | curl $($VERBOSE && echo "-v") $($QUIET && echo "-s") \ 44 | https://raw.githubusercontent.com/w3c/xml-entities/gh-pages/unicode.xml \ 45 | --output $ENTITIES_TEMP/unicode.xml 46 | 47 | # Generate entity files 48 | $QUIET || echo; 49 | $QUIET || echo "Generating entities (this always takes a while)..."; 50 | python entity-processor.py < $ENTITIES_TEMP/unicode.xml > $ENTITIES_TEMP/new-entities-unicode.inc; 51 | [ -s $ENTITIES_TEMP/new-entities-unicode.inc ] && mv -f $ENTITIES_TEMP/new-entities-unicode.inc $ENTITIES_TEMP/entities-unicode.inc; # otherwise, probably http error, just do it again next time 52 | python entity-processor-json.py < $ENTITIES_TEMP/unicode.xml > $ENTITIES_TEMP/new-entities-unicode-json.inc; 53 | [ -s 
$ENTITIES_TEMP/new-entities-unicode-json.inc ] && mv -f $ENTITIES_TEMP/new-entities-unicode-json.inc $ENTITIES_TEMP/json-entities-unicode.inc; # otherwise, probably http error, just do it again next time 54 | echo '' > $ENTITIES_OUTPUT/entities.inc 55 | cat entities-*.inc $ENTITIES_TEMP/entities-*.inc | perl -e 'my @lines = <>; print sort { $a =~ m/id="([^"]+?)(-legacy)?"/; $a1 = $1; $a2 = $2; $b =~ m/id="([^"]+?)(-legacy)?"/; $b1 = $1; $b2 = $2; return (lc($a1) cmp lc($b1)) || ($a1 cmp $b1) || ($a2 cmp $b2); } @lines' >> $ENTITIES_OUTPUT/entities.inc 56 | echo '{' > $ENTITIES_OUTPUT/entities.json 57 | cat json-entities-* $ENTITIES_TEMP/json-entities-* | sort | perl -e '$/ = undef; $_ = <>; chop, chop, print' >> $ENTITIES_OUTPUT/entities.json 58 | echo '' >> $ENTITIES_OUTPUT/entities.json 59 | echo '}' >> $ENTITIES_OUTPUT/entities.json 60 | perl -Tw entity-to-dtd.pl < $ENTITIES_TEMP/entities-unicode.inc > $ENTITIES_OUTPUT/entities-dtd.url 61 | 62 | rm -rf $ENTITIES_TEMP 63 | -------------------------------------------------------------------------------- /entities/entities-legacy.inc: -------------------------------------------------------------------------------- 1 | AElig U+000C6 Æ 2 | AMP U+00026 & 3 | Aacute U+000C1 Á 4 | Acirc U+000C2 Â 5 | Agrave U+000C0 À 6 | Aring U+000C5 Å 7 | Atilde U+000C3 Ã 8 | Auml U+000C4 Ä 9 | COPY U+000A9 © 10 | Ccedil U+000C7 Ç 11 | ETH U+000D0 Ð 12 | Eacute U+000C9 É 13 | Ecirc U+000CA Ê 14 | Egrave U+000C8 È 15 | Euml U+000CB Ë 16 | GT U+0003E > 17 | Iacute U+000CD Í 18 | Icirc U+000CE Î 19 | Igrave U+000CC Ì 20 | Iuml U+000CF Ï 21 | LT U+0003C < 22 | Ntilde U+000D1 Ñ 23 | Oacute U+000D3 Ó 24 | Ocirc U+000D4 Ô 25 | Ograve U+000D2 Ò 26 | Oslash U+000D8 Ø 27 | Otilde U+000D5 Õ 28 | Ouml U+000D6 Ö 29 | QUOT U+00022 " 30 | REG U+000AE ® 31 | THORN U+000DE Þ 32 | Uacute U+000DA Ú 33 | Ucirc U+000DB Û 34 | Ugrave U+000D9 Ù 35 | Uuml U+000DC Ü 36 | Yacute U+000DD Ý 37 | aacute U+000E1 á 38 | acirc U+000E2 â 39 | acute U+000B4 ´ 40 | 
aelig U+000E6 æ 41 | agrave U+000E0 à 42 | amp U+00026 & 43 | aring U+000E5 å 44 | atilde U+000E3 ã 45 | auml U+000E4 ä 46 | brvbar U+000A6 ¦ 47 | ccedil U+000E7 ç 48 | cedil U+000B8 ¸ 49 | cent U+000A2 ¢ 50 | copy U+000A9 © 51 | curren U+000A4 ¤ 52 | deg U+000B0 ° 53 | divide U+000F7 ÷ 54 | eacute U+000E9 é 55 | ecirc U+000EA ê 56 | egrave U+000E8 è 57 | eth U+000F0 ð 58 | euml U+000EB ë 59 | frac12 U+000BD ½ 60 | frac14 U+000BC ¼ 61 | frac34 U+000BE ¾ 62 | gt U+0003E > 63 | iacute U+000ED í 64 | icirc U+000EE î 65 | iexcl U+000A1 ¡ 66 | igrave U+000EC ì 67 | iquest U+000BF ¿ 68 | iuml U+000EF ï 69 | laquo U+000AB « 70 | lt U+0003C < 71 | macr U+000AF ¯ 72 | micro U+000B5 µ 73 | middot U+000B7 · 74 | nbsp U+000A0   75 | not U+000AC ¬ 76 | ntilde U+000F1 ñ 77 | oacute U+000F3 ó 78 | ocirc U+000F4 ô 79 | ograve U+000F2 ò 80 | ordf U+000AA ª 81 | ordm U+000BA º 82 | oslash U+000F8 ø 83 | otilde U+000F5 õ 84 | ouml U+000F6 ö 85 | para U+000B6 86 | plusmn U+000B1 ± 87 | pound U+000A3 £ 88 | quot U+00022 " 89 | raquo U+000BB » 90 | reg U+000AE ® 91 | sect U+000A7 § 92 | shy U+000AD ­ 93 | sup1 U+000B9 ¹ 94 | sup2 U+000B2 ² 95 | sup3 U+000B3 ³ 96 | szlig U+000DF ß 97 | thorn U+000FE þ 98 | times U+000D7 × 99 | uacute U+000FA ú 100 | ucirc U+000FB û 101 | ugrave U+000F9 ù 102 | uml U+000A8 ¨ 103 | uuml U+000FC ü 104 | yacute U+000FD ý 105 | yen U+000A5 ¥ 106 | yuml U+000FF ÿ 107 | -------------------------------------------------------------------------------- /entities/entity-processor-json.py: -------------------------------------------------------------------------------- 1 | import xml.dom.minidom 2 | import math 3 | import sys 4 | 5 | def highSurrogate(codePoint): 6 | return int(math.floor((codePoint - 0x10000) / 0x400) + 0xD800) 7 | 8 | def lowSurrogate(codePoint): 9 | return int((codePoint - 0x10000) % 0x400 + 0xDC00) 10 | 11 | def codePointToString(codePoint): 12 | if codePoint <= 0xFFFF: 13 | string = '\\u' + '%04X' % codePoint 14 | else: 15 | string = '\\u' + 
'%04X' % highSurrogate(codePoint) + '\\u' + '%04X' % lowSurrogate(codePoint) 16 | return string 17 | 18 | # this uses 658 MB 19 | document = xml.dom.minidom.parse(sys.stdin) 20 | 21 | sets = [] 22 | entities = {} 23 | 24 | for group in document.getElementsByTagName('group'): 25 | if (group.getAttribute('name') == 'html5' or group.getAttribute('name') == 'mathml'): 26 | for set in group.getElementsByTagName('set'): 27 | sets.append(set.getAttribute('name')) 28 | 29 | for entity in document.getElementsByTagName('entity'): 30 | assert entity.parentNode.tagName == 'character' 31 | assert entity.hasAttribute('set') 32 | set = entity.getAttribute('set') 33 | if (set in sets): 34 | assert entity.hasAttribute('id') 35 | name = entity.getAttribute('id') 36 | assert len(name) > 0 37 | assert entity.parentNode.hasAttribute('id') 38 | value = entity.parentNode.getAttribute('id') 39 | assert name not in entities or entities[name] == value, '(name: ' + name + ' old value: ' + entities[name] + ' new value: ' + value + ')' 40 | if (name not in entities): 41 | entities[name] = value 42 | if ('-' in value): 43 | codes = str(int(value[1:6], 16)) + ', ' + str(int(value[7:], 16)) 44 | glyphs = codePointToString(int(value[1:6], 16)) + codePointToString(int(value[7:], 16)) 45 | else: 46 | codes = str(int(value[1:], 16)) 47 | glyphs = codePointToString(int(value[1:], 16)) 48 | print(' "&' + name + ';": { "codepoints": [' + codes + '], "characters": "' + glyphs + '" },') 49 | -------------------------------------------------------------------------------- /entities/entity-processor.py: -------------------------------------------------------------------------------- 1 | import xml.dom.minidom 2 | import sys 3 | 4 | # this uses 658 MB 5 | document = xml.dom.minidom.parse(sys.stdin) 6 | 7 | sets = [] 8 | entities = {} 9 | 10 | for group in document.getElementsByTagName('group'): 11 | if (group.getAttribute('name') == 'html5' or group.getAttribute('name') == 'mathml'): 12 | for set in 
group.getElementsByTagName('set'): 13 | sets.append(set.getAttribute('name')) 14 | 15 | for entity in document.getElementsByTagName('entity'): 16 | assert entity.parentNode.tagName == 'character' 17 | assert entity.hasAttribute('set') 18 | set = entity.getAttribute('set') 19 | if (set in sets): 20 | assert entity.hasAttribute('id') 21 | name = entity.getAttribute('id') 22 | assert len(name) > 0 23 | assert entity.parentNode.hasAttribute('id') 24 | value = entity.parentNode.getAttribute('id') 25 | assert name not in entities or entities[name] == value, '(name: ' + name + ' old value: ' + entities[name] + ' new value: ' + value + ')' 26 | if (name not in entities): 27 | entities[name] = value 28 | if ('-' in value): 29 | value1 = value[1:6]; 30 | value2 = value[7:]; 31 | glyph = '&#x' + value1 + ';&#x' + value2 + ';' 32 | print(' ' + name + '; U+' + value1 + ' U+' + value2 + ' ' + glyph + ' '); 33 | else: 34 | if (value[1:] in ['020DC', '00311', '020DB', '020DB']): 35 | glyph = '◌' + '&#x' + value[1:] + ';' 36 | elif ('00000' < value[1:] < '00020'): 37 | glyph = '$' + value[4:] + ';' 38 | else: 39 | glyph = '&#x' + value[1:] + ';' 40 | print(' ' + name + '; U+' + value[1:] + ' ' + glyph + ' '); 41 | -------------------------------------------------------------------------------- /entities/entity-to-dtd.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -wT 2 | use strict; 3 | 4 | sub escape($) { ($_[0] eq "26" || $_[0] eq "3C" ? "&" : "&") . 
"#x$_[0];" } 5 | 6 | # read dtd 7 | my $dtd = ''; 8 | while (<>) { 9 | if (m( (.+?); U\+0*([0-9A-F]+) )os) { 10 | $dtd .= ""; 11 | } elsif (m( (.+?); U\+0*([0-9A-F]+) U\+0*([0-9A-F]+) )os) { 12 | $dtd .= ""; 13 | } else { 14 | die "$0: line doesn't match pattern:\n$_\n"; 15 | } 16 | } 17 | 18 | #warn "$dtd\n"; 19 | #exit; 20 | 21 | # output data: URL 22 | use HTML::Entities; 23 | use MIME::Base64; 24 | use URI::Escape; 25 | 26 | my $data = uri_escape(encode_base64($dtd, '')); 27 | 28 | print "data:application/xml-dtd;base64,$data"; 29 | -------------------------------------------------------------------------------- /entities/json-entities-legacy.inc: -------------------------------------------------------------------------------- 1 | "Æ": { "codepoints": [198], "characters": "\u00C6" }, 2 | "&": { "codepoints": [38], "characters": "\u0026" }, 3 | "Á": { "codepoints": [193], "characters": "\u00C1" }, 4 | "Â": { "codepoints": [194], "characters": "\u00C2" }, 5 | "À": { "codepoints": [192], "characters": "\u00C0" }, 6 | "Å": { "codepoints": [197], "characters": "\u00C5" }, 7 | "Ã": { "codepoints": [195], "characters": "\u00C3" }, 8 | "Ä": { "codepoints": [196], "characters": "\u00C4" }, 9 | "©": { "codepoints": [169], "characters": "\u00A9" }, 10 | "Ç": { "codepoints": [199], "characters": "\u00C7" }, 11 | "Ð": { "codepoints": [208], "characters": "\u00D0" }, 12 | "É": { "codepoints": [201], "characters": "\u00C9" }, 13 | "Ê": { "codepoints": [202], "characters": "\u00CA" }, 14 | "È": { "codepoints": [200], "characters": "\u00C8" }, 15 | "Ë": { "codepoints": [203], "characters": "\u00CB" }, 16 | ">": { "codepoints": [62], "characters": "\u003E" }, 17 | "Í": { "codepoints": [205], "characters": "\u00CD" }, 18 | "Î": { "codepoints": [206], "characters": "\u00CE" }, 19 | "Ì": { "codepoints": [204], "characters": "\u00CC" }, 20 | "Ï": { "codepoints": [207], "characters": "\u00CF" }, 21 | "<": { "codepoints": [60], "characters": "\u003C" }, 22 | "Ñ": { "codepoints": 
[209], "characters": "\u00D1" }, 23 | "Ó": { "codepoints": [211], "characters": "\u00D3" }, 24 | "Ô": { "codepoints": [212], "characters": "\u00D4" }, 25 | "Ò": { "codepoints": [210], "characters": "\u00D2" }, 26 | "Ø": { "codepoints": [216], "characters": "\u00D8" }, 27 | "Õ": { "codepoints": [213], "characters": "\u00D5" }, 28 | "Ö": { "codepoints": [214], "characters": "\u00D6" }, 29 | """: { "codepoints": [34], "characters": "\u0022" }, 30 | "®": { "codepoints": [174], "characters": "\u00AE" }, 31 | "Þ": { "codepoints": [222], "characters": "\u00DE" }, 32 | "Ú": { "codepoints": [218], "characters": "\u00DA" }, 33 | "Û": { "codepoints": [219], "characters": "\u00DB" }, 34 | "Ù": { "codepoints": [217], "characters": "\u00D9" }, 35 | "Ü": { "codepoints": [220], "characters": "\u00DC" }, 36 | "Ý": { "codepoints": [221], "characters": "\u00DD" }, 37 | "á": { "codepoints": [225], "characters": "\u00E1" }, 38 | "â": { "codepoints": [226], "characters": "\u00E2" }, 39 | "´": { "codepoints": [180], "characters": "\u00B4" }, 40 | "æ": { "codepoints": [230], "characters": "\u00E6" }, 41 | "à": { "codepoints": [224], "characters": "\u00E0" }, 42 | "&": { "codepoints": [38], "characters": "\u0026" }, 43 | "å": { "codepoints": [229], "characters": "\u00E5" }, 44 | "ã": { "codepoints": [227], "characters": "\u00E3" }, 45 | "ä": { "codepoints": [228], "characters": "\u00E4" }, 46 | "¦": { "codepoints": [166], "characters": "\u00A6" }, 47 | "ç": { "codepoints": [231], "characters": "\u00E7" }, 48 | "¸": { "codepoints": [184], "characters": "\u00B8" }, 49 | "¢": { "codepoints": [162], "characters": "\u00A2" }, 50 | "©": { "codepoints": [169], "characters": "\u00A9" }, 51 | "¤": { "codepoints": [164], "characters": "\u00A4" }, 52 | "°": { "codepoints": [176], "characters": "\u00B0" }, 53 | "÷": { "codepoints": [247], "characters": "\u00F7" }, 54 | "é": { "codepoints": [233], "characters": "\u00E9" }, 55 | "ê": { "codepoints": [234], "characters": "\u00EA" }, 56 | "è": { 
"codepoints": [232], "characters": "\u00E8" }, 57 | "ð": { "codepoints": [240], "characters": "\u00F0" }, 58 | "ë": { "codepoints": [235], "characters": "\u00EB" }, 59 | "½": { "codepoints": [189], "characters": "\u00BD" }, 60 | "¼": { "codepoints": [188], "characters": "\u00BC" }, 61 | "¾": { "codepoints": [190], "characters": "\u00BE" }, 62 | ">": { "codepoints": [62], "characters": "\u003E" }, 63 | "í": { "codepoints": [237], "characters": "\u00ED" }, 64 | "î": { "codepoints": [238], "characters": "\u00EE" }, 65 | "¡": { "codepoints": [161], "characters": "\u00A1" }, 66 | "ì": { "codepoints": [236], "characters": "\u00EC" }, 67 | "¿": { "codepoints": [191], "characters": "\u00BF" }, 68 | "ï": { "codepoints": [239], "characters": "\u00EF" }, 69 | "«": { "codepoints": [171], "characters": "\u00AB" }, 70 | "<": { "codepoints": [60], "characters": "\u003C" }, 71 | "¯": { "codepoints": [175], "characters": "\u00AF" }, 72 | "µ": { "codepoints": [181], "characters": "\u00B5" }, 73 | "·": { "codepoints": [183], "characters": "\u00B7" }, 74 | " ": { "codepoints": [160], "characters": "\u00A0" }, 75 | "¬": { "codepoints": [172], "characters": "\u00AC" }, 76 | "ñ": { "codepoints": [241], "characters": "\u00F1" }, 77 | "ó": { "codepoints": [243], "characters": "\u00F3" }, 78 | "ô": { "codepoints": [244], "characters": "\u00F4" }, 79 | "ò": { "codepoints": [242], "characters": "\u00F2" }, 80 | "ª": { "codepoints": [170], "characters": "\u00AA" }, 81 | "º": { "codepoints": [186], "characters": "\u00BA" }, 82 | "ø": { "codepoints": [248], "characters": "\u00F8" }, 83 | "õ": { "codepoints": [245], "characters": "\u00F5" }, 84 | "ö": { "codepoints": [246], "characters": "\u00F6" }, 85 | "¶": { "codepoints": [182], "characters": "\u00B6" }, 86 | "±": { "codepoints": [177], "characters": "\u00B1" }, 87 | "£": { "codepoints": [163], "characters": "\u00A3" }, 88 | """: { "codepoints": [34], "characters": "\u0022" }, 89 | "»": { "codepoints": [187], "characters": "\u00BB" }, 90 | 
"®": { "codepoints": [174], "characters": "\u00AE" }, 91 | "§": { "codepoints": [167], "characters": "\u00A7" }, 92 | "­": { "codepoints": [173], "characters": "\u00AD" }, 93 | "¹": { "codepoints": [185], "characters": "\u00B9" }, 94 | "²": { "codepoints": [178], "characters": "\u00B2" }, 95 | "³": { "codepoints": [179], "characters": "\u00B3" }, 96 | "ß": { "codepoints": [223], "characters": "\u00DF" }, 97 | "þ": { "codepoints": [254], "characters": "\u00FE" }, 98 | "×": { "codepoints": [215], "characters": "\u00D7" }, 99 | "ú": { "codepoints": [250], "characters": "\u00FA" }, 100 | "û": { "codepoints": [251], "characters": "\u00FB" }, 101 | "ù": { "codepoints": [249], "characters": "\u00F9" }, 102 | "¨": { "codepoints": [168], "characters": "\u00A8" }, 103 | "ü": { "codepoints": [252], "characters": "\u00FC" }, 104 | "ý": { "codepoints": [253], "characters": "\u00FD" }, 105 | "¥": { "codepoints": [165], "characters": "\u00A5" }, 106 | "ÿ": { "codepoints": [255], "characters": "\u00FF" }, 107 | -------------------------------------------------------------------------------- /lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | if [ "$#" -ne 1 ]; then 5 | echo "Usage: $0 path/to/html/source" 6 | exit 1 7 | fi 8 | 9 | # show potential problems 10 | MATCHES=$(grep -niE '( (code|span|var)(>| data-x=)|[^<;]/(code|span|var)>)' "$1" | perl -lpe 'print "\nPossible copypasta:" if $. == 1' 11 | perl -ne '$/ = "\n\n"; print "$_" if (/chosing|approprate|occured|elemenst|\bteh\b|\blabelled\b|\blabelling\b|\bhte\b|taht|linx\b|speciication|attribue|kestern|horiontal|\battribute\s+attribute\b|\bthe\s+the\b|\bthe\s+there\b|\bfor\s+for\b|\bor\s+or\b|\bany\s+any\b|\bbe\s+be\b|\bwith\s+with\b|\bis\s+is\b/si)' "$1" | perl -lpe 'print "\nPossible typos:" if $. 
== 1' 12 | grep -niE '((anonym|author|categor|custom|emphas|initial|local|minim|neutral|normal|optim|raster|real|recogn|roman|serial|standard|summar|synchron|synthes|token|optim)is(e|ing|ation|ability)|(col|behavi|hono|fav)our)' "$1" | grep -vE "\ben-GB\b" | perl -lpe 'print "\nen-GB spelling (use lang=\"en-GB\", or , on the same line to override):" if $. == 1' 13 | perl -ne '$/ = "\n\n"; print "$_" if (/\ban\s+(<[^>]*>)*(?!(L\b|http|https|href|hgroup|rb|rp|rt|rtc|li|xml|svg|svgmatrix|hour|hr|xhtml|xslt|xbl|nntp|mpeg|m[ions]|mtext|merror|h[1-6]|xmlns|xpath|s|x|sgml|huang|srgb|rsa|only|option|optgroup)\b|html)[b-df-hj-np-tv-z]/si or /\b(?)(<[^>]*>)*(?!>|one)(?:(L\b|http|https|href|hgroup|rt|rp|li|xml|svg|svgmatrix|hour|hr|xhtml|xslt|xbl|nntp|mpeg|m[ions]|mtext|merror|h[1-6]|xmlns|xpath|s|x|sgml|huang|srgb|rsa|only|option|optgroup)\b|html|[aeio])/si)' "$1" | perl -lpe 'print "\nPossible grammar problem: \"a\" instead of \"an\" or vice versa (to override, use e.g. \"a apple\"):" if $. == 1' 14 | grep -ni 'and/or' "$1" | perl -lpe 'print "\nOccurrences of making Ms2ger unhappy and/or annoyed:" if $. == 1' 15 | grep -niE '\s+$' "$1" | perl -lpe 'print "\nTrailing whitespace:" if $. == 1' 16 | grep $'\t' "$1" | perl -lpe 'print "\nTab:" if $. == 1' 17 | grep $'\xc2\xa0' "$1" | perl -lpe 'print "\nUnescaped nonbreaking space:" if $. == 1' 18 | perl -ne '$/ = "\n\n"; print "$_" if (/class="?(note|example).+(\n.+)*\s+(should|must|may|optional|recommended)(\s|$)/mi)' "$1" | perl -lpe 'print "\nRFC2119 keyword in example or note (use: might, can, has to, or override with must):" if $. == 1' 19 | perl -ne '$line++; $in_domintro = 1 if (/^
$/); print "$line: $_" if ($in_domintro && /\s+(should|must|may|optional|recommended)(\s|$)/i); $in_domintro = 0 if (/^ <\/dl>$/)' "$1" | perl -lpe 'print "\nRFC2119 keyword in domintro (use: might, can, has to, or override with must):" if $. == 1' 20 | ) 21 | 22 | if [ -n "$MATCHES" ]; then 23 | echo "$MATCHES" 24 | exit 1 25 | fi 26 | -------------------------------------------------------------------------------- /src/annotate_attributes.rs: -------------------------------------------------------------------------------- 1 | //! Augments the content attribute list for each element with a description found in the Attributes table. 2 | 3 | use std::collections::{HashMap, HashSet}; 4 | use std::io; 5 | use std::rc::Rc; 6 | 7 | use html5ever::tendril::StrTendril; 8 | use html5ever::{local_name, namespace_url, ns, LocalName, QualName}; 9 | use markup5ever_rcdom::{Handle, NodeData}; 10 | 11 | use crate::dom_utils::{self, NodeHandleExt}; 12 | use crate::parser; 13 | 14 | #[derive(Debug, Default)] 15 | struct Descriptions { 16 | /// The default description, as a list of nodes. 17 | default: Vec, 18 | 19 | /// The variant description, if any, as an unparsed string. 20 | variant: Option, 21 | } 22 | 23 | #[derive(Debug)] 24 | struct Edit { 25 | /// Handle on the
element which is to be filled in. 26 | dd: Handle, 27 | 28 | /// The data-x attribute which must be described. 29 | key: StrTendril, 30 | 31 | /// Whether this location has requested the variant/alternate description. 32 | wants_variant_description: bool, 33 | 34 | /// Whether this is described as having "special semantics" and so must be 35 | /// formatted differently. 36 | has_special_semantics: bool, 37 | } 38 | 39 | pub struct Processor { 40 | /// Map from attribute key (e.g., attr-elem-someattribute) to the 41 | /// descriptions found in the Attributes table. 42 | attributes: HashMap, 43 | 44 | /// List of
nodes in Content attributes sections that need to be filled in. 45 | edits: Vec, 46 | } 47 | 48 | impl Processor { 49 | pub fn new() -> Self { 50 | Processor { 51 | attributes: HashMap::new(), 52 | edits: Vec::new(), 53 | } 54 | } 55 | 56 | pub fn visit(&mut self, node: &Handle) { 57 | // We're looking for a (which is under the Attributes heading). 58 | if node.is_html_element(&local_name!("table")) && node.has_id("attributes-1") { 59 | self.index_attribute_table(node); 60 | } 61 | 62 | // We're looking for the following: 63 | //
64 | // ... 65 | //
Content attributes:
66 | //
Global attributes
67 | //
href
68 | //
someattribute
69 | // ... 70 | fn is_content_attribute_dt(dt: &Handle) -> bool { 71 | if !dt.is_html_element(&local_name!("dt")) { 72 | return false; 73 | } 74 | match dt.parent_node() { 75 | Some(p) if p.is_html_element(&local_name!("dl")) && p.has_class("element") => (), 76 | _ => return false, 77 | } 78 | let data_x = QualName::new(None, ns!(), LocalName::from("data-x")); 79 | dt.any_child(|c| c.attribute_is(&data_x, "concept-element-attributes")) 80 | } 81 | if is_content_attribute_dt(node) { 82 | self.index_attribute_list(node); 83 | } 84 | } 85 | 86 | fn index_attribute_table(&mut self, table: &Handle) { 87 | let tbody = match table 88 | .children 89 | .borrow() 90 | .iter() 91 | .find(|n| n.is_html_element(&local_name!("tbody"))) 92 | { 93 | Some(tbody) => tbody.clone(), 94 | None => return, 95 | }; 96 | for row in tbody 97 | .children 98 | .borrow() 99 | .iter() 100 | .filter(|c| c.is_html_element(&local_name!("tr"))) 101 | { 102 | // Each row is expected to have this structure: 103 | //
104 | // ".as_bytes(), &table).await?; 128 | assert_eq!(serialize_for_test(&children), ""); 129 | Ok(()) 130 | } 131 | 132 | #[tokio::test] 133 | async fn test_document_error_line_number() -> io::Result<()> { 134 | let result = 135 | parse_document_async("Hello\nworld".as_bytes()) 136 | .await; 137 | 138 | let error = result.unwrap_err(); 139 | assert_eq!(error.kind(), io::ErrorKind::InvalidData); 140 | assert!(error.to_string().contains("Line 2: ")); 141 | 142 | Ok(()) 143 | } 144 | 145 | #[tokio::test] 146 | async fn test_document_error_exact() -> io::Result<()> { 147 | let result = 148 | parse_document_async("&asdf;".as_bytes()) 149 | .await; 150 | 151 | let error = result.unwrap_err(); 152 | assert_eq!(error.kind(), io::ErrorKind::InvalidData); 153 | assert!(error.to_string().contains("&asdf;")); 154 | 155 | Ok(()) 156 | } 157 | 158 | #[tokio::test] 159 | async fn test_fragment_error_line_number() -> io::Result<()> { 160 | let document = parse_document_async("".as_bytes()).await?; 161 | let body = document.children.borrow()[1].children.borrow()[1].clone(); 162 | assert!(body.is_html_element(&local_name!("body"))); 163 | let result = 164 | parse_fragment_async("Hello \n\nworld".as_bytes(), &body).await; 165 | 166 | let error = result.unwrap_err(); 167 | assert_eq!(error.kind(), io::ErrorKind::InvalidData); 168 | assert!(error.to_string().contains("Line 3: ")); 169 | 170 | Ok(()) 171 | } 172 | 173 | #[tokio::test] 174 | async fn test_fragment_error_exact() -> io::Result<()> { 175 | let document = parse_document_async("".as_bytes()).await?; 176 | let body = document.children.borrow()[1].children.borrow()[1].clone(); 177 | assert!(body.is_html_element(&local_name!("body"))); 178 | let result = 179 | parse_fragment_async("&asdf;".as_bytes(), &body).await; 180 | 181 | let error = result.unwrap_err(); 182 | assert_eq!(error.kind(), io::ErrorKind::InvalidData); 183 | assert!(error.to_string().contains("&asdf;")); 184 | 185 | Ok(()) 186 | } 187 | } 188 | 
-------------------------------------------------------------------------------- /src/rcdom_with_line_numbers.rs: -------------------------------------------------------------------------------- 1 | // This provides a wrapper around RcDom which tracks line numbers in the errors. 2 | 3 | use delegate::delegate; 4 | use html5ever::interface::TreeSink; 5 | use html5ever::{ 6 | tendril::StrTendril, 7 | tree_builder::{ElementFlags, NextParserState, NodeOrText, QuirksMode}, 8 | Attribute, ExpandedName, QualName, 9 | }; 10 | use markup5ever_rcdom::{Handle, RcDom}; 11 | use std::borrow::Cow; 12 | use std::io; 13 | 14 | pub struct RcDomWithLineNumbers { 15 | dom: RcDom, 16 | current_line: u64, 17 | } 18 | 19 | impl RcDomWithLineNumbers { 20 | // Expose out the document and errors from the inner RcDom 21 | pub fn document(&self) -> &Handle { 22 | &self.dom.document 23 | } 24 | 25 | pub fn create_error_from_parse_errors(&self) -> io::Result<()> { 26 | if !self.dom.errors.is_empty() { 27 | let error_messages = self 28 | .dom 29 | .errors 30 | .iter() 31 | .map(|e| e.to_string()) 32 | .collect::>() 33 | .join("\n"); 34 | Err(io::Error::new( 35 | io::ErrorKind::InvalidData, 36 | format!("Parse errors encountered:\n\n{}", error_messages), 37 | )) 38 | } else { 39 | Ok(()) 40 | } 41 | } 42 | } 43 | 44 | impl Default for RcDomWithLineNumbers { 45 | fn default() -> Self { 46 | Self { 47 | dom: RcDom::default(), 48 | current_line: 1, 49 | } 50 | } 51 | } 52 | 53 | impl TreeSink for RcDomWithLineNumbers { 54 | type Output = RcDomWithLineNumbers; 55 | type Handle = ::Handle; 56 | 57 | // Override the parse_error method to add line numbers to the error messages. 58 | fn parse_error(&mut self, msg: Cow<'static, str>) { 59 | let msg_with_line = format!("Line {}: {}", self.current_line, msg); 60 | self.dom.parse_error(Cow::Owned(msg_with_line)); 61 | } 62 | 63 | // Override to track the current line number. 
64 | fn set_current_line(&mut self, line: u64) { 65 | self.current_line = line; 66 | } 67 | 68 | // Override to return RcDomWithLineNumbers instead of RcDom. 69 | fn finish(self) -> Self::Output { 70 | self 71 | } 72 | 73 | // Delegate all other methods to RcDom. 74 | delegate! { 75 | to self.dom { 76 | fn get_document(&mut self) -> Self::Handle; 77 | 78 | fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> ExpandedName<'a>; 79 | 80 | fn create_element( 81 | &mut self, 82 | name: QualName, 83 | attrs: Vec, 84 | flags: ElementFlags, 85 | ) -> Self::Handle; 86 | 87 | fn create_comment(&mut self, text: StrTendril) -> Self::Handle; 88 | 89 | fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Self::Handle; 90 | 91 | fn append(&mut self, parent: &Self::Handle, child: NodeOrText); 92 | 93 | fn append_based_on_parent_node( 94 | &mut self, 95 | element: &Self::Handle, 96 | prev_element: &Self::Handle, 97 | child: NodeOrText, 98 | ); 99 | 100 | fn append_doctype_to_document( 101 | &mut self, 102 | name: StrTendril, 103 | public_id: StrTendril, 104 | system_id: StrTendril, 105 | ); 106 | 107 | fn mark_script_already_started(&mut self, node: &Self::Handle); 108 | 109 | fn pop(&mut self, node: &Self::Handle); 110 | 111 | fn get_template_contents(&mut self, target: &Self::Handle) -> Self::Handle; 112 | 113 | fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool; 114 | 115 | fn set_quirks_mode(&mut self, mode: QuirksMode); 116 | 117 | fn append_before_sibling( 118 | &mut self, 119 | sibling: &Self::Handle, 120 | new_node: NodeOrText, 121 | ); 122 | 123 | fn add_attrs_if_missing(&mut self, target: &Self::Handle, attrs: Vec); 124 | 125 | fn associate_with_form( 126 | &mut self, 127 | target: &Self::Handle, 128 | form: &Self::Handle, 129 | nodes: (&Self::Handle, Option<&Self::Handle>), 130 | ); 131 | 132 | fn remove_from_parent(&mut self, target: &Self::Handle); 133 | 134 | fn reparent_children(&mut self, node: &Self::Handle, new_parent: &Self::Handle); 
135 | 136 | fn is_mathml_annotation_xml_integration_point(&self, handle: &Self::Handle) -> bool; 137 | 138 | fn complete_script(&mut self, node: &Self::Handle) -> NextParserState; 139 | } 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/represents.rs: -------------------------------------------------------------------------------- 1 | //! Replaces comments with the HTML which appears in a 2 | //! paragraph of the form: 3 | //!

The tagname element represents ...

4 | 5 | use std::collections::HashMap; 6 | use std::io; 7 | use std::rc::Rc; 8 | 9 | use crate::dom_utils::NodeHandleExt; 10 | use html5ever::local_name; 11 | use html5ever::tendril::StrTendril; 12 | use markup5ever_rcdom::{Handle, NodeData}; 13 | 14 | pub struct Processor { 15 | /// Map from tag name (as found in the paragraph) to the which 16 | /// contains the text "represents". 17 | represents: HashMap, 18 | 19 | /// List of comments to be replaced, and what tag name 20 | /// they correspond to. 21 | placeholders: Vec<(Handle, StrTendril)>, 22 | } 23 | 24 | /// Walks from the text node "represents" and finds the tag name and the 25 | /// span that marks where the description begins, or returns None if that 26 | /// cannot be found. 27 | fn find_tag_name(represents_text: &Handle) -> Option<(StrTendril, Handle)> { 28 | let span = represents_text 29 | .parent_node() 30 | .filter(|p| p.is_html_element(&local_name!("span")))?; 31 | let p = span 32 | .parent_node() 33 | .filter(|p| p.is_html_element(&local_name!("p")))?; 34 | let children = p.children.borrow(); 35 | match &children[..] { 36 | [a, b, c, d, ..] 37 | if a.node_text().as_deref().map(|x| x.trim()) == Some("The") 38 | && b.is_html_element(&local_name!("code")) 39 | && c.node_text().as_deref().map(|x| x.trim()) == Some("element") 40 | && Rc::ptr_eq(d, &span) => 41 | { 42 | Some((b.text_content(), span)) 43 | } 44 | _ => None, 45 | } 46 | } 47 | 48 | impl Processor { 49 | pub fn new() -> Self { 50 | Self { 51 | represents: HashMap::new(), 52 | placeholders: Vec::new(), 53 | } 54 | } 55 | 56 | /// Should be called for each node the document. 
Records when it sees a 57 | /// represents and which element it is defining 58 | pub fn visit(&mut self, node: &Handle) { 59 | match node.data { 60 | NodeData::Text { ref contents } if contents.borrow().as_ref() == "represents" => { 61 | if let Some((tag, span)) = find_tag_name(node) { 62 | self.represents.insert(tag, span); 63 | } 64 | } 65 | NodeData::Comment { ref contents } if contents.starts_with("REPRESENTS ") => { 66 | self.placeholders 67 | .push((node.clone(), contents.subtendril(11, contents.len32() - 11))); 68 | } 69 | _ => (), 70 | } 71 | } 72 | 73 | pub fn apply(self) -> io::Result<()> { 74 | for (placeholder, ref tag) in self.placeholders { 75 | let span = match self.represents.get(tag) { 76 | Some(span) => span, 77 | None => { 78 | return Err(io::Error::new( 79 | io::ErrorKind::InvalidData, 80 | format!(" refers to unknown tag", tag), 81 | )); 82 | } 83 | }; 84 | let parent = match span.parent_node() { 85 | Some(p) => p, 86 | None => continue, 87 | }; 88 | let replacements = parent 89 | .children 90 | .borrow() 91 | .iter() 92 | .skip_while(|s| !Rc::ptr_eq(s, span)) 93 | .skip(1) 94 | .enumerate() 95 | .map(|(index, sibling)| { 96 | let clone = sibling.deep_clone(); 97 | // Capitalize the first letter of the first node (which is expected to be text). 
98 | if let (0, NodeData::Text { ref contents }) = (index, &clone.data) { 99 | contents.replace_with(|text| capitalize(text.trim_start())); 100 | } 101 | clone 102 | }) 103 | .collect(); 104 | placeholder.replace_with(replacements); 105 | } 106 | Ok(()) 107 | } 108 | } 109 | 110 | fn capitalize(text: &str) -> StrTendril { 111 | let mut chars = text.chars(); 112 | match chars.next() { 113 | Some(c) => { 114 | let mut capitalized = StrTendril::from_char(c.to_ascii_uppercase()); 115 | capitalized.push_slice(chars.as_str()); 116 | capitalized 117 | } 118 | None => StrTendril::new(), 119 | } 120 | } 121 | 122 | #[cfg(test)] 123 | mod tests { 124 | use super::*; 125 | use crate::dom_utils; 126 | use crate::parser::{parse_document_async, tests::serialize_for_test}; 127 | 128 | #[tokio::test] 129 | async fn test_represents() -> io::Result<()> { 130 | // Uses can occur either before or after. 131 | let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; 132 | let mut proc = Processor::new(); 133 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 134 | proc.apply()?; 135 | assert_eq!( 136 | serialize_for_test(&[document]), 137 | "

A seat\nat a table.

The chair element represents a seat\nat a table.

A seat\nat a table.

" 138 | ); 139 | Ok(()) 140 | } 141 | 142 | #[tokio::test] 143 | async fn test_represents_undefined() -> io::Result<()> { 144 | // Uses can occur either before or after. 145 | let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; 146 | let mut proc = Processor::new(); 147 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 148 | let result = proc.apply(); 149 | assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); 150 | Ok(()) 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/tag_omission.rs: -------------------------------------------------------------------------------- 1 | //! Looks at the "Optional tags" and "Void elements" sections from the HTML 2 | //! syntax spec and replicates that information into the descriptions of the 3 | //! individual elements. 4 | 5 | use std::borrow::Borrow; 6 | use std::collections::HashMap; 7 | use std::io; 8 | 9 | use html5ever::tendril::StrTendril; 10 | use html5ever::{local_name, namespace_url, ns, LocalName, QualName}; 11 | use markup5ever_rcdom::{Handle, NodeData}; 12 | use regex::Regex; 13 | 14 | use crate::dom_utils::{self, heading_level, NodeHandleExt}; 15 | 16 | #[derive(Default)] 17 | struct ElementInfo { 18 | /// Handles on any paragraphs in the "Optional tags" section which refer to the element. 19 | optional_tags_info: Vec, 20 | 21 | /// Whether the element appears in the "Void elements" list. 22 | is_void_element: bool, 23 | 24 | ///

into which this info must be added. 25 | dl: Option, 26 | } 27 | 28 | #[derive(Default)] 29 | pub struct Processor { 30 | /// The heading level of the "Optional tags" heading, if inside one. 31 | in_optional_tags_heading: Option, 32 | 33 | /// Most recently seen . 34 | most_recent_element_dfn: Option, 35 | 36 | /// Info about elements which have been referred to in these sections. 37 | elements: HashMap, 38 | } 39 | 40 | impl Processor { 41 | pub fn new() -> Self { 42 | Default::default() 43 | } 44 | 45 | pub fn visit(&mut self, node: &Handle) { 46 | // If the heading ends the "Optional tags" section, clear that state. 47 | if let Some(optional_tag_heading_level) = self.in_optional_tags_heading { 48 | match heading_level(node) { 49 | Some(level) if level <= optional_tag_heading_level => { 50 | self.in_optional_tags_heading = None; 51 | } 52 | _ => (), 53 | } 54 | } 55 | 56 | // If we encounter an "Optional tags" section, start observing relevant paragraphs. 57 | // When one is encountered, possibly add it. 58 | if let Some(level) = heading_level(node) { 59 | if node.text_content().trim() == "Optional tags" { 60 | self.in_optional_tags_heading = Some(level); 61 | } 62 | } else if self.in_optional_tags_heading.is_some() && node.is_html_element(&local_name!("p")) 63 | { 64 | self.maybe_record_optional_tags_paragraph(node); 65 | } 66 | 67 | // If we encounter the Void elements section, look for the next dt. 68 | if node.is_html_element(&local_name!("dfn")) 69 | && node.text_content().trim() == "Void elements" 70 | { 71 | if let Some(dt) = node 72 | .parent_node() 73 | .filter(|n| n.is_html_element(&local_name!("dt"))) 74 | { 75 | for dd in dom_utils::dt_descriptions(&dt) { 76 | dom_utils::scan_dom(&dd, &mut |n| { 77 | if n.is_html_element(&local_name!("code")) { 78 | let info = self.elements.entry(n.text_content()).or_default(); 79 | info.is_void_element = true; 80 | } 81 | }); 82 | } 83 | } 84 | } 85 | 86 | // If we see an element dfn, watch out for the upcoming
. 87 | if node.is_html_element(&local_name!("dfn")) 88 | && node.has_attribute(&QualName::new(None, ns!(), LocalName::from("element"))) 89 | { 90 | self.most_recent_element_dfn = Some(node.text_content()); 91 | } 92 | 93 | // If we see a
, record that. 94 | if node.is_html_element(&local_name!("dl")) && node.has_class("element") { 95 | if let Some(elem) = std::mem::take(&mut self.most_recent_element_dfn) { 96 | let info = self.elements.entry(elem).or_default(); 97 | if info.dl.is_none() { 98 | info.dl = Some(node.clone()); 99 | } 100 | } 101 | } 102 | } 103 | 104 | fn maybe_record_optional_tags_paragraph(&mut self, paragraph: &Handle) { 105 | // The paragraph must have the structure "A(n) img element..." 106 | let children = paragraph.children.borrow(); 107 | let mut iter = children.iter().fuse(); 108 | match (iter.next(), iter.next(), iter.next()) { 109 | (Some(a), Some(b), Some(c)) 110 | if a.node_text() 111 | .map_or(false, |t| t.trim() == "A" || t.trim() == "An") 112 | && b.is_html_element(&local_name!("code")) 113 | && c.node_text() 114 | .map_or(false, |t| t.trim().starts_with("element")) => 115 | { 116 | let info = self.elements.entry(b.text_content()).or_default(); 117 | info.optional_tags_info.push(paragraph.clone()); 118 | } 119 | _ => (), 120 | } 121 | } 122 | 123 | pub fn apply(self) -> io::Result<()> { 124 | let data_x = LocalName::from("data-x"); 125 | let qual_data_x = QualName::new(None, ns!(), data_x.clone()); 126 | let dt = Handle::create_element(local_name!("dt")) 127 | .child( 128 | Handle::create_element(local_name!("span")) 129 | .attribute(&data_x, "concept-element-tag-omission") 130 | .text("Tag omission in text/html") 131 | .build(), 132 | ) 133 | .text(":") 134 | .build(); 135 | let void_dd = Handle::create_element(local_name!("dd")) 136 | .text("No ") 137 | .child( 138 | Handle::create_element(local_name!("span")) 139 | .attribute(&data_x, "syntax-end-tag") 140 | .text("end tag") 141 | .build(), 142 | ) 143 | .text(".") 144 | .build(); 145 | let default_dd = Handle::create_element(local_name!("dd")) 146 | .text("Neither tag is omissible.") 147 | .build(); 148 | let may_re = Regex::new(r"\bmay\b").unwrap(); 149 | 150 | for info in self.elements.into_values() { 151 | let dl 
= match info.dl { 152 | Some(dl) => dl, 153 | None => continue, 154 | }; 155 | 156 | let mut to_insert = vec![dt.deep_clone()]; 157 | if !info.optional_tags_info.is_empty() { 158 | // Convert

to

, replacing "may" with "can". 159 | for p in info.optional_tags_info { 160 | let borrowed_children = p.children.borrow(); 161 | let new_children = borrowed_children.iter().map(|n| { 162 | let new_node = n.deep_clone(); 163 | dom_utils::scan_dom(&new_node, &mut |c| { 164 | if let NodeData::Text { ref contents } = c.data { 165 | let mut text = contents.borrow_mut(); 166 | *text = StrTendril::from(may_re.replace(&text, "can").borrow()); 167 | } 168 | }); 169 | new_node 170 | }); 171 | let dd = Handle::create_element(local_name!("dd")) 172 | .children(new_children) 173 | .build(); 174 | to_insert.push(dd); 175 | } 176 | } else if info.is_void_element { 177 | to_insert.push(void_dd.deep_clone()); 178 | } else { 179 | to_insert.push(default_dd.deep_clone()); 180 | } 181 | to_insert.push(Handle::create_text_node("\n")); 182 | 183 | let dl_children = dl.children.borrow(); 184 | let attributes_dt = if let Some(attributes_dt) = dl_children.iter().find(|child| { 185 | child.is_html_element(&local_name!("dt")) 186 | && child 187 | .any_child(|c| c.attribute_is(&qual_data_x, "concept-element-attributes")) 188 | }) { 189 | attributes_dt.clone() 190 | } else { 191 | continue; 192 | }; 193 | drop(dl_children); 194 | dl.insert_children_before(&attributes_dt, to_insert.into_iter()); 195 | } 196 | Ok(()) 197 | } 198 | } 199 | 200 | #[cfg(test)] 201 | mod tests { 202 | use super::*; 203 | use crate::parser::{parse_document_async, tests::serialize_for_test}; 204 | 205 | #[tokio::test] 206 | async fn test_simple() -> io::Result<()> { 207 | let document = parse_document_async( 208 | r#" 209 | 210 |

Optional tags

211 |

A td element does very tdish things and may be very cellular.

212 |

An audio element is quite audible.

213 |

Another section

214 |

A body element is ignored because it's in another section. 215 |

216 |
Void elements 217 |
img and meta are void. 218 |
input is too. 219 |
Non-void elements 220 |
html is interesting but not void. 221 |
222 |

Elements

223 |

audio 224 |

225 |
226 |
227 |

body 228 |

229 |
230 |
231 |

html 232 |

233 |
234 |
235 |

img 236 |

237 |
238 |
239 |

input 240 |

241 |
242 |
243 |

meta 244 |

245 |
246 |
247 |

td 248 |

249 |
250 |
251 | "# 252 | .trim() 253 | .as_bytes(), 254 | ) 255 | .await?; 256 | let mut proc = Processor::new(); 257 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 258 | proc.apply()?; 259 | assert_eq!( 260 | serialize_for_test(&[document]), 261 | r#" 262 |

Optional tags

263 |

A td element does very tdish things and may be very cellular.

264 |

An audio element is quite audible.

265 |

Another section

266 |

A body element is ignored because it's in another section. 267 |

268 |
Void elements 269 |
img and meta are void. 270 |
input is too. 271 |
Non-void elements 272 |
html is interesting but not void. 273 |
274 |

Elements

275 |

audio 276 |

277 |
Tag omission in text/html:
An audio element is quite audible.
278 |
279 |
280 |

body 281 |

282 |
Tag omission in text/html:
Neither tag is omissible.
283 |
284 |
285 |

html 286 |

287 |
Tag omission in text/html:
Neither tag is omissible.
288 |
289 |
290 |

img 291 |

292 |
Tag omission in text/html:
No end tag.
293 |
294 |
295 |

input 296 |

297 |
Tag omission in text/html:
No end tag.
298 |
299 |
300 |

meta 301 |

302 |
Tag omission in text/html:
No end tag.
303 |
304 |
305 |

td 306 |

307 |
Tag omission in text/html:
A td element does very tdish things and can be very cellular.
308 |
309 |
310 | "#.trim()); 311 | Ok(()) 312 | } 313 | } 314 | --------------------------------------------------------------------------------
someattribute 105 | // a; b; ... 106 | // Description of how someattribute applies to a, b, etc. 107 | // Description if the valid values 108 | // And we want to extract the descriptions so that we can later insert them 109 | // alongside the definitions of attr-a-someattribute, etc. 110 | let row_children = row.children.borrow(); 111 | let mut tds = row_children 112 | .iter() 113 | .filter(|c| c.is_html_element(&local_name!("td"))); 114 | let (keys_td, description_td) = match (tds.next(), tds.next()) { 115 | (Some(a), Some(b)) => (a, b), 116 | _ => continue, 117 | }; 118 | 119 | // If a single row describes the same element multiple times, we don't need to repeat it. 120 | // StrTendril doesn't have logical interior mutability, so this Clippy warning is overzealous. 121 | #[allow(clippy::mutable_key_type)] 122 | let mut seen_this_row: HashSet = HashSet::new(); 123 | 124 | // These will be strings like "attr-input-maxlength", which identify particular element-attribute pairs. 125 | let data_x = QualName::new(None, ns!(), LocalName::from("data-x")); 126 | for attr_key in keys_td 127 | .children 128 | .borrow() 129 | .iter() 130 | .filter_map(|c| c.get_attribute(&data_x).filter(|v| !v.is_empty())) 131 | { 132 | // If this row describes the the same attribute, with the same 133 | // identifier, for multiple elements (like attr-fae-form and 134 | // attr-dim-width), these aren't actually distinct descriptions 135 | // and we need not join them. 136 | if !seen_this_row.insert(attr_key.clone()) { 137 | continue; 138 | } 139 | 140 | // Find the comment, if one exists, and extract its contents. 
141 | let description = description_td.children.borrow(); 142 | let mut variant_comment = None; 143 | let mut variant_str = None; 144 | for node in description.iter() { 145 | if let NodeData::Comment { ref contents } = node.data { 146 | if contents.trim().starts_with("or:") { 147 | variant_comment = Some(node); 148 | variant_str = Some(StrTendril::from(contents.trim()[3..].trim_start())); 149 | } 150 | } 151 | } 152 | 153 | // Store the (already parsed) ordinary description. If a variant 154 | // comment exists, omit it and instead store its unparsed 155 | // string. 156 | let descriptions = Descriptions { 157 | default: description_td 158 | .children 159 | .borrow() 160 | .iter() 161 | .filter(|c| variant_comment.map_or(true, |vc| !Rc::ptr_eq(c, vc))) 162 | .map(|c| c.deep_clone()) 163 | .collect(), 164 | variant: variant_str, 165 | }; 166 | let existing = self.attributes.entry(attr_key).or_default(); 167 | if existing.default.is_empty() { 168 | existing.default = descriptions.default; 169 | } else if !descriptions.default.is_empty() { 170 | if let NodeData::Text { ref contents } = existing.default.last().unwrap().data { 171 | let mut borrow = contents.borrow_mut(); 172 | if let Some(last_non_ws) = borrow.rfind(|c: char| !c.is_ascii_whitespace()) 173 | { 174 | let to_remove = borrow.len32() - (last_non_ws as u32) - 1; 175 | borrow.pop_back(to_remove); 176 | } 177 | } 178 | existing.default.push(Handle::create_text_node("; ")); 179 | existing.default.extend(descriptions.default.into_iter()); 180 | } 181 | if existing.variant.is_none() { 182 | existing.variant = descriptions.variant; 183 | } else if descriptions.variant.is_some() { 184 | let existing_variant = existing.variant.as_mut().unwrap(); 185 | existing_variant.push_slice("; "); 186 | existing_variant.push_tendril(&descriptions.variant.unwrap()); 187 | } 188 | } 189 | } 190 | } 191 | 192 | fn index_attribute_list(&mut self, dt: &Handle) { 193 | // If a
contains , it is not annotated. 194 | // If it contains , the description found in a comment is used instead. 195 | // If it mentions "special semantics", it is joined with a colon rather than an em dash. 196 | let data_x = QualName::new(None, ns!(), LocalName::from("data-x")); 197 | let parent = dt.parent_node().unwrap(); 198 | let children = parent.children.borrow(); 199 | self.edits.extend( 200 | children 201 | .iter() 202 | .skip_while(|n| !Rc::ptr_eq(n, dt)) 203 | .skip(1) 204 | .filter(|n| n.is_element()) 205 | .take_while(|e| e.is_html_element(&local_name!("dd"))) 206 | .filter_map(|dd| { 207 | let mut can_annotate = true; 208 | let mut wants_variant_description = false; 209 | let mut has_special_semantics = false; 210 | let mut key = None; 211 | dom_utils::scan_dom(dd, &mut |n| match &n.data { 212 | NodeData::Comment { ref contents } if contents.trim() == "no-annotate" => { 213 | can_annotate = false; 214 | } 215 | NodeData::Comment { ref contents } if contents.trim() == "variant" => { 216 | wants_variant_description = true; 217 | } 218 | NodeData::Text { ref contents } 219 | if contents.borrow().contains("has special semantics") => 220 | { 221 | has_special_semantics = true; 222 | } 223 | NodeData::Element { .. } => { 224 | if key.is_none() { 225 | key = n.get_attribute(&data_x); 226 | } 227 | } 228 | _ => (), 229 | }); 230 | match (can_annotate, key) { 231 | (true, Some(key)) => Some(Edit { 232 | dd: dd.clone(), 233 | key, 234 | wants_variant_description, 235 | has_special_semantics, 236 | }), 237 | _ => None, 238 | } 239 | }), 240 | ); 241 | } 242 | 243 | pub async fn apply(self) -> io::Result<()> { 244 | let em_dash = StrTendril::from(" \u{2014} "); 245 | 246 | for Edit { 247 | dd, 248 | key, 249 | wants_variant_description, 250 | has_special_semantics, 251 | } in self.edits 252 | { 253 | // Find the requested description to insert at this point. 
254 | let descriptions = match self.attributes.get(&key) { 255 | Some(descriptions) => descriptions, 256 | None => continue, 257 | }; 258 | let mut description: Vec = match descriptions { 259 | Descriptions { 260 | variant: Some(ref variant), 261 | .. 262 | } if wants_variant_description => { 263 | parser::parse_fragment_async(variant[..].as_bytes(), &dd).await? 264 | } 265 | _ if wants_variant_description => { 266 | return Err(io::Error::new( 267 | io::ErrorKind::InvalidData, 268 | format!( 269 | "Attribute {key} wants variant description, but no was found" 270 | ), 271 | )) 272 | } 273 | Descriptions { ref default, .. } => { 274 | default.iter().map(|n| n.deep_clone()).collect() 275 | } 276 | }; 277 | 278 | let mut dd_children = dd.children.borrow_mut(); 279 | if has_special_semantics { 280 | // Replace the trailing period with a separating colon. 281 | if let Some(NodeData::Text { contents }) = dd_children.last_mut().map(|n| &n.data) { 282 | let mut text = contents.borrow_mut(); 283 | *text = StrTendril::from( 284 | text.trim_end_matches(|c: char| c.is_ascii_whitespace() || c == '.'), 285 | ); 286 | text.push_slice(": "); 287 | } 288 | } else { 289 | // Insert an em dash. 290 | description.insert(0, Handle::create_text_node(em_dash.clone())); 291 | } 292 | 293 | // Insert the description. 294 | for child in description.iter_mut() { 295 | child.parent.set(Some(Rc::downgrade(&dd))); 296 | } 297 | dd_children.extend(description); 298 | } 299 | Ok(()) 300 | } 301 | } 302 | 303 | #[cfg(test)] 304 | mod tests { 305 | use super::*; 306 | use crate::parser::{parse_document_async, tests::serialize_for_test}; 307 | 308 | #[tokio::test] 309 | async fn test_simple() -> io::Result<()> { 310 | // This is a simple document with enough stuff in it. Elements are shown 311 | // before and after the attributes table, to demonstrate that this is 312 | // not sensitive to which order they occur in (i.e., these could be 313 | // reordered in the HTML spec). 
314 | let document = parse_document_async( 315 | r#" 316 | 317 |

The a element

318 |
319 |
Categories 320 |
Flow content 321 |
Content attributes 322 |
href 323 |
324 |

Attributes

325 | 326 |
hrefa; areaDestination of the hyperlink 327 |
328 |

The area element

329 |
330 |
Categories 331 |
Flow content 332 |
Content attributes 333 |
href 334 |
335 | "#.trim().as_bytes()).await?; 336 | let mut proc = Processor::new(); 337 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 338 | proc.apply().await?; 339 | assert_eq!( 340 | serialize_for_test(&[document]), 341 | r#" 342 |

The a element

343 |
344 |
Categories 345 |
Flow content 346 |
Content attributes 347 |
href 348 | — Destination of the hyperlink 349 |
350 |

Attributes

351 | 352 |
hrefa; areaDestination of the hyperlink 353 |
354 |

The area element

355 |
356 |
Categories 357 |
Flow content 358 |
Content attributes 359 |
href 360 | — Destination of the hyperlink 361 |
362 | "#.trim() 363 | ); 364 | Ok(()) 365 | } 366 | 367 | #[tokio::test] 368 | async fn test_variant() -> io::Result<()> { 369 | // This checks that and work correctly. 370 | // i.e., the variant description is used where requested 371 | let document = parse_document_async( 372 | r#" 373 | 374 |

The a element

375 |
376 |
Content attributes 377 |
href 378 |
379 |

Attributes

380 | 381 |
hrefa; areaDestination of the hyperlink 382 |
383 |

The area element

384 |
385 |
Content attributes 386 |
href 387 |
388 | "#.trim().as_bytes()).await?; 389 | let mut proc = Processor::new(); 390 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 391 | proc.apply().await?; 392 | assert_eq!( 393 | serialize_for_test(&[document]), 394 | r#" 395 |

The a element

396 |
397 |
Content attributes 398 |
href 399 | — Destination of the hyperlink 400 |
401 |

Attributes

402 | 403 |
hrefa; areaDestination of the hyperlink 404 |
405 |

The area element

406 |
407 |
Content attributes 408 |
href 409 | — click on shapes!
410 | "#.trim() 411 | ); 412 | Ok(()) 413 | } 414 | 415 | #[tokio::test] 416 | async fn test_special_semantics() -> io::Result<()> { 417 | // Checks that the special rules for using : instead of an em dash work. 418 | let document = parse_document_async( 419 | r#" 420 | 421 |

The a element

422 |
423 |
Content attributes 424 |
Also, the name attribute has special semantics on this element. 425 |
426 |

Attributes

427 | 428 |
nameaAnchor name 429 |
430 | "#.trim().as_bytes()).await?; 431 | let mut proc = Processor::new(); 432 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 433 | proc.apply().await?; 434 | assert_eq!( 435 | serialize_for_test(&[document]), 436 | r#" 437 |

The a element

438 |
439 |
Content attributes 440 |
Also, the name attribute has special semantics on this element: Anchor name 441 |
442 |

Attributes

443 | 444 |
nameaAnchor name 445 |
446 | "#.trim() 447 | ); 448 | Ok(()) 449 | } 450 | 451 | #[tokio::test] 452 | async fn test_special_semantics_multiple() -> io::Result<()> { 453 | // Checks that the special rules for joining any special semantics with a ; work. 454 | let document = parse_document_async( 455 | r#" 456 | 457 |

The a element

458 |
459 |
Content attributes 460 |
Also, the name attribute has special semantics on this element. 461 |
462 |

Attributes

463 | 464 |
nameaAnchor name 465 |
nameaName of the anchor 466 |
467 | "#.trim().as_bytes()).await?; 468 | let mut proc = Processor::new(); 469 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 470 | proc.apply().await?; 471 | assert_eq!( 472 | serialize_for_test(&[document]), 473 | r#" 474 |

The a element

475 |
476 |
Content attributes 477 |
Also, the name attribute has special semantics on this element: Anchor name; Name of the anchor 478 |
479 |

Attributes

480 | 481 |
nameaAnchor name 482 |
nameaName of the anchor 483 |
484 | "#.trim() 485 | ); 486 | Ok(()) 487 | } 488 | 489 | #[tokio::test] 490 | async fn test_identical_links() -> io::Result<()> { 491 | // This checks the same identifier can be linked multiple times without 492 | // repeating the description. 493 | let document = parse_document_async( 494 | r#" 495 | 496 |

The img element

497 |
498 |
Content attributes 499 |
width 500 |
501 |

The video element

502 |
503 |
Content attributes 504 |
width 505 |
506 |

Attributes

507 | 508 |
widthimg; videoHorizontal dimension 509 |
510 | "#.trim().as_bytes()).await?; 511 | let mut proc = Processor::new(); 512 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 513 | proc.apply().await?; 514 | assert_eq!( 515 | serialize_for_test(&[document]), 516 | r#" 517 |

The img element

518 |
519 |
Content attributes 520 |
width 521 | — Horizontal dimension 522 |
523 |

The video element

524 |
525 |
Content attributes 526 |
width 527 | — Horizontal dimension 528 |
529 |

Attributes

530 | 531 |
widthimg; videoHorizontal dimension 532 |
533 | "#.trim() 534 | ); 535 | Ok(()) 536 | } 537 | } 538 | -------------------------------------------------------------------------------- /src/boilerplate.rs: -------------------------------------------------------------------------------- 1 | //! Replaces comments. 2 | //! These can either be comment nodes (in which case the resulting fragment will 3 | //! be inserted), or the complete value of an element's attribute (in which case 4 | //! the text will become the attribute value). 5 | 6 | use std::io; 7 | use std::path::{Path, PathBuf}; 8 | 9 | use html5ever::tendril::{self, SendTendril}; 10 | use html5ever::{local_name, Attribute, LocalName, QualName}; 11 | use markup5ever_rcdom::{Handle, NodeData}; 12 | use tokio::fs::File; 13 | use tokio::task::JoinHandle; 14 | 15 | use crate::dom_utils::NodeHandleExt; 16 | use crate::io_utils::{async_error, is_safe_path, read_to_str_tendril}; 17 | use crate::parser; 18 | 19 | type SendStrTendril = SendTendril; 20 | 21 | enum Edit { 22 | ReplaceHTML(Handle, JoinHandle>), 23 | ReplaceAttr(Handle, QualName, JoinHandle>), 24 | ReplaceText(Handle, JoinHandle>), 25 | } 26 | 27 | pub struct Processor { 28 | /// Path to look for boilerplate files. 29 | path: PathBuf, 30 | 31 | /// Path to look for example files. 32 | example_path: PathBuf, 33 | 34 | /// Changes to be made in the apply step. 35 | edits: Vec, 36 | } 37 | 38 | impl Processor { 39 | pub fn new(path: impl Into, example_path: impl Into) -> Self { 40 | Self { 41 | path: path.into(), 42 | example_path: example_path.into(), 43 | edits: vec![], 44 | } 45 | } 46 | 47 | /// Should be called for each node in the document. 48 | /// Identifies replacements which will be needed, and starts the necessary 49 | /// I/O. 50 | pub fn visit(&mut self, node: &Handle) { 51 | match node.data { 52 | // BOILERPLATE comments will need to be replaced with their 53 | // corresponding HTML, parsed. Open the file so that we can do so on 54 | // demand. 
55 | NodeData::Comment { ref contents } if contents.starts_with("BOILERPLATE ") => { 56 | let path = Path::new(contents[12..].trim()); 57 | let file = if is_safe_path(path) { 58 | tokio::spawn(File::open(self.path.join(path))) 59 | } else { 60 | async_error(io::Error::new( 61 | io::ErrorKind::PermissionDenied, 62 | "cannot traverse to a parent directory in {path}", 63 | )) 64 | }; 65 | self.edits.push(Edit::ReplaceHTML(node.clone(), file)); 66 | } 67 | // Pseudo-comments can also appear in element attributes. These are 68 | // not parsed as HTML, so we simply want to read them into memory so 69 | // they can be replaced. 70 | NodeData::Element { ref attrs, .. } => { 71 | for Attribute { 72 | ref name, 73 | ref value, 74 | } in attrs.borrow().iter() 75 | { 76 | if value.starts_with("") { 77 | let path = Path::new(value[16..value.len() - 3].trim()); 78 | let file_contents = if is_safe_path(path) { 79 | read_to_str_tendril(self.path.join(path)) 80 | } else { 81 | async_error(io::Error::new( 82 | io::ErrorKind::PermissionDenied, 83 | "cannot traverse to a parent directory in {path}", 84 | )) 85 | }; 86 | self.edits.push(Edit::ReplaceAttr( 87 | node.clone(), 88 | name.clone(), 89 | file_contents, 90 | )); 91 | } 92 | } 93 | } 94 | //
 and 
 which contain EXAMPLE also need to be
 95 |             // replaced, but as plain text. These are loaded from the "examples"
 96 |             // directory instead.
 97 |             NodeData::Text { ref contents } => {
 98 |                 let borrowed_contents = contents.borrow();
 99 |                 let text = borrowed_contents.trim();
100 |                 if !text.starts_with("EXAMPLE ") {
101 |                     return;
102 |                 }
103 |                 const PRE: LocalName = local_name!("pre");
104 |                 const CODE: LocalName = local_name!("code");
105 |                 let has_suitable_parent = node.parent_node().map_or(false, |p| {
106 |                     p.is_html_element(&PRE)
107 |                         || (p.is_html_element(&CODE)
108 |                             && p.parent_node().map_or(false, |p2| p2.is_html_element(&PRE)))
109 |                 });
110 |                 if has_suitable_parent {
111 |                     let path = Path::new(text[8..].trim());
112 |                     let file_contents = if is_safe_path(path) {
113 |                         read_to_str_tendril(self.example_path.join(path))
114 |                     } else {
115 |                         async_error(io::Error::new(
116 |                             io::ErrorKind::PermissionDenied,
117 |                             "cannot traverse to a parent directory in {path}",
118 |                         ))
119 |                     };
120 |                     self.edits
121 |                         .push(Edit::ReplaceText(node.clone(), file_contents))
122 |                 }
123 |             }
124 |             _ => (),
125 |         }
126 |     }
127 | 
128 |     /// Applies the required replacements, in order.
129 |     pub async fn apply(self) -> io::Result<()> {
130 |         for edit in self.edits {
131 |             match edit {
132 |                 // When parsing HTML, we need the context it's in so that the
133 |                 // context-sensitive parsing behavior works correctly.
134 |                 Edit::ReplaceHTML(node, replacement) => {
135 |                     let context = match node.parent_node() {
136 |                         Some(n) => n,
137 |                         _ => continue,
138 |                     };
139 |                     let file: File = replacement.await??;
140 |                     let new_children = parser::parse_fragment_async(file, &context).await?;
141 |                     node.replace_with(new_children);
142 |                 }
143 |                 Edit::ReplaceAttr(element, ref attr, replacement) => {
144 |                     element.set_attribute(attr, replacement.await??.into());
145 |                 }
146 |                 Edit::ReplaceText(element, replacement) => match element.data {
147 |                     NodeData::Text { ref contents } => {
148 |                         contents.replace(replacement.await??.into());
149 |                     }
150 |                     _ => panic!("not text"),
151 |                 },
152 |             }
153 |         }
154 |         Ok(())
155 |     }
156 | }
157 | 
158 | #[cfg(test)]
159 | mod tests {
160 |     use super::*;
161 |     use crate::dom_utils;
162 |     use crate::parser::{parse_document_async, tests::serialize_for_test};
163 |     use tempfile::TempDir;
164 | 
165 |     #[tokio::test]
166 |     async fn test_replace_boilerplate_comment() -> io::Result<()> {
167 |         let boilerplate_dir = TempDir::new()?;
168 |         tokio::fs::write(
169 |             boilerplate_dir.path().join("languages"),
170 |             "
enEnglish", 171 | ) 172 | .await?; 173 | let document = parse_document_async( 174 | "
".as_bytes(), 175 | ) 176 | .await?; 177 | let mut proc = Processor::new(boilerplate_dir.path(), Path::new(".")); 178 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 179 | proc.apply().await?; 180 | assert_eq!( 181 | serialize_for_test(&[document]), 182 | "
enEnglish
"); 183 | Ok(()) 184 | } 185 | 186 | #[tokio::test] 187 | async fn test_replace_boilerplate_attribute() -> io::Result<()> { 188 | let boilerplate_dir = TempDir::new()?; 189 | tokio::fs::write( 190 | boilerplate_dir.path().join("data.url"), 191 | "data:text/html,Hello, world!", 192 | ) 193 | .await?; 194 | let document = parse_document_async( 195 | "\">hello".as_bytes(), 196 | ) 197 | .await?; 198 | let mut proc = Processor::new(boilerplate_dir.path(), Path::new(".")); 199 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 200 | proc.apply().await?; 201 | assert_eq!( 202 | serialize_for_test(&[document]), 203 | "hello"); 204 | Ok(()) 205 | } 206 | 207 | #[tokio::test] 208 | async fn test_replace_example() -> io::Result<()> { 209 | let example_dir = TempDir::new()?; 210 | tokio::fs::write(example_dir.path().join("ex1"), "first").await?; 211 | tokio::fs::write(example_dir.path().join("ex2"), "second").await?; 212 | tokio::fs::write(example_dir.path().join("ignored"), "bad").await?; 213 | let document = 214 | parse_document_async("
EXAMPLE ex1
\nEXAMPLE ex2  

EXAMPLE ignored

".as_bytes()) 215 | .await?; 216 | let mut proc = Processor::new(Path::new("."), example_dir.path()); 217 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 218 | proc.apply().await?; 219 | assert_eq!( 220 | serialize_for_test(&[document]), 221 | "
first
second

EXAMPLE ignored

" ); 222 | Ok(()) 223 | } 224 | 225 | #[tokio::test] 226 | async fn test_errors_unsafe_paths() -> io::Result<()> { 227 | let bad_path_examples = [ 228 | "", 229 | "
\">
", 230 | "
EXAMPLE ../foo
", 231 | ]; 232 | for example in bad_path_examples { 233 | let document = parse_document_async(example.as_bytes()).await?; 234 | let mut proc = Processor::new(Path::new("."), Path::new(".")); 235 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 236 | let result = proc.apply().await; 237 | assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::PermissionDenied)); 238 | } 239 | Ok(()) 240 | } 241 | } 242 | -------------------------------------------------------------------------------- /src/dom_utils.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | use std::rc::Rc; 3 | 4 | use html5ever::tendril::StrTendril; 5 | use html5ever::{local_name, namespace_url, ns, Attribute, LocalName, QualName}; 6 | use markup5ever_rcdom::{Handle, Node, NodeData}; 7 | 8 | /// Extensions to the DOM interface to make manipulation more ergonimc. 9 | pub trait NodeHandleExt { 10 | /// Returns a handle to the parent node, if there is one. 11 | fn parent_node(&self) -> Option 12 | where 13 | Self: Sized; 14 | 15 | /// Gets an attribute on the element, or None if absent or not an element. 16 | fn get_attribute(&self, name: &QualName) -> Option; 17 | 18 | /// Returns whether the node has the named attribute. 19 | fn has_attribute(&self, name: &QualName) -> bool { 20 | self.get_attribute(name).is_some() 21 | } 22 | 23 | /// Returns true if the attribute exists and has the value mentioned. 24 | fn attribute_is(&self, name: &QualName, expected: &str) -> bool { 25 | self.get_attribute(name).as_deref() == Some(expected) 26 | } 27 | 28 | /// Sets an attribute on the element. Must be an element. 29 | fn set_attribute(&self, name: &QualName, value: StrTendril); 30 | 31 | /// Returns true if the node is an element. 32 | fn is_element(&self) -> bool; 33 | 34 | /// Returns true if the node is an HTML element with the given tag name. 
35 | fn is_html_element(&self, tag_name: &LocalName) -> bool; 36 | 37 | /// Returns true if the node is an element with the given class. 38 | fn has_class(&self, class: &str) -> bool; 39 | 40 | /// Returns true if the node is an element with the given ID. 41 | fn has_id(&self, id: &str) -> bool { 42 | const ID: QualName = QualName { 43 | prefix: None, 44 | ns: ns!(), 45 | local: local_name!("id"), 46 | }; 47 | self.attribute_is(&ID, id) 48 | } 49 | 50 | /// If this is a text node, returns its text. 51 | fn node_text(&self) -> Option; 52 | 53 | /// Concatenate the text of the node and its descendants. 54 | fn text_content(&self) -> StrTendril; 55 | 56 | /// True if any child matches the predicate. 57 | fn any_child(&self, f: impl Fn(&Self) -> bool) -> bool; 58 | 59 | /// Appends children (without checking node type). 60 | fn append_children(&self, children: impl Iterator); 61 | 62 | /// Inserts children before the specified child. 63 | fn insert_children_before(&self, existing: &Self, new: impl Iterator); 64 | 65 | /// Removes the node from its parent and replaces it with the nodes provided. 66 | /// Does nothing if the node has no parent. 67 | fn replace_with(&self, replacements: Vec) 68 | where 69 | Self: Sized; 70 | 71 | /// Clones the node and its entire subtree (including template contents). 72 | fn deep_clone(&self) -> Self; 73 | 74 | /// Create a new element, with the given children. 75 | fn create_element(name: LocalName) -> ElementBuilder 76 | where 77 | Self: Sized; 78 | 79 | /// Create a new text node. 80 | fn create_text_node(text: impl Into) -> Self 81 | where 82 | Self: Sized; 83 | } 84 | 85 | /// Convenience helper for constructing nodes. 
Use like: 86 | /// Handle::create_element(local_name!("a")) 87 | /// .attribute(&local_name!("href"), "/") 88 | /// .text("Home") 89 | /// .build() 90 | pub struct ElementBuilder { 91 | element: T, 92 | } 93 | 94 | impl ElementBuilder { 95 | pub fn attribute(self, name: &LocalName, value: impl Into) -> Self { 96 | self.element 97 | .set_attribute(&QualName::new(None, ns!(), name.clone()), value.into()); 98 | self 99 | } 100 | 101 | pub fn children(self, children: impl Iterator) -> Self { 102 | self.element.append_children(children); 103 | self 104 | } 105 | 106 | pub fn child(self, child: T) -> Self { 107 | self.children(std::iter::once(child)) 108 | } 109 | 110 | pub fn text(self, text: impl Into) -> Self { 111 | self.child(::create_text_node(text)) 112 | } 113 | 114 | pub fn build(self) -> T { 115 | self.element 116 | } 117 | } 118 | 119 | /// Recursively visits every DOM node (preorder). Template contents are visited 120 | /// after children, but there are seldom both. 121 | pub fn scan_dom(handle: &Handle, f: &mut F) { 122 | f(handle); 123 | 124 | for child in handle.children.borrow().iter() { 125 | scan_dom(child, f); 126 | } 127 | 128 | if let NodeData::Element { 129 | template_contents: ref tc, 130 | .. 131 | } = handle.data 132 | { 133 | if let Some(ref tc_handle) = *tc.borrow() { 134 | scan_dom(tc_handle, f); 135 | } 136 | } 137 | } 138 | 139 | /// Given a
element, find the corresponding
elements. 140 | /// 141 | /// This is more subtle than you might immediately think, because there can be 142 | /// multiple
listing various terms with one or more common
143 | /// definitions. We need to find the
in the child list, and then skip it 144 | /// and any other
, before providing the
that follow. 145 | pub fn dt_descriptions(dt: &Handle) -> Vec { 146 | assert!(dt.is_html_element(&local_name!("dt"))); 147 | if let Some(ref dl) = dt 148 | .parent_node() 149 | .filter(|n| n.is_html_element(&local_name!("dl"))) 150 | { 151 | dl.children 152 | .borrow() 153 | .iter() 154 | .filter(|n| n.is_element()) 155 | .skip_while(|n| !Rc::ptr_eq(n, dt)) 156 | .skip_while(|n| n.is_html_element(&local_name!("dt"))) 157 | .take_while(|n| n.is_html_element(&local_name!("dd"))) 158 | .cloned() 159 | .collect() 160 | } else { 161 | Vec::new() 162 | } 163 | } 164 | 165 | /// Returns the heading level (from 1 to 6) that the

through

declares, or None for all other nodes. 166 | pub fn heading_level(node: &Handle) -> Option { 167 | let local = match node.data { 168 | NodeData::Element { ref name, .. } if name.ns == ns!(html) => &name.local, 169 | _ => return None, 170 | }; 171 | match *local { 172 | local_name!("h1") => Some(1), 173 | local_name!("h2") => Some(2), 174 | local_name!("h3") => Some(3), 175 | local_name!("h4") => Some(4), 176 | local_name!("h5") => Some(5), 177 | local_name!("h6") => Some(6), 178 | _ => None, 179 | } 180 | } 181 | 182 | impl NodeHandleExt for Handle { 183 | fn parent_node(&self) -> Option { 184 | let weak_parent = self.parent.take()?; 185 | let parent = weak_parent.upgrade().expect("dangling parent"); 186 | self.parent.set(Some(weak_parent)); 187 | Some(parent) 188 | } 189 | 190 | fn get_attribute(&self, name: &QualName) -> Option { 191 | let attrs = match self.data { 192 | NodeData::Element { ref attrs, .. } => attrs.borrow(), 193 | _ => return None, 194 | }; 195 | attrs 196 | .iter() 197 | .find(|a| &a.name == name) 198 | .map(|a| a.value.clone()) 199 | } 200 | 201 | fn set_attribute(&self, name: &QualName, value: StrTendril) { 202 | let mut attrs = match self.data { 203 | NodeData::Element { ref attrs, .. } => attrs.borrow_mut(), 204 | _ => panic!("not an element"), 205 | }; 206 | if let Some(attr) = attrs.iter_mut().find(|a| &a.name == name) { 207 | attr.value = value; 208 | } else { 209 | attrs.push(Attribute { 210 | name: name.clone(), 211 | value, 212 | }); 213 | } 214 | } 215 | 216 | fn is_element(&self) -> bool { 217 | matches!(&self.data, NodeData::Element { .. }) 218 | } 219 | 220 | fn is_html_element(&self, tag_name: &LocalName) -> bool { 221 | match &self.data { 222 | NodeData::Element { 223 | name: 224 | QualName { 225 | ns: ns!(html), 226 | ref local, 227 | .. 228 | }, 229 | .. 
230 | } => local == tag_name, 231 | _ => false, 232 | } 233 | } 234 | 235 | fn has_class(&self, class: &str) -> bool { 236 | const CLASS: QualName = QualName { 237 | prefix: None, 238 | ns: ns!(), 239 | local: local_name!("class"), 240 | }; 241 | self.get_attribute(&CLASS) 242 | .map_or(false, |v| v.split_ascii_whitespace().any(|c| c == class)) 243 | } 244 | 245 | fn node_text(&self) -> Option { 246 | match &self.data { 247 | NodeData::Text { ref contents } => Some(contents.borrow().clone()), 248 | _ => None, 249 | } 250 | } 251 | 252 | fn text_content(&self) -> StrTendril { 253 | let mut text = StrTendril::new(); 254 | scan_dom(self, &mut |n| { 255 | if let NodeData::Text { ref contents } = &n.data { 256 | text.push_tendril(&contents.borrow()); 257 | } 258 | }); 259 | text 260 | } 261 | 262 | fn any_child(&self, f: impl Fn(&Handle) -> bool) -> bool { 263 | self.children.borrow().iter().any(f) 264 | } 265 | 266 | fn append_children(&self, children: impl Iterator) { 267 | self.children.borrow_mut().extend(children.inspect(|c| { 268 | let old_parent = c.parent.replace(Some(Rc::downgrade(self))); 269 | assert!(old_parent.is_none()); 270 | })); 271 | } 272 | 273 | fn insert_children_before(&self, existing: &Handle, new: impl Iterator) { 274 | let mut children = self.children.borrow_mut(); 275 | let i = children 276 | .iter() 277 | .position(|c| Rc::ptr_eq(c, existing)) 278 | .expect("corrupt child list"); 279 | children.splice( 280 | i..i, 281 | new.inspect(|c| { 282 | let old_parent = c.parent.replace(Some(Rc::downgrade(self))); 283 | assert!(old_parent.is_none()); 284 | }), 285 | ); 286 | } 287 | 288 | fn replace_with(&self, replacements: Vec) { 289 | let parent = match self.parent.take() { 290 | Some(n) => n.upgrade().expect("dangling parent"), 291 | _ => return, 292 | }; 293 | for new_child in replacements.iter() { 294 | new_child.parent.replace(Some(Rc::downgrade(&parent))); 295 | } 296 | let mut children = parent.children.borrow_mut(); 297 | let i = children 298 
| .iter() 299 | .position(|c| Rc::ptr_eq(c, self)) 300 | .expect("corrupt child list"); 301 | children.splice(i..=i, replacements); 302 | self.parent.take(); 303 | } 304 | 305 | fn deep_clone(&self) -> Handle { 306 | use NodeData::*; 307 | let new_node_data = match &self.data { 308 | Document => Document, 309 | Doctype { 310 | name, 311 | public_id, 312 | system_id, 313 | } => Doctype { 314 | name: name.clone(), 315 | public_id: public_id.clone(), 316 | system_id: system_id.clone(), 317 | }, 318 | Text { contents } => Text { 319 | contents: contents.clone(), 320 | }, 321 | Comment { contents } => Comment { 322 | contents: contents.clone(), 323 | }, 324 | Element { 325 | name, 326 | attrs, 327 | template_contents, 328 | mathml_annotation_xml_integration_point, 329 | } => Element { 330 | name: name.clone(), 331 | attrs: attrs.clone(), 332 | template_contents: RefCell::new( 333 | template_contents 334 | .borrow() 335 | .as_ref() 336 | .map(|tc| tc.deep_clone()), 337 | ), 338 | mathml_annotation_xml_integration_point: *mathml_annotation_xml_integration_point, 339 | }, 340 | ProcessingInstruction { target, contents } => ProcessingInstruction { 341 | target: target.clone(), 342 | contents: contents.clone(), 343 | }, 344 | }; 345 | let node = Node::new(new_node_data); 346 | let mut children = node.children.borrow_mut(); 347 | *children = self 348 | .children 349 | .borrow() 350 | .iter() 351 | .map(|c| c.deep_clone()) 352 | .collect(); 353 | for child in children.iter_mut() { 354 | let old_parent = child.parent.replace(Some(Rc::downgrade(&node))); 355 | assert!(old_parent.is_none()); 356 | } 357 | drop(children); 358 | node 359 | } 360 | 361 | fn create_element(name: LocalName) -> ElementBuilder { 362 | let new_node_data = NodeData::Element { 363 | name: QualName::new(None, ns!(html), name), 364 | attrs: RefCell::new(Vec::new()), 365 | template_contents: RefCell::new(None), 366 | mathml_annotation_xml_integration_point: false, 367 | }; 368 | ElementBuilder { 369 | 
element: Node::new(new_node_data), 370 | } 371 | } 372 | 373 | fn create_text_node(text: impl Into) -> Handle { 374 | let new_node_data = NodeData::Text { 375 | contents: RefCell::new(text.into()), 376 | }; 377 | Node::new(new_node_data) 378 | } 379 | } 380 | -------------------------------------------------------------------------------- /src/interface_index.rs: -------------------------------------------------------------------------------- 1 | //! Generates an index of WebIDL interfaces. 2 | //! This index is inserted where "INSERT INTERFACES HERE" appears. 3 | 4 | use std::collections::BTreeMap; 5 | use std::io; 6 | 7 | use html5ever::tendril::StrTendril; 8 | use html5ever::{local_name, namespace_url, ns, QualName}; 9 | use markup5ever_rcdom::Handle; 10 | 11 | use crate::dom_utils::NodeHandleExt; 12 | 13 | #[derive(Default, Debug)] 14 | struct InterfaceInfo { 15 | /// Number of times the interface definition was seen. Should be one. 16 | /// We store other numbers for convenience in error handling and reporting. 17 | seen: u32, 18 | 19 | /// The IDs of the partial interfaces, in the order they appear in the document. 20 | partials: Vec, 21 | 22 | /// Set to true if a partial is missing its ID. 23 | has_partial_with_no_id: bool, 24 | } 25 | 26 | pub struct Processor { 27 | /// The interfaces encountered, keyed and sorted by name. 28 | interfaces: BTreeMap, 29 | 30 | /// The text nodes which contains the text "INSERT INTERFACES HERE". 31 | marker_nodes: Vec, 32 | } 33 | 34 | /// The string which marks where the index belongs. Ideally this would be a node 35 | /// and not plain text. 
const MARKER: &str = "INSERT INTERFACES HERE";

impl Processor {
    /// Creates a processor with no interfaces or marker nodes recorded yet.
    pub fn new() -> Self {
        Processor {
            interfaces: BTreeMap::new(),
            marker_nodes: Vec::new(),
        }
    }

    /// Visits a single DOM node, recording two things: interface definitions
    /// found inside IDL code blocks, and any text node containing `MARKER`
    /// (where `apply` will later insert the index).
    pub fn visit(&mut self, node: &Handle) {
        // Qualified name of the `id` attribute; partial-interface IDs are read
        // from it so that `apply` can link to them.
        const ID: QualName = QualName {
            prefix: None,
            ns: ns!(),
            local: local_name!("id"),
        };
        // We're looking for an idl-classed code element inside a pre, to find
        // potential interfaces defined there.
        //
        // One surprise here -- there is an "interface Example" that is not defined
        // according to Wattsi. It yells about this not being defined, and the
        // prior Perl preprocessing actually requires the element have no
        // attributes.
        // (NOTE(review): literal HTML tag names in the comment above were lost
        // during extraction; wording reconstructed from the checks below --
        // confirm against the original source.)
        if node.is_html_element(&local_name!("code"))
            && node.has_class("idl")
            && node.parent_node().map_or(false, |p| {
                p.is_html_element(&local_name!("pre")) && !p.has_class("extract")
            })
        {
            let borrowed_children = node.children.borrow();
            // Scan adjacent sibling pairs: a text node ending in "interface "
            // followed by the element that names the interface.
            for window in borrowed_children.windows(2) {
                // The "partial interface " arm must be tested first: any text
                // ending in "partial interface " also ends in "interface ".
                let is_partial = match window[0].node_text() {
                    Some(a) if a.ends_with("partial interface ") => true,
                    Some(a) if a.ends_with("interface ") => false,
                    _ => continue,
                };
                // These definitions must appear as a span, dfn or a element.
                if !window[1].is_html_element(&local_name!("span"))
                    && !window[1].is_html_element(&local_name!("dfn"))
                    && !window[1].is_html_element(&local_name!("a"))
                {
                    continue;
                }
                let name = window[1].text_content();
                let info = self.interfaces.entry(name).or_default();
                if is_partial {
                    if let Some(id) = window[1].get_attribute(&ID) {
                        info.partials.push(id);
                    } else {
                        // NOTE(review): this flag is set but never consulted by
                        // `apply` in this module; confirm whether a missing-ID
                        // error was meant to be reported.
                        info.has_partial_with_no_id = true;
                    }
                } else {
                    // A full (non-partial) definition; `apply` rejects
                    // interfaces where this count exceeds one.
                    info.seen += 1;
                }
            }
        }

        // Remember every text node containing the marker; `apply` later
        // requires that exactly one exists.
        if node.node_text().map_or(false, |t| t.contains(MARKER)) {
            self.marker_nodes.push(node.clone());
        }
    }
 97 | 
    /// Inserts the interface index where the `MARKER` text appears, splitting
    /// the marker's text node around the generated list.
    ///
    /// # Errors
    ///
    /// Returns `InvalidData` if the marker is absent or appears more than
    /// once, or if any interface has more than one full (non-partial)
    /// definition.
    pub fn apply(self) -> io::Result<()> {
        // Exactly one insertion marker is required: zero almost certainly
        // means the author forgot the insertion point, and more than one is
        // rejected as ambiguous.
        // (NOTE(review): the previous comment here claimed more than one
        // marker is supported, but the length check below rejects that case.)
        if self.marker_nodes.is_empty() {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("Marker {MARKER:?} not found."),
            ));
        }
        if self.marker_nodes.len() > 1 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!(
                    "{MARKER:?} found {} times, expected just one.",
                    self.marker_nodes.len()
                ),
            ));
        }
        // Given the two checks above, this loop body runs exactly once.
        for marker in self.marker_nodes {
            // We need to find where the marker appears in the text so that we
            // can split it into two text nodes.
            let text = marker.node_text().expect("should still be a text node");
            // Tendril offsets and lengths are u32 (see `len32`/`subtendril`
            // below), hence the conversions from usize.
            let position: u32 = match text.find(MARKER) {
                None => {
                    return Err(io::Error::new(
                        io::ErrorKind::InvalidData,
                        format!("Marker {MARKER:?} not found (but was during first pass)."),
                    ));
                }
                Some(p) => p.try_into().unwrap(),
            };
            // NOTE(review): the generic argument of `TryInto` (presumably
            // `<u32>`) was lost during extraction; the line should likely read
            // `TryInto::<u32>::try_into(MARKER.len())`. Left as-is here.
            let end_position: u32 = position + TryInto::::try_into(MARKER.len()).unwrap();
            // `subtendril(offset, length)`: the text before and after the marker.
            let before = text.subtendril(0, position);
            let after = text.subtendril(end_position, text.len32() - end_position);

            // Then, we need to construct a list of interfaces and their partial interfaces.
            let mut ul =
                Handle::create_element(local_name!("ul")).attribute(&local_name!("class"), "brief");
            // `interfaces` is a BTreeMap, so the index comes out sorted by
            // interface name. Entries with `seen == 0` (partials only) are
            // still listed; only duplicate full definitions are an error.
            for (name, info) in &self.interfaces {
                if info.seen > 1 {
                    return Err(io::Error::new(
                        io::ErrorKind::InvalidData,
                        format!("Interface {name} defined {} times.", info.seen),
                    ));
                }
                // Builds a link (an `a` element with `href="#id"`) to a partial.
                fn make_link(id: &str, text: &str) -> Handle {
                    Handle::create_element(local_name!("a"))
                        .attribute(&local_name!("href"), format!("#{id}"))
                        .text(text)
                        .build()
                }
                let mut li = Handle::create_element(local_name!("li")).child(
                    Handle::create_element(local_name!("code"))
                        .text(name.clone())
                        .build(),
                );
                // A sole partial is labeled "partial"; multiple partials are
                // numbered "partial 1", "2", "3", ...
                match &info.partials[..] {
                    [] => (),
                    [sole_partial] => {
                        li = li.text(", ").child(make_link(sole_partial, "partial"));
                    }
                    [first, rest @ ..] => {
                        li = li.text(", ").child(make_link(first, "partial 1"));
                        for (i, p) in rest.iter().enumerate() {
                            li = li.text(" ").child(make_link(p, &(i + 2).to_string()));
                        }
                    }
                }
                ul = ul.child(li.build());
            }

            // Finally, we replace the marker's text node with the combination of the two.
            marker.replace_with(vec![
                Handle::create_text_node(before),
                ul.build(),
                Handle::create_text_node(after),
            ]);
        }
        Ok(())
    }
179 | }
180 | 
181 | #[cfg(test)]
182 | mod tests {
183 |     use super::*;
184 |     use crate::dom_utils;
185 |     use crate::parser::{parse_document_async, tests::serialize_for_test};
186 | 
187 |     #[tokio::test]
188 |     async fn test_two_interfaces_in_one_block() -> io::Result<()> {
189 |         let document = parse_document_async(
190 |             r#"
191 | 
192 | 

193 | interface HTMLMarqueeElement { ... }
194 | interface HTMLBlinkElement { ... }
195 | 
196 | INSERT INTERFACES HERE 197 | "# 198 | .trim() 199 | .as_bytes(), 200 | ) 201 | .await?; 202 | let mut proc = Processor::new(); 203 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 204 | proc.apply()?; 205 | assert_eq!( 206 | serialize_for_test(&[document]), 207 | r#" 208 |

209 | interface HTMLMarqueeElement { ... }
210 | interface HTMLBlinkElement { ... }
211 | 
212 |
  • HTMLBlinkElement
  • HTMLMarqueeElement
213 | "#.trim()); 214 | Ok(()) 215 | } 216 | 217 | #[tokio::test] 218 | async fn test_two_interfaces_in_separate_blocks() -> io::Result<()> { 219 | let document = parse_document_async( 220 | r#" 221 | 222 |

223 | interface HTMLMarqueeElement { ... }
224 | 
225 |

226 | interface HTMLBlinkElement { ... }
227 | 
228 | INSERT INTERFACES HERE 229 | "# 230 | .trim() 231 | .as_bytes(), 232 | ) 233 | .await?; 234 | let mut proc = Processor::new(); 235 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 236 | proc.apply()?; 237 | assert_eq!( 238 | serialize_for_test(&[document]), 239 | r#" 240 |

241 | interface HTMLMarqueeElement { ... }
242 | 
243 |

244 | interface HTMLBlinkElement { ... }
245 | 
246 |
  • HTMLBlinkElement
  • HTMLMarqueeElement
247 | "#.trim()); 248 | Ok(()) 249 | } 250 | 251 | #[tokio::test] 252 | async fn interface_with_partial() -> io::Result<()> { 253 | let document = parse_document_async( 254 | r#" 255 | 256 |

257 | interface HTMLMarqueeElement { ... }
258 | 
259 |

260 | partial interface HTMLMarqueeElement { ... }
261 | 
262 | INSERT INTERFACES HERE 263 | "# 264 | .trim() 265 | .as_bytes(), 266 | ) 267 | .await?; 268 | let mut proc = Processor::new(); 269 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 270 | proc.apply()?; 271 | assert_eq!( 272 | serialize_for_test(&[document]), 273 | r##" 274 |

275 | interface HTMLMarqueeElement { ... }
276 | 
277 |

278 | partial interface HTMLMarqueeElement { ... }
279 | 
280 |
281 | "##.trim()); 282 | Ok(()) 283 | } 284 | 285 | #[tokio::test] 286 | async fn interface_with_two_partials() -> io::Result<()> { 287 | let document = parse_document_async( 288 | r#" 289 | 290 |

291 | interface HTMLMarqueeElement { ... }
292 | partial interface HTMLMarqueeElement { ... }
293 | partial interface HTMLMarqueeElement { ... }
294 | 
295 | INSERT INTERFACES HERE 296 | "# 297 | .trim() 298 | .as_bytes(), 299 | ) 300 | .await?; 301 | let mut proc = Processor::new(); 302 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 303 | proc.apply()?; 304 | assert_eq!( 305 | serialize_for_test(&[document]), 306 | r##" 307 |

308 | interface HTMLMarqueeElement { ... }
309 | partial interface HTMLMarqueeElement { ... }
310 | partial interface HTMLMarqueeElement { ... }
311 | 
312 | 313 | "##.trim()); 314 | Ok(()) 315 | } 316 | 317 | #[tokio::test] 318 | async fn only_partials() -> io::Result<()> { 319 | let document = parse_document_async( 320 | r#" 321 | 322 |

323 | partial interface HTMLMarqueeElement { ... }
324 | partial interface HTMLMarqueeElement { ... }
325 | 
326 | INSERT INTERFACES HERE 327 | "# 328 | .trim() 329 | .as_bytes(), 330 | ) 331 | .await?; 332 | let mut proc = Processor::new(); 333 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 334 | proc.apply()?; 335 | assert_eq!( 336 | serialize_for_test(&[document]), 337 | r##" 338 |

339 | partial interface HTMLMarqueeElement { ... }
340 | partial interface HTMLMarqueeElement { ... }
341 | 
342 | 343 | "##.trim()); 344 | Ok(()) 345 | } 346 | 347 | #[tokio::test] 348 | async fn marker_before() -> io::Result<()> { 349 | let document = parse_document_async( 350 | r#" 351 | 352 | INSERT INTERFACES HERE 353 |

354 | interface HTMLMarqueeElement { ... }
355 | 
356 | "# 357 | .trim() 358 | .as_bytes(), 359 | ) 360 | .await?; 361 | let mut proc = Processor::new(); 362 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 363 | proc.apply()?; 364 | assert_eq!( 365 | serialize_for_test(&[document]), 366 | r#" 367 |
  • HTMLMarqueeElement
368 |

369 | interface HTMLMarqueeElement { ... }
370 | 
371 | "# 372 | .trim() 373 | ); 374 | Ok(()) 375 | } 376 | 377 | #[tokio::test] 378 | async fn no_marker() -> io::Result<()> { 379 | let document = parse_document_async("".as_bytes()).await?; 380 | let mut proc = Processor::new(); 381 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 382 | let result = proc.apply(); 383 | assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); 384 | Ok(()) 385 | } 386 | 387 | #[tokio::test] 388 | async fn duplicate_marker() -> io::Result<()> { 389 | let document = parse_document_async( 390 | "
INSERT INTERFACES HERE
INSERT INTERFACES HERE
" 391 | .as_bytes(), 392 | ) 393 | .await?; 394 | let mut proc = Processor::new(); 395 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 396 | let result = proc.apply(); 397 | assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); 398 | Ok(()) 399 | } 400 | 401 | #[tokio::test] 402 | async fn duplicate_dfn() -> io::Result<()> { 403 | let document = parse_document_async( 404 | r#" 405 | 406 |

407 | interface HTMLMarqueeElement { ... }
408 | interface HTMLMarqueeElement { ... }
409 | 
410 | "# 411 | .as_bytes(), 412 | ) 413 | .await?; 414 | let mut proc = Processor::new(); 415 | dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); 416 | let result = proc.apply(); 417 | assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); 418 | Ok(()) 419 | } 420 | } 421 | -------------------------------------------------------------------------------- /src/io_utils.rs: -------------------------------------------------------------------------------- 1 | //! Misccellaneous utilities for I/O. 2 | 3 | use std::io; 4 | use std::path::Path; 5 | 6 | use html5ever::tendril::{self, SendTendril, StrTendril}; 7 | use tokio::task::JoinHandle; 8 | 9 | type SendStrTendril = SendTendril; 10 | 11 | /// Check that a path is safe to open, even if the source is potentially untrusted. 12 | pub fn is_safe_path(path: impl AsRef) -> bool { 13 | use std::path::Component; 14 | path.as_ref() 15 | .components() 16 | .all(|c| matches!(c, Component::Normal(_) | Component::CurDir)) 17 | } 18 | 19 | /// In a spawned task, read to a string, then move it to a tendril. 20 | pub fn read_to_str_tendril(path: impl AsRef) -> JoinHandle> { 21 | let path = path.as_ref().to_owned(); 22 | tokio::spawn(async move { 23 | let string = tokio::fs::read_to_string(path).await?; 24 | Ok(StrTendril::from(string).into_send()) 25 | }) 26 | } 27 | 28 | /// Creates a join Handle for an error. Useful when an operation will fail, but 29 | /// it's more convenient to handle later on. 
30 | pub fn async_error(err: io::Error) -> JoinHandle> { 31 | tokio::spawn(async move { Err(err) }) 32 | } 33 | 34 | #[cfg(test)] 35 | mod tests { 36 | use super::*; 37 | use tempfile::TempDir; 38 | 39 | #[test] 40 | fn test_is_safe_path() { 41 | assert!(is_safe_path("a.txt")); 42 | assert!(is_safe_path("a/b.txt")); 43 | assert!(is_safe_path("a/b/c/./d.txt")); 44 | assert!(!is_safe_path("../parent.txt")); 45 | assert!(!is_safe_path("/etc/passwd")); 46 | } 47 | 48 | #[tokio::test] 49 | async fn test_read_to_str_tendril() -> io::Result<()> { 50 | let temp_dir = TempDir::new()?; 51 | let file_path = temp_dir.path().join("a.txt"); 52 | tokio::fs::write(&file_path, "Hello, world!").await?; 53 | let send_tendril = read_to_str_tendril(&file_path).await??; 54 | assert_eq!(StrTendril::from(send_tendril).as_ref(), "Hello, world!"); 55 | Ok(()) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use html5ever::serialize::{serialize, SerializeOpts}; 2 | use std::borrow::Cow; 3 | use std::default::Default; 4 | use std::env; 5 | use std::ffi::OsStr; 6 | use std::io::{self, BufWriter}; 7 | use std::path::{Path, PathBuf}; 8 | 9 | use markup5ever_rcdom::SerializableHandle; 10 | 11 | mod annotate_attributes; 12 | mod boilerplate; 13 | mod dom_utils; 14 | mod interface_index; 15 | mod io_utils; 16 | mod parser; 17 | mod rcdom_with_line_numbers; 18 | mod represents; 19 | mod tag_omission; 20 | 21 | #[tokio::main] 22 | async fn main() -> io::Result<()> { 23 | // This gives slightly prettier error-printing. 24 | if let Err(e) = run().await { 25 | eprintln!("{}", e); 26 | std::process::exit(1); 27 | } 28 | Ok(()) 29 | } 30 | 31 | async fn run() -> io::Result<()> { 32 | // Since we're using Rc in the DOM implementation, we must ensure that tasks 33 | // which act on it are confined to this thread. 34 | 35 | // Find the paths we need. 
36 | let cache_dir = path_from_env("HTML_CACHE", ".cache"); 37 | let source_dir = path_from_env("HTML_SOURCE", "../html"); 38 | 39 | // Because parsing can jump around the tree a little, it's most reasonable 40 | // to just parse the whole document before doing any processing. Even for 41 | // the HTML standard, this doesn't take too long. 42 | let document = parser::parse_document_async(tokio::io::stdin()).await?; 43 | 44 | let mut boilerplate = boilerplate::Processor::new(cache_dir.clone(), source_dir.join("demos")); 45 | let mut represents = represents::Processor::new(); 46 | let mut annotate_attributes = annotate_attributes::Processor::new(); 47 | let mut tag_omission = tag_omission::Processor::new(); 48 | let mut interface_index = interface_index::Processor::new(); 49 | 50 | // We do exactly one pass to identify the changes that need to be made. 51 | dom_utils::scan_dom(&document, &mut |h| { 52 | boilerplate.visit(h); 53 | represents.visit(h); 54 | annotate_attributes.visit(h); 55 | tag_omission.visit(h); 56 | interface_index.visit(h); 57 | }); 58 | 59 | // And then we apply all of the changes. These different processors mostly 60 | // apply quite local changes, so hopefully we never have to deal with 61 | // conflicts between them. 62 | boilerplate.apply().await?; 63 | represents.apply()?; 64 | annotate_attributes.apply().await?; 65 | tag_omission.apply()?; 66 | interface_index.apply()?; 67 | 68 | // Finally, we write the result to standard out. 
69 | let serializable: SerializableHandle = document.into(); 70 | serialize( 71 | &mut BufWriter::with_capacity(128 * 1024, io::stdout()), 72 | &serializable, 73 | SerializeOpts::default(), 74 | )?; 75 | Ok(()) 76 | } 77 | 78 | fn path_from_env<'a, V, D>(var: &V, default: &'a D) -> Cow<'a, Path> 79 | where 80 | V: AsRef + ?Sized, 81 | D: AsRef + ?Sized, 82 | { 83 | match env::var_os(var) { 84 | Some(p) => PathBuf::from(p).into(), 85 | None => default.as_ref().into(), 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/parser.rs: -------------------------------------------------------------------------------- 1 | //! This module provides some mild integration between the html5ever parser and async I/O. 2 | 3 | use std::io; 4 | 5 | use html5ever::driver::{self, ParseOpts, Parser}; 6 | use html5ever::tendril::{ByteTendril, TendrilSink}; 7 | use html5ever::tokenizer::TokenizerOpts; 8 | use html5ever::tree_builder::TreeBuilderOpts; 9 | use markup5ever_rcdom::Handle; 10 | use tokio::io::{AsyncRead, AsyncReadExt}; 11 | 12 | use crate::rcdom_with_line_numbers::RcDomWithLineNumbers; 13 | 14 | async fn parse_internal_async( 15 | parser: Parser, 16 | mut r: R, 17 | ) -> io::Result { 18 | let mut tendril_sink = parser.from_utf8(); 19 | 20 | // This draws on the structure of the sync tendril read_from. 
21 | // https://docs.rs/tendril/latest/tendril/stream/trait.TendrilSink.html#method.read_from 22 | const BUFFER_SIZE: u32 = 128 * 1024; 23 | 'read: loop { 24 | let mut tendril = ByteTendril::new(); 25 | unsafe { 26 | tendril.push_uninitialized(BUFFER_SIZE); 27 | } 28 | loop { 29 | match r.read(&mut tendril).await { 30 | Ok(0) => break 'read, 31 | Ok(n) => { 32 | tendril.pop_back(BUFFER_SIZE - n as u32); 33 | tendril_sink.process(tendril); 34 | break; 35 | } 36 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 37 | Err(e) => Err(e)?, 38 | } 39 | } 40 | } 41 | let dom = tendril_sink.finish(); 42 | Ok(dom) 43 | } 44 | 45 | pub async fn parse_fragment_async( 46 | r: R, 47 | context: &Handle, 48 | ) -> io::Result> { 49 | let parser = driver::parse_fragment_for_element( 50 | RcDomWithLineNumbers::default(), 51 | create_error_opts(), 52 | context.clone(), 53 | None, 54 | ); 55 | 56 | let dom = parse_internal_async(parser, r).await?; 57 | dom.create_error_from_parse_errors()?; 58 | 59 | let document = dom.document(); 60 | let mut new_children = document.children.take()[0].children.take(); 61 | for new_child in new_children.iter_mut() { 62 | new_child.parent.take(); 63 | } 64 | Ok(new_children) 65 | } 66 | 67 | pub async fn parse_document_async(r: R) -> io::Result { 68 | let parser = driver::parse_document(RcDomWithLineNumbers::default(), create_error_opts()); 69 | let dom = parse_internal_async(parser, r).await?; 70 | dom.create_error_from_parse_errors()?; 71 | 72 | Ok(dom.document().clone()) 73 | } 74 | 75 | fn create_error_opts() -> ParseOpts { 76 | ParseOpts { 77 | tokenizer: TokenizerOpts { 78 | exact_errors: true, 79 | ..Default::default() 80 | }, 81 | tree_builder: TreeBuilderOpts { 82 | exact_errors: true, 83 | ..Default::default() 84 | }, 85 | } 86 | } 87 | 88 | #[cfg(test)] 89 | pub(crate) mod tests { 90 | use super::*; 91 | use crate::dom_utils::NodeHandleExt; 92 | use html5ever::serialize::{SerializeOpts, TraversalScope}; 93 | use 
html5ever::{local_name, serialize}; 94 | use markup5ever_rcdom::{NodeData, SerializableHandle}; 95 | 96 | pub(crate) fn serialize_for_test(nodes: &[Handle]) -> String { 97 | let mut output = vec![]; 98 | for node in nodes { 99 | let traversal_scope = match node.data { 100 | NodeData::Document => TraversalScope::ChildrenOnly(None), 101 | _ => TraversalScope::IncludeNode, 102 | }; 103 | serialize( 104 | &mut output, 105 | &SerializableHandle::from(node.clone()), 106 | SerializeOpts { 107 | traversal_scope, 108 | ..Default::default() 109 | }, 110 | ) 111 | .unwrap(); 112 | } 113 | String::from_utf8(output).unwrap() 114 | } 115 | 116 | #[tokio::test] 117 | async fn test_fragment_respects_context() -> io::Result<()> { 118 | // Checks that we have the appropriate insertion mode for the element 119 | // we're in. This is important because of the special rules 120 | // surrounding, e.g., tables. If you change this to use the body as context, 121 | // no element at all is emitted. 122 | let document = parse_document_async("
".as_bytes()).await?; 123 | let body = document.children.borrow()[1].children.borrow()[1].clone(); 124 | assert!(body.is_html_element(&local_name!("body"))); 125 | let table = body.children.borrow()[0].clone(); 126 | assert!(table.is_html_element(&local_name!("table"))); 127 | let children = parse_fragment_async("