├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── build.zig ├── build.zig.zon ├── example.zig ├── rem.zig ├── source ├── Dom.zig ├── Parser.zig ├── Tokenizer.zig ├── dom │ ├── mutation.zig │ └── node.zig ├── named_characters.zig ├── token.zig ├── tree_construction.zig └── util.zig ├── test ├── html5lib-test-tokenizer.zig └── html5lib-test-tree-construction.zig └── tools ├── README.md ├── character_reference_data.json └── generate_named_characters.zig /.gitignore: -------------------------------------------------------------------------------- 1 | zig-cache/ 2 | zig-out/ 3 | .zig-cache/ 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "test/html5lib-tests"] 2 | path = test/html5lib-tests 3 | url = https://github.com/chadwain/html5lib-tests.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. 
You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. 
The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 
91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 
122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 
155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 
186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. 
This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 675 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rem 2 | rem is an HTML5 parser written in [Zig](https://ziglang.org). 
3 | 4 | ## About 5 | ### Features 6 | - [x] An HTML5 parser consisting of a tokenizer (complete) and a tree constructor (works "well enough") 7 | - [x] A minimal DOM implementation 8 | - [x] HTML fragment parsing 9 | - [x] Tested by [html5lib-tests](https://github.com/chadwain/html5lib-tests) 10 | 11 | ### Things to be improved 12 | - [ ] Better DOM functionality 13 | - [ ] Support for more character encodings 14 | - [ ] Support for JavaScript 15 | 16 | ### Why create this? 17 | * To understand what it takes to "implement" HTML, even if just a small portion of it. As I discovered, even just trying to parse an HTML file _correctly_ can be quite challenging. 18 | * To learn more about web standards in general. Reading the HTML spec naturally causes (or rather, forces) one to learn about DOM (especially), SVG, CSS, and many others. 19 | * For use in other projects, and to be useful to others. 20 | 21 | ### Lastly... 22 | rem is still a work in progress. Not all the features of a fully-capable HTML5 parser are implemented. 23 | 24 | ## Get the code 25 | Clone the repository like this: 26 | ``` 27 | git clone --recursive --config core.autocrlf=false https://github.com/chadwain/rem.git 28 | ``` 29 | 30 | **Using the Zig Package Manager** 31 | ``` 32 | zig fetch --save https://github.com/chadwain/rem/archive/refs/heads/master.tar.gz 33 | ``` 34 | 35 | There are no dependencies other than a Zig compiler. Note that this library is only compatible with Zig version 0.14.1 or newer (the `minimum_zig_version` declared in `build.zig.zon`). 36 | 37 | ## Use the code 38 | Here's an example of using the parser. You can see the output of this program by running `zig build example`. 39 | 40 | ```zig 41 | const std = @import("std"); 42 | const rem = @import("rem"); 43 | 44 | pub fn main() !void { 45 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 46 | defer std.debug.assert(gpa.deinit() == .ok); 47 | const allocator = gpa.allocator(); 48 | 49 | // This is the text that will be read by the parser. 
50 | // Since the parser accepts Unicode codepoints, the text must be decoded before it can be used. 51 | const input = "

Your text goes here!

"; 52 | const decoded_input = &rem.util.utf8DecodeStringComptime(input); 53 | 54 | // Create the DOM in which the parsed Document will be created. 55 | var dom = rem.Dom{ .allocator = allocator }; 56 | defer dom.deinit(); 57 | 58 | // Create the HTML parser. 59 | var parser = try rem.Parser.init(&dom, decoded_input, allocator, .report, false); 60 | defer parser.deinit(); 61 | 62 | // This causes the parser to read the input and produce a Document. 63 | try parser.run(); 64 | 65 | // `errors` returns the list of parse errors that were encountered while parsing. 66 | // Since we know that our input was well-formed HTML, we expect there to be 0 parse errors. 67 | const errors = parser.errors(); 68 | std.debug.assert(errors.len == 0); 69 | 70 | // We can now print the resulting Document to the console. 71 | const stdout = std.io.getStdOut().writer(); 72 | const document = parser.getDocument(); 73 | try rem.util.printDocument(stdout, document, &dom, allocator); 74 | } 75 | ``` 76 | 77 | ## Test the code 78 | rem uses [html5lib-tests](https://github.com/html5lib/html5lib-tests) as a test suite. Specifically, it tests against the 'tokenizer' and 'tree-construction' tests from that suite. 79 | 80 | `zig build test-tokenizer` will run the 'tokenizer' tests. 81 | `zig build test-tree-construction` will run the 'tree-construction' tests in 2 ways: with scripting disabled, then with scripting enabled. 82 | The expected results are as follows: 83 | - tokenizer: All tests pass. 84 | - tree-construction (scripting disabled): Some tests are skipped because they rely on HTML features that aren't yet implemented in this library (specifically, templates). All other tests pass. 85 | - tree-construction (scripting enabled): Similar to testing with scripting off, but in addition, some entire test files are skipped because they would cause panics. 
86 | 87 | ## License 88 | ### GPL-3.0-only 89 | Copyright (C) 2021-2023 Chadwain Holness 90 | 91 | rem is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 3. 92 | 93 | This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 94 | 95 | You should have received a copy of the GNU General Public License along with this library. If not, see <https://www.gnu.org/licenses/>. 96 | 97 | ## References 98 | [HTML Parsing Specification](https://html.spec.whatwg.org/multipage/parsing.html) 99 | 100 | [DOM Specification](https://dom.spec.whatwg.org/) 101 | -------------------------------------------------------------------------------- /build.zig: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2021-2024 Chadwain Holness 2 | // 3 | // You should have received a copy of the GNU General Public License 4 | // along with this program. If not, see <https://www.gnu.org/licenses/>. 
5 | 6 | const std = @import("std"); 7 | 8 | pub fn build(b: *std.Build) void { 9 | const optimize = b.standardOptimizeOption(.{}); 10 | const target = b.standardTargetOptions(.{}); 11 | 12 | const rem_lib = b.addStaticLibrary(.{ 13 | .name = "rem", 14 | .root_source_file = b.path("rem.zig"), 15 | .target = target, 16 | .optimize = optimize, 17 | }); 18 | b.installArtifact(rem_lib); 19 | 20 | { 21 | const rem_unit_tests = b.addTest(.{ 22 | .name = "rem-unit-tests", 23 | .root_source_file = b.path("rem.zig"), 24 | .target = target, 25 | .optimize = optimize, 26 | }); 27 | b.installArtifact(rem_unit_tests); 28 | 29 | const rem_unit_tests_run = b.addRunArtifact(rem_unit_tests); 30 | rem_unit_tests_run.step.dependOn(&rem_unit_tests.step); 31 | 32 | const rem_unit_tests_run_step = b.step("test", "Run unit tests"); 33 | rem_unit_tests_run_step.dependOn(&rem_unit_tests_run.step); 34 | } 35 | 36 | const rem_module = b.addModule("rem", .{ .root_source_file = b.path("rem.zig") }); 37 | 38 | { 39 | const html5lib_tokenizer_tests = b.addTest(.{ 40 | .name = "html5lib-tokenizer-tests", 41 | .root_source_file = b.path("test/html5lib-test-tokenizer.zig"), 42 | .target = target, 43 | .optimize = optimize, 44 | }); 45 | html5lib_tokenizer_tests.root_module.addImport("rem", rem_module); 46 | b.installArtifact(html5lib_tokenizer_tests); 47 | 48 | const html5lib_tokenizer_tests_run = b.addRunArtifact(html5lib_tokenizer_tests); 49 | html5lib_tokenizer_tests_run.step.dependOn(&html5lib_tokenizer_tests.step); 50 | 51 | const html5lib_tokenizer_tests_run_step = b.step( 52 | "test-tokenizer", 53 | "Run tokenizer tests from html5lib-tests (requires 0.12.0-dev.91+a155e3585 or newer)", 54 | ); 55 | html5lib_tokenizer_tests_run_step.dependOn(&html5lib_tokenizer_tests_run.step); 56 | } 57 | 58 | { 59 | const html5lib_tree_construction_tests = b.addTest(.{ 60 | .name = "html5lib-tree-construction-tests", 61 | .root_source_file = b.path("test/html5lib-test-tree-construction.zig"), 62 | .target = 
target, 63 | .optimize = optimize, 64 | }); 65 | html5lib_tree_construction_tests.root_module.addImport("rem", rem_module); 66 | b.installArtifact(html5lib_tree_construction_tests); 67 | 68 | const html5lib_tree_construction_tests_run = b.addRunArtifact(html5lib_tree_construction_tests); 69 | html5lib_tree_construction_tests_run.step.dependOn(&html5lib_tree_construction_tests.step); 70 | 71 | const html5lib_tree_construction_tests_run_step = b.step("test-tree-construction", "Run tree construction tests from html5lib-tests"); 72 | html5lib_tree_construction_tests_run_step.dependOn(&html5lib_tree_construction_tests_run.step); 73 | } 74 | 75 | { 76 | const example = b.addExecutable(.{ 77 | .name = "example", 78 | .root_source_file = b.path("./example.zig"), 79 | .target = target, 80 | .optimize = optimize, 81 | }); 82 | example.root_module.addImport("rem", rem_module); 83 | b.installArtifact(example); 84 | 85 | const example_run = b.addRunArtifact(example); 86 | const example_run_step = b.step("example", "Run an example program"); 87 | example_run_step.dependOn(&example_run.step); 88 | } 89 | 90 | { 91 | const json_data = b.pathFromRoot("tools/character_reference_data.json"); 92 | const output_path = b.pathFromRoot("source/named_characters.zig"); 93 | const generate_named_characters = b.addExecutable(.{ 94 | .name = "generate-named-characters", 95 | .root_source_file = b.path("tools/generate_named_characters.zig"), 96 | .target = target, 97 | .optimize = .Debug, 98 | }); 99 | 100 | const generate_named_characters_run = b.addRunArtifact(generate_named_characters); 101 | generate_named_characters_run.addArgs(&.{ json_data, output_path }); 102 | 103 | const generate_named_characters_run_step = b.step("generate-named-characters", "Generate the named character reference data"); 104 | generate_named_characters_run_step.dependOn(&generate_named_characters_run.step); 105 | } 106 | } 107 | -------------------------------------------------------------------------------- 
/build.zig.zon: -------------------------------------------------------------------------------- 1 | .{ 2 | .name = .rem, 3 | .version = "0.3.0", 4 | .minimum_zig_version = "0.14.1", 5 | .fingerprint = 0x83bde72b9431762b, 6 | .paths = .{ 7 | "build.zig", 8 | "rem.zig", 9 | "source", 10 | }, 11 | } 12 | -------------------------------------------------------------------------------- /example.zig: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2021-2024 Chadwain Holness 2 | // 3 | // You should have received a copy of the GNU General Public License 4 | // along with this program. If not, see . 5 | 6 | const std = @import("std"); 7 | const rem = @import("rem"); 8 | 9 | pub fn main() !void { 10 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 11 | defer std.debug.assert(gpa.deinit() == .ok); 12 | const allocator = gpa.allocator(); 13 | 14 | // This is the text that will be read by the parser. 15 | // Since the parser accepts Unicode codepoints, the text must be decoded before it can be used. 16 | const input = "

Your text goes here!

"; 17 | const decoded_input = &rem.util.utf8DecodeStringComptime(input); 18 | 19 | // Create the DOM in which the parsed Document will be created. 20 | var dom = rem.Dom{ .allocator = allocator }; 21 | defer dom.deinit(); 22 | 23 | // Create the HTML parser. 24 | var parser = try rem.Parser.init(&dom, decoded_input, allocator, .report, false); 25 | defer parser.deinit(); 26 | 27 | // This causes the parser to read the input and produce a Document. 28 | try parser.run(); 29 | 30 | // `errors` returns the list of parse errors that were encountered while parsing. 31 | // Since we know that our input was well-formed HTML, we expect there to be 0 parse errors. 32 | const errors = parser.errors(); 33 | std.debug.assert(errors.len == 0); 34 | 35 | // We can now print the resulting Document to the console. 36 | const stdout = std.io.getStdOut().writer(); 37 | const document = parser.getDocument(); 38 | try rem.util.printDocument(stdout, document, &dom, allocator); 39 | } 40 | -------------------------------------------------------------------------------- /rem.zig: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2021-2024 Chadwain Holness 2 | // 3 | // You should have received a copy of the GNU General Public License 4 | // along with this program. If not, see . 
5 | 6 | pub const token = @import("source/token.zig"); 7 | pub const Tokenizer = @import("source/Tokenizer.zig"); 8 | pub const Dom = @import("source/Dom.zig"); 9 | pub const tree_construction = @import("source/tree_construction.zig"); 10 | pub const Parser = @import("source/Parser.zig"); 11 | pub const util = @import("source/util.zig"); 12 | 13 | comptime { 14 | if (@import("builtin").is_test) { 15 | @import("std").testing.refAllDecls(@This()); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /source/Dom.zig: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2021-2024 Chadwain Holness 2 | // 3 | // You should have received a copy of the GNU General Public License 4 | // along with this program. If not, see . 5 | 6 | const Dom = @This(); 7 | 8 | const node = @import("dom/node.zig"); 9 | pub const Document = node.Document; 10 | pub const DocumentType = node.DocumentType; 11 | pub const DocumentFormatter = node.DocumentFormatter; 12 | pub const Namespace = node.Namespace; 13 | pub const ElementType = node.ElementType; 14 | pub const Element = node.Element; 15 | pub const ParentNode = node.ParentNode; 16 | pub const AttributePrefix = node.AttributePrefix; 17 | pub const AttributeNamespace = node.AttributeNamespace; 18 | pub const ElementAttributesKey = node.ElementAttributesKey; 19 | pub const Attribute = node.Attribute; 20 | pub const CharacterDataInterface = node.CharacterDataInterface; 21 | pub const CharacterData = node.CharacterData; 22 | pub const ElementOrCharacterData = node.ElementOrCharacterData; 23 | 24 | pub const mutation = @import("dom/mutation.zig"); 25 | 26 | const std = @import("std"); 27 | const assert = std.debug.assert; 28 | const Allocator = std.mem.Allocator; 29 | const ArrayListUnmanaged = std.ArrayListUnmanaged; 30 | const StaticStringMap = std.StaticStringMap; 31 | const MultiArrayList = std.MultiArrayList; 32 | const AutoHashMapUnmanaged = 
std.AutoHashMapUnmanaged; 33 | const StringHashMapUnmanaged = std.StringHashMapUnmanaged; 34 | 35 | allocator: Allocator, 36 | /// For elements whose local name cannot be determined by looking at its element_type. 37 | /// This does not take precedence over looking at element_type. 38 | local_names: AutoHashMapUnmanaged(*const Element, []const u8) = .{}, 39 | /// Specifically holds MathML annotation-xml elements that are HTML integration points. 40 | /// This does not take precedence if finding if an element is an HTML integration point could be done by other means. 41 | html_integration_points: AutoHashMapUnmanaged(*const Element, void) = .{}, 42 | 43 | all_documents: ArrayListUnmanaged(*Document) = .{}, 44 | all_elements: ArrayListUnmanaged(*Element) = .{}, 45 | all_cdatas: ArrayListUnmanaged(*CharacterData) = .{}, 46 | all_doctypes: ArrayListUnmanaged(*DocumentType) = .{}, 47 | 48 | pub fn deinit(self: *Dom) void { 49 | for (self.all_elements.items) |item| { 50 | item.deinit(self.allocator); 51 | self.allocator.destroy(item); 52 | } 53 | self.all_elements.deinit(self.allocator); 54 | for (self.all_cdatas.items) |item| { 55 | item.deinit(self.allocator); 56 | self.allocator.destroy(item); 57 | } 58 | self.all_cdatas.deinit(self.allocator); 59 | for (self.all_doctypes.items) |item| { 60 | item.deinit(self.allocator); 61 | self.allocator.destroy(item); 62 | } 63 | self.all_doctypes.deinit(self.allocator); 64 | for (self.all_documents.items) |item| { 65 | item.deinit(self.allocator); 66 | self.allocator.destroy(item); 67 | } 68 | self.all_documents.deinit(self.allocator); 69 | 70 | var iterator = self.local_names.valueIterator(); 71 | while (iterator.next()) |local_name| self.allocator.free(local_name.*); 72 | self.local_names.deinit(self.allocator); 73 | 74 | self.html_integration_points.deinit(self.allocator); 75 | } 76 | 77 | pub const Exception = enum { 78 | NotFound, 79 | HierarchyRequest, 80 | }; 81 | 82 | pub fn exception(self: *Dom, ex: Exception) 
error{DomException} { 83 | _ = self; 84 | std.debug.print("DOM Exception raised: {s}\n", .{@tagName(ex)}); 85 | return error.DomException; 86 | } 87 | 88 | /// Creates a new Document node. The returned node is owned by the Dom. 89 | pub fn makeDocument(self: *Dom) !*Document { 90 | const document = try self.allocator.create(Document); 91 | errdefer self.allocator.destroy(document); 92 | try self.all_documents.append(self.allocator, document); 93 | document.* = Document{}; 94 | return document; 95 | } 96 | 97 | /// Creates a new CharacterData node. The returned node is owned by the Dom. If an error is returned, the node is freed and is not registered with the Dom. 98 | pub fn makeCdata(self: *Dom, data: []const u8, interface: CharacterDataInterface) !*CharacterData { 99 | const cdata = try self.allocator.create(CharacterData); 100 | errdefer self.allocator.destroy(cdata); 101 | cdata.* = try CharacterData.init(self.allocator, data, interface); // Initialize before registering, so that `all_cdatas` never holds a pointer that the errdefer above has already destroyed. 102 | self.all_cdatas.append(self.allocator, cdata) catch |err| { cdata.deinit(self.allocator); return err; }; 103 | return cdata; 104 | } 105 | 106 | /// Creates a new DocumentType node. The returned node is owned by the Dom. If an error is returned, the node is freed and is not registered with the Dom. 107 | pub fn makeDoctype(self: *Dom, doctype_name: ?[]const u8, public_identifier: ?[]const u8, system_identifier: ?[]const u8) !*DocumentType { 108 | const doctype = try self.allocator.create(DocumentType); 109 | errdefer self.allocator.destroy(doctype); 110 | doctype.* = try DocumentType.init(self.allocator, doctype_name, public_identifier, system_identifier); // Initialize before registering, so that `all_doctypes` never holds a pointer that the errdefer above has already destroyed. 111 | self.all_doctypes.append(self.allocator, doctype) catch |err| { doctype.deinit(self.allocator); return err; }; 112 | return doctype; 113 | } 114 | 115 | /// Creates a new Element node. The returned node is owned by the Dom. 116 | pub fn makeElement(self: *Dom, element_type: ElementType) !*Element { 117 | // TODO: This function should implement the "create an element" algorithm. 
118 | // https://dom.spec.whatwg.org/#concept-create-element 119 | const element = try self.allocator.create(Element); 120 | errdefer self.allocator.destroy(element); 121 | try self.all_elements.append(self.allocator, element); 122 | element.* = Element{ .element_type = element_type, .attributes = .{}, .parent = null, .children = .{} }; 123 | return element; 124 | } 125 | 126 | pub fn registerLocalName(self: *Dom, element: *const Element, name: []const u8) !void { 127 | const copy = try self.allocator.dupe(u8, name); 128 | errdefer self.allocator.free(copy); 129 | try self.local_names.putNoClobber(self.allocator, element, copy); 130 | } 131 | 132 | pub fn registerHtmlIntegrationPoint(self: *Dom, element: *const Element) !void { 133 | assert(element.element_type == .mathml_annotation_xml); 134 | try self.html_integration_points.putNoClobber(self.allocator, element, {}); 135 | } 136 | 137 | pub fn documentFormatter(self: *const Dom, document: *const Document, allocator: Allocator) DocumentFormatter { 138 | return .{ 139 | .dom = self, 140 | .document = document, 141 | .allocator = allocator, 142 | }; 143 | } 144 | -------------------------------------------------------------------------------- /source/Parser.zig: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2021-2024 Chadwain Holness 2 | // 3 | // You should have received a copy of the GNU General Public License 4 | // along with this program. If not, see . 5 | 6 | //! The Parser wraps the Tokenizer and the TreeConstructor. 7 | //! It handles the execution and the passing of messages between the two objects. 
8 | 9 | const rem = @import("../rem.zig"); 10 | const Dom = @import("Dom.zig"); 11 | const Document = Dom.Document; 12 | const Element = Dom.Element; 13 | 14 | const Token = @import("token.zig").Token; 15 | const Tokenizer = @import("Tokenizer.zig"); 16 | const tree_construction = @import("tree_construction.zig"); 17 | const TreeConstructor = tree_construction.TreeConstructor; 18 | 19 | const std = @import("std"); 20 | const assert = std.debug.assert; 21 | const Allocator = std.mem.Allocator; 22 | const ArrayListUnmanaged = std.ArrayListUnmanaged; 23 | 24 | input_stream: InputStream, 25 | tokenizer_initial_state: Tokenizer.State, 26 | tokenizer_initial_last_start_tag: ?Tokenizer.LastStartTag, 27 | constructor: TreeConstructor, 28 | allocator: Allocator, 29 | error_handler: ErrorHandler, 30 | 31 | const Self = @This(); 32 | 33 | const InputStream = struct { 34 | text: []const u21, 35 | position: usize = 0, 36 | eof: bool = false, 37 | }; 38 | 39 | pub const ParseError = enum { 40 | SurrogateInInputStream, 41 | NoncharacterInInputStream, 42 | ControlCharacterInInputStream, 43 | UnexpectedNullCharacter, 44 | UnexpectedQuestionMarkInsteadOfTagName, 45 | EOFBeforeTagName, 46 | InvalidFirstCharacterOfTagName, 47 | MissingEndTagName, 48 | EOFInTag, 49 | EOFInScriptHtmlCommentLikeText, 50 | UnexpectedEqualsSignBeforeAttributeName, 51 | UnexpectedCharacterInAttributeName, 52 | MissingAttributeValue, 53 | UnexpectedCharacterInUnquotedAttributeValue, 54 | MissingWhitespaceBetweenAttributes, 55 | UnexpectedSolidusInTag, 56 | EndTagWithAttributes, 57 | EndTagWithTrailingSolidus, 58 | CDATAInHtmlContent, 59 | IncorrectlyOpenedComment, 60 | AbruptClosingOfEmptyComment, 61 | EOFInComment, 62 | NestedComment, 63 | IncorrectlyClosedComment, 64 | EOFInDOCTYPE, 65 | MissingWhitespaceBeforeDOCTYPEName, 66 | MissingDOCTYPEName, 67 | InvalidCharacterSequenceAfterDOCTYPEName, 68 | MissingWhitespaceAfterDOCTYPEPublicKeyword, 69 | MissingDOCTYPEPublicIdentifier, 70 | 
MissingQuoteBeforeDOCTYPEPublicIdentifier, 71 | AbruptDOCTYPEPublicIdentifier, 72 | MissingWhitespaceBetweenDOCTYPEPublicAndSystemIdentifiers, 73 | MissingQuoteBeforeDOCTYPESystemIdentifier, 74 | MissingWhitespaceAfterDOCTYPESystemKeyword, 75 | MissingDOCTYPESystemIdentifier, 76 | AbruptDOCTYPESystemIdentifier, 77 | UnexpectedCharacterAfterDOCTYPESystemIdentifier, 78 | EOFInCDATA, 79 | MissingSemicolonAfterCharacterReference, 80 | UnknownNamedCharacterReference, 81 | AbsenceOfDigitsInNumericCharacterReference, 82 | NullCharacterReference, 83 | CharacterReferenceOutsideUnicodeRange, 84 | SurrogateCharacterReference, 85 | NoncharacterCharacterReference, 86 | ControlCharacterReference, 87 | DuplicateAttribute, 88 | 89 | NonVoidHtmlElementStartTagWithTrailingSolidus, 90 | TreeConstructionError, 91 | }; 92 | 93 | pub const OnError = enum { 94 | /// The parser will continue to run when it encounters an error. 95 | ignore, 96 | /// The parser will immediately stop when it encounters an error. 97 | /// The error that caused the parser to stop can be seen by calling `errors`. 98 | abort, 99 | /// The parser will continue to run when it encounters an error. 100 | /// All errors that are encountered will be saved to a list, which can be accessed by calling `errors`. 
report,
};

/// Storage for parse errors, shaped by the `OnError` strategy it was created with.
pub const ErrorHandler = union(OnError) {
    /// Parse errors are discarded.
    ignore,
    /// Holds the parse error that caused parsing to abort; null until one is sent.
    abort: ?ParseError,
    /// Every parse error that was sent, in the order it was sent.
    report: ArrayListUnmanaged(ParseError),

    /// Creates an empty handler for the given strategy. Does not allocate.
    fn init(on_error: OnError) ErrorHandler {
        return switch (on_error) {
            .ignore => .ignore,
            .abort => .{ .abort = null },
            .report => .{ .report = .{} },
        };
    }

    /// Releases the error list, if this handler owns one.
    fn deinit(error_handler: *ErrorHandler, allocator: Allocator) void {
        switch (error_handler.*) {
            .ignore, .abort => {},
            .report => |*list| list.deinit(allocator),
        }
    }

    /// Handles one parse error according to the strategy:
    /// `ignore` drops it, `abort` records it and returns `error.AbortParsing`,
    /// `report` appends it to the list (which may fail with an allocation error).
    fn sendError(error_handler: *ErrorHandler, allocator: Allocator, err: ParseError) !void {
        switch (error_handler.*) {
            .ignore => {},
            .abort => |*the_error| {
                the_error.* = err;
                return error.AbortParsing;
            },
            .report => |*list| try list.append(allocator, err),
        }
    }
};

/// Create a new HTML5 parser.
pub fn init(
    dom: *Dom,
    /// Must not be freed while being used by the parser.
    input: []const u21,
    allocator: Allocator,
    on_error: OnError,
    scripting: bool,
) !Self {
    const document = try dom.makeDocument();

    return Self{
        .input_stream = InputStream{ .text = input },
        .tokenizer_initial_state = .Data,
        .tokenizer_initial_last_start_tag = null,
        .constructor = TreeConstructor.init(dom, document, allocator, .{ .scripting = scripting }),
        .allocator = allocator,
        .error_handler = ErrorHandler.init(on_error),
    };
}

/// Create a new HTML5 fragment parser.
///
/// If any step after the parser value has been built fails, everything the
/// parser has allocated so far is released before the error is returned.
// Follows https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
pub fn initFragment(
    dom: *Dom,
    context: *Element,
    /// Must not be freed while being used by the parser.
    input: []const u21,
    allocator: Allocator,
    on_error: OnError,
    scripting: bool,
    // Must be the same "quirks mode" as the node document of the context.
    quirks_mode: Document.QuirksMode,
) !Self {
    // Step 1
    const document = try dom.makeDocument();

    // Step 2
    document.quirks_mode = quirks_mode;

    // Steps 3 and 4
    const initial_state: Tokenizer.State = switch (context.element_type) {
        .html_title, .html_textarea => .RCDATA,
        .html_style, .html_xmp, .html_iframe, .html_noembed, .html_noframes => .RAWTEXT,
        .html_script => .ScriptData,
        .html_noscript => if (scripting) Tokenizer.State.RAWTEXT else Tokenizer.State.Data,
        .html_plaintext => .PLAINTEXT,
        else => .Data,
    };

    var result = Self{
        .input_stream = InputStream{ .text = input },
        .tokenizer_initial_state = initial_state,
        .tokenizer_initial_last_start_tag = null,
        .constructor = TreeConstructor.init(dom, document, allocator, .{
            .fragment_context = context,
            .scripting = scripting,
        }),
        // Step 12
        .allocator = allocator,
        .error_handler = ErrorHandler.init(on_error),
    };
    // The steps below can fail after the constructor's lists have grown;
    // without this, that memory would be leaked on the error path.
    errdefer result.deinit();

    // Steps 5-7
    const html = try dom.makeElement(.html_html);
    try Dom.mutation.documentAppendElement(dom, document, html, .Suppress);
    try result.constructor.open_elements.append(result.constructor.allocator, html);

    // Step 8
    if (context.element_type == .html_template) {
        try result.constructor.template_insertion_modes.append(result.constructor.allocator, .InTemplate);
    }

    // Step 9
    const should_be_html_integration_point = if (context.element_type == .mathml_annotation_xml) blk: {
        const eql = rem.util.eqlIgnoreCase2;
        const encoding = context.getAttribute(.{ .prefix = .none, .namespace = .none, .local_name = "encoding" }) orelse break :blk false;
        break :blk eql("text/html", encoding) or eql("application/xhtml+xml", encoding);
    } else false;
    if (should_be_html_integration_point) try dom.registerHtmlIntegrationPoint(context);

    // Step 10
    tree_construction.resetInsertionModeAppropriately(&result.constructor);

    // Step 11
    // Walk from the context element up through its ancestors looking for a form element.
    var form: ?*Element = context;
    while (form) |f| {
        if (f.element_type == .html_form) {
            result.constructor.form_element_pointer = f;
            break;
        } else switch (f.parent orelse break) {
            .document => break,
            .element => |e| form = e,
        }
    }

    // Step 12
    // TODO: Set the encoding confidence.

    // TODO: Set the tree constructor's 'parser_cannot_change_the_mode' and 'is_iframe_srcdoc_document' flags.

    return result;
}

/// Frees the memory associated with the parser.
pub fn deinit(self: *Self) void {
    self.constructor.deinit();
    self.error_handler.deinit(self.allocator);
}

/// Runs the tokenization and tree construction steps to completion.
///
/// Each tokenizer step produces a batch of tokens that is fed, in order, to the
/// tree constructor. Errors raised by the tree constructor are not yet handled
/// and currently panic.
pub fn run(self: *Self) !void {
    var tokenizer = Tokenizer.init(self, self.tokenizer_initial_state, self.tokenizer_initial_last_start_tag);
    defer tokenizer.deinit();
    while (!tokenizer.eof) {
        tokenizer.run() catch |err| switch (err) {
            error.AbortParsing => return self.abort(),
            error.OutOfMemory,
            error.Utf8CannotEncodeSurrogateHalf,
            error.CodepointTooLarge,
            => |e| return e,
        };

        const tokens = tokenizer.tokens.items;
        if (tokens.len > 0) {
            // Assigned on every loop iteration; the loop runs at least once because tokens.len > 0.
            var constructor_result: TreeConstructor.RunResult = undefined;
            for (tokens, 0..) |*token, i| {
                constructor_result = self.constructor.run(token.*) catch |err| switch (err) {
                    error.AbortParsing => @panic("TODO abort parsing"),
                    error.OutOfMemory,
                    error.Utf8CannotEncodeSurrogateHalf,
                    error.CodepointTooLarge,
                    => @panic("TODO Handle errors in parsing"),
                    error.DomException => @panic("TODO Handle DOM Exceptions"),
                };
                // Only the last token of a batch may request a tokenizer state change.
                assert(constructor_result.new_tokenizer_state == null or i == tokens.len - 1);
            }

            if (constructor_result.new_tokenizer_state) |state| {
                tokenizer.setState(state);
                tokenizer.setLastStartTag(constructor_result.new_tokenizer_last_start_tag);
            }
            tokenizer.setAdjustedCurrentNodeIsNotInHtmlNamespace(constructor_result.adjusted_current_node_is_not_in_html_namespace);
        }
    }
}

/// Create a new HTML5 parser for testing purposes.
/// The tree constructor is left undefined: use only `runTokenizerOnly` and
/// `deinitTokenizerOnly` with a parser created this way.
pub fn initTokenizerOnly(
    /// Must not be freed while being used by the parser.
    input: []const u21,
    allocator: Allocator,
    on_error: OnError,
    tokenizer_initial_state: Tokenizer.State,
    tokenizer_initial_last_start_tag: ?Tokenizer.LastStartTag,
) !Self {
    return Self{
        .input_stream = InputStream{ .text = input },
        .tokenizer_initial_state = tokenizer_initial_state,
        .tokenizer_initial_last_start_tag = tokenizer_initial_last_start_tag,
        .constructor = undefined,
        .allocator = allocator,
        .error_handler = ErrorHandler.init(on_error),
    };
}

/// Runs only the tokenizer to completion, moving every produced token to the end of `token_sink`.
pub fn runTokenizerOnly(self: *Self, token_sink: *std.ArrayList(Token)) !void {
    var tokenizer = Tokenizer.init(self, self.tokenizer_initial_state, self.tokenizer_initial_last_start_tag);
    defer tokenizer.deinit();
    while (!tokenizer.eof) {
        tokenizer.run() catch |err| switch (err) {
            error.AbortParsing => return self.abort(),
            error.OutOfMemory,
            error.Utf8CannotEncodeSurrogateHalf,
            error.CodepointTooLarge,
            => |e| return e,
        };

        // Grow the sink by exactly the batch size, then move the batch into the new tail.
        const old_len = token_sink.items.len;
        try token_sink.resize(old_len + tokenizer.tokens.items.len);
        tokenizer.moveTokens(token_sink.items[old_len..]);
    }
}

/// Frees the memory associated with the parser.
pub fn deinitTokenizerOnly(self: *Self) void {
    self.error_handler.deinit(self.allocator);
}

/// Forwards a parse error to the parser's error handler.
/// Returns `error.AbortParsing` when the strategy is `abort`.
pub fn parseError(parser: *Self, err: ParseError) !void {
    try parser.error_handler.sendError(parser.allocator, err);
}

/// Implements HTML's "abort a parser" algorithm
/// https://html.spec.whatwg.org/multipage/parsing.html#abort-a-parser
fn abort(self: *Self) void {
    _ = self;
    // TODO: The rest of this algorithm.
    // self.input = &[0]u21{};
}

/// Returns the Document node associated with this parser.
pub fn getDocument(self: Self) *Document {
    return self.constructor.document;
}

/// Returns all of the parse errors that were encountered.
/// If the error handling strategy is `ignore`, the slice will be empty.
/// If the error handling strategy is `abort`, the slice will have at most 1 element.
/// If the error handling strategy is `report`, the slice can have any number of elements.
/// The returned slice points into the parser's own storage; it must not be used
/// after the parser is deinitialized.
pub fn errors(self: *const Self) []const ParseError {
    // `self` is taken by pointer and the payloads are captured by pointer so that
    // the `abort` slice refers to the parser's stored error. Capturing by value
    // would return a slice of a temporary copy that is dead once this function returns.
    return switch (self.error_handler) {
        .ignore => &[0]ParseError{},
        .abort => |*stored| if (stored.*) |*e| @as([*]const ParseError, @ptrCast(e))[0..1] else &[0]ParseError{},
        .report => |list| list.items,
    };
}

test "Parser usage" {
    const string = "asdf";
    const input = &rem.util.utf8DecodeStringComptime(string);
    const allocator = std.testing.allocator;

    var dom = Dom{ .allocator = allocator };
    defer dom.deinit();

    var parser = try init(&dom, input, allocator, .ignore, false);
    defer parser.deinit();
    try parser.run();
}

test "Parser usage, fragment case" {
    const string = "tacos";
    const input = &rem.util.utf8DecodeStringComptime(string);
    const allocator = std.testing.allocator;

    var dom = Dom{ .allocator = allocator };
    defer dom.deinit();
    const context = try dom.makeElement(.html_div);

    var parser = try initFragment(&dom, context, input, allocator, .ignore, false, .no_quirks);
    defer parser.deinit();
    try parser.run();
}

--------------------------------------------------------------------------------
/source/dom/mutation.zig:
--------------------------------------------------------------------------------
// Copyright (C) 2021-2024 Chadwain Holness
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

//! This file implements the mutation algorithms of the DOM.
//!
//! https://dom.spec.whatwg.org/#mutation-algorithms

const std = @import("std");
const assert = std.debug.assert;

const rem = @import("../../rem.zig");
const Dom = rem.Dom;
const Document = Dom.Document;
const DocumentType = Dom.DocumentType;
const Element = Dom.Element;
const CharacterData = Dom.CharacterData;
const ElementOrCharacterData = Dom.ElementOrCharacterData;

pub const SuppressMutationEvents = enum { Suppress, NoSuppress };

/// The DOM "append" mutation algorithm for the case of a DocumentType being appended to a Document.
/// Fails with a HierarchyRequest exception if the document already has a doctype or a document element.
pub fn documentAppendDocumentType(dom: *Dom, document: *Document, doctype: *DocumentType, suppress: SuppressMutationEvents) !void {
    // TODO: Most of the steps of the insert algorithm have been skipped.
    _ = suppress;

    // Pre-insertion validity: only step 6 of that algorithm applies here.
    const has_doctype = document.doctype != null;
    const has_element = document.element != null;
    if (has_doctype or has_element) return dom.exception(.HierarchyRequest);

    // The pre-insert steps are essentially a no-op.

    // Close the range of cdata nodes that come before the doctype and open an
    // empty range for the ones that will follow it.
    assert(document.cdata_current_endpoint == 0);
    const boundary = document.cdata_endpoints[0].end;
    document.cdata_endpoints[1] = .{ .begin = boundary, .end = boundary };
    document.cdata_current_endpoint = 1;

    document.doctype = doctype;
}

/// The DOM "append" mutation algorithm for the case of an Element being appended to a Document.
/// Fails with a HierarchyRequest exception if the document already has a document element.
pub fn documentAppendElement(dom: *Dom, document: *Document, element: *Element, suppress: SuppressMutationEvents) !void {
    // TODO: Most of the steps of the insert algorithm have been skipped.
    _ = suppress;

    // Pre-insertion validity: only step 6 of that algorithm applies here.
    if (document.element != null) return dom.exception(.HierarchyRequest);

    // The pre-insert steps are essentially a no-op.

    const endpoints = &document.cdata_endpoints;
    assert(document.cdata_current_endpoint < 2);
    if (document.cdata_current_endpoint == 0) {
        // No doctype was ever appended, so the "after doctype" range starts out empty.
        assert(document.doctype == null);
        const after_first = endpoints[0].end;
        endpoints[1] = .{ .begin = after_first, .end = after_first };
    }
    // Open an empty range for the cdata nodes that will follow the document element.
    const after_second = endpoints[1].end;
    endpoints[2] = .{ .begin = after_second, .end = after_second };
    document.cdata_current_endpoint = 2;

    document.element = element;
}

/// The DOM "append" mutation algorithm for the case of a CharacterData node being appended to a Document.
/// Fails with a HierarchyRequest exception if the node is a Text node.
pub fn documentAppendCdata(dom: *Dom, document: *Document, cdata: *CharacterData, suppress: SuppressMutationEvents) !void {
    // TODO: Most of the steps of the insert algorithm have been skipped.
    _ = suppress;

    // Pre-insertion validity: only step 5 of that algorithm applies here.
    switch (cdata.interface) {
        .text => return dom.exception(.HierarchyRequest),
        .comment => {},
    }

    // The pre-insert steps are essentially a no-op.

    // Store the node and extend the currently open range to include it.
    try document.cdata.append(dom.allocator, cdata);
    const current = document.cdata_current_endpoint;
    document.cdata_endpoints[current].end += 1;
}

/// The DOM "insert" mutation algorithm for an Element parent: inserts `node` into `parent` before `child`.
/// `child` must already be a child of `parent`.
pub fn elementInsert(
    dom: *Dom,
    parent: *Element,
    child: ElementOrCharacterData,
    node: ElementOrCharacterData,
    suppress: SuppressMutationEvents,
) !void {
    // TODO: Most of the steps in this algorithm have been skipped.
    _ = suppress;

    const position = parent.indexOfChild(child).?;
    try parent.children.insert(dom.allocator, position, node);
    switch (node) {
        .element => |inserted| inserted.parent = .{ .element = parent },
        // TODO: Set the parent element of a cdata node.
        .cdata => {},
    }
}

/// The DOM "append" mutation algorithm for an Element parent: adds `node` as the last child of `parent`.
pub fn elementAppend(dom: *Dom, parent: *Element, node: ElementOrCharacterData, suppress: SuppressMutationEvents) !void {
    // TODO: Ensure pre-insertion validity. Only step 2 of that algorithm applies.
    // TODO: Check if node is a host-including inclusive ancestor of parent.
    // TODO: Most of the steps of the insert algorithm have been skipped.
    _ = suppress;

    // The pre-insert steps are essentially a no-op.

    try parent.children.append(dom.allocator, node);
    switch (node) {
        .element => |appended| appended.parent = .{ .element = parent },
        // TODO: Set the parent element of a cdata node.
        .cdata => {},
    }
}

/// Detaches `node` from its parent element. `node` must currently have a parent.
/// Removing an element whose parent is the document is not implemented and panics.
pub fn elementRemove(dom: *Dom, node: *Element, suppress: SuppressMutationEvents) void {
    // TODO: Most of the steps in this algorithm have been skipped.
    _ = dom;
    _ = suppress;

    const parent = node.parent orelse unreachable;
    const owner: *Element = switch (parent) {
        .element => |e| e,
        .document => @panic("TODO elementRemove: parent is a document"),
    };
    const position = owner.indexOfChild(.{ .element = node }).?;
    _ = owner.children.orderedRemove(position);

    node.parent = null;
}

--------------------------------------------------------------------------------
/source/dom/node.zig:
--------------------------------------------------------------------------------
// Copyright (C) 2021-2024 Chadwain Holness
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
const Dom = @import("../Dom.zig");

const std = @import("std");
const Allocator = std.mem.Allocator;
const ArrayListUnmanaged = std.ArrayListUnmanaged;
const StaticStringMap = std.StaticStringMap;
const MultiArrayList = std.MultiArrayList;

/// A Document node.
pub const Document = struct {
    doctype: ?*DocumentType = null,
    element: ?*Element = null,
    /// Every CharacterData child of the document, in insertion order.
    cdata: ArrayListUnmanaged(*CharacterData) = .{},
    /// Three consecutive ranges of `cdata`. The formatter prints range 0 before the
    /// doctype, range 1 between the doctype and the document element, and range 2
    /// after the document element.
    cdata_endpoints: [3]Endpoints = .{Endpoints{ .begin = 0, .end = 0 }} ** 3,
    /// Index of the range in `cdata_endpoints` that newly appended cdata nodes extend.
    cdata_current_endpoint: u2 = 0,
    quirks_mode: QuirksMode = .no_quirks,

    /// A half-open index range `[begin, end)`.
    pub const Endpoints = struct {
        begin: usize,
        end: usize,

        /// Returns the part of `array` covered by this range.
        pub fn sliceOf(self: Endpoints, array: anytype) @TypeOf(array) {
            return array[self.begin..self.end];
        }
    };

    pub const QuirksMode = enum {
        no_quirks,
        quirks,
        limited_quirks,
    };

    /// Frees the list of cdata pointers (not the nodes they point to).
    pub fn deinit(self: *Document, allocator: Allocator) void {
        self.cdata.deinit(allocator);
    }
};

/// Writes a textual dump of a Document and all of its descendants.
pub const DocumentFormatter = struct {
    document: *const Document,
    dom: *const Dom,
    /// Used for the temporary stack that drives the tree traversal.
    allocator: Allocator,

    /// Prints the document to `writer`: the document's own children in order, with
    /// the element tree printed depth-first and each node prefixed according to its depth.
    pub fn print(self: DocumentFormatter, writer: anytype) !void {
        const document = self.document;
        try std.fmt.format(writer, "Document: {s}\n", .{@tagName(document.quirks_mode)});

        try printDocumentCdatas(writer, document, 0);
        if (document.doctype) |doctype| {
            try std.fmt.format(writer, " DocumentType: name={s} publicId={s} systemId={s}\n", .{ doctype.name, doctype.publicId, doctype.systemId });
        }
        try printDocumentCdatas(writer, document, 1);

        const ConstNode = union(enum) {
            element: *const Element,
            cdata: *const CharacterData,
        };
        const Pending = struct { node: ConstNode, depth: usize };

        // Explicit stack for an iterative, pre-order traversal.
        var pending = ArrayListUnmanaged(Pending){};
        defer pending.deinit(self.allocator);

        if (document.element) |root| {
            try pending.append(self.allocator, .{ .node = .{ .element = root }, .depth = 1 });
        }

        while (pending.popOrNull()) |entry| {
            for (0..entry.depth) |_| {
                try writer.writeAll(" ");
            }

            switch (entry.node) {
                .cdata => |cdata| try printCdata(writer, cdata),
                .element => |element| {
                    try std.fmt.format(writer, "Element: type={s} local_name={s} namespace={s} attributes=[", .{
                        @tagName(element.element_type),
                        element.localName(self.dom),
                        @tagName(element.namespace()),
                    });
                    if (element.numAttributes() > 0) {
                        try writer.writeAll(" ");
                        const attrs = element.attributes.slice();
                        for (attrs.items(.key), attrs.items(.value)) |key, value| {
                            switch (key.prefix) {
                                .none => try std.fmt.format(writer, "\"{s}\"=\"{}\" ", .{ key.local_name, std.zig.fmtEscapes(value) }),
                                else => |prefix| try std.fmt.format(writer, "\"{s}:{s}\"=\"{}\" ", .{ @tagName(prefix), key.local_name, std.zig.fmtEscapes(value) }),
                            }
                        }
                    }
                    try writer.writeAll("]\n");

                    // Push the children last-to-first so that they are popped first-to-last.
                    const children = element.children.items;
                    var remaining = children.len;
                    while (remaining > 0) {
                        remaining -= 1;
                        const child = switch (children[remaining]) {
                            .element => |e| ConstNode{ .element = e },
                            .cdata => |c| ConstNode{ .cdata = c },
                        };
                        try pending.append(self.allocator, .{ .node = child, .depth = entry.depth + 1 });
                    }
                },
            }
        }

        try printDocumentCdatas(writer, document, 2);
    }

    /// Prints the document-level cdata nodes that fall in the given range of `cdata_endpoints`.
    fn printDocumentCdatas(writer: anytype, document: *const Document, endpoint_index: u2) !void {
        const range = document.cdata_endpoints[endpoint_index];
        for (range.sliceOf(document.cdata.items)) |cdata| {
            try printCdata(writer, cdata);
        }
    }

    /// Prints one CharacterData node as its interface name followed by its escaped, quoted data.
    fn printCdata(writer: anytype, cdata: *const CharacterData) !void {
        const label = switch (cdata.interface) {
            .text => "Text",
            .comment => "Comment",
        };
        try std.fmt.format(writer, "{s}: \"{}\"\n", .{ label, std.zig.fmtEscapes(cdata.data.items) });
    }
};

/// A DocumentType node. The three strings share a single allocation.
pub const DocumentType = struct {
    name: []u8,
    publicId: []u8,
    systemId: []u8,

    /// Copies the three strings into one freshly allocated buffer, laid out as
    /// name, then public identifier, then system identifier. A null argument is
    /// treated as the empty string.
    pub fn init(allocator: Allocator, doctype_name: ?[]const u8, public_identifier: ?[]const u8, system_identifier: ?[]const u8) !DocumentType {
        const name = doctype_name orelse "";
        const publicId = public_identifier orelse "";
        const systemId = system_identifier orelse "";

        const buffer = try allocator.alloc(u8, name.len + publicId.len + systemId.len);
        const name_end = name.len;
        const public_end = name_end + publicId.len;

        const result = DocumentType{
            .name = buffer[0..name_end],
            .publicId = buffer[name_end..public_end],
            .systemId = buffer[public_end..],
        };
        @memcpy(result.name, name);
        @memcpy(result.publicId, publicId);
        @memcpy(result.systemId, systemId);
        return result;
    }

    /// Frees the shared buffer, which starts at `name` and spans all three strings.
    pub fn deinit(self: *DocumentType, allocator: Allocator) void {
        const total_len = self.name.len + self.publicId.len + self.systemId.len;
        allocator.free(self.name.ptr[0..total_len]);
    }
};

pub const Namespace = enum {
    html,
    svg,
    mathml,
};

pub const ElementType = enum {
    // This is the complete list of conforming HTML elements.
    // (https://html.spec.whatwg.org/multipage/indices.html#elements-3)
    html_a,
    html_abbr,
    html_address,
    html_area,
    html_article,
    html_aside,
    html_audio,
    html_b,
    html_base,
    html_bdi,
    html_bdo,
    html_blockquote,
    html_body,
    html_br,
    html_button,
    html_canvas,
    html_caption,
    html_cite,
    html_code,
    html_col,
    html_colgroup,
    html_data,
    html_datalist,
    html_dd,
    html_del,
    html_details,
    html_dfn,
    html_dialog,
    html_div,
    html_dl,
    html_dt,
    html_em,
    html_embed,
    html_fieldset,
    html_figcaption,
    html_figure,
    html_footer,
    html_form,
    html_h1,
    html_h2,
    html_h3,
    html_h4,
    html_h5,
    html_h6,
    html_head,
    html_header,
    html_hgroup,
    html_hr,
    html_html,
    html_i,
    html_iframe,
    html_img,
    html_input,
    html_ins,
    html_kbd,
    html_label,
    html_legend,
    html_li,
    html_link,
    html_main,
    html_map,
    html_mark,
    html_menu,
    html_meta,
    html_meter,
    html_nav,
    html_noscript,
    html_object,
    html_ol,
    html_optgroup,
    html_option,
    html_output,
    html_p,
    html_param,
    html_picture,
    html_pre,
    html_progress,
    html_q,
    html_rp,
    html_rt,
    html_ruby,
    html_s,
    html_samp,
    html_script,
    html_section,
    html_select,
    html_slot,
    html_small,
    html_source,
    html_span,
    html_strong,
    html_style,
    html_sub,
    html_summary,
    html_sup,
    html_table,
    html_tbody,
    html_td,
    html_template,
    html_textarea,
    html_tfoot,
    html_th,
    html_thead,
    html_time,
    html_title,
    html_tr,
    html_track,
    html_u,
    html_ul,
    html_var,
    html_video,
    html_wbr,

    // This is the complete list of obsolete and non-conforming elements.
    // (https://html.spec.whatwg.org/multipage/obsolete.html#non-conforming-features)
    html_acronym,
    html_applet,
    html_basefont,
    html_bgsound,
    html_big,
    html_blink,
    html_center,
    html_dir,
    html_font,
    html_frame,
    html_frameset,
    html_isindex,
    html_keygen,
    html_listing,
    html_marquee,
    html_menuitem,
    html_multicol,
    html_nextid,
    html_nobr,
    html_noembed,
    html_noframes,
    html_plaintext,
    html_rb,
    html_rtc,
    html_spacer,
    html_strike,
    html_tt,
    html_xmp,

    mathml_math,
    mathml_mi,
    mathml_mo,
    mathml_mn,
    mathml_ms,
    mathml_mtext,
    mathml_annotation_xml,

    svg_svg,
    svg_foreign_object,
    svg_desc,
    svg_title,
    svg_script,

    /// The type of a custom HTML element.
    custom_html,
    /// The type of a MathML element that this DOM implementation doesn't know about.
    some_other_mathml,
    /// The type of an SVG element that this DOM implementation doesn't know about.
    some_other_svg,

    /// Returns the namespace that an element of this type belongs to.
    pub fn namespace(self: ElementType) Namespace {
        // The three catch-all types are declared after the per-namespace runs.
        switch (self) {
            .custom_html => return .html,
            .some_other_mathml => return .mathml,
            .some_other_svg => return .svg,
            else => {},
        }

        // Every other type is classified by where it falls in declaration order:
        // each namespace occupies one contiguous run of enum values.
        // TODO: Some metaprogramming to make this less fragile.
        const ordinal = @intFromEnum(self);
        if (ordinal >= @intFromEnum(ElementType.html_a) and ordinal <= @intFromEnum(ElementType.html_xmp)) {
            return .html;
        }
        if (ordinal >= @intFromEnum(ElementType.mathml_math) and ordinal <= @intFromEnum(ElementType.mathml_annotation_xml)) {
            return .mathml;
        }
        if (ordinal >= @intFromEnum(ElementType.svg_svg) and ordinal <= @intFromEnum(ElementType.svg_script)) {
            return .svg;
        }
        unreachable;
    }

    const html_map_blk = html_map: {
        @setEvalBranchQuota(5000);
        break :html_map StaticStringMap(ElementType).initComptime(.{
            .{ "a", .html_a },
            .{ "abbr", .html_abbr },
            .{ "address", .html_address },
            .{ "area", .html_area },
            .{ "article", .html_article },
            .{ "aside", .html_aside },
            .{ "audio", .html_audio },
            .{ "b", .html_b },
            .{ "base", .html_base },
            .{ "bdi", .html_bdi },
            .{ "bdo", .html_bdo },
            .{ "blockquote", .html_blockquote },
            .{ "body", .html_body },
            .{ "br", .html_br },
            .{ "button", .html_button },
            .{ "canvas", .html_canvas },
            .{ "caption", .html_caption },
            .{ "cite", .html_cite },
            .{ "code", .html_code },
            .{ "col", .html_col },
            .{ "colgroup", .html_colgroup },
            .{ "data", .html_data },
            .{ "datalist", .html_datalist },
            .{ "dd", .html_dd },
            .{ "del", .html_del },
            .{ "details", .html_details },
            .{ "dfn", .html_dfn },
            .{ "dialog", .html_dialog },
            .{ "div", .html_div },
            .{ "dl", .html_dl },
            .{ "dt", .html_dt },
            .{ "em", .html_em },
            .{ "embed", .html_embed },
            .{ "fieldset", .html_fieldset },
            .{ "figcaption", .html_figcaption },
            .{ "figure", .html_figure },
            .{ "footer", .html_footer },
            .{ "form", .html_form },
            .{ "h1", .html_h1 },
            .{ "h2", .html_h2 },
            .{ "h3", .html_h3 },
            .{ "h4", .html_h4 },
            .{ "h5", .html_h5 },
            .{ "h6", .html_h6 },
            .{ "head", .html_head },
            .{ "header", .html_header },
            .{ "hgroup", .html_hgroup },
            .{ "hr", .html_hr },
            .{ "html", .html_html },
            .{ "i", .html_i },
            .{ "iframe", .html_iframe },
            .{ "img", .html_img },
            .{ "input", .html_input },
            .{ "ins", .html_ins },
            .{ "kbd", .html_kbd },
            .{ "label", .html_label },
            .{ "legend", .html_legend },
            .{ "li", .html_li },
            .{ "link", .html_link },
            .{ "main", .html_main },
            .{ "map", .html_map },
            .{ "mark", .html_mark },
            .{ "menu", .html_menu },
            .{ "meta", .html_meta },
            .{ "meter", .html_meter },
            .{ "nav", .html_nav },
            .{ "noscript", .html_noscript },
            .{ "object", .html_object },
            .{ "ol", .html_ol },
            .{ "optgroup", .html_optgroup },
            .{ "option", .html_option },
            .{ "output", .html_output },
            .{ "p", .html_p },
            .{ "param", .html_param },
            .{ "picture", .html_picture },
            .{ "pre", .html_pre },
            .{ "progress", .html_progress },
            .{ "q", .html_q },
            .{ "rp", .html_rp },
            .{ "rt", .html_rt },
            .{ "ruby", .html_ruby },
            .{ "s", .html_s },
            .{ "samp", .html_samp },
            .{ "script", .html_script },
            .{ "section", .html_section },
            .{ "select", .html_select },
            .{ "slot", .html_slot },
            .{ "small", .html_small },
            .{ "source", .html_source },
            .{ "span", .html_span },
            .{ "strong", .html_strong },
            .{ "style", .html_style },
            .{ "sub", .html_sub },
            .{ "summary", .html_summary },
            .{ "sup", .html_sup },
            .{ "table", .html_table },
            .{ "tbody", .html_tbody },
            .{ "td", .html_td },
            .{ "template", .html_template },
            .{ "textarea", .html_textarea },
            .{ "tfoot", .html_tfoot },
            .{ "th", .html_th },
            .{ "thead", .html_thead },
            .{ "time", .html_time },
            .{ "title", .html_title },
            .{ "tr", .html_tr },
            .{ "track", .html_track },
            .{ "u", .html_u },
            .{ "ul", .html_ul },
            .{ "var", .html_var },
            .{ "video", .html_video },
            .{ "wbr", .html_wbr },

            .{ "acronym", .html_acronym },
            .{ "applet", .html_applet },
            .{ "basefont", .html_basefont },
            .{ "bgsound", .html_bgsound },
            .{ "big", .html_big },
            .{ "blink", .html_blink },
            .{ "center", .html_center },
            .{ "dir", .html_dir },
            .{ "font", .html_font },
            .{ "frame", .html_frame },
            .{ "frameset", .html_frameset },
            .{ "isindex", .html_isindex },
            .{ "keygen", .html_keygen },
            .{ "listing", .html_listing },
            .{ "marquee", .html_marquee },
            .{ "menuitem", .html_menuitem },
            .{ "multicol", .html_multicol },
            .{ "nextid", .html_nextid },
            .{ "nobr", .html_nobr },
            .{ "noembed", .html_noembed },
            .{ "noframes", .html_noframes },
            .{ "plaintext", .html_plaintext },
            .{ "rb", .html_rb },
            .{ "rtc", .html_rtc },
            .{ "spacer", .html_spacer },
            .{ "strike", .html_strike },
            .{ "tt", .html_tt },
            .{ "xmp", .html_xmp },
        });
    };

    const mathml_map = StaticStringMap(ElementType).initComptime(.{
        .{ "math", .mathml_math },
        .{ "mi", .mathml_mi },
        .{ "mo", .mathml_mo },
        .{ "mn", .mathml_mn },
        .{ "ms", .mathml_ms },
        .{ "mtext", .mathml_mtext },
        .{ "annotation-xml", .mathml_annotation_xml },
    });

    const svg_map = StaticStringMap(ElementType).initComptime(.{
        .{ "svg", .svg_svg },
        .{ "foreignObject", .svg_foreign_object },
        .{ "desc", .svg_desc },
        .{ "title", .svg_title },
        .{ "script", .svg_script },
    });

    /// Get an HTML element's ElementType from its tag name.
    /// Returns null if the name is not in the HTML lookup table.
    pub fn fromStringHtml(tag_name: []const u8) ?ElementType {
        return html_map_blk.get(tag_name);
    }

    /// Get a MathML element's ElementType from its tag name.
    /// Returns null if the name is not in the MathML lookup table.
    pub fn fromStringMathMl(tag_name: []const u8) ?ElementType {
        return mathml_map.get(tag_name);
    }

    /// Get an SVG element's ElementType from its tag name.
    /// Returns null if the name is not in the SVG lookup table.
    pub fn fromStringSvg(tag_name: []const u8) ?ElementType {
        return svg_map.get(tag_name);
    }

    /// Returns the local name of an element based solely on its ElementType, or null if it cannot be determined.
    pub fn toLocalName(self: ElementType) ?[]const u8 {
        const tag_name = @tagName(self);
        return switch (self) {
            // The catch-all types do not identify a single local name.
            .custom_html,
            .some_other_mathml,
            .some_other_svg,
            => null,

            // These local names cannot be spelled as an enum tag.
            .mathml_annotation_xml => "annotation-xml",
            .svg_foreign_object => "foreignObject",

            // For every other type, the local name is the tag with its namespace prefix removed.
            else => switch (self.namespace()) {
                .html => tag_name["html_".len..],
                .mathml => tag_name["mathml_".len..],
                .svg => tag_name["svg_".len..],
            },
        };
    }
};

/// The type for the children of an Element node.
pub const ElementOrCharacterData = union(enum) {
    element: *Element,
    cdata: *CharacterData,
};

/// The type for the parent of an Element node.
pub const ParentNode = union(enum) {
    element: *Element,
    document,
};

pub const AttributePrefix = enum {
    none,
    xlink,
    xml,
    xmlns,
};

pub const AttributeNamespace = enum {
    none,
    xlink,
    xml,
    xmlns,
};

/// Identifies an attribute by prefix, namespace and local name.
pub const ElementAttributesKey = struct {
    prefix: AttributePrefix,
    namespace: AttributeNamespace,
    local_name: []const u8,

    /// Two keys are equal when their prefixes and namespaces match and their
    /// local names are byte-for-byte identical.
    pub fn eql(lhs: ElementAttributesKey, rhs: ElementAttributesKey) bool {
        if (lhs.prefix != rhs.prefix) return false;
        if (lhs.namespace != rhs.namespace) return false;
        return std.mem.eql(u8, lhs.local_name, rhs.local_name);
    }
};

pub const Attribute = struct {
    key: ElementAttributesKey,
    value: []const u8,
};

/// The type for the attributes of an Element node.
pub const ElementAttributes = MultiArrayList(Attribute);

/// An element node: its type, optional parent, attributes, and ordered children.
pub const Element = struct {
    element_type: ElementType,
    parent: ?ParentNode,
    attributes: ElementAttributes,
    children: ArrayListUnmanaged(ElementOrCharacterData),

    /// Frees the local name and value of every attribute, then the attribute
    /// list and the child list. The child nodes themselves are not freed here.
    pub fn deinit(self: *Element, allocator: Allocator) void {
        const attr_slice = self.attributes.slice();
        for (attr_slice.items(.key), attr_slice.items(.value)) |key, value| {
            allocator.free(key.local_name);
            allocator.free(value);
        }
        self.attributes.deinit(allocator);
        self.children.deinit(allocator);
    }

    /// The namespace reported by this element's type.
    pub fn namespace(self: Element) Namespace {
        return self.element_type.namespace();
    }

    /// The element's local name. Element types that carry a fixed local name
    /// report it directly; otherwise the name is looked up in `dom.local_names`,
    /// where an entry for this element must exist.
    pub fn localName(self: *const Element, dom: *const Dom) []const u8 {
        return self.element_type.toLocalName() orelse dom.local_names.get(self) orelse unreachable;
    }

    /// The number of attributes on this element.
    pub fn numAttributes(self: Element) u32 {
        return @intCast(self.attributes.len);
    }

    /// Appends an attribute. Both `key.local_name` and `value` are copied with
    /// `allocator`; the copies are released again by `deinit`.
    /// No check for an existing attribute with the same key is made.
    pub fn appendAttribute(self: *Element, allocator: Allocator, key: ElementAttributesKey, value: []const u8) !void {
        // TODO: This should implement https://dom.spec.whatwg.org/#concept-element-attributes-append
        const key_local_name_copy = try allocator.dupe(u8, key.local_name);
        errdefer allocator.free(key_local_name_copy);
        const value_copy = try allocator.dupe(u8, value);
        errdefer allocator.free(value_copy);
        try self.attributes.append(allocator, .{
            .key = .{ .prefix = key.prefix, .namespace = key.namespace, .local_name = key_local_name_copy },
            .value = value_copy,
        });
    }

    /// Appends the attribute only when no attribute with an equal key exists.
    pub fn appendAttributeIfNotExists(self: *Element, allocator: Allocator, key: ElementAttributesKey, value: []const u8) !void {
        if (self.getAttribute(key) == null) {
            try self.appendAttribute(allocator, key, value);
        }
    }

    /// Returns the value of the first attribute whose key equals `key`, or null.
    pub fn getAttribute(self: Element, key: ElementAttributesKey) ?[]const u8 {
        const slice = self.attributes.slice();
        for (slice.items(.key), slice.items(.value)) |k, v| {
            if (key.eql(k)) {
                return v;
            }
        }
        return null;
    }

    /// Returns the last child, or null when the element has no children.
    pub fn lastChild(self: *Element) ?ElementOrCharacterData {
        const items = self.children.items;
        return if (items.len == 0) null else items[items.len - 1];
    }

    /// Returns the index of the first child equal to `child`, or null.
    pub fn indexOfChild(self: *Element, child: ElementOrCharacterData) ?usize {
        for (self.children.items, 0..) |candidate, index| {
            if (std.meta.eql(child, candidate)) return index;
        }
        return null;
    }

    /// Returns the child immediately preceding `child`.
    /// Returns null when the element has no children or `child` is the first one.
    /// If the element has children, `child` must be one of them.
    pub fn childBefore(self: *Element, child: ElementOrCharacterData) ?ElementOrCharacterData {
        if (self.children.items.len == 0) return null;
        // A single scan gives both answers: index 0 means "no predecessor".
        const index = self.indexOfChild(child).?;
        return if (index == 0) null else self.children.items[index - 1];
    }
};

pub const CharacterDataInterface = enum {
    // NOTE: CharacterData is an abstract interface.
    text,
    comment,
};

/// A character data node: a growable byte buffer tagged with the interface it represents.
pub const CharacterData = struct {
    data: ArrayListUnmanaged(u8) = .{},
    interface: CharacterDataInterface,

    /// Creates a node whose buffer holds a copy of `data`.
    pub fn init(allocator: Allocator, data: []const u8, interface: CharacterDataInterface) !CharacterData {
        var result = CharacterData{ .interface = interface };
        try result.data.appendSlice(allocator, data);
        return result;
    }

    /// Frees the data buffer.
    pub fn deinit(self: *CharacterData, allocator: Allocator) void {
        self.data.deinit(allocator);
    }

    // TODO: Move this function to mutation.
    /// Appends a copy of `data` to the end of the buffer.
    pub fn append(self: *CharacterData, allocator: Allocator, data: []const u8) !void {
        try self.data.appendSlice(allocator, data);
    }
};

--------------------------------------------------------------------------------
/source/token.zig:
--------------------------------------------------------------------------------
// Copyright (C) 2021-2024 Chadwain Holness
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

const rem = @import("../rem.zig");

const std = @import("std");
const Allocator = std.mem.Allocator;
const StringHashMapUnmanaged = std.StringHashMapUnmanaged;

/// Payload of a DOCTYPE token. Each identifier is null when absent.
pub const TokenDoctype = struct {
    name: ?[]const u8,
    public_identifier: ?[]const u8,
    system_identifier: ?[]const u8,
    force_quirks: bool,
};

/// Payload of a start tag token.
pub const TokenStartTag = struct {
    name: []const u8,
    attributes: Attributes,
    self_closing: bool,

    /// Maps attribute names to attribute values.
    pub const Attributes = StringHashMapUnmanaged([]const u8);
};

/// Payload of an end tag token.
pub const TokenEndTag = struct {
    name: []const u8,
};

/// Payload of a comment token.
pub const TokenComment = struct {
    data: []const u8,
};

/// Payload of a character token: a single code point.
pub const TokenCharacter = struct {
    data: u21,
};

pub const TokenEof = void;

/// A token: one of doctype, start tag, end tag, comment, character, or end-of-file.
pub const Token = union(enum) {
    doctype: Doctype,
    start_tag: StartTag,
    end_tag: EndTag,
    comment: Comment,
    character: Character,
    eof: Eof,

    pub const Doctype = TokenDoctype;
    pub const StartTag = TokenStartTag;
    pub const EndTag = TokenEndTag;
    pub const Comment = TokenComment;
    pub const Character = TokenCharacter;
    pub const Eof = TokenEof;

    /// Frees every slice held by the token (names, identifiers, comment data,
    /// attribute keys and values) and, for start tags, the attribute map itself.
    /// Character and end-of-file tokens hold nothing to free.
    pub fn deinit(token: *Token, allocator: Allocator) void {
        switch (token.*) {
            .doctype => |d| {
                if (d.name) |name| allocator.free(name);
                if (d.public_identifier) |public_identifier| allocator.free(public_identifier);
                if (d.system_identifier) |system_identifier| allocator.free(system_identifier);
            },
            .start_tag => |*t| {
                allocator.free(t.name);
                var attr_it = t.attributes.iterator();
                while (attr_it.next()) |entry| {
                    allocator.free(entry.key_ptr.*);
                    allocator.free(entry.value_ptr.*);
                }
                t.attributes.deinit(allocator);
            },
            .end_tag => |t| {
                allocator.free(t.name);
            },
            .comment => |c| {
                allocator.free(c.data);
            },
            .character, .eof => {},
        }
    }

    /// Deep equality: the variants must match and every field must compare equal.
    /// Start tag attributes are compared as sets, independent of iteration order.
    pub fn eql(lhs: Token, rhs: Token) bool {
        const eqlNullSlices = rem.util.eqlNullSlices;
        if (std.meta.activeTag(lhs) != std.meta.activeTag(rhs)) return false;
        switch (lhs) {
            .doctype => return lhs.doctype.force_quirks == rhs.doctype.force_quirks and
                eqlNullSlices(u8, lhs.doctype.name, rhs.doctype.name) and
                eqlNullSlices(u8, lhs.doctype.public_identifier, rhs.doctype.public_identifier) and
                eqlNullSlices(u8, lhs.doctype.system_identifier, rhs.doctype.system_identifier),
            .start_tag => {
                if (!(lhs.start_tag.self_closing == rhs.start_tag.self_closing and
                    eqlNullSlices(u8, lhs.start_tag.name, rhs.start_tag.name) and
                    lhs.start_tag.attributes.count() == rhs.start_tag.attributes.count())) return false;
                // The counts are equal, so it is enough to check that every
                // attribute of `lhs` is present in `rhs` with the same value.
                var iterator = lhs.start_tag.attributes.iterator();
                while (iterator.next()) |attr| {
                    const rhs_value = rhs.start_tag.attributes.get(attr.key_ptr.*) orelse return false;
                    if (!std.mem.eql(u8, attr.value_ptr.*, rhs_value)) return false;
                }
                return true;
            },
            .end_tag => return eqlNullSlices(u8, lhs.end_tag.name, rhs.end_tag.name),
            .comment => return eqlNullSlices(u8, lhs.comment.data, rhs.comment.data),
            .character => return lhs.character.data == rhs.character.data,
            .eof => return true,
        }
    }

    /// Writes a human-readable description of the token to `writer`.
    /// The format string and options are ignored.
    pub fn format(value: Token, comptime fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype) !void {
        _ = fmt;
        _ = options;

        switch (value) {
            .doctype => |d| {
                try writer.writeAll("DOCTYPE (");
                if (d.name) |name| try writer.writeAll(name);
                if (d.public_identifier) |pi| {
                    try writer.writeAll(" PUBLIC:");
                    try writer.writeAll(pi);
                }
                if (d.system_identifier) |si| {
                    try writer.writeAll(" SYSTEM:");
                    try writer.writeAll(si);
                }
                try writer.writeAll(")");
            },
            .start_tag => |t| {
                try writer.writeAll("Start tag ");
                if (t.self_closing) try writer.writeAll("(self closing) ");
                try writer.writeAll("\"");
                try writer.writeAll(t.name);
                try writer.writeAll("\" [");
                var it = t.attributes.iterator();
                while (it.next()) |entry| {
                    try writer.writeAll("\"");
                    try writer.writeAll(entry.key_ptr.*);
                    try writer.writeAll("\": \"");
                    try writer.writeAll(entry.value_ptr.*);
                    try writer.writeAll("\", ");
                }
                try writer.writeAll("]");
            },
            .end_tag => |t| {
                try writer.writeAll("End tag \"");
                try writer.writeAll(t.name);
                try writer.writeAll("\"");
            },
            .comment => |c| {
                try writer.writeAll("Comment (");
                try writer.writeAll(c.data);
                try writer.writeAll(")");
            },
            .character => |c| {
                try writer.writeAll("Character (");
                switch (c.data) {
                    // ASCII other than tab and newline: printable characters are
                    // written as-is, everything else as a code point.
                    0x00...0x08, 0x0B...0x7F => {
                        const as_u7: u7 = @intCast(c.data);
                        if (std.ascii.isControl(as_u7) or std.ascii.isWhitespace(as_u7)) {
                            try writer.print("U+{X}", .{as_u7});
                        } else {
                            try writer.writeByte(as_u7);
                        }
                    },
                    // NOTE(review): these two arms previously wrote an empty string, which made
                    // newline and tab both print as "Character ()". The names below restore a
                    // visible, distinct output; confirm the exact spelling against upstream.
                    '\n' => try writer.writeAll("<newline>"),
                    '\t' => try writer.writeAll("<tab>"),
                    else => try writer.print("U+{X}", .{c.data}),
                }
                try writer.writeAll(")");
            },
            .eof => {
                try writer.writeAll("End of file");
            },
        }
    }
};

--------------------------------------------------------------------------------
/source/util.zig:
--------------------------------------------------------------------------------
// Copyright (C) 2021-2024 Chadwain Holness
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

const std = @import("std");
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const ArrayListUnmanaged = std.ArrayListUnmanaged;
const StringHashMapUnmanaged = std.StringHashMapUnmanaged;

const rem = @import("../rem.zig");
const Dom = rem.Dom;
const Document = Dom.Document;
const Element = Dom.Element;
const CharacterData = Dom.CharacterData;

/// Frees every key and value stored in `map`, then the map itself.
pub fn freeStringHashMap(map: *StringHashMapUnmanaged([]u8), allocator: Allocator) void {
    var entries = map.iterator();
    while (entries.next()) |entry| {
        allocator.free(entry.key_ptr.*);
        allocator.free(entry.value_ptr.*);
    }
    map.deinit(allocator);
}

/// Same as `freeStringHashMap`, for maps whose values are `[]const u8`.
pub fn freeStringHashMapConst(map: *StringHashMapUnmanaged([]const u8), allocator: Allocator) void {
    var entries = map.iterator();
    while (entries.next()) |entry| {
        allocator.free(entry.key_ptr.*);
        allocator.free(entry.value_ptr.*);
    }
    map.deinit(allocator);
}

// `map1` and `map2` are of type std.[String]HashMap[Unmanaged]
/// Returns true when both maps hold the same number of entries and every key
/// of `map1` is present in `map2` with a byte-for-byte equal value.
pub fn eqlStringHashMaps(map1: anytype, map2: @TypeOf(map1)) bool {
    if (map1.count() != map2.count()) return false;
    var entries = map1.iterator();
    while (entries.next()) |entry| {
        const other_value = map2.get(entry.key_ptr.*) orelse return false;
        if (!std.mem.eql(u8, entry.value_ptr.*, other_value)) return false;
    }
    return true;
}

/// Compares two optional slices: two nulls are equal, a null and a non-null
/// are not, and two non-null slices are compared element by element.
pub fn eqlNullSlices(comptime T: type, slice1: ?[]const T, slice2: ?[]const T) bool {
    const a = slice1 orelse return slice2 == null;
    const b = slice2 orelse return false;
    return std.mem.eql(T, a, b);
}

/// Like `eqlNullSlices`, but the first slice is never null.
pub fn eqlNullSlices2(comptime T: type, slice1: []const T, slice2: ?[]const T) bool {
    if (slice2) |b| return std.mem.eql(T, slice1, b);
    return false;
}

pub const eqlIgnoreCase = std.ascii.eqlIgnoreCase;

/// Assumes the second string is already lowercase.
pub fn eqlIgnoreCase2(a: []const u8, b: []const u8) bool {
    if (a.len != b.len) return false;
    for (0..a.len) |i| {
        const lowered = b[i];
        assert(lowered == std.ascii.toLower(lowered));
        if (lowered != std.ascii.toLower(a[i])) return false;
    }
    return true;
}

/// Returns an ASCII-lowercased copy of `string` as a fixed-size array.
pub fn toLowercaseComptime(comptime string: []const u8) [string.len]u8 {
    var lowered: [string.len]u8 = undefined;
    for (&lowered, string) |*dest, byte| {
        dest.* = std.ascii.toLower(byte);
    }
    return lowered;
}

/// Assumes `needle` is already lowercase.
pub fn startsWithIgnoreCase2(haystack: []const u8, needle: []const u8) bool {
    if (needle.len > haystack.len) return false;
    return eqlIgnoreCase2(haystack[0..needle.len], needle);
}

/// Counts the code points in `string` by stepping over the byte length that
/// each leading byte announces. An invalid leading byte is `unreachable`.
pub fn utf8DecodeStringComptimeLen(comptime string: []const u8) usize {
    var byte_index: usize = 0;
    var num_codepoints: usize = 0;
    while (byte_index < string.len) {
        byte_index += std.unicode.utf8ByteSequenceLength(string[byte_index]) catch unreachable;
        num_codepoints += 1;
    }
    return num_codepoints;
}

/// Decodes `string` into an array holding one `u21` per code point.
pub fn utf8DecodeStringComptime(comptime string: []const u8) [utf8DecodeStringComptimeLen(string)]u21 {
    var decoded: [utf8DecodeStringComptimeLen(string)]u21 = undefined;
    if (decoded.len == 0) return decoded;
    var codepoints = std.unicode.Utf8View.initComptime(string).iterator();
    var next_index: usize = 0;
    while (codepoints.nextCodepoint()) |codepoint| {
        decoded[next_index] = codepoint;
        next_index += 1;
    }
    return decoded;
}

/// Writes a textual outline of `document` to `writer`: the quirks mode, the
/// document-level character data, the doctype (if any), and then every node
/// below the document element in depth-first order, one line per node.
/// `allocator` backs the traversal stack only.
pub fn printDocument(writer: anytype, document: *const Document, dom: *const Dom, allocator: Allocator) !void {
    try std.fmt.format(writer, "Document: {s}\n", .{@tagName(document.quirks_mode)});

    try printDocumentCdatas(writer, document, 0);

    if (document.doctype) |doctype| {
        // NOTE(review): runs of whitespace inside the string literals of this function
        // (here and in the indentation loop below) may have been collapsed; confirm upstream.
        try std.fmt.format(writer, " DocumentType: name={s} publicId={s} systemId={s}\n", .{ doctype.name, doctype.publicId, doctype.systemId });
    }

    try printDocumentCdatas(writer, document, 1);

    const ConstElementOrCharacterData = union(enum) {
        element: *const Element,
        cdata: *const CharacterData,
    };
    var pending = ArrayListUnmanaged(struct { node: ConstElementOrCharacterData, depth: usize }){};
    defer pending.deinit(allocator);

    if (document.element) |document_element| {
        try pending.append(allocator, .{ .node = .{ .element = document_element }, .depth = 1 });
    }

    while (pending.items.len > 0) {
        const item = pending.pop().?;

        // One indentation unit per level of depth.
        for (0..item.depth) |_| {
            try std.fmt.format(writer, " ", .{});
        }

        switch (item.node) {
            .cdata => |cdata| try printCdata(writer, cdata),
            .element => |element| {
                try std.fmt.format(writer, "Element: type={s} local_name={s} namespace={s} attributes=[", .{
                    @tagName(element.element_type),
                    element.localName(dom),
                    @tagName(element.namespace()),
                });
                if (element.numAttributes() > 0) {
                    try writer.writeAll(" ");
                    const attribute_slice = element.attributes.slice();
                    for (attribute_slice.items(.key), attribute_slice.items(.value)) |key, value| {
                        if (key.prefix == .none) {
                            try std.fmt.format(writer, "\"{s}\"=\"{s}\" ", .{ key.local_name, value });
                        } else {
                            try std.fmt.format(writer, "\"{s}:{s}\"=\"{s}\" ", .{ @tagName(key.prefix), key.local_name, value });
                        }
                    }
                }
                try std.fmt.format(writer, "]\n", .{});

                // Push the children last-to-first so that they pop first-to-last.
                var remaining = element.children.items.len;
                while (remaining > 0) : (remaining -= 1) {
                    const child: ConstElementOrCharacterData = switch (element.children.items[remaining - 1]) {
                        .element => |e| .{ .element = e },
                        .cdata => |c| .{ .cdata = c },
                    };
                    try pending.append(allocator, .{ .node = child, .depth = item.depth + 1 });
                }
            },
        }
    }

    try printDocumentCdatas(writer, document, 2);
}

/// Prints the document-level character data selected by the endpoint at `endpoint_index`.
fn printDocumentCdatas(writer: anytype, document: *const Document, endpoint_index: u2) !void {
    const endpoint = document.cdata_endpoints[endpoint_index];
    const selected = endpoint.sliceOf(document.cdata.items);
    for (selected) |cdata| {
        try printCdata(writer, cdata);
    }
}

/// Prints one character data node as `Text: "..."` or `Comment: "..."`, with its data escaped.
fn printCdata(writer: anytype, cdata: *const CharacterData) !void {
    const label = switch (cdata.interface) {
        .text => "Text",
        .comment => "Comment",
    };
    try std.fmt.format(writer, "{s}: \"{}\"\n", .{ label, std.zig.fmtEscapes(cdata.data.items) });
}

--------------------------------------------------------------------------------
/test/html5lib-test-tokenizer.zig:
--------------------------------------------------------------------------------
// Copyright (C) 2021-2024 Chadwain Holness
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

// Parts of this file were copied from
// https://github.com/watzon/zhtml, which is MIT (Expat) licensed.
// A copyright notice is included below.
9 | // 10 | // Copyright 2020 Chris Watson 11 | // 12 | // Permission is hereby granted, free of charge, to any person obtaining a 13 | // copy of this software and associated documentation files (the "Software"), 14 | // to deal in the Software without restriction, including without limitation 15 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 16 | // and/or sell copies of the Software, and to permit persons to whom the 17 | // Software is furnished to do so, subject to the following conditions: 18 | // 19 | // The above copyright notice and this permission notice shall be included 20 | // in all copies or substantial portions of the Software. 21 | // 22 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 23 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 24 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 25 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 26 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 27 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 28 | // DEALINGS IN THE SOFTWARE. 

const std = @import("std");
const testing = std.testing;
const ArrayList = std.ArrayList;
const Allocator = std.mem.Allocator;
const Progress = std.Progress;

const rem = @import("rem");
const Token = rem.token.Token;
// The token module has no `AttributeSet` declaration; the attribute map of a
// start tag is declared as `TokenStartTag.Attributes`, reachable here through
// the `Token.StartTag` alias.
const AttributeSet = Token.StartTag.Attributes;
const Tokenizer = rem.Tokenizer;
const TokenizerState = Tokenizer.State;
const LastStartTag = Tokenizer.LastStartTag;
const Parser = rem.Parser;
const ParseError = Parser.ParseError;
const ErrorHandler = Parser.ErrorHandler;

// Refuse to build with Zig versions older than the minimum below.
comptime {
    const min_supported_zig_version_string = "0.12.0-dev.91+a155e3585";
    const min_supported_zig_version = std.SemanticVersion.parse(min_supported_zig_version_string) catch unreachable;
    const current_zig_version = @import("builtin").zig_version;
    if (current_zig_version.order(min_supported_zig_version) == .lt) {
        const current_zig_version_string = @import("builtin").zig_version_string;
        @compileError("Due to a regression in your current Zig version (" ++ current_zig_version_string ++ "), this test has been disabled.\n" ++
            "Please use Zig version " ++ min_supported_zig_version_string ++ " or newer and try again.\n" ++
            "See https://github.com/ziglang/zig/issues/16828 for more info.");
    }
}

// Each test below runs every case of one html5lib tokenizer test file.

test "content model flags" {
    try runTestFile("test/html5lib-tests/tokenizer/contentModelFlags.test");
}

test "domjs" {
    try runTestFile("test/html5lib-tests/tokenizer/domjs.test");
}

test "entities" {
    try runTestFile("test/html5lib-tests/tokenizer/entities.test");
}

test "escape flag" {
    try runTestFile("test/html5lib-tests/tokenizer/escapeFlag.test");
}

test "named entities" {
    try runTestFile("test/html5lib-tests/tokenizer/namedEntities.test");
}

test "numeric entities" {
    try runTestFile("test/html5lib-tests/tokenizer/numericEntities.test");
}

82 | test "pending spec changes" { 83 | try runTestFile("test/html5lib-tests/tokenizer/pendingSpecChanges.test"); 84 | } 85 | 86 | test "test 1" { 87 | try runTestFile("test/html5lib-tests/tokenizer/test1.test"); 88 | } 89 | 90 | test "test 2" { 91 | try runTestFile("test/html5lib-tests/tokenizer/test2.test"); 92 | } 93 | 94 | test "test 3" { 95 | try runTestFile("test/html5lib-tests/tokenizer/test3.test"); 96 | } 97 | 98 | test "test 4" { 99 | try runTestFile("test/html5lib-tests/tokenizer/test4.test"); 100 | } 101 | 102 | test "unicode chars" { 103 | try runTestFile("test/html5lib-tests/tokenizer/unicodeChars.test"); 104 | } 105 | 106 | test "unicode chars problematic" { 107 | try runTestFile("test/html5lib-tests/tokenizer/unicodeCharsProblematic.test"); 108 | } 109 | 110 | // TODO: Not supported at the moment. 111 | // test "xml violation" { 112 | // try runTestFile("test/html5lib-tests/tokenizer/xmlViolation.test"); 113 | // } 114 | 115 | var may_prog_root: ?std.Progress.Node = null; 116 | 117 | fn runTestFile(file_path: []const u8) !void { 118 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 119 | defer std.debug.assert(gpa.deinit() == .ok); 120 | const gpa_allocator = gpa.allocator(); 121 | 122 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); 123 | defer arena.deinit(); 124 | const arena_allocator = arena.allocator(); 125 | 126 | const contents = try std.fs.cwd().readFileAlloc(arena_allocator, file_path, std.math.maxInt(usize)); 127 | defer arena_allocator.free(contents); 128 | var tree = try std.json.parseFromSlice(std.json.Value, arena_allocator, contents, .{}); 129 | defer tree.deinit(); 130 | 131 | const tests = tree.value.object.get("tests").?.array; 132 | if (may_prog_root == null) { 133 | may_prog_root = Progress.start(.{ .root_name = "", .estimated_total_items = tests.items.len }); 134 | } 135 | 136 | const prog_root = may_prog_root.?; 137 | 138 | for (tests.items) |test_obj| { 139 | const description = 
test_obj.object.get("description").?.string; 140 | 141 | var states: [6]TokenizerState = undefined; 142 | var num_states: usize = 0; 143 | if (test_obj.object.get("initialStates")) |initial_states_obj| { 144 | for (initial_states_obj.array.items) |initial_state_val| { 145 | states[num_states] = parseInitialState(initial_state_val.string); 146 | num_states += 1; 147 | } 148 | } else { 149 | states[0] = .Data; 150 | num_states = 1; 151 | } 152 | 153 | var prog_task = prog_root.start(description, num_states); 154 | 155 | const double_escaped = if (test_obj.object.get("doubleEscaped")) |de| de.bool else false; 156 | const input_raw = test_obj.object.get("input").?.string; 157 | const input = try getStringDecoded(input_raw, arena_allocator, double_escaped); 158 | defer arena_allocator.free(input); 159 | const expected_tokens = try parseOutput(arena_allocator, test_obj.object.get("output").?.array.items, double_escaped); 160 | defer expected_tokens.deinit(); 161 | const expected_errors = blk: { 162 | if (test_obj.object.get("errors")) |errors_obj| { 163 | break :blk try parseErrors(arena_allocator, errors_obj.array.items); 164 | } else { 165 | break :blk std.ArrayList(ErrorInfo).init(arena_allocator); 166 | } 167 | }; 168 | defer expected_errors.deinit(); 169 | const last_start_tag = blk: { 170 | const string = if (test_obj.object.get("lastStartTag")) |lastStartTagObj| lastStartTagObj.string else break :blk null; 171 | break :blk LastStartTag.fromString(string) orelse std.debug.panic("Unrecognized value for last_start_tag_name: \"{s}\"", .{string}); 172 | }; 173 | 174 | for (states[0..num_states]) |state| { 175 | runTest(gpa_allocator, input, expected_tokens.items, expected_errors.items, state, last_start_tag) catch |err| { 176 | std.debug.print("Test \"{s}\" with initial state \"{s}\" failed\nInput: \"{s}\"\n", .{ description, @tagName(state), input_raw }); 177 | return err; 178 | }; 179 | prog_task.completeOne(); 180 | } 181 | 182 | prog_task.end(); 183 | } 184 | 185 | 
prog_root.end(); 186 | } 187 | 188 | fn runTest( 189 | allocator: Allocator, 190 | input: []const u21, 191 | expected_tokens: []Token, 192 | expected_errors: []ErrorInfo, 193 | initial_state: TokenizerState, 194 | last_start_tag: ?LastStartTag, 195 | ) !void { 196 | var all_tokens = ArrayList(Token).init(allocator); 197 | defer { 198 | for (all_tokens.items) |*t| t.deinit(allocator); 199 | all_tokens.deinit(); 200 | } 201 | 202 | var parser = try Parser.initTokenizerOnly(input, allocator, .report, initial_state, last_start_tag); 203 | defer parser.deinitTokenizerOnly(); 204 | 205 | try parser.runTokenizerOnly(&all_tokens); 206 | 207 | try std.testing.expect(all_tokens.items[all_tokens.items.len - 1] == .eof); 208 | std.testing.expectEqual(expected_tokens.len, all_tokens.items.len - 1) catch { 209 | std.debug.print( 210 | "Unequal number of tokens\n Expected {}: {any}\n Actual {}: {any}\n", 211 | .{ expected_tokens.len, expected_tokens, all_tokens.items.len - 1, all_tokens.items[0 .. all_tokens.items.len - 1] }, 212 | ); 213 | return error.UnequalNumberOfTokens; 214 | }; 215 | for (expected_tokens, all_tokens.items[0 .. 
all_tokens.items.len - 1]) |expected, actual| { 216 | expectEqualTokens(expected, actual) catch { 217 | std.debug.print("Mismatched tokens\n Expected: {any}\n Actual: {any}\n", .{ expected, actual }); 218 | return error.MismatchedTokens; 219 | }; 220 | } 221 | 222 | const all_errors = parser.errors(); 223 | 224 | std.testing.expectEqual(expected_errors.len, all_errors.len) catch { 225 | std.debug.print( 226 | "Unequal number of parse errors\n Expected {}: {any}\n Actual {}: {any}\n", 227 | .{ expected_errors.len, expected_errors, all_errors.len, all_errors }, 228 | ); 229 | return error.UnequalNumberOfParseErrors; 230 | }; 231 | for (expected_errors, all_errors) |expected, actual| { 232 | const actual_string = ErrorInfo.errorToSpecId(actual); 233 | testing.expectEqualSlices(u8, expected.id, actual_string) catch { 234 | std.debug.print( 235 | "Mismatched parse errors\n Expected: {s}\n Actual: {s}\n", 236 | .{ expected.id, actual_string }, 237 | ); 238 | return error.MismatchedParseErrors; 239 | }; 240 | } 241 | } 242 | 243 | fn parseOutput(allocator: Allocator, outputs: []const std.json.Value, double_escaped: bool) !std.ArrayList(Token) { 244 | var tokens = try std.ArrayList(Token).initCapacity(allocator, outputs.len); 245 | for (outputs) |output_obj| { 246 | const output_array = output_obj.array.items; 247 | const token_type_str = output_array[0].string; 248 | 249 | if (std.mem.eql(u8, token_type_str, "DOCTYPE")) { 250 | // ["DOCTYPE", name, public_id, system_id, correctness] 251 | try tokens.append(Token{ 252 | .doctype = .{ 253 | .name = if (output_array[1] == .null) null else try getString(output_array[1].string, allocator, double_escaped), 254 | // public_id and system_id are either strings or null. 
255 | .public_identifier = if (output_array[2] == .null) null else try getString(output_array[2].string, allocator, double_escaped), 256 | .system_identifier = if (output_array[3] == .null) null else try getString(output_array[3].string, allocator, double_escaped), 257 | // correctness is either true or false; true corresponds to the force-quirks flag being false, and vice-versa. 258 | .force_quirks = !output_array[4].bool, 259 | }, 260 | }); 261 | } else if (std.mem.eql(u8, token_type_str, "StartTag")) { 262 | // ["StartTag", name, {attributes}*, true*] 263 | // ["StartTag", name, {attributes}] 264 | const attributes_obj = output_array[2].object; 265 | var token = Token{ 266 | .start_tag = .{ 267 | .name = try getString(output_array[1].string, allocator, double_escaped), 268 | // When the self-closing flag is set, the StartTag array has true as its fourth entry. 269 | // When the flag is not set, the array has only three entries for backwards compatibility. 270 | .self_closing = if (output_array.len == 3) false else output_array[3].bool, 271 | .attributes = .{}, 272 | }, 273 | }; 274 | var attributes_obj_it = attributes_obj.iterator(); 275 | while (attributes_obj_it.next()) |attribute_entry| { 276 | try token.start_tag.attributes.put( 277 | allocator, 278 | try getString(attribute_entry.key_ptr.*, allocator, double_escaped), 279 | try getString(attribute_entry.value_ptr.string, allocator, double_escaped), 280 | ); 281 | } 282 | try tokens.append(token); 283 | } else if (std.mem.eql(u8, token_type_str, "EndTag")) { 284 | // ["EndTag", name] 285 | try tokens.append(Token{ 286 | .end_tag = .{ 287 | .name = try getString(output_array[1].string, allocator, double_escaped), 288 | }, 289 | }); 290 | } else if (std.mem.eql(u8, token_type_str, "Comment")) { 291 | // ["Comment", data] 292 | try tokens.append(Token{ 293 | .comment = .{ .data = try getString(output_array[1].string, allocator, double_escaped) }, 294 | }); 295 | } else if (std.mem.eql(u8, token_type_str, 
"Character")) { 296 | // ["Character", data] 297 | // All adjacent character tokens are coalesced into a single ["Character", data] token. 298 | const decoded = try getStringDecoded(output_array[1].string, allocator, double_escaped); 299 | defer allocator.free(decoded); 300 | for (decoded) |c| { 301 | try tokens.append(Token{ .character = .{ .data = c } }); 302 | } 303 | } 304 | } 305 | return tokens; 306 | } 307 | 308 | fn parseErrors(allocator: Allocator, errors: []const std.json.Value) !std.ArrayList(ErrorInfo) { 309 | var error_infos = try std.ArrayList(ErrorInfo).initCapacity(allocator, errors.len); 310 | for (errors) |error_obj| { 311 | const err_string = error_obj.object.get("code").?.string; 312 | error_infos.appendAssumeCapacity(ErrorInfo{ 313 | .id = err_string, 314 | }); 315 | } 316 | return error_infos; 317 | } 318 | 319 | fn parseInitialState(str: []const u8) TokenizerState { 320 | const map = std.StaticStringMap(TokenizerState).initComptime(.{ 321 | .{ "Data state", TokenizerState.Data }, 322 | .{ "PLAINTEXT state", TokenizerState.PLAINTEXT }, 323 | .{ "RCDATA state", TokenizerState.RCDATA }, 324 | .{ "RAWTEXT state", TokenizerState.RAWTEXT }, 325 | .{ "Script data state", TokenizerState.ScriptData }, 326 | .{ "CDATA section state", TokenizerState.CDATASection }, 327 | }); 328 | return map.get(str).?; 329 | } 330 | 331 | fn expectEqualAttributes(expected: AttributeSet, actual: AttributeSet) !void { 332 | try testing.expectEqual(expected.count(), actual.count()); 333 | var expected_it = expected.iterator(); 334 | while (expected_it.next()) |entry| { 335 | const expected_value = entry.value_ptr.*; 336 | const actual_value = actual.get(entry.key_ptr.*); 337 | try testing.expect(actual_value != null); 338 | try testing.expectEqualSlices(u8, expected_value, actual_value.?); 339 | } 340 | } 341 | 342 | fn expectEqualNullableSlices(comptime T: type, expected: ?[]const T, actual: ?[]const T) !void { 343 | if (expected) |e| { 344 | try testing.expect(actual != 
null); 345 | try testing.expectEqualSlices(T, e, actual.?); 346 | } else { 347 | try testing.expectEqual(expected, actual); 348 | } 349 | } 350 | 351 | fn expectEqualTokens(expected: Token, actual: Token) !void { 352 | try testing.expect(expected.eql(actual)); 353 | } 354 | 355 | const ErrorInfo = struct { 356 | id: []const u8, 357 | //line: usize, 358 | //column: usize, 359 | 360 | fn errorToSpecId(err: ParseError) []const u8 { 361 | // there might be a cleverer way to do this but oh well 362 | return switch (err) { 363 | ParseError.AbruptClosingOfEmptyComment => "abrupt-closing-of-empty-comment", 364 | ParseError.AbruptDOCTYPEPublicIdentifier => "abrupt-doctype-public-identifier", 365 | ParseError.AbruptDOCTYPESystemIdentifier => "abrupt-doctype-system-identifier", 366 | ParseError.AbsenceOfDigitsInNumericCharacterReference => "absence-of-digits-in-numeric-character-reference", 367 | ParseError.CDATAInHtmlContent => "cdata-in-html-content", 368 | ParseError.CharacterReferenceOutsideUnicodeRange => "character-reference-outside-unicode-range", 369 | ParseError.ControlCharacterInInputStream => "control-character-in-input-stream", 370 | ParseError.ControlCharacterReference => "control-character-reference", 371 | ParseError.EndTagWithAttributes => "end-tag-with-attributes", 372 | ParseError.DuplicateAttribute => "duplicate-attribute", 373 | ParseError.EndTagWithTrailingSolidus => "end-tag-with-trailing-solidus", 374 | ParseError.EOFBeforeTagName => "eof-before-tag-name", 375 | ParseError.EOFInCDATA => "eof-in-cdata", 376 | ParseError.EOFInComment => "eof-in-comment", 377 | ParseError.EOFInDOCTYPE => "eof-in-doctype", 378 | ParseError.EOFInScriptHtmlCommentLikeText => "eof-in-script-html-comment-like-text", 379 | ParseError.EOFInTag => "eof-in-tag", 380 | ParseError.IncorrectlyClosedComment => "incorrectly-closed-comment", 381 | ParseError.IncorrectlyOpenedComment => "incorrectly-opened-comment", 382 | ParseError.InvalidCharacterSequenceAfterDOCTYPEName => 
"invalid-character-sequence-after-doctype-name", 383 | ParseError.InvalidFirstCharacterOfTagName => "invalid-first-character-of-tag-name", 384 | ParseError.MissingAttributeValue => "missing-attribute-value", 385 | ParseError.MissingDOCTYPEName => "missing-doctype-name", 386 | ParseError.MissingDOCTYPEPublicIdentifier => "missing-doctype-public-identifier", 387 | ParseError.MissingDOCTYPESystemIdentifier => "missing-doctype-system-identifier", 388 | ParseError.MissingEndTagName => "missing-end-tag-name", 389 | ParseError.MissingQuoteBeforeDOCTYPEPublicIdentifier => "missing-quote-before-doctype-public-identifier", 390 | ParseError.MissingQuoteBeforeDOCTYPESystemIdentifier => "missing-quote-before-doctype-system-identifier", 391 | ParseError.MissingSemicolonAfterCharacterReference => "missing-semicolon-after-character-reference", 392 | ParseError.MissingWhitespaceAfterDOCTYPEPublicKeyword => "missing-whitespace-after-doctype-public-keyword", 393 | ParseError.MissingWhitespaceAfterDOCTYPESystemKeyword => "missing-whitespace-after-doctype-system-keyword", 394 | ParseError.MissingWhitespaceBeforeDOCTYPEName => "missing-whitespace-before-doctype-name", 395 | ParseError.MissingWhitespaceBetweenAttributes => "missing-whitespace-between-attributes", 396 | ParseError.MissingWhitespaceBetweenDOCTYPEPublicAndSystemIdentifiers => "missing-whitespace-between-doctype-public-and-system-identifiers", 397 | ParseError.NestedComment => "nested-comment", 398 | ParseError.NoncharacterCharacterReference => "noncharacter-character-reference", 399 | ParseError.NoncharacterInInputStream => "noncharacter-in-input-stream", 400 | ParseError.NullCharacterReference => "null-character-reference", 401 | ParseError.SurrogateCharacterReference => "surrogate-character-reference", 402 | ParseError.SurrogateInInputStream => "surrogate-in-input-stream", 403 | ParseError.UnexpectedCharacterAfterDOCTYPESystemIdentifier => "unexpected-character-after-doctype-system-identifier", 404 | 
/// Undoes the "double escaping" of `string`: every `\uXXXX` sequence (a
/// backslash, `u`, then exactly four hexadecimal digits) is replaced by the
/// code point it names; everything else is copied through.
///
/// `Char` selects the element type of the result:
///   - `u8`:  an escaped code point is appended UTF-8 encoded.
///   - `u21`: an escaped code point is appended as a single element.
/// Any other `Char` is a compile error.
///
/// A backslash followed by anything other than `u` is kept together with the
/// byte after it. A backslash or `\u` that ends the input is kept verbatim
/// rather than being silently dropped.
///
/// The caller owns the returned slice and must free it with `allocator`.
///
/// NOTE(review): with `Char == u21`, bytes outside of escapes are widened one
/// by one rather than UTF-8 decoded; that is only correct for ASCII input.
fn doubleEscape(comptime Char: type, allocator: Allocator, string: []const u8) ![]Char {
    var result = std.ArrayList(Char).init(allocator);
    // Releases the buffer if an append fails part-way through.
    defer result.deinit();

    var state: enum { Data, Backslash, Unicode } = .Data;
    var pos: usize = 0;
    while (pos < string.len) {
        switch (state) {
            .Data => {
                switch (string[pos]) {
                    '\\' => state = .Backslash,
                    else => |c| try result.append(c),
                }
                pos += 1;
            },
            .Backslash => {
                switch (string[pos]) {
                    'u' => state = .Unicode,
                    else => |c| {
                        // Not a unicode escape: keep the backslash and the byte after it.
                        try result.appendSlice(&.{ '\\', c });
                        state = .Data;
                    },
                }
                pos += 1;
            },
            .Unicode => {
                // The four bytes after `\u` must be hex digits; anything else is
                // treated as a bug in the test data.
                const codepoint = std.fmt.parseUnsigned(u21, string[pos .. pos + 4], 16) catch unreachable;
                switch (Char) {
                    u8 => {
                        var code_units: [4]u8 = undefined;
                        const len = std.unicode.utf8Encode(codepoint, &code_units) catch unreachable;
                        try result.appendSlice(code_units[0..len]);
                    },
                    u21 => try result.append(codepoint),
                    else => @compileError("doubleEscape: Char must be u8 or u21"),
                }
                state = .Data;
                pos += 4;
            },
        }
    }

    // The input ended in the middle of an escape: emit what was consumed so
    // that no input is lost.
    switch (state) {
        .Data => {},
        .Backslash => try result.append('\\'),
        .Unicode => try result.appendSlice(&[_]Char{ '\\', 'u' }),
    }

    return try result.toOwnedSlice();
}
/// True when `str1` and `str2` contain exactly the same bytes.
fn eql(str1: []const u8, str2: []const u8) bool {
    return str1.len == str2.len and std.mem.startsWith(u8, str1, str2);
}

/// True when the first `str2.len` bytes of `str1` are `str2`.
fn startsWith(str1: []const u8, str2: []const u8) bool {
    if (str2.len > str1.len) return false;
    return std.mem.eql(u8, str1[0..str2.len], str2);
}

/// True when the last `str2.len` bytes of `str1` are `str2`.
fn endsWith(str1: []const u8, str2: []const u8) bool {
    if (str2.len > str1.len) return false;
    return std.mem.eql(u8, str1[str1.len - str2.len ..], str2);
}
false); 51 | try runTestFile("test/html5lib-tests/tree-construction/html5test-com.dat", false); 52 | try runTestFile("test/html5lib-tests/tree-construction/inbody01.dat", false); 53 | try runTestFile("test/html5lib-tests/tree-construction/isindex.dat", false); 54 | try runTestFile("test/html5lib-tests/tree-construction/main-element.dat", false); 55 | try runTestFile("test/html5lib-tests/tree-construction/math.dat", false); 56 | try runTestFile("test/html5lib-tests/tree-construction/menuitem-element.dat", false); 57 | try runTestFile("test/html5lib-tests/tree-construction/namespace-sensitivity.dat", false); 58 | try runTestFile("test/html5lib-tests/tree-construction/noscript01.dat", false); 59 | try runTestFile("test/html5lib-tests/tree-construction/pending-spec-changes-plain-text-unsafe.dat", false); 60 | try runTestFile("test/html5lib-tests/tree-construction/pending-spec-changes.dat", false); 61 | try runTestFile("test/html5lib-tests/tree-construction/plain-text-unsafe.dat", false); 62 | try runTestFile("test/html5lib-tests/tree-construction/ruby.dat", false); 63 | try runTestFile("test/html5lib-tests/tree-construction/scriptdata01.dat", false); 64 | try runTestFile("test/html5lib-tests/tree-construction/svg.dat", false); 65 | try runTestFile("test/html5lib-tests/tree-construction/tables01.dat", false); 66 | try runTestFile("test/html5lib-tests/tree-construction/template.dat", false); 67 | try runTestFile("test/html5lib-tests/tree-construction/tests1.dat", false); 68 | try runTestFile("test/html5lib-tests/tree-construction/tests2.dat", false); 69 | try runTestFile("test/html5lib-tests/tree-construction/tests3.dat", false); 70 | try runTestFile("test/html5lib-tests/tree-construction/tests4.dat", false); 71 | try runTestFile("test/html5lib-tests/tree-construction/tests5.dat", false); 72 | try runTestFile("test/html5lib-tests/tree-construction/tests6.dat", false); 73 | try runTestFile("test/html5lib-tests/tree-construction/tests7.dat", false); 74 | try 
runTestFile("test/html5lib-tests/tree-construction/tests8.dat", false); 75 | try runTestFile("test/html5lib-tests/tree-construction/tests9.dat", false); 76 | try runTestFile("test/html5lib-tests/tree-construction/tests10.dat", false); 77 | try runTestFile("test/html5lib-tests/tree-construction/tests11.dat", false); 78 | try runTestFile("test/html5lib-tests/tree-construction/tests12.dat", false); 79 | try runTestFile("test/html5lib-tests/tree-construction/tests14.dat", false); 80 | try runTestFile("test/html5lib-tests/tree-construction/tests15.dat", false); 81 | try runTestFile("test/html5lib-tests/tree-construction/tests16.dat", false); 82 | try runTestFile("test/html5lib-tests/tree-construction/tests17.dat", false); 83 | try runTestFile("test/html5lib-tests/tree-construction/tests18.dat", false); 84 | try runTestFile("test/html5lib-tests/tree-construction/tests19.dat", false); 85 | try runTestFile("test/html5lib-tests/tree-construction/tests20.dat", false); 86 | try runTestFile("test/html5lib-tests/tree-construction/tests21.dat", false); 87 | try runTestFile("test/html5lib-tests/tree-construction/tests22.dat", false); 88 | try runTestFile("test/html5lib-tests/tree-construction/tests23.dat", false); 89 | try runTestFile("test/html5lib-tests/tree-construction/tests24.dat", false); 90 | try runTestFile("test/html5lib-tests/tree-construction/tests25.dat", false); 91 | try runTestFile("test/html5lib-tests/tree-construction/tests26.dat", false); 92 | try runTestFile("test/html5lib-tests/tree-construction/tests_innerHTML_1.dat", false); 93 | try runTestFile("test/html5lib-tests/tree-construction/tricky01.dat", false); 94 | try runTestFile("test/html5lib-tests/tree-construction/webkit01.dat", false); 95 | try runTestFile("test/html5lib-tests/tree-construction/webkit02.dat", false); 96 | try runTestFile("test/html5lib-tests/tree-construction/scripted/adoption01.dat", false); 97 | try runTestFile("test/html5lib-tests/tree-construction/scripted/ark.dat", false); 98 | try 
runTestFile("test/html5lib-tests/tree-construction/scripted/webkit01.dat", false); 99 | } 100 | 101 | test "html5lib-tests tree construction with scripting" { 102 | // Tests that are being skipped out are not passing. 103 | // The goal of course is to skip none of them. 104 | 105 | // NOTE: All of the skipped tests fail for 1 of these reasons: 106 | // 1. Finding a "script" end tag token in the "text" insertion mode 107 | // 2. Finding an eof token while the current node is a script in the "text" insertion mode 108 | // 3. Finding a "script" end tag token in foreign content, while the current node is an SVG script 109 | 110 | try runTestFile("test/html5lib-tests/tree-construction/adoption01.dat", true); 111 | try runTestFile("test/html5lib-tests/tree-construction/adoption02.dat", true); 112 | try runTestFile("test/html5lib-tests/tree-construction/blocks.dat", true); 113 | try runTestFile("test/html5lib-tests/tree-construction/comments01.dat", true); 114 | try runTestFile("test/html5lib-tests/tree-construction/doctype01.dat", true); 115 | skipTestFile("test/html5lib-tests/tree-construction/domjs-unsafe.dat", true); 116 | try runTestFile("test/html5lib-tests/tree-construction/entities01.dat", true); 117 | try runTestFile("test/html5lib-tests/tree-construction/entities02.dat", true); 118 | try runTestFile("test/html5lib-tests/tree-construction/foreign-fragment.dat", true); 119 | try runTestFile("test/html5lib-tests/tree-construction/html5test-com.dat", true); 120 | try runTestFile("test/html5lib-tests/tree-construction/inbody01.dat", true); 121 | try runTestFile("test/html5lib-tests/tree-construction/isindex.dat", true); 122 | try runTestFile("test/html5lib-tests/tree-construction/main-element.dat", true); 123 | try runTestFile("test/html5lib-tests/tree-construction/math.dat", true); 124 | try runTestFile("test/html5lib-tests/tree-construction/menuitem-element.dat", true); 125 | try runTestFile("test/html5lib-tests/tree-construction/namespace-sensitivity.dat", true); 
126 | try runTestFile("test/html5lib-tests/tree-construction/noscript01.dat", true); 127 | try runTestFile("test/html5lib-tests/tree-construction/pending-spec-changes-plain-text-unsafe.dat", true); 128 | try runTestFile("test/html5lib-tests/tree-construction/pending-spec-changes.dat", true); 129 | try runTestFile("test/html5lib-tests/tree-construction/plain-text-unsafe.dat", true); 130 | try runTestFile("test/html5lib-tests/tree-construction/ruby.dat", true); 131 | skipTestFile("test/html5lib-tests/tree-construction/scriptdata01.dat", true); 132 | try runTestFile("test/html5lib-tests/tree-construction/svg.dat", true); 133 | try runTestFile("test/html5lib-tests/tree-construction/tables01.dat", true); 134 | try runTestFile("test/html5lib-tests/tree-construction/template.dat", true); 135 | skipTestFile("test/html5lib-tests/tree-construction/tests1.dat", true); 136 | skipTestFile("test/html5lib-tests/tree-construction/tests2.dat", true); 137 | skipTestFile("test/html5lib-tests/tree-construction/tests3.dat", true); 138 | try runTestFile("test/html5lib-tests/tree-construction/tests4.dat", true); 139 | skipTestFile("test/html5lib-tests/tree-construction/tests5.dat", true); 140 | try runTestFile("test/html5lib-tests/tree-construction/tests6.dat", true); 141 | skipTestFile("test/html5lib-tests/tree-construction/tests7.dat", true); 142 | try runTestFile("test/html5lib-tests/tree-construction/tests8.dat", true); 143 | try runTestFile("test/html5lib-tests/tree-construction/tests9.dat", true); 144 | skipTestFile("test/html5lib-tests/tree-construction/tests10.dat", true); 145 | try runTestFile("test/html5lib-tests/tree-construction/tests11.dat", true); 146 | try runTestFile("test/html5lib-tests/tree-construction/tests12.dat", true); 147 | try runTestFile("test/html5lib-tests/tree-construction/tests14.dat", true); 148 | try runTestFile("test/html5lib-tests/tree-construction/tests15.dat", true); 149 | skipTestFile("test/html5lib-tests/tree-construction/tests16.dat", true); 150 | 
/// Runs every test case in the html5lib tree-construction file at `file_path`
/// (opened relative to the current working directory, at most 1 << 24 bytes),
/// with the parser's scripting flag set to `scripting`.
///
/// A case is skipped, with a message, when:
///   - `createTest` reports `error.TemplateContents`, or
///   - the case is marked `#script-on` / `#script-off` and that does not match
///     `scripting`.
/// The first case that fails ends the run by returning its error. A summary
/// line with the total, passed and skipped counts is printed at the end.
fn runTestFile(file_path: []const u8, scripting: bool) !void {
    const scripting_string = if (scripting) "enabled" else "disabled";
    std.debug.print(
        \\
        \\Running the tests in file {s} (scripting {s})
        \\=======================================================================
        \\
    ,
        .{ file_path, scripting_string },
    );

    const allocator = std.testing.allocator;

    const contents: []const u8 = try std.fs.cwd().readFileAlloc(allocator, file_path, 1 << 24);
    defer allocator.free(contents);

    // A case carrying this marker cannot run under the current scripting mode.
    const excluded_marker: Test.ScriptOption = if (scripting) .off else .on;

    // `createTest` advances `tests` past the case it consumed.
    var tests = contents;
    // `count` is the 1-based number of the case being processed.
    var count: usize = 1;
    var passed: usize = 0;
    while (tests.len > 0) {
        defer count += 1;
        var the_test = createTest(&tests, allocator) catch |err| switch (err) {
            // TODO: Don't skip any tests.
            error.TemplateContents => {
                std.debug.print("Test #{} (Skipped: Expected DOM tree contains templates)\n", .{count});
                continue;
            },
            error.OutOfMemory => |e| return e,
        };
        defer the_test.expected.deinit();

        if (the_test.script == excluded_marker) {
            std.debug.print("Test #{} (Skipped: Scripting must be {s} for this test)\n", .{ count, @tagName(excluded_marker) });
            continue;
        }

        try runTest(the_test, allocator, scripting);
        passed += 1;
    }
    std.debug.print("{} total, {} passed, {} skipped\n", .{ count - 1, passed, count - 1 - passed });
}
std.mem.splitScalar(u8, test_string.*, '\n'); 252 | defer test_string.* = lines.rest(); 253 | var section = lines.next().?; 254 | 255 | assert(eql(section, "#data")); 256 | var data: []const u8 = lines.rest()[0..0]; 257 | while (!startsWith(lines.rest(), "#errors")) { 258 | data.len += lines.next().?.len + 1; 259 | } 260 | if (data.len > 0) data.len -= 1; 261 | section = lines.next().?; 262 | //std.debug.print("#data\n{s}\n", .{data}); 263 | 264 | assert(eql(section, "#errors")); 265 | var errors: []const u8 = lines.rest()[0..0]; 266 | while (!startsWith(lines.rest(), "#")) { 267 | errors.len += lines.next().?.len + 1; 268 | } 269 | section = lines.next().?; 270 | //std.debug.print("#errors\n{s}", .{errors}); 271 | 272 | var new_errors: []const u8 = lines.rest()[0..0]; 273 | if (startsWith(section, "#new-errors")) { 274 | while (!startsWith(lines.rest(), "#")) { 275 | new_errors.len += lines.next().?.len + 1; 276 | } 277 | section = lines.next().?; 278 | //std.debug.print("#new-errors\n{s}", .{new_errors}); 279 | } 280 | 281 | var document_fragment: []const u8 = lines.rest()[0..0]; 282 | var context_element_type: ?ElementType = null; 283 | if (startsWith(section, "#document-fragment")) { 284 | document_fragment = lines.next().?; 285 | if (startsWith(document_fragment, "svg ")) { 286 | context_element_type = ElementType.fromStringSvg(document_fragment[4..]) orelse .some_other_svg; 287 | } else if (startsWith(document_fragment, "math ")) { 288 | context_element_type = ElementType.fromStringMathMl(document_fragment[5..]) orelse .some_other_mathml; 289 | } else { 290 | context_element_type = ElementType.fromStringHtml(document_fragment) orelse .custom_html; 291 | } 292 | section = lines.next().?; 293 | //std.debug.print("#document-fragment\n{s}\n", .{document_fragment}); 294 | } 295 | 296 | var script: Test.ScriptOption = .both; 297 | if (startsWith(section, "#script")) { 298 | if (eql(section, "#script-off")) { 299 | script = .off; 300 | } else if (eql(section, 
"#script-on")) { 301 | script = .on; 302 | } else { 303 | unreachable; 304 | } 305 | section = lines.next().?; 306 | } 307 | //std.debug.print("#script-{s}\n", .{@tagName(script)}); 308 | 309 | assert(eql(section, "#document")); 310 | //var document: []const u8 = lines.rest(); 311 | //std.debug.print("#document\n{s}\n", .{document}); 312 | 313 | const expected = parseDom(&lines, context_element_type, allocator) catch |err| switch (err) { 314 | error.DomException => unreachable, 315 | else => |e| return e, 316 | }; 317 | // var stderr = std.io.getStdErr().writer(); 318 | // rem.util.printDocument(stderr, expected.document, &expected.dom, allocator) catch panic("", .{}); 319 | 320 | return Test{ 321 | .input = data, 322 | .errors = std.mem.count(u8, errors, "\n"), 323 | .new_errors = std.mem.count(u8, new_errors, "\n"), 324 | .script = script, 325 | .expected = expected, 326 | }; 327 | } 328 | 329 | fn parseDom(lines: *std.mem.SplitIterator(u8, .scalar), context_element_type: ?ElementType, allocator: Allocator) !Expected { 330 | var stack = ArrayList(*Element).init(allocator); 331 | defer stack.deinit(); 332 | 333 | var dom = Dom{ .allocator = allocator }; 334 | errdefer dom.deinit(); 335 | const document = try dom.makeDocument(); 336 | const fragment_context = if (context_element_type) |ty| try dom.makeElement(ty) else null; 337 | var possible_error: ?error{TemplateContents} = null; 338 | 339 | while (lines.next()) |line| { 340 | if (line.len == 0) { 341 | break; 342 | } 343 | assert(startsWith(line, "| ")); 344 | var first_char: usize = 2; 345 | while (line[first_char] == ' ') { 346 | first_char += 1; 347 | } 348 | const depth = @divExact(first_char, 2) - 1; 349 | assert(depth <= stack.items.len); 350 | if (depth < stack.items.len) { 351 | stack.shrinkRetainingCapacity(depth); 352 | } 353 | const data = line[first_char..]; 354 | 355 | if (startsWith(data, "").?; 360 | const name = data[name_start..name_end]; 361 | 362 | const doctype = if (data[name_end] == ' ') 
blk: { 363 | assert(data[name_end + 1] == '"'); 364 | const public_id_endquote = std.mem.indexOfScalarPos(u8, data, name_end + 2, '"').?; 365 | const public_id = data[name_end + 2 .. public_id_endquote]; 366 | assert(data[public_id_endquote + 1] == ' '); 367 | assert(data[public_id_endquote + 2] == '"'); 368 | const system_id_endquote = std.mem.indexOfScalarPos(u8, data, public_id_endquote + 3, '"').?; 369 | const system_id = data[public_id_endquote + 3 .. system_id_endquote]; 370 | 371 | break :blk try dom.makeDoctype(name, public_id, system_id); 372 | } else try dom.makeDoctype(name, null, null); 373 | try Dom.mutation.documentAppendDocumentType(&dom, document, doctype, .Suppress); 374 | } else if (startsWith(data, "")) { 381 | comment = comment[5 .. comment.len - 5]; 382 | break; 383 | } 384 | my_line = lines.next().?; 385 | } 386 | 387 | const cdata = try dom.makeCdata(comment, .comment); 388 | if (depth == 0) { 389 | if (fragment_context) |e| { 390 | try Dom.mutation.elementAppend(&dom, e, .{ .cdata = cdata }, .Suppress); 391 | } else { 392 | try Dom.mutation.documentAppendCdata(&dom, document, cdata, .Suppress); 393 | } 394 | } else { 395 | try Dom.mutation.elementAppend(&dom, stack.items[depth - 1], .{ .cdata = cdata }, .Suppress); 396 | } 397 | } else if (startsWith(data, "') { 403 | // nope, actually an attribute 404 | try parseAttribute(&dom, &stack, data, depth); 405 | continue; 406 | } 407 | const tag_name = data[1 .. 
data.len - 1]; 408 | 409 | var element: *Element = undefined; 410 | if (startsWith(tag_name, "svg ")) { 411 | const maybe_element_type = ElementType.fromStringSvg(tag_name[4..]); 412 | if (maybe_element_type) |t| { 413 | element = try dom.makeElement(t); 414 | } else { 415 | element = try dom.makeElement(.some_other_svg); 416 | try dom.registerLocalName(element, tag_name[4..]); 417 | } 418 | } else if (startsWith(tag_name, "math ")) { 419 | const maybe_element_type = ElementType.fromStringMathMl(tag_name[5..]); 420 | if (maybe_element_type) |t| { 421 | element = try dom.makeElement(t); 422 | } else { 423 | element = try dom.makeElement(.some_other_mathml); 424 | try dom.registerLocalName(element, tag_name[5..]); 425 | } 426 | } else { 427 | const maybe_element_type = ElementType.fromStringHtml(tag_name); 428 | if (maybe_element_type) |t| { 429 | element = try dom.makeElement(t); 430 | } else { 431 | element = try dom.makeElement(.custom_html); 432 | try dom.registerLocalName(element, tag_name); 433 | } 434 | } 435 | 436 | if (depth == 0) { 437 | if (fragment_context) |e| { 438 | try Dom.mutation.elementAppend(&dom, e, .{ .element = element }, .Suppress); 439 | } else { 440 | try Dom.mutation.documentAppendElement(&dom, document, element, .Suppress); 441 | } 442 | } else { 443 | try Dom.mutation.elementAppend(&dom, stack.items[stack.items.len - 1], .{ .element = element }, .Suppress); 444 | } 445 | try stack.append(element); 446 | } else if (data[0] == '"') { 447 | // text 448 | var text: []const u8 = data[0..0]; 449 | var my_line = data; 450 | while (true) { 451 | text.len += my_line.len + 1; 452 | if (text.len > 2 and endsWith(my_line, "\"")) { 453 | text = text[1 .. 
/// Parses one attribute line of an expected-tree dump, `name="value"`, and
/// appends it to the element on top of `stack`.
///
/// `depth` must equal the current stack height. A name starting with
/// `xlink `, `xml ` or `xmlns ` gets that prefix and namespace; every other
/// name gets `.none` for both.
///
/// NOTE(review): `local_name` is the whole name as written, including any
/// `xlink ` / `xml ` / `xmlns ` lead-in — confirm this is what the DOM expects.
fn parseAttribute(dom: *Dom, stack: *ArrayList(*Element), data: []const u8, depth: usize) !void {
    assert(depth == stack.items.len);

    const separator = std.mem.indexOfScalar(u8, data, '=') orelse unreachable;
    assert(data[separator + 1] == '"');
    assert(data[data.len - 1] == '"');
    const name = data[0..separator];
    // Everything between the opening and the closing quote.
    const quoted_value = data[separator + 2 .. data.len - 1];

    // Start out un-namespaced, then override for the three known prefixes.
    var key: ElementAttributesKey = .{ .local_name = name, .prefix = .none, .namespace = .none };
    if (startsWith(name, "xlink ")) {
        key.prefix = .xlink;
        key.namespace = .xlink;
    } else if (startsWith(name, "xml ")) {
        key.prefix = .xml;
        key.namespace = .xml;
    } else if (startsWith(name, "xmlns ")) {
        key.prefix = .xmlns;
        key.namespace = .xmlns;
    }

    const owner = stack.items[stack.items.len - 1];
    try owner.appendAttribute(dom.allocator, key, quoted_value);
}
/// Checks that `doc2` (the parser's output) matches `doc1` (the expected
/// document): same `cdata_endpoints`, same document-level character data, and,
/// when `doc1` has them, the same doctype and the same element tree.
///
/// The doctype and root element checks are one-directional on purpose: they
/// only run when `doc1` has the corresponding node.
fn deeplyCompareDocuments(allocator: Allocator, doc1: *const Document, doc2: *const Document) !void {
    //try expectEqual(doc1.quirks_mode, doc2.quirks_mode);
    comptime var i = 0;
    inline while (i < doc1.cdata_endpoints.len) : (i += 1) {
        try expectEqual(doc1.cdata_endpoints[i], doc2.cdata_endpoints[i]);
    }

    // A paired `for` requires equal lengths; report a mismatch as a test
    // failure instead of tripping a safety check.
    try expectEqual(doc1.cdata.items.len, doc2.cdata.items.len);
    for (doc1.cdata.items, doc2.cdata.items) |c1, c2| {
        try expectEqualCdatas(c1, c2);
    }

    if (doc1.doctype) |d1| {
        try expect(doc2.doctype != null);
        try expectEqualDoctypes(d1, doc2.doctype.?);
    }

    // Unwrap the root from `doc1`, not `doc2`, so the expected tree is
    // compared against the parsed one rather than the parsed one with itself.
    if (doc1.element) |e1| {
        try expect(doc2.element != null);
        try deeplyCompareElements(allocator, e1, doc2.element.?);
    }
}
/// Checks that two elements have the same element type and the same
/// attributes: equal attribute counts, and every key of `e1` present in `e2`
/// with an identical value. Children are not looked at here.
fn expectEqualElements(e1: *const Element, e2: *const Element) !void {
    // TODO: If the element type has an interface associated with it, check that for equality too.
    try expectEqual(e1.element_type, e2.element_type);

    const expected_attrs = e1.attributes.slice();
    const actual_attrs = e2.attributes.slice();
    try expectEqual(expected_attrs.len, actual_attrs.len);

    for (expected_attrs.items(.key), expected_attrs.items(.value)) |key, expected_value| {
        // Look the key up in `e2` so attribute order does not matter.
        const actual_value = e2.getAttribute(key);
        try expect(actual_value != null);
        try expectEqualStrings(expected_value, actual_value.?);
    }
}
/// Entry point of the generator.
///
/// Usage: `<program> <path to JSON data> <path of the zig file to write>`.
/// Reads the JSON file, builds the tree with `createTree`, renders it with
/// `render`, and writes the result to the output path. Every allocation comes
/// from one arena that is released on exit.
pub fn main() !void {
    var arena_allocator = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    const args = try std.process.argsAlloc(arena);
    // Both paths are required; report usage instead of indexing out of bounds.
    if (args.len < 3) {
        const program_name = if (args.len > 0) args[0] else "generate_named_characters";
        std.debug.print("usage: {s} <path to JSON data> <path of zig file to write>\n", .{program_name});
        return error.MissingArguments;
    }
    const input_file_path = args[1];
    const out_file_path = args[2];
    const cwd = std.fs.cwd();

    const input_data = blk: {
        const file = try cwd.openFile(input_file_path, .{});
        defer file.close();
        break :blk try file.readToEndAlloc(arena, std.math.maxInt(c_int));
    };

    const tree = try createTree(arena, input_data);
    const output = try render(&tree, arena);

    const out_file = try cwd.createFile(out_file_path, .{});
    defer out_file.close();
    try out_file.writeAll(output);

    std.debug.print("Generated named character reference data at {s}\n", .{out_file_path});
}
undefined }); 71 | return &node.children.items[insert_pos]; 72 | } 73 | 74 | fn searchForInsertPosition(key: u8, entries: []Entry) usize { 75 | var left: usize = 0; 76 | var right: usize = entries.len; 77 | 78 | while (left < right) { 79 | // Avoid overflowing in the midpoint calculation 80 | const mid = left + (right - left) / 2; 81 | // Compare the key with the midpoint element 82 | switch (std.math.order(key, entries[mid].key)) { 83 | .eq => unreachable, 84 | .gt => left = mid + 1, 85 | .lt => right = mid, 86 | } 87 | } 88 | 89 | return left; 90 | } 91 | }; 92 | 93 | const Item = struct { 94 | name: []const u8, 95 | characters: []const u8, 96 | }; 97 | 98 | fn createTree(arena: Allocator, input_data: []const u8) !Node { 99 | var parsed_json = try std.json.parseFromSlice(std.json.Value, arena, input_data, .{}); 100 | 101 | var node = Node{ .children = ArrayList(Node.Entry).init(arena), .input = std.ArrayList(Item).init(arena) }; 102 | var it = parsed_json.value.object.iterator(); 103 | while (it.next()) |o| { 104 | const name = o.key_ptr.*[1..]; 105 | try node.input.append(.{ .name = name, .characters = o.value_ptr.object.get("characters").?.string }); 106 | } 107 | 108 | try createChildren(arena, &node); 109 | return node; 110 | } 111 | 112 | fn createChildren(arena: Allocator, node: *Node) error{OutOfMemory}!void { 113 | for (node.input.items) |i| { 114 | const key = i.name[0]; 115 | const entry = try node.getOrCreateEntry(key); 116 | if (i.name.len > 1) { 117 | const child_node = try entry.getOrCreateChildNode(arena); 118 | try child_node.input.append(Item{ .name = i.name[1..], .characters = i.characters }); 119 | } else { 120 | entry.is_match = true; 121 | entry.characters = i.characters; 122 | } 123 | } 124 | for (node.children.items) |c| { 125 | if (c.node) |n| { 126 | try createChildren(arena, n); 127 | } 128 | } 129 | } 130 | 131 | fn render(root: *const Node, arena: Allocator) ![]u8 { 132 | const Entry = packed struct { has_value: bool, has_children: 
bool, index_of_children: u14 }; 133 | var entries = std.ArrayList(Entry).init(arena); 134 | 135 | const Child = packed struct { final: bool, char: u7 }; 136 | var children = std.ArrayList(Child).init(arena); 137 | 138 | const Value = struct { 139 | first: ?u21, 140 | second: ?u21, 141 | 142 | fn fromCharacters(characters: []const u8) @This() { 143 | const len1 = std.unicode.utf8ByteSequenceLength(characters[0]) catch unreachable; 144 | const first = std.unicode.utf8Decode(characters[0..len1]) catch unreachable; 145 | if (characters.len <= len1) return @This(){ .first = first, .second = null }; 146 | const len2 = std.unicode.utf8ByteSequenceLength(characters[len1]) catch unreachable; 147 | const second = std.unicode.utf8Decode(characters[len1 .. len1 + len2]) catch unreachable; 148 | return @This(){ .first = first, .second = second }; 149 | } 150 | }; 151 | var values = std.ArrayList(Value).init(arena); 152 | 153 | var stack = std.ArrayList(struct { node: ?*const Node, main_index: u16 }).init(arena); 154 | 155 | try stack.append(.{ .node = root, .main_index = 0 }); 156 | try values.append(.{ .first = null, .second = null }); 157 | try entries.append(.{ .has_value = false, .has_children = false, .index_of_children = undefined }); 158 | 159 | while (stack.items.len > 0) { 160 | const stack_item = stack.pop().?; 161 | const stack_len = stack.items.len; 162 | 163 | const main_index = stack_item.main_index; 164 | const entry = &entries.items[main_index]; 165 | const node = stack_item.node orelse { 166 | entry.has_children = false; 167 | entry.index_of_children = undefined; 168 | continue; 169 | }; 170 | 171 | entry.has_children = true; 172 | entry.index_of_children = @intCast(children.items.len); 173 | for (node.children.items, 0..) 
|c, i| { 174 | const child_main_index: u16 = @intCast(entries.items.len); 175 | try children.append(.{ .char = @intCast(c.key), .final = (i == node.children.items.len - 1) }); 176 | try stack.insert(stack_len, .{ .node = c.node, .main_index = child_main_index }); 177 | const child_entry = try entries.addOne(); 178 | 179 | child_entry.has_value = c.is_match; 180 | const value = if (c.is_match) Value.fromCharacters(c.characters) else Value{ .first = null, .second = null }; 181 | try values.append(value); 182 | } 183 | } 184 | 185 | var output = ArrayList(u8).init(arena); 186 | var writer = output.writer(); 187 | 188 | try writer.writeAll( 189 | \\//! This is an auto-generated file. 190 | \\ 191 | \\const std = @import("std"); 192 | \\ 193 | \\pub const Index = packed struct { 194 | \\ array_index: u14, 195 | \\ 196 | \\ pub fn entry(index: Index) Entry { 197 | \\ return @bitCast(entries[index.array_index]); 198 | \\ } 199 | \\ 200 | \\ pub fn value(index: Index) Value { 201 | \\ return values[index.array_index]; 202 | \\ } 203 | \\}; 204 | \\ 205 | \\pub const root_index = Index{ .array_index = 0 }; 206 | \\ 207 | \\pub const Entry = packed struct { 208 | \\ has_value: bool, 209 | \\ has_children: bool, 210 | \\ index_of_children: Index, 211 | \\ 212 | \\ pub fn findChild(entry: Entry, char: u21) ?Index { 213 | \\ std.debug.assert(entry.has_children); 214 | \\ const char_u7 = std.math.cast(u7, char) orelse return null; 215 | \\ 216 | \\ var i = entry.index_of_children.array_index; 217 | \\ while (true) : (i += 1) { 218 | \\ const child: Child = @bitCast(children[i]); 219 | \\ if (child.char == char_u7) { 220 | \\ return Index{ .array_index = i + 1 }; 221 | \\ } else if (child.final) { 222 | \\ break; 223 | \\ } 224 | \\ } 225 | \\ 226 | \\ return null; 227 | \\ } 228 | \\}; 229 | \\ 230 | \\const Child = packed struct { 231 | \\ final: bool, 232 | \\ char: u7, 233 | \\}; 234 | \\ 235 | \\comptime { 236 | \\ std.debug.assert(@bitSizeOf(Entry) == 16); 237 | \\ 
std.debug.assert(@bitSizeOf(Child) == 8); 238 | \\} 239 | \\ 240 | \\/// If the 1st field is null, then the current string does not match any named character references. 241 | \\/// Otherwise, there is a match, and the 2nd field may or may not be null. 242 | \\pub const Value = @Type(std.builtin.Type{ .@"struct" = .{ 243 | \\ .layout = .auto, 244 | \\ .fields = &.{ 245 | \\ .{ 246 | \\ .name = "0", 247 | \\ .type = ?u21, 248 | \\ .default_value_ptr = null, 249 | \\ .is_comptime = false, 250 | \\ .alignment = @alignOf(?u21), 251 | \\ }, 252 | \\ .{ 253 | \\ .name = "1", 254 | \\ .type = ?u21, 255 | \\ .default_value_ptr = null, 256 | \\ .is_comptime = false, 257 | \\ .alignment = @alignOf(?u21), 258 | \\ }, 259 | \\ }, 260 | \\ .decls = &.{}, 261 | \\ .is_tuple = true, 262 | \\} }); 263 | \\ 264 | ); 265 | 266 | try writer.print("\nconst entries = [{}]u16{{", .{entries.items.len}); 267 | for (entries.items, 0..) |entry, i| { 268 | if (i % 20 == 0) try writeNewline(writer); 269 | try writer.print("{}, ", .{@as(u16, @bitCast(entry))}); 270 | } 271 | try writer.writeAll("};\n"); 272 | 273 | try writer.print("\nconst children = [{}]u8{{", .{children.items.len}); 274 | for (children.items, 0..) |child, i| { 275 | if (i % 20 == 0) try writeNewline(writer); 276 | try writer.print("{}, ", .{@as(u8, @bitCast(child))}); 277 | } 278 | try writer.writeAll("};\n"); 279 | 280 | try writer.print("\nconst values = [{}]Value{{", .{values.items.len}); 281 | for (values.items, 0..) 
|value, i| { 282 | if (i % 5 == 0) try writeNewline(writer); 283 | try writer.writeAll("Value{"); 284 | if (value.first) |first| { 285 | try writer.print("@as(?u21, '\\u{{{X}}}')", .{first}); 286 | if (value.second) |second| { 287 | try writer.print(", @as(?u21, '\\u{{{X}}}')}}, ", .{second}); 288 | } else { 289 | try writer.writeAll(", @as(?u21, null)}, "); 290 | } 291 | } else { 292 | try writer.writeAll("null, null}, "); 293 | } 294 | } 295 | try writer.writeAll("};\n"); 296 | try writer.writeByte(0); 297 | 298 | var ast = try std.zig.Ast.parse(arena, output.items[0 .. output.items.len - 1 :0], .zig); 299 | return try ast.render(arena); 300 | } 301 | 302 | fn writeNewline(writer: anytype) !void { 303 | try writer.writeAll("\n "); 304 | } 305 | --------------------------------------------------------------------------------