├── .github └── FUNDING.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── package.json ├── src ├── dfa.mjs └── index.mjs └── test ├── data └── samples.mjs ├── lexer.test.mjs ├── module.test.js ├── module.test.mjs ├── style └── tokens.css ├── test.html └── test.mjs /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [alwinb] 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist/ 2 | ignore/ 3 | private/ 4 | node_modules/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. 
"Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. "Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. 
For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 
115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. 
You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. 
Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. 
Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. 
Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. 
This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. 
Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 
-------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all clean 2 | 3 | files = index.mjs dfa.mjs 4 | sources = $(addprefix src/, $(files)) 5 | 6 | #run: all 7 | # @ echo $(sources) 8 | 9 | all: dist/html-lexer.mjs dist/html-lexer.js 10 | 11 | dist/html-lexer.mjs: dist/ $(sources) Makefile 12 | @ echo "Making an ESModule" 13 | @ esbuild --bundle --minify --keep-names --format=esm src/index.mjs > dist/html-lexer.mjs 14 | 15 | dist/html-lexer.js: dist/ $(sources) Makefile 16 | @ echo "Making an CommonJS bundle" 17 | @ esbuild --bundle --minify --keep-names --platform=node src/index.mjs > dist/html-lexer.js 18 | 19 | dist/: 20 | @ mkdir dist/ 21 | 22 | clean: 23 | @ echo "Removing dist/ directory" 24 | @ test -d dist/ && rm -r dist/ || exit 0 25 | 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | An HTML5 lexer for safe template languages 2 | ========================================== 3 | 4 | [![NPM version][npm-image]][npm-url] 5 | 6 | [npm-image]: https://img.shields.io/npm/v/html-lexer.svg 7 | [npm-url]: https://npmjs.org/package/html-lexer 8 | 9 | A standard compliant, incremental/ streaming HTML5 lexer. 10 | 11 | This is an HTML5 lexer designed to be used as a basis for safe and HTML-context 12 | aware template languages, IDEs or syntax highlighters. It is different from the 13 | other available tokenizers in that it preserves all the information of the 14 | input string, e.g. formatting, quotation style and other idiosyncrasies. It 15 | does so by producing annotated chunks of the input string rather than the 16 | slightly more high level tokens that are described in the specification. 17 | However, it does do so in a manner that is compatible with the language defined 18 | in the [HTML5 specification][1]. 
19 | 20 | [1]: https://html.spec.whatwg.org/multipage/syntax.html#tokenization 21 | 22 | The main motivation for this project is a jarring absence of safe HTML 23 | template languages. By safe, I mean that the template placeholders are typed 24 | according to their context, and that the template engine ensures that the 25 | strings that come to fill the placeholders are automatically and 26 | correctly escaped to yield valid HTML. 27 | 28 | Usage 29 | ----- 30 | 31 | The produced tokens are simply tuples (arrays) `[type, chunk]` of a token type 32 | and a chunk of the input string. 33 | 34 | The lexer has a ‘push parser’ API. 35 | The `Lexer` constructor takes as its single argument a delegate object with 36 | methods: `write (token)` and `end ()`. 37 | 38 | Example: 39 | 40 | ```javascript 41 | const Lexer = require ('html-lexer') 42 | 43 | const delegate = { 44 | write: (token) => console.log (token), 45 | end: () => null 46 | } 47 | 48 | const lexer = new Lexer (delegate) 49 | lexer.write ('

<h1>Hello, World</h1>

') 50 | lexer.end () 51 | ``` 52 | 53 | Results in: 54 | 55 | ```javascript 56 | [ 'startTagStart', '<' ] 57 | [ 'tagName', 'h1' ] 58 | [ 'tagEnd', '>' ] 59 | [ 'data', 'Hello,' ] 60 | [ 'space', ' ' ] 61 | [ 'data', 'World' ] 62 | [ 'endTagStart', '</' ] 63 | [ 'tagName', 'h1' ] 64 | [ 'tagEnd', '>' ] 65 | ``` 66 | 67 | The lexer is incremental: `delegate.write` will be called as soon as a token is 68 | available and you can split the input across multiple writes: 69 | 70 | ```javascript 71 | const lexer = new Lexer (delegate) 72 | lexer.write ('<h') 73 | lexer.write ('1>Hello, W') 74 | lexer.write ('orld</h1>') 75 | lexer.end () 76 | ``` 77 | 78 | 79 | Token types 80 | ----------- 81 | 82 | The tokens emitted are simple tuples `[type, chunk]`. 83 | The type of a token is just a string, and it is one of: 84 | 85 | - `attributeAssign` 86 | - `attributeName` 87 | - `attributeValueData` 88 | - `attributeValueEnd` 89 | - `attributeValueStart` 90 | - `bogusCharRef` 91 | - `charRefDecimal` 92 | - `charRefHex` 93 | - `charRefLegacy` 94 | - `charRefNamed` 95 | - `commentData` 96 | - `commentEndBogus` 97 | - `commentEnd` 98 | - `commentStartBogus` 99 | - `commentStart` 100 | - `data` 101 | - `endTagStart` 102 | - `lessThanSign` 103 | - `uncodedAmpersand` 104 | - `newline` 105 | - `nulls` 106 | - `plaintext` 107 | - `rawtext` 108 | - `rcdata` 109 | - `space` 110 | - `startTagStart` 111 | - `tagEndAutoclose` 112 | - `tagEnd` 113 | - `tagName` 114 | - `tagSpace` 115 | 116 | The `uncodedAmpersand` is emitted for ampersand `&` characters that *do not* start a character reference. 117 | 118 | The `tagSpace` is emitted for 'space' between attributes in 119 | element tags. 120 | 121 | Otherwise the names should be self explanatory. 122 | 123 | 124 | Limitations 125 | ----------- 126 | 127 | * Doctype 128 | The lexer still interprets doctypes as 'bogus comments'. 129 | 130 | * CDATA 131 | The lexer interprets CDATA sections as 'bogus comments'. 132 | (CDATA is only allowed in foreign content - svg and mathml.) 
133 | 134 | * Script tags 135 | The lexer interprets script tags as rawtext elements. 136 | This has no dire consequences, other than that html begin and 137 | end comment tags that may surround it, are not marked as such. 138 | 139 | 140 | Changelog 141 | ------------ 142 | 143 | ### 0.5.0 144 | 145 | The project has been rewritten to use the fast, hand-written DFA-based lexer, 146 | from my related [html-parser] project. 147 | I have been inspired by the techniques described by [Sean Barrett] on their 148 | page about [table-driven lexical analysis]. 149 | 150 | [html-parser]: https://github.com/alwinb/html-parser 151 | [Sean Barrett]: http://nothings.org 152 | [table-driven lexical analysis]: https://nothings.org/computer/lexing.html 153 | 154 | 155 | - **NB** Small changes have been made to the token types: 156 | - The `endTagPrefix` token has been removed: an `rcdata` or `rawtext` token is emitted instead. 157 | - The `bogusCharRef` token has been removed: an `uncodedAmpersand` token is emitted for an ampersand `&` that *does not start a character reference* instead. 158 | - Stretches of NUL-characters, whitespace, and individual newlines are now emitted as separate tokens of type `nulls`, `space`, and `newline`, respectively. 159 | 160 | 161 | ### 0.4.0 162 | 163 | - **NB** The token types have changed to use a more consistent naming scheme. 164 | - Added a Makefile for building a browser version. 165 | - Added a browser based test page. 166 | 167 | 168 | License 169 | ------- 170 | 171 | The source code for this project is licensed under the _Mozilla Public License Version 2.0_, copyright Alwin Blok 2016–2018, 2020–2021, 2023. 
172 | 173 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "html-lexer", 3 | "version": "0.5.0", 4 | "description": "An HTML5 lexer", 5 | "main": "dist/html-lexer.js", 6 | "module": "dist/html-lexer.mjs", 7 | "scripts": { 8 | "test": "echo \"Error: no test specified\" && exit 1" 9 | }, 10 | "exports": { 11 | ".": [ 12 | { 13 | "import": "./dist/html-lexer.mjs", 14 | "require": "./dist/html-lexer.js" 15 | } 16 | ] 17 | }, 18 | "keywords": [ 19 | "html5", 20 | "compliant", 21 | "html", 22 | "lexer", 23 | "tokeniser", 24 | "tokenizer", 25 | "parser", 26 | "start-tag", 27 | "end-tag", 28 | "element", 29 | "attribute", 30 | "attribute-name", 31 | "attribute-value" 32 | ], 33 | "repository": { 34 | "type": "git", 35 | "url": "https://github.com/alwinb/html-lexer.git" 36 | }, 37 | "author": "Alwin Blok", 38 | "license": "MIT", 39 | "dependencies": {} 40 | } 41 | -------------------------------------------------------------------------------- /src/dfa.mjs: -------------------------------------------------------------------------------- 1 | function* range (a, z = Infinity) { while (a <= z) yield a++ } 2 | const intsInto = (map, i = 0) => new Proxy ({}, { get:($,k) => (map [k] = i, i++) }) 3 | const log = console.log.bind (console) 4 | 5 | 6 | // HTML Lexer 7 | // ========== 8 | 9 | // TODO: Doctypes, CDATA, Plaintext, ... 10 | // and clean up the produced token tags 11 | 12 | 13 | // Characters - Equivalence Classes 14 | // -------------------------------- 15 | 16 | let defaultClass 17 | const eqClass = (() => { 18 | 19 | const 20 | [ nul, cr, lf, other, quot, squo, space, term, hash, amp, eq, que, excl, dash, lt, gt, slash, digit, A_F, G_WYZ, X ] = range (1) 21 | defaultClass = other 22 | 23 | const eqClassFn = c => 24 | 0x00 === c ? nul : 25 | 0x0D === c ? cr : 26 | 0x0A === c ? lf : 27 | 0x09 === c ? 
space : 28 | 0x20 === c ? space : 29 | 0x21 === c ? excl : 30 | 0x22 === c ? quot : 31 | 0x23 === c ? hash : 32 | 0x26 === c ? amp : 33 | 0x27 === c ? squo : 34 | 0x2D === c ? dash : 35 | 0x2F === c ? slash : 36 | 0x30 <= c && c <= 0x39 ? digit : 37 | 0x3B === c ? term : 38 | 0x3C === c ? lt : 39 | 0x3D === c ? eq : 40 | 0x3E === c ? gt : 41 | 0x3F === c ? que : 42 | 0x41 <= c && c <= 0x46 ? A_F : 43 | 0x58 === c ? X : 44 | 0x78 === c ? X : 45 | 0x47 <= c && c <= 0x5A ? G_WYZ : 46 | 0x61 <= c && c <= 0x66 ? A_F : 47 | 0x66 <= c && c <= 0x7A ? G_WYZ : defaultClass; 48 | 49 | // Precompute a lookup table 50 | 51 | const eq_array = new Uint8Array (0x7F); 52 | for (let i=0, l=0x7F; i<=l; i++) 53 | eq_array [i] = eqClassFn (i) 54 | return eq_array 55 | }) () 56 | 57 | 58 | // Token Ids 59 | // --------- 60 | 61 | const errorToken = 0 62 | const tokens = { errorToken } 63 | const { 64 | data, rawtext, rcdata, plaintext, 65 | nulls, space, newline, 66 | ampersand, lt, 67 | charRefDecimal, charRefHex, charRefNamed, charRefLegacy, 68 | mDeclStart, 69 | commentStart, commentData, commentEnd, 70 | bogusStart, bogusData, bogusEnd, 71 | startTagStart, endTagStart, tagEnd, 72 | attributeSep, attributeName, attributeAssign, 73 | valueStartApos, valueStartQuot, valueEnd, 74 | unquoted, 75 | squoted, 76 | quoted, 77 | } = intsInto (tokens, 1) 78 | 79 | const names = [] 80 | for (const k in tokens) names[tokens[k]] = k 81 | // log (tokens, names) 82 | 83 | 84 | // DFA States 85 | // ---------- 86 | 87 | const [ 88 | 89 | // Entry States 90 | 91 | Main, RcData, RawText, 92 | BeforeAttribute, BeforeAssign, /*BeforeValue,*/ 93 | BeforeCommentData, InCommentData, Bogus, 94 | ValueQuoted, ValueAposed, ValueUnquoted, 95 | 96 | // Internal States 97 | 98 | RLTs, LXD, DD, DX, 99 | AmpH, AmpX, TOP, 100 | Nul, Wsp, BeforeValue, CR, Tsp, Wrd, Raw, Rcd, Att, 101 | Val, ValQ, ValS, 102 | Bog, Cmt, CmtD, CmtSD, Sep, 103 | Amp, Ref, xRef, dRef, 104 | LT, LTs, LTx, 105 | STN, ETN, DTN, 106 | RawLT, 
RcdLT, LXDD, 107 | TagE, Bog_, Cmt_, 108 | Eq, lQ_, Sq_, rQ_, nRef_, dRef_, xRef_, NL_ 109 | 110 | ] = range (1) 111 | 112 | 113 | const STOP = 0 114 | const states = { 115 | Main, RcData, RawText, PlainText:TOP, 116 | BeforeAttribute, BeforeAssign, BeforeValue, 117 | BeforeCommentData, InCommentData, Bogus, 118 | ValueQuoted, ValueAposed, ValueUnquoted, 119 | } 120 | 121 | 122 | // State Transitions 123 | // ----------------- 124 | 125 | // Columns are character classes, rows are states. 126 | // The first column marks the acceptance of states, by labeling 127 | // it with a nonzero token-type. The runtime assumes that states 128 | // are pre-sorted, such that all states st >= minAccepts are 129 | // accepting states that produce an output token. 130 | 131 | const ___ = STOP 132 | const minAccepts = TOP 133 | 134 | // REVIEW How should NUL be handled in rawtext / rcdata? 135 | // TODO handle newlines separately always 136 | // NB nulls in attribute names and values are to be always 137 | // converted to u+fffd and they do not end unquoted values. 138 | 139 | 140 | const table = [ 141 | // nul CR LF other " ' \s ; # & = ? ! 
- < > / 0-9 A-F G-WYZ X ; 142 | [ 0, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___ ], // STOP 143 | [ 0, Nul, CR, NL_, Wrd, Wrd, Wrd, Wsp, Wrd, Wrd, Amp, Wrd, Wrd, Wrd, Wrd, LT, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd ], // Main 144 | [ 0, Nul, CR, NL_, Rcd, Rcd, Rcd, Rcd, Rcd, Rcd, Amp, Rcd, Rcd, Rcd, Rcd, RcdLT,Rcd, Rcd, Rcd, Rcd, Rcd, Rcd ], // RcData 145 | [ 0, Nul, CR, NL_, Raw, Raw, Raw, Raw, Raw, Raw, Raw, Raw, Raw, Raw, Raw, RawLT,Raw, Raw, Raw, Raw, Raw, Raw ], // RawText 146 | [ 0, /*TODO CRLF*/ Att, Sep, Sep, Att, Att, Att, Sep, Att, Att, Att, Att, Att, Att, Att, Att, TagE, Sep, Att, Att, Att, Att ], // BeforeAttribute 147 | [ 0, /*TODO CRLF*/ Att, Tsp, Tsp, Att, Att, Att, Tsp, Att, Att, Att, Eq, Att, Att, Att, Att, TagE, Sep, Att, Att, Att, Att ], // BeforeAssign 148 | [ 0, Nul, CR, NL_, Cmt, Cmt, Cmt, Cmt, Cmt, Cmt, Cmt, Cmt, Cmt, Cmt, CmtSD,Cmt, Cmt_, Cmt, Cmt, Cmt, Cmt, Cmt ], // BeforeCommentData 149 | [ 0, Nul, CR, NL_, Cmt, Cmt, Cmt, Cmt, Cmt, Cmt, Cmt, Cmt, Cmt, Cmt, CmtD, Cmt, Cmt, Cmt, Cmt, Cmt, Cmt, Cmt ], // InCommentData 150 | [ 0, Nul, CR, NL_, Bog, Bog, Bog, Bog, Bog, Bog, Bog, Bog, Bog, Bog, Bog, Bog, Bog_, Bog, Bog, Bog, Bog, Bog ], // Bogus 151 | [ 0, Nul, CR, NL_, ValQ, rQ_, ValQ, ValQ, ValQ, ValQ, Amp, ValQ, ValQ, ValQ, ValQ, ValQ, ValQ, ValQ, ValQ, ValQ, ValQ, ValQ ], // ValueQuoted 152 | [ 0, Nul, CR, NL_, ValS, ValS, rQ_, ValS, ValS, ValS, Amp, ValS, ValS, ValS, ValS, ValS, ValS, ValS, ValS, ValS, ValS, ValS ], // ValueAposed 153 | [ 0, /*TODO CRLF*/ Val, Sep, Sep, Val, Val, Val, Sep, Val, Val, Amp, Val, Val, Val, Val, Val, TagE, Val, Val, Val, Val, Val ], // ValueUnquoted 154 | [ 0, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ETN, ETN, ETN ], // RLTs: after 192 | [ bogusEnd, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___ ], // Bog_: after > 193 | [ commentEnd, ___, ___, ___, ___, 
___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___ ], // Cmt_: 194 | [ attributeAssign, ___, Eq , Eq , ___, ___, ___, Eq , ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___ ], // Eq: after = 195 | [ valueStartQuot, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___ ], // lQ_ after " 196 | [ valueStartApos, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___ ], // Sq_ after ' 197 | [ valueEnd, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___ ], // rQ_ after ' or " (or space) 198 | [ charRefNamed, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___ ], // nRef_ after eg. & 199 | [ charRefDecimal, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___ ], // dRef_ after eg. 200 | [ charRefHex, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___ ], // xRef_ after eg. ª 201 | [ newline, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___ ], // NL_ after CRLF or LF 202 | // nul CR LF other " ' \s ; # & = ? ! 
- < > / 0-9 A-F G-WYZ X ; 203 | ] 204 | 205 | 206 | // State machine bundle 207 | // -------------------- 208 | 209 | const DFA = { 210 | eqClass, 211 | defaultClass, 212 | tokens, 213 | states, 214 | table, 215 | initialState: Main, 216 | minAccepts, 217 | } 218 | 219 | export default DFA -------------------------------------------------------------------------------- /src/index.mjs: -------------------------------------------------------------------------------- 1 | const log = console.log.bind (console) 2 | const { defineProperties:define } = Object 3 | 4 | // Imports - DFA 5 | // ------------- 6 | 7 | import DFA from './dfa.mjs' 8 | const { eqClass, defaultClass, tokens:T, states:S, initialState, table, minAccepts } = DFA 9 | const FAIL = 0 10 | const errorToken = 0 11 | 12 | 13 | // TokenTypes 14 | // ---------- 15 | 16 | // This maps the DFA tokenTypes from ints to strings; 17 | // It renames some of the token-types to maintain some 18 | // compatibility with previous versions of html-lexer. 
19 | 20 | const names = [] 21 | for (const k in T) names[T[k]] = k 22 | names [T.unquoted] = 'attributeValueData' 23 | names [T.quoted] = 'attributeValueData' 24 | names [T.squoted] = 'attributeValueData' 25 | names [T.attributeSep] = 'tagSpace' 26 | names [T.valueStartApos] = 'attributeValueStart' 27 | names [T.valueStartQuot] = 'attributeValueStart' 28 | names [T.valueEnd] = 'attributeValueEnd' 29 | names [T.bogusStart] = 'commentStartBogus' 30 | names [T.bogusData] = 'commentData' 31 | names [T.bogusEnd] = 'commentEndBogus' 32 | names [T.lt] = 'lessThanSign' 33 | names [T.ampersand] = 'uncodedAmpersand' 34 | 35 | const tokenTypes = {} 36 | for (const x of names) tokenTypes[x] = x 37 | delete tokenTypes.errorToken 38 | delete tokenTypes.mDeclStart 39 | 40 | 41 | // Lexer / Push Parser 42 | // ------------------- 43 | 44 | function Lexer (delegate) { 45 | 46 | // State 47 | 48 | let buffer = '' 49 | let closed = false // true after end() call 50 | let line = 1, lastnl = 0, _c = 0 // line counter 51 | let anchor = 0, end = 0, pos = 0 // lexer position 52 | let entry = S.Main // lexer (entry) state-id 53 | let lastTagType = 0 54 | let lastStartTagName = '' 55 | 56 | // API 57 | 58 | return define (this, { 59 | position: { get:getPosition }, 60 | write: { value: write, hidden:true }, 61 | end: { value: writeEOF, hidden:true }, 62 | parse: { value: writeEOF, hidden:true } 63 | }) 64 | 65 | // Public methods 66 | 67 | function write (input) { 68 | buffer += input 69 | const length = buffer.length 70 | while (pos < length) { 71 | let state = entry 72 | let exit = entry < minAccepts ? FAIL : entry 73 | do { 74 | const c = buffer.charCodeAt(pos++) 75 | state = table [state] [c <= 0x7a ? 
eqClass[c] : defaultClass] 76 | if (minAccepts <= state) (exit = state, end = pos) 77 | // Newline counter 78 | if (c === 0xD || c === 0xA) (lastnl = pos, line += (_c !== 0xD)); 79 | _c = c 80 | } while (state && pos < length) 81 | 82 | if (end < buffer.length || closed) 83 | emit (table [exit] [0], anchor, end) 84 | else { 85 | pos = end = anchor 86 | break 87 | } 88 | } 89 | buffer = buffer.substr (end) 90 | anchor = pos = end = 0 91 | } 92 | 93 | function writeEOF (input = '') { 94 | closed = true 95 | write (input) 96 | delegate.end () 97 | } 98 | 99 | // Private methods 100 | 101 | function getPosition () { 102 | return { line, column: pos-lastnl } 103 | } 104 | 105 | function emit (type, anchor_, end_) { 106 | // log ('emit', {buffer, l:buffer.length, anchor_, end_, closed }) 107 | switch (type) { 108 | 109 | case errorToken: { 110 | const message = `Lexer error at line ${line}:${pos-lastnl}` 111 | throw new SyntaxError (message) 112 | } break 113 | 114 | case T.startTagStart: { 115 | const tagName = buffer.substring (anchor+1, end_) 116 | lastTagType = type 117 | lastStartTagName = tagName.toLowerCase () 118 | delegate.write (['startTagStart', '<']) 119 | delegate.write (['tagName', tagName]) 120 | entry = S.BeforeAttribute 121 | return anchor = pos = end_ // NB returns 122 | } 123 | 124 | case T.endTagStart: { 125 | const tagName = buffer.substring (anchor+2, end_) 126 | lastTagType = type 127 | if (entry === S.Main || lastStartTagName === tagName.toLowerCase ()) 128 | entry = S.BeforeAttribute 129 | else entry === S.RcData ? 
T.rcdata : T.rawtext 130 | delegate.write (['endTagStart', 'Legacy Named Character References' 4 | , '&amp &' 5 | , '&amp; &' 6 | , '&ampo &o' 7 | , '&amp* &*' 8 | , '&amp & ' 9 | , '&amp= &=' 10 | , '&notin ¬in' 11 | , '&notit ¬it' 12 | , '&notina ¬ina' 13 | , '&notita ¬ita' 14 | , '&notin; ' 15 | , '&notit; ¬it;' 16 | , '&notin;a ∉a' 17 | , '&notit;a ¬it;a' 18 | , '&notin= ¬in=' 19 | , '&notit= ¬it=' 20 | , '&notin;= ∉=' 21 | , '&notit;= ¬it;=' // REVIEW (¬it; is not a named charref; but neither is &foo;) 22 | , '&foo;= &foo;=' 23 | , '&foo; &foo;' 24 | , 'Link' 25 | 26 | , '\n' 27 | , '

Legacy character references in rcdata

' 28 | , ` and more' 69 | , 'rcdata2 it, see yes' 71 | , 'rcdata5 it' 73 | , 'rawtext and more' 74 | , 'rawtext2 ending it see' 76 | , 'rawtext4 see' 77 | , 'script hello thus' 78 | , 'nonalpha tag This is not a <ém attr>tag' 79 | , 'double open tag A double less than sign <
content
' 80 | , 'bad end tag
This is blue And this too!' 81 | , 'closePlaintext hi asd<as &ap, </plaintext> cannot be ended' 82 | 83 | , '\n' 84 | , 'comment: <!weird markup declaration> and such' 85 | , 'comment: <!> and such' 86 | , 'comment: <?> and such' 87 | , 'comment: </> and such' 88 | , 'comment: <!-> and such' 89 | , 'comment: <?-> and such' 90 | , 'comment: <!-> and such' 91 | , 'comment: <!--> and such' 92 | , 'comment: <?--> and such' 93 | , 'comment: <!--> and such' 94 | , 'comment: <!--!> part of the comment --> and such' 95 | , 'comment: <!---!> part of the comment --> and such' 96 | , 'comment: <!----!> and such' 97 | , 'comment: <!-> and such' 98 | , 'comment: <!-- with -> within --> and subsequent data' 99 | , 'comment: <!-- with bogus end -> part of the comment --> and subsequent data' 100 | , 'comment: <!-- Comment with -- double dash within --> and subsequent data' 101 | , 'comment: <!-- Comment with --!- weird stuff within --> and subsequent data' 102 | , 'comment: <!-- Comment with strange end --!> and subsequent data' 103 | , 'bogus comment: <! with end !@> and subsequent data' 104 | , 'bogus comment: </ with end !@> and subsequent data' 105 | , 'bogus comment: <? with end !@> and subsequent data' 106 | , 'bogus comment: <!- with end -> and subsequent data' 107 | 108 | , '\n' 109 | , '<!doctype foo>' 110 | , `<!ba>` 111 | , `<! xos >` 112 | 113 | , '\n' 114 | , 'missing space attribues connected <div name="a"name="b" >' 115 | , 'nonalpha attribute weird template tag <div {name="a" name="b" >' 116 | , 'normalHtml This is <span class = "s1">html</span> Yeah!' 
117 | , 'unescaped ampersand data & such' 118 | , 'unescaped ampersand Hash data &# such' 119 | , 'unescaped ampersand HashEx data &#x such' 120 | , 'unescaped ampersand HashExZed data &#xz such' 121 | 122 | , '\n' 123 | , 'slashes: <span/>' 124 | , 'slashes: <span name=foo//>' 125 | , 'slashes: <div//>' 126 | , 'slashes: <div/foo/bar//>' 127 | , 'slashes: <span//>' 128 | , 'slashes: <span />' 129 | , 'slashes: <span <>' 130 | , 'slashes: <span //>' 131 | , 'slashes: <span / />' 132 | , 'slashes: <span/////>' 133 | , 'slashes: <span/////name////=/blabla>' 134 | , 'slashes: <span / attr >foo bar</span>' 135 | , 'slashes: <span name=/ >asdf' 136 | , 'slashes: <span name=/>asdf' 137 | , 'slashes: <span name=// />asdf' 138 | , 'slashes: <span name= / />asdf' 139 | 140 | , '\n' 141 | , 'weirdEquals <span attr = / asd >content</span>' 142 | , 'weirdEquals2 <span attr = @ asd >content</span>' 143 | , 'weirdEquals3 <span attr /= asd >content</span>' 144 | , 'weirdEquals4 <span attr @= asd >content</span>' 145 | , 'missingValue <span name=>asdf' 146 | , 'invalidAttributeValue1 <div class= =at >' 147 | , 'invalidAttributeValue2 <div class= <at >' 148 | , 'invalidAttributeValue3 <div class= `at >' 149 | ] 150 | 151 | 152 | const EOFSamples = 153 | [ 'data state eof in da' 154 | , 'tagOpen state eof in <' 155 | , 'tagName state eof in <d' 156 | , 'selfClosingStartTag state in <div /' 157 | , 'endTagOpen state in </a' 158 | , 'beforeAttributeName state <div ' 159 | , 'attributeName state <div at' 160 | , 'afterAttributeName state <div attr ' 161 | , 'beforeAttributeValue state <div attr =' 162 | , 'attributeValueDoubleQuoted state <div attr="te' 163 | , 'attributeValueSingleQuoted state <div attr=\'te' 164 | , 'attributeValueUnquoted state <div attr=te' 165 | , 'afterAttributeValueQuoted state <div attr="test"' 166 | , 'markupDeclarationOpen state a markup decl <!' 
167 | , 'selfClosingTag state An eof after a / <span /' 168 | , 'commentStart state a comment start <!--' 169 | , 'commentStartDash state a comment start dash <!---' 170 | , 'comment state a comment <!-- hello th' 171 | , 'commentEndDash state a comment end dash <!-- hello th -' 172 | , 'commentEnd state a comment end <!-- hello th --' 173 | , 'commentEndBang state a comment end bang <!-- hello th --!' 174 | , 'bogusComment state <! bogus comment' 175 | , 'charRefIn_ state data &' 176 | , 'numericCharRef state data &#' 177 | , 'hexadecimalCharRef state data &#x' 178 | , 'hexDigits state data &#x1a' 179 | , 'decimalCharRef state data &#1' 180 | , 'namedCharRef state data &name' 181 | , 'namedCharRefInAttr state <span attr="asd&amp;a&b c">text</span>' 182 | , 'namedCharRefInData state named charref in data asd&amp;a&b cde' 183 | , 'rawtext state eof in raw text <script> funct' 184 | , 'plaintext state eof in raw text <plaintext> asdf' 185 | , 'rawtextLessThanSign state eof in raw text less than sign <script> if (i<' 186 | , 'rawtextEndTagOpen state eof in raw text end tag open <script> asdf </' 187 | ] 188 | 189 | const samples2 = [ 190 | '<span a=&amp b>', 191 | '<table><input type=hidden type=still-hidden>', 192 | 193 | // '</ tttt>', 194 | // '<table><input type = hidden /// / type= still-hidden&amp;foo >foo', 195 | // '<script type=hidden ///type=still-hidden&amp;foo >foo</x>bae', 196 | // '<!doctype script type = hidden ///type= still-hidden&amp;foo >foo</x>bae', 197 | // '<!--> <!---> <!-----> bae', 198 | // `<test val = unq&amped bar="foo" bee='buzz'> bae`, 199 | // `<plaintext = unq&amped bar="foo" bee='buzz'> baeasas </plaintext > `, 200 | // `<test val = unq&amped b // >`, 201 | 202 | '<script a =\n b>foo bar </script>', 203 | '<h1>Hello, World</h1>', 204 | '<!namas >', 205 | '<foo/>', 206 | 207 | // Newlines 208 | // -------- 209 | 210 | // Newlines in data 211 | `Test &amp; Line1\nLine2\r\rLine4\r\nLine5`, 212 | `Test &amp; Line1 \nLine2 \r\r Line4 
\r\nLine5`, 213 | 214 | // Newlines in rcdata 215 | `<textarea>Test &amp; Line1\nLine2\r\rLine4\r\nLine5`, 216 | `<textarea>Test &amp; Line1<\nLine2<\r\rLine4<\r\nLine5`, 217 | `<textarea>Test &amp; Line1</\nLine2</\r\rLine4</\r\nLine5`, 218 | 219 | // Newlines in attribute values 220 | `<div title="Test &amp; Line1\nLine2\r\rLine4\r\nLine5" foo >`, 221 | `<div title='Test &amp; Line1\nLine2\r\rLine4\r\nLine5' foo >`, 222 | `<div title="Test &amp; Line1 \nLine2 \r\rLine4 \r\nLine5" foo >`, 223 | `<div title='Test &amp; Line1 \nLine2 \r\rLine4 \r\nLine5' foo >`, 224 | 225 | // Newlines in rawtext 226 | `<style>Test &amp; Line1\nLine2\r\rLine4\r\nLine5`, 227 | `<style>Test &amp; Line1<\nLine2<\r\rLine4<\r\nLine5`, 228 | `<style>Test &amp; Line1</\nLine2</\r\rLine4</\r\nLine5`, 229 | 230 | // Newlines in comments 231 | `<!-- Test &amp; Line1\nLine2\r\rLine4\r\nLine5`, 232 | `<!? Test &amp; Line1\nLine2\r\rLine4\r\nLine5`, 233 | 234 | // No newlines in plaintext then 235 | `<plaintext>Test &amp; Line1\nLine2\r\rLine4\r\nLine5`, 236 | 237 | // NUL 238 | // --- 239 | 240 | // NULs in data 241 | `Test &amp; Line1\0Line2\0\0Line4\0\0Line5`, 242 | `Test &amp; Line1 \0Line2 \0\0 Line4 \0\0Line5`, 243 | 244 | // NULs in rcdata 245 | `<textarea>Test &amp; Line1\0Line2\0\0Line4\0\0Line5`, 246 | `<textarea>Test &amp; Line1<\0Line2<\0\0Line4<\0\0Line5`, 247 | `<textarea>Test &amp; Line1</\0Line2</\0\0Line4</\0\0Line5`, 248 | 249 | // NULs in attribute values 250 | `<div title="Test &amp; Line1\0Line2\0\0Line4\0\0Line5" foo >`, 251 | `<div title='Test &amp; Line1\0Line2\0\0Line4\0\0Line5' foo >`, 252 | `<div title=Line1\0Line2\0\0Line4\0\0Line5 foo >`, 253 | 254 | // NULs in rawtext 255 | `<style>Test &amp; Line1\0Line2\0\0Line4\0\0Line5`, 256 | `<style>Test &amp; Line1<\0Line2<\0\0Line4<\0\0Line5`, 257 | `<style>Test &amp; Line1</\0Line2</\0\0Line4</\0\0Line5`, 258 | 259 | // NULs in comments 260 | `<!-- Test &amp; Line1\0Line2\0\0Line4\0\0Line5`, 261 | `<!? 
Test &amp; Line1\0Line2\0\0Line4\0\0Line5`, 262 | 263 | // No NULs in plaintext then 264 | `<plaintext>Test &amp; Line1\0Line2\0\0Line4\0\0Line5`, 265 | 266 | 267 | ] 268 | 269 | 270 | // Exports 271 | // ------- 272 | export { samples, samples2, EOFSamples } -------------------------------------------------------------------------------- /test/lexer.test.mjs: -------------------------------------------------------------------------------- 1 | import { Lexer } from 'html-lexer' 2 | import { samples, samples2, EOFSamples } from './data/samples.mjs' 3 | const log = console.log.bind (console) 4 | 5 | 6 | // Test 7 | // ---- 8 | 9 | const delegate = { write:log, end:log } 10 | 11 | for (const samples_ of [samples, samples2, EOFSamples]) 12 | for (const x of samples_) { 13 | log (JSON.stringify (x)) 14 | log ('================') 15 | const lexer = new Lexer (delegate) 16 | lexer.parse (x) 17 | } 18 | 19 | 20 | -------------------------------------------------------------------------------- /test/module.test.js: -------------------------------------------------------------------------------- 1 | const { Lexer } = require ('html-lexer') 2 | const log = console.log.bind (console) 3 | // log (html) 4 | 5 | const delegate = { 6 | write: (token) => console.log (token), 7 | end: () => null 8 | } 9 | 10 | const lexer = new Lexer (delegate) 11 | lexer.write ('<h1>Hello, World</h1>') 12 | lexer.end () 13 | -------------------------------------------------------------------------------- /test/module.test.mjs: -------------------------------------------------------------------------------- 1 | import { Lexer } from 'html-lexer' 2 | const log = console.log.bind (console) 3 | 4 | const delegate = { 5 | write: (token) => console.log (token), 6 | end: () => null 7 | } 8 | 9 | const lexer = new Lexer (delegate) 10 | lexer.write ('<h1>Hello, World</h1>') 11 | lexer.end () 12 | -------------------------------------------------------------------------------- /test/style/tokens.css: 
-------------------------------------------------------------------------------- 1 | html, body { 2 | color:white; 3 | background:#202020; 4 | } 5 | 6 | /** Styles for the tokens **/ 7 | 8 | pre, code { 9 | font-size:13px; 10 | color:white; 11 | background:#202020; 12 | font-family:Menlo, Monaco, Monospace; 13 | tab-size:2; 14 | } 15 | 16 | /* Extras */ 17 | 18 | .error { 19 | display:inline-block; 20 | width:0; 21 | padding-left:20px; 22 | margin-left:-10px; 23 | overflow:hidden; 24 | position:absolute; 25 | z-index:100; 26 | border-bottom: 1px dotted red; 27 | margin-top:-1px; 28 | } 29 | /* On hover the zero-width .error box expands to its content */ 30 | .error:hover { 31 | padding:1px 4px; 32 | margin-top:-1px; 33 | border:none; 34 | width:auto; 35 | overflow:visible; 36 | background-color:red; 37 | } 38 | 39 | /* html colors */ 40 | 41 | 42 | 43 | .commentStart, 44 | .commentStartBogus, 45 | .commentData, 46 | .commentEnd, 47 | .commentEndBogus, 48 | .space { 49 | color:#746D74; 50 | } 51 | 52 | .attributeValueStart, 53 | .attributeValueData, 54 | .attributeValueEnd { 55 | color: #8F9D6A; 56 | } 57 | 58 | .charRefLegacy, 59 | .charRefDecimal, 60 | .charRefHex, 61 | .charRefNamed { 62 | color: #DDF7AC; 63 | } 64 | 65 | .charRefDecimal, 66 | .charRefHex, 67 | .charRefNamed { 68 | /* color:#E9784F; */ 69 | } 70 | 71 | .startTagStart, 72 | .endTagStart, 73 | .tagName, 74 | .tagEnd, 75 | .tagEndAutoclose { 76 | color:#B38243; 77 | } 78 | 79 | .attributeAssign, 80 | .attributeName { 81 | color: #D4BD7F; 82 | } 83 | 84 | .lessThanSign, 85 | .uncodedAmpersand { 86 | background:#60315A; 87 | } 88 | 89 | .endTagPrefix { 90 | text-decoration:underline; 91 | } 92 | 93 | 94 | /** Some additional styling for the rest of the sample page **/ 95 | 96 | html, body { 97 | color:#202020; 98 | background:white; 99 | margin:0; padding:0; 100 | font-size:24px; 101 | } 102 | 103 | body { 104 | font-family:Helvetica, Arial, sans-serif; 105 | font-size:16px; 106 | line-height:1rem; 107 | } 108 | 109 | body { 110 | padding:0 1rem; 111 | } 
112 | h1 { 113 | font-size:48px; 114 | line-height:2rem; 115 | min-height:2rem; 116 | } 117 | pre#colors, pre#inspector { 118 | display:block; 119 | line-height:1rem; 120 | padding:.7rem 1rem 1.3rem; 121 | border-radius:.3rem; 122 | } 123 | 124 | #colors { 125 | cursor:default; 126 | } 127 | #colors span { 128 | padding:.15rem 0; 129 | border-radius:.1rem; 130 | } 131 | #colors span:hover { 132 | background:#444; 133 | } 134 | -------------------------------------------------------------------------------- /test/test.html: -------------------------------------------------------------------------------- 1 | <!DOCTYPE html> 2 | <html lang="en"> 3 | <head> 4 | <meta charset="utf-8"> 5 | <title>Html Lexer example page</title> 6 | <link rel="stylesheet" type="text/css" href="./style/tokens.css"> 7 | </head> 8 | <body> 9 | <h1>HTML5 Lexer</h1> 10 | <p> 11 | A test page for html-lexer. 12 | </p> 13 | 14 | 15 | <div> 16 | <pre id=inspector style=white-space:unset> 17 | Inspect the output by clicking on it, below 18 | </pre> 19 | <pre id=colors></pre> 20 | </div> 21 | 22 | <script type=module> 23 | 24 | import { Lexer } from '../src/index.mjs' 25 | import * as sampleData from './data/samples.mjs' 26 | 27 | const log = console.log.bind (console) 28 | const doc = document 29 | 30 | 31 | function* tokens (str) { 32 | lexer.write ('<h1>Hello, World</h1>') 33 | lexer.end () 34 | } 35 | 36 | const sString = sym => 37 | /Symbol\((.*)\)/.exec (String(sym))[1] || 'Symbol()' 38 | 39 | const pre = doc.getElementById ('colors') 40 | const pre2 = doc.getElementById ('inspector') 41 | const objectKey = Symbol () 42 | 43 | function inspect (...args) { 44 | pre2.innerHTML = '' 45 | for (let x of args) 46 | pre2.append (JSON.stringify (x, null, 2), doc.createElement ('BR')) 47 | } 48 | 49 | function show (data) { 50 | const delegate = { 51 | write: (token) => { 52 | const { position } = lexer 53 | const el = renderChunk (token) 54 | el [objectKey] = { tokenType:token[0], stateAfter:position 
} 55 | pre.append (el) 56 | }, 57 | end: () => null 58 | } 59 | const lexer = new Lexer (delegate) 60 | lexer.write (data) 61 | lexer.end () 62 | } 63 | 64 | function renderChunk ([type, value]) { 65 | const e = doc.createElement ('SPAN') 66 | e.title = e.className = type 67 | let tnode = doc.createTextNode(value) 68 | e.append (value) 69 | return e 70 | } 71 | 72 | function main () { 73 | for (let x of sampleData.samples) { 74 | show (x) 75 | pre.append (doc.createElement ('br')) 76 | } 77 | for (let x of sampleData.EOFSamples) { 78 | show (x) 79 | pre.append (doc.createElement ('br')) 80 | } 81 | 82 | doc.body.onclick = function (evt) { 83 | if (objectKey in evt.target) 84 | inspect (evt.target [objectKey]) 85 | } 86 | } 87 | 88 | main () 89 | </script> 90 | </body> 91 | </html> -------------------------------------------------------------------------------- /test/test.mjs: -------------------------------------------------------------------------------- 1 | import { Lexer } from 'html-lexer' 2 | const log = console.log.bind (console) 3 | 4 | const delegate = { 5 | write: (token) => console.log (token), 6 | end: () => console.log('\n') 7 | } 8 | 9 | { 10 | const lexer = new Lexer (delegate) 11 | lexer.write ('<h1>Hello, World</h1>') 12 | lexer.end () 13 | } 14 | 15 | { 16 | const lexer = new Lexer (delegate) 17 | lexer.write ('<h') 18 | lexer.write ('1>Hello, W') 19 | lexer.write ('orld</h1>') 20 | lexer.end () 21 | } 22 | 23 | { 24 | let lexer 25 | const delegate = { 26 | write: (token) => console.log (token, lexer.position), 27 | end: () => null 28 | } 29 | 30 | lexer = new Lexer (delegate) 31 | lexer.write ('<h1>Hello, World</h1>') 32 | lexer.end () 33 | } 34 | 35 | 36 | { 37 | const lexer = new Lexer (delegate) 38 | lexer.write ('<!doctype html>sp') 39 | lexer.write ('<sp') 40 | lexer.write ('an>Hi</span>') 41 | lexer.write ('&amp; &a') 42 | lexer.write ('mp') 43 | lexer.write (';I am &notit ok') 44 | lexer.write ('\nI said: I am &not') 45 | lexer.end 
('<!asd') 46 | } 47 | 48 | 49 | --------------------------------------------------------------------------------