├── .editorconfig ├── .gitignore ├── LICENSE ├── README.md ├── package.json ├── rollup.config.js ├── src ├── config.ts ├── index.ts ├── misc.spec.ts ├── parse.spec.ts ├── parse.ts ├── safeHtml.spec.ts ├── safeHtml.ts ├── test │ ├── issue_6.spec.ts │ └── issue_7.spec.ts ├── tokenize.spec.ts ├── tokenize.ts ├── types.ts ├── utils.ts └── walk.ts ├── tsconfig.json └── yarn.lock /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 2 6 | tab_width = 2 7 | end_of_line = lf 8 | charset = utf-8 9 | trim_trailing_whitespace = true 10 | insert_final_newline = true 11 | max_line_length = 80 12 | 13 | [*.md] 14 | trim_trailing_whitespace = false 15 | indent_size = 4 16 | 17 | [*.go] 18 | indent_style = tab 19 | 20 | [*.py] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | .cache/ 3 | temp/ 4 | *.log 5 | .DS_Store 6 | .DS_Store* 7 | *~ 8 | .*~ 9 | *.swp 10 | .*.swp 11 | *.tgz 12 | .idea/ 13 | .vscode/ 14 | dist/ 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020 acrazing 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | 
copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # html5parser 2 | 3 | `html5parser` is a super fast and tiny **HTML5** parser. 4 | 5 | ## Highlights 6 | 7 | - **[Fast](#benchmark)**: maybe the fastest one you can find on GitHub. 8 | - **Tiny**: the fully packaged bundle size is less than `5kb`. 9 | - **Cross platform**: works in modern browsers and Node.js. 10 | - **[HTML5 only](#warnings)**: anything not in the specification will be ignored. 11 | - **Accurate**: every token can be located in the source file. 12 | 13 | ## Table of Contents 14 | 15 | - [Installation](#installation) 16 | - [Quick start](#quick-start) 17 | - [API Reference](#api-reference) 18 | - Core 19 | - [tokenize()](#tokenizeinput) 20 | - [parse()](#parseinput) 21 | - Utilities 22 | - [walk()](#walkast-options) 23 | - [safeHtml()](#safehtmlinput) 24 | - [safeHtmlDefaultOptions](#safehtmldefaultoptions) 25 | - [Warnings](#warnings) 26 | - [Benchmark](#benchmark) 27 | 28 | ## Installation 29 | 30 | 1. Package manager 31 | 32 | ```bash 33 | npm i -S html5parser 34 | 35 | # or via yarn 36 | yarn add html5parser 37 | ``` 38 | 39 | 2. 
CDN 40 | 41 | ```html 42 | 43 | ``` 44 | 45 | ## Quick start 46 | 47 | [![Edit html5parser - quick start](https://codesandbox.io/static/img/play-codesandbox.svg)](https://codesandbox.io/s/keen-wind-2mpwr?fontsize=14&hidenavigation=1&theme=dark) 48 | 49 | ```typescript jsx 50 | import { parse, walk, SyntaxKind } from 'html5parser'; 51 | 52 | const ast = parse('Hello html5parser!'); 53 | 54 | walk(ast, { 55 | enter: (node) => { 56 | if (node.type === SyntaxKind.Tag && node.name === 'title' && Array.isArray(node.body)) { 57 | const text = node.body[0]; 58 | if (text.type !== SyntaxKind.Text) { 59 | return; 60 | } 61 | const div = document.createElement('div'); 62 | div.innerHTML = `The title of the input is ${text.value}`; 63 | document.body.appendChild(div); 64 | } 65 | }, 66 | }); 67 | ``` 68 | 69 | ## API Reference 70 | 71 | ### tokenize(input) 72 | 73 | Low-level API to parse a string into tokens: 74 | 75 | ```typescript jsx 76 | function tokenize(input: string): IToken[]; 77 | ``` 78 | 79 | - `IToken` 80 | 81 | ```typescript jsx 82 | interface IToken { 83 | start: number; 84 | end: number; 85 | value: string; 86 | type: TokenKind; 87 | } 88 | ``` 89 | 90 | - `TokenKind` 91 | 92 | ```typescript jsx 93 | const enum TokenKind { 94 | Literal, 95 | OpenTag, // trim leading '<' 96 | OpenTagEnd, // trim trailing '>', can only be '/' or '' 97 | CloseTag, // trim leading '' 98 | Whitespace, // the whitespace between attributes 99 | AttrValueEq, 100 | AttrValueNq, 101 | AttrValueSq, 102 | AttrValueDq, 103 | } 104 | ``` 105 | 106 | ### parse(input) 107 | 108 | Core API to parse a string into an AST: 109 | 110 | ```typescript jsx 111 | function parse(input: string, options?: ParseOptions): INode[]; 112 | ``` 113 | 114 | - `ParseOptions` 115 | 116 | ```typescript jsx 117 | interface ParseOptions { 118 | // create tag's attributes map 119 | // if true, will set ITag.attributeMap property 120 | // as a `Record` 121 | setAttributeMap: boolean; 122 | } 123 | ``` 124 | 125 | - `INode` 126 | 
127 | ```typescript jsx 128 | export type INode = IText | ITag; 129 | ``` 130 | 131 | - `ITag` 132 | 133 | ```typescript jsx 134 | export interface ITag extends IBaseNode { 135 | type: SyntaxKind.Tag; 136 | // original open tag,
137 | open: IText; 138 | // lower case tag name, div 139 | name: string; 140 | // original case tag name, Div 141 | rawName: string; 142 | attributes: IAttribute[]; 143 | // the attribute map, if `options.setAttributeMap` is `true` 144 | // this will be a Record, key is the attribute name literal, 145 | // value is the attribute self. 146 | attributeMap: Record | undefined; 147 | body: 148 | | Array // with close tag 149 | | undefined // self closed 150 | | null; // EOF before open tag end 151 | // original close tag,
152 | close: 153 | | IText // with close tag 154 | | undefined // self closed 155 | | null; // EOF before end or without close tag 156 | } 157 | ``` 158 | 159 | - `IAttribute` 160 | 161 | ```typescript jsx 162 | export interface IAttribute extends IBaseNode { 163 | name: IText; 164 | value: IAttributeValue | undefined; 165 | } 166 | ``` 167 | 168 | - `IAttributeValue` 169 | 170 | ```typescript jsx 171 | export interface IAttributeValue extends IBaseNode { 172 | value: string; 173 | quote: "'" | '"' | undefined; 174 | } 175 | ``` 176 | 177 | - `IText` 178 | 179 | ```typescript jsx 180 | export interface IText extends IBaseNode { 181 | type: SyntaxKind.Text; 182 | value: string; 183 | } 184 | ``` 185 | 186 | - `IBaseNode` 187 | 188 | ```typescript jsx 189 | export interface IBaseNode { 190 | start: number; 191 | end: number; 192 | } 193 | ``` 194 | 195 | - `SyntaxKind` 196 | 197 | ```typescript jsx 198 | export enum SyntaxKind { 199 | Text = 'Text', 200 | Tag = 'Tag', 201 | } 202 | ``` 203 | 204 | ### walk(ast, options) 205 | 206 | Visit all the nodes of the AST with the specified callbacks: 207 | 208 | ```typescript jsx 209 | function walk(ast: INode[], options: WalkOptions): void; 210 | ``` 211 | 212 | - `WalkOptions` 213 | 214 | ```typescript jsx 215 | export interface WalkOptions { 216 | enter?(node: INode, parent: INode | void, index: number): void; 217 | leave?(node: INode, parent: INode | void, index: number): void; 218 | } 219 | ``` 220 | 221 | ### safeHtml(input) 222 | 223 | Parse the input into an AST, keep only the tags and attributes allowed by the whitelists, and then 224 | print it to a string. 
225 | 226 | ```typescript jsx 227 | function safeHtml(input: string, options?: Partial<SafeHtmlOptions>): string; 228 | ``` 229 | 230 | 231 | 232 | - `SafeHtmlOptions` 233 | 234 | ```typescript jsx 235 | export interface SafeHtmlOptions { 236 | allowedTags: string[]; 237 | allowedAttrs: string[]; 238 | tagAllowedAttrs: Record; 239 | allowedUrl: RegExp; 240 | } 241 | ``` 242 | 243 | #### safeHtmlDefaultOptions 244 | 245 | The default options of [`safeHtml`](#safehtmlinput). You can modify them; the 246 | effect is global. 247 | 248 | ```typescript jsx 249 | const safeHtmlDefaultOptions: SafeHtmlOptions; 250 | ``` 251 | 252 | ## Warnings 253 | 254 | This is used for HTML5, which means: 255 | 256 | 1. All tags like `<!...>`, `<?...>` (except for `<!doctype ...>`, case insensitive) 257 | are treated as `Comment`, which means `CDATASection` is treated as a comment. 258 | 2. Special tag names: 259 | 260 | - `"!doctype"` (case insensitive), the doctype declaration 261 | - `"!"`: short comment 262 | - `"!--"`: normal comment 263 | - `""`(empty string): short comment, for `<?...>`, the leading `?` is treated as comment content 264 | 265 | ## Benchmark 266 | 267 | Thanks to [htmlparser-benchmark](https://github.com/AndreasMadsen/htmlparser-benchmark), 268 | I created a pull request at [pulls/7](https://github.com/AndreasMadsen/htmlparser-benchmark/pull/7/files), 269 | and the result on my MacBook Pro is: 270 | 271 | ```bash 272 | $ npm test 273 | 274 | > htmlparser-benchmark@1.1.3 test ~/htmlparser-benchmark 275 | > node execute.js 276 | 277 | gumbo-parser failed (exit code 1) 278 | high5 failed (exit code 1) 279 | 280 | html-parser : 28.6524 ms/file ± 21.4282 281 | 282 | html5 : 130.423 ms/file ± 161.478 283 | 284 | html5parser : 2.37975 ms/file ± 3.30717 285 | 286 | htmlparser : 16.6576 ms/file ± 109.840 287 | 288 | htmlparser2-dom : 3.45602 ms/file ± 5.05830 289 | 290 | htmlparser2 : 2.61135 ms/file ± 4.33535 291 | hubbub failed (exit code 1) 292 | libxmljs failed (exit code 1) 293 | 294 | neutron-html5parser: 2.89331 ms/file 
± 2.94316 295 | parse5 failed (exit code 1) 296 | 297 | sax : 10.2110 ms/file ± 13.5204 298 | ``` 299 | 300 | ## License 301 | 302 | ``` 303 | The MIT License (MIT) 304 | 305 | Copyright (c) 2020 acrazing 306 | 307 | Permission is hereby granted, free of charge, to any person obtaining a copy 308 | of this software and associated documentation files (the "Software"), to deal 309 | in the Software without restriction, including without limitation the rights 310 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 311 | copies of the Software, and to permit persons to whom the Software is 312 | furnished to do so, subject to the following conditions: 313 | 314 | The above copyright notice and this permission notice shall be included in all 315 | copies or substantial portions of the Software. 316 | 317 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 318 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 319 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 320 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 321 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 322 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 323 | SOFTWARE. 
324 | ``` 325 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "html5parser", 3 | "description": "A super fast & tiny HTML5 parser", 4 | "version": "2.0.2", 5 | "author": "acrazing ", 6 | "keywords": [ 7 | "html5", 8 | "parser", 9 | "ast", 10 | "attributes" 11 | ], 12 | "license": "MIT", 13 | "main": "dist/html5parser.cjs.js", 14 | "module": "dist/html5parser.es.js", 15 | "umd": "dist/html5parser.umd.js", 16 | "types": "dist/index.d.ts", 17 | "files": [ 18 | "dist/", 19 | "src/" 20 | ], 21 | "scripts": { 22 | "clean": "rm -rf dist temp .cache", 23 | "bundle": "rollup -c rollup.config.js", 24 | "build": "run-s clean bundle", 25 | "prepublishOnly": "run-s test build", 26 | "test": "jest" 27 | }, 28 | "repository": { 29 | "type": "git", 30 | "url": "git+https://github.com/acrazing/html5parser.git" 31 | }, 32 | "bugs": { 33 | "url": "https://github.com/acrazing/html5parser/issues" 34 | }, 35 | "homepage": "https://github.com/acrazing/html5parser#readme", 36 | "devDependencies": { 37 | "@rollup/plugin-commonjs": "^19.0.0", 38 | "@rollup/plugin-node-resolve": "^13.0.0", 39 | "@types/fs-extra": "^9.0.11", 40 | "@types/jest": "^26.0.23", 41 | "@types/node": "^15.0.2", 42 | "@types/node-fetch": "^2.5.10", 43 | "fs-extra": "^10.0.0", 44 | "husky": "^4.3.0", 45 | "jest": "^26.6.3", 46 | "lint-staged": "^11.0.0", 47 | "node-fetch": "^2.6.1", 48 | "npm-run-all": "^4.1.5", 49 | "prettier": "^2.3.0", 50 | "rollup": "^2.47.0", 51 | "rollup-plugin-sourcemaps": "^0.6.3", 52 | "rollup-plugin-terser": "^7.0.2", 53 | "rollup-plugin-typescript2": "^0.30.0", 54 | "ts-jest": "^26.5.6", 55 | "typescript": "^4.2.4" 56 | }, 57 | "jest": { 58 | "moduleFileExtensions": [ 59 | "ts", 60 | "tsx", 61 | "js", 62 | "jsx", 63 | "json", 64 | "node", 65 | "mjs" 66 | ], 67 | "cacheDirectory": ".cache/jest", 68 | "collectCoverage": false, 69 | 
"collectCoverageFrom": [ 70 | "/src/**/*.{ts,tsx}", 71 | "!**/*.d.ts" 72 | ], 73 | "coverageDirectory": "temp/coverage", 74 | "globals": { 75 | "__DEV__": true, 76 | "ENV": {} 77 | }, 78 | "testMatch": [ 79 | "/src/**/*.spec.{ts,tsx}" 80 | ], 81 | "transform": { 82 | "^.+\\.tsx?$": "ts-jest" 83 | } 84 | }, 85 | "cliVersion": "8.8.4", 86 | "husky": { 87 | "hooks": { 88 | "pre-commit": "lint-staged" 89 | } 90 | }, 91 | "lint-staged": { 92 | "*.{js,jsx,ts,tsx,json,css,less,scss,md}": [ 93 | "prettier --write" 94 | ] 95 | }, 96 | "prettier": { 97 | "printWidth": 100, 98 | "tabWidth": 2, 99 | "useTabs": false, 100 | "semi": true, 101 | "singleQuote": true, 102 | "jsxSingleQuote": false, 103 | "trailingComma": "all", 104 | "bracketSpacing": true, 105 | "jsxBracketSameLine": false, 106 | "arrowParens": "always", 107 | "endOfLine": "lf" 108 | }, 109 | "dependencies": { 110 | "tslib": "^2.2.0" 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /rollup.config.js: -------------------------------------------------------------------------------- 1 | /* 2 | * @since 2020-11-03 13:16:07 3 | * @author acrazing 4 | */ 5 | 6 | import commonjs from '@rollup/plugin-commonjs'; 7 | import resolve from '@rollup/plugin-node-resolve'; 8 | import sourceMaps from 'rollup-plugin-sourcemaps'; 9 | import { terser } from 'rollup-plugin-terser'; 10 | import typescript from 'rollup-plugin-typescript2'; 11 | 12 | const packageJson = require('./package.json'); 13 | 14 | const deps = Object.keys(packageJson.dependencies).concat(Object.keys(packageJson.devDependencies)); 15 | 16 | const id = packageJson.name.split('/').pop(); 17 | 18 | const options = (format, index = 'index') => ({ 19 | input: `src/${index}.ts`, 20 | output: { 21 | file: `dist/${id}${index === 'index' ? '' : '-' + index}.${format}.js`, 22 | format, 23 | sourcemap: true, 24 | name: id.charAt(0).toUpperCase() + id.substring(1), 25 | plugins: format === 'umd' ? 
[terser({ format: { comments: false } })] : [], 26 | }, 27 | external: format === 'umd' ? [] : deps, 28 | plugins: [ 29 | typescript({ 30 | tsconfigOverride: { 31 | compilerOptions: { module: 'esnext' }, 32 | exclude: ['src/**/*.spec.ts', 'src/**/*.spec.tsx'], 33 | }, 34 | }), 35 | commonjs(), 36 | resolve({ preferBuiltins: true }), 37 | sourceMaps(), 38 | ], 39 | }); 40 | 41 | export default [options('cjs'), options('es'), options('umd')]; 42 | -------------------------------------------------------------------------------- /src/config.ts: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * Copyright 2017 - acrazing 4 | * 5 | * @author acrazing joking.young@gmail.com 6 | * @since 2017-08-19 15:56:14 7 | * @version 1.0.0 8 | * @desc config.ts 9 | */ 10 | 11 | function createMap(keys: string, value: T): Record { 12 | return keys.split(',').reduce((pre, now) => { 13 | pre[now] = value; 14 | return pre; 15 | }, Object.create(null)); 16 | } 17 | 18 | export const selfCloseTags = createMap( 19 | 'area,base,br,col,embed,hr,img,input,link,meta,param,source,track,wbr,!doctype,,!,!--', 20 | true, 21 | ); 22 | 23 | export const noNestedTags = createMap('li,option,select,textarea', true); 24 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | /*! 
2 | * 3 | * Copyright 2017 - acrazing 4 | * 5 | * @author acrazing joking.young@gmail.com 6 | * @since 2017-08-19 01:09:54 7 | * @version 1.0.0 8 | * @desc index.ts 9 | */ 10 | 11 | export { SyntaxKind, IBaseNode, IAttributeValue, IAttribute, ITag, IText, INode } from './types'; 12 | export { TokenKind, IToken, tokenize } from './tokenize'; 13 | export { ParseOptions, parse } from './parse'; 14 | export { WalkOptions, walk } from './walk'; 15 | export { SafeHtmlOptions, safeHtmlDefaultOptions, safeHtml } from './safeHtml'; 16 | -------------------------------------------------------------------------------- /src/misc.spec.ts: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * Copyright 2017 - acrazing 4 | * 5 | * @author acrazing joking.young@gmail.com 6 | * @since 2017-08-22 22:17:46 7 | * @version 1.0.0 8 | * @desc misc.spec.ts 9 | */ 10 | 11 | import * as fs from 'fs-extra'; 12 | import fetch from 'node-fetch'; 13 | import * as path from 'path'; 14 | import { parse } from './parse'; 15 | 16 | function run(url: string) { 17 | const id = url.replace(/[^\w\d]+/g, '_').replace(/^_+|_+$/g, ''); 18 | return fetch(url) 19 | .then((r) => r.text()) 20 | .then((d) => { 21 | console.log('[FETCH:OK]: %s', url); 22 | fs.outputFileSync(path.join(process.cwd(), 'temp', `${id}.html`), d); 23 | console.time('parse:' + url); 24 | const ast = parse(d); 25 | console.timeEnd('parse:' + url); 26 | fs.outputJSONSync(path.join(process.cwd(), 'temp', `${id}.json`), ast, { 27 | spaces: 2, 28 | }); 29 | }) 30 | .catch((err) => { 31 | console.error('[ERR]: %s, %s', id, err.message); 32 | }); 33 | } 34 | 35 | const scenes = [ 36 | 'https://www.baidu.com/', 37 | 'https://www.qq.com/?fromdefault', 38 | 'https://www.taobao.com/', 39 | ]; 40 | 41 | describe('real scenarios', () => { 42 | for (const scene of scenes) { 43 | it(`parse ${scene}`, async () => run(scene)); 44 | } 45 | }); 46 | 
-------------------------------------------------------------------------------- /src/parse.spec.ts: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * Copyright 2017 - acrazing 4 | * 5 | * @author acrazing joking.young@gmail.com 6 | * @since 2017-08-20 17:32:24 7 | * @version 1.0.0 8 | * @desc parse.spec.ts 9 | */ 10 | 11 | import * as assert from 'assert'; 12 | import { parse } from './parse'; 13 | import { IAttribute, IAttributeValue, INode, ITag, IText, SyntaxKind } from './types'; 14 | 15 | export let index = 0; 16 | 17 | export function text(input: string, start = index): IText { 18 | return { 19 | type: SyntaxKind.Text, 20 | start: start, 21 | end: index = input.length + start, 22 | value: input, 23 | }; 24 | } 25 | 26 | export function tag( 27 | input: string, 28 | name: string, 29 | open: IText, 30 | attributes: IAttribute[], 31 | body: INode[] | undefined | null, 32 | close: IText | undefined | null, 33 | start: number, 34 | rawName = name, 35 | ): ITag { 36 | return { 37 | start: start, 38 | end: index = start + input.length, 39 | type: SyntaxKind.Tag, 40 | open: open, 41 | name: name, 42 | rawName: rawName, 43 | attributes: attributes, 44 | attributeMap: undefined, 45 | body: body, 46 | close: close, 47 | }; 48 | } 49 | 50 | function attr(name: IText, value?: IAttributeValue): IAttribute { 51 | return { 52 | start: name.start, 53 | end: index = value ? value.end : name.end, 54 | name: name, 55 | value: value, 56 | }; 57 | } 58 | 59 | function value(input: string, quote: undefined | "'" | '"', start = index): IAttributeValue { 60 | return { 61 | start: start, 62 | end: index = start + (quote === void 0 ? 
0 : 2) + input.length, 63 | value: input, 64 | quote: quote, 65 | }; 66 | } 67 | 68 | const scenes: Array<{ 69 | name: string; 70 | input: string; 71 | nodes: INode[]; 72 | }> = [ 73 | { 74 | name: 'text', 75 | input: 'hello world', 76 | nodes: [text('hello world', 0)], 77 | }, 78 | { 79 | name: 'text twice', 80 | input: 'hello < world', 81 | nodes: [text('hello < world', 0)], 82 | }, 83 | { 84 | name: 'single tag', 85 | input: '
', 86 | nodes: [tag('
', 'div', text('
', 0), [], [], text('
'), 0)], 87 | }, 88 | { 89 | name: 'tag attributes', 90 | input: '
', 91 | nodes: [ 92 | tag( 93 | '
', 94 | 'div', 95 | text('
', 0), 96 | [ 97 | attr(text('a1', 5)), 98 | attr(text('b2', index + 1), value('c3', void 0, index + 1)), 99 | attr(text('d4', index + 1), value('e5', void 0, index + 3)), 100 | attr(text('f6', index + 1), value('g7', "'", index + 1)), 101 | attr(text('h8', index + 1), value('i9', '"', index + 1)), 102 | ], 103 | void 0, 104 | null, 105 | 0, 106 | ), 107 | ], 108 | }, 109 | { 110 | name: 'nested tags', 111 | input: ` 112 |
113 | hello world 114 |

h1

115 | 116 | 117 |
118 |
119 | span 120 | 121 |
122 |
123 |
124 | `, 125 | nodes: [ 126 | text('\n', 0), 127 | tag( 128 | `
129 | hello world 130 |

h1

131 | 132 | 133 |
134 |
135 | span 136 | 137 |
138 |
139 |
`, 140 | 'div', 141 | text('
', 1), 142 | [attr(text('id', 6), value('1', '"', 9))], 143 | [ 144 | text('\n hello world\n ', 13), 145 | tag( 146 | '

h1

', 147 | 'h1', 148 | text('

', 30), 149 | [attr(text('id', 34), value('h1', '"', 37))], 150 | [text('h1', 42)], 151 | text('

', 44), 152 | 30, 153 | ), 154 | text('\n ', 49), 155 | tag( 156 | '', 157 | 'img', 158 | text('', 52), 159 | [attr(text('src', 57), value('/src/index.ts', '"', 61))], 160 | void 0, 161 | null, 162 | 52, 163 | ), 164 | text('\n ', 77), 165 | tag('', 'input', text('', 80), [], void 0, null, 80), 166 | text('\n ', 89), 167 | tag( 168 | `
169 |
170 | span 171 | 172 |
173 |
`, 174 | 'div', 175 | text('
', 92), 176 | [attr(text('id', 97), value('2', '"', 100))], 177 | [ 178 | text('\n ', 104), 179 | tag( 180 | `
181 | span 182 | 183 |
`, 184 | 'div', 185 | text('
', 109), 186 | [attr(text('id', 114), value('3', '"', 117))], 187 | [ 188 | text('\n ', 121), 189 | tag( 190 | 'span', 191 | 'span', 192 | text('', 128), 193 | [], 194 | [text('span', 134)], 195 | text('', 138), 196 | 128, 197 | ), 198 | text('\n ', 145), 199 | tag( 200 | '', 201 | 'empty', 202 | text('', 152), 203 | [], 204 | [], 205 | text('', 159), 206 | 152, 207 | ), 208 | text('\n ', 167), 209 | ], 210 | text('
', 172), 211 | 109, 212 | ), 213 | text('\n ', 178), 214 | ], 215 | text('
', 181), 216 | 92, 217 | ), 218 | text('\n', 187), 219 | ], 220 | text('
', 188), 221 | 1, 222 | ), 223 | text('\n ', 194), 224 | ], 225 | }, 226 | { 227 | name: 'doctype', 228 | input: '', 229 | nodes: [ 230 | tag( 231 | '', 232 | '!doctype', 233 | text('', 0), 234 | [attr(text('html', 10))], 235 | void 0, 236 | null, 237 | 0, 238 | ), 239 | tag('', 'html', text('', 15), [], [], text('', 21), 15), 240 | ], 241 | }, 242 | { 243 | name: 'comments', 244 | input: 245 | '', 246 | nodes: [ 247 | tag( 248 | '', 249 | '!--', 250 | text('', 20), 254 | 0, 255 | ), 256 | tag( 257 | '', 258 | '!', 259 | text('', 42), 263 | 23, 264 | ), 265 | tag( 266 | '', 267 | '!', 268 | text('', 62), 272 | 43, 273 | ), 274 | tag( 275 | '', 276 | '', 277 | text('<', 63), 278 | [], 279 | [text('? qm comment ?', 64)], 280 | text('>', 78), 281 | 63, 282 | ), 283 | tag( 284 | '', 285 | '!', 286 | text('', 97), 290 | 79, 291 | ), 292 | ], 293 | }, 294 | { 295 | name: 'normal comment special', 296 | input: '', 297 | nodes: [ 298 | tag( 299 | '', 300 | '!--', 301 | text('', 14), 305 | 0, 306 | ), 307 | ], 308 | }, 309 | { 310 | name: 'script', 311 | input: '', 312 | nodes: [ 313 | tag( 314 | '', 315 | 'script', 316 | text('', 22), 320 | 0, 321 | ), 322 | ], 323 | }, 324 | { 325 | name: 'script', 326 | input: '', 327 | nodes: [ 328 | tag( 329 | '', 330 | 'style', 331 | text('', 20), 335 | 0, 336 | ), 337 | ], 338 | }, 339 | { 340 | name: 'tag name', 341 | input: '
', 342 | nodes: [tag('
', 'div', text('
', 0), [], [], text('
', 5), 0, 'DIV')], 343 | }, 344 | ]; 345 | 346 | describe('parse cases', () => { 347 | for (const scene of scenes) { 348 | it(`case ${JSON.stringify(scene.name)}`, () => { 349 | assert.deepStrictEqual(parse(scene.input), scene.nodes); 350 | }); 351 | } 352 | }); 353 | 354 | describe('parse options', () => { 355 | it('should setAttributeMap', () => { 356 | const ast = parse(`
`, { 357 | setAttributeMap: true, 358 | }); 359 | const div = tag( 360 | '
', 361 | 'div', 362 | text('
', 0), 363 | [ 364 | attr(text('same', 5), value('1', '"', index + 1)), 365 | attr(text('diff', index + 1), value('2', '"', index + 1)), 366 | attr(text('same', index + 1), value('3', '"', index + 1)), 367 | ], 368 | void 0, 369 | null, 370 | 0, 371 | ); 372 | div.attributeMap = { 373 | same: div.attributes[2], 374 | diff: div.attributes[1], 375 | }; 376 | expect(ast).toEqual([div]); 377 | }); 378 | }); 379 | -------------------------------------------------------------------------------- /src/parse.ts: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * Copyright 2017 - acrazing 4 | * 5 | * @author acrazing joking.young@gmail.com 6 | * @since 2017-08-19 00:54:46 7 | * @version 1.0.0 8 | * @desc parse.ts 9 | */ 10 | 11 | import { noNestedTags, selfCloseTags } from './config'; 12 | import { IToken, tokenize, TokenKind } from './tokenize'; 13 | import { IAttribute, IAttributeValue, INode, ITag, IText, SyntaxKind } from './types'; 14 | import { getLineRanges, getPosition } from './utils'; 15 | import { walk } from './walk'; 16 | 17 | interface IContext { 18 | parent: IContext | undefined; 19 | tag: ITag; 20 | } 21 | 22 | export interface ParseOptions { 23 | // create tag's attributes map 24 | // if true, will set ITag.attributeMap property 25 | // as a `Record` 26 | setAttributeMap: boolean; 27 | } 28 | 29 | let index: number; 30 | let count: number; 31 | let tokens: IToken[]; 32 | let tagChain: IContext | undefined; 33 | let nodes: INode[]; 34 | let token: IToken; 35 | let node: IText | undefined; 36 | let buffer: string; 37 | let lines: number[] | undefined; 38 | let parseOptions: ParseOptions | undefined; 39 | 40 | function init(input?: string, options?: ParseOptions) { 41 | if (input === void 0) { 42 | count = 0; 43 | tokens.length = 0; 44 | buffer = ''; 45 | } else { 46 | tokens = tokenize(input); 47 | count = tokens.length; 48 | buffer = input; 49 | } 50 | index = 0; 51 | tagChain = void 0; 52 | nodes = []; 53 | 
token = void 0 as any; 54 | node = void 0; 55 | lines = void 0; 56 | parseOptions = options; 57 | } 58 | 59 | function pushNode(_node: ITag | IText) { 60 | if (!tagChain) { 61 | nodes.push(_node); 62 | } else if ( 63 | _node.type === SyntaxKind.Tag && 64 | _node.name === tagChain.tag.name && 65 | noNestedTags[_node.name] 66 | ) { 67 | tagChain = tagChain.parent; 68 | pushNode(_node); 69 | } else if (tagChain.tag.body) { 70 | tagChain.tag.end = _node.end; 71 | tagChain.tag.body.push(_node); 72 | } 73 | } 74 | 75 | function pushTagChain(tag: ITag) { 76 | tagChain = { parent: tagChain, tag: tag }; 77 | node = void 0; 78 | } 79 | 80 | function createLiteral(start = token.start, end = token.end, value = token.value): IText { 81 | return { start, end, value, type: SyntaxKind.Text }; 82 | } 83 | 84 | function createTag(): ITag { 85 | return { 86 | start: token.start - 1, // include < 87 | end: token.end, 88 | type: SyntaxKind.Tag, 89 | open: createLiteral(token.start - 1), // not finished 90 | name: token.value, 91 | rawName: buffer.substring(token.start, token.end), 92 | attributes: [], 93 | attributeMap: void 0, 94 | body: null, 95 | close: null, 96 | }; 97 | } 98 | 99 | function createAttribute(): IAttribute { 100 | return { 101 | start: token.start, 102 | end: token.end, 103 | name: createLiteral(), 104 | value: void 0, 105 | }; 106 | } 107 | 108 | function createAttributeValue(): IAttributeValue { 109 | return { 110 | start: token.start, 111 | end: token.end, 112 | value: 113 | token.type === TokenKind.AttrValueNq 114 | ? token.value 115 | : token.value.substr(1, token.value.length - 2), 116 | quote: 117 | token.type === TokenKind.AttrValueNq 118 | ? void 0 119 | : token.type === TokenKind.AttrValueSq 120 | ? 
"'"
      : '"',
  };
}

/**
 * Appends the current token's text to a literal node and moves the node's end
 * offset to the end of the current token.
 *
 * @param _node literal to extend; when omitted (or `undefined`) the
 *   module-level `node` is used.
 */
function appendLiteral(_node: IText | IAttributeValue = node as IText) {
  _node.value += token.value;
  _node.end = token.end;
}

/**
 * Throws an error describing the current token, its `[line,column]` position
 * and, when a tag is currently open, the name of that tag.
 *
 * @throws Error always.
 */
function unexpected() {
  // Line ranges are only needed for error reporting, so they are computed lazily.
  if (lines === void 0) {
    lines = getLineRanges(buffer);
  }
  const [line, column] = getPosition(lines, token.start);
  throw new Error(
    `Unexpected token "${token.value}(${token.type})" at [${line},${column}]` +
      (tagChain ? ` when parsing tag: ${JSON.stringify(tagChain.tag.name)}.` : ''),
  );
}

/**
 * Rebuilds `tag.attributeMap` from `tag.attributes`, keyed by attribute name.
 * When a name occurs more than once the last occurrence wins.
 */
function buildAttributeMap(tag: ITag) {
  tag.attributeMap = {};
  for (const attr of tag.attributes) {
    tag.attributeMap[attr.name.value] = attr;
  }
}

/** States of the attribute scanner used by `parseOpenTag`. */
const enum OpenTagState {
  /** Waiting for the first token of the next attribute. */
  BeforeAttr,
  /** Collecting the tokens that make up an attribute name. */
  InName,
  /** Whitespace was seen after a name; an `=` or a new attribute may follow. */
  AfterName,
  /** An `=` was seen; the next non-whitespace token starts the value. */
  AfterEqual,
  /** Collecting the tokens of an unquoted attribute value. */
  InValue,
}

/**
 * Consumes the tokens of one open tag, starting at the current `OpenTag` token.
 *
 * The created tag is pushed into the tree first. Tags named `''`, `'!'` or
 * `'!--'` (comment-like constructs) get their raw content collected into a
 * single text child; every other tag has its attributes parsed until the
 * `OpenTagEnd` token. On return `index` points at the last consumed token.
 */
function parseOpenTag() {
  let state = OpenTagState.BeforeAttr;

  // Always assigned (BeforeAttr / AfterName) before any state that reads it.
  let attr!: IAttribute;

  const tag = createTag();
  pushNode(tag);

  if (tag.name === '' || tag.name === '!' || tag.name === '!--') {
    tag.open.value = '<' + tag.open.value;
    if (index + 1 >= count) {
      // The open token is the last token (e.g. the input ends with "<!"):
      // there is nothing left to read. The previous `index === count` guard
      // could never be true here and let `tokens[++index]` yield undefined.
      return;
    }
    token = tokens[++index];
    if (token.type !== TokenKind.OpenTagEnd) {
      // Everything up to the terminator becomes one text child.
      node = createLiteral();
      tag.body = [node];
      while (++index < count) {
        token = tokens[index];
        if (token.type === TokenKind.OpenTagEnd) {
          break;
        }
        appendLiteral();
      }
      node = void 0;
    }
    if (token.type === TokenKind.OpenTagEnd) {
      tag.close = createLiteral(token.start, token.end + 1, `${token.value}>`);
      tag.end = tag.close.end;
    } else {
      // Input ended inside the construct: there is no terminator to turn into
      // a `close` literal, so the tag simply ends where its content ends.
      tag.end = token.end;
    }
    return;
  }

  while (++index < count) {
    token = tokens[index];
    if (token.type === TokenKind.OpenTagEnd) {
      // `token.end + 1` accounts for the '>' that the tokenizer trims.
      tag.end = tag.open.end = token.end + 1;
      tag.open.value = buffer.substring(tag.open.start, tag.open.end);
      if (token.value === '' && !selfCloseTags[tag.name]) {
        // A regular tag: it can have children until its close tag shows up.
        tag.body = [];
        pushTagChain(tag);
      } else {
        // Explicit "/>" or a known self-closing tag: no body at all.
        tag.body = void 0;
      }
      break;
    }
    switch (state) {
      case OpenTagState.BeforeAttr:
        if (token.type !== TokenKind.Whitespace) {
          attr = createAttribute();
          state = OpenTagState.InName;
          tag.attributes.push(attr);
        }
        break;
      case OpenTagState.InName:
        if (token.type === TokenKind.Whitespace) {
          state = OpenTagState.AfterName;
        } else if (token.type === TokenKind.AttrValueEq) {
          state = OpenTagState.AfterEqual;
        } else {
          appendLiteral(attr.name);
        }
        break;
      case OpenTagState.AfterName:
        if (token.type !== TokenKind.Whitespace) {
          if (token.type === TokenKind.AttrValueEq) {
            state = OpenTagState.AfterEqual;
          } else {
            // The previous attribute had no value; this token starts a new one.
            attr = createAttribute();
            state = OpenTagState.InName;
            tag.attributes.push(attr);
          }
        }
        break;
      case OpenTagState.AfterEqual:
        if (token.type !== TokenKind.Whitespace) {
          attr.value = createAttributeValue();
          if (token.type === TokenKind.AttrValueNq) {
            // Unquoted values may continue over several tokens.
            state = OpenTagState.InValue;
          } else {
            attr.end = attr.value.end;
            state = OpenTagState.BeforeAttr;
          }
        }
        break;
      default:
        // OpenTagState.InValue
        if (token.type === TokenKind.Whitespace) {
          attr.end = attr.value!.end;
          state = OpenTagState.BeforeAttr;
        } else {
          appendLiteral(attr.value);
        }
        break;
    }
  }
}

/**
 * Handles a `CloseTag` token: finds the nearest open tag with the same name,
 * records its closing literal and pops the tag chain up to that tag's parent.
 * A close tag that matches no open tag is ignored.
 */
function parseCloseTag() {
  const name = token.value.trim();
  let context = tagChain;
  while (context && context.tag.name !== name) {
    context = context.parent;
  }
  if (!context) {
    return;
  }
  // The token excludes the leading "</" and the trailing ">".
  const start = token.start - 2;
  const end = token.end + 1;
  context.tag.close = createLiteral(start, end, buffer.substring(start, end));
  context.tag.end = context.tag.close.end;
  tagChain = context.parent;
}

/**
 * Parses an HTML string into a tree of text and tag nodes.
 *
 * @param input the HTML source.
 * @param options parse options; `setAttributeMap` defaults to `false`. When it
 *   is enabled every tag in the result gets its `attributeMap` built.
 * @returns the top-level nodes.
 * @throws Error when a token appears where no literal, open tag or close tag
 *   is expected.
 */
export function parse(input: string, options?: ParseOptions): INode[] {
  init(input, {
    setAttributeMap: false,
    ...options,
  } as ParseOptions);
  try {
    while (index < count) {
      token = tokens[index];
      switch (token.type) {
        case TokenKind.Literal:
          if (!node) {
            node = createLiteral();
            pushNode(node);
          } else {
            // Adjacent literal tokens are merged into one text node.
            appendLiteral(node);
          }
          break;
        case TokenKind.OpenTag:
          node = void 0;
          parseOpenTag();
          break;
        case TokenKind.CloseTag:
          node = void 0;
          parseCloseTag();
          break;
        default:
          unexpected();
          break;
      }
      index++;
    }
    const result = nodes;
    if (parseOptions?.setAttributeMap) {
      walk(result, {
        enter(visited: IText | ITag): void {
          if (visited.type === SyntaxKind.Tag) {
            buildAttributeMap(visited);
          }
        },
      });
    }
    return result;
  } finally {
    // Reset the module-level parser state even when parsing throws.
    init();
  }
}

--------------------------------------------------------------------------------
/src/safeHtml.spec.ts:
-------------------------------------------------------------------------------- 1 | /* 2 | * @since 2020-09-09 23:37:28 3 | * @author acrazing 4 | */ 5 | 6 | import { safeHtml } from './safeHtml'; 7 | 8 | const htmlInput = ` 9 |
10 |

H1

11 |

H2

12 | \t 13 | \t 14 |

15 | Span 16 | 17 | 18 |
TD
19 | 20 |

21 | Downloadchild 22 | Javascriptchild 23 |
24 | `; 25 | 26 | const htmlOutput = ` 27 |
28 |

H1

29 |

H2

30 | \t 31 | \t 32 |

33 | Span 34 | 35 | 36 |
TD
37 | 38 |

39 | Downloadchild 40 | Javascriptchild 41 |
`;

describe('safeHtml', () => {
  it('should stringify safe html as expected', () => {
    expect(safeHtml(htmlInput)).toEqual(htmlOutput);
  });
});

--------------------------------------------------------------------------------
/src/safeHtml.ts:
--------------------------------------------------------------------------------
/*
 * @since 2020-09-09 22:53:14
 * @author acrazing
 */

import { selfCloseTags } from './config';
import { parse } from './parse';
import { INode, SyntaxKind } from './types';

/** Whitelists applied by `safeHtml`. */
export interface SafeHtmlOptions {
  /** Tag names that are kept; any other tag is dropped together with its content. */
  allowedTags: string[];
  /** Attribute names that are kept on every allowed tag. */
  allowedAttrs: string[];
  /** Additional attribute names that are kept, keyed by tag name. */
  tagAllowedAttrs: Record<string, string[]>;
  /** Pattern a `src` / `href` value has to match for the attribute to be kept. */
  allowedUrl: RegExp;
}

/** Defaults used by `safeHtml` for every option the caller does not override. */
export const safeHtmlDefaultOptions: SafeHtmlOptions = {
  allowedTags: [
    'a',
    'abbr',
    'address',
    'area',
    'article',
    'aside',
    'b',
    'bdi',
    'bdo',
    'big',
    'blockquote',
    'br',
    'button',
    'caption',
    'cite',
    'code',
    'col',
    'colgroup',
    'data',
    'dd',
    'del',
    'dfn',
    'div',
    'dl',
    'dt',
    'em',
    'figcaption',
    'figure',
    'footer',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'header',
    'hgroup',
    'hr',
    'i',
    'img',
    'ins',
    'kbd',
    'label',
    'li',
    'main',
    'map',
    'ol',
    'p',
    'picture',
    'pre',
    'q',
    'rp',
    'rt',
    'ruby',
    's',
    'samp',
    'section',
    'small',
    'span',
    'strong',
    'sub',
    'summary',
    'sup',
    'table',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'time',
    'tr',
    'u',
    'ul',
    'var',
    'wbr',
  ],
  allowedAttrs: ['style'],
  tagAllowedAttrs: {
    a: ['href', 'target'],
    img: ['src'],
    td: ['rowspan', 'colspan'],
    th: ['rowspan', 'colspan'],
    time: ['datetime'],
    colgroup: ['span'],
    col: ['span'],
  },
  // Either an explicitly allowed scheme, or a value whose text before the
  // first ':' cannot be a scheme, or a value without any ':'.
  allowedUrl: /^(?:mailto|tel|https?|ftp|[^:]*[^a-z0-9.+-][^:]*):|^[^:]*$/i,
};

/**
 * Re-serializes `input`, keeping only whitelisted tags and attributes.
 *
 * @param input the HTML source.
 * @param options overrides for `safeHtmlDefaultOptions`. `tagAllowedAttrs` is
 *   merged per tag name with the defaults; the other options replace them.
 * @returns the filtered HTML.
 * @throws Error when a self-closing tag has a body, or when any other allowed
 *   tag lacks a body or a close tag.
 */
export function safeHtml(input: string, options: Partial<SafeHtmlOptions> = {}): string {
  const config: SafeHtmlOptions = {
    ...safeHtmlDefaultOptions,
    ...options,
    tagAllowedAttrs: {
      ...safeHtmlDefaultOptions.tagAllowedAttrs,
      ...options.tagAllowedAttrs,
    },
  };
  const ast = parse(input);
  return stringify(ast, config, input);
}

/**
 * Decides whether a `src` / `href` value may be kept.
 *
 * The value is normalized the way URL parsers do before they look for a scheme
 * (leading/trailing C0 control characters and spaces are dropped, tabs and
 * newlines are removed everywhere), so that values such as
 * `" javascript:..."` cannot slip past `pattern`. Values carrying a character
 * reference before the first `/`, `?` or `#` are rejected as well, because the
 * reference could decode into part of a scheme (e.g. `javascript&colon;`).
 * NOTE(review): this also drops relative URLs whose first path segment
 * contains `&` - confirm that this is acceptable.
 */
function isAllowedUrl(value: string, pattern: RegExp): boolean {
  const url = value
    .replace(/^[\u0000-\u0020]+|[\u0000-\u0020]+$/g, '')
    .replace(/[\t\n\r]/g, '');
  const beforePath = url.split(/[\/?#]/, 1)[0];
  if (beforePath.indexOf('&') > -1) {
    return false;
  }
  // A caller-supplied pattern may carry the `g` or `y` flag, which makes
  // `test()` resume from `lastIndex`; always start from the beginning.
  pattern.lastIndex = 0;
  return pattern.test(url);
}

/**
 * Serializes `ast` back to HTML while applying the whitelists in `config`.
 *
 * Text nodes are emitted as-is, tags that are not allowed are dropped together
 * with their content, and kept attributes are copied verbatim from `input`.
 *
 * @throws Error when a self-closing tag has a body, or when any other allowed
 *   tag lacks a body or a close tag.
 */
function stringify(ast: INode[], config: SafeHtmlOptions, input: string): string {
  return ast
    .map((node) => {
      if (node.type === SyntaxKind.Text) {
        return node.value;
      }
      if (config.allowedTags.indexOf(node.name) === -1) {
        return '';
      }
      if (selfCloseTags[node.name]) {
        if (node.body !== void 0) {
          throw new Error(`self closed tag "${node.name}" should not have body`);
        }
      } else if (!node.body || !node.close) {
        throw new Error(`tag "${node.name}" should have body and close`);
      }
      // Tags without an entry simply have no tag-specific attributes.
      const tagAttrs = config.tagAllowedAttrs[node.name] ?? [];
      const attrs = node.attributes
        .filter((a) => {
          const name = a.name.value;
          if (config.allowedAttrs.indexOf(name) === -1 && tagAttrs.indexOf(name) === -1) {
            return false;
          }
          // Only URL-carrying attributes that actually have a value need the URL check.
          if (!a.value || (name !== 'src' && name !== 'href')) {
            return true;
          }
          return isAllowedUrl(a.value.value, config.allowedUrl);
        })
        .map((a) => input.substring(a.start, a.end))
        .join(' ');
      const head = '<' + node.rawName + (attrs ? ' ' + attrs : '') + '>';
      if (!node.body) {
        return head;
      }
      return head + stringify(node.body, config, input) + '</' + node.rawName + '>';
    })
    .join('');
}

--------------------------------------------------------------------------------
/src/test/issue_6.spec.ts:
--------------------------------------------------------------------------------
/*
 * @since 2020-09-09 22:04:54
 * @author acrazing
 */

import { parse } from '../parse';
import { tag, text } from '../parse.spec';
import { tokenize, TokenKind } from '../tokenize';
import { token, tokenIndex } from '../tokenize.spec';

describe('issue #6', () => {
  it('should tokenize upper case tag to lower', () => {
    expect(tokenize('')).toEqual([
      token('test', TokenKind.OpenTag, 1),
      token('', TokenKind.OpenTagEnd),
      token('test', TokenKind.CloseTag, tokenIndex + 3),
    ]);
  });
  it('should parse upper case as expected', () => {
    expect(parse('')).toEqual([
      tag(
        '',
        'test',
        text('', 0),
        [],
        [],
        text(''),
        0,
        'Test',
      ),
    ]);
  });
});

--------------------------------------------------------------------------------
/src/test/issue_7.spec.ts:
--------------------------------------------------------------------------------
/*
 * @since 2020-09-09 22:43:03
 * @author acrazing
 */

import { parse } from '../parse';
import { tag, text } from '../parse.spec';

describe('issue #7', () => {
  it('should parse comment as expected', () => {
    expect(parse('\n-\n')).toEqual([
      tag(
        '',
        '!--',
        text(''),
        0,
      ),
      text('\n-\n'),
    ]);
  });
});

--------------------------------------------------------------------------------
/src/tokenize.spec.ts:
--------------------------------------------------------------------------------
/*!
2 | * 3 | * Copyright 2017 - acrazing 4 | * 5 | * @author acrazing joking.young@gmail.com 6 | * @since 2017-08-19 14:15:25 7 | * @version 1.0.0 8 | * @desc tokenize.spec.ts 9 | * 10 | * @formatter:off 11 | */ 12 | 13 | import * as assert from 'assert'; 14 | import { IToken, tokenize, TokenKind } from './tokenize'; 15 | 16 | interface ICase { 17 | name: string; 18 | input: string; 19 | tokens: IToken[]; 20 | } 21 | 22 | export let tokenIndex = 0; 23 | 24 | export function token(value: string, type: TokenKind = TokenKind.Literal, start = tokenIndex) { 25 | const v = { 26 | start: start, 27 | end: start + value.length, 28 | value, 29 | type, 30 | }; 31 | tokenIndex = v.end; 32 | return v; 33 | } 34 | 35 | const cases: ICase[] = [ 36 | { 37 | name: 'single Literal', 38 | input: 'hello', 39 | tokens: [token('hello', TokenKind.Literal, 0)], 40 | }, 41 | { 42 | name: 'Literal end with <', 43 | input: 'hello<', 44 | tokens: [token('hello', void 0, 0), token('<')], 45 | }, 46 | { 47 | name: 'Literal unexpected <', 48 | input: 'hello< world', 49 | tokens: [token('hello', void 0, 0), token('< world')], 50 | }, 51 | { 52 | name: 'OpenTag EOF', 53 | input: '
", 59 | tokens: [ 60 | token('div', TokenKind.OpenTag, 1), 61 | token(' ', TokenKind.Whitespace), 62 | token('a1', TokenKind.AttrValueNq), 63 | token(' ', TokenKind.Whitespace), 64 | token("'b2'", TokenKind.AttrValueSq), 65 | token(' ', TokenKind.Whitespace), 66 | token('"c3"', TokenKind.AttrValueDq), 67 | token(' ', TokenKind.Whitespace), 68 | token("'d4'", TokenKind.AttrValueSq), 69 | token('e5', TokenKind.AttrValueNq), 70 | token(' ', TokenKind.Whitespace), 71 | token("'f6\"'", TokenKind.AttrValueSq), 72 | token(' ', TokenKind.Whitespace), 73 | token('"g7\'"', TokenKind.AttrValueDq), 74 | token('', TokenKind.OpenTagEnd), 75 | token('div', TokenKind.CloseTag, tokenIndex + 3), 76 | ], 77 | }, 78 | { 79 | name: 'attribute values', 80 | input: '
M
', 81 | tokens: [ 82 | token('div', TokenKind.OpenTag, 1), 83 | token(' ', TokenKind.Whitespace), 84 | token('a', TokenKind.AttrValueNq), 85 | token(' ', TokenKind.Whitespace), 86 | token('b', TokenKind.AttrValueNq), 87 | token('=', TokenKind.AttrValueEq), 88 | token(' ', TokenKind.Whitespace), 89 | token('c', TokenKind.AttrValueNq), 90 | token('=', TokenKind.AttrValueEq), 91 | token('1', TokenKind.AttrValueNq), 92 | token(' ', TokenKind.Whitespace), 93 | token('d', TokenKind.AttrValueNq), 94 | token(' ', TokenKind.Whitespace), 95 | token('e', TokenKind.AttrValueNq), 96 | token(' ', TokenKind.Whitespace), 97 | token('=', TokenKind.AttrValueEq), 98 | token(' ', TokenKind.Whitespace), 99 | token('f', TokenKind.AttrValueNq), 100 | token(' ', TokenKind.Whitespace), 101 | token('=', TokenKind.AttrValueEq), 102 | token(' ', TokenKind.Whitespace), 103 | token('g', TokenKind.AttrValueNq), 104 | token(' ', TokenKind.Whitespace), 105 | token("'h'", TokenKind.AttrValueSq), 106 | token('=', TokenKind.AttrValueEq), 107 | token('i', TokenKind.AttrValueNq), 108 | token(' ', TokenKind.Whitespace), 109 | token('"j"', TokenKind.AttrValueDq), 110 | token('k', TokenKind.AttrValueNq), 111 | token('=', TokenKind.AttrValueEq), 112 | token('lmn', TokenKind.AttrValueNq), 113 | token(' ', TokenKind.Whitespace), 114 | token('o', TokenKind.AttrValueNq), 115 | token('=', TokenKind.AttrValueEq), 116 | token("'pq'", TokenKind.AttrValueSq), 117 | token(' ', TokenKind.Whitespace), 118 | token('r', TokenKind.AttrValueNq), 119 | token('=', TokenKind.AttrValueEq), 120 | token('"st"', TokenKind.AttrValueDq), 121 | token('u', TokenKind.AttrValueNq), 122 | token('', TokenKind.OpenTagEnd), 123 | token('M', void 0, tokenIndex + 1), 124 | token('div', TokenKind.CloseTag, tokenIndex + 2), 125 | ], 126 | }, 127 | { 128 | name: 'normal doctype', 129 | input: '', 130 | tokens: [ 131 | token('!doctype', TokenKind.OpenTag, 1), 132 | token(' ', TokenKind.Whitespace), 133 | token('html', TokenKind.AttrValueNq), 
134 | token('', TokenKind.OpenTagEnd), 135 | ], 136 | }, 137 | { 138 | name: 'unexpected eof end doctype', 139 | input: '', 150 | tokens: [ 151 | token('!--', TokenKind.OpenTag, 1), 152 | token(' hello world '), 153 | token('--', TokenKind.OpenTagEnd), 154 | ], 155 | }, 156 | { 157 | name: 'short comment', 158 | input: '', 159 | tokens: [ 160 | token('', TokenKind.OpenTag, 1), 161 | token('? hello world ?'), 162 | token('', TokenKind.OpenTagEnd), 163 | token('!', TokenKind.OpenTag, tokenIndex + 2), 164 | token('- hello world -'), 165 | token('', TokenKind.OpenTagEnd), 166 | ], 167 | }, 168 | { 169 | name: 'open tag end', 170 | input: '', 171 | tokens: [ 172 | token('a1', TokenKind.OpenTag, 1), 173 | token('', TokenKind.OpenTagEnd), 174 | token('b2', TokenKind.OpenTag, tokenIndex + 2), 175 | token('/', TokenKind.OpenTagEnd), 176 | token('c3', TokenKind.OpenTag, tokenIndex + 2), 177 | token(' ', TokenKind.Whitespace), 178 | token('/', TokenKind.OpenTagEnd), 179 | token('d4', TokenKind.OpenTag, tokenIndex + 2), 180 | token(' ', TokenKind.Whitespace), 181 | token('/', TokenKind.AttrValueNq), 182 | token(' ', TokenKind.Whitespace), 183 | token('', TokenKind.OpenTagEnd), 184 | token('e5', TokenKind.OpenTag, tokenIndex + 2), 185 | token(' ', TokenKind.Whitespace), 186 | token('f6', TokenKind.AttrValueNq), 187 | token('/', TokenKind.OpenTagEnd), 188 | token('g7', TokenKind.OpenTag, tokenIndex + 2), 189 | token(' ', TokenKind.Whitespace), 190 | token('/', TokenKind.AttrValueNq), 191 | token('h8', TokenKind.AttrValueNq), 192 | token('', TokenKind.OpenTagEnd), 193 | token('i9', TokenKind.OpenTag, tokenIndex + 2), 194 | token(' ', TokenKind.Whitespace), 195 | token('/', TokenKind.AttrValueNq), 196 | token('j10', TokenKind.AttrValueNq), 197 | token('/', TokenKind.OpenTagEnd), 198 | token('k11', TokenKind.OpenTag, tokenIndex + 2), 199 | token('/', TokenKind.AttrValueNq), 200 | token('/', TokenKind.OpenTagEnd), 201 | ], 202 | }, 203 | { 204 | name: 'close tag', 205 | input: '
', 206 | tokens: [ 207 | token('div', TokenKind.CloseTag, 2), 208 | token(' div ', TokenKind.CloseTag, tokenIndex + 3), 209 | ], 210 | }, 211 | { 212 | name: 'special normal comment', 213 | input: '', 214 | tokens: [ 215 | token('!--', TokenKind.OpenTag, 1), 216 | token('-- '), 217 | token('- '), 218 | token('-- '), 219 | token('-'), 220 | token('-'), 221 | token('--', TokenKind.OpenTagEnd), 222 | ], 223 | }, 224 | { 225 | name: 'script', 226 | input: '', 227 | tokens: [ 228 | token('script', TokenKind.OpenTag, 1), 229 | token('', TokenKind.OpenTagEnd), 230 | token('
', TokenKind.Literal, tokenIndex + 1), 231 | token('
', 238 | tokens: [ 239 | token('style', TokenKind.OpenTag, 1), 240 | token('', TokenKind.OpenTagEnd), 241 | token('
', TokenKind.Literal, tokenIndex + 1), 242 | token(' { 249 | for (const _case of cases) { 250 | it(`case "${_case.name}"`, () => { 251 | const tokens = tokenize(_case.input); 252 | assert.deepStrictEqual(tokens, _case.tokens); 253 | }); 254 | } 255 | }); 256 | -------------------------------------------------------------------------------- /src/tokenize.ts: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * Copyright 2017 - acrazing 4 | * 5 | * @author acrazing joking.young@gmail.com 6 | * @since 2017-08-19 00:54:29 7 | * @version 1.0.0 8 | * @desc tokenize.ts 9 | */ 10 | 11 | const enum State { 12 | Literal, 13 | BeforeOpenTag, 14 | OpeningTag, 15 | AfterOpenTag, 16 | InValueNq, 17 | InValueSq, 18 | InValueDq, 19 | ClosingOpenTag, 20 | OpeningSpecial, 21 | OpeningDoctype, 22 | OpeningNormalComment, 23 | InNormalComment, 24 | InShortComment, 25 | ClosingNormalComment, 26 | ClosingTag, 27 | } 28 | 29 | export const enum TokenKind { 30 | Literal, 31 | OpenTag, // trim leading '<' 32 | OpenTagEnd, // trim tailing '>', only could be '/' or '' 33 | CloseTag, // trim leading '' 34 | Whitespace, // the whitespace between attributes 35 | AttrValueEq, 36 | AttrValueNq, 37 | AttrValueSq, 38 | AttrValueDq, 39 | } 40 | 41 | export interface IToken { 42 | start: number; 43 | end: number; 44 | value: string; 45 | type: TokenKind; 46 | } 47 | 48 | let state: State; 49 | let buffer: string; 50 | let bufSize: number; 51 | let sectionStart: number; 52 | let index: number; 53 | let tokens: IToken[]; 54 | let char: number; 55 | let inScript: boolean; 56 | let inStyle: boolean; 57 | let offset: number; 58 | 59 | function makeCodePoints(input: string) { 60 | return { 61 | lower: input 62 | .toLowerCase() 63 | .split('') 64 | .map((c) => c.charCodeAt(0)), 65 | upper: input 66 | .toUpperCase() 67 | .split('') 68 | .map((c) => c.charCodeAt(0)), 69 | length: input.length, 70 | }; 71 | } 72 | 73 | const doctype = makeCodePoints('!doctype'); 
const style = makeCodePoints('style');
const script = makeCodePoints('script');

/** Character codes the tokenizer compares against. */
const enum Chars {
  _S = 32, // ' '
  _N = 10, // \n
  _T = 9, // \t
  _R = 13, // \r
  _F = 12, // \f
  Lt = 60, // <
  Ep = 33, // !
  Cl = 45, // -
  Sl = 47, // /
  Gt = 62, // >
  Qm = 63, // ?
  La = 97, // a
  Lz = 122, // z
  Ua = 65, // A
  Uz = 90, // Z
  Eq = 61, // =
  Sq = 39, // '
  Dq = 34, // "
  Ld = 100, // d
  Ud = 68, // D
}

/**
 * Tells whether the current character (`char`) is a space, line feed, tab,
 * carriage return or form feed.
 */
function isWhiteSpace() {
  return (
    char === Chars._S ||
    char === Chars._N ||
    char === Chars._T ||
    char === Chars._R ||
    char === Chars._F
  );
}

/** Resets the module-level tokenizer state for a new `input`. */
function init(input: string) {
  state = State.Literal;
  buffer = input;
  bufSize = input.length;
  sectionStart = 0;
  index = 0;
  tokens = [];
  inScript = false;
  inStyle = false;
  offset = 0;
}

/**
 * Splits an HTML string into tokens.
 *
 * The input is scanned one UTF-16 code unit at a time; the handler for the
 * current state decides whether to emit a token and which state comes next.
 * Whatever is still pending when the input ends is flushed according to the
 * state the scanner stopped in.
 *
 * @param input the HTML source.
 * @returns the tokens in source order.
 */
export function tokenize(input: string): IToken[] {
  init(input);
  try {
    while (index < bufSize) {
      char = buffer.charCodeAt(index);
      switch (state) {
        case State.Literal:
          parseLiteral();
          break;
        case State.BeforeOpenTag:
          parseBeforeOpenTag();
          break;
        case State.OpeningTag:
          parseOpeningTag();
          break;
        case State.AfterOpenTag:
          parseAfterOpenTag();
          break;
        case State.InValueNq:
          parseInValueNq();
          break;
        case State.InValueSq:
          parseInValueSq();
          break;
        case State.InValueDq:
          parseInValueDq();
          break;
        case State.ClosingOpenTag:
          parseClosingOpenTag();
          break;
        case State.OpeningSpecial:
          parseOpeningSpecial();
          break;
        case State.OpeningDoctype:
          parseOpeningDoctype();
          break;
        case State.OpeningNormalComment:
          parseOpeningNormalComment();
          break;
        case State.InNormalComment:
          parseNormalComment();
          break;
        case State.InShortComment:
          parseShortComment();
          break;
        case State.ClosingNormalComment:
          parseClosingNormalComment();
          break;
        case State.ClosingTag:
          parseClosingTag();
          break;
        default:
          unexpected();
          break;
      }
      index++;
    }
    // End of input: flush the section that is still open.
    switch (state) {
      case State.Literal:
      case State.BeforeOpenTag:
      case State.InValueNq:
      case State.InValueSq:
      case State.InValueDq:
      case State.ClosingOpenTag:
      case State.InNormalComment:
      case State.InShortComment:
      case State.ClosingNormalComment:
        emitToken(TokenKind.Literal);
        break;
      case State.OpeningTag:
        emitToken(TokenKind.OpenTag);
        break;
      case State.AfterOpenTag:
        break;
      case State.OpeningSpecial:
        emitToken(TokenKind.OpenTag, State.InShortComment);
        break;
      case State.OpeningDoctype:
        if (index - sectionStart === doctype.length) {
          emitToken(TokenKind.OpenTag);
        } else {
          // Only the leading character counts as the tag; the rest is plain text.
          emitToken(TokenKind.OpenTag, void 0, sectionStart + 1);
          emitToken(TokenKind.Literal);
        }
        break;
      case State.OpeningNormalComment:
        if (index - sectionStart === 2) {
          emitToken(TokenKind.OpenTag);
        } else {
          // Only the leading character counts as the tag; the rest is plain text.
          emitToken(TokenKind.OpenTag, void 0, sectionStart + 1);
          emitToken(TokenKind.Literal);
        }
        break;
      case State.ClosingTag:
        emitToken(TokenKind.CloseTag);
        break;
      default:
        break;
    }
    return tokens;
  } finally {
    // Drop the references to the input and the token list even when a handler throws.
    init('');
  }
}

function emitToken(kind: TokenKind, newState = state, end = index) {
  let value = buffer.substring(sectionStart, end);
  if (kind === TokenKind.OpenTag || kind === TokenKind.CloseTag) {
    value = value.toLowerCase();
  }
  if (kind === TokenKind.OpenTag) {
    if (value === 'script') {
      inScript = true;
    } else if (value === 'style') {
235 | inStyle = true; 236 | } 237 | } 238 | if (kind === TokenKind.CloseTag) { 239 | inScript = inStyle = false; 240 | } 241 | if (!((kind === TokenKind.Literal || kind === TokenKind.Whitespace) && end === sectionStart)) { 242 | // empty literal should be ignored 243 | tokens.push({ type: kind, start: sectionStart, end, value }); 244 | } 245 | if (kind === TokenKind.OpenTagEnd || kind === TokenKind.CloseTag) { 246 | sectionStart = end + 1; 247 | state = State.Literal; 248 | } else { 249 | sectionStart = end; 250 | state = newState; 251 | } 252 | } 253 | 254 | function parseLiteral() { 255 | if (char === Chars.Lt) { 256 | // < 257 | emitToken(TokenKind.Literal, State.BeforeOpenTag); 258 | } 259 | } 260 | 261 | function parseBeforeOpenTag() { 262 | if (inScript || inStyle) { 263 | if (char === Chars.Sl) { 264 | state = State.ClosingTag; 265 | sectionStart = index + 1; 266 | } else { 267 | state = State.Literal; 268 | } 269 | return; 270 | } 271 | if ((char >= Chars.La && char <= Chars.Lz) || (char >= Chars.Ua && char <= Chars.Uz)) { 272 | // 293 | // any other chars covert to normal state 294 | state = State.Literal; 295 | } 296 | } 297 | 298 | function parseOpeningTag() { 299 | if (isWhiteSpace()) { 300 | //
304 | emitToken(TokenKind.OpenTag); 305 | emitToken(TokenKind.OpenTagEnd); 306 | } else if (char === Chars.Sl) { 307 | //
315 | emitToken(TokenKind.Whitespace); 316 | emitToken(TokenKind.OpenTagEnd); 317 | } else if (char === Chars.Sl) { 318 | //
339 | emitToken(TokenKind.AttrValueNq); 340 | emitToken(TokenKind.OpenTagEnd); 341 | } else if (char === Chars.Sl) { 342 | //
371 | emitToken(TokenKind.OpenTagEnd); 372 | } else { 373 | //
374 | emitToken(TokenKind.AttrValueNq, State.AfterOpenTag); 375 | parseAfterOpenTag(); 376 | } 377 | } 378 | 379 | function parseOpeningSpecial() { 380 | switch (char) { 381 | case Chars.Cl: // 405 | emitToken(TokenKind.OpenTag, void 0, sectionStart + 1); 406 | emitToken(TokenKind.Literal); 407 | emitToken(TokenKind.OpenTagEnd); 408 | } else if (doctype.lower[offset] !== char && doctype.upper[offset] !== char) { 409 | // 433 | emitToken(TokenKind.Literal); 434 | emitToken(TokenKind.OpenTagEnd); 435 | } 436 | } 437 | 438 | function parseClosingNormalComment() { 439 | offset = index - sectionStart; 440 | if (offset === 2) { 441 | if (char === Chars.Gt) { 442 | // 443 | emitToken(TokenKind.OpenTagEnd); 444 | } else if (char === Chars.Cl) { 445 | //