├── demo ├── test40.html ├── test27.html ├── test39.html ├── demo.html ├── test14.html ├── test1.html ├── test17.html ├── test5.html ├── demo2.html ├── demo1.html ├── test33.html ├── demo4.html ├── demo5.html ├── test7.html ├── test.html ├── demo copy.html ├── test21.html ├── test2.html ├── test30.html ├── test32.html ├── test38.html ├── demo3.html ├── test6.html ├── test9.html ├── test20.html ├── test22.html ├── test24.html ├── test10.html ├── test11.html ├── test8.html ├── test13.html ├── test36.html ├── test4.html ├── test35.html ├── test28.html ├── test34.html ├── test29.html ├── test37.html ├── test18.html ├── test16.html ├── test25.html ├── test31.html ├── test19.html ├── test12.html ├── test23.html └── test26.html ├── src ├── index.ts ├── definition.ts ├── parser │ ├── Comment.ts │ ├── Directive.ts │ ├── tagClose.ts │ ├── DTD.ts │ ├── Html.ts │ └── parseText.ts └── parser.ts ├── .vscode ├── settings.json └── launch.json ├── dist ├── definition.js ├── index.js ├── parser │ ├── Comment.js │ ├── tagClose.js │ ├── DTD.js │ ├── Directive.js │ ├── Html.js │ └── parseText.js ├── parser.js └── lexer.js ├── .gitignore ├── .github └── workflows │ └── tests.yml ├── mycheck ├── checkfile.js ├── check-dist.js └── check.ts ├── README_HTML.md ├── package.json ├── LICENSE ├── compare.js ├── test-server.js ├── server.js ├── script └── addSuffixJs.js ├── test ├── test25.spec.ts ├── test26.spec.ts ├── test24.spec.ts └── testall.spec.ts ├── tsconfig.json ├── tsconfig-esmodule.json └── README.md /demo/test40.html: -------------------------------------------------------------------------------- 1 |
1
2
-------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | export { parse as paser } from "./parser" 2 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "typescript.tsdk": "node_modules\\typescript\\lib" 3 | } -------------------------------------------------------------------------------- /dist/definition.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | -------------------------------------------------------------------------------- /src/definition.ts: -------------------------------------------------------------------------------- 1 | 2 | export interface Keywords { 3 | [index: string]: any 4 | } 5 | 6 | export interface TokenNameMap { 7 | [index: number]: any 8 | } 9 | -------------------------------------------------------------------------------- /demo/test27.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 |

11{{res.value}}

6 |
7 |
8 | github 9 | 10 | -------------------------------------------------------------------------------- /dist/index.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.paser = void 0; 4 | var parser_1 = require("./parser"); 5 | Object.defineProperty(exports, "paser", { enumerable: true, get: function () { return parser_1.parse; } }); 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules/ 2 | /src/*.js.map 3 | /dist/*.js.map 4 | /dist/examples/*/*.js.map 5 | /.cache/ 6 | /dist/src/*.js.map 7 | /dist/vm/*.js.map 8 | /dist/test/*.js.map 9 | /dist/src/*/*.js.map 10 | /dist/vm/*/*.js.map 11 | /out/*.json 12 | /files/* 13 | /test/testfile.js 14 | /app/* 15 | /dist-esmodule/* 16 | /copy/* 17 | /matchtest/* 18 | -------------------------------------------------------------------------------- /demo/test39.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | click here 11 | 12 | -------------------------------------------------------------------------------- /demo/demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |
hello
11 | 12 | -------------------------------------------------------------------------------- /demo/test14.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 哈哈哈哈 12 | 13 | 14 | -------------------------------------------------------------------------------- /demo/test1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |
haha<
11 | 12 | 13 | -------------------------------------------------------------------------------- /demo/test17.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /demo/test5.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |
11 |
12 | 哈哈哈哈 13 |
14 |
15 | 16 | -------------------------------------------------------------------------------- /demo/demo2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 哈哈 12 | 13 | 14 | -------------------------------------------------------------------------------- /demo/demo1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |
11 |

11{{res.value}}

12 |
13 | 14 | 15 | -------------------------------------------------------------------------------- /demo/test33.html: -------------------------------------------------------------------------------- 1 | 3 | 5 | 6 | 7 | West Africa Regional Leaders to Meet 8 | 9 | 10 | 11 |
123
12 | 13 | -------------------------------------------------------------------------------- /demo/demo4.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 15 | 16 | -------------------------------------------------------------------------------- /demo/demo5.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /demo/test7.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 13 | 16 |
17 | 18 |
19 | 20 | -------------------------------------------------------------------------------- /demo/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |
11 |
12 |

11{{res.value}}

13 |
14 | 15 |
16 | 17 | 18 | -------------------------------------------------------------------------------- /demo/demo copy.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Document 9 | 10 | 11 |
12 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /demo/test21.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /demo/test2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |
11 |
12 |

11{{res.value}}

13 |
14 | 15 |
16 | 17 | 18 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | 9 | strategy: 10 | matrix: 11 | node-version: [10.x, 12.x] 12 | 13 | steps: 14 | 15 | - uses: actions/checkout@v2 16 | 17 | - name: Use Node.js ${{ matrix.node-version }} 18 | uses: actions/setup-node@v1 19 | with: 20 | node-version: ${{ matrix.node-version }} 21 | 22 | - run: npm install 23 | 24 | - run: npm run test-all -------------------------------------------------------------------------------- /demo/test30.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 14 | 15 | -------------------------------------------------------------------------------- /demo/test32.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Document 11 | 12 | 13 | 14 |
top
15 | 16 |
test32
17 | 18 | 456 19 | 789 20 |
21 | 123 22 | 23 | 24 |
out
25 | 26 | 27 |
28 | -------------------------------------------------------------------------------- /demo/test38.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |
  • 11 | 13 | 14 |
  • 15 | 16 | -------------------------------------------------------------------------------- /demo/demo3.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 16 | 17 | -------------------------------------------------------------------------------- /demo/test6.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |

    11 | 本部分 12 | APIs 13 |  ,请参考 14 | Web APIs 15 |  以及 16 | DOM 17 | 。 18 |

    19 | 20 | -------------------------------------------------------------------------------- /mycheck/checkfile.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const path = require('path') 3 | const paser = require('../dist/index').paser 4 | 5 | const files = fs.readdirSync(path.resolve(__dirname, "../files")) 6 | 7 | for (let i = 10; i < 20; i++) { 8 | console.log("-----------------------------------------------------") 9 | console.log(files[i], ` ${i}`) 10 | let code = fs.readFileSync(path.resolve(__dirname, `../files/${ 11 | files[i] 12 | }`), {encoding: 'utf-8'}) 13 | 14 | console.time("test") 15 | let ast = paser(code) 16 | console.timeEnd("test") 17 | 18 | } 19 | -------------------------------------------------------------------------------- /demo/test9.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 12 | 13 | 一行代码卖出570美元, 天价代码的内幕 14 | 15 | 16 | -------------------------------------------------------------------------------- /demo/test20.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // 使用 IntelliSense 了解相关属性。 3 | // 悬停以查看现有属性的描述。 4 | // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [{ 7 | "name": "ts-node", 8 | "type": "pwa-node", 9 | "request": "launch", 10 | "args": [ 11 | "${relativeFile}" // 入口文件 12 | ], 13 | "runtimeArgs": [ 14 | "--nolazy", 15 | "-r", 16 | "ts-node/register" 17 | ], 18 | "sourceMaps": true, 19 | "cwd": "${workspaceRoot}", 20 | "protocol": "inspector", 21 | // "console": "integratedTerminal", 22 | 
"internalConsoleOptions": "openOnSessionStart" 23 | }] 24 | } -------------------------------------------------------------------------------- /demo/test22.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 12 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /demo/test24.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 13 | 14 | 第一百七十九章 顶级秘法·28分钟前 17 |

    18 | 19 | -------------------------------------------------------------------------------- /demo/test10.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 一行代码卖出570美元, 天价代码的内幕 19 | 20 | 21 | -------------------------------------------------------------------------------- /demo/test11.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |
    11 | 13 |

    14 | 李宁,东北大学计算机专业硕士,超过20年软件开发和培训经验,UnityMarvel创始人,CSDN学院高级讲师,企业内训讲师,IT畅销书作者。曾出版超过30本IT畅销书,培训过数以千计的企业学员,制作做数千小时的视频课程。代表作包括《Python从菜鸟到高手》、《Python爬虫技术:深入理解原理... 15 |

    16 |

    17 |
    18 |
    19 | 20 | -------------------------------------------------------------------------------- /demo/test8.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | === 和非严格的比较操作符 ==,以及 Object.is() 12 | 方法。 13 |

    14 | JavaScript ( 函数优先的轻量级非浏览器环境 17 |

    18 | 19 | -------------------------------------------------------------------------------- /demo/test13.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |
    11 |
    12 |
    13 |
    14 | 15 | 哈哈哈哈 16 | 17 |
    18 | 19 |
    20 |
    21 | 22 |
    23 |
    24 |
    25 |
    26 | 27 |
    28 |
    29 | 32 | 35 |
    import { Lexer } from "../lexer";

    /**
     * Consume the body of an HTML comment from the lexer and return it as a
     * comment node.
     *
     * Characters are taken from the front of `lexer.sourceCode` until the
     * `-->` terminator is the next thing in the input. Every line break that
     * is consumed (a two-character `\r\n` / `\n\r` pair, or any single
     * character accepted by `lexer.isNewLine`) bumps `lexer.lineNum` and is
     * kept verbatim in the returned content.
     *
     * NOTE(review): presumably the caller has already consumed the opening
     * `<!--` before calling this - confirm against the call site.
     *
     * @param lexer Lexer whose `sourceCode`, `lineNum` and `hasCache` fields
     *              are read and updated in place.
     * @returns `{ type: "comment", LineNum, content }`, where `LineNum` is
     *          `lexer.GetLineNum()` evaluated after the comment was consumed
     *          and `content` is the raw text that preceded `-->`.
     */
    export function paseComment(lexer: Lexer) {
        const TERMINATOR = "-->"
        let content = ""

        // Also stop when the input runs out. Without the length guard an
        // unterminated comment never exits this loop: an empty string can
        // never start with "-->", and `sourceCode[0]` is `undefined`.
        while (lexer.sourceCode.length > 0 && lexer.sourceCode.slice(0, 3) !== TERMINATOR) {
            // Number of characters to move into `content` on this step.
            let width = 1
            if (lexer.nextSourceCodeIs("\r\n") || lexer.nextSourceCodeIs("\n\r")) {
                // A two-character line break counts as ONE new line.
                width = 2
                lexer.lineNum += 1
            } else if (lexer.isNewLine(lexer.sourceCode[0])) {
                lexer.lineNum += 1
            }
            content += lexer.sourceCode.slice(0, width)
            lexer.skipSourceCode(width)
        }

        // Only step over the terminator when it is really there; at end of
        // input there is nothing left to skip.
        if (lexer.sourceCode.slice(0, 3) === TERMINATOR) {
            lexer.skipSourceCode(3)
        }

        // Drop any cached look-ahead token: the raw source was advanced by hand.
        lexer.hasCache = false
        return {
            type: "comment",
            LineNum: lexer.GetLineNum(),
            content,
        }
    }
    11 |

    12 | JavaScript ( 函数优先的轻量级非浏览器环境 16 |

    17 |
    18 |
    19 |
    20 | 哈哈哈哈 21 |
    22 |
    23 | 27 | 28 | -------------------------------------------------------------------------------- /README_HTML.md: -------------------------------------------------------------------------------- 1 | ``` 2 | tag-open ::= '<' tag-name ws* attr-list? ws* '>' 3 | tag-empty ::= '<' tag-name ws* attr-list? ws* '/>' 4 | tag-close ::= '</' tag-name ws* '>' 5 | 6 | 7 | attr-list ::= (ws+ attr)* 8 | attr ::= attr-empty | attr-unquoted | attr-single-quoted | attr-double-quoted 9 | 10 | attr-empty ::= attr-name 11 | attr-unquoted ::= attr-name ws* '=' ws* attr-unquoted-value 12 | attr-single-quoted ::= attr-name ws* "=" ws* "'" attr-single-quoted-value "'" 13 | attr-double-quoted ::= attr-name ws* "=" ws* '"' attr-double-quoted-value '"' 14 | 15 | tag-name ::= alphabets (alphabets | digits)* // a digit cannot be the first character 16 | attr-name ::= [^\s"'>/=[#x0000-#x001f]+ // [^\s"'>/=[\u0000-\u001f]+ 17 | 18 | // These three items should not contain 'ambiguous ampersand'... 19 | attr-unquoted-value ::= [^\s"'=<>`]+ 20 | attr-single-quoted-value ::= [^']* 21 | attr-double-quoted-value ::= [^"]* 22 | 23 | alphabets ::= [a-zA-Z] 24 | digits ::= [0-9] 25 | ws ::= #x9 | #xA | #xD | #x20 26 | 27 | ``` 28 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@liulinboyi/htmlparser", 3 | "main": "./dist/index.js", 4 | "files": [ 5 | "dist" 6 | ], 7 | "devDependencies": { 8 | "@playwright/test": "^1.13.1", 9 | "@types/node": "^14.14.20", 10 | "cross-env": "^7.0.3", 11 | "playwright": "^1.13.1", 12 | "prettier": "2.2.1", 13 | "pretty-quick": "3.1.0", 14 | "request": "^2.88.2", 15 | "request-promise": "^4.2.6", 16 | "ts-lint": "4.5.1", 17 | "ts-loader": "8.0.15", 18 | "ts-node": "^9.1.1", 19 | "tslint": "5.20.1", 20 | "tslint-config-prettier": "1.18.0", 21 | "typescript": "^4.1.3" 22 | }, 23 | "scripts": { 24 | "test-ts": "ts-node ./mycheck/check.ts", 25 |
"build": "rm -rf ./dist/ && tsc --project ./tsconfig.json", 26 | "build-es": "rm -rf ./dist-esmodule/ && tsc --project ./tsconfig-esmodule.json && node ./script/addSuffixJs.js", 27 | "build-all": "npm run build && npm run build-es", 28 | "test": "playwright test testall.spec.ts", 29 | "test-all": "node test-server.js" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /demo/test35.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 14 | 15 |
    16 |

    More In News

    17 | 20 | 21 | 29 | 30 |
    31 | 32 | 33 | -------------------------------------------------------------------------------- /dist/parser/Comment.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.paseComment = void 0; 4 | function paseComment(lexer) { 5 | let content = ""; 6 | while (lexer.sourceCode.slice(0, 3) !== "-->") { 7 | if (lexer.nextSourceCodeIs("\r\n") || lexer.nextSourceCodeIs("\n\r")) { 8 | lexer.lineNum += 1; 9 | content += lexer.sourceCode.slice(0, 2); 10 | lexer.skipSourceCode(2); 11 | continue; 12 | } 13 | else { 14 | if (lexer.isNewLine(lexer.sourceCode[0])) { 15 | lexer.lineNum += 1; 16 | content += lexer.sourceCode.slice(0, 1); 17 | lexer.skipSourceCode(1); 18 | continue; 19 | } 20 | } 21 | content += lexer.sourceCode[0]; 22 | lexer.skipSourceCode(1); 23 | } 24 | lexer.skipSourceCode(3); 25 | lexer.hasCache = false; 26 | return { 27 | type: "comment", 28 | LineNum: lexer.GetLineNum(), 29 | content, 30 | }; 31 | } 32 | exports.paseComment = paseComment; 33 | -------------------------------------------------------------------------------- /src/parser/Directive.ts: -------------------------------------------------------------------------------- 1 | import { Lexer } from "../lexer"; 2 | 3 | export function paseDirective(lexer: Lexer) { 4 | let content = "" 5 | while (lexer.sourceCode.slice(0, 1) !== "]") { 6 | if (lexer.nextSourceCodeIs("\r\n") || lexer.nextSourceCodeIs("\n\r")) { 7 | lexer.lineNum += 1 8 | content += lexer.sourceCode.slice(0, 2) 9 | lexer.skipSourceCode(2) 10 | continue 11 | } else { 12 | if (lexer.isNewLine(lexer.sourceCode[0])) { 13 | lexer.lineNum += 1 14 | content += lexer.sourceCode.slice(0, 1) 15 | lexer.skipSourceCode(1) 16 | continue 17 | } 18 | } 19 | content += lexer.sourceCode[0] 20 | lexer.skipSourceCode(1) 21 | } 22 | content += lexer.sourceCode[0] // ]加入content 23 | lexer.skipSourceCode(1) // ] 24 | 
lexer.isIgnored() // 空格 25 | lexer.skipSourceCode(1) // > 26 | lexer.hasCache = false 27 | return { 28 | type: "comment", // 在浏览器中解析成comment了 29 | LineNum: lexer.GetLineNum(), 30 | content, 31 | } 32 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 liulinboy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /dist/parser/tagClose.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.parseClose = exports.isSpecialTag = exports.Node = void 0; 4 | const lexer_1 = require("../lexer"); 5 | const Html_1 = require("./Html"); 6 | class Node { 7 | constructor() { 8 | this.children = []; 9 | this.attr = []; 10 | } 11 | } 12 | exports.Node = Node; 13 | function isSpecialTag(node) { 14 | let tags = [ 15 | "img", 16 | "source", 17 | "link", 18 | "meta", 19 | "area", 20 | "input", 21 | "br" 22 | ]; 23 | return tags.includes(node.tag); 24 | } 25 | exports.isSpecialTag = isSpecialTag; 26 | function parseClose(lexer) { 27 | lexer.hasCache = false; 28 | let node = new Node(); 29 | node.closeTag = true; 30 | node.LineNum = lexer.GetLineNum(); 31 | node.type = "tag"; 32 | node.tag = Html_1.parseTag(lexer); 33 | lexer.NextTokenIs(lexer_1.TOKEN_RIGHT_PAREN); // > 34 | // lexer.isIgnored() 35 | if (isSpecialTag(node)) { 36 | return null; 37 | } 38 | return node; 39 | } 40 | exports.parseClose = parseClose; 41 | -------------------------------------------------------------------------------- /src/parser/tagClose.ts: -------------------------------------------------------------------------------- 1 | import { Lexer, TOKEN_RIGHT_PAREN } from "../lexer"; 2 | import { parseTag } from "./Html"; 3 | 4 | export interface Node { 5 | LineNum?: number, 6 | children?: Array, 7 | attr: Array, 8 | type?: string, 9 | tag?: string, 10 | selfClose?: boolean 11 | closeTag?: boolean 12 | } 13 | 14 | export class Node { 15 | constructor() { 16 | this.children = [] 17 | this.attr = [] 18 | } 19 | } 20 | 21 | export function isSpecialTag(node: any) { 22 | let tags = [ 23 | "img", 24 | "source", 25 | "link", 26 | "meta", 27 | "area", 28 | "input", 29 | "br" 30 | ] 31 | return tags.includes(node.tag) 32 | 
} 33 | 34 | export function parseClose(lexer: Lexer) { 35 | lexer.hasCache = false 36 | let node = new Node() 37 | node.closeTag = true 38 | node.LineNum = lexer.GetLineNum() 39 | node.type = "tag" 40 | node.tag = parseTag(lexer) 41 | lexer.NextTokenIs(TOKEN_RIGHT_PAREN) // > 42 | // lexer.isIgnored() 43 | if (isSpecialTag(node)) { 44 | return null 45 | } 46 | return node 47 | } -------------------------------------------------------------------------------- /src/parser/DTD.ts: -------------------------------------------------------------------------------- 1 | import { Lexer } from "../lexer"; 2 | 3 | interface DTD { 4 | content?: string, 5 | type: string, 6 | LineNum: number 7 | } 8 | 9 | class DTD { 10 | constructor() { 11 | this.type = "DTD" 12 | } 13 | } 14 | 15 | export function parseDtd(lexer: Lexer) { 16 | let dtd = new DTD() 17 | let content = "" 18 | while (lexer.sourceCode[0] !== ">") { 19 | if (lexer.nextSourceCodeIs("\r\n") || lexer.nextSourceCodeIs("\n\r")) { 20 | lexer.lineNum += 1 21 | content += lexer.sourceCode.slice(0, 2) 22 | lexer.skipSourceCode(2) 23 | continue 24 | } else { 25 | if (lexer.isNewLine(lexer.sourceCode[0])) { 26 | lexer.lineNum += 1 27 | content += lexer.sourceCode.slice(0, 1) 28 | lexer.skipSourceCode(1) 29 | continue 30 | } 31 | } 32 | content += lexer.sourceCode[0] 33 | lexer.skipSourceCode(1) 34 | } 35 | lexer.skipSourceCode(1) 36 | lexer.isIgnored() 37 | lexer.hasCache = false 38 | dtd.content = content 39 | dtd.LineNum = lexer.GetLineNum() 40 | return dtd 41 | } -------------------------------------------------------------------------------- /dist/parser/DTD.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.parseDtd = void 0; 4 | class DTD { 5 | constructor() { 6 | this.type = "DTD"; 7 | } 8 | } 9 | function parseDtd(lexer) { 10 | let dtd = new DTD(); 11 | let content = ""; 12 | while 
(lexer.sourceCode[0] !== ">") { 13 | if (lexer.nextSourceCodeIs("\r\n") || lexer.nextSourceCodeIs("\n\r")) { 14 | lexer.lineNum += 1; 15 | content += lexer.sourceCode.slice(0, 2); 16 | lexer.skipSourceCode(2); 17 | continue; 18 | } 19 | else { 20 | if (lexer.isNewLine(lexer.sourceCode[0])) { 21 | lexer.lineNum += 1; 22 | content += lexer.sourceCode.slice(0, 1); 23 | lexer.skipSourceCode(1); 24 | continue; 25 | } 26 | } 27 | content += lexer.sourceCode[0]; 28 | lexer.skipSourceCode(1); 29 | } 30 | lexer.skipSourceCode(1); 31 | lexer.isIgnored(); 32 | lexer.hasCache = false; 33 | dtd.content = content; 34 | dtd.LineNum = lexer.GetLineNum(); 35 | return dtd; 36 | } 37 | exports.parseDtd = parseDtd; 38 | -------------------------------------------------------------------------------- /dist/parser/Directive.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.paseDirective = void 0; 4 | function paseDirective(lexer) { 5 | let content = ""; 6 | while (lexer.sourceCode.slice(0, 1) !== "]") { 7 | if (lexer.nextSourceCodeIs("\r\n") || lexer.nextSourceCodeIs("\n\r")) { 8 | lexer.lineNum += 1; 9 | content += lexer.sourceCode.slice(0, 2); 10 | lexer.skipSourceCode(2); 11 | continue; 12 | } 13 | else { 14 | if (lexer.isNewLine(lexer.sourceCode[0])) { 15 | lexer.lineNum += 1; 16 | content += lexer.sourceCode.slice(0, 1); 17 | lexer.skipSourceCode(1); 18 | continue; 19 | } 20 | } 21 | content += lexer.sourceCode[0]; 22 | lexer.skipSourceCode(1); 23 | } 24 | content += lexer.sourceCode[0]; // ]加入content 25 | lexer.skipSourceCode(1); // ] 26 | lexer.isIgnored(); // 空格 27 | lexer.skipSourceCode(1); // > 28 | lexer.hasCache = false; 29 | return { 30 | type: "comment", 31 | LineNum: lexer.GetLineNum(), 32 | content, 33 | }; 34 | } 35 | exports.paseDirective = paseDirective; 36 | -------------------------------------------------------------------------------- 
/demo/test28.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |
    11 | 12 | 13 | 14 | 订阅 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 40 | 41 | -------------------------------------------------------------------------------- /demo/test34.html: -------------------------------------------------------------------------------- 1 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 29 |
    30 |
    31 |
    32 |
    33 | 34 | -------------------------------------------------------------------------------- /compare.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const path = require('path') 3 | let parser = fs.readFileSync(path.resolve(__dirname, `./out/$parser.ast.json`), {encoding: 'utf-8'}) 4 | let browser = fs.readFileSync(path.resolve(__dirname, `./out/$browser.ast.json`), {encoding: 'utf-8'}) 5 | parser = JSON.parse(parser) 6 | browser = JSON.parse(browser) 7 | console.log(parser, browser) 8 | 9 | let count = browser.length > parser.length ? browser.length : parser.length; 10 | for (let i = 0; i < count; i++) { 11 | let a = browser[i] 12 | let b = parser[i] 13 | if (a === undefined) { 14 | debugger 15 | } 16 | if (a.tag.toLowerCase() !== b.tag) { 17 | let resta = [] 18 | let restb = [] 19 | let start = i - 10 < 0 ? 0 : i - 10 20 | for (let s = start; s < i; s++) { 21 | resta.push(browser[s]) 22 | restb.push(parser[s]) 23 | } 24 | resta.push(browser[i]) 25 | restb.push(parser[i]) 26 | for (let s = i; s < i + 10; s++) { 27 | resta.push(browser[s]) 28 | restb.push(parser[s]) 29 | } 30 | debugger 31 | 32 | } 33 | // console.log(browser[i] ? browser[i].tag : "undefined", parser[i] ? parser[i].tag : "undefined") 34 | // expect(browser[i].tag.toLowerCase()).toBe(parser[i].tag) 35 | // console.assert(browser[i].tag.toLowerCase() === parser[i].tag, `${browser[i] ? browser[i].tag : "undefined"}, ${parser[i] ? 
parser[i].tag : "undefined"}`) 36 | } 37 | -------------------------------------------------------------------------------- /test-server.js: -------------------------------------------------------------------------------- 1 | let http = require('http'); 2 | let url = require('url'); 3 | let util = require('util'); 4 | let fs = require('fs'); 5 | let path = require("path") 6 | const {spawn} = require('child_process'); 7 | const process = require("process"); 8 | 9 | const pour = (cmd, args, opts = { 10 | encoding: 'utf8' 11 | }, stdout = process.stdout, stderr = process.stderr) => { 12 | return new Promise((resolve, reject) => { 13 | const p = spawn(cmd, args, opts); 14 | p.stdout.setEncoding('utf-8'); 15 | p.stdout.on('data', data => { 16 | stdout.write(data, "utf8"); 17 | }); 18 | p.stderr.on('data', data => { 19 | stderr.write(data); 20 | }); 21 | p.on('close', code => { 22 | resolve(code); 23 | }); 24 | }); 25 | } 26 | 27 | async function exec(shell, args, opt) { 28 | console.log(`${shell} ${ 29 | args.join(" ") 30 | }`) 31 | await pour(shell, args, opt); 32 | } 33 | 34 | let server = http.createServer((req, res) => { 35 | var pathname = url.parse(req.url).pathname; // 获取url的pathname (/index.html) 36 | // console.log("file:" + pathname.substring(1)) // 将‘/’去掉 37 | // console.log(__dirname, __filename) 38 | let curPath = path.join(__dirname, pathname) 39 | // console.log(curPath) 40 | fs.readFile(curPath, function (err, data) { // fs模块加载文件 41 | if (err) { 42 | res.writeHead(404, {'Content-Type': 'text/html'}); 43 | } else { 44 | res.writeHead(200, {'Content-Type': 'text/html'}); 45 | res.write(data.toString()); 46 | } 47 | res.end(); 48 | }); 49 | 50 | }); 51 | 52 | server.listen(3000, '127.0.0.1', async () => { 53 | console.log("服务器已经运行,请打开浏览,输入:http://127.0.0.1:3000/ 来进行访问.") 54 | await exec(process.platform === 'win32' ? 
'npm.cmd' : "npm", ["run", "test"]) 55 | process.exit(0); 56 | }); 57 | -------------------------------------------------------------------------------- /mycheck/check-dist.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const path = require('path') 3 | const paser = require('../dist/index').paser 4 | 5 | let paths = [ 6 | "demo.html", 7 | "demo1.html", 8 | "demo2.html", 9 | "demo3.html", 10 | "demo4.html", 11 | "demo5.html", 12 | "test4.html", 13 | "test5.html", 14 | "test6.html", 15 | "test7.html", 16 | "test8.html", 17 | "MDN_HTML.html", 18 | "MDN_JavaScript.html", 19 | "CSDN.html", 20 | "CSDN_SPM.html", // error 21 | "test9.html", 22 | "test10.html", 23 | "test11.html", 24 | "test12.html", 25 | "test13.html", 26 | "test14.html", 27 | "test15.html", 28 | "test16.html", 29 | "test17.html", 30 | "test18.html", 31 | "test19.html", 32 | "test20.html", 33 | "test21.html", 34 | "test22.html", 35 | "test23.html", 36 | "google.html", 37 | "005055fd7e2625aba5e8d2d370ea4914a152fe50d16620f896cdf4b1a68ba741.html", 38 | "005055fd7e2625aba5e8d2d370ea4914a152fe50d16620f896cdf4b1a68ba741-origin.html", 39 | "039c4b966d1f2a0c589ac0aad211fe65500ad1cb58c7f45b34251db7056803ec-origin.html", 40 | "0475e5eeadaaca857eea3f36d0eda01937fe672d48be7f98ba6bc7f25ecd63d0.html", 41 | "078cdb456d1beb698aeed86e0f2161e442e9431c4580295f1ba4ece22741068c.html", 42 | "0e55dcdbeb54c88ee87942b9fef7ea5398fa9a1e83493d55844b479506a80fd8.html", 43 | "qidian.html", 44 | "test24.html", 45 | "test25.html", 46 | "test26.html", 47 | ] 48 | 49 | for (let p of paths) { 50 | let code = fs.readFileSync(path.resolve(__dirname, `../demo/${p}`), {encoding: 'utf-8'}) 51 | // console.log(code, 'code') 52 | if (code.length > 0) { 53 | console.time("test") 54 | let ast = paser(code) 55 | console.timeEnd("test") 56 | // console.log(__dirname, __filename) 57 | console.log(path.resolve(__dirname, "../out/", `./${p}.ast.json`)) 58 | 
fs.writeFileSync(path.resolve(__dirname, "../out/", `./${p}.ast.json`), JSON.stringify(ast, null, 4)) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /demo/test29.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 第一百七十九章 顶级秘法·28分钟前 14 |

    15 |
    16 | 17 | 第一百七十九章 顶级秘法·28分钟前 20 |

    21 | 22 | 123 23 | 第一百七十九章 顶级秘法·28分钟前 26 |

    27 |
    28 | 第一百七十九章 顶级秘法·28分钟前 31 |

    32 |
    33 | 第一百七十九章 顶级秘法·28分钟前 36 | 37 | 38 | -------------------------------------------------------------------------------- /demo/test37.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 13 |
    14 |
    xufive
    15 |
    16 |

    17 | 天元浪子

    18 |
    20 | 23 | 24 | CSDN博客技术专家 25 | 38 | 39 |

    总经理

    40 |

    153篇

    41 |
    42 |
    43 |

    44 | 生于1968年,程序员,使用python超过10年。长期从事数据处理工作,先后参与过风云系列卫星、碳卫星、海洋卫星、嫦娥探测器等卫星数据处理。

    45 |

    46 | 47 | 48 |
    567
    49 |
    50 | 51 | -------------------------------------------------------------------------------- /server.js: -------------------------------------------------------------------------------- 1 | let http = require('http'); 2 | let url = require('url'); 3 | let util = require('util'); 4 | let fs = require('fs'); 5 | let path = require("path") 6 | const {spawn} = require('child_process'); 7 | const process = require("process"); 8 | const { URL, URLSearchParams } = require('url'); 9 | 10 | const pour = (cmd, args, opts = { 11 | encoding: 'utf8' 12 | }, stdout = process.stdout, stderr = process.stderr) => { 13 | return new Promise((resolve, reject) => { 14 | const p = spawn(cmd, args, opts); 15 | p.stdout.setEncoding('utf-8'); 16 | p.stdout.on('data', data => { 17 | stdout.write(data, "utf8"); 18 | }); 19 | p.stderr.on('data', data => { 20 | stderr.write(data); 21 | }); 22 | p.on('close', code => { 23 | resolve(code); 24 | }); 25 | }); 26 | } 27 | 28 | async function exec(shell, args, opt) { 29 | console.log(`${shell} ${ 30 | args.join(" ") 31 | }`) 32 | await pour(shell, args, opt); 33 | } 34 | 35 | let server = http.createServer((req, res) => { 36 | if (path.normalize(decodeURIComponent(req.url)) !== decodeURIComponent(req.url)) { 37 | res.statusCode = 403; 38 | res.end(); 39 | return; 40 | } 41 | const u = decodeURIComponent(req.url) 42 | console.log("decodeURIComponent",u) 43 | // var pathname = url.parse(u).pathname; // 获取url的pathname (/index.html) 44 | console.log("file:" + u.substring(1)) // 将‘/’去掉 45 | // console.log(__dirname, __filename) 46 | let curPath = path.join(__dirname, u.substring(1)) 47 | console.log(curPath) 48 | fs.readFile(curPath, function (err, data) { // fs模块加载文件 49 | if (err) { 50 | res.writeHead(404, {'Content-Type': 'text/html'}); 51 | } else { 52 | res.writeHead(200, {'Content-Type': 'text/html'}); 53 | res.write(data.toString()); 54 | } 55 | res.end(); 56 | }); 57 | 58 | }); 59 | 60 | server.listen(3000, '127.0.0.1', async () => { 61 | 
console.log("服务器已经运行,请打开浏览,输入:http://127.0.0.1:3000/ 来进行访问.") 62 | // await exec(process.platform === 'win32' ? 'npm.cmd' : "npm", ["run", "test"]) 63 | // process.exit(0); 64 | }); 65 | -------------------------------------------------------------------------------- /script/addSuffixJs.js: -------------------------------------------------------------------------------- 1 | const fs = require("fs").promises 2 | const path = require("path") 3 | 4 | void async function () { 5 | try { 6 | let p = path.resolve(__dirname, "../dist-esmodule") 7 | let paths = await fs.readdir(p) 8 | // console.log(paths) 9 | let stack = [... paths] 10 | while (stack.length) { 11 | let top = stack.pop() 12 | let pat = path.resolve(p, top) 13 | let stat = await fs.stat(pat) 14 | if (stat.isDirectory()) { 15 | let temp = await fs.readdir(pat) 16 | if (temp) { 17 | for (let i of temp) { 18 | stack.push(path.join(top, i)) 19 | } 20 | } 21 | } else { 22 | // console.log(pat) 23 | 24 | let personList = await fs.readFile(pat, {encoding: "utf8"}) 25 | 26 | var regexpNames = /(?:export|import)(?:\s)*?(?:\{)??.*?(?:\})??(?:\s)*?from(?:\s)*?"(.+?)"/gm 27 | 28 | var match = personList.matchAll(regexpNames); 29 | 30 | let count = 0 31 | for (let item of match) { 32 | if (/.js$/.test(item[1])) { 33 | continue 34 | } 35 | let temp = item[0] 36 | let index = item.index + count 37 | let now = temp.replace(item[1], `${ 38 | item[1] 39 | }.js`) 40 | let past = personList.slice(0, index) 41 | let feature = personList.slice(index + temp.length, personList.length) 42 | personList = `${past}${now}${feature}` 43 | count = count + 3 44 | } 45 | 46 | await fs.writeFile(pat, personList, {encoding: "utf8"}) 47 | } 48 | } 49 | // for (let item of paths) { 50 | // let pat = path.resolve(p, item) 51 | // console.log(pat) 52 | // let stat = await fs.stat(pat) 53 | // console.log(stat.isDirectory()) 54 | // } 55 | } catch (error) { 56 | console.log(error) 57 | } 58 | 59 | }() 60 | 
-------------------------------------------------------------------------------- /mycheck/check.ts: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const path = require('path') 3 | import { paser } from '../src/index' 4 | 5 | let paths = [ 6 | // "demo.html", 7 | // "demo1.html", 8 | // "demo2.html", 9 | // "demo3.html", 10 | // "demo4.html", 11 | // "demo5.html", 12 | // "test4.html", 13 | // "test5.html", 14 | // "test6.html", 15 | // "test7.html", 16 | // "test8.html", 17 | // "MDN_HTML.html", 18 | // "MDN_JavaScript.html", 19 | // "CSDN.html", 20 | // "CSDN_SPM.html", 21 | // "test9.html", 22 | // "test10.html", 23 | // "test11.html", 24 | // "test12.html", 25 | // "test13.html", 26 | // "test14.html", 27 | // "test15.html", 28 | // "test16.html", 29 | // "test17.html", 30 | // "test18.html", 31 | // "test19.html", 32 | // "test20.html", 33 | // "test21.html", 34 | // "test22.html", 35 | // "test23.html", 36 | // "google.html", 37 | // "005055fd7e2625aba5e8d2d370ea4914a152fe50d16620f896cdf4b1a68ba741.html", 38 | // "005055fd7e2625aba5e8d2d370ea4914a152fe50d16620f896cdf4b1a68ba741-origin.html", 39 | // "039c4b966d1f2a0c589ac0aad211fe65500ad1cb58c7f45b34251db7056803ec-origin.html", 40 | // "0475e5eeadaaca857eea3f36d0eda01937fe672d48be7f98ba6bc7f25ecd63d0.html", 41 | // "078cdb456d1beb698aeed86e0f2161e442e9431c4580295f1ba4ece22741068c.html", 42 | // "0e55dcdbeb54c88ee87942b9fef7ea5398fa9a1e83493d55844b479506a80fd8.html", 43 | // "qidian.html", 44 | // "test24.html", 45 | // "test25.html", 46 | // "test26.html", 47 | // "qidian1.html", 48 | // "test27.html", 49 | // "test28.html", 50 | // "test30.html", 51 | // "test31.html", 52 | // "test32.html", 53 | // "test33.html", 54 | // "test34.html", 55 | // "test35.html", 56 | // "test36.html", 57 | // "test38.html", 58 | // "test40.html", 59 | ] 60 | 61 | for (let p of paths) { 62 | let code = fs.readFileSync(path.resolve(__dirname, `../demo/${p}`), { 
encoding: 'utf-8' }) 63 | // console.log(code, 'code') 64 | if (code.length > 0) { 65 | // console.time("test") 66 | let ast = paser(code, true) 67 | // console.timeEnd("test") 68 | // console.log(__dirname, __filename) 69 | console.log(path.resolve(__dirname, "../out/", `./${p}.ast.json`)) 70 | fs.writeFileSync(path.resolve(__dirname, "../out/", `./${p}.ast.json`), JSON.stringify(ast, null, 4)) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /demo/test18.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |
      11 | 12 | 18 | 19 | 25 |
    • 26 | 27 | 30 |
    • 31 | 34 |
    • 35 | 37 | 38 |
    • 39 | 42 |
    • Print 44 |
    • 45 |
    46 | 47 | -------------------------------------------------------------------------------- /demo/test16.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 12 | 17 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /test/test25.spec.ts: -------------------------------------------------------------------------------- 1 | import { test, expect } from '@playwright/test'; 2 | const request = require("request-promise") 3 | import { parse } from '../src/parser' 4 | 5 | function getAll(source, type) { 6 | let current = source; 7 | let res = [] 8 | let stack = Array.isArray(current) ? current : [current]; 9 | while (stack.length) { 10 | let top = stack.pop(); 11 | // console.log([top]) 12 | if (type === "browser") { 13 | if (top.nodeName === "#comment") { 14 | continue 15 | } 16 | if (top.nodeName === "#text" && /[\r\n]+/.test(top.data)) { 17 | continue 18 | } 19 | res.push({ tag: top.tagName ? top.tagName : "text", content: top.data }) 20 | } else if (type === "parser") { 21 | if (top.type === "comment") { 22 | continue 23 | } 24 | if (top.type === "text" && /[\r\n]+/.test(top.content)) { 25 | continue 26 | } 27 | res.push({ tag: top.tag ? 
top.tag : "text", content: top.content }) 28 | } 29 | if (top.childNodes) { 30 | stack.push(...top.childNodes) 31 | } else if (top.children) { 32 | stack.push(...top.children) 33 | } 34 | } 35 | return res 36 | } 37 | 38 | test('test25.html', async ({ page }) => { 39 | // encodeURIComponent 40 | let url = "http://127.0.0.1:3000/demo/test25.html" 41 | await page.goto(url, { 42 | referer: "", 43 | // timeout: 30, 44 | waitUntil: "domcontentloaded" 45 | }); 46 | 47 | await page.evaluateHandle(`document.body.classList.add("body")`) 48 | 49 | // const name = await page.innerText('title'); 50 | // expect(name).toBe('Document'); 51 | // const elementHandle = await page.$('body'); 52 | // console.log(elementHandle) 53 | // const bodyElement = elementHandle.asElement() 54 | // console.log(bodyElement) 55 | 56 | // const aHandle = await page.evaluateHandle('document'); 57 | // console.log(aHandle) 58 | 59 | // const aHandle = await page.evaluateHandle(() => document.body); 60 | // let doc = await aHandle.jsonValue() 61 | // console.log(doc) 62 | // const resultHandle = await page.evaluateHandle(body => body.innerHTML, aHandle); 63 | // const jsonValue = await resultHandle.jsonValue() 64 | // console.log(jsonValue); 65 | // await resultHandle.dispose(); 66 | 67 | const aHandle = await page.evaluateHandle(`(${getAll.toString()})(Array.from(document.childNodes),"browser")`); 68 | // console.log(aHandle) 69 | const browser: any = await aHandle.jsonValue() 70 | 71 | let html = await request({ 72 | method: "GET", 73 | uri: url, 74 | headers: { 75 | "Accept": "*/*", 76 | "Accept-Encoding": "deflate, br", 77 | "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6", 78 | "Connection": "keep-alive", 79 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59", 80 | }, 81 | }) 82 | 83 | let ast = parse(html) 84 | 85 | // let body = search(ast, "body") 86 | 87 | let parser = 
getAll(ast.children, "parser") 88 | 89 | // console.log(browser, parser) 90 | let count = browser.length > parser.length ? browser.length : parser.length; 91 | for (let i = 0; i < count; i++) { 92 | // console.log(browser[i] ? browser[i].tag : "undefined", parser[i] ? parser[i].tag : "undefined") 93 | expect(browser[i].tag.toLowerCase()).toBe(parser[i].tag) 94 | // console.assert(browser[i].tag.toLowerCase() === parser[i].tag, `${browser[i] ? browser[i].tag : "undefined"}, ${parser[i] ? parser[i].tag : "undefined"}`) 95 | } 96 | }); -------------------------------------------------------------------------------- /test/test26.spec.ts: -------------------------------------------------------------------------------- 1 | import { test, expect } from '@playwright/test'; 2 | const request = require("request-promise") 3 | import { parse } from '../src/parser' 4 | 5 | function getAll(source, type) { 6 | let current = source; 7 | let res = [] 8 | let stack = Array.isArray(current) ? current : [current]; 9 | while (stack.length) { 10 | let top = stack.pop(); 11 | // console.log([top]) 12 | if (type === "browser") { 13 | if (top.nodeName === "#comment") { 14 | continue 15 | } 16 | if (top.nodeName === "#text" && /[\r\n]+/.test(top.data)) { 17 | continue 18 | } 19 | res.push({ tag: top.tagName ? top.tagName : "text", content: top.data }) 20 | } else if (type === "parser") { 21 | if (top.type === "comment") { 22 | continue 23 | } 24 | if (top.type === "text" && /[\r\n]+/.test(top.content)) { 25 | continue 26 | } 27 | res.push({ tag: top.tag ? 
top.tag : "text", content: top.content }) 28 | } 29 | if (top.childNodes) { 30 | stack.push(...top.childNodes) 31 | } else if (top.children) { 32 | stack.push(...top.children) 33 | } 34 | } 35 | return res 36 | } 37 | 38 | test('test26.html', async ({ page }) => { 39 | // encodeURIComponent 40 | let url = "http://127.0.0.1:3000/demo/test26.html" 41 | await page.goto(url, { 42 | referer: "", 43 | // timeout: 30, 44 | waitUntil: "domcontentloaded" 45 | }); 46 | 47 | await page.evaluateHandle(`document.body.classList.add("body")`) 48 | 49 | // const name = await page.innerText('title'); 50 | // expect(name).toBe('Document'); 51 | // const elementHandle = await page.$('body'); 52 | // console.log(elementHandle) 53 | // const bodyElement = elementHandle.asElement() 54 | // console.log(bodyElement) 55 | 56 | // const aHandle = await page.evaluateHandle('document'); 57 | // console.log(aHandle) 58 | 59 | // const aHandle = await page.evaluateHandle(() => document.body); 60 | // let doc = await aHandle.jsonValue() 61 | // console.log(doc) 62 | // const resultHandle = await page.evaluateHandle(body => body.innerHTML, aHandle); 63 | // const jsonValue = await resultHandle.jsonValue() 64 | // console.log(jsonValue); 65 | // await resultHandle.dispose(); 66 | 67 | const aHandle = await page.evaluateHandle(`(${getAll.toString()})(Array.from(document.childNodes),"browser")`); 68 | // console.log(aHandle) 69 | const browser: any = await aHandle.jsonValue() 70 | 71 | let html = await request({ 72 | method: "GET", 73 | uri: url, 74 | headers: { 75 | "Accept": "*/*", 76 | "Accept-Encoding": "deflate, br", 77 | "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6", 78 | "Connection": "keep-alive", 79 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59", 80 | }, 81 | }) 82 | 83 | let ast = parse(html) 84 | 85 | // let body = search(ast, "body") 86 | 87 | let parser = 
getAll(ast.children, "parser") 88 | 89 | // console.log(browser, parser) 90 | let count = browser.length > parser.length ? browser.length : parser.length; 91 | for (let i = 0; i < count; i++) { 92 | // console.log(browser[i] ? browser[i].tag : "undefined", parser[i] ? parser[i].tag : "undefined") 93 | expect(browser[i].tag.toLowerCase()).toBe(parser[i].tag) 94 | // console.assert(browser[i].tag.toLowerCase() === parser[i].tag, `${browser[i] ? browser[i].tag : "undefined"}, ${parser[i] ? parser[i].tag : "undefined"}`) 95 | } 96 | }); -------------------------------------------------------------------------------- /demo/test25.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 12 | 13 | 18 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /test/test24.spec.ts: -------------------------------------------------------------------------------- 1 | import { test, expect } from '@playwright/test'; 2 | import { webkit } from 'playwright' 3 | const request = require("request-promise") 4 | import { parse } from '../src/parser' 5 | 6 | function getAll(source, type) { 7 | let current = source; 8 | let res = [] 9 | let stack = Array.isArray(current) ? current : [current]; 10 | while (stack.length) { 11 | let top = stack.pop(); 12 | // console.log([top]) 13 | if (type === "browser") { 14 | if (top.nodeName === "#comment") { 15 | continue 16 | } 17 | if (top.nodeName === "#text" && /[\r\n]+/.test(top.data)) { 18 | continue 19 | } 20 | res.push({ tag: top.tagName ? top.tagName : "text", content: top.data }) 21 | } else if (type === "parser") { 22 | if (top.type === "comment") { 23 | continue 24 | } 25 | if (top.type === "text" && /[\r\n]+/.test(top.content)) { 26 | continue 27 | } 28 | res.push({ tag: top.tag ? 
top.tag : "text", content: top.content }) 29 | } 30 | if (top.childNodes) { 31 | stack.push(...top.childNodes) 32 | } else if (top.children) { 33 | stack.push(...top.children) 34 | } 35 | } 36 | return res 37 | } 38 | 39 | test('test24.html', async (/*{ page }*/) => { 40 | // encodeURIComponent 41 | let url = "http://127.0.0.1:3000/demo/test24.html" 42 | const web = await webkit.launch(); 43 | const context = await web.newContext({ 44 | javaScriptEnabled: false 45 | }); 46 | const newpage = await context.newPage(); 47 | await newpage.goto(url, { 48 | referer: "", 49 | // timeout: 30, 50 | waitUntil: "domcontentloaded" 51 | }); 52 | 53 | await newpage.evaluateHandle(`document.body.classList.add("body")`) 54 | 55 | // const name = await page.innerText('title'); 56 | // expect(name).toBe('Document'); 57 | // const elementHandle = await page.$('body'); 58 | // console.log(elementHandle) 59 | // const bodyElement = elementHandle.asElement() 60 | // console.log(bodyElement) 61 | 62 | // const aHandle = await page.evaluateHandle('document'); 63 | // console.log(aHandle) 64 | 65 | // const aHandle = await page.evaluateHandle(() => document.body); 66 | // let doc = await aHandle.jsonValue() 67 | // console.log(doc) 68 | // const resultHandle = await page.evaluateHandle(body => body.innerHTML, aHandle); 69 | // const jsonValue = await resultHandle.jsonValue() 70 | // console.log(jsonValue); 71 | // await resultHandle.dispose(); 72 | 73 | const aHandle = await newpage.evaluateHandle(`(${getAll.toString()})(Array.from(document.childNodes),"browser")`); 74 | // console.log(aHandle) 75 | const browser: any = await aHandle.jsonValue() 76 | 77 | let html = await request({ 78 | method: "GET", 79 | uri: url, 80 | headers: { 81 | "Accept": "*/*", 82 | "Accept-Encoding": "deflate, br", 83 | "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6", 84 | "Connection": "keep-alive", 85 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like 
Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59", 86 | }, 87 | }) 88 | 89 | let ast = parse(html) 90 | 91 | // let body = search(ast, "body") 92 | 93 | let parser = getAll(ast.children, "parser") 94 | 95 | // console.log(browser, parser) 96 | let count = browser.length > parser.length ? browser.length : parser.length; 97 | for (let i = 0; i < count; i++) { 98 | // console.log(browser[i] ? browser[i].tag : "undefined", parser[i] ? parser[i].tag : "undefined") 99 | expect(browser[i].tag.toLowerCase()).toBe(parser[i].tag) 100 | // console.assert(browser[i].tag.toLowerCase() === parser[i].tag, `${browser[i] ? browser[i].tag : "undefined"}, ${parser[i] ? parser[i].tag : "undefined"}`) 101 | } 102 | }); -------------------------------------------------------------------------------- /demo/test31.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 51 | 52 | -------------------------------------------------------------------------------- /demo/test19.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |
    11 |
    12 |

    >A federal appeals court has dealt the Obama administration yet another blow in its quest to keep at least 13 | some age restrictions on the sale of emergency contraceptive pills.

    14 |

    In a three-paragraph 16 | order, a three-judge panel for the United States Court of Appeals for the 2nd Circuit ruled that 17 | although the government's appeal of a lower 19 | court decision removing all age restrictions on morning-after pills is still pending, at least some 20 | medications must be made available over the counter immediately.

    21 |

    Specifically, the panel said that while the requirement for one-pill versions of the morning-after pill to be 22 | made available without age restrictions can be delayed while the appeal is considered, that is not the case 23 | for "two-pill variants," which include generic products Next Choice and other levonorgestral tablets.

    26 |

    Ironically, the FDA had sought to produce a compromise by approving 28 | in late April an over-the-counter version of Plan B 29 | One-Step, a one-pill version that would be available on pharmacy shelves but only to those 15 and 30 | over who are able to produce proper identification.

    31 |

    But Plan B One-Step costs 33 | in the neighborhood of $50, while the generic two-pill formulations cost about $20 to $35.

    35 |

    The saga of trying to move emergency contraception from a prescription-only to an over-the-counter product 36 | has been ongoing for more than a decade through two successive presidential administrations.

    37 |

    U.S. District Court Judge Edward Korman, who has overseen the case since 2005, has made it clear that he 38 | thinks the government has dragged its feet to the point of violating the law.

    39 |

    But few expected the New York-based appeals court to agree with Korman, even in part, by denying the 40 | government's request to stay his April 6 order while the appeal is being heard.

    41 |

    The government — via the Departments of Justice and Health and Human Services — had no immediate comment on 42 | the ruling. Representatives would say only that they were "reviewing the order" from the appeals court.

    43 |

    Those who have been pursuing the case, however, had a bit more to say.

    44 |

    "Today's decision from the 2nd Circuit marks an historic day for women's health," said Nancy Northup, 45 | president and CEO of the Center for Reproductive Rights, which 46 | has represented some of the plaintiffs in the lawsuit. "Finally, after more than a decade of politically 47 | motivated delays, women will no longer have to endure intrusive, onerous and medically unnecessary 48 | restrictions to get emergency contraception."

    49 |

    What happens next remains unclear. Some lawyers say the government might be able to appeal to the full 2nd 50 | Circuit. But more likely, if they insist on fighting, government attorneys would have to seek relief from 51 | the Supreme Court justice who oversees the 2nd Circuit — Ruth Bader Ginsburg.

    52 |
    Copyright 2013 NPR. To see more, visit http://www.npr.org/. 54 |
    55 |

    56 |
    57 |
    58 | 59 | -------------------------------------------------------------------------------- /src/parser/Html.ts: -------------------------------------------------------------------------------- 1 | import { Lexer, TOKEN_EQUAL, TOKEN_LEFT_PAREN, TOKEN_NAME, TOKEN_QUOTE, TOKEN_RIGHT_PAREN, TOKEN_SELF_CLOSE, TOKEN_SINGLE_QUOTE } from "../lexer"; 2 | 3 | export interface Node { 4 | LineNum?: number, 5 | children?: Array, 6 | attr: Array, 7 | type?: string, 8 | tag?: string, 9 | selfClose?: boolean, 10 | parent?: any, 11 | } 12 | 13 | let temp = Symbol("temp") 14 | let nextSibling = temp 15 | 16 | export class Node { 17 | constructor() { 18 | this.children = [] 19 | this.attr = [] 20 | } 21 | get nextSibling() { 22 | if (nextSibling !== temp) return nextSibling 23 | if (!this.parent) return null 24 | let lengtn = this.parent.children.length 25 | let index = -1 26 | for (let item of this.parent.children) { 27 | index++ 28 | if (item === this) { 29 | break 30 | } 31 | } 32 | if (index + 1 > lengtn) { 33 | return null 34 | } 35 | return this.parent.children[index + 1] 36 | } 37 | 38 | set nextSibling(value: any) { 39 | nextSibling = value 40 | } 41 | } 42 | 43 | export function parseTag(lexer: Lexer) { 44 | return lexer.NextTokenIs(TOKEN_NAME).nowToken; // tag_name 45 | } 46 | 47 | export function parseName(lexer: Lexer, node: any) { 48 | let attrReg = /[^\s"'>/=[\u0000-\u001f]+/.exec(lexer.sourceCode) 49 | let name = "" 50 | if (attrReg) { 51 | name = attrReg[0] 52 | } 53 | if (name.includes("<")) { 54 | /* 55 | 57 | 58 |

    59 | 60 | 暂时当做selfClose标签处理 61 | */ 62 | node.selfClose = true 63 | } 64 | lexer.skipSourceCode(name.length) 65 | return name 66 | } 67 | 68 | function genereteAttr(name: any, value?: any) { 69 | return { 70 | name, 71 | value, 72 | } 73 | } 74 | 75 | export function parseValue(lexer: Lexer) { 76 | if (lexer.sourceCode[0] === "'") { 77 | return parseSingleQuotedAttr(lexer) 78 | } else if (lexer.sourceCode[0] === '"') { 79 | return parseDoubleQuotedAttr(lexer) 80 | } else { 81 | return parseString(lexer) 82 | } 83 | } 84 | 85 | export function parseString(lexer: Lexer) { 86 | let value = "" 87 | // lexer.NextTokenIs(TOKEN_SINGLE_QUOTE); 88 | // lexer.stack.pop() 89 | let res = /[^\s><]*/.exec(lexer.sourceCode) 90 | if (res) { 91 | value = res[0] 92 | } 93 | lexer.skipSourceCode(value.length) 94 | // lexer.NextTokenIs(TOKEN_SINGLE_QUOTE); 95 | // lexer.stack.pop() 96 | return value 97 | } 98 | 99 | export function parseSingleQuotedAttr(lexer: Lexer) { 100 | let value = "" 101 | lexer.NextTokenIs(TOKEN_SINGLE_QUOTE); 102 | lexer.stack.pop() 103 | let res = /[^']*/.exec(lexer.sourceCode) 104 | if (res) { 105 | value = res[0] 106 | } 107 | lexer.skipSourceCode(value.length) 108 | lexer.NextTokenIs(TOKEN_SINGLE_QUOTE); 109 | lexer.stack.pop() 110 | return value 111 | } 112 | 113 | export function parseDoubleQuotedAttr(lexer: Lexer) { 114 | let value = "" 115 | lexer.NextTokenIs(TOKEN_QUOTE); 116 | lexer.stack.pop() 117 | let res = /[^"]*/.exec(lexer.sourceCode) 118 | if (res) { 119 | value = res[0] 120 | } 121 | lexer.skipSourceCode(value.length) 122 | lexer.NextTokenIs(TOKEN_QUOTE); 123 | lexer.stack.pop() 124 | return value 125 | } 126 | 127 | export function parseAttr(lexer: Lexer, node: any) { 128 | let attrItem: { 129 | name?: string, 130 | value?: string, 131 | } = {} 132 | lexer.isIgnored(); // 空格 133 | let tag = parseName(lexer, node) 134 | lexer.isIgnored() // 空格 135 | if (tag) { 136 | let attr = tag 137 | attrItem = genereteAttr(attr); // name 138 | 
lexer.isIgnored(); // 空格 139 | if (lexer.sourceCode[0] === "=") { 140 | lexer.NextTokenIs(TOKEN_EQUAL) // = 141 | lexer.stack.pop() 142 | lexer.isIgnored(); // 空格 143 | attrItem.value = parseValue(lexer) 144 | lexer.isIgnored(); // 空格 145 | } else { 146 | attrItem.value = "true" 147 | lexer.isIgnored(); // 空格 148 | } 149 | } 150 | return attrItem 151 | } 152 | 153 | function checkAttrEnd(lexer: Lexer, node: Node) { 154 | if (lexer.sourceCode[0] === ">") { 155 | lexer.skipSourceCode(1) 156 | lexer.stack.push({ lineNum: lexer.lineNum, tokenType: TOKEN_RIGHT_PAREN /*>*/, token: ">" }) 157 | return false 158 | } else if (lexer.sourceCode.slice(0, 2) === "/>") { 159 | node.selfClose = true 160 | lexer.skipSourceCode(2); 161 | lexer.stack.push({ lineNum: lexer.lineNum, tokenType: TOKEN_SELF_CLOSE /*/>
    */, token: "/>" }) 162 | return false 163 | } else { 164 | return true 165 | } 166 | } 167 | 168 | export function parseHtml(lexer: Lexer) { 169 | let node = new Node() 170 | if (!lexer.check) { 171 | node.nextSibling = null 172 | } 173 | 174 | node.LineNum = lexer.GetLineNum() 175 | lexer.NextTokenIs(TOKEN_LEFT_PAREN) // < 176 | node.type = "tag" 177 | node.tag = parseTag(lexer) 178 | lexer.isIgnored() 179 | while (checkAttrEnd(lexer, node)) { 180 | if (lexer.nextSourceCodeIs("\r\n") || lexer.nextSourceCodeIs("\n\r")) { 181 | lexer.lineNum += 1 182 | lexer.skipSourceCode(2) 183 | } else { 184 | if (lexer.isNewLine(lexer.sourceCode[0])) { 185 | lexer.lineNum += 1 186 | lexer.skipSourceCode(1) 187 | } 188 | } 189 | let res = parseAttr(lexer, node) 190 | node.attr.push(res) 191 | } 192 | // lexer.isIgnored() 193 | return node 194 | } -------------------------------------------------------------------------------- /dist/parser/Html.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.parseHtml = exports.parseAttr = exports.parseDoubleQuotedAttr = exports.parseSingleQuotedAttr = exports.parseString = exports.parseValue = exports.parseName = exports.parseTag = exports.Node = void 0; 4 | const lexer_1 = require("../lexer"); 5 | let temp = Symbol("temp"); 6 | let nextSibling = temp; 7 | class Node { 8 | constructor() { 9 | this.children = []; 10 | this.attr = []; 11 | } 12 | get nextSibling() { 13 | if (nextSibling !== temp) 14 | return nextSibling; 15 | if (!this.parent) 16 | return null; 17 | let lengtn = this.parent.children.length; 18 | let index = -1; 19 | for (let item of this.parent.children) { 20 | index++; 21 | if (item === this) { 22 | break; 23 | } 24 | } 25 | if (index + 1 > lengtn) { 26 | return null; 27 | } 28 | return this.parent.children[index + 1]; 29 | } 30 | set nextSibling(value) { 31 | nextSibling = value; 32 | } 33 | } 
34 | exports.Node = Node; 35 | function parseTag(lexer) { 36 | return lexer.NextTokenIs(lexer_1.TOKEN_NAME).nowToken; // tag_name 37 | } 38 | exports.parseTag = parseTag; 39 | function parseName(lexer, node) { 40 | let attrReg = /[^\s"'>/=[\u0000-\u001f]+/.exec(lexer.sourceCode); 41 | let name = ""; 42 | if (attrReg) { 43 | name = attrReg[0]; 44 | } 45 | if (name.includes("<")) { 46 | /* 47 |
    49 | 50 |

    51 | 52 | 暂时当做selfClose标签处理 53 | */ 54 | node.selfClose = true; 55 | } 56 | lexer.skipSourceCode(name.length); 57 | return name; 58 | } 59 | exports.parseName = parseName; 60 | function genereteAttr(name, value) { 61 | return { 62 | name, 63 | value, 64 | }; 65 | } 66 | function parseValue(lexer) { 67 | if (lexer.sourceCode[0] === "'") { 68 | return parseSingleQuotedAttr(lexer); 69 | } 70 | else if (lexer.sourceCode[0] === '"') { 71 | return parseDoubleQuotedAttr(lexer); 72 | } 73 | else { 74 | return parseString(lexer); 75 | } 76 | } 77 | exports.parseValue = parseValue; 78 | function parseString(lexer) { 79 | let value = ""; 80 | // lexer.NextTokenIs(TOKEN_SINGLE_QUOTE); 81 | // lexer.stack.pop() 82 | let res = /[^\s><]*/.exec(lexer.sourceCode); 83 | if (res) { 84 | value = res[0]; 85 | } 86 | lexer.skipSourceCode(value.length); 87 | // lexer.NextTokenIs(TOKEN_SINGLE_QUOTE); 88 | // lexer.stack.pop() 89 | return value; 90 | } 91 | exports.parseString = parseString; 92 | function parseSingleQuotedAttr(lexer) { 93 | let value = ""; 94 | lexer.NextTokenIs(lexer_1.TOKEN_SINGLE_QUOTE); 95 | lexer.stack.pop(); 96 | let res = /[^']*/.exec(lexer.sourceCode); 97 | if (res) { 98 | value = res[0]; 99 | } 100 | lexer.skipSourceCode(value.length); 101 | lexer.NextTokenIs(lexer_1.TOKEN_SINGLE_QUOTE); 102 | lexer.stack.pop(); 103 | return value; 104 | } 105 | exports.parseSingleQuotedAttr = parseSingleQuotedAttr; 106 | function parseDoubleQuotedAttr(lexer) { 107 | let value = ""; 108 | lexer.NextTokenIs(lexer_1.TOKEN_QUOTE); 109 | lexer.stack.pop(); 110 | let res = /[^"]*/.exec(lexer.sourceCode); 111 | if (res) { 112 | value = res[0]; 113 | } 114 | lexer.skipSourceCode(value.length); 115 | lexer.NextTokenIs(lexer_1.TOKEN_QUOTE); 116 | lexer.stack.pop(); 117 | return value; 118 | } 119 | exports.parseDoubleQuotedAttr = parseDoubleQuotedAttr; 120 | function parseAttr(lexer, node) { 121 | let attrItem = {}; 122 | lexer.isIgnored(); // 空格 123 | let tag = parseName(lexer, node); 
124 | lexer.isIgnored(); // 空格 125 | if (tag) { 126 | let attr = tag; 127 | attrItem = genereteAttr(attr); // name 128 | lexer.isIgnored(); // 空格 129 | if (lexer.sourceCode[0] === "=") { 130 | lexer.NextTokenIs(lexer_1.TOKEN_EQUAL); // = 131 | lexer.stack.pop(); 132 | lexer.isIgnored(); // 空格 133 | attrItem.value = parseValue(lexer); 134 | lexer.isIgnored(); // 空格 135 | } 136 | else { 137 | attrItem.value = "true"; 138 | lexer.isIgnored(); // 空格 139 | } 140 | } 141 | return attrItem; 142 | } 143 | exports.parseAttr = parseAttr; 144 | function checkAttrEnd(lexer, node) { 145 | if (lexer.sourceCode[0] === ">") { 146 | lexer.skipSourceCode(1); 147 | lexer.stack.push({ lineNum: lexer.lineNum, tokenType: lexer_1.TOKEN_RIGHT_PAREN /*>*/, token: ">" }); 148 | return false; 149 | } 150 | else if (lexer.sourceCode.slice(0, 2) === "/>") { 151 | node.selfClose = true; 152 | lexer.skipSourceCode(2); 153 | lexer.stack.push({ lineNum: lexer.lineNum, tokenType: lexer_1.TOKEN_SELF_CLOSE /*/>
    */, token: "/>" }); 154 | return false; 155 | } 156 | else { 157 | return true; 158 | } 159 | } 160 | function parseHtml(lexer) { 161 | let node = new Node(); 162 | if (!lexer.check) { 163 | node.nextSibling = null; 164 | } 165 | node.LineNum = lexer.GetLineNum(); 166 | lexer.NextTokenIs(lexer_1.TOKEN_LEFT_PAREN); // < 167 | node.type = "tag"; 168 | node.tag = parseTag(lexer); 169 | lexer.isIgnored(); 170 | while (checkAttrEnd(lexer, node)) { 171 | if (lexer.nextSourceCodeIs("\r\n") || lexer.nextSourceCodeIs("\n\r")) { 172 | lexer.lineNum += 1; 173 | lexer.skipSourceCode(2); 174 | } 175 | else { 176 | if (lexer.isNewLine(lexer.sourceCode[0])) { 177 | lexer.lineNum += 1; 178 | lexer.skipSourceCode(1); 179 | } 180 | } 181 | let res = parseAttr(lexer, node); 182 | node.attr.push(res); 183 | } 184 | // lexer.isIgnored() 185 | return node; 186 | } 187 | exports.parseHtml = parseHtml; 188 | -------------------------------------------------------------------------------- /demo/test12.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /demo/test23.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |
    11 |
    12 |
    13 |
    Go Pro Now!
    14 |
    Unlock all the features
    15 |
    16 |
    17 |
    18 |
    19 |
    20 | Save 25% 21 |
    22 |
    23 |
    24 |
    25 | $2.99 26 |
    27 | Monthly 28 | Annual Billing 29 |
    30 |
    31 | Upgrade Now 32 |
    33 |
    34 | 42 |
    43 |
    44 | 45 | 46 | 47 | 50 | 51 | 52 | 53 | 56 | 57 | 58 | 59 | 62 | 63 | 64 | 65 | 68 | 69 | 70 | 71 | 74 | 75 | 76 | 77 | 80 | 81 | 82 |
    48 | Pro Version Features: 49 |
    54 | Unlimited Words for non-stop Listening 55 |
    60 | Read Text in 27 Different Languages 61 |
    66 | Read Text at 21 Different Speed Settings 67 |
    72 | Autoplay Text - Fast and Easy 73 |
    78 | Automatic Language Detection 79 |
    83 |
    84 |
    85 |
    86 |
    87 |
    88 |
    89 |
    90 |
    91 |
    92 | 93 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* Visit https://aka.ms/tsconfig.json to read more about this file */ 4 | 5 | /* Basic Options */ 6 | // "incremental": true, /* Enable incremental compilation */ 7 | "target": "ESNEXT", /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019', 'ES2020', or 'ESNEXT'. */ 8 | "module": "commonjs", /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', 'es2020', or 'ESNext'. */ 9 | "lib": ["ESNext", "DOM"], /* Specify library files to be included in the compilation. */ 10 | "allowJs": true, /* Allow javascript files to be compiled. */ 11 | // "checkJs": true, /* Report errors in .js files. */ 12 | // "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */ 13 | // "declaration": true, /* Generates corresponding '.d.ts' file. */ 14 | // "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */ 15 | // "sourceMap": true, /* Generates corresponding '.map' file. */ 16 | // "outFile": "./", /* Concatenate and emit output to single file. */ 17 | "outDir": "./dist", /* Redirect output structure to the directory. */ 18 | "rootDir": "./src", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */ 19 | // "composite": true, /* Enable project compilation */ 20 | // "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */ 21 | // "removeComments": true, /* Do not emit comments to output. */ 22 | // "noEmit": true, /* Do not emit outputs. */ 23 | // "importHelpers": true, /* Import emit helpers from 'tslib'. 
*/ 24 | // "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */ 25 | // "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */ 26 | 27 | /* Strict Type-Checking Options */ 28 | "strict": true, /* Enable all strict type-checking options. */ 29 | // "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */ 30 | // "strictNullChecks": true, /* Enable strict null checks. */ 31 | // "strictFunctionTypes": true, /* Enable strict checking of function types. */ 32 | // "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */ 33 | // "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */ 34 | // "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */ 35 | // "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */ 36 | 37 | /* Additional Checks */ 38 | // "noUnusedLocals": true, /* Report errors on unused locals. */ 39 | // "noUnusedParameters": true, /* Report errors on unused parameters. */ 40 | // "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */ 41 | // "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */ 42 | // "noUncheckedIndexedAccess": true, /* Include 'undefined' in index signature results */ 43 | 44 | /* Module Resolution Options */ 45 | // "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */ 46 | // "baseUrl": "./", /* Base directory to resolve non-absolute module names. */ 47 | // "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. 
*/ 48 | // "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */ 49 | // "typeRoots": [], /* List of folders to include type definitions from. */ 50 | // "types": [], /* Type declaration files to be included in compilation. */ 51 | // "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */ 52 | "esModuleInterop": true, /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */ 53 | // "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */ 54 | // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ 55 | 56 | /* Source Map Options */ 57 | // "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */ 58 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ 59 | // "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */ 60 | // "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */ 61 | 62 | /* Experimental Options */ 63 | // "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */ 64 | // "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */ 65 | 66 | /* Advanced Options */ 67 | "skipLibCheck": true, /* Skip type checking of declaration files. */ 68 | "forceConsistentCasingInFileNames": true /* Disallow inconsistently-cased references to the same file. 
*/ 69 | }, 70 | "exclude": [ 71 | "vm", 72 | "examples", 73 | "test", 74 | "app", 75 | "dist-esmodule", 76 | "script", 77 | "copy", 78 | "mycheck", 79 | "coverage", 80 | "matchtest", 81 | "compare.js", 82 | "jest-playwright.config.js", 83 | "jest.config.js", 84 | "server.js", 85 | "test-server.js", 86 | ], 87 | } 88 | -------------------------------------------------------------------------------- /tsconfig-esmodule.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* Visit https://aka.ms/tsconfig.json to read more about this file */ 4 | 5 | /* Basic Options */ 6 | // "incremental": true, /* Enable incremental compilation */ 7 | "target": "ESNEXT", /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019', 'ES2020', or 'ESNEXT'. */ 8 | "module": "ESNext", /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', 'es2020', or 'ESNext'. */ 9 | "lib": ["ESNext", "DOM"], /* Specify library files to be included in the compilation. */ 10 | "allowJs": true, /* Allow javascript files to be compiled. */ 11 | // "checkJs": true, /* Report errors in .js files. */ 12 | // "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */ 13 | // "declaration": true, /* Generates corresponding '.d.ts' file. */ 14 | // "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */ 15 | // "sourceMap": true, /* Generates corresponding '.map' file. */ 16 | // "outFile": "./", /* Concatenate and emit output to single file. */ 17 | "outDir": "./dist-esmodule", /* Redirect output structure to the directory. */ 18 | "rootDir": "./src", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. 
*/ 19 | // "composite": true, /* Enable project compilation */ 20 | // "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */ 21 | // "removeComments": true, /* Do not emit comments to output. */ 22 | // "noEmit": true, /* Do not emit outputs. */ 23 | // "importHelpers": true, /* Import emit helpers from 'tslib'. */ 24 | // "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */ 25 | // "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */ 26 | 27 | /* Strict Type-Checking Options */ 28 | "strict": true, /* Enable all strict type-checking options. */ 29 | // "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */ 30 | // "strictNullChecks": true, /* Enable strict null checks. */ 31 | // "strictFunctionTypes": true, /* Enable strict checking of function types. */ 32 | // "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */ 33 | // "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */ 34 | // "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */ 35 | // "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */ 36 | 37 | /* Additional Checks */ 38 | // "noUnusedLocals": true, /* Report errors on unused locals. */ 39 | // "noUnusedParameters": true, /* Report errors on unused parameters. */ 40 | // "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */ 41 | // "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. 
*/ 42 | // "noUncheckedIndexedAccess": true, /* Include 'undefined' in index signature results */ 43 | 44 | /* Module Resolution Options */ 45 | // "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */ 46 | // "baseUrl": "./", /* Base directory to resolve non-absolute module names. */ 47 | // "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */ 48 | // "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */ 49 | // "typeRoots": [], /* List of folders to include type definitions from. */ 50 | // "types": [], /* Type declaration files to be included in compilation. */ 51 | // "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */ 52 | "esModuleInterop": true, /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */ 53 | // "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */ 54 | // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ 55 | 56 | /* Source Map Options */ 57 | // "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */ 58 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ 59 | // "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */ 60 | // "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */ 61 | 62 | /* Experimental Options */ 63 | // "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. 
*/ 64 | // "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */ 65 | 66 | /* Advanced Options */ 67 | "skipLibCheck": true, /* Skip type checking of declaration files. */ 68 | "forceConsistentCasingInFileNames": true /* Disallow inconsistently-cased references to the same file. */ 69 | }, 70 | "exclude": [ 71 | "vm", 72 | "examples", 73 | "test", 74 | "app", 75 | "dist", 76 | "script", 77 | "copy", 78 | "mycheck", 79 | "coverage", 80 | "matchtest", 81 | "compare.js", 82 | "jest-playwright.config.js", 83 | "jest.config.js", 84 | "server.js", 85 | "test-server.js", 86 | ], 87 | } 88 | -------------------------------------------------------------------------------- /test/testall.spec.ts: -------------------------------------------------------------------------------- 1 | import { test, expect } from '@playwright/test'; 2 | import { chromium } from 'playwright' 3 | const request = require("request-promise") 4 | import { parse } from '../src/parser' 5 | const path = require("path") 6 | const fs = require("fs") 7 | 8 | function getAll(source, type) { 9 | let current = source; 10 | let res = [] 11 | let stack = Array.isArray(current) ? current : [current]; 12 | while (stack.length) { 13 | let top = stack.pop(); 14 | // console.log([top]) 15 | if (type === "browser") { 16 | if (top.nodeName === "#comment") { 17 | continue 18 | } 19 | if (top.nodeName === "#text" && /[\r\n]+/.test(top.data)) { 20 | continue 21 | } 22 | if (top.nodeName === "html") { // 浏览器中DocumentType的nodeName位html 23 | res.push({ tag: "DTD", content: null }) 24 | continue 25 | } 26 | res.push({ tag: top.nodeName === "#text" ? "text" : top.nodeName, content: top.nodeName === "#text" ? 
top.data : null }) 27 | } else if (type === "parser") { 28 | if (top.type === "comment") { 29 | continue 30 | } 31 | // top.content.replace(/[\r\n]+/g, "").trim() 32 | if (top.type === "text" && /[\r\n]+/.test(top.content)) { 33 | continue 34 | } 35 | if (top.type === "DTD") { 36 | res.push({ tag: "dtd", content: top.content, LineNum: top.LineNum }) 37 | continue 38 | } 39 | res.push({ tag: top.tag ? top.tag : "text", content: top.content, LineNum: top.LineNum }) 40 | } 41 | if (top.childNodes) { 42 | stack.push(...top.childNodes) 43 | } else if (top.children) { 44 | // debugger 45 | stack.push(...top.children) 46 | } 47 | } 48 | return res 49 | } 50 | 51 | 52 | let paths = [ 53 | "demo.html", 54 | "demo1.html", 55 | "demo2.html", 56 | "demo3.html", 57 | "demo4.html", 58 | "demo5.html", 59 | "test4.html", 60 | "test5.html", 61 | "test6.html", 62 | "test7.html", 63 | "test8.html", 64 | "MDN_HTML.html", 65 | "MDN_JavaScript.html", 66 | "CSDN.html", 67 | 68 | // "CSDN_SPM.html", 69 | 70 | "test9.html", 71 | "test10.html", 72 | "test11.html", 73 | "test12.html", 74 | "test13.html", 75 | "test14.html", 76 | "test15.html", 77 | "test16.html", 78 | "test17.html", 79 | "test18.html", 80 | "test19.html", 81 | "test20.html", 82 | "test21.html", 83 | "test22.html", 84 | "test23.html", 85 | "google.html", 86 | 87 | // "005055fd7e2625aba5e8d2d370ea4914a152fe50d16620f896cdf4b1a68ba741.html", 88 | // "005055fd7e2625aba5e8d2d370ea4914a152fe50d16620f896cdf4b1a68ba741-origin.html", 89 | // "039c4b966d1f2a0c589ac0aad211fe65500ad1cb58c7f45b34251db7056803ec-origin.html", 90 | // ok 91 | "0475e5eeadaaca857eea3f36d0eda01937fe672d48be7f98ba6bc7f25ecd63d0.html", 92 | "078cdb456d1beb698aeed86e0f2161e442e9431c4580295f1ba4ece22741068c.html", 93 | "0e55dcdbeb54c88ee87942b9fef7ea5398fa9a1e83493d55844b479506a80fd8.html", 94 | "qidian.html", 95 | "test24.html", 96 | "test25.html", 97 | "test26.html", 98 | "qidian1.html", 99 | "test27.html", 100 | "test28.html", 101 | "test29.html", 102 | 
"test30.html", 103 | "test31.html", 104 | "test32.html", 105 | "test33.html", 106 | "test34.html", 107 | "test35.html", 108 | "test36.html", 109 | "test38.html", 110 | "test39.html", 111 | ] 112 | 113 | for (let item of paths) { 114 | test(`${item}`, async (/*{ page }*/) => { 115 | let url = `http://127.0.0.1:3000/demo/${item}` 116 | // console.log(url) 117 | const web = await chromium.launch(); 118 | const context = await web.newContext({ 119 | javaScriptEnabled: false 120 | }); 121 | const newpage = await context.newPage(); 122 | await newpage.goto(url, { 123 | referer: "", 124 | // timeout: 30, 125 | waitUntil: "domcontentloaded" 126 | }); 127 | 128 | await newpage.evaluateHandle(`document.body.classList.add("body")`) 129 | 130 | // const name = await page.innerText('title'); 131 | // expect(name).toBe('Document'); 132 | // const elementHandle = await page.$('body'); 133 | // console.log(elementHandle) 134 | // const bodyElement = elementHandle.asElement() 135 | // console.log(bodyElement) 136 | 137 | // const aHandle = await page.evaluateHandle('document'); 138 | // console.log(aHandle) 139 | 140 | // const aHandle = await page.evaluateHandle(() => document.body); 141 | // let doc = await aHandle.jsonValue() 142 | // console.log(doc) 143 | // const resultHandle = await page.evaluateHandle(body => body.innerHTML, aHandle); 144 | // const jsonValue = await resultHandle.jsonValue() 145 | // console.log(jsonValue); 146 | // await resultHandle.dispose(); 147 | 148 | const aHandle = await newpage.evaluateHandle(`(${getAll.toString()})(Array.from(document.childNodes),"browser")`); 149 | // console.log(aHandle) 150 | const browser: any = await aHandle.jsonValue() 151 | 152 | let html = await request({ 153 | method: "GET", 154 | uri: url, 155 | headers: { 156 | "Accept": "*/*", 157 | "Accept-Encoding": "deflate, br", 158 | "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6", 159 | "Connection": "keep-alive", 160 | "User-Agent": "Mozilla/5.0 (Windows NT 
10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59", 161 | }, 162 | }) 163 | 164 | let ast = parse(html, false) 165 | 166 | // let body = search(ast, "body") 167 | 168 | let parser = getAll(ast.children, "parser") 169 | if (!fs.existsSync(path.resolve(__dirname, "../out/"))) { 170 | fs.mkdirSync(path.resolve(__dirname, "../out/")) 171 | } 172 | fs.writeFileSync(path.resolve(__dirname, "../out/", `./$parser.ast.json`), JSON.stringify(parser, null, 4)) 173 | fs.writeFileSync(path.resolve(__dirname, "../out/", `./$browser.ast.json`), JSON.stringify(browser, null, 4)) 174 | 175 | // console.log(browser, parser) 176 | let count = browser.length > parser.length ? browser.length : parser.length; 177 | for (let i = 0; i < count; i++) { 178 | // console.log(browser[i] ? browser[i].tag : "undefined", parser[i] ? parser[i].tag : "undefined") 179 | 180 | // if (browser[i] && parser[i]) { 181 | expect(browser[i].tag.toLowerCase()).toBe(parser[i].tag) 182 | // } else { 183 | // console.log(i) 184 | // console.log(browser[i], parser[i]) 185 | // } 186 | 187 | // console.assert(browser[i].tag.toLowerCase() === parser[i].tag, `${browser[i] ? browser[i].tag : "undefined"}, ${parser[i] ? 
parser[i].tag : "undefined"}`) 188 | } 189 | }); 190 | } 191 | -------------------------------------------------------------------------------- /src/parser/parseText.ts: -------------------------------------------------------------------------------- 1 | import { COMMENT, Lexer, regexName, TOKEN_CLOSE, TOKEN_CONTENT_TEXT, TOKEN_DTD, TOKEN_LEFT_PAREN, TOKEN_NAME, TOKEN_RIGHT_PAREN, TOKEN_SELF_CLOSE } from "../lexer"; 2 | import { parseHtml } from "./Html"; 3 | import { isSpecialTag, parseClose } from "./tagClose"; 4 | 5 | 6 | export interface Node { 7 | LineNum?: number, 8 | children?: Array, 9 | content: string, 10 | type?: string, 11 | selfClose?: boolean, 12 | parent?: any, 13 | } 14 | 15 | let temp = Symbol("temp") 16 | let nextSibling = temp 17 | 18 | export class Node { 19 | constructor() { 20 | this.content = "" 21 | } 22 | get nextSibling() { 23 | if (nextSibling !== temp) return nextSibling 24 | if (!this.parent) return null 25 | let lengtn = this.parent.children.length 26 | let index = -1 27 | for (let item of this.parent.children) { 28 | index++ 29 | if (item === this) { 30 | break 31 | } 32 | } 33 | if (index + 1 > lengtn) { 34 | return null 35 | } 36 | return this.parent.children[index + 1] 37 | } 38 | 39 | set nextSibling(value: any) { 40 | nextSibling = value 41 | } 42 | } 43 | 44 | export function isClose(lexer: Lexer) { 45 | const length = lexer.stack.length 46 | const topTwo = length >= 2 ? lexer.stack[length - 2].tokenType : "" 47 | 48 | const isTOKEN_DTD = topTwo === TOKEN_DTD 49 | const isCOMMENT = topTwo === COMMENT 50 | if (length >= 2 && 51 | (isTOKEN_DTD /*dtd*/ || 52 | isCOMMENT /*comment*/) 53 | ) { 54 | return true 55 | } 56 | 57 | if (length < 4) return false 58 | 59 | const topThree = lexer.stack[length - 3].tokenType 60 | const topFour = lexer.stack[length - 4].tokenType 61 | const isTOKEN_RIGHT_PAREN = topTwo === TOKEN_RIGHT_PAREN 62 | const isTOKEN_NAME = topThree === TOKEN_NAME 63 | //
    64 | let one = isTOKEN_RIGHT_PAREN /*>*/ && 65 | isTOKEN_NAME /*tag_name*/ && 66 | topFour === TOKEN_CLOSE /* 68 | let close = isTOKEN_RIGHT_PAREN /*>*/ && 69 | topThree === TOKEN_NAME /*tag_name*/ && 70 | topFour === TOKEN_LEFT_PAREN /*<*/; 71 | // /> 72 | let selfClose = topTwo === TOKEN_SELF_CLOSE; // />
    73 | return one || close || selfClose; 74 | } 75 | 76 | /* 77 | 提取出来的公共代码 78 | */ 79 | function judgeEnd(lexer: Lexer) { 80 | if (lexer.sourceCode.slice(0, 2) === "*/ || 81 | lexer.sourceCode.slice(0, 2) === " || */ 84 | ) { 85 | return false 86 | } else { 87 | /*在这里是什么特征看调用函数的注释部分contentText
    */ 88 | if ((lexer.sourceCode[0] === "<" 89 | && regexName.test(lexer.sourceCode[1]))) { 90 | let parseRes = parseHtml(lexer) 91 | if (parseRes.selfClose) { 92 | return false 93 | } 94 | } 95 | return true 96 | } 97 | } 98 | 99 | function contentEnd(lexer: Lexer) { 100 | //
    contentText
    101 | //
    contentText
    102 | // contentText 103 | //
    contentText
    ||
    contentText
    104 | let stack = lexer.stack 105 | const length = stack.length 106 | if (isClose(lexer) && 107 | length >= 4 && 108 | stack[length - 2].tokenType === TOKEN_RIGHT_PAREN /*>*/ && 109 | stack[length - 3].tokenType === TOKEN_NAME /*name*/ && 110 | stack[length - 4].tokenType === TOKEN_LEFT_PAREN /*<*/ 111 | ) { 112 | // 118 | */ 119 | let script = ["", ""] 120 | if (script.includes(lexer.sourceCode.slice(0, script[0].length))) { 121 | return false 122 | } else { 123 | return true 124 | } 125 | } 126 | // noscript 127 | // if (lexer.stack[length - 3].token === "noscript") { 128 | // /* 129 | // 132 | // */ 133 | // let script = "" 134 | // if (lexer.sourceCode.slice(0, script.length) === script) { 135 | // return false 136 | // } else { 137 | // return true 138 | // } 139 | // } 140 | judgeEnd(lexer) 141 | } 142 | //
    contentText
    143 | //
    contentText 144 | //
    contentText
    145 | //
    contentText
    146 | if (isClose(lexer) && 147 | length >= 4 && 148 | stack[length - 2].tokenType === TOKEN_RIGHT_PAREN /*>*/ && 149 | stack[length - 3].tokenType === TOKEN_NAME /*name*/ && 150 | stack[length - 4].tokenType === TOKEN_CLOSE /*contentText
    155 | //
    contentText 156 | //
    contentText
    157 | //
    contentText
    158 | 159 | if (isClose(lexer) && 160 | length >= 4 && 161 | stack[length - 2].tokenType === TOKEN_SELF_CLOSE /*self-close />
    */ && 162 | stack[length - 3].tokenType === TOKEN_NAME /*name*/ && 163 | stack[length - 4].tokenType === TOKEN_LEFT_PAREN /*<*/ 164 | ) { 165 | judgeEnd(lexer) 166 | } 167 | 168 | // contentText
    169 | // contentText 170 | // contentText
    171 | // contentText
    172 | if (isClose(lexer) && 173 | stack[length - 2].tokenType === COMMENT /*COMMENT*/ 174 | ) { 175 | judgeEnd(lexer) 176 | } 177 | 178 | // contentText
    179 | // contentText
    180 | // contentText 181 | // contentText
    182 | if (isClose(lexer) && 183 | stack[length - 2].tokenType === TOKEN_DTD /*DTD*/ 184 | ) { 185 | judgeEnd(lexer) 186 | } 187 | 188 | /*contentText
    */ 189 | /*contentText || */ 191 | /*contentText
    */ 192 | if (stack[length - 1].tokenType === TOKEN_CONTENT_TEXT /*contentText*/) { 193 | if ((lexer.sourceCode[0] === "<" && 194 | regexName.test(lexer.sourceCode[1])) /*contentText
    */ || 195 | lexer.sourceCode.slice(0, 2) === " || */ 197 | ) { 198 | return false 199 | } else { 200 | /*contentText
    */ 201 | if ((lexer.sourceCode[0] === "<" 202 | && regexName.test(lexer.sourceCode[1]))) { 203 | let parseRes = parseHtml(lexer) 204 | if (parseRes.selfClose) { 205 | return false 206 | } 207 | } 208 | return true 209 | } 210 | } 211 | 212 | 213 | // return true 214 | throw new Error(`not find contentEnd! at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 100)}`) 215 | 216 | } 217 | 218 | export function parseText(lexer: Lexer) { 219 | lexer.hasCache = false 220 | let node = new Node() 221 | if (!lexer.check) { 222 | node.nextSibling = null 223 | } 224 | 225 | // lexer.isIgnored(); 226 | node.LineNum = lexer.GetLineNum() 227 | 228 | let content = "" 229 | while (contentEnd(lexer) && !lexer.isEmpty()) { 230 | if (lexer.nextSourceCodeIs("\r\n") || lexer.nextSourceCodeIs("\n\r")) { 231 | lexer.lineNum += 1 232 | content += lexer.sourceCode.slice(0, 2) 233 | lexer.skipSourceCode(2) 234 | } else { 235 | if (lexer.isNewLine(lexer.sourceCode[0])) { 236 | lexer.lineNum += 1 237 | content += lexer.sourceCode[0] 238 | lexer.skipSourceCode(1) 239 | } else { 240 | content += lexer.sourceCode[0] 241 | lexer.skipSourceCode(1) 242 | } 243 | } 244 | } 245 | 246 | if ( 247 | lexer.stack.length >= 3 && 248 | isSpecialTag({ tag: lexer.stack[lexer.stack.length - 3].token })) { 249 | let token = lexer.stack[lexer.stack.length - 3].token 250 | let tokenLen = ``.length 251 | if (lexer.sourceCode.slice(0, tokenLen) === ``) { 252 | lexer.skipSourceCode(2) 253 | let res = { lineNum: lexer.lineNum, tokenType: TOKEN_CLOSE, token: " lengtn) { 27 | return null; 28 | } 29 | return this.parent.children[index + 1]; 30 | } 31 | set nextSibling(value) { 32 | nextSibling = value; 33 | } 34 | } 35 | exports.Node = Node; 36 | function isClose(lexer) { 37 | const length = lexer.stack.length; 38 | const topTwo = length >= 2 ? 
lexer.stack[length - 2].tokenType : ""; 39 | const isTOKEN_DTD = topTwo === lexer_1.TOKEN_DTD; 40 | const isCOMMENT = topTwo === lexer_1.COMMENT; 41 | if (length >= 2 && 42 | (isTOKEN_DTD /*dtd*/ || 43 | isCOMMENT /*comment*/)) { 44 | return true; 45 | } 46 | if (length < 4) 47 | return false; 48 | const topThree = lexer.stack[length - 3].tokenType; 49 | const topFour = lexer.stack[length - 4].tokenType; 50 | const isTOKEN_RIGHT_PAREN = topTwo === lexer_1.TOKEN_RIGHT_PAREN; 51 | const isTOKEN_NAME = topThree === lexer_1.TOKEN_NAME; 52 | // 53 | let one = isTOKEN_RIGHT_PAREN /*>*/ && 54 | isTOKEN_NAME /*tag_name*/ && 55 | topFour === lexer_1.TOKEN_CLOSE /* 57 | let close = isTOKEN_RIGHT_PAREN /*>*/ && 58 | topThree === lexer_1.TOKEN_NAME /*tag_name*/ && 59 | topFour === lexer_1.TOKEN_LEFT_PAREN /*<*/; 60 | // /> 61 | let selfClose = topTwo === lexer_1.TOKEN_SELF_CLOSE; // />
    62 | return one || close || selfClose; 63 | } 64 | exports.isClose = isClose; 65 | /* 66 | 提取出来的公共代码 67 | */ 68 | function judgeEnd(lexer) { 69 | if (lexer.sourceCode.slice(0, 2) === "*/ || 70 | lexer.sourceCode.slice(0, 2) === " || */) { 73 | return false; 74 | } 75 | else { 76 | /*在这里是什么特征看调用函数的注释部分contentText
    */ 77 | if ((lexer.sourceCode[0] === "<" 78 | && lexer_1.regexName.test(lexer.sourceCode[1]))) { 79 | let parseRes = Html_1.parseHtml(lexer); 80 | if (parseRes.selfClose) { 81 | return false; 82 | } 83 | } 84 | return true; 85 | } 86 | } 87 | function contentEnd(lexer) { 88 | //
    contentText
    89 | //
    contentText
    90 | // contentText 91 | //
    contentText
    ||
    contentText
    92 | let stack = lexer.stack; 93 | const length = stack.length; 94 | if (isClose(lexer) && 95 | length >= 4 && 96 | stack[length - 2].tokenType === lexer_1.TOKEN_RIGHT_PAREN /*>*/ && 97 | stack[length - 3].tokenType === lexer_1.TOKEN_NAME /*name*/ && 98 | stack[length - 4].tokenType === lexer_1.TOKEN_LEFT_PAREN /*<*/) { 99 | // 105 | */ 106 | let script = ["", ""]; 107 | if (script.includes(lexer.sourceCode.slice(0, script[0].length))) { 108 | return false; 109 | } 110 | else { 111 | return true; 112 | } 113 | } 114 | // noscript 115 | // if (lexer.stack[length - 3].token === "noscript") { 116 | // /* 117 | // 120 | // */ 121 | // let script = "" 122 | // if (lexer.sourceCode.slice(0, script.length) === script) { 123 | // return false 124 | // } else { 125 | // return true 126 | // } 127 | // } 128 | judgeEnd(lexer); 129 | } 130 | //
    contentText
    131 | //
    contentText 132 | //
    contentText
    133 | //
    contentText
    134 | if (isClose(lexer) && 135 | length >= 4 && 136 | stack[length - 2].tokenType === lexer_1.TOKEN_RIGHT_PAREN /*>*/ && 137 | stack[length - 3].tokenType === lexer_1.TOKEN_NAME /*name*/ && 138 | stack[length - 4].tokenType === lexer_1.TOKEN_CLOSE /*contentText
    142 | //
    contentText 143 | //
    contentText
    144 | //
    contentText
    145 | if (isClose(lexer) && 146 | length >= 4 && 147 | stack[length - 2].tokenType === lexer_1.TOKEN_SELF_CLOSE /*self-close />
    */ && 148 | stack[length - 3].tokenType === lexer_1.TOKEN_NAME /*name*/ && 149 | stack[length - 4].tokenType === lexer_1.TOKEN_LEFT_PAREN /*<*/) { 150 | judgeEnd(lexer); 151 | } 152 | // contentText
    153 | // contentText 154 | // contentText
    155 | // contentText
    156 | if (isClose(lexer) && 157 | stack[length - 2].tokenType === lexer_1.COMMENT /*COMMENT*/) { 158 | judgeEnd(lexer); 159 | } 160 | // contentText
    161 | // contentText
    162 | // contentText 163 | // contentText
    164 | if (isClose(lexer) && 165 | stack[length - 2].tokenType === lexer_1.TOKEN_DTD /*DTD*/) { 166 | judgeEnd(lexer); 167 | } 168 | /*contentText
    */ 169 | /*contentText || */ 171 | /*contentText
    */ 172 | if (stack[length - 1].tokenType === lexer_1.TOKEN_CONTENT_TEXT /*contentText*/) { 173 | if ((lexer.sourceCode[0] === "<" && 174 | lexer_1.regexName.test(lexer.sourceCode[1])) /*contentText
    */ || 175 | lexer.sourceCode.slice(0, 2) === " || */) { 177 | return false; 178 | } 179 | else { 180 | /*contentText
    */ 181 | if ((lexer.sourceCode[0] === "<" 182 | && lexer_1.regexName.test(lexer.sourceCode[1]))) { 183 | let parseRes = Html_1.parseHtml(lexer); 184 | if (parseRes.selfClose) { 185 | return false; 186 | } 187 | } 188 | return true; 189 | } 190 | } 191 | // return true 192 | throw new Error(`not find contentEnd! at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 100)}`); 193 | } 194 | function parseText(lexer) { 195 | lexer.hasCache = false; 196 | let node = new Node(); 197 | if (!lexer.check) { 198 | node.nextSibling = null; 199 | } 200 | // lexer.isIgnored(); 201 | node.LineNum = lexer.GetLineNum(); 202 | let content = ""; 203 | while (contentEnd(lexer) && !lexer.isEmpty()) { 204 | if (lexer.nextSourceCodeIs("\r\n") || lexer.nextSourceCodeIs("\n\r")) { 205 | lexer.lineNum += 1; 206 | content += lexer.sourceCode.slice(0, 2); 207 | lexer.skipSourceCode(2); 208 | } 209 | else { 210 | if (lexer.isNewLine(lexer.sourceCode[0])) { 211 | lexer.lineNum += 1; 212 | content += lexer.sourceCode[0]; 213 | lexer.skipSourceCode(1); 214 | } 215 | else { 216 | content += lexer.sourceCode[0]; 217 | lexer.skipSourceCode(1); 218 | } 219 | } 220 | } 221 | if (lexer.stack.length >= 3 && 222 | tagClose_1.isSpecialTag({ tag: lexer.stack[lexer.stack.length - 3].token })) { 223 | let token = lexer.stack[lexer.stack.length - 3].token; 224 | let tokenLen = ``.length; 225 | if (lexer.sourceCode.slice(0, tokenLen) === ``) { 226 | lexer.skipSourceCode(2); 227 | let res = { lineNum: lexer.lineNum, tokenType: lexer_1.TOKEN_CLOSE, token: " 11 | 12 | 13 | 14 | 15 | 16 | Document 17 | 18 | 19 |
    20 |

    11{{res.value}}

    21 |
    22 | 23 | 24 | 25 | ``` 26 | 27 | ## AST 28 |
    29 | 点击查看详情(Click to view details) 30 |
    
     31 | {
     32 |     "type": "root",
     33 |     "children": [
     34 |         {
     35 |             "type": "DTD",
     36 |             "LineNum": 1,
     37 |             "content": "DOCTYPE html"
     38 |         },
     39 |         {
     40 |             "content": "\r\n",
     41 |             "LineNum": 1,
     42 |             "type": "text"
     43 |         },
     44 |         {
     45 |             "children": [
     46 |                 {
     47 |                     "content": "\r\n",
     48 |                     "LineNum": 2,
     49 |                     "type": "text"
     50 |                 },
     51 |                 {
     52 |                     "children": [
     53 |                         {
     54 |                             "content": "\r\n    ",
     55 |                             "LineNum": 3,
     56 |                             "type": "text"
     57 |                         },
     58 |                         {
     59 |                             "children": [],
     60 |                             "attr": [
     61 |                                 {
     62 |                                     "name": "charset",
     63 |                                     "value": "UTF-8"
     64 |                                 }
     65 |                             ],
     66 |                             "LineNum": 4,
     67 |                             "type": "tag",
     68 |                             "tag": "meta"
     69 |                         },
     70 |                         {
     71 |                             "content": "\r\n    ",
     72 |                             "LineNum": 4,
     73 |                             "type": "text"
     74 |                         },
     75 |                         {
     76 |                             "children": [],
     77 |                             "attr": [
     78 |                                 {
     79 |                                     "name": "http-equiv",
     80 |                                     "value": "X-UA-Compatible"
     81 |                                 },
     82 |                                 {
     83 |                                     "name": "content",
     84 |                                     "value": "IE=edge"
     85 |                                 }
     86 |                             ],
     87 |                             "LineNum": 5,
     88 |                             "type": "tag",
     89 |                             "tag": "meta"
     90 |                         },
     91 |                         {
     92 |                             "content": "\r\n    ",
     93 |                             "LineNum": 5,
     94 |                             "type": "text"
     95 |                         },
     96 |                         {
     97 |                             "children": [],
     98 |                             "attr": [
     99 |                                 {
    100 |                                     "name": "name",
    101 |                                     "value": "viewport"
    102 |                                 },
    103 |                                 {
    104 |                                     "name": "content",
    105 |                                     "value": "width=device-width, initial-scale=1.0"
    106 |                                 }
    107 |                             ],
    108 |                             "LineNum": 6,
    109 |                             "type": "tag",
    110 |                             "tag": "meta"
    111 |                         },
    112 |                         {
    113 |                             "content": "\r\n    ",
    114 |                             "LineNum": 6,
    115 |                             "type": "text"
    116 |                         },
    117 |                         {
    118 |                             "children": [
    119 |                                 {
    120 |                                     "content": "Document",
    121 |                                     "LineNum": 7,
    122 |                                     "type": "text"
    123 |                                 }
    124 |                             ],
    125 |                             "attr": [],
    126 |                             "LineNum": 7,
    127 |                             "type": "tag",
    128 |                             "tag": "title"
    129 |                         },
    130 |                         {
    131 |                             "content": "\r\n",
    132 |                             "LineNum": 7,
    133 |                             "type": "text"
    134 |                         }
    135 |                     ],
    136 |                     "attr": [],
    137 |                     "LineNum": 3,
    138 |                     "type": "tag",
    139 |                     "tag": "head"
    140 |                 },
    141 |                 {
    142 |                     "content": "\r\n",
    143 |                     "LineNum": 8,
    144 |                     "type": "text"
    145 |                 },
    146 |                 {
    147 |                     "children": [
    148 |                         {
    149 |                             "content": "\r\n    ",
    150 |                             "LineNum": 9,
    151 |                             "type": "text"
    152 |                         },
    153 |                         {
    154 |                             "children": [
    155 |                                 {
    156 |                                     "content": "\r\n        ",
    157 |                                     "LineNum": 10,
    158 |                                     "type": "text"
    159 |                                 },
    160 |                                 {
    161 |                                     "children": [
    162 |                                         {
    163 |                                             "content": "11{{res.value}}",
    164 |                                             "LineNum": 11,
    165 |                                             "type": "text"
    166 |                                         }
    167 |                                     ],
    168 |                                     "attr": [
    169 |                                         {
    170 |                                             "name": "v-if",
    171 |                                             "value": "res.value"
    172 |                                         },
    173 |                                         {
    174 |                                             "name": "name",
    175 |                                             "value": "11"
    176 |                                         },
    177 |                                         {
    178 |                                             "name": "@click",
    179 |                                             "value": "tes"
    180 |                                         }
    181 |                                     ],
    182 |                                     "LineNum": 11,
    183 |                                     "type": "tag",
    184 |                                     "tag": "h1"
    185 |                                 },
    186 |                                 {
    187 |                                     "content": "\r\n    ",
    188 |                                     "LineNum": 11,
    189 |                                     "type": "text"
    190 |                                 }
    191 |                             ],
    192 |                             "attr": [],
    193 |                             "LineNum": 10,
    194 |                             "type": "tag",
    195 |                             "tag": "div"
    196 |                         },
    197 |                         {
    198 |                             "content": "\r\n    ",
    199 |                             "LineNum": 12,
    200 |                             "type": "text"
    201 |                         },
    202 |                         {
    203 |                             "children": [],
    204 |                             "attr": [
    205 |                                 {
    206 |                                     "name": "href",
    207 |                                     "value": "http://github.com/"
    208 |                                 }
    209 |                             ],
    210 |                             "LineNum": 13,
    211 |                             "type": "tag",
    212 |                             "tag": "a"
    213 |                         },
    214 |                         {
    215 |                             "content": "\r\n",
    216 |                             "LineNum": 13,
    217 |                             "type": "text"
    218 |                         }
    219 |                     ],
    220 |                     "attr": [],
    221 |                     "LineNum": 9,
    222 |                     "type": "tag",
    223 |                     "tag": "body"
    224 |                 },
    225 |                 {
    226 |                     "content": "\r\n",
    227 |                     "LineNum": 14,
    228 |                     "type": "text"
    229 |                 }
    230 |             ],
    231 |             "attr": [
    232 |                 {
    233 |                     "name": "lang",
    234 |                     "value": "en"
    235 |                 }
    236 |             ],
    237 |             "LineNum": 2,
    238 |             "type": "tag",
    239 |             "tag": "html"
    240 |         }
    241 |     ],
    242 |     "LineNum": 1
    243 | }
    244 | 
    245 |
    246 | 247 | ## 添加应用 248 | [查找节点](https://github.com/liulinboyi/HTMLParser-App/tree/main/platform) 249 | 250 | ## TIPS 251 | 252 | > 无运行时依赖 253 | 254 | 没有做到浏览器那样兼容性巨好,HTML写成啥样都不报错都会解析,我只解析了一部分奇葩写法~有的HTML写法太奇葩了,要兼容就需要更多的分支和处理,需要更多的精力就算了。 255 | 256 | ## 注意 257 | 258 | #### ~~tsc编译后无法加上.js后缀,导致无法使用module,所以在所有ts文件导入加上了js后缀~~ 259 | #### ~~https://segmentfault.com/q/1010000038671707~~ 260 | #### ~~[社区讨论](https://github.com/microsoft/TypeScript/issues/16577)~~ 261 | 262 | #### 已解决,写了个[脚本](./script/addSuffixJs.js),将所有编译后的ES modules的导入导出部分加上了js后缀 263 | 264 | ## [测试](./test) 265 | #### 使用[playwright](https://github.com/microsoft/playwright.git)和浏览器生成的DOM结构做了对比,除了一些奇葩写法,其他基本没问题。 266 | -------------------------------------------------------------------------------- /dist/parser.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.parse = exports.Program = void 0; 4 | const lexer_1 = require("./lexer"); 5 | const Comment_1 = require("./parser/Comment"); 6 | const Directive_1 = require("./parser/Directive"); 7 | const DTD_1 = require("./parser/DTD"); 8 | const Html_1 = require("./parser/Html"); 9 | const parseText_1 = require("./parser/parseText"); 10 | const tagClose_1 = require("./parser/tagClose"); 11 | class Program { 12 | constructor() { 13 | this.type = 'root'; 14 | this.children = []; 15 | } 16 | } 17 | exports.Program = Program; 18 | // SourceCode ::= Statement+ 19 | function parseSourceCode(lexer, check) { 20 | let LineNum = lexer.GetLineNum(); 21 | let root = parseStatements(lexer, check); 22 | root.LineNum = LineNum; 23 | return root; 24 | } 25 | /** 26 | * 将children中的多余的text节点去除 27 | * @param children 28 | * @returns 29 | */ 30 | function filterText(children) { 31 | for (let start = 0; start < children.length; start++) { 32 | if (children[start].type === "text") { 33 | // 从实践中知道,如果有去除body后多余的text节点,则最多是两个取一个,所以有下面代码 34 | let i = start + 
1; 35 | if (i < children.length && children[i].type === "text") { 36 | // 其中重要的特征就是,里面是只有\r\n和空格 37 | // 只要当前标签和下一个标签这两个标签,则一定会删除一个"空标签(只包含\r\n和空格)" 38 | if (!children[i].content.replace(/[\r\n]+/g, "").trim()) { 39 | children[i].delete = true; // 添加上delete属性,后面好处理 40 | } 41 | else { 42 | children[start].delete = true; // 添加上delete属性,后面好处理 43 | } 44 | } 45 | } 46 | } 47 | // 删除delete为true的标签 48 | return children.filter((item) => !item.delete); 49 | } 50 | // Statement 51 | function parseStatements(lexer, check) { 52 | if (check) { 53 | lexer.check = true; 54 | } 55 | let root = { 56 | type: "root", 57 | children: [], 58 | LineNum: 1 59 | }; 60 | let statements = [root]; 61 | let Block_level_elements = [ 62 | "address", 63 | "article", 64 | "aside", 65 | "audio", 66 | "blockquote", 67 | "canvas", 68 | "dd", 69 | // "div", 70 | "dl", 71 | "fieldset", 72 | "figcaption", 73 | "figure", 74 | "figcaption", 75 | "footer", 76 | "form", 77 | "header", 78 | "hgroup", 79 | "hr", 80 | "noscript", 81 | "ol", 82 | "output", 83 | "p", 84 | "pre", 85 | "section", 86 | "table", 87 | "tfoot", 88 | "ul", 89 | "video" 90 | ]; 91 | let inlInline_elementsine = [ 92 | "b", 93 | "big", 94 | "i", 95 | "small", 96 | "tt", 97 | "abbr", 98 | "acronym", 99 | "cite", 100 | "code", 101 | "dfn", 102 | "em", 103 | "kbd", 104 | "strong", 105 | "samp", 106 | "var", 107 | "a", 108 | "bdo", 109 | "br", 110 | "img", 111 | "map", 112 | "object", 113 | "q", 114 | "script", 115 | "span", 116 | "sub", 117 | "sup", 118 | "button", 119 | "input", 120 | "label", 121 | "select", 122 | "textarea" 123 | ]; 124 | let notInSelf = [ 125 | "a", 126 | "br", 127 | "img", 128 | "script", 129 | "button", 130 | "input", 131 | ]; 132 | // select 里面的select会消失 133 | // textarea 会解析成 134 | let body = null; 135 | let mainBodyFinished = false; 136 | let uniqueStack = []; 137 | // let mainBodyFinishedIsText = false 138 | // 先调用LookAhead一次,将GetNextToken的结果缓存 139 | while (!isSourceCodeEnd(lexer.LookAhead().tokenType)) { 140 | // 
if (lexer.GetLineNum() === 20) { 141 | // debugger 142 | // } 143 | let statement = {}; 144 | statement = parseStatement(lexer); 145 | // console.log(`at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 30)}`) 146 | if (!statement) 147 | continue; 148 | let stack = statements; 149 | let s = statement; 150 | const length = stack.length - 1; 151 | if (s.type === "tag") { 152 | s.tag = s.tag.toLocaleLowerCase(); 153 | } 154 | if (!s.closeTag) { 155 | uniqueStack = []; 156 | if (notInSelf.includes(s.tag) && s.tag === stack[length].tag) { // 不能包含自己的元素 157 | stack.pop(); 158 | stack[stack.length - 1].children.push(s); 159 | stack.push(s); 160 | if (check) { 161 | s.parent = stack[stack.length - 1]; 162 | } 163 | continue; 164 | } 165 | // 处理多个body标签的问题 166 | // 如果mainBodyFinished位false,表示还未出现第一个body,并且当前起始标签是body,则寻找他的父节点,并将其赋值给body变量 167 | if (!mainBodyFinished && s.tag === "body" && !body) { 168 | // 寻找父节点 169 | let i = stack.length - 1; 170 | let parent = null; 171 | while (stack[i].type !== "tag" && i >= 0) { 172 | i--; 173 | } 174 | parent = i >= 0 ? 
stack[i] : null; 175 | // 找到的节点,赋值给body 176 | body = s; 177 | // 找到的父节点赋值给上面节点的parent属性,方便后续处理 178 | body.parent = parent; 179 | } 180 | stack[length].children.push(s); // 栈顶就是levalElement层级元素 181 | if (check) { 182 | s.parent = stack[length]; 183 | } 184 | if (s.type === "tag" && !s.selfClose && !tagClose_1.isSpecialTag(s)) { 185 | stack.push(s); 186 | // 处理多个body标签的问题 187 | // 如果已经出现过一个body标签并且现在这个起始标签还是body,则将其从栈中弹出,并且将其从栈顶的children中弹出 188 | if (mainBodyFinished && s.tag === "body") { 189 | stack.pop(); 190 | stack[length].children.pop(); 191 | if (check) { 192 | s.parent = null; 193 | } 194 | } 195 | } 196 | // 处理多个body标签的问题 197 | // 如果出现第一个body起始标签,则将mainBodyFinished置为true,方便在第一个body标签中再次出现body起始标签时将其忽略 198 | if (!mainBodyFinished && s.tag === "body") { 199 | mainBodyFinished = true; 200 | } 201 | } 202 | else { 203 | if (stack[length].tag !== s.tag) { 204 | uniqueStack.push(s); 205 | // 处理多个body标签的问题 206 | // 如果当前第一个body标签解析完成(mainBodyFinished),并且当前结束标签是body,则直接进行下次循环 207 | if (mainBodyFinished && s.tag === "body") { 208 | continue; 209 | } 210 | if (Block_level_elements.includes(s.tag)) { // 如果是块级元素会加入到levalElement层级元素当child 211 | stack[length].children.push(s); 212 | if (check) { 213 | s.parent = stack[length]; 214 | } 215 | } 216 | // 学习浏览器HTML解析,即使匹配不上也不报错,直接添加到levalElement层级元素当child 217 | console.warn(`${stack[length].tag} and ${s.tag} is not math! at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 100)}`); 218 | // throw new Error(`${stack[length].tag} and ${s.tag} is not math! 
at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 100)}`) 219 | } 220 | else { 221 | // 处理多个body标签的问题 222 | // 如果第一个body标签没有解析完成(mainBodyFinished),并且当前结束标签是body,则mainBodyFinished置为true 223 | if (!mainBodyFinished && s.tag === "body") { 224 | mainBodyFinished = true; 225 | } 226 | stack.pop(); 227 | if (uniqueStack.length > 0 && uniqueStack[uniqueStack.length - 1].tag === stack[stack.length - 1].tag) { 228 | uniqueStack.pop(); 229 | stack.pop(); 230 | } 231 | } 232 | } 233 | } 234 | // 处理多个body标签的问题 235 | // 找出body在父节点的索引 236 | let index = body && body.parent.children.findIndex((item) => item === body); 237 | // 从父节点下一个索引开始添加到第一个body中 238 | let real = index + 1; 239 | if (body) { 240 | for (let i = real; i < body.parent.children.length; i++) { 241 | if (body.parent.children[i].type === "tag") { 242 | body.parent.children[i].children = filterText(body.parent.children[i].children); 243 | } 244 | body.children.push(body.parent.children[i]); 245 | } 246 | let childrenLength = body.parent.children.length; 247 | for (let i = real; i < childrenLength; i++) { 248 | body.parent.children.pop(); 249 | } 250 | body.children = filterText(body.children); 251 | body.parent = null; 252 | } 253 | for (let i = 0; i < root.children.length; i++) { 254 | if (root.children[i].type === "DTD") { 255 | if (i - 1 >= 0 && root.children[i - 1].type === "text" && !root.children[i - 1].content.replace(/[\r\n]+/g, "").trim()) { 256 | root.children[i - 1].delete = true; 257 | } 258 | } 259 | if (root.children[i].tag === "html") { 260 | if (i - 1 >= 0 && root.children[i - 1].type === "text" && !root.children[i - 1].content.replace(/[\r\n]+/g, "").trim()) { 261 | root.children[i - 1].delete = true; 262 | } 263 | } 264 | if (check) { 265 | root.children[i].parent = null; 266 | } 267 | } 268 | root.children = root.children.filter((item) => !item.delete); 269 | return root; 270 | } 271 | function parseStatement(lexer) { 272 | // 向前看一个token并跳过 273 | lexer.LookAheadAndSkip(lexer_1.TOKEN_IGNORED); 
// skip if source code start with ignored token 274 | let look = lexer.LookAhead().tokenType; 275 | let flag = false; 276 | let top = lexer.stack[lexer.stack.length - 1]; 277 | if (top.tokenType === lexer_1.TOKEN_CONTENT_TEXT 278 | // isClose(lexer) && 279 | // top.tokenType !== TOKEN_LEFT_PAREN /*<*/ && 280 | // top.tokenType !== TOKEN_CLOSE /*, 13 | } 14 | 15 | export class Program { 16 | constructor() { 17 | this.type = 'root' 18 | this.children = [] 19 | } 20 | } 21 | 22 | 23 | // SourceCode ::= Statement+ 24 | function parseSourceCode(lexer: Lexer, check: boolean) { 25 | let LineNum = lexer.GetLineNum() 26 | let root = parseStatements(lexer, check) 27 | root.LineNum = LineNum 28 | return root 29 | } 30 | 31 | /** 32 | * 将children中的多余的text节点去除 33 | * @param children 34 | * @returns 35 | */ 36 | function filterText(children: any) { 37 | for (let start = 0; start < children.length; start++) { 38 | if (children[start].type === "text") { 39 | // 从实践中知道,如果有去除body后多余的text节点,则最多是两个取一个,所以有下面代码 40 | let i = start + 1 41 | if (i < children.length && children[i].type === "text") { 42 | // 其中重要的特征就是,里面是只有\r\n和空格 43 | // 只要当前标签和下一个标签这两个标签,则一定会删除一个"空标签(只包含\r\n和空格)" 44 | if (!children[i].content.replace(/[\r\n]+/g, "").trim()) { 45 | children[i].delete = true // 添加上delete属性,后面好处理 46 | } else { 47 | children[start].delete = true // 添加上delete属性,后面好处理 48 | } 49 | } 50 | } 51 | } 52 | // 删除delete为true的标签 53 | return children.filter((item: any) => !item.delete) 54 | } 55 | 56 | // Statement 57 | function parseStatements(lexer: Lexer, check: boolean) { 58 | 59 | if (check) { 60 | lexer.check = true 61 | } 62 | 63 | let root: any = { 64 | type: "root", 65 | children: [], 66 | LineNum: 1 67 | } 68 | 69 | let statements: Array = [root] 70 | 71 | let Block_level_elements = [ // 块级元素 72 | "address", 73 | "article", 74 | "aside", 75 | "audio", 76 | "blockquote", 77 | "canvas", 78 | "dd", 79 | // "div", 80 | "dl", 81 | "fieldset", 82 | "figcaption", 83 | "figure", 84 | "figcaption", 85 | 
"footer", 86 | "form", 87 | "header", 88 | "hgroup", 89 | "hr", 90 | "noscript", 91 | "ol", 92 | "output", 93 | "p", 94 | "pre", 95 | "section", 96 | "table", 97 | "tfoot", 98 | "ul", 99 | "video" 100 | ] 101 | 102 | let inlInline_elementsine = [ // 行内元素 103 | "b", 104 | "big", 105 | "i", 106 | "small", 107 | "tt", 108 | "abbr", 109 | "acronym", 110 | "cite", 111 | "code", 112 | "dfn", 113 | "em", 114 | "kbd", 115 | "strong", 116 | "samp", 117 | "var", 118 | "a", 119 | "bdo", 120 | "br", 121 | "img", 122 | "map", 123 | "object", 124 | "q", 125 | "script", 126 | "span", 127 | "sub", 128 | "sup", 129 | "button", 130 | "input", 131 | "label", 132 | "select", 133 | "textarea" 134 | ] 135 | 136 | let notInSelf = [ // 不能包含自己的元素 137 | "a", 138 | "br", 139 | "img", 140 | "script", 141 | "button", 142 | "input", 143 | ] 144 | // select 里面的select会消失 145 | // textarea 会解析成 146 | 147 | 148 | let body: any = null 149 | let mainBodyFinished = false 150 | let uniqueStack = [] 151 | // let mainBodyFinishedIsText = false 152 | // 先调用LookAhead一次,将GetNextToken的结果缓存 153 | while (!isSourceCodeEnd(lexer.LookAhead().tokenType)) { 154 | // if (lexer.GetLineNum() === 20) { 155 | // debugger 156 | // } 157 | let statement: any = {} 158 | statement = parseStatement(lexer) 159 | // console.log(`at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 30)}`) 160 | if (!statement) continue 161 | let stack = statements; 162 | let s = statement; 163 | const length = stack.length - 1 164 | if (s.type === "tag") { 165 | s.tag = s.tag.toLocaleLowerCase() 166 | } 167 | if (!s.closeTag) { 168 | 169 | uniqueStack = [] 170 | if (notInSelf.includes(s.tag) && s.tag === stack[length].tag) { // 不能包含自己的元素 171 | stack.pop() 172 | stack[stack.length - 1].children.push(s) 173 | stack.push(s) 174 | if (check) { 175 | s.parent = stack[stack.length - 1] 176 | } 177 | continue 178 | } 179 | 180 | // 处理多个body标签的问题 181 | // 如果mainBodyFinished位false,表示还未出现第一个body,并且当前起始标签是body,则寻找他的父节点,并将其赋值给body变量 182 | if 
(!mainBodyFinished && s.tag === "body" && !body) { 183 | // 寻找父节点 184 | let i = stack.length - 1 185 | let parent = null 186 | while (stack[i].type !== "tag" && i >= 0) { 187 | i--; 188 | } 189 | parent = i >= 0 ? stack[i] : null; 190 | // 找到的节点,赋值给body 191 | body = s 192 | // 找到的父节点赋值给上面节点的parent属性,方便后续处理 193 | body.parent = parent 194 | } 195 | 196 | stack[length].children.push(s) // 栈顶就是levalElement层级元素 197 | if (check) { 198 | s.parent = stack[length] 199 | } 200 | if (s.type === "tag" && !s.selfClose && !isSpecialTag(s)) { 201 | stack.push(s) 202 | // 处理多个body标签的问题 203 | // 如果已经出现过一个body标签并且现在这个起始标签还是body,则将其从栈中弹出,并且将其从栈顶的children中弹出 204 | if (mainBodyFinished && s.tag === "body") { 205 | stack.pop() 206 | stack[length].children.pop() 207 | if (check) { 208 | s.parent = null 209 | } 210 | } 211 | } 212 | // 处理多个body标签的问题 213 | // 如果出现第一个body起始标签,则将mainBodyFinished置为true,方便在第一个body标签中再次出现body起始标签时将其忽略 214 | if (!mainBodyFinished && s.tag === "body") { 215 | mainBodyFinished = true 216 | } 217 | } else { 218 | if (stack[length].tag !== s.tag) { 219 | uniqueStack.push(s) 220 | // 处理多个body标签的问题 221 | // 如果当前第一个body标签解析完成(mainBodyFinished),并且当前结束标签是body,则直接进行下次循环 222 | if (mainBodyFinished && s.tag === "body") { 223 | continue 224 | } 225 | if (Block_level_elements.includes(s.tag)) { // 如果是块级元素会加入到levalElement层级元素当child 226 | stack[length].children.push(s) 227 | if (check) { 228 | s.parent = stack[length] 229 | } 230 | } 231 | // 学习浏览器HTML解析,即使匹配不上也不报错,直接添加到levalElement层级元素当child 232 | console.warn(`${stack[length].tag} and ${s.tag} is not math! at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 100)}`) 233 | // throw new Error(`${stack[length].tag} and ${s.tag} is not math! 
at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 100)}`) 234 | } else { 235 | // 处理多个body标签的问题 236 | // 如果第一个body标签没有解析完成(mainBodyFinished),并且当前结束标签是body,则mainBodyFinished置为true 237 | if (!mainBodyFinished && s.tag === "body") { 238 | mainBodyFinished = true 239 | } 240 | stack.pop() 241 | if (uniqueStack.length > 0 && uniqueStack[uniqueStack.length - 1].tag === stack[stack.length - 1].tag) { 242 | uniqueStack.pop() 243 | stack.pop() 244 | } 245 | } 246 | } 247 | } 248 | 249 | // 处理多个body标签的问题 250 | // 找出body在父节点的索引 251 | let index = body && body.parent.children.findIndex((item: any) => item === body) 252 | // 从父节点下一个索引开始添加到第一个body中 253 | let real = index + 1 254 | 255 | if (body) { 256 | 257 | for (let i = real; i < body.parent.children.length; i++) { 258 | if (body.parent.children[i].type === "tag") { 259 | body.parent.children[i].children = filterText(body.parent.children[i].children) 260 | } 261 | body.children.push(body.parent.children[i]) 262 | } 263 | 264 | let childrenLength = body.parent.children.length 265 | for (let i = real; i < childrenLength; i++) { 266 | body.parent.children.pop() 267 | } 268 | 269 | body.children = filterText(body.children) 270 | 271 | body.parent = null 272 | } 273 | 274 | 275 | for (let i = 0; i < root.children.length; i++) { 276 | if (root.children[i].type === "DTD") { 277 | if (i - 1 >= 0 && root.children[i - 1].type === "text" && !root.children[i - 1].content.replace(/[\r\n]+/g, "").trim()) { 278 | root.children[i - 1].delete = true 279 | } 280 | } 281 | if (root.children[i].tag === "html") { 282 | if (i - 1 >= 0 && root.children[i - 1].type === "text" && !root.children[i - 1].content.replace(/[\r\n]+/g, "").trim()) { 283 | root.children[i - 1].delete = true 284 | } 285 | } 286 | if (check) { 287 | root.children[i].parent = null 288 | } 289 | } 290 | 291 | root.children = root.children.filter((item: any) => !item.delete) 292 | 293 | return root 294 | } 295 | 296 | function parseStatement(lexer: Lexer) { 297 | // 
向前看一个token并跳过 298 | lexer.LookAheadAndSkip(TOKEN_IGNORED) // skip if source code start with ignored token 299 | let look = lexer.LookAhead().tokenType 300 | let flag = false 301 | let top = lexer.stack[lexer.stack.length - 1] 302 | if ( 303 | top.tokenType === TOKEN_CONTENT_TEXT 304 | // isClose(lexer) && 305 | // top.tokenType !== TOKEN_LEFT_PAREN /*<*/ && 306 | // top.tokenType !== TOKEN_CLOSE /* 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 385 | 386 | -------------------------------------------------------------------------------- /dist/lexer.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.NewLexer = exports.Lexer = exports.tokenNameMap = exports.keywords = exports.regexName = exports.SourceCharacter = exports.DIRECTIVE = exports.COMMENT = exports.INTERGER = exports.TOKEN_IGNORED = exports.TOKEN_NAME = exports.TOKEN_SELF_CLOSE = exports.TOKEN_DTD = exports.TOKEN_CLOSE = exports.TOKEN_CONTENT_TEXT = exports.TOKEN_DUOQUOTE = exports.TOKEN_LEFT_LINE = exports.TOKEN_SINGLE_QUOTE = exports.TOKEN_QUOTE = exports.TOKEN_EQUAL = exports.TOKEN_RIGHT_PAREN = exports.TOKEN_TAG_NAME = exports.TOKEN_LEFT_PAREN = exports.TOKEN_EOF = exports.Tokens = void 0; 4 | // token const 5 | var Tokens; 6 | (function (Tokens) { 7 | Tokens[Tokens["TOKEN_EOF"] = 0] = "TOKEN_EOF"; 8 | Tokens[Tokens["TOKEN_LEFT_PAREN"] = 1] = "TOKEN_LEFT_PAREN"; 9 | Tokens[Tokens["TOKEN_TAG_NAME"] = 2] = "TOKEN_TAG_NAME"; 10 | Tokens[Tokens["TOKEN_RIGHT_PAREN"] = 3] = "TOKEN_RIGHT_PAREN"; 11 | Tokens[Tokens["TOKEN_EQUAL"] = 4] = "TOKEN_EQUAL"; 12 | Tokens[Tokens["TOKEN_QUOTE"] = 5] = "TOKEN_QUOTE"; 13 | Tokens[Tokens["TOKEN_SINGLE_QUOTE"] = 6] = "TOKEN_SINGLE_QUOTE"; 14 | Tokens[Tokens["TOKEN_LEFT_LINE"] = 7] = "TOKEN_LEFT_LINE"; 15 | Tokens[Tokens["TOKEN_DUOQUOTE"] = 8] = "TOKEN_DUOQUOTE"; 16 | Tokens[Tokens["TOKEN_CONTENT_TEXT"] = 9] = "TOKEN_CONTENT_TEXT"; 17 | 
Tokens[Tokens["TOKEN_CLOSE"] = 10] = "TOKEN_CLOSE"; 18 | Tokens[Tokens["TOKEN_DTD"] = 11] = "TOKEN_DTD"; 19 | Tokens[Tokens["TOKEN_SELF_CLOSE"] = 12] = "TOKEN_SELF_CLOSE"; 20 | Tokens[Tokens["TOKEN_NAME"] = 13] = "TOKEN_NAME"; 21 | Tokens[Tokens["TOKEN_IGNORED"] = 14] = "TOKEN_IGNORED"; 22 | Tokens[Tokens["INTERGER"] = 15] = "INTERGER"; 23 | Tokens[Tokens["COMMENT"] = 16] = "COMMENT"; 24 | Tokens[Tokens["DIRECTIVE"] = 17] = "DIRECTIVE"; 25 | Tokens[Tokens["SourceCharacter"] = 18] = "SourceCharacter"; 26 | })(Tokens = exports.Tokens || (exports.Tokens = {})); 27 | exports.TOKEN_EOF = Tokens.TOKEN_EOF, exports.TOKEN_LEFT_PAREN = Tokens.TOKEN_LEFT_PAREN, exports.TOKEN_TAG_NAME = Tokens.TOKEN_TAG_NAME, exports.TOKEN_RIGHT_PAREN = Tokens.TOKEN_RIGHT_PAREN, exports.TOKEN_EQUAL = Tokens.TOKEN_EQUAL, exports.TOKEN_QUOTE = Tokens.TOKEN_QUOTE, exports.TOKEN_SINGLE_QUOTE = Tokens.TOKEN_SINGLE_QUOTE, exports.TOKEN_LEFT_LINE = Tokens.TOKEN_LEFT_LINE, exports.TOKEN_DUOQUOTE = Tokens.TOKEN_DUOQUOTE, exports.TOKEN_CONTENT_TEXT = Tokens.TOKEN_CONTENT_TEXT, exports.TOKEN_CLOSE = Tokens.TOKEN_CLOSE, exports.TOKEN_DTD = Tokens.TOKEN_DTD, exports.TOKEN_SELF_CLOSE = Tokens.TOKEN_SELF_CLOSE, exports.TOKEN_NAME = Tokens.TOKEN_NAME, exports.TOKEN_IGNORED = Tokens.TOKEN_IGNORED, exports.INTERGER = Tokens.INTERGER, exports.COMMENT = Tokens.COMMENT, exports.DIRECTIVE = Tokens.DIRECTIVE, exports.SourceCharacter = Tokens.SourceCharacter; 28 | // regex match patterns 29 | exports.regexName = /^[a-zA-z]+[0-9]*([-_:']*[a-zA-z0-9]*)*/; 30 | // 关键字 31 | exports.keywords = {}; 32 | exports.tokenNameMap = { 33 | [exports.TOKEN_EOF]: "EOF", 34 | [exports.TOKEN_LEFT_PAREN]: "<", 35 | [exports.TOKEN_TAG_NAME]: "tagNmae", 36 | [exports.TOKEN_RIGHT_PAREN]: ">", 37 | [exports.TOKEN_EQUAL]: "=", 38 | [exports.TOKEN_QUOTE]: "\"", 39 | [exports.TOKEN_SINGLE_QUOTE]: "'", 40 | [exports.TOKEN_LEFT_LINE]: "/", 41 | [exports.TOKEN_DUOQUOTE]: "\"\"", 42 | [exports.TOKEN_CONTENT_TEXT]: "ContentText", 43 | 
[exports.TOKEN_CLOSE]: "close", 44 | [exports.TOKEN_DTD]: "dtd", 45 | [exports.TOKEN_SELF_CLOSE]: "self-close", 46 | [exports.TOKEN_NAME]: "Name", 47 | [exports.TOKEN_IGNORED]: "Ignored", 48 | [exports.INTERGER]: "INTERGER", 49 | [exports.COMMENT]: "COMMENT", 50 | [exports.DIRECTIVE]: "DIRECTIVE", 51 | [exports.SourceCharacter]: "SourceCharacter", 52 | }; 53 | class Lexer { 54 | constructor(sourceCode, lineNum, nextToken, nextTokenType, nextTokenLineNum) { 55 | this.sourceCode = sourceCode; 56 | this.lineNum = lineNum; 57 | this.nextToken = nextToken; 58 | this.nextTokenType = nextTokenType; 59 | this.nextTokenLineNum = nextTokenLineNum; 60 | this.hasCache = false; 61 | this.stack = []; 62 | } 63 | get judgeIsContent() { 64 | const length = this.stack.length - 1; 65 | return this.stack[length].tokenType === exports.TOKEN_RIGHT_PAREN /*>*/ || 66 | this.stack[length].tokenType === exports.TOKEN_SELF_CLOSE /*/>
    */ || 67 | this.stack[length].tokenType === exports.TOKEN_DTD /*dtd*/ || 68 | this.stack[length].tokenType === exports.COMMENT /**/ || 69 | this.stack[length].tokenType === exports.TOKEN_CONTENT_TEXT; /*ContentText*/ 70 | } 71 | get isContentText() { 72 | if (this.stack.length < 1) { 73 | if (this.sourceCode[0] === "<") { 74 | return false; 75 | } 76 | return true; 77 | } 78 | let origin = this.sourceCode; 79 | // while (this.stack.length > 10) { 80 | // this.stack.shift() 81 | // } 82 | if (this.judgeIsContent) { 83 | // this.isIgnored() 84 | //