*/
172 | if (stack[length - 1].tokenType === lexer_1.TOKEN_CONTENT_TEXT /*contentText*/) {
173 | if ((lexer.sourceCode[0] === "<" &&
174 | lexer_1.regexName.test(lexer.sourceCode[1])) /*contentText
*/ ||
175 | lexer.sourceCode.slice(0, 2) === " || */) {
177 | return false;
178 | }
179 | else {
180 | /*contentText
*/
181 | if ((lexer.sourceCode[0] === "<"
182 | && lexer_1.regexName.test(lexer.sourceCode[1]))) {
183 | let parseRes = Html_1.parseHtml(lexer);
184 | if (parseRes.selfClose) {
185 | return false;
186 | }
187 | }
188 | return true;
189 | }
190 | }
191 | // return true
192 | throw new Error(`not find contentEnd! at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 100)}`);
193 | }
// Parses a run of raw text characters into a Node of type "text".
// Consumes characters one (or two, for CRLF/LFCR pairs) at a time until
// contentEnd() reports the end of the text run or the source is exhausted,
// incrementing lexer.lineNum for every newline encountered.
194 | function parseText(lexer) {
// Invalidate the lexer's one-token lookahead cache: we read sourceCode directly.
195 | lexer.hasCache = false;
196 | let node = new Node();
197 | if (!lexer.check) {
198 | node.nextSibling = null;
199 | }
200 | // lexer.isIgnored();
201 | node.LineNum = lexer.GetLineNum();
202 | let content = "";
// Main accumulation loop: copy characters verbatim, tracking line numbers.
203 | while (contentEnd(lexer) && !lexer.isEmpty()) {
204 | if (lexer.nextSourceCodeIs("\r\n") || lexer.nextSourceCodeIs("\n\r")) {
205 | lexer.lineNum += 1;
206 | content += lexer.sourceCode.slice(0, 2);
207 | lexer.skipSourceCode(2);
208 | }
209 | else {
210 | if (lexer.isNewLine(lexer.sourceCode[0])) {
211 | lexer.lineNum += 1;
212 | content += lexer.sourceCode[0];
213 | lexer.skipSourceCode(1);
214 | }
215 | else {
216 | content += lexer.sourceCode[0];
217 | lexer.skipSourceCode(1);
218 | }
219 | }
220 | }
// Special-tag handling (e.g. raw-text elements): when the tag three entries
// down the token stack is "special", look for its literal close sequence.
221 | if (lexer.stack.length >= 3 &&
222 | tagClose_1.isSpecialTag({ tag: lexer.stack[lexer.stack.length - 3].token })) {
223 | let token = lexer.stack[lexer.stack.length - 3].token;
224 | let tokenLen = `${token}>`.length;
225 | if (lexer.sourceCode.slice(0, tokenLen) === `${token}>`) {
// NOTE(review): only 2 characters are skipped although the matched close
// sequence is tokenLen characters long — verify against the lexer cursor
// handling in parseClose/GetNextToken before changing.
226 | lexer.skipSourceCode(2);
227 | let res = { lineNum: lexer.lineNum, tokenType: lexer_1.TOKEN_CLOSE, token: "" };
228 | lexer.stack.push(res);
229 | tagClose_1.parseClose(lexer);
230 | lexer.GetNextToken();
// Second accumulation loop, identical in shape to the first: continue
// collecting text that follows the special tag's close sequence.
231 | while (contentEnd(lexer) && !lexer.isEmpty()) {
232 | if (lexer.nextSourceCodeIs("\r\n") || lexer.nextSourceCodeIs("\n\r")) {
233 | lexer.lineNum += 1;
234 | content += lexer.sourceCode.slice(0, 2);
235 | lexer.skipSourceCode(2);
236 | }
237 | else {
238 | if (lexer.isNewLine(lexer.sourceCode[0])) {
239 | lexer.lineNum += 1;
240 | content += lexer.sourceCode[0];
241 | lexer.skipSourceCode(1);
242 | }
243 | else {
244 | content += lexer.sourceCode[0];
245 | lexer.skipSourceCode(1);
246 | }
247 | }
248 | }
// NOTE(review): Array.prototype.splice's 2nd argument is a delete COUNT, not
// an end index — `lexer.stack.length - 1` here deletes almost the whole tail
// starting at length-4. Confirm this is the intended stack cleanup.
249 | lexer.stack.splice(lexer.stack.length - 4, lexer.stack.length - 1);
250 | }
251 | }
252 | // lexer.isIgnored();
253 | node.content = content;
254 | node.type = "text";
255 | return node;
256 | }
257 | exports.parseText = parseText;
258 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HTML Parser
2 |
3 | ## 解析HTML
4 |
5 | [](https://github.com/liulinboyi/HTMLParser/actions/workflows/tests.yml)
6 |
7 | ## HTML
8 |
9 | ```html
10 | <!DOCTYPE html>
11 | <html lang="en">
12 | <head>
13 |     <meta charset="UTF-8">
14 |     <meta http-equiv="X-UA-Compatible" content="IE=edge">
15 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
16 |     <title>Document</title>
17 | </head>
18 | <body>
19 |     <div>
20 |         <h1 v-if="res.value" name="11" @click="tes">11{{res.value}}</h1>
21 |     </div>
22 |     <a href="http://github.com/"></a>
23 | </body>
24 | </html>
25 | ```
26 |
27 | ## AST
28 |
29 | 点击查看详情(Click to view details)
30 |
31 | {
32 | "type": "root",
33 | "children": [
34 | {
35 | "type": "DTD",
36 | "LineNum": 1,
37 | "content": "DOCTYPE html"
38 | },
39 | {
40 | "content": "\r\n",
41 | "LineNum": 1,
42 | "type": "text"
43 | },
44 | {
45 | "children": [
46 | {
47 | "content": "\r\n",
48 | "LineNum": 2,
49 | "type": "text"
50 | },
51 | {
52 | "children": [
53 | {
54 | "content": "\r\n ",
55 | "LineNum": 3,
56 | "type": "text"
57 | },
58 | {
59 | "children": [],
60 | "attr": [
61 | {
62 | "name": "charset",
63 | "value": "UTF-8"
64 | }
65 | ],
66 | "LineNum": 4,
67 | "type": "tag",
68 | "tag": "meta"
69 | },
70 | {
71 | "content": "\r\n ",
72 | "LineNum": 4,
73 | "type": "text"
74 | },
75 | {
76 | "children": [],
77 | "attr": [
78 | {
79 | "name": "http-equiv",
80 | "value": "X-UA-Compatible"
81 | },
82 | {
83 | "name": "content",
84 | "value": "IE=edge"
85 | }
86 | ],
87 | "LineNum": 5,
88 | "type": "tag",
89 | "tag": "meta"
90 | },
91 | {
92 | "content": "\r\n ",
93 | "LineNum": 5,
94 | "type": "text"
95 | },
96 | {
97 | "children": [],
98 | "attr": [
99 | {
100 | "name": "name",
101 | "value": "viewport"
102 | },
103 | {
104 | "name": "content",
105 | "value": "width=device-width, initial-scale=1.0"
106 | }
107 | ],
108 | "LineNum": 6,
109 | "type": "tag",
110 | "tag": "meta"
111 | },
112 | {
113 | "content": "\r\n ",
114 | "LineNum": 6,
115 | "type": "text"
116 | },
117 | {
118 | "children": [
119 | {
120 | "content": "Document",
121 | "LineNum": 7,
122 | "type": "text"
123 | }
124 | ],
125 | "attr": [],
126 | "LineNum": 7,
127 | "type": "tag",
128 | "tag": "title"
129 | },
130 | {
131 | "content": "\r\n",
132 | "LineNum": 7,
133 | "type": "text"
134 | }
135 | ],
136 | "attr": [],
137 | "LineNum": 3,
138 | "type": "tag",
139 | "tag": "head"
140 | },
141 | {
142 | "content": "\r\n",
143 | "LineNum": 8,
144 | "type": "text"
145 | },
146 | {
147 | "children": [
148 | {
149 | "content": "\r\n ",
150 | "LineNum": 9,
151 | "type": "text"
152 | },
153 | {
154 | "children": [
155 | {
156 | "content": "\r\n ",
157 | "LineNum": 10,
158 | "type": "text"
159 | },
160 | {
161 | "children": [
162 | {
163 | "content": "11{{res.value}}",
164 | "LineNum": 11,
165 | "type": "text"
166 | }
167 | ],
168 | "attr": [
169 | {
170 | "name": "v-if",
171 | "value": "res.value"
172 | },
173 | {
174 | "name": "name",
175 | "value": "11"
176 | },
177 | {
178 | "name": "@click",
179 | "value": "tes"
180 | }
181 | ],
182 | "LineNum": 11,
183 | "type": "tag",
184 | "tag": "h1"
185 | },
186 | {
187 | "content": "\r\n ",
188 | "LineNum": 11,
189 | "type": "text"
190 | }
191 | ],
192 | "attr": [],
193 | "LineNum": 10,
194 | "type": "tag",
195 | "tag": "div"
196 | },
197 | {
198 | "content": "\r\n ",
199 | "LineNum": 12,
200 | "type": "text"
201 | },
202 | {
203 | "children": [],
204 | "attr": [
205 | {
206 | "name": "href",
207 | "value": "http://github.com/"
208 | }
209 | ],
210 | "LineNum": 13,
211 | "type": "tag",
212 | "tag": "a"
213 | },
214 | {
215 | "content": "\r\n",
216 | "LineNum": 13,
217 | "type": "text"
218 | }
219 | ],
220 | "attr": [],
221 | "LineNum": 9,
222 | "type": "tag",
223 | "tag": "body"
224 | },
225 | {
226 | "content": "\r\n",
227 | "LineNum": 14,
228 | "type": "text"
229 | }
230 | ],
231 | "attr": [
232 | {
233 | "name": "lang",
234 | "value": "en"
235 | }
236 | ],
237 | "LineNum": 2,
238 | "type": "tag",
239 | "tag": "html"
240 | }
241 | ],
242 | "LineNum": 1
243 | }
244 |
245 |
246 |
247 | ## 添加应用
248 | [查找节点](https://github.com/liulinboyi/HTMLParser-App/tree/main/platform)
249 |
250 | ## TIPS
251 |
252 | > 无运行时依赖
253 |
254 | 没有做到浏览器那样兼容性巨好,HTML写成啥样都不报错都会解析,我只解析了一部分奇葩写法~有的HTML写法太奇葩了,要兼容就需要更多的分支和处理,需要更多的精力就算了。
255 |
256 | ## 注意
257 |
258 | #### ~~tsc编译后无法加上.js后缀,导致无法使用module,所以在所有ts文件导入加上了js后缀~~
259 | #### ~~https://segmentfault.com/q/1010000038671707~~
260 | #### ~~[社区讨论](https://github.com/microsoft/TypeScript/issues/16577)~~
261 |
262 | #### 已解决,写了个[脚本](./script/addSuffixJs.js),将所有编译后的ES modules的导入导出部分加上了js后缀
263 |
264 | ## [测试](./test)
265 | #### 使用[playwright](https://github.com/microsoft/playwright.git)和浏览器生成的DOM结构做了对比,除了一些奇葩写法,其他基本没问题。
266 |
--------------------------------------------------------------------------------
/dist/parser.js:
--------------------------------------------------------------------------------
1 | "use strict";
2 | Object.defineProperty(exports, "__esModule", { value: true });
3 | exports.parse = exports.Program = void 0;
4 | const lexer_1 = require("./lexer");
5 | const Comment_1 = require("./parser/Comment");
6 | const Directive_1 = require("./parser/Directive");
7 | const DTD_1 = require("./parser/DTD");
8 | const Html_1 = require("./parser/Html");
9 | const parseText_1 = require("./parser/parseText");
10 | const tagClose_1 = require("./parser/tagClose");
// Root AST node produced by the parser: type "root" with an (initially empty)
// list of child statement nodes.
11 | class Program {
12 | constructor() {
13 | this.type = 'root';
14 | this.children = [];
15 | }
16 | }
17 | exports.Program = Program;
18 | // SourceCode ::= Statement+
// Parses the whole source into a root node, stamping it with the line number
// at which parsing started (normally 1).
19 | function parseSourceCode(lexer, check) {
20 | let LineNum = lexer.GetLineNum();
21 | let root = parseStatements(lexer, check);
22 | root.LineNum = LineNum;
23 | return root;
24 | }
25 | /**
26 | * 将children中的多余的text节点去除
27 | * @param children
28 | * @returns
29 | */
// Removes redundant text nodes: whenever two adjacent children are both text,
// one of them is marked `delete` (the whitespace-only one if possible,
// otherwise the first) and all marked nodes are filtered out.
30 | function filterText(children) {
31 | for (let start = 0; start < children.length; start++) {
32 | if (children[start].type === "text") {
33 | // 从实践中知道,如果有去除body后多余的text节点,则最多是两个取一个,所以有下面代码
34 | let i = start + 1;
35 | if (i < children.length && children[i].type === "text") {
36 | // 其中重要的特征就是,里面是只有\r\n和空格
37 | // 只要当前标签和下一个标签这两个标签,则一定会删除一个"空标签(只包含\r\n和空格)"
// Empty after stripping newlines and trimming => whitespace-only text node.
38 | if (!children[i].content.replace(/[\r\n]+/g, "").trim()) {
39 | children[i].delete = true; // 添加上delete属性,后面好处理
40 | }
41 | else {
42 | children[start].delete = true; // 添加上delete属性,后面好处理
43 | }
44 | }
45 | }
46 | }
47 | // 删除delete为true的标签
48 | return children.filter((item) => !item.delete);
49 | }
50 | // Statement
// Core statement loop: repeatedly parses statements and maintains an open-tag
// stack (`statements`, seeded with the root node) to build the tree. Handles
// browser-like recovery: mismatched close tags warn instead of throwing,
// elements that cannot nest in themselves (notInSelf) are auto-closed, and
// duplicate <body> elements are merged into the first one.
// NOTE(review): this is compiled output of /src/parser.ts; apply fixes there.
51 | function parseStatements(lexer, check) {
52 | if (check) {
53 | lexer.check = true;
54 | }
55 | let root = {
56 | type: "root",
57 | children: [],
58 | LineNum: 1
59 | };
60 | let statements = [root];
61 | let Block_level_elements = [
62 | "address",
63 | "article",
64 | "aside",
65 | "audio",
66 | "blockquote",
67 | "canvas",
68 | "dd",
69 | // "div",
70 | "dl",
71 | "fieldset",
72 | "figcaption",
73 | "figure",
74 | "figcaption",
75 | "footer",
76 | "form",
77 | "header",
78 | "hgroup",
79 | "hr",
80 | "noscript",
81 | "ol",
82 | "output",
83 | "p",
84 | "pre",
85 | "section",
86 | "table",
87 | "tfoot",
88 | "ul",
89 | "video"
90 | ];
// NOTE(review): this list is never read in this function (name is also
// scrambled; presumably "inline_elements").
91 | let inlInline_elementsine = [
92 | "b",
93 | "big",
94 | "i",
95 | "small",
96 | "tt",
97 | "abbr",
98 | "acronym",
99 | "cite",
100 | "code",
101 | "dfn",
102 | "em",
103 | "kbd",
104 | "strong",
105 | "samp",
106 | "var",
107 | "a",
108 | "bdo",
109 | "br",
110 | "img",
111 | "map",
112 | "object",
113 | "q",
114 | "script",
115 | "span",
116 | "sub",
117 | "sup",
118 | "button",
119 | "input",
120 | "label",
121 | "select",
122 | "textarea"
123 | ];
124 | let notInSelf = [
125 | "a",
126 | "br",
127 | "img",
128 | "script",
129 | "button",
130 | "input",
131 | ];
// NOTE(review): the two gutter-less lines below are remnants of comments whose
// <select>/<textarea> examples were stripped by the text extraction that
// produced this dump; they are not valid JavaScript here.
132 | // select
里面的select会消失
133 | // textarea
会解析成
134 | let body = null;
135 | let mainBodyFinished = false;
136 | let uniqueStack = [];
137 | // let mainBodyFinishedIsText = false
138 | // 先调用LookAhead一次,将GetNextToken的结果缓存
139 | while (!isSourceCodeEnd(lexer.LookAhead().tokenType)) {
140 | // if (lexer.GetLineNum() === 20) {
141 | // debugger
142 | // }
143 | let statement = {};
144 | statement = parseStatement(lexer);
145 | // console.log(`at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 30)}`)
146 | if (!statement)
147 | continue;
148 | let stack = statements;
149 | let s = statement;
150 | const length = stack.length - 1;
151 | if (s.type === "tag") {
152 | s.tag = s.tag.toLocaleLowerCase();
153 | }
154 | if (!s.closeTag) {
155 | uniqueStack = [];
156 | if (notInSelf.includes(s.tag) && s.tag === stack[length].tag) { // 不能包含自己的元素
157 | stack.pop();
158 | stack[stack.length - 1].children.push(s);
159 | stack.push(s);
160 | if (check) {
161 | s.parent = stack[stack.length - 1];
162 | }
163 | continue;
164 | }
165 | // 处理多个body标签的问题
166 | // 如果mainBodyFinished位false,表示还未出现第一个body,并且当前起始标签是body,则寻找他的父节点,并将其赋值给body变量
167 | if (!mainBodyFinished && s.tag === "body" && !body) {
168 | // 寻找父节点
169 | let i = stack.length - 1;
170 | let parent = null;
// NOTE(review): the condition reads stack[i] BEFORE checking i >= 0; if no
// "tag" ancestor exists (stack[0] is the root, type "root"), stack[-1] is
// undefined and `.type` throws. Should be `i >= 0 && stack[i].type !== "tag"`.
// Fixed in /src/parser.ts.
171 | while (stack[i].type !== "tag" && i >= 0) {
172 | i--;
173 | }
174 | parent = i >= 0 ? stack[i] : null;
175 | // 找到的节点,赋值给body
176 | body = s;
177 | // 找到的父节点赋值给上面节点的parent属性,方便后续处理
178 | body.parent = parent;
179 | }
180 | stack[length].children.push(s); // 栈顶就是levalElement层级元素
181 | if (check) {
182 | s.parent = stack[length];
183 | }
184 | if (s.type === "tag" && !s.selfClose && !tagClose_1.isSpecialTag(s)) {
185 | stack.push(s);
186 | // 处理多个body标签的问题
187 | // 如果已经出现过一个body标签并且现在这个起始标签还是body,则将其从栈中弹出,并且将其从栈顶的children中弹出
188 | if (mainBodyFinished && s.tag === "body") {
189 | stack.pop();
190 | stack[length].children.pop();
191 | if (check) {
192 | s.parent = null;
193 | }
194 | }
195 | }
196 | // 处理多个body标签的问题
197 | // 如果出现第一个body起始标签,则将mainBodyFinished置为true,方便在第一个body标签中再次出现body起始标签时将其忽略
198 | if (!mainBodyFinished && s.tag === "body") {
199 | mainBodyFinished = true;
200 | }
201 | }
202 | else {
203 | if (stack[length].tag !== s.tag) {
204 | uniqueStack.push(s);
205 | // 处理多个body标签的问题
206 | // 如果当前第一个body标签解析完成(mainBodyFinished),并且当前结束标签是body,则直接进行下次循环
207 | if (mainBodyFinished && s.tag === "body") {
208 | continue;
209 | }
210 | if (Block_level_elements.includes(s.tag)) { // 如果是块级元素会加入到levalElement层级元素当child
211 | stack[length].children.push(s);
212 | if (check) {
213 | s.parent = stack[length];
214 | }
215 | }
216 | // 学习浏览器HTML解析,即使匹配不上也不报错,直接添加到levalElement层级元素当child
// NOTE(review): "is not math" is a typo for "match" in this warning message
// (fixed in /src/parser.ts).
217 | console.warn(`${stack[length].tag} and ${s.tag} is not math! at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 100)}`);
218 | // throw new Error(`${stack[length].tag} and ${s.tag} is not math! at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 100)}`)
219 | }
220 | else {
221 | // 处理多个body标签的问题
222 | // 如果第一个body标签没有解析完成(mainBodyFinished),并且当前结束标签是body,则mainBodyFinished置为true
223 | if (!mainBodyFinished && s.tag === "body") {
224 | mainBodyFinished = true;
225 | }
226 | stack.pop();
227 | if (uniqueStack.length > 0 && uniqueStack[uniqueStack.length - 1].tag === stack[stack.length - 1].tag) {
228 | uniqueStack.pop();
229 | stack.pop();
230 | }
231 | }
232 | }
233 | }
234 | // 处理多个body标签的问题
235 | // 找出body在父节点的索引
// NOTE(review): body.parent may be null (see the parent lookup above), in
// which case `.children` throws here. Guarded in /src/parser.ts.
236 | let index = body && body.parent.children.findIndex((item) => item === body);
237 | // 从父节点下一个索引开始添加到第一个body中
238 | let real = index + 1;
239 | if (body) {
240 | for (let i = real; i < body.parent.children.length; i++) {
241 | if (body.parent.children[i].type === "tag") {
242 | body.parent.children[i].children = filterText(body.parent.children[i].children);
243 | }
244 | body.children.push(body.parent.children[i]);
245 | }
246 | let childrenLength = body.parent.children.length;
247 | for (let i = real; i < childrenLength; i++) {
248 | body.parent.children.pop();
249 | }
250 | body.children = filterText(body.children);
251 | body.parent = null;
252 | }
// Drop whitespace-only text nodes immediately preceding the DTD or <html>.
253 | for (let i = 0; i < root.children.length; i++) {
254 | if (root.children[i].type === "DTD") {
255 | if (i - 1 >= 0 && root.children[i - 1].type === "text" && !root.children[i - 1].content.replace(/[\r\n]+/g, "").trim()) {
256 | root.children[i - 1].delete = true;
257 | }
258 | }
259 | if (root.children[i].tag === "html") {
260 | if (i - 1 >= 0 && root.children[i - 1].type === "text" && !root.children[i - 1].content.replace(/[\r\n]+/g, "").trim()) {
261 | root.children[i - 1].delete = true;
262 | }
263 | }
264 | if (check) {
265 | root.children[i].parent = null;
266 | }
267 | }
268 | root.children = root.children.filter((item) => !item.delete);
269 | return root;
270 | }
// Dispatches on the lookahead token: when the token stack's top is content
// text, parses a text node; otherwise routes to the tag / close-tag / DTD /
// comment / directive parser. Throws on an unrecognized token.
271 | function parseStatement(lexer) {
272 | // 向前看一个token并跳过
273 | lexer.LookAheadAndSkip(lexer_1.TOKEN_IGNORED); // skip if source code start with ignored token
274 | let look = lexer.LookAhead().tokenType;
275 | let flag = false;
276 | let top = lexer.stack[lexer.stack.length - 1];
277 | if (top.tokenType === lexer_1.TOKEN_CONTENT_TEXT
278 | // isClose(lexer) &&
279 | // top.tokenType !== TOKEN_LEFT_PAREN /*<*/ &&
280 | // top.tokenType !== TOKEN_CLOSE /**/ &&
281 | // top.tokenType !== TOKEN_DTD /*DTD*/ &&
282 | // top.tokenType !== COMMENT /*COMMENT*/
283 | ) {
284 | flag = true;
285 | }
286 | else {
287 | flag = false;
288 | }
289 | if (flag) {
290 | return parseText_1.parseText(lexer);
291 | }
292 | else {
293 | switch (look) {
294 | case lexer_1.TOKEN_LEFT_PAREN: // <
295 | return Html_1.parseHtml(lexer);
296 | case lexer_1.TOKEN_CLOSE: //
297 | return tagClose_1.parseClose(lexer);
298 | case lexer_1.TOKEN_DTD: // dtd
299 | return DTD_1.parseDtd(lexer);
300 | case lexer_1.COMMENT:
301 | return Comment_1.paseComment(lexer);
302 | case lexer_1.DIRECTIVE:
303 | return Directive_1.paseDirective(lexer);
304 | default:
305 | throw new Error(`parseStatement(): unknown Statement. at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 50)}`);
306 | }
307 | }
308 | }
// True when the given token type is the end-of-file marker.
309 | function isSourceCodeEnd(token) {
310 | return token === lexer_1.TOKEN_EOF;
311 | }
// Public entry point: lexes + parses `code` into a root AST node.
// `check` enables parent back-references during parsing (stripped before
// returning, see parseStatements). Asserts the lexer ends on EOF.
312 | function parse(code, check) {
313 | let lexer = lexer_1.NewLexer(code);
314 | let sourceCode = parseSourceCode(lexer, check);
315 | lexer.NextTokenIs(lexer_1.TOKEN_EOF);
316 | return sourceCode;
317 | }
318 | exports.parse = parse;
319 |
--------------------------------------------------------------------------------
/src/parser.ts:
--------------------------------------------------------------------------------
1 | import { COMMENT, DIRECTIVE, Lexer, NewLexer, TOKEN_CLOSE, TOKEN_CONTENT_TEXT, TOKEN_DTD, TOKEN_EOF, TOKEN_IGNORED, TOKEN_LEFT_PAREN } from "./lexer"
2 | import { paseComment } from "./parser/Comment"
3 | import { paseDirective } from "./parser/Directive"
4 | import { parseDtd } from "./parser/DTD"
5 | import { parseHtml } from "./parser/Html"
6 | import { parseText } from "./parser/parseText"
7 | import { isSpecialTag, parseClose } from "./parser/tagClose"
8 |
// Root AST node shape + class (declaration-merged with the interface above).
// NOTE(review): `Array,` below almost certainly read `Array<any>,` in the real
// source — the generic parameter was stripped as an HTML tag by the extraction
// that produced this dump. Confirm against the repository.
9 | export interface Program {
10 | type?: string,
11 | LineNum?: number,
12 | children: Array,
13 | }
14 |
15 | export class Program {
16 | constructor() {
17 | this.type = 'root'
18 | this.children = []
19 | }
20 | }
21 |
22 |
23 | // SourceCode ::= Statement+
// Parses the whole source into a root node, stamping it with the line number
// at which parsing started (normally 1).
24 | function parseSourceCode(lexer: Lexer, check: boolean) {
25 | let LineNum = lexer.GetLineNum()
26 | let root = parseStatements(lexer, check)
27 | root.LineNum = LineNum
28 | return root
29 | }
30 |
31 | /**
32 | * 将children中的多余的text节点去除
33 | * @param children
34 | * @returns
35 | */
// Removes redundant text nodes: whenever two adjacent children are both text,
// one is marked `delete` (the whitespace-only one if possible, otherwise the
// first) and all marked nodes are filtered out.
36 | function filterText(children: any) {
37 | for (let start = 0; start < children.length; start++) {
38 | if (children[start].type === "text") {
39 | // 从实践中知道,如果有去除body后多余的text节点,则最多是两个取一个,所以有下面代码
40 | let i = start + 1
41 | if (i < children.length && children[i].type === "text") {
42 | // 其中重要的特征就是,里面是只有\r\n和空格
43 | // 只要当前标签和下一个标签这两个标签,则一定会删除一个"空标签(只包含\r\n和空格)"
// Empty after stripping newlines and trimming => whitespace-only text node.
44 | if (!children[i].content.replace(/[\r\n]+/g, "").trim()) {
45 | children[i].delete = true // 添加上delete属性,后面好处理
46 | } else {
47 | children[start].delete = true // 添加上delete属性,后面好处理
48 | }
49 | }
50 | }
51 | }
52 | // 删除delete为true的标签
53 | return children.filter((item: any) => !item.delete)
54 | }
55 |
56 | // Statement
/**
 * Core statement loop: repeatedly parses statements and maintains an open-tag
 * stack (`statements`, seeded with the root node) to build the tree.
 * Browser-like recovery: mismatched close tags warn instead of throwing,
 * elements that cannot nest in themselves (notInSelf) are auto-closed, and
 * duplicate <body> elements are merged into the first one.
 *
 * Fixes vs. previous revision:
 *  - parent-lookup loop checked `stack[i].type` before `i >= 0`, reading
 *    stack[-1] (undefined) and throwing when no "tag" ancestor exists;
 *  - `body.parent` was dereferenced without a null check (the lookup above can
 *    legitimately produce null);
 *  - warn message typo "is not math" -> "do not match".
 *
 * @param lexer  token source
 * @param check  when true, attach parent back-references while building
 *               (stripped from root children before returning)
 * @returns the root AST node
 */
57 | function parseStatements(lexer: Lexer, check: boolean) {
58 |
59 |     if (check) {
60 |         lexer.check = true
61 |     }
62 |
63 |     let root: any = {
64 |         type: "root",
65 |         children: [],
66 |         LineNum: 1
67 |     }
68 |
69 |     let statements: Array = [root]
70 |
71 |     let Block_level_elements = [ // 块级元素
72 |         "address",
73 |         "article",
74 |         "aside",
75 |         "audio",
76 |         "blockquote",
77 |         "canvas",
78 |         "dd",
79 |         // "div",
80 |         "dl",
81 |         "fieldset",
82 |         "figcaption",
83 |         "figure",
84 |         "figcaption",
85 |         "footer",
86 |         "form",
87 |         "header",
88 |         "hgroup",
89 |         "hr",
90 |         "noscript",
91 |         "ol",
92 |         "output",
93 |         "p",
94 |         "pre",
95 |         "section",
96 |         "table",
97 |         "tfoot",
98 |         "ul",
99 |         "video"
100 |     ]
101 |
102 |     let inlInline_elementsine = [ // 行内元素 (currently unused reference list)
103 |         "b",
104 |         "big",
105 |         "i",
106 |         "small",
107 |         "tt",
108 |         "abbr",
109 |         "acronym",
110 |         "cite",
111 |         "code",
112 |         "dfn",
113 |         "em",
114 |         "kbd",
115 |         "strong",
116 |         "samp",
117 |         "var",
118 |         "a",
119 |         "bdo",
120 |         "br",
121 |         "img",
122 |         "map",
123 |         "object",
124 |         "q",
125 |         "script",
126 |         "span",
127 |         "sub",
128 |         "sup",
129 |         "button",
130 |         "input",
131 |         "label",
132 |         "select",
133 |         "textarea"
134 |     ]
135 |
136 |     let notInSelf = [ // 不能包含自己的元素
137 |         "a",
138 |         "br",
139 |         "img",
140 |         "script",
141 |         "button",
142 |         "input",
143 |     ]
144 |     // select 里面的select会消失
145 |     // textarea 会解析成
146 |
147 |
148 |     let body: any = null
149 |     let mainBodyFinished = false
150 |     let uniqueStack = []
151 |     // let mainBodyFinishedIsText = false
152 |     // 先调用LookAhead一次,将GetNextToken的结果缓存
153 |     while (!isSourceCodeEnd(lexer.LookAhead().tokenType)) {
154 |         // if (lexer.GetLineNum() === 20) {
155 |         //     debugger
156 |         // }
157 |         let statement: any = {}
158 |         statement = parseStatement(lexer)
159 |         // console.log(`at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 30)}`)
160 |         if (!statement) continue
161 |         let stack = statements;
162 |         let s = statement;
163 |         const length = stack.length - 1
164 |         if (s.type === "tag") {
165 |             s.tag = s.tag.toLocaleLowerCase()
166 |         }
167 |         if (!s.closeTag) {
168 |
169 |             uniqueStack = []
170 |             if (notInSelf.includes(s.tag) && s.tag === stack[length].tag) { // 不能包含自己的元素
171 |                 stack.pop()
172 |                 stack[stack.length - 1].children.push(s)
173 |                 stack.push(s)
174 |                 if (check) {
175 |                     s.parent = stack[stack.length - 1]
176 |                 }
177 |                 continue
178 |             }
179 |
180 |             // 处理多个body标签的问题
181 |             // 如果mainBodyFinished为false,表示还未出现第一个body,并且当前起始标签是body,则寻找他的父节点,并将其赋值给body变量
182 |             if (!mainBodyFinished && s.tag === "body" && !body) {
183 |                 // 寻找父节点
184 |                 let i = stack.length - 1
185 |                 let parent = null
                      // Bound check must come first: stack[0] is the root
                      // (type "root"), so i can reach -1 when no "tag"
                      // ancestor exists; reading stack[-1].type would throw.
186 |                 while (i >= 0 && stack[i].type !== "tag") {
187 |                     i--;
188 |                 }
189 |                 parent = i >= 0 ? stack[i] : null;
190 |                 // 找到的节点,赋值给body
191 |                 body = s
192 |                 // 找到的父节点赋值给上面节点的parent属性,方便后续处理
193 |                 body.parent = parent
194 |             }
195 |
196 |             stack[length].children.push(s) // 栈顶就是levalElement层级元素
197 |             if (check) {
198 |                 s.parent = stack[length]
199 |             }
200 |             if (s.type === "tag" && !s.selfClose && !isSpecialTag(s)) {
201 |                 stack.push(s)
202 |                 // 处理多个body标签的问题
203 |                 // 如果已经出现过一个body标签并且现在这个起始标签还是body,则将其从栈中弹出,并且将其从栈顶的children中弹出
204 |                 if (mainBodyFinished && s.tag === "body") {
205 |                     stack.pop()
206 |                     stack[length].children.pop()
207 |                     if (check) {
208 |                         s.parent = null
209 |                     }
210 |                 }
211 |             }
212 |             // 处理多个body标签的问题
213 |             // 如果出现第一个body起始标签,则将mainBodyFinished置为true,方便在第一个body标签中再次出现body起始标签时将其忽略
214 |             if (!mainBodyFinished && s.tag === "body") {
215 |                 mainBodyFinished = true
216 |             }
217 |         } else {
218 |             if (stack[length].tag !== s.tag) {
219 |                 uniqueStack.push(s)
220 |                 // 处理多个body标签的问题
221 |                 // 如果当前第一个body标签解析完成(mainBodyFinished),并且当前结束标签是body,则直接进行下次循环
222 |                 if (mainBodyFinished && s.tag === "body") {
223 |                     continue
224 |                 }
225 |                 if (Block_level_elements.includes(s.tag)) { // 如果是块级元素会加入到levalElement层级元素当child
226 |                     stack[length].children.push(s)
227 |                     if (check) {
228 |                         s.parent = stack[length]
229 |                     }
230 |                 }
231 |                 // 学习浏览器HTML解析,即使匹配不上也不报错,直接添加到levalElement层级元素当child
232 |                 console.warn(`${stack[length].tag} and ${s.tag} do not match! at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 100)}`)
233 |                 // throw new Error(`${stack[length].tag} and ${s.tag} do not match! at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 100)}`)
234 |             } else {
235 |                 // 处理多个body标签的问题
236 |                 // 如果第一个body标签没有解析完成(mainBodyFinished),并且当前结束标签是body,则mainBodyFinished置为true
237 |                 if (!mainBodyFinished && s.tag === "body") {
238 |                     mainBodyFinished = true
239 |                 }
240 |                 stack.pop()
241 |                 if (uniqueStack.length > 0 && uniqueStack[uniqueStack.length - 1].tag === stack[stack.length - 1].tag) {
242 |                     uniqueStack.pop()
243 |                     stack.pop()
244 |                 }
245 |             }
246 |         }
247 |     }
248 |
249 |     // 处理多个body标签的问题
250 |     // 找出body在父节点的索引
      // body.parent can be null (lookup above); guard before dereferencing.
251 |     let index = (body && body.parent) ? body.parent.children.findIndex((item: any) => item === body) : -1
252 |     // 从父节点下一个索引开始添加到第一个body中
253 |     let real = index + 1
254 |
255 |     if (body && body.parent) {
256 |
257 |         for (let i = real; i < body.parent.children.length; i++) {
258 |             if (body.parent.children[i].type === "tag") {
259 |                 body.parent.children[i].children = filterText(body.parent.children[i].children)
260 |             }
261 |             body.children.push(body.parent.children[i])
262 |         }
263 |
264 |         let childrenLength = body.parent.children.length
265 |         for (let i = real; i < childrenLength; i++) {
266 |             body.parent.children.pop()
267 |         }
268 |
269 |         body.children = filterText(body.children)
270 |
271 |         body.parent = null
272 |     }
273 |
274 |
      // Drop whitespace-only text nodes immediately preceding the DTD or <html>.
275 |     for (let i = 0; i < root.children.length; i++) {
276 |         if (root.children[i].type === "DTD") {
277 |             if (i - 1 >= 0 && root.children[i - 1].type === "text" && !root.children[i - 1].content.replace(/[\r\n]+/g, "").trim()) {
278 |                 root.children[i - 1].delete = true
279 |             }
280 |         }
281 |         if (root.children[i].tag === "html") {
282 |             if (i - 1 >= 0 && root.children[i - 1].type === "text" && !root.children[i - 1].content.replace(/[\r\n]+/g, "").trim()) {
283 |                 root.children[i - 1].delete = true
284 |             }
285 |         }
286 |         if (check) {
287 |             root.children[i].parent = null
288 |         }
289 |     }
290 |
291 |     root.children = root.children.filter((item: any) => !item.delete)
292 |
293 |     return root
294 | }
295 |
// Dispatches on the lookahead token: when the token stack's top is content
// text, parses a text node; otherwise routes to the tag / close-tag / DTD /
// comment / directive parser. Throws on an unrecognized token.
296 | function parseStatement(lexer: Lexer) {
297 |     // 向前看一个token并跳过
298 |     lexer.LookAheadAndSkip(TOKEN_IGNORED) // skip if source code start with ignored token
299 |     let look = lexer.LookAhead().tokenType
300 |     let flag = false
301 |     let top = lexer.stack[lexer.stack.length - 1]
302 |     if (
303 |         top.tokenType === TOKEN_CONTENT_TEXT
304 |         // isClose(lexer) &&
305 |         // top.tokenType !== TOKEN_LEFT_PAREN /*<*/ &&
306 |         // top.tokenType !== TOKEN_CLOSE /**/ &&
307 |         // top.tokenType !== TOKEN_DTD /*DTD*/ &&
308 |         // top.tokenType !== COMMENT /*COMMENT*/
309 |     ) {
310 |         flag = true
311 |     } else {
312 |         flag = false
313 |     }
314 |
315 |     if (flag) {
316 |         return parseText(lexer)
317 |     } else {
318 |         switch (look) {
319 |             case TOKEN_LEFT_PAREN: // <
320 |                 return parseHtml(lexer)
321 |             case TOKEN_CLOSE: //
322 |                 return parseClose(lexer)
323 |             case TOKEN_DTD: // dtd
324 |                 return parseDtd(lexer)
325 |             case COMMENT:
326 |                 return paseComment(lexer)
327 |             case DIRECTIVE:
328 |                 return paseDirective(lexer)
329 |             default:
330 |                 throw new Error(`parseStatement(): unknown Statement. at line ${lexer.GetLineNum()} ${lexer.sourceCode.slice(0, 50)}`)
331 |         }
332 |     }
333 |
334 |
335 | }
336 |
// True when the given token type is the end-of-file marker.
337 | function isSourceCodeEnd(token: number): boolean {
338 |     return token === TOKEN_EOF
339 | }
340 |
// Public entry point: lexes + parses `code` into a root AST node.
// `check` enables parent back-references during parsing (stripped before
// returning, see parseStatements). Asserts the lexer ends on EOF.
341 | export function parse(code: string, check: boolean) {
342 |
343 |     let lexer = NewLexer(code)
344 |     let sourceCode = parseSourceCode(lexer, check);
345 |
346 |     lexer.NextTokenIs(TOKEN_EOF)
347 |     return sourceCode
348 | }
349 |
--------------------------------------------------------------------------------
/demo/test26.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Document
8 |
9 |
10 |
385 |
386 |
--------------------------------------------------------------------------------
/dist/lexer.js:
--------------------------------------------------------------------------------
1 | "use strict";
2 | Object.defineProperty(exports, "__esModule", { value: true });
3 | exports.NewLexer = exports.Lexer = exports.tokenNameMap = exports.keywords = exports.regexName = exports.SourceCharacter = exports.DIRECTIVE = exports.COMMENT = exports.INTERGER = exports.TOKEN_IGNORED = exports.TOKEN_NAME = exports.TOKEN_SELF_CLOSE = exports.TOKEN_DTD = exports.TOKEN_CLOSE = exports.TOKEN_CONTENT_TEXT = exports.TOKEN_DUOQUOTE = exports.TOKEN_LEFT_LINE = exports.TOKEN_SINGLE_QUOTE = exports.TOKEN_QUOTE = exports.TOKEN_EQUAL = exports.TOKEN_RIGHT_PAREN = exports.TOKEN_TAG_NAME = exports.TOKEN_LEFT_PAREN = exports.TOKEN_EOF = exports.Tokens = void 0;
4 | // token const
// TypeScript-compiled numeric enum of all token kinds produced by the lexer
// (0 = EOF .. 18 = SourceCharacter); forward and reverse mappings.
5 | var Tokens;
6 | (function (Tokens) {
7 | Tokens[Tokens["TOKEN_EOF"] = 0] = "TOKEN_EOF";
8 | Tokens[Tokens["TOKEN_LEFT_PAREN"] = 1] = "TOKEN_LEFT_PAREN";
9 | Tokens[Tokens["TOKEN_TAG_NAME"] = 2] = "TOKEN_TAG_NAME";
10 | Tokens[Tokens["TOKEN_RIGHT_PAREN"] = 3] = "TOKEN_RIGHT_PAREN";
11 | Tokens[Tokens["TOKEN_EQUAL"] = 4] = "TOKEN_EQUAL";
12 | Tokens[Tokens["TOKEN_QUOTE"] = 5] = "TOKEN_QUOTE";
13 | Tokens[Tokens["TOKEN_SINGLE_QUOTE"] = 6] = "TOKEN_SINGLE_QUOTE";
14 | Tokens[Tokens["TOKEN_LEFT_LINE"] = 7] = "TOKEN_LEFT_LINE";
15 | Tokens[Tokens["TOKEN_DUOQUOTE"] = 8] = "TOKEN_DUOQUOTE";
16 | Tokens[Tokens["TOKEN_CONTENT_TEXT"] = 9] = "TOKEN_CONTENT_TEXT";
17 | Tokens[Tokens["TOKEN_CLOSE"] = 10] = "TOKEN_CLOSE";
18 | Tokens[Tokens["TOKEN_DTD"] = 11] = "TOKEN_DTD";
19 | Tokens[Tokens["TOKEN_SELF_CLOSE"] = 12] = "TOKEN_SELF_CLOSE";
20 | Tokens[Tokens["TOKEN_NAME"] = 13] = "TOKEN_NAME";
21 | Tokens[Tokens["TOKEN_IGNORED"] = 14] = "TOKEN_IGNORED";
22 | Tokens[Tokens["INTERGER"] = 15] = "INTERGER";
23 | Tokens[Tokens["COMMENT"] = 16] = "COMMENT";
24 | Tokens[Tokens["DIRECTIVE"] = 17] = "DIRECTIVE";
25 | Tokens[Tokens["SourceCharacter"] = 18] = "SourceCharacter";
26 | })(Tokens = exports.Tokens || (exports.Tokens = {}));
27 | exports.TOKEN_EOF = Tokens.TOKEN_EOF, exports.TOKEN_LEFT_PAREN = Tokens.TOKEN_LEFT_PAREN, exports.TOKEN_TAG_NAME = Tokens.TOKEN_TAG_NAME, exports.TOKEN_RIGHT_PAREN = Tokens.TOKEN_RIGHT_PAREN, exports.TOKEN_EQUAL = Tokens.TOKEN_EQUAL, exports.TOKEN_QUOTE = Tokens.TOKEN_QUOTE, exports.TOKEN_SINGLE_QUOTE = Tokens.TOKEN_SINGLE_QUOTE, exports.TOKEN_LEFT_LINE = Tokens.TOKEN_LEFT_LINE, exports.TOKEN_DUOQUOTE = Tokens.TOKEN_DUOQUOTE, exports.TOKEN_CONTENT_TEXT = Tokens.TOKEN_CONTENT_TEXT, exports.TOKEN_CLOSE = Tokens.TOKEN_CLOSE, exports.TOKEN_DTD = Tokens.TOKEN_DTD, exports.TOKEN_SELF_CLOSE = Tokens.TOKEN_SELF_CLOSE, exports.TOKEN_NAME = Tokens.TOKEN_NAME, exports.TOKEN_IGNORED = Tokens.TOKEN_IGNORED, exports.INTERGER = Tokens.INTERGER, exports.COMMENT = Tokens.COMMENT, exports.DIRECTIVE = Tokens.DIRECTIVE, exports.SourceCharacter = Tokens.SourceCharacter;
28 | // regex match patterns
// Tag/attribute name pattern.
// NOTE(review): `[a-zA-z]` (lowercase final z) is almost certainly a typo for
// `[a-zA-Z]` — in both places. The range A-z also matches the characters
// [ \ ] ^ _ and backtick, so e.g. "_foo" would be accepted as a name start.
// Fix belongs in /src/lexer.ts; this file is compiled output.
29 | exports.regexName = /^[a-zA-z]+[0-9]*([-_:']*[a-zA-z0-9]*)*/;
30 | // 关键字
31 | exports.keywords = {};
// Human-readable token names, used in error messages.
// NOTE(review): "tagNmae" is a (harmless) typo for "tagName" in diagnostics.
32 | exports.tokenNameMap = {
33 | [exports.TOKEN_EOF]: "EOF",
34 | [exports.TOKEN_LEFT_PAREN]: "<",
35 | [exports.TOKEN_TAG_NAME]: "tagNmae",
36 | [exports.TOKEN_RIGHT_PAREN]: ">",
37 | [exports.TOKEN_EQUAL]: "=",
38 | [exports.TOKEN_QUOTE]: "\"",
39 | [exports.TOKEN_SINGLE_QUOTE]: "'",
40 | [exports.TOKEN_LEFT_LINE]: "/",
41 | [exports.TOKEN_DUOQUOTE]: "\"\"",
42 | [exports.TOKEN_CONTENT_TEXT]: "ContentText",
43 | [exports.TOKEN_CLOSE]: "close",
44 | [exports.TOKEN_DTD]: "dtd",
45 | [exports.TOKEN_SELF_CLOSE]: "self-close",
46 | [exports.TOKEN_NAME]: "Name",
47 | [exports.TOKEN_IGNORED]: "Ignored",
48 | [exports.INTERGER]: "INTERGER",
49 | [exports.COMMENT]: "COMMENT",
50 | [exports.DIRECTIVE]: "DIRECTIVE",
51 | [exports.SourceCharacter]: "SourceCharacter",
52 | };
53 | class Lexer {
// Lexer state: remaining source text, current line, a one-token lookahead
// cache (hasCache + nextToken*), and a history stack of emitted tokens.
54 | constructor(sourceCode, lineNum, nextToken, nextTokenType, nextTokenLineNum) {
55 | this.sourceCode = sourceCode;
56 | this.lineNum = lineNum;
57 | this.nextToken = nextToken;
58 | this.nextTokenType = nextTokenType;
59 | this.nextTokenLineNum = nextTokenLineNum;
60 | this.hasCache = false;
61 | this.stack = [];
62 | }
// True when the most recently emitted token can be followed by text content
// (after ">", "/>", a DTD, a comment, or more content text).
63 | get judgeIsContent() {
64 | const length = this.stack.length - 1;
65 | return this.stack[length].tokenType === exports.TOKEN_RIGHT_PAREN /*>*/ ||
66 | this.stack[length].tokenType === exports.TOKEN_SELF_CLOSE /*/> */ ||
67 | this.stack[length].tokenType === exports.TOKEN_DTD /*dtd*/ ||
68 | this.stack[length].tokenType === exports.COMMENT /**/ ||
69 | this.stack[length].tokenType === exports.TOKEN_CONTENT_TEXT; /*ContentText*/
70 | }
// True when the cursor is positioned on text content: token history allows
// content (judgeIsContent) and the next character is not "<". With an empty
// history, anything but "<" counts as content.
71 | get isContentText() {
72 | if (this.stack.length < 1) {
73 | if (this.sourceCode[0] === "<") {
74 | return false;
75 | }
76 | return true;
77 | }
// `origin` snapshot restored before every return so this getter never
// consumes input (the restores below are no-ops today but keep that intent).
78 | let origin = this.sourceCode;
79 | // while (this.stack.length > 10) {
80 | // this.stack.shift()
81 | // }
82 | if (this.judgeIsContent) {
83 | // this.isIgnored()
84 | //
85 | // if (this.stack.length > 2 && this.stack[this.stack.length - 2].token === "noscript") {
86 | // return true
87 | // }
88 | if (this.sourceCode[0] === "<") {
89 | this.sourceCode = origin;
90 | return false;
91 | }
92 | else {
93 | this.sourceCode = origin;
94 | return true;
95 | }
96 | }
97 | else {
98 | return false;
99 | }
100 | }
101 | /**
102 | * LookAhead (向前看) 一个 Token, 告诉我们下一个 Token 是什么
103 | * @returns
104 | */
// Peeks at the next token without consuming it: fetches via GetNextToken()
// and stores the result in the one-slot cache for the next consumer.
105 | LookAhead() {
106 | // lexer.nextToken already setted
107 | if (this.hasCache) {
108 | return { tokenType: this.nextTokenType, lineNum: this.lineNum, token: this.nextToken };
109 | }
110 | // set it
111 | // 当前行
112 | let { lineNum, tokenType, token } = this.GetNextToken();
113 | // *
114 | // 下一行
115 | this.hasCache = true;
116 | this.lineNum = lineNum;
117 | this.nextTokenType = tokenType;
118 | this.nextToken = token;
119 | return { tokenType, lineNum, token };
120 | }
// Consumes the next token only if it matches expectedType; otherwise stashes
// it back into the lookahead cache (cursor effectively rewound).
121 | LookAheadAndSkip(expectedType) {
122 | // get next token
123 | // 查看看下一个Token信息
124 | let { lineNum, tokenType, token } = this.GetNextToken();
125 | // not is expected type, reverse cursor
126 | if (tokenType != expectedType) {
127 | this.hasCache = true;
128 | this.lineNum = lineNum;
129 | this.nextTokenType = tokenType;
130 | this.nextToken = token;
131 | }
132 | }
133 | /**
134 | * 断言下一个 Token 是什么
135 | */
// Consumes the next token and throws a descriptive syntax error unless its
// type equals the expected tokenType.
136 | NextTokenIs(tokenType) {
137 | const { lineNum: nowLineNum, tokenType: nowTokenType, token: nowToken } = this.GetNextToken();
138 | // syntax error
139 | if (tokenType != nowTokenType) {
140 | throw new Error(`NextTokenIs(): syntax error near '${exports.tokenNameMap[nowTokenType]}', expected token: {${exports.tokenNameMap[tokenType]}} but got {${exports.tokenNameMap[nowTokenType]}}. at line ${this.GetLineNum()} ${this.sourceCode.slice(0, 100)}`);
141 | }
142 | return { nowLineNum, nowToken, nowTokenType };
143 | }
144 | // MatchToken() 的封装,每一次调用,都会吃掉相应Token
// Consuming token fetch: drains the lookahead cache when present, otherwise
// delegates to MatchToken().
145 | GetNextToken() {
146 | // next token already loaded
147 | if (this.hasCache) {
148 | // 在LookAhead和LookAheadSkip处对nextTokenLineNum进行了赋值操作
149 | let lineNum = this.lineNum;
150 | let tokenType = this.nextTokenType;
151 | let token = this.nextToken;
152 | this.hasCache = false;
153 | return {
154 | lineNum,
155 | tokenType,
156 | token
157 | };
158 | }
159 | return this.MatchToken();
160 | }
// Rejects characters outside the SourceCharacter set (tab, LF, CR, and
// U+0020..U+FFFF); throws on anything else.
161 | checkCode(c) {
162 | // 确保源代码,不包含非法字符,对应着SourceCharacter的EBNF
163 | if (!/\u0009|\u000A|\u000D|[\u0020-\uFFFF]/.test(c)) {
164 | throw new Error('The source code contains characters that cannot be parsed.');
165 | }
166 | }
167 | // 直接跳过几个字符,返回被跳过的字符
// Skips `skip` characters and returns the FIRST character that was at the
// cursor (not all skipped characters), after validating it.
168 | next(skip) {
169 | this.checkCode(this.sourceCode[0]);
170 | const code = this.sourceCode[0];
171 | this.skipSourceCode(skip);
172 | return code;
173 | }
// Speculative check (name is a typo for isTagName, kept for compatibility):
// after skipping one char, does a name follow that is NOT an attribute
// assignment (i.e. not followed by "=")? Always restores the source cursor.
174 | isTagNmae() {
175 | let origin = this.sourceCode;
176 | this.skipSourceCode(1);
177 | // if (this.sourceCode[0] === "/") {
178 | // this.sourceCode = origin
179 | // return false
180 | // }
181 | let tag_name = exports.regexName.exec(this.sourceCode);
182 | if (tag_name) {
183 | let tag = tag_name[0];
184 | this.skipSourceCode(tag.length);
185 | this.isIgnored();
// Drop any lookahead cached while probing — the cursor is about to be reset.
186 | this.hasCache = false;
187 | if (this.sourceCode[0] === "=") {
188 | this.sourceCode = origin;
189 | return false;
190 | }
191 | else {
192 | this.sourceCode = origin;
193 | return true;
194 | }
195 | }
196 | else {
197 | this.sourceCode = origin;
198 | return false;
199 | }
200 | }
201 | // 匹配Token并跳过匹配的Token
202 | MatchToken() {
203 | this.checkCode(this.sourceCode[0]); // 只做检查,不吃字符
204 | // if(this.lineNum === 12) {
205 | // debugger
206 | // }
207 | // finish
208 | if (this.sourceCode.length == 0) {
209 | let res = { lineNum: this.lineNum, tokenType: exports.TOKEN_EOF, token: exports.tokenNameMap[exports.TOKEN_EOF] };
210 | this.stack.push(res);
211 | return res;
212 | }
213 | if (this.isContentText) {
214 | let contentText = /[\s\S]+/.exec(this.sourceCode[0]);
215 | if (contentText) {
216 | let res = { lineNum: this.lineNum, tokenType: exports.TOKEN_CONTENT_TEXT /*ContentText*/, token: contentText[0] };
217 | this.stack.push(res);
218 | return res;
219 | }
220 | }
221 | else {
222 | // check ignored
223 | if (this.isIgnored()) {
224 | let res = { lineNum: this.lineNum, tokenType: exports.TOKEN_IGNORED, token: "Ignored" };
225 | this.stack.push(res);
226 | return res;
227 | }
228 | switch (this.sourceCode[0]) {
229 | case '<':
230 | //
231 | if (this.sourceCode.slice(0, 4) === "