',
31 | )
32 | .map((test) => ({
33 | html: test.input,
34 | fragmentContext: test.fragmentContext,
35 | }));
36 |
// Runs every micro benchmark input through the given parser: fragment
// parsing when the test supplies a fragment context, document parsing otherwise.
globalThis.runMicro = function (parser) {
    for (const { fragmentContext, html } of microTests) {
        if (fragmentContext) {
            parser.parseFragment(fragmentContext, html);
        } else {
            parser.parse(html);
        }
    }
};
46 |
// Pages data: source HTML of each SAX-parser test page.
const pages = loadSAXParserTestData().map(({ src }) => src);

// Parses every real-world test page with the given parser.
globalThis.runPages = function (parser) {
    for (const page of pages) {
        parser.parse(page);
    }
};
55 |
// Stream data: absolute path to each SAX test directory's source HTML file.
globalThis.files = readdirSync(saxPath).map((dirName) => new URL(`${dirName}/src.html`, saxPath).pathname);
58 |
// Utils

/**
 * Looks up the ops/sec (`hz`) recorded for the named benchmark in a suite.
 * Returns `undefined` when no benchmark with that name exists.
 */
function getHz(suite, testName) {
    for (let index = 0; index < suite.length; index++) {
        const benchmark = suite[index];

        if (benchmark.name === testName) {
            return benchmark.hz;
        }
    }

    return undefined;
}
67 |
/**
 * Runs a two-way Benchmark.js suite comparing the working copy against the
 * published upstream build, then logs the speed ratio between them.
 */
function runBench({ name, workingCopyFn, upstreamFn, defer = false }) {
    const suite = new Benchmark.Suite(name);

    suite.add('Working copy', workingCopyFn, { defer });
    suite.add('Upstream', upstreamFn, { defer });
    suite.on('start', () => console.log(name));
    suite.on('cycle', (event) => console.log(String(event.target)));
    suite.on('complete', () => {
        const workingCopyHz = getHz(suite, 'Working copy');
        const upstreamHz = getHz(suite, 'Upstream');
        const isFaster = workingCopyHz > upstreamHz;
        const ratio = isFaster ? workingCopyHz / upstreamHz : upstreamHz / workingCopyHz;

        console.log(`Working copy is ${ratio.toFixed(2)}x ${isFaster ? 'faster' : 'slower'}.\n`);
    });
    suite.run();
}
88 |
// Benchmarks

// MICRO: many small inputs — dominated by per-parse setup/teardown cost.
runBench({
    name: 'parse5 regression benchmark - MICRO',
    workingCopyFn: () => runMicro(workingCopy),
    upstreamFn: () => runMicro(upstreamParser),
});

// HUGE: one very large document — dominated by steady-state throughput.
runBench({
    name: 'parse5 regression benchmark - HUGE',
    workingCopyFn: () => workingCopy.parse(hugePage),
    upstreamFn: () => upstreamParser.parse(hugePage),
});

// PAGES: a set of real-world pages.
runBench({
    name: 'parse5 regression benchmark - PAGES',
    workingCopyFn: () => runPages(workingCopy),
    upstreamFn: () => runPages(upstreamParser),
});

// STREAM: streaming parse of the SAX test files. `defer: true` because both
// runners are async and signal completion via `deferred.resolve()`.
runBench({
    name: 'parse5 regression benchmark - STREAM',
    defer: true,
    workingCopyFn: async (deferred) => {
        // The working copy is exercised through its ParserStream directly.
        const parsePromises = files.map((fileName) => {
            const stream = createReadStream(fileName, 'utf8');
            const parserStream = new WorkingCopyParserStream();

            stream.pipe(parserStream);
            return finished(parserStream);
        });

        await Promise.all(parsePromises);
        deferred.resolve();
    },
    upstreamFn: async (deferred) => {
        // NOTE(review): upstream is driven through a stub writable that buffers
        // the file and then parses it in one go — presumably to keep the
        // comparison stream-driven on both sides; confirm against bench setup.
        const parsePromises = files.map(async (fileName) => {
            const stream = createReadStream(fileName, 'utf8');
            const writable = new WritableStreamStub();

            stream.pipe(writable);

            await finished(writable);

            upstreamParser.parse(writable.writtenData);
        });

        await Promise.all(parsePromises);
        deferred.resolve();
    },
});
139 |
--------------------------------------------------------------------------------
/docs/list-of-packages.md:
--------------------------------------------------------------------------------
1 | # List of parse5 toolset packages
2 |
3 | - [parse5](https://github.com/inikulin/parse5/tree/master/packages/parse5) - HTML parser and serializer.
4 | - [parse5-htmlparser2-tree-adapter](https://github.com/inikulin/parse5/tree/master/packages/parse5-htmlparser2-tree-adapter) - [htmlparser2](https://github.com/fb55/htmlparser2) tree adapter.
5 | - [parse5-parser-stream](https://github.com/inikulin/parse5/tree/master/packages/parse5-parser-stream) - streaming HTML parser with scripting support.
6 | - [parse5-plain-text-conversion-stream](https://github.com/inikulin/parse5/tree/master/packages/parse5-plain-text-conversion-stream) - stream that converts plain text files into HTML documents.
7 | - [parse5-sax-parser](https://github.com/inikulin/parse5/tree/master/packages/parse5-sax-parser) - streaming SAX-style HTML parser.
8 | - [parse5-html-rewriting-stream](https://github.com/inikulin/parse5/tree/master/packages/parse5-html-rewriting-stream) - streaming HTML rewriter.
9 |
--------------------------------------------------------------------------------
/docs/version-history.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | See [GitHub Releases](https://github.com/inikulin/parse5/releases) for the changelog.
--------------------------------------------------------------------------------
/eslint.config.js:
--------------------------------------------------------------------------------
1 | import eslintjs from '@eslint/js';
2 | import eslintConfigPrettier from 'eslint-config-prettier';
3 | import { configs as tseslintConfigs } from 'typescript-eslint';
4 | import globals from 'globals';
5 | import eslintUnicorn from 'eslint-plugin-unicorn';
6 |
const { configs: eslintConfigs } = eslintjs;

// Globs for shipped source code: benchmarks, build scripts, package sources.
const sourceFiles = ['bench/**/*.js', 'scripts/**/*.ts', 'packages/*/lib/**/*.ts'];
// Globs for test code, which gets slightly relaxed rules below.
const testFiles = ['test/**/*.{ts,js}', '**/*.test.ts'];
// Vendored test data and build artifacts that must never be linted.
const ignoreFiles = [
    'test/data/html5lib-tests',
    'test/data/html5lib-tests-fork',
    'packages/*/dist/',
    'test/dist/',
    'docs/build/',
    'coverage/',
];
const allFiles = [...sourceFiles, ...testFiles];

export default [
    {
        files: allFiles,
    },
    {
        ignores: ignoreFiles,
    },
    {
        // Node.js (ESM) + ES2019 globals for every linted file.
        languageOptions: {
            globals: {
                ...globals.nodeBuiltin,
                ...globals.es2019,
            },
        },
    },
    eslintConfigs.recommended,
    ...tseslintConfigs.recommended,
    {
        // Project-wide style rules.
        rules: {
            'no-console': 'error',
            curly: ['error', 'all'],
            'prefer-arrow-callback': 'error',
            'one-var': ['error', 'never'],
            'no-var': 'error',
            'prefer-const': 'error',
            'object-shorthand': 'error',
            'prefer-destructuring': [
                'error',
                {
                    object: true,
                    array: false,
                },
            ],
            'prefer-template': 'error',
            'arrow-body-style': ['error', 'as-needed'],
        },
    },
    {
        // TypeScript-only rules.
        files: ['**/*.ts'],
        rules: {
            '@typescript-eslint/no-unsafe-declaration-merging': 'off',
            '@typescript-eslint/no-non-null-assertion': 'warn',
            '@typescript-eslint/explicit-function-return-type': 'error',
            '@typescript-eslint/consistent-type-imports': 'error',

            '@typescript-eslint/no-unused-vars': ['error', { argsIgnorePattern: '^_' }],
        },
    },
    {
        // Tests may use non-null assertions freely.
        files: testFiles,
        rules: {
            '@typescript-eslint/no-non-null-assertion': 'off',
        },
    },
    eslintConfigPrettier,
    eslintUnicorn.configs.recommended,
    {
        // Unicorn rules that conflict with this codebase's conventions.
        rules: {
            'unicorn/no-null': 'off',
            'unicorn/prevent-abbreviations': 'off',
            'unicorn/prefer-string-slice': 'off',
            'unicorn/prefer-code-point': 'off',
            'unicorn/no-array-push-push': 'off',
            'unicorn/no-for-loop': 'off',
            'unicorn/consistent-destructuring': 'off',
            'unicorn/prefer-string-replace-all': 'off',
            'unicorn/prefer-at': 'off',
            'unicorn/number-literal-case': 'off',
            'unicorn/no-nested-ternary': 'off',
            'unicorn/consistent-function-scoping': 'off',
            'unicorn/prefer-switch': ['error', { emptyDefaultCase: 'do-nothing-comment' }],
            'unicorn/prefer-single-call': 'off',
        },
    },
];
96 |
--------------------------------------------------------------------------------
/media/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inikulin/parse5/0e9be1ce4033c0b8faf3d1e84da9076207c3316c/media/logo.png
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "parse5-build-scripts",
3 | "private": true,
4 | "type": "module",
5 | "workspaces": [
6 | "packages/*",
7 | "bench",
8 | "test"
9 | ],
10 | "devDependencies": {
11 | "@eslint/js": "^9.28.0",
12 | "@vitest/coverage-v8": "^3.1.4",
13 | "eslint": "^9.28.0",
14 | "eslint-config-prettier": "^10.1.5",
15 | "eslint-plugin-unicorn": "^59.0.1",
16 | "globals": "^16.2.0",
17 | "husky": "^9.1.7",
18 | "nano-staged": "^0.8.0",
19 | "outdent": "^0.8.0",
20 | "prettier": "^3.5.3",
21 | "ts-node": "^10.9.2",
22 | "typedoc": "^0.28.5",
23 | "typescript": "^5.8.3",
24 | "typescript-eslint": "^8.33.0",
25 | "vitest": "^3.0.1"
26 | },
27 | "scripts": {
28 | "build": "npm run build:esm && npm run build:cjs --workspaces --if-present",
29 | "build:esm": "tsc --build packages/* test",
30 | "build:docs": "typedoc",
31 | "prettier": "prettier '**/*.{js,ts,md,json,yml}' --log-level warn",
32 | "format": "npm run format:es && npm run format:prettier",
33 | "format:es": "npm run lint:es -- --fix",
34 | "format:prettier": "npm run prettier -- --write",
35 | "lint": "npm run lint:es && npm run lint:prettier",
36 | "lint:es": "eslint .",
37 | "lint:prettier": "npm run prettier -- --check",
38 | "unit-tests": "vitest run",
39 | "unit-tests-coverage": "vitest run --coverage",
40 | "test": "npm run lint && npm run unit-tests",
41 | "generate-feedback-tests": "node --loader ts-node/esm scripts/generate-parser-feedback-test/index.ts test/data/html5lib-tests/tree-construction/*.dat",
42 | "bench-perf": "npm run build && node bench/perf/index.js",
43 | "bench-memory-sax": "npm run build && node bench/memory/sax-parser.js",
44 | "preversion": "npm test",
45 | "pre-commit": "nano-staged",
46 | "publish": "npm publish --workspaces",
47 | "prepare": "husky install",
48 | "prepublish": "npm run build"
49 | },
50 | "nano-staged": {
51 | "*.{js,ts}": [
52 | "prettier --write",
53 | "eslint --fix"
54 | ],
55 | "*.{md,json,yml}": [
56 | "prettier --write"
57 | ]
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/packages/parse5-html-rewriting-stream/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013-2019 Ivan Nikulin (ifaaan@gmail.com, https://github.com/inikulin)
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/packages/parse5-html-rewriting-stream/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
parse5-html-rewriting-stream
9 | Streaming HTML rewriter.
10 |
11 |
12 |
13 |
14 | npm install --save parse5-html-rewriting-stream
15 |
16 |
17 |
18 |
19 | 📖 Documentation 📖
20 |
21 |
22 | ---
23 |
24 |
25 | List of parse5 toolset packages
26 |
27 |
28 |
29 | GitHub
30 |
31 |
32 |
33 | Changelog
34 |
35 |
--------------------------------------------------------------------------------
/packages/parse5-html-rewriting-stream/lib/index.ts:
--------------------------------------------------------------------------------
1 | import { html, type Token } from 'parse5';
2 | import {
3 | SAXParser,
4 | type EndTag,
5 | type StartTag,
6 | type Doctype,
7 | type Text,
8 | type Comment,
9 | type SaxToken,
10 | } from 'parse5-sax-parser';
11 | import { escapeText, escapeAttribute } from 'entities/escape';
12 |
13 | /**
14 | * Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML rewriter.
15 | * A [transform stream](https://nodejs.org/api/stream.html#stream_class_stream_transform) (which means you can pipe _through_ it, see example).
16 | *
17 | * The rewriter uses the raw source representation of tokens if they are not modified by the user. Therefore, the resulting
18 | * HTML is not affected by parser error-recovery mechanisms as in a classical parsing-serialization roundtrip.
19 | *
20 | * @example
21 | *
22 | * ```js
23 | * const RewritingStream = require('parse5-html-rewriting-stream');
24 | * const http = require('http');
25 | * const fs = require('fs');
26 | *
27 | * const file = fs.createWriteStream('/home/google.com.html');
28 | * const rewriter = new RewritingStream();
29 | *
 30 |  * // Replace spans with divs
31 | * rewriter.on('startTag', startTag => {
32 | * if (startTag.tagName === 'span') {
33 | * startTag.tagName = 'div';
34 | * }
35 | *
36 | * rewriter.emitStartTag(startTag);
37 | * });
38 | *
39 | * rewriter.on('endTag', endTag => {
40 | * if (endTag.tagName === 'span') {
41 | * endTag.tagName = 'div';
42 | * }
43 | *
44 | * rewriter.emitEndTag(endTag);
45 | * });
46 | *
 47 |  * // Wrap all text nodes with an <i> tag
 48 |  * rewriter.on('text', (_, raw) => {
 49 |  *     // Use the raw representation of text without HTML entities decoding
 50 |  *     rewriter.emitRaw(`<i>${raw}</i>`);
51 | * });
52 | *
53 | * http.get('http://google.com', res => {
54 | * // Assumes response is UTF-8.
55 | * res.setEncoding('utf8');
56 | * // `RewritingStream` is a `Transform` stream, which means you can pipe
57 | * // through it.
58 | * res.pipe(rewriter).pipe(file);
59 | * });
60 | * ```
61 | */
62 | export class RewritingStream extends SAXParser {
63 | /** Note: `sourceCodeLocationInfo` is always enabled. */
64 | constructor() {
65 | super({ sourceCodeLocationInfo: true });
66 | }
67 |
68 | override _transformChunk(chunk: string): string {
69 | // NOTE: ignore upstream return values as we want to push to
70 | // the `Writable` part of the `Transform` stream ourselves.
71 | super._transformChunk(chunk);
72 | return '';
73 | }
74 |
75 | private _getRawHtml(location: Token.Location): string {
76 | const { droppedBufferSize, html } = this.tokenizer.preprocessor;
77 | const start = location.startOffset - droppedBufferSize;
78 | const end = location.endOffset - droppedBufferSize;
79 |
80 | return html.slice(start, end);
81 | }
82 |
83 | // Events
84 | protected override emitIfListenerExists(eventName: string, token: SaxToken): boolean {
85 | if (!super.emitIfListenerExists(eventName, token)) {
86 | this.emitRaw(this._getRawHtml(token.sourceCodeLocation!));
87 | }
88 |
89 | // NOTE: don't skip new lines after `` and other tags,
90 | // otherwise we'll have incorrect raw data.
91 | this.parserFeedbackSimulator.skipNextNewLine = false;
92 | return true;
93 | }
94 |
95 | // Emitter API
96 | protected override _emitToken(eventName: string, token: SaxToken): void {
97 | this.emit(eventName, token, this._getRawHtml(token.sourceCodeLocation!));
98 | }
99 |
100 | /** Emits a serialized document type token into the output stream. */
101 | public emitDoctype(token: Doctype): void {
102 | let res = `';
115 |
116 | this.push(res);
117 | }
118 |
119 | /** Emits a serialized start tag token into the output stream. */
120 | public emitStartTag(token: StartTag): void {
121 | let res = `<${token.tagName}`;
122 |
123 | for (const attr of token.attrs) {
124 | res += ` ${attr.name}="${escapeAttribute(attr.value)}"`;
125 | }
126 |
127 | res += token.selfClosing ? '/>' : '>';
128 |
129 | this.push(res);
130 | }
131 |
132 | /** Emits a serialized end tag token into the output stream. */
133 | public emitEndTag(token: EndTag): void {
134 | this.push(`${token.tagName}>`);
135 | }
136 |
137 | /** Emits a serialized text token into the output stream. */
138 | public emitText({ text }: Text): void {
139 | this.push(
140 | !this.parserFeedbackSimulator.inForeignContent &&
141 | html.hasUnescapedText(this.tokenizer.lastStartTagName, true)
142 | ? text
143 | : escapeText(text),
144 | );
145 | }
146 |
147 | /** Emits a serialized comment token into the output stream. */
148 | public emitComment(token: Comment): void {
149 | this.push(``);
150 | }
151 |
152 | /** Emits a raw HTML string into the output stream. */
153 | public emitRaw(html: string): void {
154 | this.push(html);
155 | }
156 | }
157 |
export interface RewritingStream {
    /** Raised when the rewriter encounters a start tag. */
    on(event: 'startTag', listener: (startTag: StartTag, rawHtml: string) => void): this;
    /** Raised when the rewriter encounters an end tag. */
    on(event: 'endTag', listener: (endTag: EndTag, rawHtml: string) => void): this;
    /** Raised when the rewriter encounters a comment. */
    on(event: 'comment', listener: (comment: Comment, rawHtml: string) => void): this;
    /** Raised when the rewriter encounters text content. */
    on(event: 'text', listener: (text: Text, rawHtml: string) => void): this;
    /** Raised when the rewriter encounters a [document type declaration](https://en.wikipedia.org/wiki/Document_type_declaration). */
    on(event: 'doctype', listener: (doctype: Doctype, rawHtml: string) => void): this;

    /**
     * Base event handler.
     *
     * @param event Name of the event
     * @param handler Event handler
     */
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    on(event: string, handler: (...args: any[]) => void): this;
}
179 |
--------------------------------------------------------------------------------
/packages/parse5-html-rewriting-stream/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "parse5-html-rewriting-stream",
3 | "type": "module",
4 | "description": "Streaming HTML rewriter.",
5 | "version": "7.1.0",
6 | "author": "Ivan Nikulin (https://github.com/inikulin)",
7 | "contributors": "https://github.com/inikulin/parse5/graphs/contributors",
8 | "homepage": "https://parse5.js.org",
9 | "funding": "https://github.com/inikulin/parse5?sponsor=1",
10 | "keywords": [
11 | "parse5",
12 | "parser",
13 | "stream",
14 | "streaming",
 15 |         "rewriter",
16 | "rewrite",
17 | "HTML"
18 | ],
19 | "license": "MIT",
20 | "main": "dist/index.js",
21 | "module": "dist/index.js",
22 | "types": "dist/index.d.ts",
23 | "exports": "./dist/index.js",
24 | "dependencies": {
25 | "entities": "^6.0.0",
26 | "parse5": "^7.0.0",
27 | "parse5-sax-parser": "^7.0.0"
28 | },
29 | "repository": {
30 | "type": "git",
31 | "url": "git://github.com/inikulin/parse5.git"
32 | },
33 | "files": [
34 | "dist/**/*.js",
35 | "dist/**/*.d.ts"
36 | ]
37 | }
38 |
--------------------------------------------------------------------------------
/packages/parse5-html-rewriting-stream/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "../../tsconfig.json",
3 | "compilerOptions": {
4 | "rootDir": "lib",
5 | "outDir": "dist"
6 | },
7 | "include": ["**/*.ts"],
8 | "exclude": ["**/*.test.ts", "dist"],
9 | "references": [{ "path": "../parse5/tsconfig.json" }, { "path": "../parse5-sax-parser/tsconfig.json" }]
10 | }
11 |
--------------------------------------------------------------------------------
/packages/parse5-html-rewriting-stream/typedoc.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": ["../../typedoc.base.json"],
3 | "entryPoints": ["lib/index.ts"]
4 | }
5 |
--------------------------------------------------------------------------------
/packages/parse5-htmlparser2-tree-adapter/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013-2019 Ivan Nikulin (ifaaan@gmail.com, https://github.com/inikulin)
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/packages/parse5-htmlparser2-tree-adapter/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
11 |
12 |
13 |
14 | npm install --save parse5-htmlparser2-tree-adapter
15 |
16 |
17 |
18 |
19 | 📖 Documentation 📖
20 |
21 |
22 | ---
23 |
24 |
25 | List of parse5 toolset packages
26 |
27 |
28 |
29 | GitHub
30 |
31 |
32 |
33 | Changelog
34 |
35 |
--------------------------------------------------------------------------------
/packages/parse5-htmlparser2-tree-adapter/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "parse5-htmlparser2-tree-adapter",
3 | "type": "module",
4 | "description": "htmlparser2 tree adapter for parse5.",
5 | "version": "7.1.0",
6 | "author": "Ivan Nikulin (https://github.com/inikulin)",
7 | "contributors": "https://github.com/inikulin/parse5/graphs/contributors",
8 | "homepage": "https://parse5.js.org",
9 | "funding": "https://github.com/inikulin/parse5?sponsor=1",
10 | "keywords": [
11 | "parse5",
12 | "parser",
13 | "tree adapter",
14 | "htmlparser2"
15 | ],
16 | "license": "MIT",
17 | "main": "dist/cjs/index.js",
18 | "module": "dist/index.js",
19 | "types": "dist/index.d.ts",
20 | "exports": {
21 | "import": "./dist/index.js",
22 | "require": "./dist/cjs/index.js"
23 | },
24 | "dependencies": {
25 | "domhandler": "^5.0.3",
26 | "parse5": "^7.0.0"
27 | },
28 | "scripts": {
29 | "build:cjs": "tsc --noCheck --moduleResolution node10 --module CommonJS --target ES6 --outDir dist/cjs && echo '{\"type\":\"commonjs\"}' > dist/cjs/package.json"
30 | },
31 | "repository": {
32 | "type": "git",
33 | "url": "git://github.com/inikulin/parse5.git"
34 | },
35 | "files": [
36 | "dist/cjs/package.json",
37 | "dist/**/*.js",
38 | "dist/**/*.d.ts"
39 | ]
40 | }
41 |
--------------------------------------------------------------------------------
/packages/parse5-htmlparser2-tree-adapter/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "../../tsconfig.json",
3 | "compilerOptions": {
4 | "rootDir": "lib",
5 | "outDir": "dist"
6 | },
7 | "include": ["**/*.ts"],
8 | "exclude": ["**/*.test.ts", "dist"]
9 | }
10 |
--------------------------------------------------------------------------------
/packages/parse5-htmlparser2-tree-adapter/typedoc.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": ["../../typedoc.base.json"],
3 | "entryPoints": ["lib/index.ts"]
4 | }
5 |
--------------------------------------------------------------------------------
/packages/parse5-parser-stream/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013-2019 Ivan Nikulin (ifaaan@gmail.com, https://github.com/inikulin)
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/packages/parse5-parser-stream/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
parse5-parser-stream
9 | Streaming HTML parser with scripting support.
10 |
11 |
12 |
13 |
14 | npm install --save parse5-parser-stream
15 |
16 |
17 |
18 |
19 | 📖 Documentation 📖
20 |
21 |
22 | ---
23 |
24 |
25 | List of parse5 toolset packages
26 |
27 |
28 |
29 | GitHub
30 |
31 |
32 |
33 | Changelog
34 |
35 |
--------------------------------------------------------------------------------
/packages/parse5-parser-stream/lib/index.ts:
--------------------------------------------------------------------------------
1 | import { Writable } from 'node:stream';
2 | import { Parser, type ParserOptions, type TreeAdapterTypeMap, type DefaultTreeAdapterMap } from 'parse5';
3 |
4 | /**
5 | * Streaming HTML parser with scripting support.
6 | * A [writable stream](https://nodejs.org/api/stream.html#stream_class_stream_writable).
7 | *
8 | * @example
9 | *
10 | * ```js
11 | * const ParserStream = require('parse5-parser-stream');
12 | * const http = require('http');
13 | * const { finished } = require('node:stream');
14 | *
15 | * // Fetch the page content and obtain it's node
16 | * http.get('http://inikulin.github.io/parse5/', res => {
17 | * const parser = new ParserStream();
18 | *
19 | * finished(parser, () => {
20 | * console.log(parser.document.childNodes[1].childNodes[0].tagName); //> 'head'
21 | * });
22 | *
23 | * res.pipe(parser);
24 | * });
25 | * ```
26 | *
27 | */
28 | export class ParserStream extends Writable {
29 | static getFragmentStream(
30 | fragmentContext?: T['parentNode'] | null,
31 | options?: ParserOptions,
32 | ): ParserStream {
33 | const parser = Parser.getFragmentParser(fragmentContext, options);
34 | const stream = new ParserStream(options, parser);
35 | return stream;
36 | }
37 |
38 | private lastChunkWritten = false;
39 | private writeCallback: undefined | (() => void) = undefined;
40 |
41 | private pendingHtmlInsertions: string[] = [];
42 | /** The resulting document node. */
43 | public get document(): T['document'] {
44 | return this.parser.document;
45 | }
46 | public getFragment(): T['documentFragment'] {
47 | return this.parser.getFragment();
48 | }
49 |
50 | /**
51 | * @param options Parsing options.
52 | */
53 | constructor(
54 | options?: ParserOptions,
55 | public parser: Parser = new Parser(options),
56 | ) {
57 | super({ decodeStrings: false });
58 |
59 | const resume = (): void => {
60 | for (let i = this.pendingHtmlInsertions.length - 1; i >= 0; i--) {
61 | this.parser.tokenizer.insertHtmlAtCurrentPos(this.pendingHtmlInsertions[i]);
62 | }
63 |
64 | this.pendingHtmlInsertions.length = 0;
65 |
66 | //NOTE: keep parsing if we don't wait for the next input chunk
67 | this.parser.tokenizer.resume(this.writeCallback);
68 | };
69 |
70 | const documentWrite = (html: string): void => {
71 | if (!this.parser.stopped) {
72 | this.pendingHtmlInsertions.push(html);
73 | }
74 | };
75 |
76 | const scriptHandler = (scriptElement: T['element']): void => {
77 | if (this.listenerCount('script') > 0) {
78 | this.parser.tokenizer.pause();
79 | this.emit('script', scriptElement, documentWrite, resume);
80 | }
81 | };
82 |
83 | this.parser.scriptHandler = scriptHandler;
84 | }
85 |
86 | //WritableStream implementation
87 | override _write(chunk: string, _encoding: string, callback: () => void): void {
88 | if (typeof chunk !== 'string') {
89 | throw new TypeError('Parser can work only with string streams.');
90 | }
91 |
92 | this.writeCallback = callback;
93 | this.parser.tokenizer.write(chunk, this.lastChunkWritten, this.writeCallback);
94 | }
95 |
96 | // TODO [engine:node@>=16]: Due to issues with Node < 16, we are overriding `end` instead of `_final`.
97 |
98 | // eslint-disable-next-line @typescript-eslint/no-explicit-any
99 | override end(chunk?: any, encoding?: any, callback?: any): any {
100 | this.lastChunkWritten = true;
101 | super.end(chunk || '', encoding, callback);
102 | }
103 | }
104 |
105 | export interface ParserStream {
106 | /**
107 | * Raised when parser encounters a `');
132 | * ```
133 | *
134 | * @param event Name of the event
135 | * @param handler
136 | */
137 | on(
138 | event: 'script',
139 | handler: (scriptElement: T['element'], documentWrite: (html: string) => void, resume: () => void) => void,
140 | ): void;
141 | /**
142 | * Base event handler.
143 | *
144 | * @param event Name of the event
145 | * @param handler Event handler
146 | */
147 | // eslint-disable-next-line @typescript-eslint/no-explicit-any
148 | on(event: string, handler: (...args: any[]) => void): this;
149 | }
150 |
--------------------------------------------------------------------------------
/packages/parse5-parser-stream/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "parse5-parser-stream",
3 | "type": "module",
4 | "description": "Streaming HTML parser with scripting support.",
5 | "version": "7.1.2",
6 | "author": "Ivan Nikulin (https://github.com/inikulin)",
7 | "contributors": "https://github.com/inikulin/parse5/graphs/contributors",
8 | "homepage": "https://parse5.js.org",
9 | "funding": "https://github.com/inikulin/parse5?sponsor=1",
10 | "keywords": [
11 | "parse5",
12 | "parser",
13 | "stream",
14 | "streaming"
15 | ],
16 | "license": "MIT",
17 | "main": "dist/cjs/index.js",
18 | "module": "dist/index.js",
19 | "types": "dist/index.d.ts",
20 | "exports": {
21 | "import": "./dist/index.js",
22 | "require": "./dist/cjs/index.js"
23 | },
24 | "dependencies": {
25 | "parse5": "^7.0.0"
26 | },
27 | "scripts": {
28 | "build:cjs": "tsc --noCheck --moduleResolution node10 --module CommonJS --target ES6 --outDir dist/cjs && echo '{\"type\":\"commonjs\"}' > dist/cjs/package.json"
29 | },
30 | "repository": {
31 | "type": "git",
32 | "url": "git://github.com/inikulin/parse5.git"
33 | },
34 | "files": [
35 | "dist/cjs/package.json",
36 | "dist/**/*.js",
37 | "dist/**/*.d.ts"
38 | ]
39 | }
40 |
--------------------------------------------------------------------------------
/packages/parse5-parser-stream/test/location-info.test.ts:
--------------------------------------------------------------------------------
import { it, assert } from 'vitest';
import { generateLocationInfoParserTests } from 'parse5-test-utils/utils/generate-location-info-parser-tests.js';
import { generateTestsForEachTreeAdapter } from 'parse5-test-utils/utils/common.js';
import { parseChunked } from './utils/parse-chunked.js';

// Run the shared location-info suite through the chunked ParserStream wrapper.
generateLocationInfoParserTests('location-info', (input, opts) =>
    // NOTE: because of performance use bigger chunks here
    parseChunked({ input }, opts, 100, 400),
);

generateTestsForEachTreeAdapter('location-info', (treeAdapter) => {
    // NOTE(review): the test title and the `html` literal below lost their markup
    // during extraction (text between '<' and '>' was stripped). Upstream this test
    // parses markup whose <html>/<head>/<body> elements are implicitly generated —
    // confirm the literal against the original repository before relying on it.
    it('Regression - location info for the implicitly generated , and (GH-44)', () => {
        const html = '
';

        const opts = {
            treeAdapter,
            sourceCodeLocationInfo: true,
        };

        // Parse and grab the implicitly generated html/head/body elements.
        const document = parseChunked({ input: html }, opts).node;
        const htmlEl = treeAdapter.getChildNodes(document)[0];
        const headEl = treeAdapter.getChildNodes(htmlEl)[0];
        const bodyEl = treeAdapter.getChildNodes(htmlEl)[1];

        // Implicitly generated elements must not carry source locations.
        assert.strictEqual(treeAdapter.getNodeSourceCodeLocation(htmlEl), null);
        assert.strictEqual(treeAdapter.getNodeSourceCodeLocation(headEl), null);
        assert.strictEqual(treeAdapter.getNodeSourceCodeLocation(bodyEl), null);
    });
});
30 |
--------------------------------------------------------------------------------
/packages/parse5-parser-stream/test/parser-stream.test.ts:
--------------------------------------------------------------------------------
1 | import { it, assert, describe } from 'vitest';
2 | import { ParserStream } from '../lib/index.js';
3 | import { generateParsingTests } from 'parse5-test-utils/utils/generate-parsing-tests.js';
4 | import { parseChunked } from './utils/parse-chunked.js';
5 | import { finished } from 'parse5-test-utils/utils/common.js';
6 |
7 | generateParsingTests(
8 | 'ParserStream',
9 | 'ParserStream',
10 | {
11 | expectErrors: [
12 | //TODO(GH-448): Foreign content behaviour was updated in the HTML spec.
13 | //The old test suite still tests the old behaviour.
14 | '0.foreign-fragment',
15 | '1.foreign-fragment',
16 | '38.foreign-fragment',
17 | '40.foreign-fragment',
18 | '47.foreign-fragment',
19 | '48.foreign-fragment',
20 | ],
21 | },
22 | (test, opts) => parseChunked(test, opts),
23 | );
24 |
25 | describe('ParserStream', () => {
26 | it('Fix empty stream parsing with ParserStream (GH-196)', async () => {
27 | const parser = new ParserStream();
28 |
29 | parser.end();
30 |
31 | await finished(parser);
32 |
33 | assert.ok(parser.document.childNodes.length > 0);
34 | });
35 |
36 | it('Should not accept binary input (GH-269)', () => {
37 | const stream = new ParserStream();
38 | const buf = Buffer.from('test');
39 |
40 | assert.throws(() => stream.write(buf), TypeError);
41 | });
42 | });
43 |
--------------------------------------------------------------------------------
/packages/parse5-parser-stream/test/scripting.test.ts:
--------------------------------------------------------------------------------
import { it } from 'vitest';
import { ParserStream } from '../lib/index.js';
import { generateParsingTests } from 'parse5-test-utils/utils/generate-parsing-tests.js';
import { makeChunks, generateTestsForEachTreeAdapter, finished } from 'parse5-test-utils/utils/common.js';
import { runInNewContext } from 'node:vm';

// Small delay used to emulate asynchronous script execution / async input arrival.
// NOTE(review): the return annotation was presumably `Promise<void>` before the
// generic argument was lost in extraction — confirm against the original file.
function pause(): Promise {
    return new Promise((resolve) => setTimeout(resolve, 5));
}

const suitePath = new URL('../../../test/data/tree-construction-scripting', import.meta.url);

// Tree-construction suite with scripting enabled: every script element is
// executed in a fresh VM context that can call document.write().
generateParsingTests(
    'ParserStream - Scripting',
    'ParserStream - Scripting',
    {
        withoutErrors: true,
        suitePath,
    },
    async (test, opts) => {
        const chunks = makeChunks(test.input);
        const parser = test.fragmentContext
            ? ParserStream.getFragmentStream(test.fragmentContext, opts)
            : new ParserStream(opts);

        parser.on('script', async (scriptElement, documentWrite, resume) => {
            // The script source lives in the element's single text child (if any).
            const scriptTextNode = opts.treeAdapter.getChildNodes(scriptElement)[0];
            const script = scriptTextNode ? opts.treeAdapter.getTextNodeContent(scriptTextNode) : '';

            //NOTE: emulate postponed script execution
            await pause();

            try {
                runInNewContext(script, { document: { write: documentWrite } });
                resume();
            } catch (error) {
                // Surface script runtime failures as stream errors.
                parser.emit('error', error);
            }
        });

        //NOTE: emulate async input stream behavior
        for (const chunk of chunks) {
            parser.write(chunk);
            await pause();
        }

        parser.end();

        await finished(parser);

        return {
            node: test.fragmentContext ? parser.getFragment() : parser.document,
            chunks,
        };
    },
);

generateTestsForEachTreeAdapter('ParserStream', (treeAdapter) => {
    it('Regression - Synchronously calling resume() leads to crash (GH-98)', async () => {
        const parser = new ParserStream({ treeAdapter });

        // Resume synchronously from inside the script handler.
        parser.on('script', (_el, _docWrite, resume) => resume());

        // NOTE(review): this argument was presumably a '<script>…</script>' literal
        // whose markup was stripped during extraction — verify against upstream.
        parser.end('');

        await new Promise((resolve) => {
            process.nextTick(resolve);
        });
    });

    it('Regression - Parsing loop lock causes accidental hang ups (GH-101)', () => {
        const parser = new ParserStream({ treeAdapter });

        // Resume on the next tick to exercise the parsing-loop lock.
        parser.on('script', (_scriptElement, _documentWrite, resume) => process.nextTick(resume));

        // NOTE(review): these literals appear to have lost '<script>…</script>'
        // markup during extraction — verify against upstream before relying on them.
        parser.write('');
        parser.end('dawg');

        return finished(parser);
    });
});
82 |
--------------------------------------------------------------------------------
/packages/parse5-parser-stream/test/utils/parse-chunked.ts:
--------------------------------------------------------------------------------
1 | import type { ParserOptions, TreeAdapterTypeMap } from 'parse5';
2 | import { ParserStream } from '../../lib/index.js';
3 | import { makeChunks } from 'parse5-test-utils/utils/common.js';
4 |
5 | export function parseChunked(
6 | test: { input: string; fragmentContext?: T['parentNode'] },
7 | opts: ParserOptions,
8 | minChunkSize?: number,
9 | maxChunkSize?: number,
10 | ): { node: TreeAdapterTypeMap['document']; chunks: string[] } {
11 | const parserStream = test.fragmentContext
12 | ? ParserStream.getFragmentStream(test.fragmentContext, opts)
13 | : new ParserStream(opts);
14 | const chunks = makeChunks(test.input, minChunkSize, maxChunkSize);
15 |
16 | // NOTE: set small waterline for testing purposes
17 | parserStream.parser.tokenizer.preprocessor.bufferWaterline = 8;
18 |
19 | for (let i = 0; i < chunks.length - 1; i++) {
20 | parserStream.write(chunks[i]);
21 | }
22 |
23 | parserStream.end(chunks[chunks.length - 1]);
24 |
25 | return {
26 | node: test.fragmentContext ? parserStream.getFragment() : parserStream.document,
27 | chunks,
28 | };
29 | }
30 |
--------------------------------------------------------------------------------
/packages/parse5-parser-stream/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "../../tsconfig.json",
3 | "compilerOptions": {
4 | "rootDir": "lib",
5 | "outDir": "dist"
6 | },
7 | "include": ["**/*.ts"],
8 | "exclude": ["**/*.test.ts", "dist", "test"]
9 | }
10 |
--------------------------------------------------------------------------------
/packages/parse5-parser-stream/typedoc.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": ["../../typedoc.base.json"],
3 | "entryPoints": ["lib/index.ts"]
4 | }
5 |
--------------------------------------------------------------------------------
/packages/parse5-plain-text-conversion-stream/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013-2019 Ivan Nikulin (ifaaan@gmail.com, https://github.com/inikulin)
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/packages/parse5-plain-text-conversion-stream/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
parse5-plain-text-conversion-stream
9 |
Stream that converts plain text into an HTML document as required by the HTML specification.
10 |
11 |
12 |
13 |
14 | npm install --save parse5-plain-text-conversion-stream
15 |
16 |
17 |
18 |
19 | 📖 Documentation 📖
20 |
21 |
22 | ---
23 |
24 |
25 | List of parse5 toolset packages
26 |
27 |
28 |
29 | GitHub
30 |
31 |
32 |
33 | Changelog
34 |
35 |
--------------------------------------------------------------------------------
/packages/parse5-plain-text-conversion-stream/lib/index.ts:
--------------------------------------------------------------------------------
1 | import { type ParserOptions, type TreeAdapterTypeMap, html } from 'parse5';
2 | import { ParserStream } from 'parse5-parser-stream';
3 |
4 | const { TAG_ID: $, TAG_NAMES: TN } = html;
5 |
6 | /**
7 | * Converts plain text files into HTML document as required by [HTML specification](https://html.spec.whatwg.org/#read-text).
8 | * A [writable stream](https://nodejs.org/api/stream.html#stream_class_stream_writable).
9 | *
10 | * @example
11 | *
12 | * ```js
13 | * const PlainTextConversionStream = require('parse5-plain-text-conversion-stream');
14 | * const fs = require('fs');
15 | * const { finished } = require('node:stream');
16 | *
17 | * const file = fs.createReadStream('war_and_peace.txt');
18 | * const converter = new PlainTextConversionStream();
19 | *
20 | * finished(converter, () => {
21 | * console.log(converter.document.childNodes[1].childNodes[0].tagName); //> 'head'
22 | * });
23 | *
24 | * file.pipe(converter);
25 | * ```
26 | */
27 | export class PlainTextConversionStream extends ParserStream {
28 | constructor(options?: ParserOptions) {
29 | super(options);
30 |
31 | // NOTE: see https://html.spec.whatwg.org/#read-text
32 | this.parser._insertFakeElement(TN.HTML, $.HTML);
33 | this.parser._insertFakeElement(TN.HEAD, $.HEAD);
34 | this.parser.openElements.pop();
35 | this.parser._insertFakeElement(TN.BODY, $.BODY);
36 | this.parser._insertFakeElement(TN.PRE, $.PRE);
37 | this.parser.treeAdapter.insertText(this.parser.openElements.current, '\n');
38 | this.parser.switchToPlaintextParsing();
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/packages/parse5-plain-text-conversion-stream/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "parse5-plain-text-conversion-stream",
3 | "type": "module",
4 | "description": "Stream that converts plain text files into HTML document.",
5 | "version": "7.0.0",
6 | "author": "Ivan Nikulin (https://github.com/inikulin)",
7 | "contributors": "https://github.com/inikulin/parse5/graphs/contributors",
8 | "homepage": "https://parse5.js.org",
9 | "funding": "https://github.com/inikulin/parse5?sponsor=1",
10 | "keywords": [
11 | "parse5",
12 | "parser",
13 | "stream",
14 | "streaming",
15 | "plain",
16 | "text",
17 | "plain text"
18 | ],
19 | "license": "MIT",
20 | "main": "dist/index.js",
21 | "module": "dist/index.js",
22 | "types": "dist/index.d.ts",
23 | "exports": "./dist/index.js",
24 | "dependencies": {
25 | "parse5": "^7.0.0",
26 | "parse5-parser-stream": "^7.0.0"
27 | },
28 | "repository": {
29 | "type": "git",
30 | "url": "git://github.com/inikulin/parse5.git"
31 | },
32 | "files": [
33 | "dist/**/*.js",
34 | "dist/**/*.d.ts"
35 | ]
36 | }
37 |
--------------------------------------------------------------------------------
/packages/parse5-plain-text-conversion-stream/test/plain-text-conversion-stream.test.ts:
--------------------------------------------------------------------------------
import { it, assert, describe } from 'vitest';
import { serialize } from 'parse5';
import { PlainTextConversionStream } from '../lib/index.js';
import { generateTestsForEachTreeAdapter } from 'parse5-test-utils/utils/common.js';

// NOTE(review): the suite name reads 'plain-test-…' — likely a typo for
// 'plain-text-…' (compare the describe() below); confirm before renaming,
// since the name may be referenced by test tooling.
generateTestsForEachTreeAdapter('plain-test-conversion-stream', (treeAdapter) => {
    it('Plain text conversion stream', () => {
        const converter = new PlainTextConversionStream({ treeAdapter });

        // CRLF must normalize to LF and NULL must become U+FFFD in the output.
        converter.write('Hey');
        converter.write('\r\nyo');
        converter.write('\u0000');
        converter.end('');

        const result = serialize(converter.document, { treeAdapter });

        // NOTE(review): this expected literal appears mangled by extraction —
        // per the read-text algorithm the serialization should be the full
        // html/head/body/pre wrapper around the text; verify against upstream.
        assert.strictEqual(
            result,
            '\nHey\nyo\uFFFD<html><head><body> ',
        );
    });
});

describe('plain-text-conversion-stream', () => {
    it('Should not accept binary input (GH-269)', () => {
        const stream = new PlainTextConversionStream();
        const buf = Buffer.from('test');

        // The stream works on decoded strings only; Buffers must be rejected.
        assert.throws(() => stream.write(buf), TypeError);
    });
});
32 |
--------------------------------------------------------------------------------
/packages/parse5-plain-text-conversion-stream/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "../../tsconfig.json",
3 | "compilerOptions": {
4 | "rootDir": "lib",
5 | "outDir": "dist"
6 | },
7 | "include": ["**/*.ts"],
8 | "exclude": ["**/*.test.ts", "dist", "test"]
9 | }
10 |
--------------------------------------------------------------------------------
/packages/parse5-plain-text-conversion-stream/typedoc.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": ["../../typedoc.base.json"],
3 | "entryPoints": ["lib/index.ts"]
4 | }
5 |
--------------------------------------------------------------------------------
/packages/parse5-sax-parser/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013-2019 Ivan Nikulin (ifaaan@gmail.com, https://github.com/inikulin)
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/packages/parse5-sax-parser/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
parse5-sax-parser
9 |
Streaming SAX-style HTML parser.
10 |
11 |
12 |
13 |
14 | npm install --save parse5-sax-parser
15 |
16 |
17 |
18 |
19 | 📖 Documentation 📖
20 |
21 |
22 | ---
23 |
24 |
25 | List of parse5 toolset packages
26 |
27 |
28 |
29 | GitHub
30 |
31 |
32 |
33 | Changelog
34 |
35 |
--------------------------------------------------------------------------------
/packages/parse5-sax-parser/lib/dev-null-stream.ts:
--------------------------------------------------------------------------------
1 | import { Writable } from 'node:stream';
2 |
3 | export class DevNullStream extends Writable {
4 | override _write(_chunk: string, _encoding: string, cb: () => void): void {
5 | cb();
6 | }
7 | }
8 |
--------------------------------------------------------------------------------
/packages/parse5-sax-parser/lib/parser-feedback-simulator.ts:
--------------------------------------------------------------------------------
1 | import {
2 | Tokenizer,
3 | type TokenizerOptions,
4 | TokenizerMode,
5 | type TokenHandler,
6 | Token,
7 | foreignContent,
8 | html,
9 | } from 'parse5';
10 |
11 | const $ = html.TAG_ID;
12 |
13 | const REPLACEMENT_CHARACTER = '\uFFFD';
14 | const LINE_FEED_CODE_POINT = 0x0a;
15 |
16 | /**
17 | * Simulates adjustments of the Tokenizer which are performed by the standard parser during tree construction.
18 | */
19 | export class ParserFeedbackSimulator implements TokenHandler {
20 | private namespaceStack: html.NS[] = [];
21 | public inForeignContent = false;
22 | public skipNextNewLine = false;
23 | public tokenizer: Tokenizer;
24 |
25 | constructor(
26 | options: TokenizerOptions,
27 | private handler: TokenHandler,
28 | ) {
29 | this.tokenizer = new Tokenizer(options, this);
30 | this._enterNamespace(html.NS.HTML);
31 | }
32 |
33 | /** @internal */
34 | onNullCharacter(token: Token.CharacterToken): void {
35 | this.skipNextNewLine = false;
36 |
37 | if (this.inForeignContent) {
38 | this.handler.onCharacter({
39 | type: Token.TokenType.CHARACTER,
40 | chars: REPLACEMENT_CHARACTER,
41 | location: token.location,
42 | });
43 | } else {
44 | this.handler.onNullCharacter(token);
45 | }
46 | }
47 |
48 | /** @internal */
49 | onWhitespaceCharacter(token: Token.CharacterToken): void {
50 | if (this.skipNextNewLine && token.chars.charCodeAt(0) === LINE_FEED_CODE_POINT) {
51 | this.skipNextNewLine = false;
52 |
53 | if (token.chars.length === 1) {
54 | return;
55 | }
56 |
57 | token.chars = token.chars.substr(1);
58 | }
59 |
60 | this.handler.onWhitespaceCharacter(token);
61 | }
62 |
63 | /** @internal */
64 | onCharacter(token: Token.CharacterToken): void {
65 | this.skipNextNewLine = false;
66 | this.handler.onCharacter(token);
67 | }
68 |
69 | /** @internal */
70 | onComment(token: Token.CommentToken): void {
71 | this.skipNextNewLine = false;
72 | this.handler.onComment(token);
73 | }
74 |
75 | /** @internal */
76 | onDoctype(token: Token.DoctypeToken): void {
77 | this.skipNextNewLine = false;
78 | this.handler.onDoctype(token);
79 | }
80 |
81 | /** @internal */
82 | onEof(token: Token.EOFToken): void {
83 | this.skipNextNewLine = false;
84 | this.handler.onEof(token);
85 | }
86 |
87 | //Namespace stack mutations
88 | private _enterNamespace(namespace: html.NS): void {
89 | this.namespaceStack.unshift(namespace);
90 | this.inForeignContent = namespace !== html.NS.HTML;
91 | this.tokenizer.inForeignNode = this.inForeignContent;
92 | }
93 |
94 | private _leaveCurrentNamespace(): void {
95 | this.namespaceStack.shift();
96 | this.inForeignContent = this.namespaceStack[0] !== html.NS.HTML;
97 | this.tokenizer.inForeignNode = this.inForeignContent;
98 | }
99 |
100 | //Token handlers
101 | private _ensureTokenizerMode(tn: html.TAG_ID): void {
102 | switch (tn) {
103 | case $.TEXTAREA:
104 | case $.TITLE: {
105 | this.tokenizer.state = TokenizerMode.RCDATA;
106 | break;
107 | }
108 | case $.PLAINTEXT: {
109 | this.tokenizer.state = TokenizerMode.PLAINTEXT;
110 | break;
111 | }
112 | case $.SCRIPT: {
113 | this.tokenizer.state = TokenizerMode.SCRIPT_DATA;
114 | break;
115 | }
116 | case $.STYLE:
117 | case $.IFRAME:
118 | case $.XMP:
119 | case $.NOEMBED:
120 | case $.NOFRAMES:
121 | case $.NOSCRIPT: {
122 | this.tokenizer.state = TokenizerMode.RAWTEXT;
123 | break;
124 | }
125 | default:
126 | // Do nothing
127 | }
128 | }
129 |
130 | /** @internal */
131 | onStartTag(token: Token.TagToken): void {
132 | let tn = token.tagID;
133 |
134 | switch (tn) {
135 | case $.SVG: {
136 | this._enterNamespace(html.NS.SVG);
137 | break;
138 | }
139 | case $.MATH: {
140 | this._enterNamespace(html.NS.MATHML);
141 | break;
142 | }
143 | default:
144 | // Do nothing
145 | }
146 |
147 | if (this.inForeignContent) {
148 | if (foreignContent.causesExit(token)) {
149 | this._leaveCurrentNamespace();
150 | } else {
151 | const currentNs = this.namespaceStack[0];
152 |
153 | if (currentNs === html.NS.MATHML) {
154 | foreignContent.adjustTokenMathMLAttrs(token);
155 | } else if (currentNs === html.NS.SVG) {
156 | foreignContent.adjustTokenSVGTagName(token);
157 | foreignContent.adjustTokenSVGAttrs(token);
158 | }
159 |
160 | foreignContent.adjustTokenXMLAttrs(token);
161 |
162 | tn = token.tagID;
163 |
164 | if (!token.selfClosing && foreignContent.isIntegrationPoint(tn, currentNs, token.attrs)) {
165 | this._enterNamespace(html.NS.HTML);
166 | }
167 | }
168 | } else {
169 | switch (tn) {
170 | case $.PRE:
171 | case $.TEXTAREA:
172 | case $.LISTING: {
173 | this.skipNextNewLine = true;
174 | break;
175 | }
176 | case $.IMAGE: {
177 | token.tagName = html.TAG_NAMES.IMG;
178 | token.tagID = $.IMG;
179 | break;
180 | }
181 | default:
182 | // Do nothing
183 | }
184 |
185 | this._ensureTokenizerMode(tn);
186 | }
187 |
188 | this.handler.onStartTag(token);
189 | }
190 |
191 | /** @internal */
192 | onEndTag(token: Token.TagToken): void {
193 | let tn = token.tagID;
194 |
195 | if (!this.inForeignContent) {
196 | const previousNs = this.namespaceStack[1];
197 |
198 | if (previousNs === html.NS.SVG) {
199 | const adjustedTagName = foreignContent.SVG_TAG_NAMES_ADJUSTMENT_MAP.get(token.tagName);
200 |
201 | if (adjustedTagName) {
202 | tn = html.getTagID(adjustedTagName);
203 | }
204 | }
205 |
206 | //NOTE: check for exit from integration point
207 | if (foreignContent.isIntegrationPoint(tn, previousNs, token.attrs)) {
208 | this._leaveCurrentNamespace();
209 | }
210 | } else if (
211 | (tn === $.SVG && this.namespaceStack[0] === html.NS.SVG) ||
212 | (tn === $.MATH && this.namespaceStack[0] === html.NS.MATHML)
213 | ) {
214 | this._leaveCurrentNamespace();
215 | }
216 |
217 | // NOTE: adjust end tag name as well for consistency
218 | if (this.namespaceStack[0] === html.NS.SVG) {
219 | foreignContent.adjustTokenSVGTagName(token);
220 | }
221 |
222 | this.handler.onEndTag(token);
223 | }
224 | }
225 |
--------------------------------------------------------------------------------
/packages/parse5-sax-parser/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "parse5-sax-parser",
3 | "type": "module",
4 | "description": "Streaming SAX-style HTML parser.",
5 | "version": "7.0.0",
6 | "author": "Ivan Nikulin (https://github.com/inikulin)",
7 | "contributors": "https://github.com/inikulin/parse5/graphs/contributors",
8 | "homepage": "https://parse5.js.org",
9 | "funding": "https://github.com/inikulin/parse5?sponsor=1",
10 | "keywords": [
11 | "parse5",
12 | "parser",
13 | "stream",
14 | "streaming",
15 | "SAX"
16 | ],
17 | "license": "MIT",
18 | "main": "dist/index.js",
19 | "module": "dist/index.js",
20 | "types": "dist/index.d.ts",
21 | "exports": "./dist/index.js",
22 | "dependencies": {
23 | "parse5": "^7.0.0"
24 | },
25 | "repository": {
26 | "type": "git",
27 | "url": "git://github.com/inikulin/parse5.git"
28 | },
29 | "files": [
30 | "dist/**/*.js",
31 | "dist/**/*.d.ts"
32 | ]
33 | }
34 |
--------------------------------------------------------------------------------
/packages/parse5-sax-parser/test/location-info.test.ts:
--------------------------------------------------------------------------------
1 | import { it, assert, describe } from 'vitest';
2 | import { SAXParser } from '../lib/index.js';
3 | import { loadSAXParserTestData } from 'parse5-test-utils/utils/load-sax-parser-test-data.js';
4 | import { writeChunkedToStream } from 'parse5-test-utils/utils/common.js';
5 | import type { Token } from 'parse5';
6 |
7 | function assertLocation({ sourceCodeLocation }: { sourceCodeLocation: Token.Location }): void {
8 | assert.strictEqual(typeof sourceCodeLocation.startLine, 'number');
9 | assert.strictEqual(typeof sourceCodeLocation.startCol, 'number');
10 | assert.strictEqual(typeof sourceCodeLocation.startOffset, 'number');
11 | assert.strictEqual(typeof sourceCodeLocation.endOffset, 'number');
12 | assert.ok(sourceCodeLocation.startOffset < sourceCodeLocation.endOffset);
13 | }
14 |
15 | describe('location-info', () => {
16 | it('Location info (SAX)', () => {
17 | for (const test of loadSAXParserTestData()) {
18 | //NOTE: we've already tested the correctness of the location info with the Tokenizer tests.
19 | //So here we just check that SAXParser provides this info in the handlers.
20 | const parser = new SAXParser({ sourceCodeLocationInfo: true });
21 |
22 | parser.on('startTag', assertLocation);
23 | parser.on('endTag', assertLocation);
24 | parser.on('doctype', assertLocation);
25 | parser.on('comment', assertLocation);
26 | parser.on('text', assertLocation);
27 |
28 | writeChunkedToStream(test.src, parser);
29 | }
30 | });
31 |
32 | it('Regression - location info for text (GH-153, GH-266)', () => {
33 | const html = 'Here is a title ';
34 | const parser = new SAXParser({ sourceCodeLocationInfo: true });
35 |
36 | parser.on('text', ({ sourceCodeLocation }) => {
37 | assert.deepStrictEqual(sourceCodeLocation, {
38 | startLine: 1,
39 | startCol: 35,
40 | startOffset: 34,
41 | endLine: 1,
42 | endCol: 50,
43 | endOffset: 49,
44 | });
45 | });
46 |
47 | parser.end(html);
48 | });
49 | });
50 |
--------------------------------------------------------------------------------
/packages/parse5-sax-parser/test/parser-feedback-simulator.test.ts:
--------------------------------------------------------------------------------
1 | import { generateTokenizationTests } from 'parse5-test-utils/utils/generate-tokenization-tests.js';
2 | import { ParserFeedbackSimulator } from '../lib/parser-feedback-simulator.js';
3 |
4 | const feedbackPath = new URL('../../../test/data/parser-feedback', import.meta.url);
5 |
6 | generateTokenizationTests(
7 | 'ParserFeedbackSimulator',
8 | feedbackPath.pathname,
9 | (handler) => new ParserFeedbackSimulator({}, handler).tokenizer,
10 | );
11 |
--------------------------------------------------------------------------------
/packages/parse5-sax-parser/test/sax-parser.test.ts:
--------------------------------------------------------------------------------
import { it, assert, describe } from 'vitest';
import * as fs from 'node:fs';
import type { SAXParserOptions } from '../lib/index.js';
import { SAXParser } from '../lib/index.js';
import { loadSAXParserTestData } from 'parse5-test-utils/utils/load-sax-parser-test-data.js';
import {
    finished,
    getStringDiffMsg,
    writeChunkedToStream,
    removeNewLines,
    WritableStreamStub,
} from 'parse5-test-utils/utils/common.js';

// Normalizes serialized HTML so equivalent outputs compare equal
// (strip whitespace/newlines, unify quotes, lowercase).
function sanitizeForComparison(str: string): string {
    return removeNewLines(str).replace(/\s/g, '').replace(/'/g, '"').toLowerCase();
}

// Builds a test that re-serializes `html` from SAXParser events and compares
// the result against `expected`.
// NOTE(review): the return annotation was presumably `Promise<void>` before
// the generic argument was lost in extraction — confirm against upstream.
function createBasicTest(html: string, expected: string, options?: SAXParserOptions) {
    return async function (): Promise {
        //NOTE: the idea of the test is to serialize back given HTML using SAXParser handlers
        let actual = '';
        const parser = new SAXParser(options);

        // NOTE(review): the body of this handler (original lines 25-37, which
        // rebuilt the '<!DOCTYPE …>' string from name/publicId/systemId) was
        // lost during extraction; the dangling template literal below is the
        // surviving fragment — restore from the original repository.
        parser.on('doctype', ({ name, publicId, systemId }) => {
            actual += `';
        });

        parser.on('startTag', ({ tagName, attrs, selfClosing }) => {
            actual += `<${tagName}`;
            for (const attr of attrs) {
                actual += ` ${attr.name}="${attr.value}"`;
            }
            actual += selfClosing ? '/>' : '>';
        });

        // NOTE(review): presumably `</${tagName}>` before the '</' was stripped
        // during extraction — confirm against upstream.
        parser.on('endTag', ({ tagName }) => {
            actual += `${tagName}>`;
        });

        parser.on('text', ({ text }) => {
            actual += text;
        });

        // NOTE(review): presumably `<!--${text}-->` before the comment markup
        // was stripped during extraction — confirm against upstream.
        parser.on('comment', ({ text }) => {
            actual += ``;
        });

        writeChunkedToStream(html, parser);

        await finished(parser);

        expected = sanitizeForComparison(expected);
        actual = sanitizeForComparison(actual);

        //NOTE: use ok assertion, so output will not be polluted by the whole content of the strings
        assert.ok(actual === expected, getStringDiffMsg(actual, expected));
    };
}

const hugePage = new URL('../../../test/data/huge-page/huge-page.html', import.meta.url);

describe('SAX parser', () => {
    //Basic tests
    for (const [idx, data] of loadSAXParserTestData().entries())
        it(`${idx + 1}.${data.name}`, createBasicTest(data.src, data.expected));

    it('Piping and .stop()', async () => {
        const parser = new SAXParser();
        const writable = new WritableStreamStub();
        let handlerCallCount = 0;

        // Stop the parser after the 10th emitted token; piping must continue
        // to pass the raw input through untouched.
        function handler(): void {
            handlerCallCount++;

            if (handlerCallCount === 10) {
                parser.stop();
            }
        }

        fs.createReadStream(hugePage, 'utf8').pipe(parser).pipe(writable);

        parser.on('startTag', handler);
        parser.on('endTag', handler);
        parser.on('doctype', handler);
        parser.on('comment', handler);
        parser.on('text', handler);

        await finished(writable);

        const expected = fs.readFileSync(hugePage).toString();

        assert.strictEqual(handlerCallCount, 10);
        assert.strictEqual(writable.writtenData, expected);
    });

    it('Parser silently exits on big files (GH-97)', () => {
        const parser = new SAXParser();

        fs.createReadStream(hugePage, 'utf8').pipe(parser);

        //NOTE: This is a smoke test - in case of regression it will fail with timeout.
        return finished(parser);
    });

    it('Last text chunk must be flushed (GH-271)', async () => {
        const parser = new SAXParser();
        let foundText = false;

        parser.on('text', ({ text }) => {
            foundText = true;
            assert.strictEqual(text, 'text');
        });

        parser.write('text');
        parser.end();

        await finished(parser);

        assert.ok(foundText);
    });

    it('Should not accept binary input (GH-269)', () => {
        const stream = new SAXParser();
        const buf = Buffer.from('test');

        // The stream works on decoded strings only; Buffers must be rejected.
        assert.throws(() => stream.write(buf), TypeError);
    });

    it('Should treat NULL characters as normal text', async () => {
        const parser = new SAXParser();
        let foundText = false;

        parser.on('text', ({ text }) => {
            foundText = true;
            assert.strictEqual(text, '\0');
        });

        parser.write('\0');
        parser.end();

        await finished(parser);

        assert.strictEqual(foundText, true);
    });
});
158 |
--------------------------------------------------------------------------------
/packages/parse5-sax-parser/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "../../tsconfig.json",
3 | "compilerOptions": {
4 | "rootDir": "lib",
5 | "outDir": "dist"
6 | },
7 | "include": ["**/*.ts"],
8 | "exclude": ["**/*.test.ts", "dist", "test"]
9 | }
10 |
--------------------------------------------------------------------------------
/packages/parse5-sax-parser/typedoc.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": ["../../typedoc.base.json"],
3 | "entryPoints": ["lib/index.ts"]
4 | }
5 |
--------------------------------------------------------------------------------
/packages/parse5/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013-2019 Ivan Nikulin (ifaaan@gmail.com, https://github.com/inikulin)
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/packages/parse5/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
parse5
9 | HTML parser and serializer.
10 |
11 |
12 |
13 |
14 | npm install --save parse5
15 |
16 |
17 |
18 |
19 | 📖 Documentation 📖
20 |
21 |
22 | ---
23 |
24 |
25 | List of parse5 toolset packages
26 |
27 |
28 |
29 | GitHub
30 |
31 |
32 |
33 | Online playground
34 |
35 |
36 |
37 | Changelog
38 |
39 |
--------------------------------------------------------------------------------
/packages/parse5/lib/common/doctype.ts:
--------------------------------------------------------------------------------
1 | import { DOCUMENT_MODE } from './html.js';
2 | import type { DoctypeToken } from './token.js';
3 |
//Const
const VALID_DOCTYPE_NAME = 'html';
const VALID_SYSTEM_ID = 'about:legacy-compat';
const QUIRKS_MODE_SYSTEM_ID = 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd';

//NOTE: public ID prefixes that trigger quirks mode. Compared against the
//lowercased public identifier (see getDocumentMode below).
const QUIRKS_MODE_PUBLIC_ID_PREFIXES = [
    '+//silmaril//dtd html pro v0r11 19970101//',
    '-//as//dtd html 3.0 aswedit + extensions//',
    '-//advasoft ltd//dtd html 3.0 aswedit + extensions//',
    '-//ietf//dtd html 2.0 level 1//',
    '-//ietf//dtd html 2.0 level 2//',
    '-//ietf//dtd html 2.0 strict level 1//',
    '-//ietf//dtd html 2.0 strict level 2//',
    '-//ietf//dtd html 2.0 strict//',
    '-//ietf//dtd html 2.0//',
    '-//ietf//dtd html 2.1e//',
    '-//ietf//dtd html 3.0//',
    '-//ietf//dtd html 3.2 final//',
    '-//ietf//dtd html 3.2//',
    '-//ietf//dtd html 3//',
    '-//ietf//dtd html level 0//',
    '-//ietf//dtd html level 1//',
    '-//ietf//dtd html level 2//',
    '-//ietf//dtd html level 3//',
    '-//ietf//dtd html strict level 0//',
    '-//ietf//dtd html strict level 1//',
    '-//ietf//dtd html strict level 2//',
    '-//ietf//dtd html strict level 3//',
    '-//ietf//dtd html strict//',
    '-//ietf//dtd html//',
    '-//metrius//dtd metrius presentational//',
    '-//microsoft//dtd internet explorer 2.0 html strict//',
    '-//microsoft//dtd internet explorer 2.0 html//',
    '-//microsoft//dtd internet explorer 2.0 tables//',
    '-//microsoft//dtd internet explorer 3.0 html strict//',
    '-//microsoft//dtd internet explorer 3.0 html//',
    '-//microsoft//dtd internet explorer 3.0 tables//',
    '-//netscape comm. corp.//dtd html//',
    '-//netscape comm. corp.//dtd strict html//',
    "-//o'reilly and associates//dtd html 2.0//",
    "-//o'reilly and associates//dtd html extended 1.0//",
    "-//o'reilly and associates//dtd html extended relaxed 1.0//",
    '-//sq//dtd html 2.0 hotmetal + extensions//',
    '-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//',
    '-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//',
    '-//spyglass//dtd html 2.0 extended//',
    '-//sun microsystems corp.//dtd hotjava html//',
    '-//sun microsystems corp.//dtd hotjava strict html//',
    '-//w3c//dtd html 3 1995-03-24//',
    '-//w3c//dtd html 3.2 draft//',
    '-//w3c//dtd html 3.2 final//',
    '-//w3c//dtd html 3.2//',
    '-//w3c//dtd html 3.2s draft//',
    '-//w3c//dtd html 4.0 frameset//',
    '-//w3c//dtd html 4.0 transitional//',
    '-//w3c//dtd html experimental 19960712//',
    '-//w3c//dtd html experimental 970421//',
    '-//w3c//dtd w3 html//',
    '-//w3o//dtd w3 html 3.0//',
    '-//webtechs//dtd mozilla html 2.0//',
    '-//webtechs//dtd mozilla html//',
];

//NOTE: the two HTML 4.01 prefixes below trigger quirks mode only when the
//doctype has no system identifier (see the prefix selection in getDocumentMode).
const QUIRKS_MODE_NO_SYSTEM_ID_PUBLIC_ID_PREFIXES = [
    ...QUIRKS_MODE_PUBLIC_ID_PREFIXES,
    '-//w3c//dtd html 4.01 frameset//',
    '-//w3c//dtd html 4.01 transitional//',
];

//NOTE: exact (lowercased) public identifiers that trigger quirks mode.
const QUIRKS_MODE_PUBLIC_IDS = new Set([
    '-//w3o//dtd w3 html strict 3.0//en//',
    '-/w3c/dtd html 4.0 transitional/en',
    'html',
]);
const LIMITED_QUIRKS_PUBLIC_ID_PREFIXES = ['-//w3c//dtd xhtml 1.0 frameset//', '-//w3c//dtd xhtml 1.0 transitional//'];

//NOTE: the HTML 4.01 prefixes additionally trigger limited-quirks mode, but only
//when a system identifier is present (see getDocumentMode).
const LIMITED_QUIRKS_WITH_SYSTEM_ID_PUBLIC_ID_PREFIXES = [
    ...LIMITED_QUIRKS_PUBLIC_ID_PREFIXES,
    '-//w3c//dtd html 4.01 frameset//',
    '-//w3c//dtd html 4.01 transitional//',
];
85 |
86 | //Utils
87 | function hasPrefix(publicId: string, prefixes: string[]): boolean {
88 | return prefixes.some((prefix) => publicId.startsWith(prefix));
89 | }
90 |
91 | //API
92 | export function isConforming(token: DoctypeToken): boolean {
93 | return (
94 | token.name === VALID_DOCTYPE_NAME &&
95 | token.publicId === null &&
96 | (token.systemId === null || token.systemId === VALID_SYSTEM_ID)
97 | );
98 | }
99 |
100 | export function getDocumentMode(token: DoctypeToken): DOCUMENT_MODE {
101 | if (token.name !== VALID_DOCTYPE_NAME) {
102 | return DOCUMENT_MODE.QUIRKS;
103 | }
104 |
105 | const { systemId } = token;
106 |
107 | if (systemId && systemId.toLowerCase() === QUIRKS_MODE_SYSTEM_ID) {
108 | return DOCUMENT_MODE.QUIRKS;
109 | }
110 |
111 | let { publicId } = token;
112 |
113 | if (publicId !== null) {
114 | publicId = publicId.toLowerCase();
115 |
116 | if (QUIRKS_MODE_PUBLIC_IDS.has(publicId)) {
117 | return DOCUMENT_MODE.QUIRKS;
118 | }
119 |
120 | let prefixes = systemId === null ? QUIRKS_MODE_NO_SYSTEM_ID_PUBLIC_ID_PREFIXES : QUIRKS_MODE_PUBLIC_ID_PREFIXES;
121 |
122 | if (hasPrefix(publicId, prefixes)) {
123 | return DOCUMENT_MODE.QUIRKS;
124 | }
125 |
126 | prefixes =
127 | systemId === null ? LIMITED_QUIRKS_PUBLIC_ID_PREFIXES : LIMITED_QUIRKS_WITH_SYSTEM_ID_PUBLIC_ID_PREFIXES;
128 |
129 | if (hasPrefix(publicId, prefixes)) {
130 | return DOCUMENT_MODE.LIMITED_QUIRKS;
131 | }
132 | }
133 |
134 | return DOCUMENT_MODE.NO_QUIRKS;
135 | }
136 |
--------------------------------------------------------------------------------
/packages/parse5/lib/common/error-codes.ts:
--------------------------------------------------------------------------------
1 | import type { Location } from './token.js';
2 |
/** A parse error together with the location at which it occurred. */
export interface ParserError extends Location {
    code: ERR;
}

/** Callback invoked for each error reported during parsing. */
export type ParserErrorHandler = (error: ParserError) => void;

/**
 * Parse error codes.
 *
 * NOTE(review): the string values appear to follow the parse-error names from
 * the WHATWG HTML spec — confirm against the spec's error list when adding codes.
 */
export enum ERR {
    controlCharacterInInputStream = 'control-character-in-input-stream',
    noncharacterInInputStream = 'noncharacter-in-input-stream',
    surrogateInInputStream = 'surrogate-in-input-stream',
    nonVoidHtmlElementStartTagWithTrailingSolidus = 'non-void-html-element-start-tag-with-trailing-solidus',
    endTagWithAttributes = 'end-tag-with-attributes',
    endTagWithTrailingSolidus = 'end-tag-with-trailing-solidus',
    unexpectedSolidusInTag = 'unexpected-solidus-in-tag',
    unexpectedNullCharacter = 'unexpected-null-character',
    unexpectedQuestionMarkInsteadOfTagName = 'unexpected-question-mark-instead-of-tag-name',
    invalidFirstCharacterOfTagName = 'invalid-first-character-of-tag-name',
    unexpectedEqualsSignBeforeAttributeName = 'unexpected-equals-sign-before-attribute-name',
    missingEndTagName = 'missing-end-tag-name',
    unexpectedCharacterInAttributeName = 'unexpected-character-in-attribute-name',
    unknownNamedCharacterReference = 'unknown-named-character-reference',
    missingSemicolonAfterCharacterReference = 'missing-semicolon-after-character-reference',
    unexpectedCharacterAfterDoctypeSystemIdentifier = 'unexpected-character-after-doctype-system-identifier',
    unexpectedCharacterInUnquotedAttributeValue = 'unexpected-character-in-unquoted-attribute-value',
    eofBeforeTagName = 'eof-before-tag-name',
    eofInTag = 'eof-in-tag',
    missingAttributeValue = 'missing-attribute-value',
    missingWhitespaceBetweenAttributes = 'missing-whitespace-between-attributes',
    missingWhitespaceAfterDoctypePublicKeyword = 'missing-whitespace-after-doctype-public-keyword',
    missingWhitespaceBetweenDoctypePublicAndSystemIdentifiers = 'missing-whitespace-between-doctype-public-and-system-identifiers',
    missingWhitespaceAfterDoctypeSystemKeyword = 'missing-whitespace-after-doctype-system-keyword',
    missingQuoteBeforeDoctypePublicIdentifier = 'missing-quote-before-doctype-public-identifier',
    missingQuoteBeforeDoctypeSystemIdentifier = 'missing-quote-before-doctype-system-identifier',
    missingDoctypePublicIdentifier = 'missing-doctype-public-identifier',
    missingDoctypeSystemIdentifier = 'missing-doctype-system-identifier',
    abruptDoctypePublicIdentifier = 'abrupt-doctype-public-identifier',
    abruptDoctypeSystemIdentifier = 'abrupt-doctype-system-identifier',
    cdataInHtmlContent = 'cdata-in-html-content',
    incorrectlyOpenedComment = 'incorrectly-opened-comment',
    eofInScriptHtmlCommentLikeText = 'eof-in-script-html-comment-like-text',
    eofInDoctype = 'eof-in-doctype',
    nestedComment = 'nested-comment',
    abruptClosingOfEmptyComment = 'abrupt-closing-of-empty-comment',
    eofInComment = 'eof-in-comment',
    incorrectlyClosedComment = 'incorrectly-closed-comment',
    eofInCdata = 'eof-in-cdata',
    absenceOfDigitsInNumericCharacterReference = 'absence-of-digits-in-numeric-character-reference',
    nullCharacterReference = 'null-character-reference',
    surrogateCharacterReference = 'surrogate-character-reference',
    characterReferenceOutsideUnicodeRange = 'character-reference-outside-unicode-range',
    controlCharacterReference = 'control-character-reference',
    noncharacterCharacterReference = 'noncharacter-character-reference',
    missingWhitespaceBeforeDoctypeName = 'missing-whitespace-before-doctype-name',
    missingDoctypeName = 'missing-doctype-name',
    invalidCharacterSequenceAfterDoctypeName = 'invalid-character-sequence-after-doctype-name',
    duplicateAttribute = 'duplicate-attribute',
    nonConformingDoctype = 'non-conforming-doctype',
    missingDoctype = 'missing-doctype',
    misplacedDoctype = 'misplaced-doctype',
    endTagWithoutMatchingOpenElement = 'end-tag-without-matching-open-element',
    closingOfElementWithOpenChildElements = 'closing-of-element-with-open-child-elements',
    disallowedContentInNoscriptInHead = 'disallowed-content-in-noscript-in-head',
    openElementsLeftAfterEof = 'open-elements-left-after-eof',
    abandonedHeadElementChild = 'abandoned-head-element-child',
    misplacedStartTagForHeadElement = 'misplaced-start-tag-for-head-element',
    nestedNoscriptInHead = 'nested-noscript-in-head',
    eofInElementThatCanContainOnlyText = 'eof-in-element-that-can-contain-only-text',
}
71 |
--------------------------------------------------------------------------------
/packages/parse5/lib/common/token.ts:
--------------------------------------------------------------------------------
1 | import type { TAG_ID } from './html.js';
2 |
/** The kinds of tokens the tokenizer emits. */
export enum TokenType {
    CHARACTER,
    NULL_CHARACTER,
    WHITESPACE_CHARACTER,
    START_TAG,
    END_TAG,
    COMMENT,
    DOCTYPE,
    EOF,
    //NOTE(review): presumably signals that the tokenizer suspended mid-input
    //(streaming) — confirm against the tokenizer implementation.
    HIBERNATION,
}

export interface Location {
    /** One-based line index of the first character. */
    startLine: number;
    /** One-based column index of the first character. */
    startCol: number;
    /** Zero-based first character index. */
    startOffset: number;
    /** One-based line index of the last character. */
    endLine: number;
    /** One-based column index of the last character. Points directly *after* the last character. */
    endCol: number;
    /** Zero-based last character index. Points directly *after* the last character. */
    endOffset: number;
}
29 |
30 | export interface LocationWithAttributes extends Location {
31 | /** Start tag attributes' location info. */
32 | attrs?: Record;
33 | }
34 |
export interface ElementLocation extends LocationWithAttributes {
    /** Element's start tag location info. */
    startTag?: Location;
    /**
     * Element's end tag location info.
     * This property is undefined, if the element has no closing tag.
     */
    endTag?: Location;
}

/** Fields common to every token. */
interface TokenBase {
    readonly type: TokenType;
    //NOTE(review): presumably `null` when location tracking is disabled — confirm.
    location: Location | null;
}

export interface DoctypeToken extends TokenBase {
    readonly type: TokenType.DOCTYPE;
    name: string | null;
    forceQuirks: boolean;
    publicId: string | null;
    systemId: string | null;
}

export interface Attribute {
    /** The name of the attribute. */
    name: string;
    /** The namespace of the attribute. */
    namespace?: string;
    /** The namespace-related prefix of the attribute. */
    prefix?: string;
    /** The value of the attribute. */
    value: string;
}

export interface TagToken extends TokenBase {
    readonly type: TokenType.START_TAG | TokenType.END_TAG;
    tagName: string;
    /** Used to cache the ID of the tag name. */
    tagID: TAG_ID;
    selfClosing: boolean;
    ackSelfClosing: boolean;
    attrs: Attribute[];
    location: LocationWithAttributes | null;
}
79 |
80 | export function getTokenAttr(token: TagToken, attrName: string): string | null {
81 | for (let i = token.attrs.length - 1; i >= 0; i--) {
82 | if (token.attrs[i].name === attrName) {
83 | return token.attrs[i].value;
84 | }
85 | }
86 |
87 | return null;
88 | }
89 |
export interface CommentToken extends TokenBase {
    readonly type: TokenType.COMMENT;
    data: string;
}

export interface EOFToken extends TokenBase {
    readonly type: TokenType.EOF;
}

export interface CharacterToken extends TokenBase {
    //NOTE: unlike the other token interfaces, `type` is mutable here —
    //presumably so character tokens can be reclassified between the three
    //character kinds; confirm in the tokenizer.
    type: TokenType.CHARACTER | TokenType.NULL_CHARACTER | TokenType.WHITESPACE_CHARACTER;
    chars: string;
}

/** Union of all token shapes produced by the tokenizer. */
export type Token = DoctypeToken | TagToken | CommentToken | EOFToken | CharacterToken;
105 |
--------------------------------------------------------------------------------
/packages/parse5/lib/common/unicode.ts:
--------------------------------------------------------------------------------
//NOTE: the per-plane U+xxFFFE/U+xxFFFF noncharacter code points
//(consumed by isUndefinedCodePoint below).
const UNDEFINED_CODE_POINTS = new Set([
    0xff_fe, 0xff_ff, 0x1_ff_fe, 0x1_ff_ff, 0x2_ff_fe, 0x2_ff_ff, 0x3_ff_fe, 0x3_ff_ff, 0x4_ff_fe, 0x4_ff_ff, 0x5_ff_fe,
    0x5_ff_ff, 0x6_ff_fe, 0x6_ff_ff, 0x7_ff_fe, 0x7_ff_ff, 0x8_ff_fe, 0x8_ff_ff, 0x9_ff_fe, 0x9_ff_ff, 0xa_ff_fe,
    0xa_ff_ff, 0xb_ff_fe, 0xb_ff_ff, 0xc_ff_fe, 0xc_ff_ff, 0xd_ff_fe, 0xd_ff_ff, 0xe_ff_fe, 0xe_ff_ff, 0xf_ff_fe,
    0xf_ff_ff, 0x10_ff_fe, 0x10_ff_ff,
]);

export const REPLACEMENT_CHARACTER = '\uFFFD';

/** Code points of special significance to the tokenizer; EOF is modelled as -1. */
export enum CODE_POINTS {
    EOF = -1,
    NULL = 0x00,
    TABULATION = 0x09,
    CARRIAGE_RETURN = 0x0d,
    LINE_FEED = 0x0a,
    FORM_FEED = 0x0c,
    SPACE = 0x20,
    EXCLAMATION_MARK = 0x21,
    QUOTATION_MARK = 0x22,
    AMPERSAND = 0x26,
    APOSTROPHE = 0x27,
    HYPHEN_MINUS = 0x2d,
    SOLIDUS = 0x2f,
    DIGIT_0 = 0x30,
    DIGIT_9 = 0x39,
    SEMICOLON = 0x3b,
    LESS_THAN_SIGN = 0x3c,
    EQUALS_SIGN = 0x3d,
    GREATER_THAN_SIGN = 0x3e,
    QUESTION_MARK = 0x3f,
    LATIN_CAPITAL_A = 0x41,
    LATIN_CAPITAL_Z = 0x5a,
    RIGHT_SQUARE_BRACKET = 0x5d,
    GRAVE_ACCENT = 0x60,
    LATIN_SMALL_A = 0x61,
    LATIN_SMALL_Z = 0x7a,
}

/** Multi-character sequences the tokenizer matches against the input. */
export const SEQUENCES = {
    DASH_DASH: '--',
    CDATA_START: '[CDATA[',
    DOCTYPE: 'doctype',
    SCRIPT: 'script',
    PUBLIC: 'public',
    SYSTEM: 'system',
};
47 |
48 | //Surrogates
49 | export function isSurrogate(cp: number): boolean {
50 | return cp >= 0xd8_00 && cp <= 0xdf_ff;
51 | }
52 |
53 | export function isSurrogatePair(cp: number): boolean {
54 | return cp >= 0xdc_00 && cp <= 0xdf_ff;
55 | }
56 |
57 | export function getSurrogatePairCodePoint(cp1: number, cp2: number): number {
58 | return (cp1 - 0xd8_00) * 0x4_00 + 0x24_00 + cp2;
59 | }
60 |
61 | //NOTE: excluding NULL and ASCII whitespace
62 | export function isControlCodePoint(cp: number): boolean {
63 | return (
64 | (cp !== 0x20 && cp !== 0x0a && cp !== 0x0d && cp !== 0x09 && cp !== 0x0c && cp >= 0x01 && cp <= 0x1f) ||
65 | (cp >= 0x7f && cp <= 0x9f)
66 | );
67 | }
68 |
69 | export function isUndefinedCodePoint(cp: number): boolean {
70 | return (cp >= 0xfd_d0 && cp <= 0xfd_ef) || UNDEFINED_CODE_POINTS.has(cp);
71 | }
72 |
--------------------------------------------------------------------------------
/packages/parse5/lib/index.ts:
--------------------------------------------------------------------------------
1 | import { Parser, type ParserOptions } from './parser/index.js';
2 |
3 | import type { DefaultTreeAdapterMap } from './tree-adapters/default.js';
4 | import type { TreeAdapterTypeMap } from './tree-adapters/interface.js';
5 |
6 | export { type DefaultTreeAdapterMap, defaultTreeAdapter } from './tree-adapters/default.js';
7 | import type * as DefaultTreeAdapter from './tree-adapters/default.js';
8 | // eslint-disable-next-line @typescript-eslint/no-namespace
9 | export namespace DefaultTreeAdapterTypes {
10 | export type Document = DefaultTreeAdapter.Document;
11 | export type DocumentFragment = DefaultTreeAdapter.DocumentFragment;
12 | export type Element = DefaultTreeAdapter.Element;
13 | export type CommentNode = DefaultTreeAdapter.CommentNode;
14 | export type TextNode = DefaultTreeAdapter.TextNode;
15 | export type Template = DefaultTreeAdapter.Template;
16 | export type DocumentType = DefaultTreeAdapter.DocumentType;
17 | export type ParentNode = DefaultTreeAdapter.ParentNode;
18 | export type ChildNode = DefaultTreeAdapter.ChildNode;
19 | export type Node = DefaultTreeAdapter.Node;
20 | export type DefaultTreeAdapterMap = DefaultTreeAdapter.DefaultTreeAdapterMap;
21 | }
22 | export type { TreeAdapter, TreeAdapterTypeMap } from './tree-adapters/interface.js';
23 | export { type ParserOptions, /** @internal */ Parser } from './parser/index.js';
24 | export { serialize, serializeOuter, type SerializerOptions } from './serializer/index.js';
25 | export { ERR as ErrorCodes, type ParserError, type ParserErrorHandler } from './common/error-codes.js';
26 |
27 | /** @internal */
28 | export * as foreignContent from './common/foreign-content.js';
29 | export * as html from './common/html.js';
30 | export * as Token from './common/token.js';
31 | /** @internal */
32 | export { Tokenizer, type TokenizerOptions, TokenizerMode, type TokenHandler } from './tokenizer/index.js';
33 |
34 | // Shorthands
35 |
36 | /**
37 | * Parses an HTML string.
38 | *
39 | * @param html Input HTML string.
40 | * @param options Parsing options.
41 | * @returns Document
42 | *
43 | * @example
44 | *
45 | * ```js
46 | * const parse5 = require('parse5');
47 | *
48 | * const document = parse5.parse('Hi there!');
49 | *
50 | * console.log(document.childNodes[1].tagName); //> 'html'
51 | *```
52 | */
53 | export function parse(
54 | html: string,
55 | options?: ParserOptions,
56 | ): T['document'] {
57 | return Parser.parse(html, options);
58 | }
59 |
60 | /**
61 | * Parses an HTML fragment.
62 | *
63 | * @example
64 | *
65 | * ```js
66 | * const parse5 = require('parse5');
67 | *
68 | * const documentFragment = parse5.parseFragment('');
69 | *
70 | * console.log(documentFragment.childNodes[0].tagName); //> 'table'
71 | *
72 | * // Parses the html fragment in the context of the parsed element.
73 | * const trFragment = parse5.parseFragment(documentFragment.childNodes[0], 'Shake it, baby ');
74 | *
75 | * console.log(trFragment.childNodes[0].childNodes[0].tagName); //> 'td'
76 | * ```
77 | *
78 | * @param fragmentContext Parsing context element. If specified, given fragment will be parsed as if it was set to the context element's `innerHTML` property.
79 | * @param html Input HTML fragment string.
80 | * @param options Parsing options.
81 | * @returns DocumentFragment
82 | */
83 | export function parseFragment(
84 | fragmentContext: T['parentNode'] | null,
85 | html: string,
86 | options: ParserOptions,
87 | ): T['documentFragment'];
88 | export function parseFragment(
89 | html: string,
90 | options?: ParserOptions,
91 | ): T['documentFragment'];
92 | export function parseFragment(
93 | fragmentContext: T['parentNode'] | null | string,
94 | html?: string | ParserOptions,
95 | options?: ParserOptions,
96 | ): T['documentFragment'] {
97 | if (typeof fragmentContext === 'string') {
98 | options = html as ParserOptions;
99 | html = fragmentContext;
100 | fragmentContext = null;
101 | }
102 |
103 | const parser = Parser.getFragmentParser(fragmentContext, options);
104 |
105 | parser.tokenizer.write(html as string, true);
106 |
107 | return parser.getFragment();
108 | }
109 |
--------------------------------------------------------------------------------
/packages/parse5/lib/parser/formatting-element-list.ts:
--------------------------------------------------------------------------------
1 | import type { Attribute, TagToken } from '../common/token.js';
2 | import type { TreeAdapter, TreeAdapterTypeMap } from '../tree-adapters/interface.js';
3 |
4 | //Const
5 | const NOAH_ARK_CAPACITY = 3;
6 |
7 | export enum EntryType {
8 | Marker,
9 | Element,
10 | }
11 |
12 | interface MarkerEntry {
13 | type: EntryType.Marker;
14 | }
15 |
16 | export interface ElementEntry {
17 | type: EntryType.Element;
18 | element: T['element'];
19 | token: TagToken;
20 | }
21 |
22 | export type Entry = MarkerEntry | ElementEntry;
23 |
24 | const MARKER: MarkerEntry = { type: EntryType.Marker };
25 |
26 | //List of formatting elements
27 | export class FormattingElementList {
28 | entries: Entry[] = [];
29 | bookmark: Entry | null = null;
30 |
31 | constructor(private treeAdapter: TreeAdapter) {}
32 |
33 | //Noah Ark's condition
34 | //OPTIMIZATION: at first we try to find possible candidates for exclusion using
35 | //lightweight heuristics without thorough attributes check.
36 | private _getNoahArkConditionCandidates(
37 | newElement: T['element'],
38 | neAttrs: Attribute[],
39 | ): { idx: number; attrs: Attribute[] }[] {
40 | const candidates = [];
41 |
42 | const neAttrsLength = neAttrs.length;
43 | const neTagName = this.treeAdapter.getTagName(newElement);
44 | const neNamespaceURI = this.treeAdapter.getNamespaceURI(newElement);
45 |
46 | for (let i = 0; i < this.entries.length; i++) {
47 | const entry = this.entries[i];
48 |
49 | if (entry.type === EntryType.Marker) {
50 | break;
51 | }
52 |
53 | const { element } = entry;
54 |
55 | if (
56 | this.treeAdapter.getTagName(element) === neTagName &&
57 | this.treeAdapter.getNamespaceURI(element) === neNamespaceURI
58 | ) {
59 | const elementAttrs = this.treeAdapter.getAttrList(element);
60 |
61 | if (elementAttrs.length === neAttrsLength) {
62 | candidates.push({ idx: i, attrs: elementAttrs });
63 | }
64 | }
65 | }
66 |
67 | return candidates;
68 | }
69 |
70 | private _ensureNoahArkCondition(newElement: T['element']): void {
71 | if (this.entries.length < NOAH_ARK_CAPACITY) return;
72 |
73 | const neAttrs = this.treeAdapter.getAttrList(newElement);
74 | const candidates = this._getNoahArkConditionCandidates(newElement, neAttrs);
75 |
76 | if (candidates.length < NOAH_ARK_CAPACITY) return;
77 |
78 | //NOTE: build attrs map for the new element, so we can perform fast lookups
79 | const neAttrsMap = new Map(neAttrs.map((neAttr: Attribute) => [neAttr.name, neAttr.value]));
80 | let validCandidates = 0;
81 |
82 | //NOTE: remove bottommost candidates, until Noah's Ark condition will not be met
83 | for (let i = 0; i < candidates.length; i++) {
84 | const candidate = candidates[i];
85 |
86 | // We know that `candidate.attrs.length === neAttrs.length`
87 | if (candidate.attrs.every((cAttr) => neAttrsMap.get(cAttr.name) === cAttr.value)) {
88 | validCandidates += 1;
89 |
90 | if (validCandidates >= NOAH_ARK_CAPACITY) {
91 | this.entries.splice(candidate.idx, 1);
92 | }
93 | }
94 | }
95 | }
96 |
97 | //Mutations
98 | insertMarker(): void {
99 | this.entries.unshift(MARKER);
100 | }
101 |
102 | pushElement(element: T['element'], token: TagToken): void {
103 | this._ensureNoahArkCondition(element);
104 |
105 | this.entries.unshift({
106 | type: EntryType.Element,
107 | element,
108 | token,
109 | });
110 | }
111 |
112 | insertElementAfterBookmark(element: T['element'], token: TagToken): void {
113 | const bookmarkIdx = this.entries.indexOf(this.bookmark!);
114 |
115 | this.entries.splice(bookmarkIdx, 0, {
116 | type: EntryType.Element,
117 | element,
118 | token,
119 | });
120 | }
121 |
122 | removeEntry(entry: Entry): void {
123 | const entryIndex = this.entries.indexOf(entry);
124 |
125 | if (entryIndex !== -1) {
126 | this.entries.splice(entryIndex, 1);
127 | }
128 | }
129 |
130 | /**
131 | * Clears the list of formatting elements up to the last marker.
132 | *
133 | * @see https://html.spec.whatwg.org/multipage/parsing.html#clear-the-list-of-active-formatting-elements-up-to-the-last-marker
134 | */
135 | clearToLastMarker(): void {
136 | const markerIdx = this.entries.indexOf(MARKER);
137 |
138 | if (markerIdx === -1) {
139 | this.entries.length = 0;
140 | } else {
141 | this.entries.splice(0, markerIdx + 1);
142 | }
143 | }
144 |
145 | //Search
146 | getElementEntryInScopeWithTagName(tagName: string): ElementEntry | null {
147 | const entry = this.entries.find(
148 | (entry) => entry.type === EntryType.Marker || this.treeAdapter.getTagName(entry.element) === tagName,
149 | );
150 |
151 | return entry && entry.type === EntryType.Element ? entry : null;
152 | }
153 |
154 | getElementEntry(element: T['element']): ElementEntry | undefined {
155 | return this.entries.find(
156 | (entry): entry is ElementEntry => entry.type === EntryType.Element && entry.element === element,
157 | );
158 | }
159 | }
160 |
--------------------------------------------------------------------------------
/packages/parse5/lib/parser/index.test.ts:
--------------------------------------------------------------------------------
1 | import { it, assert, describe, beforeEach, afterEach, vi, expect } from 'vitest';
2 | import { parseFragment, parse } from 'parse5';
3 | import type { Element, TextNode } from '../tree-adapters/default.js';
4 | import { generateParsingTests } from 'parse5-test-utils/utils/generate-parsing-tests.js';
5 | import { treeAdapters } from 'parse5-test-utils/utils/common.js';
6 | import type { Htmlparser2TreeAdapterMap } from 'parse5-htmlparser2-tree-adapter';
7 |
// Run the shared tree-construction suite against the local test data. Fragment
// tests parse with the provided context element; the rest parse a full document.
generateParsingTests(
    'parser',
    'Parser',
    {
        expectErrors: [
            //TODO(GH-448): Foreign content behaviour was updated in the HTML spec.
            //The old test suite still tests the old behaviour.
            '0.foreign-fragment',
            '1.foreign-fragment',
            '38.foreign-fragment',
            '40.foreign-fragment',
            '47.foreign-fragment',
            '48.foreign-fragment',
        ],
    },
    (test, opts) => ({
        node: test.fragmentContext ? parseFragment(test.fragmentContext, test.input, opts) : parse(test.input, opts),
    }),
);

// Same runner against the upstream html5lib-tests suite; error assertions are
// disabled there (`withoutErrors`).
generateParsingTests(
    'parser upstream',
    'Parser',
    {
        withoutErrors: true,
        suitePath: new URL('../../../../test/data/html5lib-tests/tree-construction', import.meta.url),
    },
    (test, opts) => ({
        node: test.fragmentContext ? parseFragment(test.fragmentContext, test.input, opts) : parse(test.input, opts),
    }),
);
39 |
40 | describe('parser', () => {
41 | it('Regression - HTML5 Legacy Doctype Misparsed with htmlparser2 tree adapter (GH-45)', () => {
42 | const html = 'Hi there!';
43 | const document = parse(html, { treeAdapter: treeAdapters.htmlparser2 });
44 |
45 | assert.ok(treeAdapters.htmlparser2.isDocumentTypeNode(document.childNodes[0]));
46 | assert.strictEqual(
47 | (document.childNodes[0] as Htmlparser2TreeAdapterMap['documentType']).data,
48 | '!DOCTYPE html SYSTEM "about:legacy-compat"',
49 | );
50 | });
51 |
52 | describe("Regression - Don't inherit from Object when creating collections (GH-119)", () => {
53 | beforeEach(() => {
54 | // @ts-expect-error Adding unknown prototype method
55 | Object.prototype.heyYo = 123;
56 | });
57 |
58 | afterEach(() => {
59 | // @ts-expect-error Deleting unknown prototype property
60 | delete Object.prototype.heyYo;
61 | });
62 |
63 | it('parses correctly', () => {
64 | const fragment = parseFragment('
",
151 | "input": "",
152 | "output": [
153 | [
154 | "StartTag",
155 | "math",
156 | {}
157 | ],
158 | [
159 | "StartTag",
160 | "tbody",
161 | {}
162 | ],
163 | [
164 | "StartTag",
165 | "mo",
166 | {}
167 | ],
168 | [
169 | "EndTag",
170 | "table"
171 | ]
172 | ]
173 | },
174 | {
175 | "fragmentContext": "tbody",
176 | "description": "",
177 | "input": "",
178 | "output": [
179 | [
180 | "StartTag",
181 | "math",
182 | {}
183 | ],
184 | [
185 | "StartTag",
186 | "thead",
187 | {}
188 | ],
189 | [
190 | "StartTag",
191 | "mo",
192 | {}
193 | ],
194 | [
195 | "EndTag",
196 | "table"
197 | ]
198 | ]
199 | },
200 | {
201 | "fragmentContext": "tbody",
202 | "description": "",
203 | "input": "",
204 | "output": [
205 | [
206 | "StartTag",
207 | "math",
208 | {}
209 | ],
210 | [
211 | "StartTag",
212 | "tfoot",
213 | {}
214 | ],
215 | [
216 | "StartTag",
217 | "mo",
218 | {}
219 | ],
220 | [
221 | "EndTag",
222 | "table"
223 | ]
224 | ]
225 | }
226 | ]
227 | }
--------------------------------------------------------------------------------
/test/data/parser-feedback/namespace-sensitivity.test:
--------------------------------------------------------------------------------
1 | {
2 | "tests": [
3 | {
4 | "fragmentContext": null,
5 | "description": " Foo",
6 | "input": " Foo",
7 | "output": [
8 | [
9 | "StartTag",
10 | "body",
11 | {}
12 | ],
13 | [
14 | "StartTag",
15 | "table",
16 | {}
17 | ],
18 | [
19 | "StartTag",
20 | "tr",
21 | {}
22 | ],
23 | [
24 | "StartTag",
25 | "td",
26 | {}
27 | ],
28 | [
29 | "StartTag",
30 | "svg",
31 | {}
32 | ],
33 | [
34 | "StartTag",
35 | "td",
36 | {}
37 | ],
38 | [
39 | "StartTag",
40 | "foreignObject",
41 | {}
42 | ],
43 | [
44 | "StartTag",
45 | "span",
46 | {}
47 | ],
48 | [
49 | "EndTag",
50 | "td"
51 | ],
52 | [
53 | "Character",
54 | "Foo"
55 | ]
56 | ]
57 | }
58 | ]
59 | }
--------------------------------------------------------------------------------
/test/data/parser-feedback/pending-spec-changes-plain-text-unsafe.test:
--------------------------------------------------------------------------------
1 | {
2 | "tests": [
3 | {
4 | "fragmentContext": null,
5 | "description": "\\u0000filler\\u0000text\\u0000",
6 | "input": "\u0000filler\u0000text\u0000",
7 | "output": [
8 | [
9 | "StartTag",
10 | "body",
11 | {}
12 | ],
13 | [
14 | "StartTag",
15 | "table",
16 | {}
17 | ],
18 | [
19 | "Character",
20 | "\u0000filler\u0000text\u0000"
21 | ]
22 | ]
23 | }
24 | ]
25 | }
--------------------------------------------------------------------------------
/test/data/parser-feedback/pending-spec-changes.test:
--------------------------------------------------------------------------------
1 | {
2 | "tests": [
3 | {
4 | "fragmentContext": null,
5 | "description": "",
6 | "input": "",
7 | "output": [
8 | [
9 | "StartTag",
10 | "input",
11 | {
12 | "type": "hidden"
13 | }
14 | ],
15 | [
16 | "StartTag",
17 | "frameset",
18 | {}
19 | ]
20 | ]
21 | },
22 | {
23 | "fragmentContext": null,
24 | "description": "bar",
25 | "input": "bar",
26 | "output": [
27 | [
28 | "DOCTYPE",
29 | "html",
30 | null,
31 | null,
32 | true
33 | ],
34 | [
35 | "StartTag",
36 | "table",
37 | {}
38 | ],
39 | [
40 | "StartTag",
41 | "caption",
42 | {}
43 | ],
44 | [
45 | "StartTag",
46 | "svg",
47 | {}
48 | ],
49 | [
50 | "Character",
51 | "foo"
52 | ],
53 | [
54 | "EndTag",
55 | "table"
56 | ],
57 | [
58 | "Character",
59 | "bar"
60 | ]
61 | ]
62 | },
63 | {
64 | "fragmentContext": null,
65 | "description": "",
152 | "output": [
153 | [
154 | "StartTag",
155 | "svg",
156 | {}
157 | ],
158 | [
159 | "StartTag",
160 | "tbody",
161 | {}
162 | ],
163 | [
164 | "StartTag",
165 | "title",
166 | {}
167 | ],
168 | [
169 | "EndTag",
170 | "table"
171 | ]
172 | ]
173 | },
174 | {
175 | "fragmentContext": "tbody",
176 | "description": "
",
177 | "input": "
",
178 | "output": [
179 | [
180 | "StartTag",
181 | "svg",
182 | {}
183 | ],
184 | [
185 | "StartTag",
186 | "thead",
187 | {}
188 | ],
189 | [
190 | "StartTag",
191 | "title",
192 | {}
193 | ],
194 | [
195 | "EndTag",
196 | "table"
197 | ]
198 | ]
199 | },
200 | {
201 | "fragmentContext": "tbody",
202 | "description": "
",
203 | "input": "
",
204 | "output": [
205 | [
206 | "StartTag",
207 | "svg",
208 | {}
209 | ],
210 | [
211 | "StartTag",
212 | "tfoot",
213 | {}
214 | ],
215 | [
216 | "StartTag",
217 | "title",
218 | {}
219 | ],
220 | [
221 | "EndTag",
222 | "table"
223 | ]
224 | ]
225 | }
226 | ]
227 | }
--------------------------------------------------------------------------------
/test/data/parser-feedback/tests14.test:
--------------------------------------------------------------------------------
1 | {
2 | "tests": [
3 | {
4 | "fragmentContext": null,
5 | "description": " ",
6 | "input": " ",
7 | "output": [
8 | [
9 | "DOCTYPE",
10 | "html",
11 | null,
12 | null,
13 | true
14 | ],
15 | [
16 | "StartTag",
17 | "html",
18 | {}
19 | ],
20 | [
21 | "StartTag",
22 | "body",
23 | {}
24 | ],
25 | [
26 | "StartTag",
27 | "xyz:abc",
28 | {}
29 | ],
30 | [
31 | "EndTag",
32 | "xyz:abc"
33 | ]
34 | ]
35 | },
36 | {
37 | "fragmentContext": null,
38 | "description": " ",
39 | "input": " ",
40 | "output": [
41 | [
42 | "DOCTYPE",
43 | "html",
44 | null,
45 | null,
46 | true
47 | ],
48 | [
49 | "StartTag",
50 | "html",
51 | {}
52 | ],
53 | [
54 | "StartTag",
55 | "body",
56 | {}
57 | ],
58 | [
59 | "StartTag",
60 | "xyz:abc",
61 | {}
62 | ],
63 | [
64 | "EndTag",
65 | "xyz:abc"
66 | ],
67 | [
68 | "StartTag",
69 | "span",
70 | {}
71 | ],
72 | [
73 | "EndTag",
74 | "span"
75 | ]
76 | ]
77 | },
78 | {
79 | "fragmentContext": null,
80 | "description": " ",
81 | "input": " ",
82 | "output": [
83 | [
84 | "DOCTYPE",
85 | "html",
86 | null,
87 | null,
88 | true
89 | ],
90 | [
91 | "StartTag",
92 | "html",
93 | {}
94 | ],
95 | [
96 | "StartTag",
97 | "html",
98 | {
99 | "abc:def": "gh"
100 | }
101 | ],
102 | [
103 | "StartTag",
104 | "xyz:abc",
105 | {}
106 | ],
107 | [
108 | "EndTag",
109 | "xyz:abc"
110 | ]
111 | ]
112 | },
113 | {
114 | "fragmentContext": null,
115 | "description": "",
116 | "input": "",
117 | "output": [
118 | [
119 | "DOCTYPE",
120 | "html",
121 | null,
122 | null,
123 | true
124 | ],
125 | [
126 | "StartTag",
127 | "html",
128 | {
129 | "xml:lang": "bar"
130 | }
131 | ],
132 | [
133 | "StartTag",
134 | "html",
135 | {
136 | "xml:lang": "foo"
137 | }
138 | ]
139 | ]
140 | },
141 | {
142 | "fragmentContext": null,
143 | "description": "",
144 | "input": "",
145 | "output": [
146 | [
147 | "DOCTYPE",
148 | "html",
149 | null,
150 | null,
151 | true
152 | ],
153 | [
154 | "StartTag",
155 | "html",
156 | {
157 | "123": "456"
158 | }
159 | ]
160 | ]
161 | },
162 | {
163 | "fragmentContext": null,
164 | "description": "",
165 | "input": "",
166 | "output": [
167 | [
168 | "DOCTYPE",
169 | "html",
170 | null,
171 | null,
172 | true
173 | ],
174 | [
175 | "StartTag",
176 | "html",
177 | {
178 | "123": "456"
179 | }
180 | ],
181 | [
182 | "StartTag",
183 | "html",
184 | {
185 | "789": "012"
186 | }
187 | ]
188 | ]
189 | },
190 | {
191 | "fragmentContext": null,
192 | "description": "",
193 | "input": "",
194 | "output": [
195 | [
196 | "DOCTYPE",
197 | "html",
198 | null,
199 | null,
200 | true
201 | ],
202 | [
203 | "StartTag",
204 | "html",
205 | {}
206 | ],
207 | [
208 | "StartTag",
209 | "body",
210 | {
211 | "789": "012"
212 | }
213 | ]
214 | ]
215 | }
216 | ]
217 | }
--------------------------------------------------------------------------------
/test/data/parser-feedback/tests24.test:
--------------------------------------------------------------------------------
1 | {
2 | "tests": [
3 | {
4 | "fragmentContext": null,
5 | "description": "≂̸",
6 | "input": "≂̸",
7 | "output": [
8 | [
9 | "DOCTYPE",
10 | "html",
11 | null,
12 | null,
13 | true
14 | ],
15 | [
16 | "Character",
17 | "≂̸"
18 | ]
19 | ]
20 | },
21 | {
22 | "fragmentContext": null,
23 | "description": "≂̸A",
24 | "input": "≂̸A",
25 | "output": [
26 | [
27 | "DOCTYPE",
28 | "html",
29 | null,
30 | null,
31 | true
32 | ],
33 | [
34 | "Character",
35 | "≂̸A"
36 | ]
37 | ]
38 | },
39 | {
40 | "fragmentContext": null,
41 | "description": "  ",
42 | "input": "  ",
43 | "output": [
44 | [
45 | "DOCTYPE",
46 | "html",
47 | null,
48 | null,
49 | true
50 | ],
51 | [
52 | "Character",
53 | " "
54 | ]
55 | ]
56 | },
57 | {
58 | "fragmentContext": null,
59 | "description": "  A",
60 | "input": "  A",
61 | "output": [
62 | [
63 | "DOCTYPE",
64 | "html",
65 | null,
66 | null,
67 | true
68 | ],
69 | [
70 | "Character",
71 | " A"
72 | ]
73 | ]
74 | },
75 | {
76 | "fragmentContext": null,
77 | "description": "⊂⃒",
78 | "input": "⊂⃒",
79 | "output": [
80 | [
81 | "DOCTYPE",
82 | "html",
83 | null,
84 | null,
85 | true
86 | ],
87 | [
88 | "Character",
89 | "⊂⃒"
90 | ]
91 | ]
92 | },
93 | {
94 | "fragmentContext": null,
95 | "description": "⊂⃒A",
96 | "input": "⊂⃒A",
97 | "output": [
98 | [
99 | "DOCTYPE",
100 | "html",
101 | null,
102 | null,
103 | true
104 | ],
105 | [
106 | "Character",
107 | "⊂⃒A"
108 | ]
109 | ]
110 | },
111 | {
112 | "fragmentContext": null,
113 | "description": "𝔾",
114 | "input": "𝔾",
115 | "output": [
116 | [
117 | "DOCTYPE",
118 | "html",
119 | null,
120 | null,
121 | true
122 | ],
123 | [
124 | "Character",
125 | "𝔾"
126 | ]
127 | ]
128 | },
129 | {
130 | "fragmentContext": null,
131 | "description": "𝔾A",
132 | "input": "𝔾A",
133 | "output": [
134 | [
135 | "DOCTYPE",
136 | "html",
137 | null,
138 | null,
139 | true
140 | ],
141 | [
142 | "Character",
143 | "𝔾A"
144 | ]
145 | ]
146 | }
147 | ]
148 | }
--------------------------------------------------------------------------------
/test/data/parser-feedback/tests4.test:
--------------------------------------------------------------------------------
1 | {
2 | "tests": [
3 | {
4 | "fragmentContext": "div",
5 | "description": "direct div content",
6 | "input": "direct div content",
7 | "output": [
8 | [
9 | "Character",
10 | "direct div content"
11 | ]
12 | ]
13 | },
14 | {
15 | "fragmentContext": "textarea",
16 | "description": "direct textarea content",
17 | "input": "direct textarea content",
18 | "output": [
19 | [
20 | "Character",
21 | "direct textarea content"
22 | ]
23 | ]
24 | },
25 | {
26 | "fragmentContext": "textarea",
27 | "description": "textarea content with pseudo markup",
28 | "input": "textarea content with pseudo markup",
29 | "output": [
30 | [
31 | "Character",
32 | "textarea content with "
33 | ],
34 | [
35 | "StartTag",
36 | "em",
37 | {}
38 | ],
39 | [
40 | "Character",
41 | "pseudo"
42 | ],
43 | [
44 | "EndTag",
45 | "em"
46 | ],
47 | [
48 | "Character",
49 | " "
50 | ],
51 | [
52 | "StartTag",
53 | "foo",
54 | {}
55 | ],
56 | [
57 | "Character",
58 | "markup"
59 | ]
60 | ]
61 | },
62 | {
63 | "fragmentContext": "style",
64 | "description": "this is CDATA inside a