├── .editorconfig ├── .gitattributes ├── .github ├── FUNDING.yml ├── dependabot.yml └── workflows │ ├── codeql-analysis.yml │ ├── dependabot-automerge.yml │ ├── nodejs-test.yml │ └── pages.yml ├── .gitignore ├── .gitmodules ├── .husky └── pre-commit ├── .prettierignore ├── .prettierrc ├── LICENSE ├── README.md ├── SECURITY.md ├── bench ├── .eslintrc.json ├── memory │ └── sax-parser.js ├── package.json └── perf │ └── index.js ├── docs ├── list-of-packages.md └── version-history.md ├── eslint.config.js ├── media └── logo.png ├── package-lock.json ├── package.json ├── packages ├── parse5-html-rewriting-stream │ ├── LICENSE │ ├── README.md │ ├── lib │ │ └── index.ts │ ├── package.json │ ├── test │ │ └── rewriting-stream.test.ts │ ├── tsconfig.json │ └── typedoc.json ├── parse5-htmlparser2-tree-adapter │ ├── LICENSE │ ├── README.md │ ├── lib │ │ └── index.ts │ ├── package.json │ ├── tsconfig.json │ └── typedoc.json ├── parse5-parser-stream │ ├── LICENSE │ ├── README.md │ ├── lib │ │ └── index.ts │ ├── package.json │ ├── test │ │ ├── location-info.test.ts │ │ ├── parser-stream.test.ts │ │ ├── scripting.test.ts │ │ └── utils │ │ │ └── parse-chunked.ts │ ├── tsconfig.json │ └── typedoc.json ├── parse5-plain-text-conversion-stream │ ├── LICENSE │ ├── README.md │ ├── lib │ │ └── index.ts │ ├── package.json │ ├── test │ │ └── plain-text-conversion-stream.test.ts │ ├── tsconfig.json │ └── typedoc.json ├── parse5-sax-parser │ ├── LICENSE │ ├── README.md │ ├── lib │ │ ├── dev-null-stream.ts │ │ ├── index.ts │ │ └── parser-feedback-simulator.ts │ ├── package.json │ ├── test │ │ ├── location-info.test.ts │ │ ├── parser-feedback-simulator.test.ts │ │ └── sax-parser.test.ts │ ├── tsconfig.json │ └── typedoc.json └── parse5 │ ├── LICENSE │ ├── README.md │ ├── lib │ ├── common │ │ ├── doctype.ts │ │ ├── error-codes.ts │ │ ├── foreign-content.ts │ │ ├── html.ts │ │ ├── token.ts │ │ └── unicode.ts │ ├── index.ts │ ├── parser │ │ ├── formatting-element-list.test.ts │ │ ├── formatting-element-list.ts │ │ ├── index.test.ts │ │ ├── index.ts │ │ ├── open-element-stack.test.ts │ │ ├── open-element-stack.ts │ │ └── parser-location-info.test.ts │ ├── serializer │ │ ├── index.test.ts │ │ └── index.ts │ ├── tokenizer │ │ ├── index.test.ts │ │ ├── index.ts │ │ ├── preprocessor.ts │ │ └── tokenizer-location-info.test.ts │ └── tree-adapters │ │ ├── default.ts │ │ └── interface.ts │ ├── package.json │ ├── tsconfig.json │ └── typedoc.json ├── scripts └── generate-parser-feedback-test │ └── index.ts ├── test ├── data │ ├── huge-page │ │ └── huge-page.html │ ├── location-info │ │ ├── cern │ │ │ └── data.html │ │ ├── dx │ │ │ └── data.html │ │ ├── github-parse5 │ │ │ └── data.html │ │ ├── whatwg-html │ │ │ └── data.html │ │ └── wiki-42 │ │ │ └── data.html │ ├── parser-feedback │ │ ├── adoption01.test │ │ ├── adoption02.test │ │ ├── blocks.test │ │ ├── comments01.test │ │ ├── doctype01.test │ │ ├── domjs-unsafe.test │ │ ├── entities01.test │ │ ├── entities02.test │ │ ├── foreign-fragment.test │ │ ├── gh40_form_in_template.test │ │ ├── html5test-com.test │ │ ├── inbody01.test │ │ ├── isindex.test │ │ ├── main-element.test │ │ ├── math.test │ │ ├── menuitem-element.test │ │ ├── namespace-sensitivity.test │ │ ├── noscript01.test │ │ ├── pending-spec-changes-plain-text-unsafe.test │ │ ├── pending-spec-changes.test │ │ ├── plain-text-unsafe.test │ │ ├── ruby.test │ │ ├── scriptdata01.test │ │ ├── search-element.test │ │ ├── svg.test │ │ ├── tables01.test │ │ ├── template.test │ │ ├── tests1.test │ │ ├── tests10.test │ │ ├── tests11.test │ │ ├── tests12.test │ │ ├── tests14.test │ │ ├── tests15.test │ │ ├── tests16.test │ │ ├── tests17.test │ │ ├── tests18.test │ │ ├── tests19.test │ │ ├── tests2.test │ │ ├── tests20.test │ │ ├── tests21.test │ │ ├── tests22.test │ │ ├── tests23.test │ │ ├── tests24.test │ │ ├── tests25.test │ │ ├── tests26.test │ │ ├── tests3.test │ │ ├── tests4.test │ │ ├── tests5.test │ │ ├── tests6.test │ │ ├── tests7.test │ │ ├── tests8.test │ │ ├── tests9.test │ │ ├── tests_innerHTML_1.test │ │ ├── tricky01.test │ │ ├── webkit01.test │ │ └── webkit02.test │ ├── sax │ │ ├── lhc │ │ │ ├── expected.html │ │ │ └── src.html │ │ ├── nodejsorg │ │ │ ├── expected.html │ │ │ └── src.html │ │ └── npmorg │ │ │ ├── expected.html │ │ │ └── src.html │ ├── serialization │ │ └── tests.json │ └── tree-construction-scripting │ │ └── document_write.dat ├── package.json ├── tsconfig.json └── utils │ ├── common.ts │ ├── generate-location-info-parser-tests.ts │ ├── generate-parsing-tests.ts │ ├── generate-serializer-tests.ts │ ├── generate-tokenization-tests.ts │ ├── load-sax-parser-test-data.ts │ ├── parse-dat-file.ts │ └── serialize-to-dat-file-format.ts ├── tsconfig.json ├── typedoc.base.json ├── typedoc.json └── vitest.config.js /.editorconfig: -------------------------------------------------------------------------------- 1 | # This file is for unifying the coding style for different editors and IDEs 2 | # editorconfig.org 3 | 4 | root = true 5 | 6 | [*] 7 | end_of_line = lf 8 | charset = utf-8 9 | insert_final_newline = true 10 | trim_trailing_whitespace = true 11 | indent_style = space 12 | indent_size = 4 13 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Exclude the HTML files from GitHub's language statistics 2 | # https://github.com/github/linguist#using-gitattributes 3 | test/data/** linguist-vendored 4 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | open_collective: parse5 2 | github: [fb55] 3 | tidelift: 'npm/parse5' 4 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: npm 4 | directory: '/' 5 | schedule: 6 | interval: daily 7 | open-pull-requests-limit: 10 8 | versioning-strategy: increase 9 | - package-ecosystem: 'github-actions' 10 | directory: '/' 11 | schedule: 12 | interval: daily 13 | - package-ecosystem: gitsubmodule 14 | directory: '/' 15 | schedule: 16 | interval: daily 17 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: 'CodeQL' 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | # The branches below must be a subset of the branches above 8 | branches: [master] 9 | schedule: 10 | - cron: '0 0 * * 0' 11 | 12 | jobs: 13 | analyze: 14 | name: Analyze 15 | runs-on: ubuntu-latest 16 | permissions: 17 | actions: read 18 | contents: read 19 | security-events: write 20 | 21 | steps: 22 | - name: Checkout repository 23 | uses: actions/checkout@v4.2.2 24 | with: 25 | submodules: recursive 26 | 27 | - name: Initialize CodeQL 28 | uses: github/codeql-action/init@v3.28.18 29 | with: 30 | languages: 'javascript' 31 | 32 | - name: Perform CodeQL Analysis 33 | uses: github/codeql-action/analyze@v3.28.18 34 | -------------------------------------------------------------------------------- /.github/workflows/dependabot-automerge.yml: -------------------------------------------------------------------------------- 1 | # Based on https://docs.github.com/en/code-security/supply-chain-security/keeping-your-dependencies-updated-automatically/automating-dependabot-with-github-actions#enable-auto-merge-on-a-pull-request 2 | name: Dependabot auto-merge 3 | on: pull_request_target 4 | 5 | permissions: 6 | pull-requests: write 7 | contents: write 8 | 9 | jobs: 10 | dependabot: 11 | runs-on: ubuntu-latest 12 | if: ${{ github.actor == 'dependabot[bot]' }} 13 | steps: 14 | - name: Dependabot metadata 15 | id: metadata 16 | uses: dependabot/fetch-metadata@v2.4.0 17 | with: 18 | github-token: '${{ secrets.GITHUB_TOKEN }}' 19 | - name: Enable auto-merge for Dependabot PRs 20 | # Automatically merge semver-patch and semver-minor PRs 21 | if: "${{ steps.metadata.outputs.update-type == 22 | 'version-update:semver-minor' || 23 | steps.metadata.outputs.update-type == 24 | 'version-update:semver-patch' }}" 25 | run: gh pr merge --auto --squash "$PR_URL" 26 | env: 27 | PR_URL: ${{github.event.pull_request.html_url}} 28 | GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} 29 | -------------------------------------------------------------------------------- /.github/workflows/nodejs-test.yml: -------------------------------------------------------------------------------- 1 | name: Node.js CI 2 | 3 | on: 4 | push: 5 | branches-ignore: 6 | - 'dependabot/**' 7 | pull_request: 8 | 9 | env: 10 | CI: true 11 | FORCE_COLOR: 2 12 | NODE_COV: lts/* # The Node.js version to run coveralls on 13 | 14 | permissions: 15 | contents: read # to fetch code (actions/checkout) 16 | 17 | jobs: 18 | lint: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: actions/checkout@v4.2.2 22 | with: 23 | submodules: recursive 24 | - name: Use Node.js ${{ matrix.node }} 25 | uses: actions/setup-node@v4.4.0 26 | with: 27 | node-version: lts/* 28 | cache: npm 29 | - run: npm ci 30 | - run: npm run lint 31 | 32 | test: 33 | permissions: 34 | contents: read # to fetch code (actions/checkout) 35 | checks: write # to create new checks (coverallsapp/github-action) 36 | 37 | name: Node ${{ matrix.node }} 38 | runs-on: ubuntu-latest 39 | 40 | strategy: 41 | fail-fast: false 42 | matrix: 43 | node: 44 | - 18 45 | - 20 46 | - lts/* 47 | 48 | steps: 49 | - uses: actions/checkout@v4.2.2 50 | with: 51 | submodules: recursive 52 | - name: Use Node.js ${{ matrix.node }} 53 | uses: actions/setup-node@v4.4.0 54 | with: 55 | node-version: ${{ matrix.node }} 56 | cache: npm 57 | - run: npm ci 58 | - run: npm run build --if-present 59 | 60 | - name: Run unit tests 61 | run: npm run unit-tests 62 | if: matrix.node != env.NODE_COV 63 | 64 | - name: Run unit tests with coverage 65 | run: npm run unit-tests-coverage 66 | if: matrix.node == env.NODE_COV 67 | 68 | - name: Run Coveralls 69 | uses: coverallsapp/github-action@v2.3.6 70 | if: matrix.node == env.NODE_COV 71 | continue-on-error: true 72 | with: 73 | github-token: '${{ secrets.GITHUB_TOKEN }}' 74 | -------------------------------------------------------------------------------- /.github/workflows/pages.yml: -------------------------------------------------------------------------------- 1 | name: Deploy to GitHub Pages 2 | on: 3 | push: 4 | branches: 5 | - master 6 | 7 | jobs: 8 | deploy: 9 | name: Deploy to GitHub Pages 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4.2.2 13 | - uses: actions/setup-node@v4.4.0 14 | with: 15 | node-version: lts/* 16 | cache: npm 17 | - run: npm ci 18 | - name: Build docs 19 | run: npm run build:docs 20 | - name: Deploy 21 | uses: peaceiris/actions-gh-pages@v4.0.0 22 | with: 23 | github_token: ${{ secrets.GITHUB_TOKEN }} 24 | publish_dir: docs/build 25 | cname: parse5.js.org 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea 3 | .vscode 4 | node_modules 5 | docs/build 6 | packages/*/dist/ 7 | test/dist/ 8 | tsconfig.tsbuildinfo 9 | coverage/ 10 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "html5lib-tests-fork"] 2 | path = test/data/html5lib-tests-fork 3 | url = https://github.com/HTMLParseErrorWG/html5lib-tests 4 | [submodule "html5lib-tests"] 5 | path = test/data/html5lib-tests 6 | url = https://github.com/html5lib/html5lib-tests.git 7 | -------------------------------------------------------------------------------- /.husky/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | . "$(dirname "$0")/_/husky.sh" 3 | 4 | npm run pre-commit 5 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | packages/*/dist/ 2 | test/dist/ 3 | docs 4 | test/data/html5lib-tests 5 | test/data/html5lib-tests-fork -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | printWidth: 120 2 | tabWidth: 4 3 | singleQuote: true 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013-2019 Ivan Nikulin (ifaaan@gmail.com, https://github.com/inikulin) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | parse5 4 | 5 |

6 | 7 |

8 | HTML parsing/serialization toolset for Node.js. WHATWG HTML Living Standard (aka HTML5)-compliant. 9 |

10 | 11 |

12 | Build Status 13 | NPM Version 14 | Downloads 15 | Downloads total 16 | Coverage 17 |

18 | 19 |

20 | parse5 provides nearly everything you may need when dealing with HTML. It's the fastest spec-compliant HTML parser 21 | for Node to date. It parses HTML the way the latest version of your browser does. It has proven itself reliable in such projects 22 | as jsdom, Angular, 23 | Lit, Cheerio, 24 | rehype and many more. 25 |

26 | 27 | --- 28 | 29 |

30 | List of parse5 toolset packages 31 |

32 | 33 |

34 | Online playground 35 |

36 | 37 |

38 | Changelog 39 |

40 |

41 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | Only the current release is supported. Please make sure to update to the latest release. 6 | 7 | ## Reporting a Vulnerability 8 | 9 | To report a security vulnerability, please use the [Tidelift security contact](https://tidelift.com/security). 10 | Tidelift will coordinate the fix and disclosure. 11 | -------------------------------------------------------------------------------- /bench/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["../.eslintrc.json"], 3 | "rules": { 4 | "no-console": "off" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /bench/memory/sax-parser.js: -------------------------------------------------------------------------------- 1 | import { readFile } from 'node:fs/promises'; 2 | import format from 'human-format'; 3 | import memwatch from '@airbnb/node-memwatch'; 4 | import { SAXParser } from '../../packages/parse5-sax-parser/dist/index.js'; 5 | import { finished } from 'parse5-test-utils/dist/common.js'; 6 | 7 | /* eslint-disable no-console */ 8 | 9 | const heapDiffMeasurement = new memwatch.HeapDiff(); 10 | 11 | let maxMemUsage = 0; 12 | 13 | memwatch.on('stats', (stats) => { 14 | maxMemUsage = Math.max(maxMemUsage, stats.used_heap_size); 15 | }); 16 | 17 | const statsPromise = new Promise((resolve) => memwatch.once('stats', resolve)); 18 | 19 | const startDate = new Date(); 20 | 21 | const parsedDataSize = await parse(); 22 | const endDate = new Date(); 23 | const heapDiff = heapDiffMeasurement.end(); 24 | 25 | // NOTE: we need at least one `stats` result to get maxMemUsage 26 | await statsPromise; 27 | 28 | async function parse() { 29 | const data = await readFile(new URL('../../test/data/huge-page/huge-page.html', import.meta.url), 'utf8'); 30 | let parsedDataSize = 0; 31 | const stream = new SAXParser(); 32 | 33 | for (let i = 0; i < 200; i++) { 34 | parsedDataSize += data.length; 35 | stream.write(data); 36 | } 37 | 38 | stream.end(); 39 | 40 | await finished(stream); 41 | 42 | return parsedDataSize; 43 | } 44 | 45 | console.log('Input data size:', format(parsedDataSize, { unit: 'B' })); 46 | 47 | const scale = new format.Scale({ 48 | seconds: 1, 49 | minutes: 60, 50 | hours: 3600, 51 | }); 52 | 53 | console.log('Duration:', format((endDate - startDate) / 1000, { scale })); 54 | console.log('Memory before:', heapDiff.before.size); 55 | console.log('Memory after:', heapDiff.after.size); 56 | console.log('Memory max:', format(maxMemUsage, { unit: 'B' })); 57 | -------------------------------------------------------------------------------- /bench/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "parse5-benchmarks", 3 | "private": "true", 4 | "type": "module", 5 | "version": "1.0.0", 6 | "description": "parse5 regression benchmarks", 7 | "author": "Ivan Nikulin ", 8 | "license": "MIT", 9 | "dependencies": { 10 | "benchmark": "^2.1.4", 11 | "human-format": "^1.2.1", 12 | "@airbnb/node-memwatch": "^3.0.0", 13 | "parse5": "npm:parse5" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /bench/perf/index.js: -------------------------------------------------------------------------------- 1 | import { readFileSync, createReadStream, readdirSync } from 'node:fs'; 2 | import Benchmark from 'benchmark'; 3 | import { loadTreeConstructionTestData } from 'parse5-test-utils/dist/generate-parsing-tests.js'; 4 | import { loadSAXParserTestData } from 'parse5-test-utils/dist/load-sax-parser-test-data.js'; 5 | import { treeAdapters, WritableStreamStub, finished } from 'parse5-test-utils/dist/common.js'; 6 | import * as parse5 from '../../packages/parse5/dist/index.js'; 7 | import { ParserStream as parse5Stream } from '../../packages/parse5-parser-stream/dist/index.js'; 8 | import * as parse5Upstream from 'parse5'; 9 | 10 | /* eslint-disable no-console */ 11 | 12 | const hugePagePath = new URL('../../test/data/huge-page/huge-page.html', import.meta.url); 13 | const treeConstructionPath = new URL('../../test/data/html5lib-tests/tree-construction', import.meta.url); 14 | const saxPath = new URL('../../test/data/sax/', import.meta.url); 15 | 16 | //HACK: https://github.com/bestiejs/benchmark.js/issues/51 17 | /* global workingCopy, WorkingCopyParserStream, upstreamParser, hugePage, microTests, runMicro, runPages, files */ 18 | globalThis.workingCopy = parse5; 19 | globalThis.WorkingCopyParserStream = parse5Stream; 20 | globalThis.upstreamParser = parse5Upstream; 21 | 22 | // Huge page data 23 | globalThis.hugePage = readFileSync(hugePagePath).toString(); 24 | 25 | // Micro data 26 | globalThis.microTests = loadTreeConstructionTestData(treeConstructionPath, treeAdapters.default) 27 | .filter( 28 | (test) => 29 | //NOTE: this test caused a stack overflow in parse5 v1.x 30 | test.input !== '