├── .eslintrc.json ├── .gitattributes ├── .github ├── FUNDING.yml ├── dependabot.yml └── workflows │ ├── codeql-analysis.yml │ ├── dependabot-automerge.yml │ └── nodejs-test.yml ├── .gitignore ├── LICENSE ├── README.md ├── SECURITY.md ├── WritableStream.js ├── _config.yml ├── package-lock.json ├── package.json ├── src ├── FeedHandler.spec.ts ├── Parser.events.spec.ts ├── Parser.spec.ts ├── Parser.ts ├── Tokenizer.spec.ts ├── Tokenizer.ts ├── WritableStream.spec.ts ├── WritableStream.ts ├── __fixtures__ │ ├── Documents │ │ ├── Atom_Example.xml │ │ ├── Attributes.html │ │ ├── Basic.html │ │ ├── RDF_Example.xml │ │ ├── RSS_Example.xml │ │ └── Svg.html │ └── testHelper.ts ├── __snapshots__ │ ├── FeedHandler.spec.ts.snap │ ├── Parser.events.spec.ts.snap │ ├── Tokenizer.spec.ts.snap │ ├── WritableStream.spec.ts.snap │ └── index.spec.ts.snap ├── index.spec.ts └── index.ts └── tsconfig.json /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "eslint:recommended", 4 | "prettier", 5 | "plugin:n/recommended", 6 | "plugin:unicorn/recommended" 7 | ], 8 | "env": { 9 | "node": true, 10 | "es6": true 11 | }, 12 | "rules": { 13 | "eqeqeq": [2, "smart"], 14 | "no-caller": 2, 15 | "dot-notation": 2, 16 | "no-var": 2, 17 | "prefer-const": 2, 18 | "prefer-arrow-callback": [2, { "allowNamedFunctions": true }], 19 | "arrow-body-style": [2, "as-needed"], 20 | "object-shorthand": 2, 21 | "prefer-template": 2, 22 | "one-var": [2, "never"], 23 | "prefer-destructuring": [2, { "object": true }], 24 | "capitalized-comments": 2, 25 | "multiline-comment-style": [2, "starred-block"], 26 | "spaced-comment": 2, 27 | "yoda": [2, "never"], 28 | "curly": [2, "multi-line"], 29 | "no-else-return": 2, 30 | 31 | "n/no-unpublished-import": 0, 32 | 33 | "unicorn/filename-case": [ 34 | 2, 35 | { 36 | "cases": { 37 | "camelCase": true, 38 | "pascalCase": true 39 | } 40 | } 41 | ], 42 | "unicorn/no-null": 0, 43 | 
"unicorn/prefer-code-point": 0, 44 | "unicorn/prefer-string-slice": 0, 45 | "unicorn/prefer-add-event-listener": 0, 46 | "unicorn/prefer-at": 0, 47 | "unicorn/prefer-string-replace-all": 0 48 | }, 49 | "overrides": [ 50 | { 51 | "files": "*.ts", 52 | "extends": [ 53 | "plugin:@typescript-eslint/eslint-recommended", 54 | "plugin:@typescript-eslint/recommended", 55 | "prettier" 56 | ], 57 | "parserOptions": { 58 | "sourceType": "module", 59 | "project": "./tsconfig.json" 60 | }, 61 | "rules": { 62 | "curly": [2, "multi-line"], 63 | 64 | "@typescript-eslint/prefer-for-of": 0, 65 | "@typescript-eslint/member-ordering": 0, 66 | "@typescript-eslint/explicit-function-return-type": 0, 67 | "@typescript-eslint/no-unused-vars": 0, 68 | "@typescript-eslint/no-use-before-define": [ 69 | 2, 70 | { "functions": false } 71 | ], 72 | "@typescript-eslint/consistent-type-definitions": [ 73 | 2, 74 | "interface" 75 | ], 76 | "@typescript-eslint/prefer-function-type": 2, 77 | "@typescript-eslint/no-unnecessary-type-arguments": 2, 78 | "@typescript-eslint/prefer-string-starts-ends-with": 2, 79 | "@typescript-eslint/prefer-readonly": 2, 80 | "@typescript-eslint/prefer-includes": 2, 81 | "@typescript-eslint/no-unnecessary-condition": 2, 82 | "@typescript-eslint/switch-exhaustiveness-check": 2, 83 | "@typescript-eslint/prefer-nullish-coalescing": 2, 84 | "@typescript-eslint/consistent-type-imports": [ 85 | 2, 86 | { "fixStyle": "inline-type-imports" } 87 | ], 88 | "@typescript-eslint/consistent-type-exports": 2, 89 | 90 | "n/no-missing-import": 0, 91 | "n/no-unsupported-features/es-syntax": 0 92 | } 93 | }, 94 | { 95 | "files": "*.spec.ts", 96 | "rules": { 97 | "n/no-unsupported-features/node-builtins": 0 98 | } 99 | } 100 | ] 101 | } 102 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text 
eol=lf -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [fb55] 2 | tidelift: npm/htmlparser2 3 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: npm 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | open-pull-requests-limit: 10 8 | versioning-strategy: increase 9 | - package-ecosystem: "github-actions" 10 | directory: "/" 11 | schedule: 12 | interval: daily 13 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | # The branches below must be a subset of the branches above 8 | branches: [master] 9 | schedule: 10 | - cron: "0 0 * * 0" 11 | 12 | jobs: 13 | analyze: 14 | name: Analyze 15 | runs-on: ubuntu-latest 16 | permissions: 17 | actions: read 18 | contents: read 19 | security-events: write 20 | 21 | steps: 22 | - name: Checkout repository 23 | uses: actions/checkout@v4 24 | 25 | - name: Initialize CodeQL 26 | uses: github/codeql-action/init@v3 27 | with: 28 | languages: "javascript" 29 | 30 | - name: Perform CodeQL Analysis 31 | uses: github/codeql-action/analyze@v3 32 | -------------------------------------------------------------------------------- /.github/workflows/dependabot-automerge.yml: -------------------------------------------------------------------------------- 1 | # Based on https://docs.github.com/en/code-security/supply-chain-security/keeping-your-dependencies-updated-automatically/automating-dependabot-with-github-actions#enable-auto-merge-on-a-pull-request 2 | name: 
Dependabot auto-merge 3 | on: pull_request_target 4 | 5 | permissions: 6 | pull-requests: write 7 | contents: write 8 | 9 | jobs: 10 | dependabot: 11 | runs-on: ubuntu-latest 12 | if: ${{ github.actor == 'dependabot[bot]' }} 13 | steps: 14 | - name: Dependabot metadata 15 | id: metadata 16 | uses: dependabot/fetch-metadata@v2.4.0 17 | with: 18 | github-token: "${{ secrets.GITHUB_TOKEN }}" 19 | - name: Enable auto-merge for Dependabot PRs 20 | # Automatically merge semver-patch and semver-minor PRs 21 | if: "${{ steps.metadata.outputs.update-type == 22 | 'version-update:semver-minor' || 23 | steps.metadata.outputs.update-type == 24 | 'version-update:semver-patch' }}" 25 | run: gh pr merge --auto --squash "$PR_URL" 26 | env: 27 | PR_URL: ${{github.event.pull_request.html_url}} 28 | GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} 29 | -------------------------------------------------------------------------------- /.github/workflows/nodejs-test.yml: -------------------------------------------------------------------------------- 1 | name: Node.js CI 2 | 3 | on: 4 | push: 5 | branches-ignore: 6 | - "dependabot/**" 7 | pull_request: 8 | 9 | env: 10 | CI: true 11 | FORCE_COLOR: 2 12 | NODE_COV: lts/* # The Node.js version to run coveralls on 13 | 14 | permissions: 15 | contents: read # to fetch code (actions/checkout) 16 | 17 | jobs: 18 | lint: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: actions/checkout@v4 22 | - uses: actions/setup-node@v4 23 | with: 24 | node-version: lts/* 25 | cache: npm 26 | - run: npm ci 27 | - run: npm run lint 28 | 29 | test: 30 | permissions: 31 | contents: read # to fetch code (actions/checkout) 32 | checks: write # to create new checks (coverallsapp/github-action) 33 | 34 | name: Node ${{ matrix.node }} 35 | runs-on: ubuntu-latest 36 | 37 | strategy: 38 | fail-fast: false 39 | matrix: 40 | node: 41 | - 18 42 | - 20 43 | - 22 44 | - lts/* 45 | 46 | steps: 47 | - uses: actions/checkout@v4 48 | - name: Use Node.js ${{ matrix.node }} 49 | uses: 
actions/setup-node@v4 50 | with: 51 | node-version: ${{ matrix.node }} 52 | cache: npm 53 | - run: npm ci 54 | - run: npm run build --if-present 55 | 56 | - name: Run tests 57 | run: npm run test:vi 58 | if: matrix.node != env.NODE_COV 59 | 60 | - name: Run tests with coverage 61 | run: npm run test:vi -- --coverage 62 | if: matrix.node == env.NODE_COV 63 | 64 | - name: Run Coveralls 65 | uses: coverallsapp/github-action@v2.3.6 66 | if: matrix.node == env.NODE_COV 67 | continue-on-error: true 68 | with: 69 | github-token: "${{ secrets.GITHUB_TOKEN }}" 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | coverage/ 3 | dist/ 4 | .tshy/ 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2010, 2011, Chris Winberry . All rights reserved. 2 | Permission is hereby granted, free of charge, to any person obtaining a copy 3 | of this software and associated documentation files (the "Software"), to 4 | deal in the Software without restriction, including without limitation the 5 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 6 | sell copies of the Software, and to permit persons to whom the Software is 7 | furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in 10 | all copies or substantial portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 15 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 17 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 18 | IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # htmlparser2 2 | 3 | [![NPM version](https://img.shields.io/npm/v/htmlparser2.svg)](https://npmjs.org/package/htmlparser2) 4 | [![Downloads](https://img.shields.io/npm/dm/htmlparser2.svg)](https://npmjs.org/package/htmlparser2) 5 | [![Node.js CI](https://github.com/fb55/htmlparser2/actions/workflows/nodejs-test.yml/badge.svg)](https://github.com/fb55/htmlparser2/actions/workflows/nodejs-test.yml) 6 | [![Coverage](https://img.shields.io/coveralls/fb55/htmlparser2.svg)](https://coveralls.io/r/fb55/htmlparser2) 7 | 8 | The fast & forgiving HTML/XML parser. 9 | 10 | _htmlparser2 is [the fastest HTML parser](#performance), and takes some shortcuts to get there. If you need strict HTML spec compliance, have a look at [parse5](https://github.com/inikulin/parse5)._ 11 | 12 | ## Installation 13 | 14 | npm install htmlparser2 15 | 16 | A live demo of `htmlparser2` is available [on AST Explorer](https://astexplorer.net/#/2AmVrGuGVJ). 
17 | 18 | ## Ecosystem 19 | 20 | | Name | Description | 21 | | ------------------------------------------------------------- | ------------------------------------------------------- | 22 | | [htmlparser2](https://github.com/fb55/htmlparser2) | Fast & forgiving HTML/XML parser | 23 | | [domhandler](https://github.com/fb55/domhandler) | Handler for htmlparser2 that turns documents into a DOM | 24 | | [domutils](https://github.com/fb55/domutils) | Utilities for working with domhandler's DOM | 25 | | [css-select](https://github.com/fb55/css-select) | CSS selector engine, compatible with domhandler's DOM | 26 | | [cheerio](https://github.com/cheeriojs/cheerio) | The jQuery API for domhandler's DOM | 27 | | [dom-serializer](https://github.com/cheeriojs/dom-serializer) | Serializer for domhandler's DOM | 28 | 29 | ## Usage 30 | 31 | `htmlparser2` itself provides a callback interface that allows consumption of documents with minimal allocations. 32 | For a more ergonomic experience, read [Getting a DOM](#getting-a-dom) below. 33 | 34 | ```js 35 | import * as htmlparser2 from "htmlparser2"; 36 | 37 | const parser = new htmlparser2.Parser({ 38 | onopentag(name, attributes) { 39 | /* 40 | * This fires when a new tag is opened. 41 | * 42 | * If you don't need an aggregated `attributes` object, 43 | * have a look at the `onopentagname` and `onattribute` events. 44 | */ 45 | if (name === "script" && attributes.type === "text/javascript") { 46 | console.log("JS! Hooray!"); 47 | } 48 | }, 49 | ontext(text) { 50 | /* 51 | * Fires whenever a section of text was processed. 52 | * 53 | * Note that this can fire at any point within text and you might 54 | * have to stitch together multiple pieces. 55 | */ 56 | console.log("-->", text); 57 | }, 58 | onclosetag(tagname) { 59 | /* 60 | * Fires when a tag is closed. 61 | * 62 | * You can rely on this event only firing when you have received an 63 | * equivalent opening tag before. 
Closing tags without corresponding 64 | * opening tags will be ignored. 65 | */ 66 | if (tagname === "script") { 67 | console.log("That's it?!"); 68 | } 69 | }, 70 | }); 71 | parser.write( 72 | "Xyz <script type='text/javascript'>const foo = '<>';</script>", 73 | ); 74 | parser.end(); 75 | ``` 76 | 77 | Output (with multiple text events combined): 78 | 79 | ``` 80 | --> Xyz 81 | JS! Hooray! 82 | --> const foo = '<>'; 83 | That's it?! 84 | ``` 85 | 86 | This example only shows three of the possible events. 87 | Read more about the parser, its events and options in the [wiki](https://github.com/fb55/htmlparser2/wiki/Parser-options). 88 | 89 | ### Usage with streams 90 | 91 | While the `Parser` interface closely resembles Node.js streams, it's not a 100% match. 92 | Use the `WritableStream` interface to process a streaming input: 93 | 94 | ```js 95 | import { WritableStream } from "htmlparser2/WritableStream"; 96 | 97 | const parserStream = new WritableStream({ 98 | ontext(text) { 99 | console.log("Streaming:", text); 100 | }, 101 | }); 102 | 103 | const htmlStream = fs.createReadStream("./my-file.html"); 104 | htmlStream.pipe(parserStream).on("finish", () => console.log("done")); 105 | ``` 106 | 107 | ## Getting a DOM 108 | 109 | The `DomHandler` produces a DOM (document object model) that can be manipulated using the [`DomUtils`](https://github.com/fb55/DomUtils) helper. 110 | 111 | ```js 112 | import * as htmlparser2 from "htmlparser2"; 113 | 114 | const dom = htmlparser2.parseDocument(htmlString); 115 | ``` 116 | 117 | The `DomHandler`, while still bundled with this module, was moved to its [own module](https://github.com/fb55/domhandler). 118 | Have a look at that for further information. 
119 | 120 | ## Parsing Feeds 121 | 122 | `htmlparser2` makes it easy to parse RSS, RDF and Atom feeds, by providing a `parseFeed` method: 123 | 124 | ```javascript 125 | const feed = htmlparser2.parseFeed(content, options); 126 | ``` 127 | 128 | ## Performance 129 | 130 | After having some artificial benchmarks for some time, **@AndreasMadsen** published his [`htmlparser-benchmark`](https://github.com/AndreasMadsen/htmlparser-benchmark), which benchmarks HTML parsers based on real-world websites. 131 | 132 | At the time of writing, the latest versions of all supported parsers show the following performance characteristics on GitHub Actions (sourced from [here](https://github.com/AndreasMadsen/htmlparser-benchmark/blob/e78cd8fc6c2adac08deedd4f274c33537451186b/stats.txt)): 133 | 134 | ``` 135 | htmlparser2 : 2.17215 ms/file ± 3.81587 136 | node-html-parser : 2.35983 ms/file ± 1.54487 137 | html5parser : 2.43468 ms/file ± 2.81501 138 | neutron-html5parser: 2.61356 ms/file ± 1.70324 139 | htmlparser2-dom : 3.09034 ms/file ± 4.77033 140 | html-dom-parser : 3.56804 ms/file ± 5.15621 141 | libxmljs : 4.07490 ms/file ± 2.99869 142 | htmljs-parser : 6.15812 ms/file ± 7.52497 143 | parse5 : 9.70406 ms/file ± 6.74872 144 | htmlparser : 15.0596 ms/file ± 89.0826 145 | html-parser : 28.6282 ms/file ± 22.6652 146 | saxes : 45.7921 ms/file ± 128.691 147 | html5 : 120.844 ms/file ± 153.944 148 | ``` 149 | 150 | ## How does this module differ from [node-htmlparser](https://github.com/tautologistics/node-htmlparser)? 151 | 152 | In 2011, this module started as a fork of the `htmlparser` module. 153 | `htmlparser2` was rewritten multiple times and, while it maintains an API that's mostly compatible with `htmlparser`, the projects don't share any code anymore. 154 | 155 | The parser now provides a callback interface inspired by [sax.js](https://github.com/isaacs/sax-js) (originally targeted at [readabilitySAX](https://github.com/fb55/readabilitysax)). 
156 | As a result, old handlers won't work anymore. 157 | 158 | The `DefaultHandler` was renamed to clarify its purpose (to `DomHandler`). The old name is still available when requiring `htmlparser2` and your code should work as expected. 159 | 160 | The `RssHandler` was replaced with a `getFeed` function that takes a `DomHandler` DOM and returns a feed object. There is a `parseFeed` helper function that can be used to parse a feed from a string. 161 | 162 | ## Security contact information 163 | 164 | To report a security vulnerability, please use the [Tidelift security contact](https://tidelift.com/security). 165 | Tidelift will coordinate the fix and disclosure. 166 | 167 | ## `htmlparser2` for enterprise 168 | 169 | Available as part of the Tidelift Subscription. 170 | 171 | The maintainers of `htmlparser2` and thousands of other packages are working with Tidelift to deliver commercial support and maintenance for the open source dependencies you use to build your applications. Save time, reduce risk, and improve code health, while paying the maintainers of the exact dependencies you use. [Learn more.](https://tidelift.com/subscription/pkg/npm-htmlparser2?utm_source=npm-htmlparser2&utm_medium=referral&utm_campaign=enterprise&utm_term=repo) 172 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | Only the current version is supported. Please make sure to update to the latest release. 6 | 7 | ## Reporting a Vulnerability 8 | 9 | To report a security vulnerability, please use the [Tidelift security contact](https://tidelift.com/security). 10 | Tidelift will coordinate the fix and disclosure. 
11 | -------------------------------------------------------------------------------- /WritableStream.js: -------------------------------------------------------------------------------- 1 | // Make exports work in Node < 12 2 | // eslint-disable-next-line no-undef, unicorn/prefer-module 3 | module.exports = require("./dist/commonjs/WritableStream.js"); 4 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman 2 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "htmlparser2", 3 | "version": "10.0.0", 4 | "description": "Fast & forgiving HTML/XML parser", 5 | "keywords": [ 6 | "html", 7 | "parser", 8 | "streams", 9 | "xml", 10 | "dom", 11 | "rss", 12 | "feed", 13 | "atom" 14 | ], 15 | "repository": { 16 | "type": "git", 17 | "url": "git://github.com/fb55/htmlparser2.git" 18 | }, 19 | "funding": [ 20 | "https://github.com/fb55/htmlparser2?sponsor=1", 21 | { 22 | "type": "github", 23 | "url": "https://github.com/sponsors/fb55" 24 | } 25 | ], 26 | "license": "MIT", 27 | "author": "Felix Boehm ", 28 | "sideEffects": false, 29 | "type": "module", 30 | "exports": { 31 | ".": { 32 | "import": { 33 | "types": "./dist/esm/index.d.ts", 34 | "default": "./dist/esm/index.js" 35 | }, 36 | "require": { 37 | "types": "./dist/commonjs/index.d.ts", 38 | "default": "./dist/commonjs/index.js" 39 | } 40 | }, 41 | "./WritableStream": { 42 | "import": { 43 | "types": "./dist/esm/WritableStream.d.ts", 44 | "default": "./dist/esm/WritableStream.js" 45 | }, 46 | "require": { 47 | "types": "./dist/commonjs/WritableStream.d.ts", 48 | "default": "./dist/commonjs/WritableStream.js" 49 | } 50 | } 51 | }, 52 | "main": "./dist/commonjs/index.js", 53 | "module": "./dist/esm/index.js", 54 | "types": 
"./dist/commonjs/index.d.ts", 55 | "files": [ 56 | "WritableStream.js", 57 | "dist", 58 | "src" 59 | ], 60 | "scripts": { 61 | "build": "tshy", 62 | "format": "npm run format:es && npm run format:prettier", 63 | "format:es": "npm run lint:es -- --fix", 64 | "format:prettier": "npm run format:prettier:raw -- --write", 65 | "format:prettier:raw": "prettier '**/*.{ts,md,json,yml}'", 66 | "lint": "npm run lint:es && npm run lint:ts && npm run lint:prettier", 67 | "lint:es": "eslint src", 68 | "lint:prettier": "npm run format:prettier:raw -- --check", 69 | "lint:ts": "tsc --noEmit", 70 | "prepare": "npm run build", 71 | "test": "npm run test:vi && npm run lint", 72 | "test:vi": "vitest run" 73 | }, 74 | "prettier": { 75 | "tabWidth": 4 76 | }, 77 | "dependencies": { 78 | "domelementtype": "^2.3.0", 79 | "domhandler": "^5.0.3", 80 | "domutils": "^3.2.2", 81 | "entities": "^6.0.0" 82 | }, 83 | "devDependencies": { 84 | "@types/node": "^22.15.27", 85 | "@typescript-eslint/eslint-plugin": "^8.33.0", 86 | "@typescript-eslint/parser": "^8.32.1", 87 | "@vitest/coverage-v8": "^2.1.8", 88 | "eslint": "^8.57.1", 89 | "eslint-config-prettier": "^10.1.5", 90 | "eslint-plugin-n": "^17.18.0", 91 | "eslint-plugin-unicorn": "^56.0.1", 92 | "prettier": "^3.5.3", 93 | "tshy": "^3.0.2", 94 | "typescript": "^5.8.3", 95 | "vitest": "^2.0.2" 96 | }, 97 | "tshy": { 98 | "exclude": [ 99 | "**/*.spec.ts", 100 | "**/__fixtures__/*", 101 | "**/__tests__/*", 102 | "**/__snapshots__/*" 103 | ], 104 | "exports": { 105 | ".": "./src/index.ts", 106 | "./WritableStream": "./src/WritableStream.ts" 107 | } 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /src/FeedHandler.spec.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs/promises"; 2 | import { describe, it, expect } from "vitest"; 3 | import { parseFeed } from "./index.js"; 4 | 5 | const documents = new URL("__fixtures__/Documents/", 
import.meta.url); 6 | 7 | describe("parseFeed", () => { 8 | it("(rssFeed)", async () => 9 | expect( 10 | parseFeed( 11 | await fs.readFile( 12 | new URL("RSS_Example.xml", documents), 13 | "utf8", 14 | ), 15 | ), 16 | ).toMatchSnapshot()); 17 | 18 | it("(atomFeed)", async () => 19 | expect( 20 | parseFeed( 21 | await fs.readFile( 22 | new URL("Atom_Example.xml", documents), 23 | "utf8", 24 | ), 25 | ), 26 | ).toMatchSnapshot()); 27 | 28 | it("(rdfFeed)", async () => 29 | expect( 30 | parseFeed( 31 | await fs.readFile( 32 | new URL("RDF_Example.xml", documents), 33 | "utf8", 34 | ), 35 | ), 36 | ).toMatchSnapshot()); 37 | }); 38 | -------------------------------------------------------------------------------- /src/Parser.events.spec.ts: -------------------------------------------------------------------------------- 1 | import { describe, it, expect, vi } from "vitest"; 2 | import { Parser, type ParserOptions } from "./Parser.js"; 3 | import * as helper from "./__fixtures__/testHelper.js"; 4 | 5 | /** 6 | * Write to the parser twice, once as bytes, once as a single blob. Then check 7 | * that we received the expected events. 8 | * 9 | * @internal 10 | * @param input Data to write. 11 | * @param options Parser options. 12 | * @returns Promise that resolves if the test passes. 
13 | */ 14 | function runTest(input: string, options?: ParserOptions) { 15 | let firstResult: unknown[] | undefined; 16 | 17 | return new Promise((resolve, reject) => { 18 | const handler = helper.getEventCollector((error, actual) => { 19 | if (error) { 20 | return reject(error); 21 | } 22 | 23 | if (firstResult) { 24 | expect(actual).toEqual(firstResult); 25 | resolve(); 26 | } else { 27 | firstResult = actual; 28 | expect(actual).toMatchSnapshot(); 29 | } 30 | }); 31 | 32 | const parser = new Parser(handler, options); 33 | // First, try to run the test via chunks 34 | for (let index = 0; index < input.length; index++) { 35 | parser.write(input.charAt(index)); 36 | } 37 | parser.end(); 38 | // Then, parse everything 39 | parser.parseComplete(input); 40 | }); 41 | } 42 | 43 | describe("Events", () => { 44 | it("simple", () => runTest("

adsf

")); 45 | 46 | it("Template script tags", () => 47 | runTest( 48 | '

', 49 | )); 50 | 51 | it("Lowercase tags", () => 52 | runTest("

adsf

", { lowerCaseTags: true })); 53 | 54 | it("CDATA", () => 55 | runTest("<> fo]]>", { 56 | xmlMode: true, 57 | })); 58 | 59 | it("CDATA (inside special)", () => 60 | runTest( 61 | "", 62 | )); 63 | 64 | it("leading lt", () => runTest(">a>")); 65 | 66 | it("end slash: void element ending with />", () => 67 | runTest("

Hold the line.")); 68 | 69 | it("end slash: void element ending with >", () => 70 | runTest("


Hold the line.")); 71 | 72 | it("end slash: void element ending with >, xmlMode=true", () => 73 | runTest("


Hold the line.", { xmlMode: true })); 74 | 75 | it("end slash: non-void element ending with />", () => 76 | runTest("

Hold the line.")); 77 | 78 | it("end slash: non-void element ending with />, xmlMode=true", () => 79 | runTest("

Hold the line.", { xmlMode: true })); 80 | 81 | it("end slash: non-void element ending with />, recognizeSelfClosing=true", () => 82 | runTest("

Hold the line.", { recognizeSelfClosing: true })); 83 | 84 | it("end slash: as part of attrib value of void element", () => 85 | runTest("

Hold the line.")); 86 | 87 | it("end slash: as part of attrib value of non-void element", () => 88 | runTest("Foo

Hold the line.")); 89 | 90 | it("Implicit close tags", () => 91 | runTest( 92 | "

  1. TH

    Heading

    Div
    Div2
  2. Heading 2

Para

Heading 4

  • Hi
  • bye
", 93 | )); 94 | 95 | it("attributes (no white space, no value, no quotes)", () => 96 | runTest( 97 | '', 98 | )); 99 | 100 | it("crazy attribute", () => runTest("

stuff

103 | runTest("

")); 104 | 105 | it("Long comment ending", () => 106 | runTest("")); 107 | 108 | it("Long CDATA ending", () => 109 | runTest("", { 110 | xmlMode: true, 111 | })); 112 | 113 | it("Implicit open p and br tags", () => 114 | runTest("
Hallo

World


")); 115 | 116 | it("lt followed by whitespace", () => runTest("a < b")); 117 | 118 | it("double attribute", () => runTest("

")); 119 | 120 | it("numeric entities", () => 121 | runTest("abcdfg&#x;h")); 122 | 123 | it("legacy entities", () => runTest("&elíe&eer;s<er&sum")); 124 | 125 | it("named entities", () => 126 | runTest("&el<er∳foo&bar")); 127 | 128 | it("xml entities", () => 129 | runTest("&>&<üabcde", { 130 | xmlMode: true, 131 | })); 132 | 133 | it("entity in attribute", () => 134 | runTest( 135 | "", 136 | )); 137 | 138 | it("double brackets", () => 139 | runTest("<>testing")); 140 | 141 | it("legacy entities fail", () => runTest("M&M")); 142 | 143 | it("Special special tags", () => 144 | runTest( 145 | "<b>foo</b><title>", 146 | )); 147 | 148 | it("Empty tag name", () => runTest("< >")); 149 | 150 | it("Not quite closed", () => runTest("")); 151 | 152 | it("Entities in attributes", () => 153 | runTest("")); 154 | 155 | it("CDATA in HTML", () => runTest("")); 156 | 157 | it("Comment edge-cases", () => runTest("")); 165 | 166 | it("Scripts ending with <", () => runTest("")); 167 | 168 | it("CDATA more edge-cases", () => 169 | runTest("baz]]>", { recognizeCDATA: true })); 170 | 171 | it("tag names are not ASCII alpha", () => runTest("<12>text")); 172 | 173 | it("open-implies-close case of (non-br) void close tag in non-XML mode", () => 174 | runTest("", { lowerCaseAttributeNames: true })); 175 | 176 | it("entity in attribute (#276)", () => 177 | runTest( 178 | '?&image_uri=1&ℑ=2&image=3', 179 | )); 180 | 181 | it("entity in title (#592)", () => runTest("the "title"")); 182 | 183 | it("entity in title - decodeEntities=false (#592)", () => 184 | runTest("<title>the "title"", { decodeEntities: false })); 185 | 186 | it(" in ")); 188 | 189 | it("XML tags", () => runTest("<:foo><_bar>", { xmlMode: true })); 190 | 191 | it("Trailing legacy entity", () => runTest("⨱×bar")); 192 | 193 | it("Trailing numeric entity", () => runTest("55")); 194 | 195 | it("Multi-byte entity", () => runTest("≧̸")); 196 | 197 | it("Start & end indices from domhandler", () => 198 | runTest( 199 | " The Title 
Hello world

", 200 | )); 201 | 202 | it("Self-closing indices (#941)", () => 203 | runTest("
", { xmlMode: true })); 204 | 205 | it("Entity after <", () => runTest("<&")); 206 | 207 | it("Attribute in XML (see #1350)", () => 208 | runTest( 209 | '', 210 | { xmlMode: true }, 211 | )); 212 | }); 213 | 214 | describe("Helper", () => { 215 | it("should handle errors", () => { 216 | const eventCallback = vi.fn(); 217 | const parser = new Parser(helper.getEventCollector(eventCallback)); 218 | 219 | parser.end(); 220 | parser.write("foo"); 221 | 222 | expect(eventCallback).toHaveBeenCalledTimes(2); 223 | expect(eventCallback).toHaveBeenNthCalledWith(1, null, []); 224 | expect(eventCallback).toHaveBeenLastCalledWith( 225 | new Error(".write() after done!"), 226 | ); 227 | }); 228 | }); 229 | -------------------------------------------------------------------------------- /src/Parser.spec.ts: -------------------------------------------------------------------------------- 1 | import { describe, it, expect, vi } from "vitest"; 2 | import { Parser, Tokenizer } from "./index.js"; 3 | import type { Handler } from "./Parser.js"; 4 | 5 | describe("API", () => { 6 | it("should work without callbacks", () => { 7 | const cbs: Partial = { onerror: vi.fn() }; 8 | const p = new Parser(cbs, { 9 | xmlMode: true, 10 | lowerCaseAttributeNames: true, 11 | }); 12 | 13 | p.end("boohay"); 14 | p.write("foo"); 15 | 16 | // Check for an error 17 | p.end(); 18 | p.write("foo"); 19 | expect(cbs.onerror).toHaveBeenLastCalledWith( 20 | new Error(".write() after done!"), 21 | ); 22 | p.end(); 23 | expect(cbs.onerror).toHaveBeenLastCalledWith( 24 | new Error(".end() after done!"), 25 | ); 26 | 27 | // Should ignore the error if there is no callback 28 | delete cbs.onerror; 29 | p.write("foo"); 30 | 31 | p.reset(); 32 | 33 | // Remove method 34 | cbs.onopentag = vi.fn(); 35 | p.write(""); 38 | 39 | // Pause/resume 40 | const onText = vi.fn(); 41 | cbs.ontext = onText; 42 | p.pause(); 43 | p.write("foo"); 44 | expect(onText).not.toHaveBeenCalled(); 45 | p.resume(); 46 | 
expect(onText).toHaveBeenLastCalledWith("foo"); 47 | p.pause(); 48 | expect(onText).toHaveBeenCalledTimes(1); 49 | p.resume(); 50 | expect(onText).toHaveBeenCalledTimes(1); 51 | p.pause(); 52 | p.end("bar"); 53 | expect(onText).toHaveBeenCalledTimes(1); 54 | p.resume(); 55 | expect(onText).toHaveBeenCalledTimes(2); 56 | expect(onText).toHaveBeenLastCalledWith("bar"); 57 | }); 58 | 59 | it("should back out of numeric entities (#125)", () => { 60 | const onend = vi.fn(); 61 | let text = ""; 62 | const p = new Parser({ 63 | ontext(data) { 64 | text += data; 65 | }, 66 | onend, 67 | }); 68 | 69 | p.end("id=770&#anchor"); 70 | 71 | expect(onend).toHaveBeenCalledTimes(1); 72 | expect(text).toBe("id=770&#anchor"); 73 | 74 | p.reset(); 75 | text = ""; 76 | 77 | p.end("0&#xn"); 78 | 79 | expect(onend).toHaveBeenCalledTimes(2); 80 | expect(text).toBe("0&#xn"); 81 | }); 82 | 83 | it("should not have the start index be greater than the end index", () => { 84 | const onopentag = vi.fn(); 85 | const onclosetag = vi.fn(); 86 | 87 | const p = new Parser({ 88 | onopentag(tag) { 89 | expect(p.startIndex).toBeLessThanOrEqual(p.endIndex); 90 | onopentag(tag, p.startIndex, p.endIndex); 91 | }, 92 | onclosetag(tag) { 93 | expect(p.startIndex).toBeLessThanOrEqual(p.endIndex); 94 | onclosetag(tag, p.endIndex); 95 | }, 96 | }); 97 | 98 | p.write("

"); 99 | 100 | expect(onopentag).toHaveBeenLastCalledWith("p", 0, 2); 101 | expect(onclosetag).not.toHaveBeenCalled(); 102 | 103 | p.write("Foo"); 104 | 105 | p.write("


"); 106 | 107 | expect(onopentag).toHaveBeenLastCalledWith("hr", 6, 9); 108 | expect(onclosetag).toHaveBeenCalledTimes(2); 109 | expect(onclosetag).toHaveBeenNthCalledWith(1, "p", 9); 110 | expect(onclosetag).toHaveBeenNthCalledWith(2, "hr", 9); 111 | }); 112 | 113 | it("should update the position when a single tag is spread across multiple chunks", () => { 114 | let called = false; 115 | const p = new Parser({ 116 | onopentag() { 117 | called = true; 118 | expect(p.startIndex).toBe(0); 119 | expect(p.endIndex).toBe(12); 120 | }, 121 | }); 122 | 123 | p.write("
"); 125 | 126 | expect(called).toBe(true); 127 | }); 128 | 129 | it("should have the correct position for implied opening tags", () => { 130 | let called = false; 131 | const p = new Parser({ 132 | onopentag() { 133 | called = true; 134 | expect(p.startIndex).toBe(0); 135 | expect(p.endIndex).toBe(3); 136 | }, 137 | }); 138 | 139 | p.write("

"); 140 | expect(called).toBe(true); 141 | }); 142 | 143 | it("should parse <__proto__> (#387)", () => { 144 | const p = new Parser(null); 145 | 146 | // Should not throw 147 | p.parseChunk("<__proto__>"); 148 | }); 149 | 150 | it("should support custom tokenizer", () => { 151 | class CustomTokenizer extends Tokenizer {} 152 | 153 | const p = new Parser( 154 | { 155 | onparserinit(parser: Parser) { 156 | // @ts-expect-error Accessing private tokenizer here 157 | expect(parser.tokenizer).toBeInstanceOf(CustomTokenizer); 158 | }, 159 | }, 160 | { Tokenizer: CustomTokenizer }, 161 | ); 162 | p.done(); 163 | }); 164 | }); 165 | -------------------------------------------------------------------------------- /src/Parser.ts: -------------------------------------------------------------------------------- 1 | import Tokenizer, { type Callbacks, QuoteType } from "./Tokenizer.js"; 2 | import { fromCodePoint } from "entities/decode"; 3 | 4 | const formTags = new Set([ 5 | "input", 6 | "option", 7 | "optgroup", 8 | "select", 9 | "button", 10 | "datalist", 11 | "textarea", 12 | ]); 13 | const pTag = new Set(["p"]); 14 | const tableSectionTags = new Set(["thead", "tbody"]); 15 | const ddtTags = new Set(["dd", "dt"]); 16 | const rtpTags = new Set(["rt", "rp"]); 17 | 18 | const openImpliesClose = new Map>([ 19 | ["tr", new Set(["tr", "th", "td"])], 20 | ["th", new Set(["th"])], 21 | ["td", new Set(["thead", "th", "td"])], 22 | ["body", new Set(["head", "link", "script"])], 23 | ["li", new Set(["li"])], 24 | ["p", pTag], 25 | ["h1", pTag], 26 | ["h2", pTag], 27 | ["h3", pTag], 28 | ["h4", pTag], 29 | ["h5", pTag], 30 | ["h6", pTag], 31 | ["select", formTags], 32 | ["input", formTags], 33 | ["output", formTags], 34 | ["button", formTags], 35 | ["datalist", formTags], 36 | ["textarea", formTags], 37 | ["option", new Set(["option"])], 38 | ["optgroup", new Set(["optgroup", "option"])], 39 | ["dd", ddtTags], 40 | ["dt", ddtTags], 41 | ["address", pTag], 42 | ["article", pTag], 43 | 
["aside", pTag], 44 | ["blockquote", pTag], 45 | ["details", pTag], 46 | ["div", pTag], 47 | ["dl", pTag], 48 | ["fieldset", pTag], 49 | ["figcaption", pTag], 50 | ["figure", pTag], 51 | ["footer", pTag], 52 | ["form", pTag], 53 | ["header", pTag], 54 | ["hr", pTag], 55 | ["main", pTag], 56 | ["nav", pTag], 57 | ["ol", pTag], 58 | ["pre", pTag], 59 | ["section", pTag], 60 | ["table", pTag], 61 | ["ul", pTag], 62 | ["rt", rtpTags], 63 | ["rp", rtpTags], 64 | ["tbody", tableSectionTags], 65 | ["tfoot", tableSectionTags], 66 | ]); 67 | 68 | const voidElements = new Set([ 69 | "area", 70 | "base", 71 | "basefont", 72 | "br", 73 | "col", 74 | "command", 75 | "embed", 76 | "frame", 77 | "hr", 78 | "img", 79 | "input", 80 | "isindex", 81 | "keygen", 82 | "link", 83 | "meta", 84 | "param", 85 | "source", 86 | "track", 87 | "wbr", 88 | ]); 89 | 90 | const foreignContextElements = new Set(["math", "svg"]); 91 | 92 | const htmlIntegrationElements = new Set([ 93 | "mi", 94 | "mo", 95 | "mn", 96 | "ms", 97 | "mtext", 98 | "annotation-xml", 99 | "foreignobject", 100 | "desc", 101 | "title", 102 | ]); 103 | 104 | export interface ParserOptions { 105 | /** 106 | * Indicates whether special tags (`
")).toMatchSnapshot(); 48 | }); 49 | it("for normal style tag", () => { 50 | expect(tokenize("
")).toMatchSnapshot(); 51 | }); 52 | it("for normal sitle tag", () => { 53 | expect(tokenize("
")).toMatchSnapshot(); 54 | }); 55 | it("for normal textarea tag", () => { 56 | expect( 57 | tokenize("
"), 58 | ).toMatchSnapshot(); 59 | }); 60 | it("for normal xmp tag", () => { 61 | expect(tokenize("
")).toMatchSnapshot(); 62 | }); 63 | }); 64 | 65 | describe("should treat html inside special tags as text", () => { 66 | it("for div inside script tag", () => { 67 | expect(tokenize("")).toMatchSnapshot(); 68 | }); 69 | it("for div inside style tag", () => { 70 | expect(tokenize("")).toMatchSnapshot(); 71 | }); 72 | it("for div inside title tag", () => { 73 | expect(tokenize("<div></div>")).toMatchSnapshot(); 74 | }); 75 | it("for div inside textarea tag", () => { 76 | expect( 77 | tokenize(""), 78 | ).toMatchSnapshot(); 79 | }); 80 | it("for div inside xmp tag", () => { 81 | expect(tokenize("<div></div>")).toMatchSnapshot(); 82 | }); 83 | }); 84 | 85 | describe("should correctly mark attributes", () => { 86 | it("for no value attribute", () => { 87 | expect(tokenize("
")).toMatchSnapshot(); 88 | }); 89 | it("for no quotes attribute", () => { 90 | expect(tokenize("
")).toMatchSnapshot(); 91 | }); 92 | it("for single quotes attribute", () => { 93 | expect(tokenize("
")).toMatchSnapshot(); 94 | }); 95 | it("for double quotes attribute", () => { 96 | expect(tokenize('
')).toMatchSnapshot(); 97 | }); 98 | }); 99 | 100 | describe("should not break after special tag followed by an entity", () => { 101 | it("for normal special tag", () => { 102 | expect(tokenize("'
")).toMatchSnapshot(); 103 | }); 104 | it("for self-closing special tag", () => { 105 | expect(tokenize("