├── .github └── workflows │ └── test.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── bun.lockb ├── index.d.ts ├── package.json ├── src ├── index.test.ts ├── index.ts └── walker │ ├── index.test.ts │ └── index.ts ├── tsconfig.json └── walker.d.ts /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Bunjs 16 | uses: oven-sh/setup-bun@v1 17 | with: 18 | bun-version: 1.0.25 19 | - name: Install dependencies 20 | run: bun install 21 | - run: bun run build 22 | - run: bun run test 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore 2 | 3 | # Build 4 | 5 | build 6 | # Logs 7 | 8 | logs 9 | _.log 10 | npm-debug.log_ 11 | yarn-debug.log* 12 | yarn-error.log* 13 | lerna-debug.log* 14 | .pnpm-debug.log* 15 | 16 | # Caches 17 | 18 | .cache 19 | 20 | # Diagnostic reports (https://nodejs.org/api/report.html) 21 | 22 | report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json 23 | 24 | # Runtime data 25 | 26 | pids 27 | _.pid 28 | _.seed 29 | *.pid.lock 30 | 31 | # Directory for instrumented libs generated by jscoverage/JSCover 32 | 33 | lib-cov 34 | 35 | # Coverage directory used by tools like istanbul 36 | 37 | coverage 38 | *.lcov 39 | 40 | # nyc test coverage 41 | 42 | .nyc_output 43 | 44 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 45 | 46 | .grunt 47 | 48 | # Bower dependency directory (https://bower.io/) 49 | 50 | bower_components 51 | 52 | # node-waf configuration 53 | 54 | .lock-wscript 55 | 56 | # Compiled binary addons (https://nodejs.org/api/addons.html) 57 | 58 | build/Release 59 | 60 | # Dependency directories 61 | 62 | node_modules/ 63 | jspm_packages/ 64 | 65 | # Snowpack dependency directory (https://snowpack.dev/) 66 | 67 | web_modules/ 68 | 69 | # TypeScript cache 70 | 71 | *.tsbuildinfo 72 | 73 | # Optional npm cache directory 74 | 75 | .npm 76 | 77 | # Optional eslint cache 78 | 79 | .eslintcache 80 | 81 | # Optional stylelint cache 82 | 83 | .stylelintcache 84 | 85 | # Microbundle cache 86 | 87 | .rpt2_cache/ 88 | .rts2_cache_cjs/ 89 | .rts2_cache_es/ 90 | .rts2_cache_umd/ 91 | 92 | # Optional REPL history 93 | 94 | .node_repl_history 95 | 96 | # Output of 'npm pack' 97 | 98 | *.tgz 99 | 100 | # Yarn Integrity file 101 | 102 | .yarn-integrity 103 | 104 | # dotenv environment variable files 105 | 106 | .env 107 | .env.development.local 108 | .env.test.local 109 | .env.production.local 110 | .env.local 111 | 112 | # parcel-bundler cache (https://parceljs.org/) 113 | 114 | .parcel-cache 115 | 116 | # Next.js build output 117 | 118 | .next 119 | out 120 | 121 | # Nuxt.js build / generate output 122 | 123 | .nuxt 124 | dist 125 | 126 | # Gatsby files 127 | 128 | # Comment in the public line in if your project uses Gatsby and not Next.js 129 | 130 | # https://nextjs.org/blog/next-9-1#public-directory-support 131 | 132 | # public 133 | 134 | # vuepress build output 135 | 136 | .vuepress/dist 137 | 138 | # vuepress v2.x temp and cache directory 139 | 140 | .temp 141 | 142 | # Docusaurus cache and generated files 143 | 144 | .docusaurus 145 | 146 | # Serverless directories 147 | 148 | .serverless/ 149 | 150 | # FuseBox cache 151 | 152 | .fusebox/ 153 | 154 | # DynamoDB Local files 155 | 156 | .dynamodb/ 157 | 158 | # TernJS port file 159 | 160 | .tern-port 161 | 162 | # Stores VSCode versions used for testing VSCode extensions 163 | 164 | .vscode-test 165 | 166 | # yarn v2 167 | 168 | .yarn/cache 169 | .yarn/unplugged 170 | .yarn/build-state.yml 171 | .yarn/install-state.gz 172 | .pnp.* 173 | 174 | # IntelliJ based IDEs 175 | .idea 176 | 177 | # Finder (MacOS) folder config 178 | .DS_Store 179 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | - Using welcoming and inclusive language 12 | - Being respectful of differing viewpoints and experiences 13 | - Gracefully accepting constructive criticism 14 | - Focusing on what is best for the community 15 | - Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | - The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | - Trolling, insulting/derogatory comments, and personal or political attacks 21 | - Public or private harassment 22 | - Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | - Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at contact@aralroca.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via issue, 4 | email, or any other method with the owners of this repository before making a change. 5 | 6 | Please note we have a code of conduct, please follow it in all your interactions with the project. 7 | 8 | ## Pull Request Process 9 | 10 | 1. Ensure you are doing the PR to the canary branch. 11 | 2. Write the failing tests about the issue / feature you are working on. 12 | 3. Update the README.md with details of changes to the interface. 13 | 4. You may merge the Pull Request in once you have the approval of at least one maintainer, or if you 14 | do not have permission to do that, you may request the maintainer to merge it for you. 15 | 16 | ## Code of Conduct 17 | 18 | ### Our Pledge 19 | 20 | In the interest of fostering an open and welcoming environment, we as 21 | contributors and maintainers pledge to making participation in our project and 22 | our community a harassment-free experience for everyone, regardless of age, body 23 | size, disability, ethnicity, gender identity and expression, level of experience, 24 | nationality, personal appearance, race, religion, or sexual identity and 25 | orientation. 26 | 27 | ### Our Standards 28 | 29 | Examples of behavior that contributes to creating a positive environment 30 | include: 31 | 32 | - Using welcoming and inclusive language 33 | - Being respectful of differing viewpoints and experiences 34 | - Gracefully accepting constructive criticism 35 | - Focusing on what is best for the community 36 | - Showing empathy towards other community members 37 | 38 | Examples of unacceptable behavior by participants include: 39 | 40 | - The use of sexualized language or imagery and unwelcome sexual attention or 41 | advances 42 | - Trolling, insulting/derogatory comments, and personal or political attacks 43 | - Public or private harassment 44 | - Publishing others' private information, such as a physical or electronic 45 | address, without explicit permission 46 | - Other conduct which could reasonably be considered inappropriate in a 47 | professional setting 48 | 49 | ### Our Responsibilities 50 | 51 | Project maintainers are responsible for clarifying the standards of acceptable 52 | behavior and are expected to take appropriate and fair corrective action in 53 | response to any instances of unacceptable behavior. 54 | 55 | Project maintainers have the right and responsibility to remove, edit, or 56 | reject comments, commits, code, wiki edits, issues, and other contributions 57 | that are not aligned to this Code of Conduct, or to ban temporarily or 58 | permanently any contributor for other behaviors that they deem inappropriate, 59 | threatening, offensive, or harmful. 60 | 61 | ### Scope 62 | 63 | This Code of Conduct applies both within project spaces and in public spaces 64 | when an individual is representing the project or its community. Examples of 65 | representing a project or community include using an official project e-mail 66 | address, posting via an official social media account, or acting as an appointed 67 | representative at an online or offline event. Representation of a project may be 68 | further defined and clarified by project maintainers. 69 | 70 | ### Enforcement 71 | 72 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 73 | reported by contacting the project team at contact@aralroca.com. All 74 | complaints will be reviewed and investigated and will result in a response that 75 | is deemed necessary and appropriate to the circumstances. The project team is 76 | obligated to maintain confidentiality with regard to the reporter of an incident. 77 | Further details of specific enforcement policies may be posted separately. 78 | 79 | Project maintainers who do not follow or enforce the Code of Conduct in good 80 | faith may face temporary or permanent repercussions as determined by other 81 | members of the project's leadership. 82 | 83 | ### Attribution 84 | 85 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 86 | available at [http://contributor-covenant.org/version/1/4][version] 87 | 88 | [homepage]: http://contributor-covenant.org 89 | [version]: http://contributor-covenant.org/version/1/4/ 90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | MIT License 4 | 5 | Copyright (c) 2024 Aral Roca Gomez 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # parse-html-stream 2 | 3 | ## Overview 4 | 5 | `parse-html-stream` is a JavaScript library designed for client-side applications, specifically tailored for processing HTML streams. The primary objective is to capture and manipulate DOM Nodes as they are received, enabling seamless integration into hypermedia communication paradigms, such as HTMX. 6 | 7 |
8 | 9 | [![npm version](https://badge.fury.io/js/parse-html-stream.svg)](https://badge.fury.io/js/parse-html-stream) 10 | ![npm](https://img.shields.io/npm/dw/parse-html-stream) 11 | [![size](https://img.shields.io/bundlephobia/minzip/parse-html-stream)](https://bundlephobia.com/package/parse-html-stream) 12 | [![PRs Welcome][badge-prwelcome]][prwelcome] 13 | 14 | 15 | 16 | follow on Twitter 18 | 19 |
20 | 21 | [badge-prwelcome]: https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square 22 | [prwelcome]: http://makeapullrequest.com 23 | [spectrum]: https://spectrum.chat/parse-html-stream 24 | 25 | ## Getting started 26 | 27 | Run: 28 | 29 | ```sh 30 | bun install parse-html-stream 31 | ``` 32 | 33 | ## Usage Example 34 | 35 | Utilize the library by leveraging the asynchronous generator for parsing HTML streams. The following TypeScript example demonstrates its usage: 36 | 37 | ```ts 38 | import parseHTMLStream from "parse-html-stream"; 39 | 40 | // ... 41 | 42 | const reader = res.body.getReader(); 43 | 44 | for await (const node of parseHTMLStream(reader)) { 45 | console.log(node); 46 | } 47 | ``` 48 | 49 | This code snippet showcases how to iterate through the DOM Nodes in a streaming fashion, offering a practical approach for processing HTML streams in real-time. 50 | 51 | ## Walker example 52 | 53 | If you prefer to have control over moving around the HTML tree of the stream, you can use the following function: 54 | 55 | ```ts 56 | import htmlStreamWalker from "parse-html-stream/walker"; 57 | 58 | // ... 59 | 60 | const reader = res.body.getReader(); 61 | const walker = await htmlStreamWalker(reader); 62 | 63 | // Root node 64 | const rootNode = walker.rootNode 65 | 66 | // Gives the firstChild taking account the stream chunks 67 | const child = await walker.firstChild(rootNode); 68 | 69 | // Gives the nextSibling taking account the stream chunks 70 | const brother = await walker.nextSibling(rootNode); 71 | 72 | // You can do it with every HTML node: 73 | const childOfBrother = await walker.firstChild(brother); 74 | ``` 75 | 76 | The stream is processed as you walk through the tree, whenever it does not find a `firstChild` or `nextSibling` and has not yet finished the stream, it asks for another chunk. This way you can walk through the tree during the stream. -------------------------------------------------------------------------------- /bun.lockb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aralroca/parse-html-stream/43ff89eb3f22e3f073ea19212258745cd79b7de9/bun.lockb -------------------------------------------------------------------------------- /index.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Description: 3 | * 4 | * This module provides a function to parse an HTML stream into a 5 | * generator of nodes. 6 | */ 7 | export default async function* parseHTMLStream( 8 | streamReader: ReadableStreamDefaultReader, 9 | ): AsyncGenerator; 10 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "parse-html-stream", 3 | "version": "0.3.0", 4 | "module": "./build/index.js", 5 | "type": "module", 6 | "main": "./build/index.js", 7 | "types": "./index.d.ts", 8 | "license": "MIT", 9 | "author": { 10 | "name": "Aral Roca Gòmez", 11 | "email": "contact@aralroca.com" 12 | }, 13 | "files": [ 14 | "build", 15 | "index.d.ts", 16 | "walker.d.ts" 17 | ], 18 | "exports": { 19 | ".": { 20 | "import": "./build/index.js", 21 | "require": "./build/index.js", 22 | "types": "./index.d.ts" 23 | }, 24 | "./walker": { 25 | "import": "./build/walker/index.js", 26 | "require": "./build/walker/index.js", 27 | "types": "./walker.d.ts" 28 | } 29 | }, 30 | "repository": { 31 | "type": "git", 32 | "url": "https://github.com/aralroca/parse-html-stream.git" 33 | }, 34 | "scripts": { 35 | "build": "bun build --minify --outdir=build src/index.ts src/walker/index.ts", 36 | "test": "bun test" 37 | }, 38 | "devDependencies": { 39 | "@types/bun": "1.0.4", 40 | "jsdom": "24.0.0" 41 | }, 42 | "peerDependencies": { 43 | "typescript": "5.0.0" 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/index.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, it, expect } from "bun:test"; 2 | 3 | const { JSDOM } = require("jsdom"); 4 | 5 | const dom = new JSDOM(""); 6 | global.document = dom.window.document; 7 | global.window = dom.window; 8 | 9 | describe("parse-html-stream", () => { 10 | it("should handle an empty HTML stream", async () => { 11 | const stream = new ReadableStream({ 12 | start(controller) { 13 | controller.close(); 14 | }, 15 | }); 16 | 17 | const reader = stream.getReader(); 18 | const nodes = []; 19 | 20 | const parseHTMLStream = await import(".").then((m) => m.default); 21 | 22 | for await (const node of parseHTMLStream(reader)) { 23 | nodes.push(node); 24 | } 25 | 26 | expect(nodes).toEqual([]); 27 | }); 28 | 29 | it("should transform a stream of HTML into a stream of nodes", async () => { 30 | const encoder = new TextEncoder(); 31 | const stream = new ReadableStream({ 32 | start(controller) { 33 | controller.enqueue(encoder.encode("")); 34 | controller.enqueue(encoder.encode("")); 35 | controller.enqueue(encoder.encode("")); 36 | controller.enqueue(encoder.encode('
Bar
')); 37 | controller.enqueue(encoder.encode("")); 38 | controller.enqueue(encoder.encode("")); 39 | controller.close(); 40 | }, 41 | }); 42 | 43 | const reader = stream.getReader(); 44 | const nodeNames = []; 45 | 46 | const parseHTMLStream = await import(".").then((m) => m.default); 47 | 48 | for await (const node of parseHTMLStream(reader)) { 49 | nodeNames.push(node?.nodeName); 50 | } 51 | 52 | expect(nodeNames).toEqual(["HTML", "HEAD", "BODY", "DIV", "#text"]); 53 | }); 54 | 55 | it("should work with comments", async () => { 56 | const encoder = new TextEncoder(); 57 | const stream = new ReadableStream({ 58 | start(controller) { 59 | controller.enqueue(encoder.encode("")); 60 | controller.enqueue(encoder.encode("")); 61 | controller.enqueue(encoder.encode("")); 62 | controller.enqueue( 63 | encoder.encode('
Bar
'), 64 | ); 65 | controller.enqueue(encoder.encode("")); 66 | controller.enqueue(encoder.encode("")); 67 | controller.close(); 68 | }, 69 | }); 70 | 71 | const reader = stream.getReader(); 72 | const nodeNames = []; 73 | 74 | const parseHTMLStream = await import(".").then((m) => m.default); 75 | 76 | for await (const node of parseHTMLStream(reader)) { 77 | nodeNames.push(node?.nodeName); 78 | } 79 | 80 | expect(nodeNames).toEqual([ 81 | "HTML", 82 | "HEAD", 83 | "BODY", 84 | "DIV", 85 | "#comment", 86 | "#text", 87 | ]); 88 | }); 89 | 90 | it("should be possible to read the attributes of a node HTMLElement", async () => { 91 | const encoder = new TextEncoder(); 92 | const stream = new ReadableStream({ 93 | start(controller) { 94 | controller.enqueue(encoder.encode('
Bar
')); 95 | controller.close(); 96 | }, 97 | }); 98 | 99 | const reader = stream.getReader(); 100 | const nodes: Node[] = []; 101 | 102 | const parseHTMLStream = await import(".").then((m) => m.default); 103 | 104 | for await (const node of parseHTMLStream(reader)) { 105 | nodes.push(node); 106 | } 107 | 108 | expect(nodes).toHaveLength(5); 109 | expect(nodes[0]?.nodeName).toBe("HTML"); 110 | expect(nodes[1]?.nodeName).toBe("HEAD"); 111 | expect(nodes[2]?.nodeName).toBe("BODY"); 112 | expect(nodes[3]?.nodeName).toBe("DIV"); 113 | expect(nodes[4]?.nodeName).toBe("#text"); 114 | expect((nodes[3] as HTMLElement).getAttribute("class")).toBe("foo"); 115 | }); 116 | 117 | it("should work with very nested HTML", async () => { 118 | const encoder = new TextEncoder(); 119 | const stream = new ReadableStream({ 120 | start(controller) { 121 | controller.enqueue(encoder.encode("")); 122 | controller.enqueue(encoder.encode("")); 123 | controller.enqueue(encoder.encode("")); 124 | controller.enqueue(encoder.encode('
')); 125 | controller.enqueue(encoder.encode('
')); 126 | controller.enqueue(encoder.encode('
')); 127 | controller.enqueue(encoder.encode('
')); 128 | controller.enqueue(encoder.encode("Hello")); 129 | controller.enqueue(encoder.encode("
")); 130 | controller.enqueue(encoder.encode("
")); 131 | controller.enqueue(encoder.encode("
")); 132 | controller.enqueue(encoder.encode("
")); 133 | controller.enqueue(encoder.encode("")); 134 | controller.enqueue(encoder.encode("")); 135 | controller.close(); 136 | }, 137 | }); 138 | 139 | const reader = stream.getReader(); 140 | const nodes = []; 141 | 142 | const parseHTMLStream = await import(".").then((m) => m.default); 143 | 144 | for await (const node of parseHTMLStream(reader)) { 145 | nodes.push(node); 146 | } 147 | 148 | expect(nodes).toHaveLength(8); 149 | expect(nodes[0]?.nodeName).toBe("HTML"); 150 | expect(nodes[1]?.nodeName).toBe("HEAD"); 151 | expect(nodes[2]?.nodeName).toBe("BODY"); 152 | expect(nodes[3]?.nodeName).toBe("DIV"); 153 | expect((nodes[3] as HTMLElement).classList.contains("foo")).toBeTrue(); 154 | expect(nodes[4]?.nodeName).toBe("DIV"); 155 | expect((nodes[4] as HTMLElement).classList.contains("bar")).toBeTrue(); 156 | expect(nodes[5]?.nodeName).toBe("DIV"); 157 | expect((nodes[5] as HTMLElement).classList.contains("baz")).toBeTrue(); 158 | expect(nodes[6]?.nodeName).toBe("DIV"); 159 | expect((nodes[6] as HTMLElement).classList.contains("qux")).toBeTrue(); 160 | expect(nodes[7]?.nodeName).toBe("#text"); 161 | expect(nodes[7]?.textContent).toBe("Hello"); 162 | }); 163 | }); 164 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | const decoder = new TextDecoder(); 2 | 3 | export default async function* parseHTMLStream( 4 | streamReader: ReadableStreamDefaultReader, 5 | doc = document.implementation.createHTMLDocument(), 6 | lastChunkNode: Node | null = null, 7 | ): AsyncGenerator { 8 | const { done, value } = await streamReader.read(); 9 | 10 | if (done) return; 11 | 12 | doc.write(decoder.decode(value)); 13 | 14 | let lastNode = lastChunkNode 15 | ? getNextNode(lastChunkNode) 16 | : doc.documentElement; 17 | 18 | for (let node = lastNode; node; node = getNextNode(node)) { 19 | if (node) lastNode = node; 20 | yield node; 21 | } 22 | 23 | yield* await parseHTMLStream(streamReader, doc, lastNode ?? lastChunkNode); 24 | } 25 | 26 | /** 27 | * Get the next node in the tree. 28 | * It uses depth-first search in order to work with the streamed HTML. 29 | */ 30 | function getNextNode(node: Node | null, deeperDone?: Boolean): Node | null { 31 | if (!node) return null; 32 | if (node.childNodes.length && !deeperDone) return node.firstChild; 33 | return node.nextSibling ?? getNextNode(node.parentNode, true); 34 | } 35 | -------------------------------------------------------------------------------- /src/walker/index.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, it, expect } from "bun:test"; 2 | import { JSDOM } from "jsdom"; 3 | import htmlStreamWalker from "."; 4 | 5 | const dom = new JSDOM(""); 6 | global.document = dom.window.document; 7 | global.window = dom.window; 8 | 9 | describe("htmlStreamWalker", () => { 10 | it("should handle an empty HTML stream", async () => { 11 | const stream = new ReadableStream({ 12 | start(controller) { 13 | controller.close(); 14 | }, 15 | }); 16 | 17 | const reader = stream.getReader(); 18 | 19 | const { rootNode } = await htmlStreamWalker(reader); 20 | 21 | expect(rootNode).toBeEmpty(); 22 | }); 23 | 24 | it("should transform a stream of HTML into a stream of nodes", async () => { 25 | const encoder = new TextEncoder(); 26 | const stream = new ReadableStream({ 27 | start(controller) { 28 | controller.enqueue(encoder.encode("")); 29 | controller.enqueue(encoder.encode("")); 30 | controller.enqueue(encoder.encode("")); 31 | controller.enqueue(encoder.encode('
Bar
')); 32 | controller.enqueue(encoder.encode("")); 33 | controller.enqueue(encoder.encode("")); 34 | controller.close(); 35 | }, 36 | }); 37 | 38 | const reader = stream.getReader(); 39 | 40 | const { rootNode, firstChild, nextSibling } = 41 | await htmlStreamWalker(reader); 42 | 43 | expect(rootNode?.nodeName).toBe("HTML"); 44 | 45 | const child = await firstChild(rootNode!); 46 | expect(child?.nodeName).toBe("HEAD"); 47 | 48 | const body = await nextSibling(child!); 49 | expect(body?.nodeName).toBe("BODY"); 50 | 51 | const div = await firstChild(body!); 52 | expect(div?.nodeName).toBe("DIV"); 53 | 54 | const text = await firstChild(div!); 55 | expect(text?.nodeName).toBe("#text"); 56 | expect(text?.textContent).toBe("Bar"); 57 | }); 58 | 59 | it("should work with comments", async () => { 60 | const encoder = new TextEncoder(); 61 | const stream = new ReadableStream({ 62 | start(controller) { 63 | controller.enqueue(encoder.encode("")); 64 | controller.enqueue(encoder.encode("")); 65 | controller.enqueue(encoder.encode("")); 66 | controller.enqueue( 67 | encoder.encode('
Bar
'), 68 | ); 69 | controller.enqueue(encoder.encode("")); 70 | controller.enqueue(encoder.encode("")); 71 | controller.close(); 72 | }, 73 | }); 74 | 75 | const reader = stream.getReader(); 76 | 77 | const { rootNode, firstChild, nextSibling } = 78 | await htmlStreamWalker(reader); 79 | 80 | expect(rootNode?.nodeName).toBe("HTML"); 81 | 82 | const child = await firstChild(rootNode!); 83 | expect(child?.nodeName).toBe("HEAD"); 84 | 85 | const body = await nextSibling(child!); 86 | expect(body?.nodeName).toBe("BODY"); 87 | 88 | const div = await firstChild(body!); 89 | expect(div?.nodeName).toBe("DIV"); 90 | 91 | const comment = await firstChild(div!); 92 | expect(comment?.nodeName).toBe("#comment"); 93 | 94 | const text = await nextSibling(comment!); 95 | expect(text?.nodeName).toBe("#text"); 96 | expect(text?.textContent).toBe("Bar"); 97 | }); 98 | }); 99 | -------------------------------------------------------------------------------- /src/walker/index.ts: -------------------------------------------------------------------------------- 1 | const decoder = new TextDecoder(); 2 | const AUTOCREATED_NODE_NAMES = new Set(["HTML", "HEAD", "BODY"]); 3 | 4 | export default async function htmlStreamWalker( 5 | streamReader: ReadableStreamDefaultReader, 6 | ) { 7 | const doc = document.implementation.createHTMLDocument(); 8 | 9 | async function waitNextChunk() { 10 | const { done, value } = await streamReader.read(); 11 | if (!done) doc.write(decoder.decode(value)); 12 | return done; 13 | } 14 | 15 | const done = await waitNextChunk(); 16 | const rootNode = done ? null : doc.documentElement; 17 | 18 | function next(field: 'firstChild' | 'nextSibling') { 19 | return async (node: Node) => { 20 | if (!node) return null; 21 | if (AUTOCREATED_NODE_NAMES.has(node.nodeName)) await waitNextChunk(); 22 | if (!node[field]) await waitNextChunk(); 23 | if (node[field]) return node[field]; 24 | return null; 25 | } 26 | } 27 | 28 | return { rootNode, firstChild: next('firstChild'), nextSibling: next('nextSibling') }; 29 | } 30 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "lib": ["ESNext", "dom", "dom.iterable"], 4 | "module": "esnext", 5 | "baseUrl": "./src", 6 | "target": "esnext", 7 | "moduleResolution": "bundler", 8 | "moduleDetection": "force", 9 | "allowImportingTsExtensions": true, 10 | "verbatimModuleSyntax": true, 11 | "noFallthroughCasesInSwitch": true, 12 | "noEmit": true, 13 | "composite": true, 14 | "strict": true, 15 | "downlevelIteration": true, 16 | "skipLibCheck": true, 17 | "allowSyntheticDefaultImports": true, 18 | "forceConsistentCasingInFileNames": true, 19 | "allowJs": true, 20 | "paths": { 21 | "@/*": ["*"] 22 | } 23 | }, 24 | "exclude": ["node_modules", "build"] 25 | } 26 | -------------------------------------------------------------------------------- /walker.d.ts: -------------------------------------------------------------------------------- 1 | export default async function htmlStreamWalker( 2 | streamReader: ReadableStreamDefaultReader, 3 | ): Promise<{ 4 | rootNode: Node | null; 5 | firstChild: (node: Node) => Promise; 6 | nextSibling: (node: Node) => Promise; 7 | }> 8 | --------------------------------------------------------------------------------