├── .github
└── workflows
│ └── test.yml
├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── bun.lockb
├── index.d.ts
├── package.json
├── src
├── index.test.ts
├── index.ts
└── walker
│ ├── index.test.ts
│ └── index.ts
├── tsconfig.json
└── walker.d.ts
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
# Continuous integration: builds the package and runs the test suite with Bun.
name: Test

# Run on pushes to main and on pull requests that target main.
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      # v4 runs on a supported Node runtime; v3 relies on the deprecated Node 16 one.
      - uses: actions/checkout@v4
      - name: Bunjs
        uses: oven-sh/setup-bun@v1
        with:
          # Pinned so CI results are reproducible.
          bun-version: 1.0.25
      - name: Install dependencies
        run: bun install
      - run: bun run build
      - run: bun run test
23 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore
2 |
3 | # Build
4 |
5 | build
6 | # Logs
7 |
8 | logs
9 | *.log
10 | npm-debug.log*
11 | yarn-debug.log*
12 | yarn-error.log*
13 | lerna-debug.log*
14 | .pnpm-debug.log*
15 |
16 | # Caches
17 |
18 | .cache
19 |
20 | # Diagnostic reports (https://nodejs.org/api/report.html)
21 |
22 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
23 |
24 | # Runtime data
25 |
26 | pids
27 | *.pid
28 | *.seed
29 | *.pid.lock
30 |
31 | # Directory for instrumented libs generated by jscoverage/JSCover
32 |
33 | lib-cov
34 |
35 | # Coverage directory used by tools like istanbul
36 |
37 | coverage
38 | *.lcov
39 |
40 | # nyc test coverage
41 |
42 | .nyc_output
43 |
44 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
45 |
46 | .grunt
47 |
48 | # Bower dependency directory (https://bower.io/)
49 |
50 | bower_components
51 |
52 | # node-waf configuration
53 |
54 | .lock-wscript
55 |
56 | # Compiled binary addons (https://nodejs.org/api/addons.html)
57 |
58 | build/Release
59 |
60 | # Dependency directories
61 |
62 | node_modules/
63 | jspm_packages/
64 |
65 | # Snowpack dependency directory (https://snowpack.dev/)
66 |
67 | web_modules/
68 |
69 | # TypeScript cache
70 |
71 | *.tsbuildinfo
72 |
73 | # Optional npm cache directory
74 |
75 | .npm
76 |
77 | # Optional eslint cache
78 |
79 | .eslintcache
80 |
81 | # Optional stylelint cache
82 |
83 | .stylelintcache
84 |
85 | # Microbundle cache
86 |
87 | .rpt2_cache/
88 | .rts2_cache_cjs/
89 | .rts2_cache_es/
90 | .rts2_cache_umd/
91 |
92 | # Optional REPL history
93 |
94 | .node_repl_history
95 |
96 | # Output of 'npm pack'
97 |
98 | *.tgz
99 |
100 | # Yarn Integrity file
101 |
102 | .yarn-integrity
103 |
104 | # dotenv environment variable files
105 |
106 | .env
107 | .env.development.local
108 | .env.test.local
109 | .env.production.local
110 | .env.local
111 |
112 | # parcel-bundler cache (https://parceljs.org/)
113 |
114 | .parcel-cache
115 |
116 | # Next.js build output
117 |
118 | .next
119 | out
120 |
121 | # Nuxt.js build / generate output
122 |
123 | .nuxt
124 | dist
125 |
126 | # Gatsby files
127 |
128 | # Comment in the public line in if your project uses Gatsby and not Next.js
129 |
130 | # https://nextjs.org/blog/next-9-1#public-directory-support
131 |
132 | # public
133 |
134 | # vuepress build output
135 |
136 | .vuepress/dist
137 |
138 | # vuepress v2.x temp and cache directory
139 |
140 | .temp
141 |
142 | # Docusaurus cache and generated files
143 |
144 | .docusaurus
145 |
146 | # Serverless directories
147 |
148 | .serverless/
149 |
150 | # FuseBox cache
151 |
152 | .fusebox/
153 |
154 | # DynamoDB Local files
155 |
156 | .dynamodb/
157 |
158 | # TernJS port file
159 |
160 | .tern-port
161 |
162 | # Stores VSCode versions used for testing VSCode extensions
163 |
164 | .vscode-test
165 |
166 | # yarn v2
167 |
168 | .yarn/cache
169 | .yarn/unplugged
170 | .yarn/build-state.yml
171 | .yarn/install-state.gz
172 | .pnp.*
173 |
174 | # IntelliJ based IDEs
175 | .idea
176 |
177 | # Finder (MacOS) folder config
178 | .DS_Store
179 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
6 |
7 | ## Our Standards
8 |
9 | Examples of behavior that contributes to creating a positive environment include:
10 |
11 | - Using welcoming and inclusive language
12 | - Being respectful of differing viewpoints and experiences
13 | - Gracefully accepting constructive criticism
14 | - Focusing on what is best for the community
15 | - Showing empathy towards other community members
16 |
17 | Examples of unacceptable behavior by participants include:
18 |
19 | - The use of sexualized language or imagery and unwelcome sexual attention or advances
20 | - Trolling, insulting/derogatory comments, and personal or political attacks
21 | - Public or private harassment
22 | - Publishing others' private information, such as a physical or electronic address, without explicit permission
23 | - Other conduct which could reasonably be considered inappropriate in a professional setting
24 |
25 | ## Our Responsibilities
26 |
27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
28 |
29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
30 |
31 | ## Scope
32 |
33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
34 |
35 | ## Enforcement
36 |
37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at contact@aralroca.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
38 |
39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
40 |
41 | ## Attribution
42 |
43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
44 |
45 | [homepage]: http://contributor-covenant.org
46 | [version]: http://contributor-covenant.org/version/1/4/
47 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | When contributing to this repository, please first discuss the change you wish to make via issue,
4 | email, or any other method with the owners of this repository before making a change.
5 |
6 | Please note that we have a code of conduct; please follow it in all your interactions with the project.
7 |
8 | ## Pull Request Process
9 |
10 | 1. Ensure your pull request targets the canary branch.
11 | 2. Write failing tests that cover the issue or feature you are working on.
12 | 3. Update the README.md with details of changes to the interface.
13 | 4. You may merge the Pull Request in once you have the approval of at least one maintainer, or if you
14 | do not have permission to do that, you may request the maintainer to merge it for you.
15 |
16 | ## Code of Conduct
17 |
18 | ### Our Pledge
19 |
20 | In the interest of fostering an open and welcoming environment, we as
21 | contributors and maintainers pledge to making participation in our project and
22 | our community a harassment-free experience for everyone, regardless of age, body
23 | size, disability, ethnicity, gender identity and expression, level of experience,
24 | nationality, personal appearance, race, religion, or sexual identity and
25 | orientation.
26 |
27 | ### Our Standards
28 |
29 | Examples of behavior that contributes to creating a positive environment
30 | include:
31 |
32 | - Using welcoming and inclusive language
33 | - Being respectful of differing viewpoints and experiences
34 | - Gracefully accepting constructive criticism
35 | - Focusing on what is best for the community
36 | - Showing empathy towards other community members
37 |
38 | Examples of unacceptable behavior by participants include:
39 |
40 | - The use of sexualized language or imagery and unwelcome sexual attention or
41 | advances
42 | - Trolling, insulting/derogatory comments, and personal or political attacks
43 | - Public or private harassment
44 | - Publishing others' private information, such as a physical or electronic
45 | address, without explicit permission
46 | - Other conduct which could reasonably be considered inappropriate in a
47 | professional setting
48 |
49 | ### Our Responsibilities
50 |
51 | Project maintainers are responsible for clarifying the standards of acceptable
52 | behavior and are expected to take appropriate and fair corrective action in
53 | response to any instances of unacceptable behavior.
54 |
55 | Project maintainers have the right and responsibility to remove, edit, or
56 | reject comments, commits, code, wiki edits, issues, and other contributions
57 | that are not aligned to this Code of Conduct, or to ban temporarily or
58 | permanently any contributor for other behaviors that they deem inappropriate,
59 | threatening, offensive, or harmful.
60 |
61 | ### Scope
62 |
63 | This Code of Conduct applies both within project spaces and in public spaces
64 | when an individual is representing the project or its community. Examples of
65 | representing a project or community include using an official project e-mail
66 | address, posting via an official social media account, or acting as an appointed
67 | representative at an online or offline event. Representation of a project may be
68 | further defined and clarified by project maintainers.
69 |
70 | ### Enforcement
71 |
72 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
73 | reported by contacting the project team at contact@aralroca.com. All
74 | complaints will be reviewed and investigated and will result in a response that
75 | is deemed necessary and appropriate to the circumstances. The project team is
76 | obligated to maintain confidentiality with regard to the reporter of an incident.
77 | Further details of specific enforcement policies may be posted separately.
78 |
79 | Project maintainers who do not follow or enforce the Code of Conduct in good
80 | faith may face temporary or permanent repercussions as determined by other
81 | members of the project's leadership.
82 |
83 | ### Attribution
84 |
85 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
86 | available at [http://contributor-covenant.org/version/1/4][version]
87 |
88 | [homepage]: http://contributor-covenant.org
89 | [version]: http://contributor-covenant.org/version/1/4/
90 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | MIT License
4 |
5 | Copyright (c) 2024 Aral Roca Gomez
6 |
7 | Permission is hereby granted, free of charge, to any person obtaining a copy
8 | of this software and associated documentation files (the "Software"), to deal
9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 |
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # parse-html-stream
2 |
3 | ## Overview
4 |
5 | `parse-html-stream` is a JavaScript library designed for client-side applications, specifically tailored for processing HTML streams. The primary objective is to capture and manipulate DOM Nodes as they are received, enabling seamless integration into hypermedia communication paradigms, such as HTMX.
6 |
7 |
8 |
9 | [![npm version](https://badge.fury.io/js/parse-html-stream.svg)](https://badge.fury.io/js/parse-html-stream)
10 | 
11 | [![size](https://img.shields.io/bundlephobia/minzip/parse-html-stream)](https://bundlephobia.com/package/parse-html-stream)
12 | [![PRs Welcome][badge-prwelcome]][prwelcome]
13 |
14 | 
15 |
16 | 
18 |
19 |
20 |
21 | [badge-prwelcome]: https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square
22 | [prwelcome]: http://makeapullrequest.com
23 | [spectrum]: https://spectrum.chat/parse-html-stream
24 |
25 | ## Getting started
26 |
27 | Run:
28 |
29 | ```sh
30 | bun install parse-html-stream
31 | ```
32 |
33 | ## Usage Example
34 |
35 | Utilize the library by leveraging the asynchronous generator for parsing HTML streams. The following TypeScript example demonstrates its usage:
36 |
37 | ```ts
38 | import parseHTMLStream from "parse-html-stream";
39 |
40 | // ...
41 |
42 | const reader = res.body.getReader();
43 |
44 | for await (const node of parseHTMLStream(reader)) {
45 | console.log(node);
46 | }
47 | ```
48 |
49 | This code snippet showcases how to iterate through the DOM Nodes in a streaming fashion, offering a practical approach for processing HTML streams in real-time.
50 |
51 | ## Walker example
52 |
53 | If you prefer to have control over moving around the HTML tree of the stream, you can use the following function:
54 |
55 | ```ts
56 | import htmlStreamWalker from "parse-html-stream/walker";
57 |
58 | // ...
59 |
60 | const reader = res.body.getReader();
61 | const walker = await htmlStreamWalker(reader);
62 |
63 | // Root node
64 | const rootNode = walker.rootNode
65 |
66 | // Gives the firstChild, taking the stream chunks into account
67 | const child = await walker.firstChild(rootNode);
68 |
69 | // Gives the nextSibling, taking the stream chunks into account
70 | const brother = await walker.nextSibling(child);
71 |
72 | // You can do it with every HTML node:
73 | const childOfBrother = await walker.firstChild(brother);
74 | ```
75 |
76 | The stream is processed as you walk through the tree: whenever the walker does not find a `firstChild` or `nextSibling` and the stream has not finished yet, it requests another chunk. This way you can walk through the tree while the stream is still arriving.
--------------------------------------------------------------------------------
/bun.lockb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aralroca/parse-html-stream/43ff89eb3f22e3f073ea19212258745cd79b7de9/bun.lockb
--------------------------------------------------------------------------------
/index.d.ts:
--------------------------------------------------------------------------------
/**
 * Parses a streamed HTML document into DOM nodes.
 *
 * Every chunk read from `streamReader` is written into an HTML document and
 * the nodes that became reachable are yielded in depth-first order, so nodes
 * can be consumed while the stream is still being received.
 *
 * Declared without `async` / `function*`: those modifiers belong to the
 * implementation and are not permitted in an ambient declaration.
 *
 * @param streamReader Reader over the encoded HTML stream, e.g. `res.body.getReader()`.
 * @returns An async generator yielding each parsed `Node`.
 */
export default function parseHTMLStream(
  streamReader: ReadableStreamDefaultReader,
): AsyncGenerator<Node>;
10 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "parse-html-stream",
3 | "version": "0.3.0",
4 | "module": "./build/index.js",
5 | "type": "module",
6 | "main": "./build/index.js",
7 | "types": "./index.d.ts",
8 | "license": "MIT",
9 | "author": {
10 | "name": "Aral Roca Gòmez",
11 | "email": "contact@aralroca.com"
12 | },
13 | "files": [
14 | "build",
15 | "index.d.ts",
16 | "walker.d.ts"
17 | ],
18 | "exports": {
19 | ".": {
20 | "import": "./build/index.js",
21 | "require": "./build/index.js",
22 | "types": "./index.d.ts"
23 | },
24 | "./walker": {
25 | "import": "./build/walker/index.js",
26 | "require": "./build/walker/index.js",
27 | "types": "./walker.d.ts"
28 | }
29 | },
30 | "repository": {
31 | "type": "git",
32 | "url": "https://github.com/aralroca/parse-html-stream.git"
33 | },
34 | "scripts": {
35 | "build": "bun build --minify --outdir=build src/index.ts src/walker/index.ts",
36 | "test": "bun test"
37 | },
38 | "devDependencies": {
39 | "@types/bun": "1.0.4",
40 | "jsdom": "24.0.0"
41 | },
42 | "peerDependencies": {
43 | "typescript": "5.0.0"
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/src/index.test.ts:
--------------------------------------------------------------------------------
import { describe, it, expect } from "bun:test";
import { JSDOM } from "jsdom";
import parseHTMLStream from ".";

// NOTE(review): the HTML markup inside the chunk strings below was rebuilt from
// the assertions of each test (node names, class names, text content) —
// confirm the exact literals against the repository.

// The parser relies on the global `document`; provide one backed by JSDOM.
const dom = new JSDOM("");
global.document = dom.window.document;
global.window = dom.window;

/** Builds a reader over a stream that delivers every string as its own encoded chunk. */
function readerOf(chunks: string[]) {
  const encoder = new TextEncoder();
  const stream = new ReadableStream({
    start(controller) {
      for (const chunk of chunks) controller.enqueue(encoder.encode(chunk));
      controller.close();
    },
  });

  return stream.getReader();
}

/** Drains the parser over the given chunks and returns the yielded nodes in order. */
async function collectNodes(chunks: string[]): Promise<Node[]> {
  const nodes: Node[] = [];

  for await (const node of parseHTMLStream(readerOf(chunks))) {
    nodes.push(node);
  }

  return nodes;
}

describe("parse-html-stream", () => {
  it("should handle an empty HTML stream", async () => {
    const nodes = await collectNodes([]);

    expect(nodes).toEqual([]);
  });

  it("should transform a stream of HTML into a stream of nodes", async () => {
    const nodes = await collectNodes([
      "<html>",
      "<head />",
      "<body>",
      '<div class="foo">Bar</div>',
      "</body>",
      "</html>",
    ]);
    const nodeNames = nodes.map((node) => node?.nodeName);

    expect(nodeNames).toEqual(["HTML", "HEAD", "BODY", "DIV", "#text"]);
  });

  it("should work with comments", async () => {
    const nodes = await collectNodes([
      "<html>",
      "<head />",
      "<body>",
      '<div class="foo"><!-- comment -->Bar</div>',
      "</body>",
      "</html>",
    ]);
    const nodeNames = nodes.map((node) => node?.nodeName);

    expect(nodeNames).toEqual([
      "HTML",
      "HEAD",
      "BODY",
      "DIV",
      "#comment",
      "#text",
    ]);
  });

  it("should be possible to read the attributes of a node HTMLElement", async () => {
    const nodes = await collectNodes(['<div class="foo">Bar</div>']);

    expect(nodes).toHaveLength(5);
    expect(nodes[0]?.nodeName).toBe("HTML");
    expect(nodes[1]?.nodeName).toBe("HEAD");
    expect(nodes[2]?.nodeName).toBe("BODY");
    expect(nodes[3]?.nodeName).toBe("DIV");
    expect(nodes[4]?.nodeName).toBe("#text");
    expect((nodes[3] as HTMLElement).getAttribute("class")).toBe("foo");
  });

  it("should work with very nested HTML", async () => {
    const nodes = await collectNodes([
      "<html>",
      "<head />",
      "<body>",
      '<div class="foo">',
      '<div class="bar">',
      '<div class="baz">',
      '<div class="qux">',
      "Hello",
      "</div>",
      "</div>",
      "</div>",
      "</div>",
      "</body>",
      "</html>",
    ]);

    expect(nodes).toHaveLength(8);
    expect(nodes[0]?.nodeName).toBe("HTML");
    expect(nodes[1]?.nodeName).toBe("HEAD");
    expect(nodes[2]?.nodeName).toBe("BODY");
    expect(nodes[3]?.nodeName).toBe("DIV");
    expect((nodes[3] as HTMLElement).classList.contains("foo")).toBeTrue();
    expect(nodes[4]?.nodeName).toBe("DIV");
    expect((nodes[4] as HTMLElement).classList.contains("bar")).toBeTrue();
    expect(nodes[5]?.nodeName).toBe("DIV");
    expect((nodes[5] as HTMLElement).classList.contains("baz")).toBeTrue();
    expect(nodes[6]?.nodeName).toBe("DIV");
    expect((nodes[6] as HTMLElement).classList.contains("qux")).toBeTrue();
    expect(nodes[7]?.nodeName).toBe("#text");
    expect(nodes[7]?.textContent).toBe("Hello");
  });
});
164 |
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
/**
 * Parses a streamed HTML document and yields its DOM nodes as they arrive.
 *
 * Each chunk is decoded, written into `doc`, and every node that became
 * reachable since the previous chunk is yielded in depth-first order.
 *
 * @param streamReader Reader over the encoded HTML stream.
 * @param doc Document the HTML is written into; a fresh one by default.
 * @param lastChunkNode Node after which the traversal resumes; `null` starts
 *   at the document element.
 * @returns An async generator yielding each newly parsed node.
 */
export default async function* parseHTMLStream(
  streamReader: ReadableStreamDefaultReader,
  doc = document.implementation.createHTMLDocument(),
  lastChunkNode: Node | null = null,
): AsyncGenerator<Node> {
  // One decoder per parse: in streaming mode it buffers an incomplete
  // multi-byte sequence at the end of a chunk, so it must not be shared
  // between independent streams.
  const decoder = new TextDecoder();
  let lastYielded = lastChunkNode;

  // A loop instead of one nested generator per chunk keeps long streams from
  // building an ever deeper delegation chain.
  while (true) {
    const { done, value } = await streamReader.read();

    // `stream: true` keeps characters split across chunks intact; the final
    // argument-less call flushes whatever is still buffered.
    const html = done
      ? decoder.decode()
      : decoder.decode(value, { stream: true });

    // A finished stream with nothing left to flush adds no nodes.
    if (done && !html) return;

    doc.write(html);

    const start = lastYielded ? getNextNode(lastYielded) : doc.documentElement;

    for (let node: Node | null = start; node; node = getNextNode(node)) {
      lastYielded = node;
      yield node;
    }

    if (done) return;
  }
}
25 |
/**
 * Get the next node in the tree.
 * It uses depth-first search in order to work with the streamed HTML.
 *
 * @param node Node to continue from; `null` yields `null`.
 * @param deeperDone When true, the children of `node` are skipped and only
 *   siblings / ancestors' siblings are considered.
 * @returns The next node in document order, or `null` when there is none.
 */
function getNextNode(node: Node | null, deeperDone?: boolean): Node | null {
  if (!node) return null;

  // Descend first, unless the subtree below `node` was already visited.
  if (!deeperDone && node.childNodes.length) return node.firstChild;

  // Otherwise take the closest following sibling, climbing through ancestors.
  for (let current: Node | null = node; current; current = current.parentNode) {
    if (current.nextSibling) return current.nextSibling;
  }

  return null;
}
35 |
--------------------------------------------------------------------------------
/src/walker/index.test.ts:
--------------------------------------------------------------------------------
import { describe, it, expect } from "bun:test";
import { JSDOM } from "jsdom";
import htmlStreamWalker from ".";

// NOTE(review): the HTML markup inside the chunk strings below was rebuilt from
// the assertions of each test (node names and text content) — confirm the
// exact literals against the repository.

// The walker relies on the global `document`; provide one backed by JSDOM.
const dom = new JSDOM("");
global.document = dom.window.document;
global.window = dom.window;

/** Builds a reader over a stream that delivers every string as its own encoded chunk. */
function readerOf(chunks: string[]) {
  const encoder = new TextEncoder();
  const stream = new ReadableStream({
    start(controller) {
      for (const chunk of chunks) controller.enqueue(encoder.encode(chunk));
      controller.close();
    },
  });

  return stream.getReader();
}

describe("htmlStreamWalker", () => {
  it("should handle an empty HTML stream", async () => {
    const { rootNode } = await htmlStreamWalker(readerOf([]));

    expect(rootNode).toBeEmpty();
  });

  it("should transform a stream of HTML into a stream of nodes", async () => {
    const reader = readerOf([
      "<html>",
      "<head />",
      "<body>",
      '<div class="foo">Bar</div>',
      "</body>",
      "</html>",
    ]);

    const { rootNode, firstChild, nextSibling } =
      await htmlStreamWalker(reader);

    expect(rootNode?.nodeName).toBe("HTML");

    const child = await firstChild(rootNode!);
    expect(child?.nodeName).toBe("HEAD");

    const body = await nextSibling(child!);
    expect(body?.nodeName).toBe("BODY");

    const div = await firstChild(body!);
    expect(div?.nodeName).toBe("DIV");

    const text = await firstChild(div!);
    expect(text?.nodeName).toBe("#text");
    expect(text?.textContent).toBe("Bar");
  });

  it("should work with comments", async () => {
    const reader = readerOf([
      "<html>",
      "<head />",
      "<body>",
      '<div class="foo"><!-- comment -->Bar</div>',
      "</body>",
      "</html>",
    ]);

    const { rootNode, firstChild, nextSibling } =
      await htmlStreamWalker(reader);

    expect(rootNode?.nodeName).toBe("HTML");

    const child = await firstChild(rootNode!);
    expect(child?.nodeName).toBe("HEAD");

    const body = await nextSibling(child!);
    expect(body?.nodeName).toBe("BODY");

    const div = await firstChild(body!);
    expect(div?.nodeName).toBe("DIV");

    const comment = await firstChild(div!);
    expect(comment?.nodeName).toBe("#comment");

    const text = await nextSibling(comment!);
    expect(text?.nodeName).toBe("#text");
    expect(text?.textContent).toBe("Bar");
  });
});
99 |
--------------------------------------------------------------------------------
/src/walker/index.ts:
--------------------------------------------------------------------------------
// Node names for which the walker always requests one more chunk before
// reading `firstChild` / `nextSibling`.
// NOTE(review): presumably because the parser creates these elements on its
// own before their real content has arrived — confirm.
const AUTOCREATED_NODE_NAMES = new Set(["HTML", "HEAD", "BODY"]);

/**
 * Creates a walker over a streamed HTML document.
 *
 * The first chunk is consumed immediately; further chunks are only requested
 * while walking, when the requested link of a node is not available yet.
 *
 * @param streamReader Reader over the encoded HTML stream.
 * @returns `rootNode` (`null` when the stream was already finished) plus the
 *   async `firstChild` / `nextSibling` accessors.
 */
export default async function htmlStreamWalker(
  streamReader: ReadableStreamDefaultReader,
) {
  const doc = document.implementation.createHTMLDocument();

  // One decoder per walker: in streaming mode it buffers an incomplete
  // multi-byte sequence at the end of a chunk, so it must not be shared
  // between independent streams.
  const decoder = new TextDecoder();

  /** Reads one chunk into the document; resolves to true once the stream is finished. */
  async function waitNextChunk() {
    const { done, value } = await streamReader.read();

    if (!done) {
      // `stream: true` keeps characters split across chunks intact.
      doc.write(decoder.decode(value, { stream: true }));
    } else {
      // Flush bytes still buffered from a truncated trailing sequence.
      const tail = decoder.decode();
      if (tail) doc.write(tail);
    }

    return done;
  }

  const done = await waitNextChunk();
  const rootNode = done ? null : doc.documentElement;

  /** Builds the accessor for one link, requesting more of the stream when the link is missing. */
  function next(field: "firstChild" | "nextSibling") {
    return async (node: Node) => {
      if (!node) return null;
      if (AUTOCREATED_NODE_NAMES.has(node.nodeName)) await waitNextChunk();
      if (!node[field]) await waitNextChunk();
      return node[field] ?? null;
    };
  }

  return {
    rootNode,
    firstChild: next("firstChild"),
    nextSibling: next("nextSibling"),
  };
}
30 |
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "lib": ["ESNext", "dom", "dom.iterable"],
4 | "module": "esnext",
5 | "baseUrl": "./src",
6 | "target": "esnext",
7 | "moduleResolution": "bundler",
8 | "moduleDetection": "force",
9 | "allowImportingTsExtensions": true,
10 | "verbatimModuleSyntax": true,
11 | "noFallthroughCasesInSwitch": true,
12 | "noEmit": true,
13 | "composite": true,
14 | "strict": true,
15 | "downlevelIteration": true,
16 | "skipLibCheck": true,
17 | "allowSyntheticDefaultImports": true,
18 | "forceConsistentCasingInFileNames": true,
19 | "allowJs": true,
20 | "paths": {
21 | "@/*": ["*"]
22 | }
23 | },
24 | "exclude": ["node_modules", "build"]
25 | }
26 |
--------------------------------------------------------------------------------
/walker.d.ts:
--------------------------------------------------------------------------------
/**
 * Creates a walker over a streamed HTML document.
 *
 * Declared without `async`: that modifier belongs to the implementation and
 * is not permitted in an ambient declaration.
 *
 * @param streamReader Reader over the encoded HTML stream, e.g. `res.body.getReader()`.
 * @returns A promise of the walker: `rootNode` is `null` when the stream was
 *   already finished; `firstChild` and `nextSibling` resolve to the requested
 *   node, or `null` when it is not available.
 */
export default function htmlStreamWalker(
  streamReader: ReadableStreamDefaultReader,
): Promise<{
  rootNode: Node | null;
  firstChild: (node: Node) => Promise<Node | null>;
  nextSibling: (node: Node) => Promise<Node | null>;
}>;
8 |
--------------------------------------------------------------------------------