├── .github └── workflows │ ├── jsr.yml │ └── npm.yml ├── .gitignore ├── LICENSE ├── README.md ├── deno.json ├── examples ├── .gitignore ├── advanced │ ├── json.ts │ ├── mod.ts │ ├── server.ts │ └── stream.ts ├── bun.ts ├── deno.ts └── node.mjs ├── mod.ts ├── package.json └── src ├── node.ts ├── parse.ts ├── stream.ts └── types.ts /.github/workflows/jsr.yml: -------------------------------------------------------------------------------- 1 | name: Publish JSR 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | publish: 9 | runs-on: ubuntu-latest 10 | 11 | permissions: 12 | contents: read 13 | id-token: write 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - name: Publish package 19 | run: npx jsr publish 20 | -------------------------------------------------------------------------------- /.github/workflows/npm.yml: -------------------------------------------------------------------------------- 1 | name: Publish NPM 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | publish: 9 | runs-on: ubuntu-latest 10 | 11 | permissions: 12 | contents: read 13 | id-token: write 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - uses: actions/setup-node@v4 19 | with: 20 | node-version: '22.x' 21 | registry-url: 'https://registry.npmjs.org' 22 | - run: npm publish --provenance --access public 23 | env: 24 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /* 2 | /*.* 3 | !.github 4 | !.gitignore 5 | !deno.json 6 | !package.json 7 | !README.md 8 | !LICENSE 9 | !mod.ts 10 | !src 11 | !src/**/*.ts 12 | !examples 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 David Bushell 4 | 5 | Permission is 
hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 📰 XML Streamify 2 | 3 | [![JSR](https://jsr.io/badges/@dbushell/xml-streamify?labelColor=98e6c8&color=333)](https://jsr.io/@dbushell/xml-streamify) [![NPM](https://img.shields.io/npm/v/@dbushell/xml-streamify?labelColor=98e6c8&color=333)](https://www.npmjs.com/package/@dbushell/xml-streamify) 4 | 5 | Fetch and parse XML documents using the power of JavaScript web streams and async iterators ✨ 6 | 7 | * Small, fast, zero dependencies † 8 | * Work with data before the fetch is complete 9 | * Cross-runtime support (Bun, Deno, Node, and web browsers) 10 | 11 | **This is experimental work in progress.** But it does seem to work. It was designed to parse RSS feeds. 
This project may not be fully XML compliant. It can handle XHTML in some cases. It will not parse HTML, where elements like `<br>` are not self-closing and elements like `<li>` do not require a closing `</li>`, for example.
// Opt in to receiving the XML declaration node by disabling `ignoreDeclaration`
const parser = parse('https://dbushell.com/rss.xml', {
  ignoreDeclaration: false
});

// Log the declaration and the first RSS item as JSON, then stop reading
for await (const node of parser) {
  const isItem = node.is('channel', 'item');
  if (isItem || node.type === 'declaration') {
    console.log(toJSON(node));
  }
  if (isItem) {
    break;
  }
}
/**
 * Lazily split a feed into UTF-8 encoded chunks.
 *
 * The text is encoded once and then cut on byte boundaries, so every chunk
 * holds at most `chunkSize` bytes and concatenating all chunks yields exactly
 * the UTF-8 encoding of the input. Cutting the string itself (by UTF-16 code
 * units) could split a surrogate pair, which the encoder would replace with
 * U+FFFD and corrupt characters such as emoji.
 *
 * @param bytes - Feed text to encode
 * @param chunkSize - Maximum chunk size in bytes (fractions are rounded down)
 * @throws RangeError if `chunkSize` is NaN or less than 1, which would
 * otherwise never make progress
 */
const generate = function* (bytes: string, chunkSize: number) {
  const size = Math.floor(chunkSize);
  // Written as a negated comparison so that NaN is rejected as well
  if (!(size >= 1)) {
    throw new RangeError(`chunkSize must be at least 1 (received ${chunkSize})`);
  }
  const encoded = new TextEncoder().encode(bytes);
  for (let offset = 0; offset < encoded.length; offset += size) {
    // `slice` copies, so each chunk owns its buffer independently
    yield encoded.slice(offset, offset + size);
  }
};
Number(serverURL.port), 52 | hostname: serverURL.hostname, 53 | signal: serverController.signal, 54 | onListen: () => { 55 | console.log(`🚀 Proxy server on ${serverURL.href}`); 56 | console.log('\nExample feeds:'); 57 | console.log(`${serverURL.href}rss`); 58 | console.log( 59 | `${serverURL.href}rss?feed=https://shoptalkshow.com/feed/podcast/` 60 | ); 61 | console.log(`\n⚠️ This server is intentionally slow!\n`); 62 | }, 63 | onError: (error) => { 64 | console.error(error); 65 | return new Response(null, { 66 | status: 500 67 | }); 68 | } 69 | }, 70 | (request: Request) => { 71 | const url = new URL(request.url); 72 | if (url.pathname === '/rss') { 73 | return stream(url.searchParams.get('feed') ?? FEED); 74 | } 75 | return new Response(null, { 76 | status: 302, 77 | headers: { 78 | location: '/rss' 79 | } 80 | }); 81 | } 82 | ); 83 | -------------------------------------------------------------------------------- /examples/advanced/stream.ts: -------------------------------------------------------------------------------- 1 | import {XMLStream} from '../../mod.ts'; 2 | 3 | const response = await fetch('https://dbushell.com/rss.xml'); 4 | 5 | if (!response.ok || !response.body) { 6 | throw new Error('Bad response'); 7 | } 8 | 9 | const stream = response.body.pipeThrough(new XMLStream()); 10 | 11 | for await (const [type, value] of stream) { 12 | // e.g. 
/**
 * Fetch a podcast feed manually and hand the response body to `parse`,
 * aborting the parser once ten items have been collected.
 *
 * @throws Error if the response is not OK or has no body to stream
 */
const podcast = async () => {
  const controller = new AbortController();
  const response = await fetch('https://feed.syntax.fm/rss');
  // Guard instead of asserting with `!`: the body is null for some responses
  // and an error page should not be parsed as a feed
  if (!response.ok || !response.body) {
    throw new Error('Bad response');
  }
  const parser = parse(response.body, {
    signal: controller.signal
  });
  const items: Node[] = [];
  for await (const node of parser) {
    if (node.is('channel', 'item')) {
      items.push(node);
      // Stop reading the stream after the tenth item
      if (items.length === 10) {
        controller.abort();
      }
    }
  }
  console.log(items.map((item) => item.first('title')?.innerText));
};
/** XML node with helper methods to read data and traverse the tree */
export class Node {
  #type: string;
  #children: Array<Node>;
  #parent?: Node;
  #attr?: Record<string, string>;
  #raw?: string;

  /**
   * @param type - Node type (the tag name for elements)
   * @param parent - Parent node, omitted for a root node
   * @param raw - Raw source text the node was created from
   */
  constructor(type: string, parent?: Node, raw?: string) {
    this.#type = type;
    this.#parent = parent;
    this.#raw = raw;
    this.#children = [];
  }

  /** Node type given at construction */
  get type(): string {
    return this.#type;
  }

  /** Raw source text, or an empty string if none was provided */
  get raw(): string {
    return this.#raw ?? '';
  }

  /** Parent node, or `undefined` for a root node */
  get parent(): Node | undefined {
    return this.#parent;
  }

  /** Child nodes in the order they were added */
  get children(): Array<Node> {
    return this.#children;
  }

  /**
   * Attributes parsed from the raw source as `name="value"` or
   * `name='value'` pairs. Parsed on first access and cached; values are
   * returned verbatim.
   */
  get attributes(): Record<string, string> {
    if (this.#attr) {
      return this.#attr;
    }
    const attr: Record<string, string> = {};
    if (this.raw) {
      // Group 2 captures the opening quote so the value ends at the same quote
      const regex = /([\w:.-]+)\s*=\s*(["'])(.*?)\2/g;
      let match: RegExpExecArray | null;
      while ((match = regex.exec(this.raw)) !== null) {
        attr[match[1]] = match[3];
      }
    }
    this.#attr = attr;
    return attr;
  }

  /**
   * Concatenated text of all descendants. For a node without children this
   * is the content of a CDATA section found in the raw source, otherwise
   * the raw source itself.
   */
  get innerText(): string {
    if (this.children.length) {
      let text = '';
      for (const child of this.children) {
        text += child.innerText;
      }
      return text;
    }
    const cdata = this.raw.match(/<!\[CDATA\[(.*?)]]>/s);
    return cdata ? cdata[1] : this.raw;
  }

  /** Append a child node */
  addChild(child: Node): void {
    this.#children.push(child);
  }

  /**
   * Returns true if node and parents match the key hierarchy
   * @param keys - XML tag names, outermost ancestor first and this node last
   */
  is(...keys: Array<string>): boolean {
    if (!keys.length) return false;
    // Walk up from this node while reading the keys from last to first;
    // avoids `toReversed()` which needs an ES2023 runtime and copies the array
    let current: Node | undefined = this;
    for (let i = keys.length - 1; i >= 0; i--) {
      if (current?.type !== keys[i]) {
        return false;
      }
      current = current.parent;
    }
    return true;
  }

  /**
   * Return the first child matching the key
   * @param key - XML tag name
   */
  first(key: string): Node | undefined {
    return this.children.find((n) => n.type === key);
  }

  /**
   * Return all children matching the key hierarchy. Every key except the
   * last descends into the first matching child; the last key selects all
   * matching children at that level.
   * @param keys - XML tag names
   */
  all(...keys: Array<string>): Array<Node> {
    if (!keys.length) return [];
    let nodes = this.children;
    for (const key of keys.slice(0, -1)) {
      const next = nodes.find((n) => n.type === key);
      if (!next) return [];
      nodes = next.children;
    }
    const last = keys[keys.length - 1];
    return nodes.filter((n) => n.type === last);
  }
}
3 | * 4 | * @module 5 | */ 6 | import type {ParseOptions} from './types.ts'; 7 | import {NodeType} from './types.ts'; 8 | import {Node} from './node.ts'; 9 | import {XMLStream} from './stream.ts'; 10 | 11 | const ignoreTypes: Partial> = { 12 | [NodeType.COMMENT]: 'ignoreComments', 13 | [NodeType.DECLARATION]: 'ignoreDeclaration', 14 | [NodeType.DOCTYPE]: 'ignoreDoctype' 15 | } as const; 16 | 17 | /** 18 | * Async generator function for parsing a streamed XML document 19 | * @param input URL to fetch and parse (or a ReadableStream) 20 | * @param options Parsing options {@link ParseOptions} 21 | * @returns Yields parsed XML nodes {@link Node} 22 | */ 23 | export async function* parse( 24 | input: string | URL | ReadableStream, 25 | options?: ParseOptions 26 | ): AsyncGenerator { 27 | const document = new Node('@document'); 28 | try { 29 | const init = {...options?.fetchOptions}; 30 | if (options?.signal) { 31 | init.signal = options.signal; 32 | } 33 | 34 | let source: ReadableStream; 35 | 36 | // Fetch stream if URL is provided as input 37 | if (typeof input === 'string' || input instanceof URL) { 38 | input = new URL(input); 39 | const response = await fetch(input, init); 40 | if (!response.ok || !response.body) { 41 | throw new Error(`Bad response`); 42 | } 43 | source = response.body; 44 | } else { 45 | source = input; 46 | } 47 | 48 | const stream = source 49 | .pipeThrough(new TextDecoderStream()) 50 | .pipeThrough(new XMLStream(), { 51 | signal: options?.signal 52 | }); 53 | 54 | // Set root document as current node 55 | let node = document; 56 | 57 | for await (const [type, value] of stream) { 58 | if (options?.signal?.aborted) { 59 | break; 60 | } 61 | // Skip whitespace 62 | if (type === NodeType.TEXT) { 63 | if (options?.ignoreWhitespace !== false && value.trim().length === 0) { 64 | continue; 65 | } 66 | } 67 | // Handle other ignored types 68 | if (type in ignoreTypes && options?.[ignoreTypes[type]!] 
=== false) { 69 | const newNode = new Node(type, node, value); 70 | node.addChild(newNode); 71 | yield newNode; 72 | continue; 73 | } 74 | // Handle elements 75 | if (type === NodeType.ELEMENT) { 76 | const name = value.match(/<\/?([\w:.-]+)/)![1]; 77 | // Handle self-closing element 78 | if (value.endsWith('/>')) { 79 | const newNode = new Node(name, node, value); 80 | node.addChild(newNode); 81 | yield newNode; 82 | continue; 83 | } 84 | // Handle closing element 85 | if (value.startsWith('', 13 | start: /^', 17 | start: /^