├── .gitignore ├── LICENSE ├── README.md ├── package-lock.json ├── package.json ├── tsconfig.json └── unfurl.js /.gitignore: -------------------------------------------------------------------------------- 1 | unfurl.d.ts 2 | node_modules 3 | .* 4 | !.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2024, tldraw GB Ltd. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cloudflare-workers-unfurl 2 | 3 | A tiny utility to get basic information about a URL on Cloudflare Workers using the built-in HTMLRewriter API. 4 | 5 | ## Usage 6 | 7 | ``` 8 | npm install cloudflare-workers-unfurl 9 | ``` 10 | 11 | There are two exports: `unfurl` and `handleUnfurlRequest`. 
 12 | 13 | `unfurl` is the main function that does the work. It takes a URL and returns a promise that resolves to a result object: `{ ok: true, value }` on success, or `{ ok: false, error }` on failure, where `error` is `'bad-param'` (the URL is not an http(s) string) or `'failed-fetch'` (the page could not be fetched or parsed). On success, `value` is an object with the following optional properties: 14 | 15 | - `title`: The title of the page 16 | - `description`: The description of the page 17 | - `image`: The URL of the social image 18 | - `favicon`: The URL of the favicon 19 | 20 | ```js 21 | import { unfurl } from "cloudflare-workers-unfurl"; 22 | 23 | const result = await unfurl("https://example.com"); 24 | 25 | if (result.ok) result.value.title; // 'Example Domain' 26 | ``` 27 | 28 | `handleUnfurlRequest` is a function that takes a `Request` object and returns a promise that resolves to a `Response` object. It uses `unfurl` to get the information about the URL in the `url` query parameter of the request and returns it as JSON. It responds with status 400 when the `url` parameter is missing or invalid, and with status 422 when the URL could not be fetched. 29 | 30 | ```js 31 | import { AutoRouter } from "itty-router"; 32 | import { handleUnfurlRequest } from "cloudflare-workers-unfurl"; 33 | 34 | const router = AutoRouter(); 35 | 36 | router.get("/unfurl", handleUnfurlRequest); 37 | 38 | export default router; 39 | ``` 40 | 41 | ## License 42 | 43 | MIT 44 | -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cloudflare-workers-unfurl", 3 | "version": "0.0.1", 4 | "lockfileVersion": 3, 5 | "requires": true, 6 | "packages": { 7 | "": { 8 | "name": "cloudflare-workers-unfurl", 9 | "version": "0.0.1", 10 | "license": "ISC", 11 | "dependencies": { 12 | "@cloudflare/workers-types": "^4.20240718.0" 13 | }, 14 | "devDependencies": { 15 | "typescript": "^5.5.3" 16 | } 17 | }, 18 | "node_modules/@cloudflare/workers-types": { 19 | "version": "4.20240718.0", 20 | "resolved": "https://registry.npmjs.org/@cloudflare/workers-types/-/workers-types-4.20240718.0.tgz", 21 | "integrity": "sha512-7RqxXIM9HyhjfZ9ztXjITuc7mL0w4s+zXgypqKmMuvuObC3DgXutJ3bOYbQ+Ss5QbywrzWSNMlmGdL/ldg/yZg==" 22 | }, 23 | "node_modules/typescript": { 
24 | "version": "5.5.3", 25 | "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.3.tgz", 26 | "integrity": "sha512-/hreyEujaB0w76zKo6717l3L0o/qEUtRgdvUBvlkhoWeOVMjMuHNHk0BRBzikzuGDqNmPQbg5ifMEqsHLiIUcQ==", 27 | "dev": true, 28 | "bin": { 29 | "tsc": "bin/tsc", 30 | "tsserver": "bin/tsserver" 31 | }, 32 | "engines": { 33 | "node": ">=14.17" 34 | } 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cloudflare-workers-unfurl", 3 | "version": "0.0.7", 4 | "description": "Unfurl urls in cloudflare workers using HTMLRewriter", 5 | "repository": { 6 | "type": "git", 7 | "url": "https://github.com/tldraw/cloudflare-workers-unfurl" 8 | }, 9 | "bugs": { 10 | "url": "https://github.com/tldraw/cloudflare-workers-unfurl/issues" 11 | }, 12 | "keywords": [ 13 | "cloudflare", 14 | "workers", 15 | "unfurl" 16 | ], 17 | "main": "unfurl.js", 18 | "type": "module", 19 | "scripts": { 20 | "test": "echo \"Error: no test specified\" && exit 1", 21 | "prepack": "tsc" 22 | }, 23 | "author": "David Sheldrick", 24 | "license": "MIT", 25 | "types": "unfurl.d.ts", 26 | "files": [ 27 | "unfurl.js", 28 | "unfurl.d.ts" 29 | ], 30 | "devDependencies": { 31 | "@cloudflare/workers-types": "^4.20240718.0", 32 | "typescript": "^5.5.3" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "include": ["unfurl.js"], 3 | "compilerOptions": { 4 | "emitDeclarationOnly": true, 5 | "declaration": true, 6 | "lib": ["esnext"], 7 | "types": ["@cloudflare/workers-types"], 8 | "allowJs": true, 9 | "checkJs": true, 10 | 11 | "esModuleInterop": false, 12 | "forceConsistentCasingInFileNames": true, 13 | 14 | "strict": true, 15 | "noImplicitAny": true, 16 | "noImplicitThis": true, 17 
// | "noUnusedLocals": true, 18 | "noUnusedParameters": true, 19 | "noImplicitReturns": true, 20 | "noFallthroughCasesInSwitch": true, 21 | 22 | "skipLibCheck": true 23 | } 24 | } 25 |
// --------------------------------------------------------------------------------
// /unfurl.js:
// --------------------------------------------------------------------------------

/**
 * @template Value
 * @typedef {Object} GoodResult
 * @property {true} ok - The success status.
 * @property {Value} value - The data extracted from the URL.
 */

/**
 * @template Error
 * @typedef {Object} BadResult
 * @property {false} ok - The success status.
 * @property {Error} error - The error
 */

/**
 * @template Value, Error
 * @typedef {GoodResult<Value> | BadResult<Error>} Result
 */

/**
 * @typedef {Object} UnfurledData
 * @property {string} [title] - The title extracted from the URL.
 * @property {string} [description] - The description extracted from the URL.
 * @property {string} [image] - The image URL extracted from the URL.
 * @property {string} [favicon] - The favicon URL extracted from the URL.
 */

/**
 * @typedef {'bad-param' | 'failed-fetch'} UnfurlError
 */

// Sent verbatim as `accept` headers; also used as prefixes when validating the response type.
const validContentTypes = [
  "text/html",
  "application/xhtml+xml",
  "application/xml",
  "image/*",
];

/**
 * Decides whether a response with the given content type is worth unfurling.
 *
 * Media types are compared case-insensitively and parameters such as
 * `; charset=utf-8` are tolerated because only the prefix is compared.
 *
 * @param {string} contentType - The raw `content-type` header value ("" when absent).
 * @returns {boolean} true for an unspecified type, any `image/*` type, or an (X)HTML/XML type.
 */
function isValidContentType(contentType) {
  const normalized = (contentType ?? "").trim().toLowerCase();
  return (
    // allow unspecified, try to parse it anyway
    !normalized ||
    normalized.startsWith("image/") ||
    validContentTypes.some((valid) => normalized.startsWith(valid))
  );
}

/**
 * Returns the first candidate that still has content after trimming.
 *
 * @param {...(string | null | undefined)} candidates - Values in priority order.
 * @returns {string | undefined} The trimmed winner, or undefined when every candidate is blank.
 */
function firstNonEmpty(...candidates) {
  for (const candidate of candidates) {
    const trimmed = candidate?.trim();
    if (trimmed) {
      return trimmed;
    }
  }
  return undefined;
}

/**
 * Turns a URL reference found in a page into an absolute URL without ever throwing.
 *
 * @param {string | undefined} value - The (possibly relative) URL taken from the page.
 * @param {string} base - The URL the reference is relative to.
 * @returns {string | undefined} The absolute URL, or undefined when `value` is missing or unparseable.
 */
function toAbsoluteUrl(value, base) {
  if (!value) {
    return undefined;
  }
  // absolute http(s) URLs are passed through untouched; a bare "http" prefix is not
  // enough, since "httpd/logo.png" is a relative path
  if (/^https?:\/\//i.test(value)) {
    return value;
  }
  try {
    return new URL(value, base).href;
  } catch {
    // the page supplied something that is not a URL; treat it as absent
    return undefined;
  }
}

/**
 * Handles the unfurling of a URL by extracting metadata such as title, description, image, and favicon.
 *
 * Failures are reported through the returned result rather than by throwing:
 * `bad-param` when `url` is not an http(s) string, `failed-fetch` when the request fails,
 * the response is not ok, the content type is unsupported, or parsing throws.
 *
 * @param {string} url - The URL to unfurl.
 * @returns {Promise<Result<UnfurledData, UnfurlError>>} - A promise that resolves to a result wrapping the extracted metadata or an error code.
 */
export async function unfurl(url) {
  if (typeof url !== "string" || !url.match(/^https?:\/\//)) {
    return { ok: false, error: "bad-param" };
  }

  // cloudflare has a built-in HTML parser/rewriter called HTMLRewriter. in order to use it, we
  // need to define classes that act as event handlers for certain elements, attributes, etc.
  // see https://developers.cloudflare.com/workers/runtime-apis/html-rewriter/
  const meta$ = new MetaExtractor();
  const title$ = new TextExtractor();
  const icon$ = new IconExtractor();

  // relative URLs in the page are relative to the document that was actually served
  let baseUrl = url;

  try {
    const headers = new Headers();
    for (const accepted of validContentTypes) {
      headers.append("accept", accepted);
    }
    const res = await fetch(url, { headers });
    const contentType = (res.headers.get("content-type") ?? "")
      .trim()
      .toLowerCase();
    if (!res.ok || !isValidContentType(contentType)) {
      return { ok: false, error: "failed-fetch" };
    }
    if (contentType.startsWith("image/")) {
      // the URL points straight at an image: use it as the image and its file name as the title
      return {
        ok: true,
        value: {
          image: url,
          title: new URL(url).pathname.split("/").pop() || undefined,
        },
      };
    }
    // `res.url` is the final URL after redirects; it is empty for synthetic responses
    baseUrl = res.url || url;
    // the handlers only fire while the transformed body is being consumed
    await new HTMLRewriter()
      .on("meta", meta$)
      .on("title", title$)
      .on("link", icon$)
      .transform(res)
      .blob();
  } catch {
    return { ok: false, error: "failed-fetch" };
  }

  // we don't know exactly what we'll end up with, so this is a best-effort extraction
  const { og, twitter } = meta$;
  const title = firstNonEmpty(
    og["og:title"],
    twitter["twitter:title"],
    title$.string,
  );
  const description = firstNonEmpty(
    og["og:description"],
    twitter["twitter:description"],
    meta$.description,
  );
  const image = toAbsoluteUrl(
    firstNonEmpty(
      og["og:image:secure_url"],
      og["og:image"],
      twitter["twitter:image"],
    ),
    baseUrl,
  );
  const favicon = toAbsoluteUrl(
    firstNonEmpty(icon$.appleIcon, icon$.icon),
    baseUrl,
  );

  return {
    ok: true,
    value: {
      title,
      description,
      image,
      favicon,
    },
  };
}

/**
 * Implements a handler for a GET request where the uri is passed in as a search param called `url`.
 *
 * e.g. GET /foo/bar?url=https://example.com
 *
 * Responds with the unfurled data as JSON on success, 400 when the `url` parameter is
 * missing or rejected by `unfurl`, and 422 when the target could not be fetched.
 *
 * @param {Request} request
 * @returns {Promise<Response>}
 */
export async function handleUnfurlRequest(request) {
  const url = new URL(request.url).searchParams.get("url");

  if (!url) {
    return new Response("Missing URL query parameter.", { status: 400 });
  }

  const result = await unfurl(url);

  if (result.ok) {
    return new Response(JSON.stringify(result.value), {
      headers: { "Content-Type": "application/json" },
    });
  } else if (result.error === "bad-param") {
    return new Response("Bad URL query parameter.", { status: 400 });
  } else {
    return new Response("Failed to fetch URL.", { status: 422 });
  }
}

/**
 * Extracts the text of the first matched text node.
 *
 * The `title` selector also matches later `<title>` elements (for example inside inline
 * SVG); collecting only the first text node keeps those out of the page title.
 */
class TextExtractor {
  /**
   * The accumulated text extracted from elements.
   * @type {string}
   */
  string = "";

  /**
   * Set once the first text node has been received completely.
   * @type {boolean}
   */
  #done = false;

  /**
   * Handles an incoming piece of text.
   * @param {Object} param - The text chunk.
   * @param {string} param.text - The incoming text.
   * @param {boolean} [param.lastInTextNode] - Whether this chunk ends the current text node.
   */
  text({ text, lastInTextNode }) {
    if (this.#done) {
      return;
    }
    this.string += text;
    if (lastInTextNode) {
      this.#done = true;
    }
  }
}

/**
 * Extracts metadata from HTML `<meta>` elements.
 */
class MetaExtractor {
  /**
   * The Open Graph (og) metadata extracted from elements, keyed by the full `og:*` name.
   * @type {Object.<string, string | undefined>}
   */
  og = {};

  /**
   * The Twitter metadata extracted from elements, keyed by the full `twitter:*` name.
   * @type {Object.<string, string | undefined>}
   */
  twitter = {};

  /**
   * The description extracted from elements.
   * @type {string|null}
   */
  description = null;

  /**
   * Handles an incoming element.
   *
   * The first occurrence of a key wins, and elements without a `content` attribute are
   * ignored so they cannot blank out an earlier value. Both `property` and `name` are
   * consulted because pages use either attribute for `og:` and `twitter:` tags.
   *
   * @param {Element} element - The incoming element.
   */
  element(element) {
    const content = element.getAttribute("content");
    if (content === null) {
      return;
    }
    const property = element.getAttribute("property");
    const name = element.getAttribute("name");

    const ogKey = [property, name].find((key) => key?.startsWith("og:"));
    const twitterKey = [name, property].find((key) =>
      key?.startsWith("twitter:"),
    );

    if (ogKey) {
      this.og[ogKey] ??= content;
    } else if (twitterKey) {
      this.twitter[twitterKey] ??= content;
    } else if (name === "description") {
      this.description ??= content;
    }
  }
}

/**
 * Extracts favicon URLs from HTML `<link>` elements.
 */
class IconExtractor {
  /**
   * The Apple touch icon URL extracted from elements.
   * @type {string|null}
   */
  appleIcon = null;

  /**
   * The favicon URL extracted from elements.
   * @type {string|null}
   */
  icon = null;

  /**
   * Handles an incoming element.
   *
   * `rel` is treated as a case-insensitive, whitespace-separated token list, so
   * `rel="shortcut icon"` counts as an icon. Links without an `href` are skipped so they
   * cannot overwrite an icon that was already found.
   *
   * @param {Element} element - The incoming element.
   */
  element(element) {
    const href = element.getAttribute("href");
    const rel = element.getAttribute("rel");
    if (!href || !rel) {
      return;
    }
    const tokens = rel.toLowerCase().split(/\s+/);
    if (
      tokens.includes("apple-touch-icon") ||
      tokens.includes("apple-touch-icon-precomposed")
    ) {
      this.appleIcon = href;
    } else if (tokens.includes("icon")) {
      this.icon = href;
    }
  }
}

// --------------------------------------------------------------------------------