├── README.md
├── do9000.drawio.png
├── do9000.ts
├── wrangler.toml
└── xymake.json

/README.md:
--------------------------------------------------------------------------------
Problem: carrying out many requests that retrieve large files (over 100MB) isn't easy.

When using SQLite as storage, we are limited to 10GB total per DO, and we'd pay quite a lot. It isn't always ideal for the destination to be a DO.

This worker shows two proofs of concept:

1. You can do as many subrequests in a Durable Object as you want.
2. You can stream the results out to a single file in R2 as long as you pass it a predetermined length. To end up with that fixed length even when you don't know the actual total length, you can either stop early or pad the end of the file.

This worker uses no dependencies and leverages the Workers-specific `FixedLengthStream` API.

![](do9000.drawio.png)

Thread: https://x.com/janwilmake/status/1921554676081733782

# Notes

## Idea: R2 zipper DO

Problem: I can't possibly get all my R2 objects into a single zip file and store that back in R2...

- I can only query 1000 of them at a time.
- In a queue I can query 1000 per message, but even then, how do I put them into a single zip?

Would it be possible to:

- List all items and queue a message for each batch of 1000 items
- Consume the queue, fetching the 1000 items per message and streaming them as a multipart/form-data stream to a DO
- Have the DO receive a multipart/form-data stream from each queue message and turn it into a single zip by streaming to R2 while collecting the metadata
- Send a final request to the DO that writes the central directory and closes the zip

https://claude.ai/chat/dafa225f-7098-4dd0-9cbc-ac5e9e034e6e

Generally:

- Input can be KV, R2, SQLite, or any public URLs.
- Zipping isn't even required; a giant form-data stream is also fine.

https://developers.cloudflare.com/r2/api/workers/workers-multipart-usage/index.md
https://developers.cloudflare.com/r2/objects/multipart-objects/index.md

Related:

https://aws.amazon.com/about-aws/whats-new/2024/11/amazon-s3-express-one-zone-append-data-object/
https://simonwillison.net/2024/Nov/22/amazon-s3-append-data/

Neither the multipart uploader nor `r2.put` accepts a ReadableStream of unknown length: `✘ [ERROR] Uncaught (in response) TypeError: Provided readable stream must have a known length (request/response body or readable half of FixedLengthStream)`

https://developers.cloudflare.com/workers/runtime-apis/streams/transformstream/#fixedlengthstream

WOW! `FixedLengthStream` works. That's a neat little trick if you know the length: you can round up by adding dashes. This is the only way to stream files into R2!!! Predetermined content-length.
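Here's the trick as a minimal sketch (a hypothetical `streamToR2` helper; the bucket argument and chunk source are assumptions, see `do9000.ts` for the actual proof of concept):

```ts
// Minimal sketch: stream an unknown number of chunks into R2
// at a fixed, predeclared byte length (hypothetical helper).
async function streamToR2(
  bucket: R2Bucket,
  key: string,
  totalLength: number,
  chunks: AsyncIterable<string>,
) {
  const encoder = new TextEncoder();
  // FixedLengthStream promises R2 exactly totalLength bytes up front
  const { readable, writable } = new FixedLengthStream(totalLength);
  const writer = writable.getWriter();
  // Start the upload immediately; bytes flow through as we write them
  const uploadPromise = bucket.put(key, readable, {
    httpMetadata: { contentType: "application/jsonl" },
  });
  let written = 0;
  for await (const chunk of chunks) {
    const bytes = encoder.encode(chunk);
    if (written + bytes.length > totalLength) break; // stop early instead of overshooting
    await writer.write(bytes);
    written += bytes.length;
  }
  // Pad up to the promised length; closing short would make the stream error
  if (written < totalLength) {
    await writer.write(encoder.encode("-".repeat(totalLength - written - 1) + "\n"));
  }
  await writer.close();
  await uploadPromise;
}
```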
--------------------------------------------------------------------------------
/do9000.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/janwilmake/do-ingest-api-aggregate-r2/e64ac8bc5efb4b238b84376ff02385829e50d9ce/do9000.drawio.png
--------------------------------------------------------------------------------
/do9000.ts:
--------------------------------------------------------------------------------
//@ts-check
/// <reference types="@cloudflare/workers-types" />
// src/index.ts
// This version uses FixedLengthStream to ensure a known content length for R2 uploads

interface Env {
  COUNTER: DurableObjectNamespace;
  R2_BUCKET: R2Bucket;
}

interface HackerNewsItem {
  id: number;
  title?: string;
  type?: string;
  by?: string;
  time?: number;
  text?: string;
  url?: string;
  score?: number;
  [key: string]: any; // For other possible properties
}

interface CollectionStats {
  bytesCollected: number;
  targetLength: number;
  fileName: string;
  startTime: string | null;
  lastItemTime: string | null;
  runningFor: string | null;
  itemCount: number;
  status: string;
}

interface ItemResponse {
  id: number;
  data: HackerNewsItem;
  collected: string;
  byteSize: number;
}

export class SubrequestCounter {
  private state: DurableObjectState;
  private env: Env;
  private storage: DurableObjectStorage;
  private collectionActive: boolean = false;
  private bytesCollected: number = 0;
  private targetLength: number = 0;
  private itemCount: number = 0;
  private startTime: number | null = null;
  private lastItemTime: number | null = null;
  private fileName: string = "";
  private latestItems: ItemResponse[] = [];
  private maxLatestItems: number = 100;
  private status: string = "idle";

  constructor(state: DurableObjectState, env: Env) {
    this.state = state;
    this.env = env;
    this.storage = state.storage;

    // Initialize state from storage before serving any requests
    this.state.blockConcurrencyWhile(async () => {
      this.bytesCollected = (await this.storage.get<number>("bytesCollected")) || 0;
      this.targetLength = (await this.storage.get<number>("targetLength")) || 0;
      this.itemCount = (await this.storage.get<number>("itemCount")) || 0;
      this.startTime = (await this.storage.get<number>("startTime")) || null;
      this.lastItemTime = (await this.storage.get<number>("lastItemTime")) || null;
      this.fileName = (await this.storage.get<string>("fileName")) || "";
      this.latestItems = (await this.storage.get<ItemResponse[]>("latestItems")) || [];
      this.status = (await this.storage.get<string>("status")) || "idle";
    });
  }

  async fetch(request: Request): Promise<Response> {
    const url = new URL(request.url);
    const path = url.pathname.slice(1);

    // Start collection process
    if (path === "start") {
      if (this.collectionActive) {
        return new Response(
          JSON.stringify({
            message: "Collection is already in progress",
          }),
          {
            headers: { "Content-Type": "application/json" },
          },
        );
      }

      const length = parseInt(url.searchParams.get("length") || "10000", 10);
      if (isNaN(length) || length <= 0 || length > 100000000) {
        return new Response(
          JSON.stringify({
            error: "Invalid length parameter. Must be a positive number up to 100MB.",
          }),
          {
            status: 400,
            headers: { "Content-Type": "application/json" },
          },
        );
      }

      // Generate a unique filename for this collection
      this.fileName = `hn-items-${Date.now()}.jsonl`;
      this.targetLength = length;
      this.startTime = Date.now();
      this.bytesCollected = 0;
      this.itemCount = 0;
      this.status = "collecting";

      // Save initial state
      await this.storage.put("fileName", this.fileName);
      await this.storage.put("targetLength", this.targetLength);
      await this.storage.put("startTime", this.startTime);
      await this.storage.put("bytesCollected", this.bytesCollected);
      await this.storage.put("itemCount", this.itemCount);
      await this.storage.put("status", this.status);

      this.collectionActive = true;
      const encoder = new TextEncoder();

      // Create a FixedLengthStream with the exact content length
      const { readable, writable } = new FixedLengthStream(length);

      // Create a writer for the stream
      const writer = writable.getWriter();

      // Start the upload to R2 using the readable end of the stream
      const uploadPromise = this.env.R2_BUCKET.put(this.fileName, readable, {
        httpMetadata: { contentType: "application/jsonl" },
      });

      try {
        // Track how many bytes have been written so far
        let currentSize = 0;

        // Continue fetching until we reach the target length
        while (currentSize < length && this.collectionActive) {
          // Generate a random HN item ID
          const id = 1 + Math.floor(Math.random() * 35000000);

          // Fetch the item
          const item = await this.fetchItem(id);

          if (item) {
            const timestamp = Date.now();
            const enrichedItem = {
              ...item,
              _collected: timestamp,
            };

            // Convert to JSONL format
            const jsonLine = JSON.stringify(enrichedItem) + "\n";
            const itemSize = encoder.encode(jsonLine).length;

            // Check if adding this item would exceed the target length
            if (currentSize + itemSize > length) {
              // Fill the remaining space with dashes so we hit the exact length
              const remainingBytes = length - currentSize;
              if (remainingBytes > 0) {
                const padding = "-".repeat(remainingBytes - 1) + "\n";
                await writer.write(encoder.encode(padding));
                currentSize += encoder.encode(padding).length;
              }
              break;
            }

            // Write the item to the stream
            await writer.write(encoder.encode(jsonLine));
            currentSize += itemSize;

            // Update stats
            this.itemCount++;
            this.bytesCollected = currentSize;
            this.lastItemTime = timestamp;

            // Add to latest items cache
            const itemResponse: ItemResponse = {
              id: item.id,
              data: item,
              collected: new Date(timestamp).toISOString(),
              byteSize: itemSize,
            };

            this.latestItems.unshift(itemResponse);

            // Trim latest items if needed
            if (this.latestItems.length > this.maxLatestItems) {
              this.latestItems = this.latestItems.slice(0, this.maxLatestItems);
            }

            // Update storage with latest stats
            await this.storage.put("bytesCollected", this.bytesCollected);
            await this.storage.put("itemCount", this.itemCount);
            await this.storage.put("lastItemTime", this.lastItemTime);
            await this.storage.put("latestItems", this.latestItems);
          }
        }

        try {
          // Close the writer and wait for the upload to complete;
          // FixedLengthStream errors if the bytes written don't match the declared length
          await writer.close();
          await uploadPromise;

          console.log(
            `Successfully uploaded ${length} bytes to ${this.fileName}`,
          );
        } catch (error) {
          console.error("Error uploading to R2:", error);
          throw error;
        }

        this.status = "completed";
        await this.storage.put("status", this.status);
      } catch (error) {
        console.error("Error during collection:", error);
        this.status = "error";
        await this.storage.put("status", this.status);
      } finally {
        this.collectionActive = false;
      }

      // Note: the loop above runs to completion before this response is
      // returned, so by the time the client sees it the collection is done
      return new Response(
        JSON.stringify({
          message: `Started collecting data up to ${length} bytes, storing to R2 as ${this.fileName}`,
        }),
        {
          headers: { "Content-Type": "application/json" },
        },
      );
    }

    // Get status and stats
    if (path === "status") {
      const stats = await this.getStats();
      return new Response(JSON.stringify(stats), {
        headers: { "Content-Type": "application/json" },
      });
    }

    // Get the latest items
    if (path === "items") {
      const limit = parseInt(url.searchParams.get("limit") || "10", 10);
      const items = await this.getLatestItems(limit);
      return new Response(JSON.stringify(items), {
        headers: { "Content-Type": "application/json" },
      });
    }

    // Reset the collection
    if (path === "reset") {
      await this.resetCollection();
      return new Response(
        JSON.stringify({
          message: "Collection reset successfully",
        }),
        {
          headers: { "Content-Type": "application/json" },
        },
      );
    }

    return new Response(
      JSON.stringify({
        error: "Unknown action",
        availableActions: [
          "/start?length=N - Start collecting data up to N bytes from Hacker News API",
          "/status - Get current collection status and stats",
          "/items?limit=N - Get the latest N items collected",
          "/reset - Reset the collection",
        ],
      }),
      {
        status: 400,
        headers: { "Content-Type": "application/json" },
      },
    );
  }

  async fetchItem(id: number): Promise<HackerNewsItem | null> {
    try {
      const response = await fetch(
        `https://hacker-news.firebaseio.com/v0/item/${id}.json`,
      );
      const data = (await response.json()) as HackerNewsItem;

      // Only return items that exist
      if (data && data.id) {
        return data;
      }
      return null;
    } catch (error) {
      console.error(`Error fetching item ${id}:`, error);
      return null;
    }
  }

  async getStats(): Promise<CollectionStats> {
    return {
      bytesCollected: this.bytesCollected,
      targetLength: this.targetLength,
      fileName: this.fileName,
      startTime: this.startTime ? new Date(this.startTime).toISOString() : null,
      lastItemTime: this.lastItemTime
        ? new Date(this.lastItemTime).toISOString()
        : null,
      runningFor: this.startTime
        ? Math.floor((Date.now() - this.startTime) / 1000) + " seconds"
        : null,
      itemCount: this.itemCount,
      status: this.status,
    };
  }

  async getLatestItems(limit: number): Promise<ItemResponse[]> {
    return this.latestItems.slice(0, limit);
  }

  async resetCollection(): Promise<void> {
    // Stop any active collection
    this.collectionActive = false;

    // Reset stats
    this.bytesCollected = 0;
    this.targetLength = 0;
    this.itemCount = 0;
    this.startTime = null;
    this.lastItemTime = null;
    this.fileName = "";
    this.latestItems = [];
    this.status = "idle";

    // Update storage
    await this.storage.put("bytesCollected", this.bytesCollected);
    await this.storage.put("targetLength", this.targetLength);
    await this.storage.put("itemCount", this.itemCount);
    await this.storage.put("startTime", this.startTime);
    await this.storage.put("lastItemTime", this.lastItemTime);
    await this.storage.put("fileName", this.fileName);
    await this.storage.put("latestItems", this.latestItems);
    await this.storage.put("status", this.status);
  }
}

// Worker to route requests to the Durable Object
export default {
  async fetch(
    request: Request,
    env: Env,
    ctx: ExecutionContext,
  ): Promise<Response> {
    // Create a unique ID for our Durable Object
    const doId = env.COUNTER.idFromName("hn-data-collector");
    const durableObject = env.COUNTER.get(doId);

    // Forward the request to the Durable Object
    return durableObject.fetch(request);
  },
};
--------------------------------------------------------------------------------
/wrangler.toml:
--------------------------------------------------------------------------------
name = "do-alarm-test"
main = "do9000.ts"
compatibility_date = "2023-09-01"

[[r2_buckets]]
bucket_name = "do9000"
binding = "R2_BUCKET"

[durable_objects]
bindings = [
  { name = "COUNTER", class_name = "SubrequestCounter" }
]

[[migrations]]
tag = "v1"
new_sqlite_classes = ["SubrequestCounter"]
--------------------------------------------------------------------------------
/xymake.json:
--------------------------------------------------------------------------------
{
  "threads": [
    {
      "branch": "main",
      "createdAt": "2025-05-11T13:08:42.974Z",
      "slug": "thread1921553050650411207",
      "updatedAt": "2025-05-11T13:08:42.975Z",
      "url": "https://x.com/janwilmake/status/1921553050650411207"
    }
  ],
  "lastPostUrl": "https://x.com/janwilmake/status/1921553050650411207",
  "$schema": "https://cli.xymake.com/xymake.schema.json"
}
--------------------------------------------------------------------------------