├── README.md
├── do9000.drawio.png
├── do9000.ts
├── wrangler.toml
└── xymake.json
/README.md:
--------------------------------------------------------------------------------
1 | Problem: carrying out many requests and aggregating the results into large files (over 100MB) isn't easy.
2 |
3 | When using SQLite as storage, we are limited to 10GB total per DO, and it would cost quite a lot. A DO isn't always the ideal destination anyway.
4 |
5 | This worker shows two proofs of concept:
6 |
7 | 1. You can make as many subrequests from a Durable Object as you want.
8 | 2. You can stream the results out to a single file in R2 as long as you pass a predetermined length. You can commit to a fixed length even without knowing the actual total up front: either stop writing early, or pad the end of the file until it reaches the target.
9 |
10 | This worker uses no dependencies and leverages `FixedLengthStream`, a Workers-specific extension of the web-standard `TransformStream`.
11 |
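A minimal sketch of the trick (the bucket key, `TARGET` size, and `produceChunks` helper are hypothetical stand-ins for your own subrequest loop):

```ts
async function collectToR2(env: { R2_BUCKET: R2Bucket }) {
  const TARGET = 1_000_000; // the content length we promise R2 up front

  const { readable, writable } = new FixedLengthStream(TARGET);
  // Start the R2 upload first; it consumes the readable half while we write
  const upload = env.R2_BUCKET.put("aggregate.jsonl", readable);

  const writer = writable.getWriter();
  const enc = new TextEncoder();
  let written = 0;

  // produceChunks() is a hypothetical async iterable of subrequest results
  for await (const chunk of produceChunks()) {
    const bytes = enc.encode(chunk);
    if (written + bytes.length > TARGET) break; // stop early rather than overshoot
    await writer.write(bytes);
    written += bytes.length;
  }

  // Pad with dashes so exactly TARGET bytes go out, then close
  if (written < TARGET) {
    await writer.write(enc.encode("-".repeat(TARGET - written - 1) + "\n"));
  }
  await writer.close();
  await upload;
}
```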
12 | ![](do9000.drawio.png)
13 |
14 | Thread: https://x.com/janwilmake/status/1921554676081733782
15 |
16 | # Notes
17 |
18 | ## Idea: R2 zipper DO
19 |
20 | Problem: I can't easily get all my R2 items into a single zip file and store that back in R2...
21 |
22 | - I can only query 1000 of them at a time.
23 | - With a queue I can fetch 1000 per message, but even then, how do I put them into a single zip?
24 |
25 | Would it be possible to:
26 |
27 | - List all items and queue a message for every 1000 items
28 | - Consume the queue: fetch the 1000 items and stream them as a multipart/form-data request to a DO
29 | - The DO receives a multipart/form-data stream for each queue message and turns it into a single zip by streaming to R2 while collecting the metadata.
30 | - A final request to the DO would write the zip's central directory and close the file.
31 |
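A sketch of the producer half of that pipeline, assuming a hypothetical Queues binding `ZIP_QUEUE` (1000 keys in one message may hit the queue's message size limit, so real code would likely send smaller batches):

```ts
async function enqueueAllKeys(env: { R2_BUCKET: R2Bucket; ZIP_QUEUE: Queue }) {
  let cursor: string | undefined;
  do {
    // list() returns at most 1000 objects per call
    const page = await env.R2_BUCKET.list({ limit: 1000, cursor });
    await env.ZIP_QUEUE.send({ keys: page.objects.map((o) => o.key) });
    cursor = page.truncated ? page.cursor : undefined;
  } while (cursor);
}
```
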
32 | https://claude.ai/chat/dafa225f-7098-4dd0-9cbc-ac5e9e034e6e
33 |
34 | Generally:
35 |
36 | - Input can be KV, R2, SQLite, or any public URLs.
37 | - Zipping isn't even required; a giant multipart/form-data stream is also fine (see the sketch below).
38 |
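If you go the form-data route, the framing is simple enough to write by hand. A sketch (part names and content types are illustrative):

```ts
const enc = new TextEncoder();
const boundary = "----do9000-" + crypto.randomUUID();

async function writePart(
  writer: WritableStreamDefaultWriter<Uint8Array>,
  name: string,
  body: Uint8Array,
) {
  await writer.write(
    enc.encode(
      `--${boundary}\r\n` +
        `Content-Disposition: form-data; name="${name}"; filename="${name}"\r\n` +
        `Content-Type: application/octet-stream\r\n\r\n`,
    ),
  );
  await writer.write(body);
  await writer.write(enc.encode("\r\n"));
}
// after the final part, terminate the body with: --${boundary}--\r\n
```

Because boundaries and part headers have predictable sizes, the total content length can still be computed up front from the object sizes that `list` returns, which keeps this compatible with `FixedLengthStream`.
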
39 | https://developers.cloudflare.com/r2/api/workers/workers-multipart-usage/index.md
40 | https://developers.cloudflare.com/r2/objects/multipart-objects/index.md
41 |
42 | Related:
43 |
44 | https://aws.amazon.com/about-aws/whats-new/2024/11/amazon-s3-express-one-zone-append-data-object/
45 | https://simonwillison.net/2024/Nov/22/amazon-s3-append-data/
46 |
47 | Neither the multipart uploader nor `r2.put` accepts a `ReadableStream` of unknown length; both fail with: `✘ [ERROR] Uncaught (in response) TypeError: Provided readable stream must have a known length (request/response body or readable half of FixedLengthStream)`
48 |
49 | https://developers.cloudflare.com/workers/runtime-apis/streams/transformstream/#fixedlengthstream
50 |
51 | WOW! `FixedLengthStream` works. That's a neat little trick if you know the length, and you can round up by padding with dashes. This is the only way to stream files into R2: a predetermined content length.
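
For reference, the failing and the working shape side by side (key names hypothetical):

```ts
// Fails: the readable half of a plain TransformStream has no known length
const plain = new TransformStream();
await env.R2_BUCKET.put("out.bin", plain.readable); // TypeError at runtime

// Works: FixedLengthStream declares the byte count up front
const { readable, writable } = new FixedLengthStream(1024);
const upload = env.R2_BUCKET.put("out.bin", readable);
// ...write exactly 1024 bytes to `writable`, close the writer, then await `upload`
```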
52 |
--------------------------------------------------------------------------------
/do9000.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/janwilmake/do-ingest-api-aggregate-r2/e64ac8bc5efb4b238b84376ff02385829e50d9ce/do9000.drawio.png
--------------------------------------------------------------------------------
/do9000.ts:
--------------------------------------------------------------------------------
1 | //@ts-check
2 | //
3 | // do9000.ts
4 | // This version uses FixedLengthStream to ensure a known content length for R2 uploads
5 |
6 | interface Env {
7 | COUNTER: DurableObjectNamespace;
8 | R2_BUCKET: R2Bucket;
9 | }
10 |
11 | interface HackerNewsItem {
12 | id: number;
13 | title?: string;
14 | type?: string;
15 | by?: string;
16 | time?: number;
17 | text?: string;
18 | url?: string;
19 | score?: number;
20 | [key: string]: any; // For other possible properties
21 | }
22 |
23 | interface CollectionStats {
24 | bytesCollected: number;
25 | targetLength: number;
26 | fileName: string;
27 | startTime: string | null;
28 | lastItemTime: string | null;
29 | runningFor: string | null;
30 | itemCount: number;
31 | status: string;
32 | }
33 |
34 | interface ItemResponse {
35 | id: number;
36 | data: HackerNewsItem;
37 | collected: string;
38 | byteSize: number;
39 | }
40 |
41 | export class SubrequestCounter {
42 | private state: DurableObjectState;
43 | private env: Env;
44 | private storage: DurableObjectStorage;
45 | private collectionActive: boolean = false;
46 | private bytesCollected: number = 0;
47 | private targetLength: number = 0;
48 | private itemCount: number = 0;
49 | private startTime: number | null = null;
50 | private lastItemTime: number | null = null;
51 | private fileName: string = "";
52 | private latestItems: ItemResponse[] = [];
53 | private maxLatestItems: number = 100;
54 | private status: string = "idle";
55 |
56 | constructor(state: DurableObjectState, env: Env) {
57 | this.state = state;
58 | this.env = env;
59 | this.storage = state.storage;
60 |
61 | // Initialize state from storage
62 | this.state.blockConcurrencyWhile(async () => {
63 | this.bytesCollected = (await this.storage.get<number>("bytesCollected")) || 0;
64 | this.targetLength = (await this.storage.get<number>("targetLength")) || 0;
65 | this.itemCount = (await this.storage.get<number>("itemCount")) || 0;
66 | this.startTime = (await this.storage.get<number>("startTime")) || null;
67 | this.lastItemTime = (await this.storage.get<number>("lastItemTime")) || null;
68 | this.fileName = (await this.storage.get<string>("fileName")) || "";
69 | this.latestItems = (await this.storage.get<ItemResponse[]>("latestItems")) || [];
70 | this.status = (await this.storage.get<string>("status")) || "idle";
71 | });
72 | }
73 |
74 | async fetch(request: Request): Promise<Response> {
75 | const url = new URL(request.url);
76 | const path = url.pathname.slice(1);
77 |
78 | // Start collection process
79 | if (path === "start") {
80 | if (this.collectionActive) {
81 | return new Response(
82 | JSON.stringify({
83 | message: "Collection is already in progress",
84 | }),
85 | {
86 | headers: { "Content-Type": "application/json" },
87 | },
88 | );
89 | }
90 |
91 | const length = parseInt(url.searchParams.get("length") || "10000", 10);
92 | if (isNaN(length) || length <= 0 || length > 100000000) {
93 | return new Response(
94 | JSON.stringify({
95 | error:
96 | "Invalid length parameter. Must be a positive number up to 100MB.",
97 | }),
98 | {
99 | status: 400,
100 | headers: { "Content-Type": "application/json" },
101 | },
102 | );
103 | }
104 |
105 | // Generate a unique filename for this collection
106 | this.fileName = `hn-items-${Date.now()}.jsonl`;
107 | this.targetLength = length;
108 | this.startTime = Date.now();
109 | this.bytesCollected = 0;
110 | this.itemCount = 0;
111 | this.status = "collecting";
112 |
113 | // Save initial state
114 | await this.storage.put("fileName", this.fileName);
115 | await this.storage.put("targetLength", this.targetLength);
116 | await this.storage.put("startTime", this.startTime);
117 | await this.storage.put("bytesCollected", this.bytesCollected);
118 | await this.storage.put("itemCount", this.itemCount);
119 | await this.storage.put("status", this.status);
120 |
121 | this.collectionActive = true;
122 | const encoder = new TextEncoder();
123 |
124 | // Create a FixedLengthStream with the exact content length
125 | const { readable, writable } = new FixedLengthStream(length);
126 |
127 | // Create a writer for the stream
128 | const writer = writable.getWriter();
129 |
130 | // Start the upload to R2 using the readable end of the stream
131 | const uploadPromise = this.env.R2_BUCKET.put(this.fileName, readable, {
132 | httpMetadata: { contentType: "application/jsonl" },
133 | });
134 |
135 | try {
136 | // Prepare buffer for collected data
137 | let currentSize = 0;
138 |
139 | // Continue fetching until we reach target length
140 | while (currentSize < length && this.collectionActive) {
141 | // Generate a random HN story ID
142 | const id = 1 + Math.floor(Math.random() * 35000000);
143 |
144 | // Fetch the item
145 | const item = await this.fetchItem(id);
146 |
147 | if (item) {
148 | const timestamp = Date.now();
149 | const enrichedItem = {
150 | ...item,
151 | _collected: timestamp,
152 | };
153 |
154 | // Convert to JSONL format
155 | const jsonLine = JSON.stringify(enrichedItem) + "\n";
156 | const itemSize = encoder.encode(jsonLine).length;
157 |
158 | // Check if adding this item would exceed the target length
159 | if (currentSize + itemSize > length) {
160 | // Fill remaining space with dashes
161 | const remainingBytes = length - currentSize;
162 | if (remainingBytes > 0) {
163 | const padding = "-".repeat(remainingBytes - 1) + "\n";
164 | await writer.write(encoder.encode(padding));
165 |
166 | currentSize += encoder.encode(padding).length;
167 | }
168 | break;
169 | }
170 |
171 | // Add item to writer
172 | await writer.write(encoder.encode(jsonLine));
173 |
174 | currentSize += itemSize;
175 |
176 | // Update stats
177 | this.itemCount++;
178 | this.bytesCollected = currentSize;
179 | this.lastItemTime = timestamp;
180 |
181 | // Add to latest items cache
182 | const itemResponse: ItemResponse = {
183 | id: item.id,
184 | data: item,
185 | collected: new Date(timestamp).toISOString(),
186 | byteSize: itemSize,
187 | };
188 |
189 | this.latestItems.unshift(itemResponse);
190 |
191 | // Trim latest items if needed
192 | if (this.latestItems.length > this.maxLatestItems) {
193 | this.latestItems = this.latestItems.slice(0, this.maxLatestItems);
194 | }
195 |
196 | // Update storage with latest stats
197 | await this.storage.put("bytesCollected", this.bytesCollected);
198 | await this.storage.put("itemCount", this.itemCount);
199 | await this.storage.put("lastItemTime", this.lastItemTime);
200 | await this.storage.put("latestItems", this.latestItems);
201 | }
202 | }
203 |
204 | try {
205 | // Wait for the upload to complete
206 | await writer.close();
207 | await uploadPromise;
208 |
209 | console.log(
210 | `Successfully uploaded ${length} bytes to ${this.fileName}`,
211 | );
212 | } catch (error) {
213 | console.error("Error uploading to R2:", error);
214 | throw error;
215 | }
216 |
217 | this.status = "completed";
218 | await this.storage.put("status", this.status);
219 | } catch (error) {
220 | console.error("Error during collection:", error);
221 | await writer.abort(error).catch(() => {}); // unblock the pending R2 upload instead of leaving it hanging
222 | this.status = "error"; await this.storage.put("status", this.status);
223 | } finally {
224 | this.collectionActive = false;
225 | }
226 |
227 | return new Response(
228 | JSON.stringify({
229 | message: `Finished collecting up to ${length} bytes, stored in R2 as ${this.fileName} (status: ${this.status})`,
230 | }),
231 | {
232 | headers: { "Content-Type": "application/json" },
233 | },
234 | );
235 | }
236 |
237 | // Get status and stats
238 | if (path === "status") {
239 | const stats = await this.getStats();
240 | return new Response(JSON.stringify(stats), {
241 | headers: { "Content-Type": "application/json" },
242 | });
243 | }
244 |
245 | // Get the latest items
246 | if (path === "items") {
247 | const limit = parseInt(url.searchParams.get("limit") || "10", 10);
248 | const items = await this.getLatestItems(limit);
249 | return new Response(JSON.stringify(items), {
250 | headers: { "Content-Type": "application/json" },
251 | });
252 | }
253 |
254 | // Reset the collection
255 | if (path === "reset") {
256 | await this.resetCollection();
257 | return new Response(
258 | JSON.stringify({
259 | message: "Collection reset successfully",
260 | }),
261 | {
262 | headers: { "Content-Type": "application/json" },
263 | },
264 | );
265 | }
266 |
267 | return new Response(
268 | JSON.stringify({
269 | error: "Unknown action",
270 | availableActions: [
271 | "/start?length=N - Start collecting data up to N bytes from Hacker News API",
272 | "/status - Get current collection status and stats",
273 | "/items?limit=N - Get the latest N items collected",
274 | "/reset - Reset the collection",
275 | ],
276 | }),
277 | {
278 | status: 400,
279 | headers: { "Content-Type": "application/json" },
280 | },
281 | );
282 | }
283 |
284 | async fetchItem(id: number): Promise<HackerNewsItem | null> {
285 | try {
286 | const response = await fetch(
287 | `https://hacker-news.firebaseio.com/v0/item/${id}.json`,
288 | );
289 | const data = (await response.json()) as HackerNewsItem;
290 |
291 | // Only return items that exist
292 | if (data && data.id) {
293 | return data;
294 | }
295 | return null;
296 | } catch (error) {
297 | console.error(`Error fetching item ${id}:`, error);
298 | return null;
299 | }
300 | }
301 |
302 | async getStats(): Promise<CollectionStats> {
303 | return {
304 | bytesCollected: this.bytesCollected,
305 | targetLength: this.targetLength,
306 | fileName: this.fileName,
307 | startTime: this.startTime ? new Date(this.startTime).toISOString() : null,
308 | lastItemTime: this.lastItemTime
309 | ? new Date(this.lastItemTime).toISOString()
310 | : null,
311 | runningFor: this.startTime
312 | ? Math.floor((Date.now() - (this.startTime || 0)) / 1000) + " seconds"
313 | : null,
314 | itemCount: this.itemCount,
315 | status: this.status,
316 | };
317 | }
318 |
319 | async getLatestItems(limit: number): Promise<ItemResponse[]> {
320 | return this.latestItems.slice(0, limit);
321 | }
322 |
323 | async resetCollection(): Promise<void> {
324 | // Stop any active collection
325 | this.collectionActive = false;
326 |
327 | // Reset stats
328 | this.bytesCollected = 0;
329 | this.targetLength = 0;
330 | this.itemCount = 0;
331 | this.startTime = null;
332 | this.lastItemTime = null;
333 | this.fileName = "";
334 | this.latestItems = [];
335 | this.status = "idle";
336 |
337 | // Update storage
338 | await this.storage.put("bytesCollected", this.bytesCollected);
339 | await this.storage.put("targetLength", this.targetLength);
340 | await this.storage.put("itemCount", this.itemCount);
341 | await this.storage.put("startTime", this.startTime);
342 | await this.storage.put("lastItemTime", this.lastItemTime);
343 | await this.storage.put("fileName", this.fileName);
344 | await this.storage.put("latestItems", this.latestItems);
345 | await this.storage.put("status", this.status);
346 | }
347 | }
348 |
349 | // Worker to route requests to the Durable Object
350 | export default {
351 | async fetch(
352 | request: Request,
353 | env: Env,
354 | ctx: ExecutionContext,
355 | ): Promise<Response> {
356 |
357 |
358 | // Derive the stable ID of our single named Durable Object instance
359 | const doId = env.COUNTER.idFromName("hn-data-collector");
360 | const durableObject = env.COUNTER.get(doId);
361 |
362 | // Forward the request to the Durable Object
363 | return durableObject.fetch(request);
364 | },
365 | };
366 |
--------------------------------------------------------------------------------
/wrangler.toml:
--------------------------------------------------------------------------------
1 | name = "do-alarm-test"
2 | main = "do9000.ts"
3 | compatibility_date = "2023-09-01"
4 |
5 | [[r2_buckets]]
6 | bucket_name = "do9000"
7 | binding = "R2_BUCKET"
8 |
9 | [durable_objects]
10 | bindings = [
11 | { name = "COUNTER", class_name = "SubrequestCounter" }
12 | ]
13 |
14 | [[migrations]]
15 | tag = "v1"
16 | new_sqlite_classes = ["SubrequestCounter"]
17 |
--------------------------------------------------------------------------------
/xymake.json:
--------------------------------------------------------------------------------
1 | {
2 | "threads": [
3 | {
4 | "branch": "main",
5 | "createdAt": "2025-05-11T13:08:42.974Z",
6 | "slug": "thread1921553050650411207",
7 | "updatedAt": "2025-05-11T13:08:42.975Z",
8 | "url": "https://x.com/janwilmake/status/1921553050650411207"
9 | }
10 | ],
11 | "lastPostUrl": "https://x.com/janwilmake/status/1921553050650411207",
12 | "$schema": "https://cli.xymake.com/xymake.schema.json"
13 | }
--------------------------------------------------------------------------------