├── .mise.toml
├── .npmrc
├── src
│   ├── env.d.ts
│   ├── BackendContext.tsx
│   ├── knobs.ts
│   ├── pages
│   │   ├── sender.astro
│   │   ├── view.astro
│   │   └── index.astro
│   ├── components
│   │   ├── TranscriptViewerKnobs.ts
│   │   ├── LogViewer.tsx
│   │   ├── TranscriptViewer.css
│   │   ├── TranscriptViewer.tsx
│   │   └── AudioSender.tsx
│   ├── logbus.ts
│   └── layouts
│       └── Layout.astro
├── .vscode
│   ├── extensions.json
│   └── launch.json
├── tsconfig.json
├── astro.config.mjs
├── backend
│   ├── scripts
│   │   ├── TEST_getItems.ts
│   │   ├── createRoom.ts
│   │   ├── whisperTranslator.ts
│   │   ├── server.ts
│   │   ├── partialTranscriber.ts
│   │   └── batchTranscriber.ts
│   └── src
│       ├── room.ts
│       ├── publicBroadcast.ts
│       ├── db.ts
│       ├── client.ts
│       ├── pubsub.ts
│       ├── utterance.ts
│       ├── itemOperations.ts
│       └── persistence.ts
├── .gitignore
├── public
│   └── favicon.svg
├── package.json
└── README.md

/.mise.toml:
--------------------------------------------------------------------------------
1 | [tools]
2 | node = "22.10.0"
--------------------------------------------------------------------------------
/.npmrc:
--------------------------------------------------------------------------------
1 | @jsr:registry=https://npm.jsr.io
--------------------------------------------------------------------------------
/src/env.d.ts:
--------------------------------------------------------------------------------
1 | /// <reference types="astro/client" />
--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 |   "recommendations": ["astro-build.astro-vscode"],
3 |   "unwantedRecommendations": []
4 | }
5 | 
--------------------------------------------------------------------------------
/src/BackendContext.tsx:
--------------------------------------------------------------------------------
1 | export interface BackendContext {
2 |   backend: string;
3 |   room: string;
4 |   key?: string;
5 | }
6 | 
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |   "extends": "astro/tsconfigs/strict",
3 |   "compilerOptions": {
4 |     "jsx": "react-jsx",
5 |     "jsxImportSource": "react"
6 |   }
7 | }
--------------------------------------------------------------------------------
/astro.config.mjs:
--------------------------------------------------------------------------------
1 | import { defineConfig } from 'astro/config';
2 | 
3 | import react from "@astrojs/react";
4 | 
5 | // https://astro.build/config
6 | export default defineConfig({
7 |   integrations: [react()]
8 | });
--------------------------------------------------------------------------------
/backend/scripts/TEST_getItems.ts:
--------------------------------------------------------------------------------
1 | import { getItems } from "../src/itemOperations";
2 | import { Room } from "../src/room";
3 | 
4 | console.log(await getItems(new Room("019296a2-3c00-7b5c-8913-6cfad0b97093")));
5 | 
--------------------------------------------------------------------------------
/backend/src/room.ts:
--------------------------------------------------------------------------------
1 | export class Room {
2 |   constructor(public name: string) {}
3 |   get audioTopic() {
4 |     return `${this.name}/audio`;
5 |   }
6 |   get publicTopic() {
7 |     return `${this.name}/public`;
8 |   }
9 | }
10 | 
--------------------------------------------------------------------------------
/.vscode/launch.json:
-------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "command": "./node_modules/.bin/astro dev", 6 | "name": "Development server", 7 | "request": "launch", 8 | "type": "node-terminal" 9 | } 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /src/knobs.ts: -------------------------------------------------------------------------------- 1 | import { atom } from "nanostores"; 2 | 3 | export const $minimumPeak = atom(2.5); 4 | export const $activationThreshold = atom(0.25); 5 | export const $deactivationThreshold = atom(0.2); 6 | export const $maxLength = atom(10); 7 | export const $decayEasing = atom(1.25); 8 | -------------------------------------------------------------------------------- /src/pages/sender.astro: -------------------------------------------------------------------------------- 1 | --- 2 | import { AudioSender } from "../components/AudioSender"; 3 | import Layout from "../layouts/Layout.astro"; 4 | --- 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/pages/view.astro: -------------------------------------------------------------------------------- 1 | --- 2 | import { TranscriptViewer } from "../components/TranscriptViewer"; 3 | import Layout from "../layouts/Layout.astro"; 4 | --- 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/pages/index.astro: -------------------------------------------------------------------------------- 1 | --- 2 | import Layout from "../layouts/Layout.astro"; 3 | --- 4 | 5 | 6 | 7 | Welcome to Live Speech frontend... 8 | 9 | For more information, check out the{ 10 | " " 11 | }https://github.com/dtinth/live-speech 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # build output 2 | dist/ 3 | 4 | # generated types 5 | .astro/ 6 | 7 | # dependencies 8 | node_modules/ 9 | 10 | # logs 11 | npm-debug.log* 12 | yarn-debug.log* 13 | yarn-error.log* 14 | pnpm-debug.log* 15 | 16 | # environment variables 17 | .env 18 | .env.production 19 | 20 | # macOS-specific files 21 | .DS_Store 22 | 23 | # jetbrains setting folder 24 | .idea/ 25 | pb_data/ 26 | .data 27 | outputs -------------------------------------------------------------------------------- /backend/src/publicBroadcast.ts: -------------------------------------------------------------------------------- 1 | import { uuidv7 } from "uuidv7"; 2 | import { db } from "./db"; 3 | import { pubsub } from "./pubsub"; 4 | import type { Room } from "./room"; 5 | 6 | export function publicBroadcast(room: Room, method: string, params: any) { 7 | pubsub.publish(room.publicTopic, method, params); 8 | db.roomLogs(room).set(uuidv7(), { 9 | time: new Date().toISOString(), 10 | method, 11 | params, 12 | }); 13 | } 14 | -------------------------------------------------------------------------------- /src/components/TranscriptViewerKnobs.ts: -------------------------------------------------------------------------------- 1 | import { atom } from "nanostores"; 2 | 3 | export const $autoScroll = atom(true); 4 | export const $autoCorrects = atom("โมนัด=>monad"); 5 | 6 | // Save $autoCorrects to sessionStorage 7 | $autoCorrects.subscribe((value) => { 8 | sessionStorage.setItem("autoCorrects", value); 9 | }); 10 | 
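// Restore the persisted value (if any) when this module is first loaded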
if (sessionStorage.getItem("autoCorrects")) {
11 |   $autoCorrects.set(sessionStorage.getItem("autoCorrects")!);
12 | }
13 | 
--------------------------------------------------------------------------------
/src/logbus.ts:
--------------------------------------------------------------------------------
1 | const logListeners = new Set<LogListener>();
2 | 
3 | export interface LogListener {
4 |   onLog(message: string): void;
5 | }
6 | 
7 | export function addLogListener(listener: LogListener) {
8 |   logListeners.add(listener);
9 |   return () => {
10 |     logListeners.delete(listener);
11 |   };
12 | }
13 | 
14 | export function log(message: string) {
15 |   for (const listener of logListeners) {
16 |     listener.onLog(message);
17 |   }
18 | }
19 | 
--------------------------------------------------------------------------------
/public/favicon.svg:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
9 | 
10 | 
--------------------------------------------------------------------------------
/backend/src/db.ts:
--------------------------------------------------------------------------------
1 | import { mkdirSync } from "fs";
2 | import { Partition, Persistence } from "./persistence";
3 | import type { Room } from "./room";
4 | 
5 | mkdirSync(".data", { recursive: true });
6 | const persistence = new Persistence(".data/database.sqlite");
7 | 
8 | export const db = {
9 |   get audio(): Partition {
10 |     return persistence.getPartition("audio");
11 |   },
12 |   get rooms(): Partition {
13 |     return persistence.getPartition("rooms");
14 |   },
15 |   roomItems(room: Room): Partition {
16 |     return persistence.getPartition(`room_${room.name}`);
17 |   },
18 |   roomPartials(room: Room): Partition {
19 |     return persistence.getPartition(`partials_${room.name}`);
20 |   },
21 |   roomLogs(room: Room): Partition {
22 |     return persistence.getPartition(`logs_${room.name}`);
23 |   },
24 | };
25 | 
--------------------------------------------------------------------------------
/src/layouts/Layout.astro:
--------------------------------------------------------------------------------
1 | ---
2 | interface Props {
3 |   title: string;
4 | }
5 | 
6 | const { title } = Astro.props;
7 | ---
8 | 
9 | 
10 | 
11 | 
12 | 
13 | 
14 | 
15 | 
16 | 
17 | {title}
18 | 
24 | 
25 | 
26 | 
27 | 
28 | 
29 | 
--------------------------------------------------------------------------------
/backend/scripts/createRoom.ts:
--------------------------------------------------------------------------------
1 | import chalk from "chalk";
2 | import { adminApi } from "../src/client";
3 | 
4 | const roomInfo = await adminApi<{
5 |   roomId: string;
6 |   roomKey: string;
7 | }>("/admin/rooms", { method: "POST" });
8 | 
9 | const webUrl = process.env["FRONTEND_URL_BASE"];
10 | const backendUrl = process.env["SERVER_URL_BASE"];
11 | 
12 | console.log(JSON.stringify(roomInfo, null, 2));
13 | 
14 | console.log(`
15 | ${chalk.yellow.bold("Viewer URL:")}
16 | ${webUrl}/view?backend=${backendUrl}&room=${roomInfo.roomId}
17 | 
18 | ${chalk.yellow.bold("Editor URL:")}
19 | ${webUrl}/view?backend=${backendUrl}&room=${roomInfo.roomId}&key=${
20 |   roomInfo.roomKey
21 | }
22 | 
23 | ${chalk.yellow.bold("Audio Sender URL:")}
24 | ${webUrl}/sender?backend=${backendUrl}&room=${roomInfo.roomId}&key=${
25 |   roomInfo.roomKey
26 | }
27 | 
28 | ${chalk.bold("env:")}
29 | SERVER_URL_BASE=${backendUrl}
30 | ROOM_ID=${roomInfo.roomId}
31 | ROOM_KEY=${roomInfo.roomKey}
32 | `);
33 | 
--------------------------------------------------------------------------------
/backend/src/client.ts:
--------------------------------------------------------------------------------
1 | import { ofetch } from "ofetch";
2 | 
3 | export const publicApi = ofetch.create({
4 |   baseURL: process.env["SERVER_URL_BASE"],
5 | });
6 | 
7 | export const adminApi = publicApi.create({
8 |   headers: {
9 |     authorization: `Bearer ${process.env["SERVICE_TOKEN"]}`,
10 |   },
11 | });
12 | 
13 | export function getRoomConfig(): RoomConfig {
14 |   const roomId = process.env["ROOM_ID"];
15 |   const roomKey = process.env["ROOM_KEY"];
16 | 
17 |   if (!roomId) {
18 |     throw new Error("Missing ROOM_ID");
19 |   }
20 |   if (!roomKey) {
21 |     throw new Error("Missing ROOM_KEY");
22 |   }
23 | 
24 |   return { roomId, roomKey };
25 | }
26 | 
27 | export type RoomConfig = { roomId: string; roomKey: string };
28 | 
29 | export function createRoomApi({ roomId, roomKey }: RoomConfig) {
30 |   return publicApi.create({
31 |     headers: {
32 |       authorization: `Bearer ${roomKey}`,
33 |     },
34 |     baseURL: `${process.env["SERVER_URL_BASE"]}/rooms/${roomId}`,
35 |   });
36 | }
37 | 
--------------------------------------------------------------------------------
/backend/src/pubsub.ts:
--------------------------------------------------------------------------------
1 | type Listener = (message: string) => void;
2 | 
3 | class PubSub {
4 |   private listenerSetMap = new Map<string, Set<Listener>>();
5 | 
6 |   getListenerSet(channel: string): Set<Listener> {
7 |     if (!this.listenerSetMap.has(channel)) {
8 |       this.listenerSetMap.set(channel, new Set());
9 |     }
10 |     return this.listenerSetMap.get(channel)!;
11 |   }
12 | 
13 |   subscribe(channel: string, listener: Listener): () => void {
14 |     const listeners = this.getListenerSet(channel);
15 |     listeners.add(listener);
16 |     return () => {
17 |       listeners.delete(listener);
18 |     };
19 |   }
20 | 
21 |   publish(channel: string, method: string, params: any): void {
22 |     const payload = JSON.stringify({ method, params });
23 |     const listeners = this.getListenerSet(channel);
24 |     for (const listener of listeners) {
25 |       try {
26 |         listener(payload);
27 |       } catch (error) {
28 |         console.error(error);
29 |       }
30 |     }
31 |   }
32 | }
33 | 
34 | export const pubsub = new PubSub();
35 | 
--------------------------------------------------------------------------------
/backend/src/utterance.ts:
--------------------------------------------------------------------------------
1 | import { uuidv7 } from "uuidv7";
2 | import { db } from "./db";
3 | import { updateItem } from "./itemOperations";
4 | import { pubsub } from "./pubsub";
5 | import type { Room } from "./room";
6 | 
7 | export class Utterance {
8 |   id = uuidv7();
9 |   start = new Date().toISOString();
10 |   buffers: Buffer[] = [];
11 | 
12 |   constructor(public room: Room, localTime: string) {
13 |     pubsub.publish(room.audioTopic, "audio_start", { id: this.id });
14 |     updateItem(room, this.id, { start: this.start, startLocalTime: localTime });
15 |   }
16 |   addAudio(base64: string) {
17 |     this.buffers.push(Buffer.from(base64, "base64"));
18 |     pubsub.publish(this.room.audioTopic, "audio_data", { id: this.id, base64 });
19 |   }
20 |   async finish() {
21 |     pubsub.publish(this.room.audioTopic, "audio_finish", { id: this.id });
22 |     const buffer = Buffer.concat(this.buffers);
23 |     await db.audio.set(this.id, buffer.toString("base64"));
24 |     await updateItem(this.room, this.id, {
25 |       finish: new Date().toISOString(),
26 |       length: buffer.length,
27 |     });
28 |   }
29 | }
30 | 
--------------------------------------------------------------------------------
/backend/src/itemOperations.ts:
-------------------------------------------------------------------------------- 1 | import { db } from "./db"; 2 | import { publicBroadcast } from "./publicBroadcast"; 3 | import { Room } from "./room"; 4 | 5 | export async function getItems(room: Room) { 6 | const output = []; 7 | for await (const [id, data] of db.roomItems(room)) { 8 | if (typeof data !== "object") { 9 | console.error("Invalid item", id); 10 | } else { 11 | output.push({ id, ...data }); 12 | } 13 | } 14 | return output; 15 | } 16 | 17 | export async function getItem(room: Room, id: string) { 18 | const item = await db.roomItems(room).get(id); 19 | return item ? { ...item, id } : null; 20 | } 21 | 22 | export async function updateItem(room: Room, id: string, changes: any) { 23 | const existingItem = (await db.roomItems(room).get(id)) || {}; 24 | const newValue = { 25 | ...existingItem, 26 | ...changes, 27 | changes: [ 28 | ...(existingItem?.changes ?? []), 29 | { payload: changes, time: new Date().toISOString() }, 30 | ], 31 | }; 32 | await db.roomItems(room).set(id, newValue); 33 | publicBroadcast(room, "updated", { ...newValue, id }); 34 | return newValue; 35 | } 36 | -------------------------------------------------------------------------------- /src/components/LogViewer.tsx: -------------------------------------------------------------------------------- 1 | import { useEffect, useRef, useState } from "react"; 2 | import { addLogListener } from "../logbus"; 3 | 4 | export function LogViewer() { 5 | const ref = useRef(null); 6 | const [autoScroll, setAutoScroll] = useState(true); 7 | const autoScrollRef = useRef(autoScroll); 8 | useEffect(() => { 9 | autoScrollRef.current = autoScroll; 10 | }, [autoScroll]); 11 | useEffect(() => { 12 | return addLogListener({ 13 | onLog(message) { 14 | if (ref.current && autoScrollRef.current) { 15 | ref.current.value += message + "\n"; 16 | ref.current.scrollTop = ref.current.scrollHeight; 17 | } 18 | }, 19 | }); 20 | }, []); 21 | return ( 22 | 23 | 29 | 30 | {/* auto scroll checkbox */} 31 | 32 | setAutoScroll(event.target.checked)} 38 | /> 39 | 40 | Auto scroll 41 | 42 | 43 | 44 | 45 | ); 46 | } 47 | -------------------------------------------------------------------------------- /src/components/TranscriptViewer.css: -------------------------------------------------------------------------------- 1 | .TranscriptViewer { 2 | font-family: "Sarabun", sans-serif; 3 | letter-spacing: 0.1ch; 4 | font-size: 20px; 5 | padding-bottom: 75vh; 6 | } 7 | 8 | .TranscriptItem { 9 | margin-bottom: 0.5rem; 10 | display: flex; 11 | } 12 | 13 | .TranscriptItem__content { 14 | padding: 0.75rem 1rem; 15 | border-radius: 1rem; 16 | border: 1px solid transparent; 17 | position: relative; 18 | } 19 | 20 | .TranscriptItem__content[data-transcribed="true"] { 21 | border-color: var(--bs-border-color); 22 | } 23 | 24 | .TranscriptItem__content[data-needs-correction="true"] { 25 | color: var(--bs-yellow); 26 | } 27 | 28 | .TranscriptItem__content[data-editing="true"] { 29 | border-color: var(--bs-yellow); 30 | } 31 | 32 | .TranscriptItem__time { 33 | display: block; 34 | color: var(--bs-gray); 35 | font-size: 0.5em; 36 | position: absolute; 37 | bottom: 0.2rem; 38 | right: 0.7rem; 39 | opacity: 0; 40 | user-select: none; 41 | } 42 | 43 | .TranscriptItem__content:hover .TranscriptItem__time { 44 | opacity: 1; 45 | } 46 | 47 | .TranscriptViewerOptions { 48 | position: fixed; 49 | bottom: 0; 50 | left: 0; 51 | right: 0; 52 | background: #000; 53 | padding: 0.5rem; 54 | transform: translateY(100%) 
translateY(-0.5rem); 55 | transition: all 0.3s; 56 | opacity: 0; 57 | } 58 | 59 | .TranscriptViewerOptions:hover, 60 | .TranscriptViewerOptions:focus-within, 61 | .TranscriptViewerOptions[data-editable="true"] { 62 | transform: translateY(0); 63 | opacity: 1; 64 | } 65 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "fresh-app", 3 | "type": "module", 4 | "version": "0.0.1", 5 | "scripts": { 6 | "dev": "astro dev", 7 | "start": "astro dev", 8 | "build": "astro check && astro build", 9 | "preview": "astro preview", 10 | "astro": "astro", 11 | "server": "tsx --env-file=.env backend/scripts/server.ts", 12 | "dev:server": "tsx --env-file=.env --watch backend/scripts/server.ts", 13 | "createRoom": "tsx --env-file=.env backend/scripts/createRoom.ts", 14 | "batchTranscriber": "tsx --env-file=.env backend/scripts/batchTranscriber.ts", 15 | "whisperTranslator": "tsx --env-file=.env backend/scripts/whisperTranslator.ts", 16 | "partialTranscriber": "tsx --env-file=.env backend/scripts/partialTranscriber.ts" 17 | }, 18 | "dependencies": { 19 | "@astrojs/check": "^0.9.2", 20 | "@astrojs/react": "^3.6.2", 21 | "@fastify/cors": "^10.0.1", 22 | "@fastify/websocket": "^11.0.1", 23 | "@google-cloud/speech": "^6.7.0", 24 | "@google/generative-ai": "^0.21.0", 25 | "@keyv/sqlite": "^4.0.1", 26 | "@nanostores/react": "^0.7.3", 27 | "@ricky0123/vad-web": "^0.0.19", 28 | "@stablelib/base64": "^2.0.0", 29 | "@thai/html": "npm:@jsr/thai__html@0.1.0-alpha.0", 30 | "@types/react": "^18.3.8", 31 | "@types/react-dom": "^18.3.0", 32 | "astro": "^4.14.2", 33 | "buffer-es6": "^4.9.3", 34 | "chalk": "^5.3.0", 35 | "fastify": "^5.0.0", 36 | "groq-sdk": "^0.7.0", 37 | "keyv": "^5.1.0", 38 | "nanostores": "^0.11.3", 39 | "ofetch": "^1.4.1", 40 | "react": "^18.3.1", 41 | "react-dom": "^18.3.1", 42 | "react-textarea-autosize": "^8.5.4", 43 | "reconnecting-websocket": "^4.4.0", 44 | "sqlite3": "^5.1.7", 45 | "tsx": "^4.19.1", 46 | "typescript": "^5.5.4", 47 | "uuidv7": "^1.0.2" 48 | }, 49 | "packageManager": "pnpm@9.7.1+sha512.faf344af2d6ca65c4c5c8c2224ea77a81a5e8859cbc4e06b1511ddce2f0151512431dd19e6aff31f2c6a8f5f2aced9bd2273e1fed7dd4de1868984059d2c4247", 50 | "devDependencies": { 51 | "@types/node": "^22.7.6" 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /backend/scripts/whisperTranslator.ts: -------------------------------------------------------------------------------- 1 | import Groq from "groq-sdk"; 2 | import { uuidv7 } from "uuidv7"; 3 | import { createRoomApi, getRoomConfig, publicApi } from "../src/client"; 4 | 5 | const groq = new Groq(); 6 | const api = createRoomApi(getRoomConfig()); 7 | 8 | let waiting = false; 9 | async function main() { 10 | const list = await api< 11 | { 12 | id: string; 13 | start: string; 14 | finish: string; 15 | length: number; 16 | transcript?: string; 17 | }[] 18 | >(`/items`); 19 | 20 | const validItems = list.filter((item) => item.length > 0); 21 | validItems.sort((a, b) => a.start.localeCompare(b.start)); 22 | 23 | const untranscribed = validItems.filter((item) => item.transcript == null); 24 | if (!untranscribed.length) { 25 | if (!waiting) { 26 | waiting = true; 27 | process.stderr.write("Waiting for transcription..."); 28 | } else { 29 | process.stderr.write("."); 30 | } 31 | return false; 32 | } 33 | if (waiting) { 34 | process.stderr.write("\n"); 35 | waiting = false; 36 | } 37 | 38 
| const modelName = "whisper-large-v3";
39 |   const transcription = await groq.audio.translations.create({
40 |     file: new File([await loadAudio(untranscribed[0].id)], "audio.wav"),
41 |     model: modelName,
42 |     response_format: "json",
43 |   });
44 |   console.log(transcription.text);
45 |   const usageId = uuidv7();
46 |   await api(`/items/${untranscribed[0].id}`, {
47 |     method: "PATCH",
48 |     body: {
49 |       transcript: transcription.text,
50 |       transcriptBy: modelName,
51 |       usageId,
52 |     },
53 |   });
54 |   return true;
55 | }
56 | 
57 | async function loadAudio(id: string) {
58 |   return publicApi(`/pcm/${id}`, { responseType: "blob" }).then((r) =>
59 |     r.arrayBuffer()
60 |   );
61 | }
62 | 
63 | const initialHp = 5;
64 | let hp = initialHp;
65 | for (;;) {
66 |   try {
67 |     if (!(await main())) {
68 |       await new Promise((r) => setTimeout(r, 1000));
69 |     }
70 |     if (hp < initialHp) {
71 |       hp = initialHp;
72 |       console.error("HP has been restored to", hp);
73 |     }
74 |   } catch (error) {
75 |     console.error(error);
76 |     hp--;
77 |     if (hp <= 0) {
78 |       console.error("Giving up");
79 |       process.exit(1);
80 |       break;
81 |     } else {
82 |       console.error("HP has been reduced to", hp);
83 |     }
84 |   } finally {
85 |     await new Promise((r) => setTimeout(r, 100));
86 |   }
87 | }
88 | 
--------------------------------------------------------------------------------
/backend/src/persistence.ts:
--------------------------------------------------------------------------------
1 | import sqlite3 from "sqlite3";
2 | 
3 | export class Partition {
4 |   private db: sqlite3.Database;
5 | 
6 |   constructor(db: sqlite3.Database, private partitionKey: string) {
7 |     this.db = db;
8 |   }
9 | 
10 |   async get(sortKey: string): Promise<any> {
11 |     return new Promise((resolve, reject) => {
12 |       this.db.get<{ value: string }>(
13 |         "SELECT value FROM keyvalue WHERE partition = ? AND key = ?",
14 |         [this.partitionKey, sortKey],
15 |         (err, row) => {
16 |           if (err) reject(err);
17 |           else resolve(row ? JSON.parse(row.value) : undefined);
18 |         }
19 |       );
20 |     });
21 |   }
22 | 
23 |   async set(sortKey: string, value: any): Promise<void> {
24 |     const serializedValue = JSON.stringify(value);
25 |     return new Promise((resolve, reject) => {
26 |       this.db.run(
27 |         "INSERT OR REPLACE INTO keyvalue (partition, key, value) VALUES (?, ?, ?)",
28 |         [this.partitionKey, sortKey, serializedValue],
29 |         function (err: Error | null) {
30 |           if (err) reject(err);
31 |           else resolve();
32 |         }
33 |       );
34 |     });
35 |   }
36 | 
37 |   async *[Symbol.asyncIterator]() {
38 |     const rows = await new Promise<any[]>((resolve, reject) => {
39 |       this.db.all(
40 |         "SELECT key, value FROM keyvalue WHERE partition = ?",
41 |         [this.partitionKey],
42 |         (err, rows) => {
43 |           if (err) reject(err);
44 |           else resolve(rows);
45 |         }
46 |       );
47 |     });
48 | 
49 |     for (const row of rows) {
50 |       yield [row.key, JSON.parse(row.value)];
51 |     }
52 |   }
53 | }
54 | 
55 | export class Persistence {
56 |   private db: sqlite3.Database;
57 |   private partitions: Map<string, Partition> = new Map();
58 | 
59 |   constructor(connectionString: string) {
60 |     this.db = new sqlite3.Database(connectionString, (err) => {
61 |       if (err) {
62 |         console.error("Error opening database:", err.message);
63 |       } else {
64 |         this.initializeDatabase();
65 |       }
66 |     });
67 |   }
68 | 
69 |   private initializeDatabase() {
70 |     this.db.run(`
71 |       CREATE TABLE IF NOT EXISTS keyvalue (
72 |         partition TEXT,
73 |         key TEXT,
74 |         value TEXT,
75 |         PRIMARY KEY (partition, key)
76 |       )
77 |     `);
78 |   }
79 | 
80 |   getPartition(partitionKey: string): Partition {
81 |     if (!this.partitions.has(partitionKey)) {
82 |       this.partitions.set(partitionKey, new Partition(this.db, partitionKey));
83 |     }
84 |     return this.partitions.get(partitionKey)!;
85 |   }
86 | }
87 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # live-speech
2 | 
3 | This project aims to provide live speech transcription for tech events, specifically designed to support Thai tech talks where there's a mixture of Thai words and technical terms. The system offers real-time transcription with post-processing capabilities for improved accuracy.
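
The components communicate with the server over a small JSON-RPC-style WebSocket protocol. As an illustration, here is a minimal sketch of the messages an audio sender produces (the `/rooms/:room/audioIngest` endpoint and message shapes come from `backend/scripts/server.ts`; the bootstrap code and the placeholder `pcmChunk` buffer are assumptions for illustration only — the real sender is `src/components/AudioSender.tsx`):

```ts
// Sketch only: assumes SERVER_URL_BASE, ROOM_ID and ROOM_KEY are set,
// and that `pcmChunk` holds 16 kHz mono s16le PCM captured elsewhere.
const url = `${process.env.SERVER_URL_BASE!.replace(/^http/, "ws")}/rooms/${
  process.env.ROOM_ID
}/audioIngest?key=${process.env.ROOM_KEY}`;
const ws = new WebSocket(url); // global WebSocket is available in Node 22

const pcmChunk = Buffer.alloc(3200); // placeholder: 100 ms of silence
ws.onopen = () => {
  // One utterance is bracketed by "start" and "stop"; the server
  // acknowledges each message with { id, result }.
  ws.send(JSON.stringify({ method: "start", params: { localTime: new Date().toISOString() } }));
  ws.send(JSON.stringify({ method: "audio", params: { data: pcmChunk.toString("base64") } }));
  ws.send(JSON.stringify({ method: "stop" }));
};
```

Everything derived from that audio then flows back out to viewers on the room's `publicEvents` socket.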
4 | 
5 | ## Components
6 | 
7 | 
8 | 
9 | ### Audio Sender
10 | 
11 | - Web-based, using the `getUserMedia` API
12 | - Responsible for capturing audio from the speaker's device, converting it to 16-bit linear PCM audio data, and sending it to the server using WebSockets
13 | 
14 | ### Server
15 | 
16 | - Acts as the central backend for the application
17 | - Handles database operations and pub/sub functionality
18 | - Manages communication between different components
19 | 
20 | ### Realtime Transcriber
21 | 
22 | - Performs streaming transcription in real-time
23 | - Provides quick, albeit less accurate, transcriptions
24 | - Useful for immediate feedback and live subtitles
25 | 
26 | ### Batch Transcriber
27 | 
28 | - Uses a more advanced ASR model (Gemini 1.5 Flash) for improved accuracy
29 | - Processes audio in batches for higher-quality transcriptions
30 | 
31 | ### Transcript Viewer
32 | 
33 | - Displays the transcribed text to the audience
34 | - Shows both real-time and refined transcriptions
35 | 
36 | ## Key Features
37 | 
38 | - Real-time audio capture and streaming
39 | - Live transcription with quick feedback
40 | - High-accuracy batch processing for refined transcripts
41 | - Support for mixed Thai and English technical content
42 | 
43 | This system is designed to enhance the accessibility and documentation of Thai tech talks by providing accurate transcriptions that can handle the unique challenges of mixed-language technical presentations.
44 | 
45 | ## Setup
46 | 
47 | ```sh
48 | # Install Node.js
49 | mise install
50 | 
51 | # Enable corepack
52 | corepack enable
53 | 
54 | # Install dependencies
55 | pnpm install
56 | ```
57 | 
58 | `.env`:
59 | 
60 | ```sh
61 | # For local development
62 | SERVER_URL_BASE=http://localhost:10300
63 | FRONTEND_URL_BASE=http://localhost:4321
64 | 
65 | # Generate a random string for the secret key, e.g. using `openssl rand -hex 32`
66 | SERVICE_TOKEN=
67 | 
68 | # For batch transcription
69 | GEMINI_API_KEY=
70 | 
71 | # Change to "pro" for better transcription quality at a higher cost
72 | GEMINI_MODEL=flash
73 | 
74 | # For partial transcription with Speechmatics
75 | PARTIAL_TRANSCRIBER_PROVIDER=speechmatics
76 | SPEECHMATICS_API_KEY=
77 | 
78 | # For partial transcription with Google
79 | # PARTIAL_TRANSCRIBER_PROVIDER=google
80 | # GOOGLE_APPLICATION_CREDENTIALS=
81 | 
82 | # For partial transcription with a local model (macOS only),
83 | # compile this CLI and set
84 | # PARTIAL_TRANSCRIBER_PROVIDER=local
85 | ```
86 | 
87 | ## How much does it cost?
88 | 
89 | The numbers are **approximate** and depend on which models you use.
90 | 
91 | The Google Speech-to-Text model has lower latency (from Thailand) and is cheaper, but performs worse than Speechmatics for Thai content.
92 | 
93 | | Partial transcription model | Price per hour |
94 | | --------------------------- | -------------- |
95 | | `local`                     | $0.00          |
96 | | `google`                    | $0.81          |
97 | | `speechmatics`              | $1.18          |
98 | 
99 | Gemini Flash works great for Thai content, but for English content [Gemini Pro is recommended for better punctuation insertion](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/audio-understanding#:~:text=Transcription%20punctuation%3A%20(if%20using%20Gemini%C2%A01.5%C2%A0Flash)%20The%20models%20might%20return%20transcriptions%20that%20don%27t%20include%20punctuation).
100 | 
101 | | Batch transcription model | Price per hour |
102 | | ------------------------- | -------------- |
103 | | Gemini Flash              | $0.18          |
104 | | Gemini Pro                | $2.97          |
105 | 
106 | ## Workflow
107 | 
108 | 1. 
Run the server: 109 | 110 | ```sh 111 | pnpm run server # or `pnpm run dev:server` to restart on file changes 112 | ``` 113 | 114 | 2. Run the frontend: 115 | 116 | ```sh 117 | pnpm run dev 118 | ``` 119 | 120 | 3. Create a room: 121 | 122 | ```sh 123 | pnpm run createRoom 124 | ``` 125 | 126 | 4. Run partial transcriber: 127 | 128 | ```sh 129 | pnpm run partialTranscriber 130 | ``` 131 | 132 | 5. Run batch transcriber: 133 | 134 | ```sh 135 | pnpm run batchTranscriber 136 | ``` 137 | 138 | 6. Navigate to audio sender. 139 | -------------------------------------------------------------------------------- /backend/scripts/server.ts: -------------------------------------------------------------------------------- 1 | import Cors from "@fastify/cors"; 2 | import Websocket from "@fastify/websocket"; 3 | import Fastify, { type FastifyRequest } from "fastify"; 4 | import { randomBytes } from "node:crypto"; 5 | import { uuidv7 } from "uuidv7"; 6 | import { db } from "../src/db"; 7 | import { getItem, getItems, updateItem } from "../src/itemOperations"; 8 | import { publicBroadcast } from "../src/publicBroadcast"; 9 | import { pubsub } from "../src/pubsub"; 10 | import { Room } from "../src/room"; 11 | import { Utterance } from "../src/utterance"; 12 | 13 | const fastify = Fastify({ 14 | logger: true, 15 | }); 16 | await fastify.register(Websocket); 17 | 18 | // Add `Access-Control-Allow-Private-Network: true` to all responses 19 | fastify.addHook("onSend", (request, reply, payload, done) => { 20 | reply.header("Access-Control-Allow-Private-Network", "true"); 21 | done(); 22 | }); 23 | 24 | await fastify.register(Cors); 25 | 26 | fastify.post("/admin/rooms", async (req, reply) => { 27 | const token = req.headers["authorization"]?.split(" ")[1]; 28 | if (token !== process.env["SERVICE_TOKEN"]) { 29 | reply.code(401).send({ error: "Unauthorized" }); 30 | return; 31 | } 32 | 33 | const roomId = uuidv7(); 34 | const roomKey = randomBytes(32).toString("hex"); 35 | 36 | await db.rooms.set(roomId, { roomKey }); 37 | 38 | return { roomId, roomKey }; 39 | }); 40 | 41 | fastify.get("/admin/rooms", async (req, reply) => { 42 | const token = req.headers["authorization"]?.split(" ")[1]; 43 | if (token !== process.env["SERVICE_TOKEN"]) { 44 | reply.code(401).send({ error: "Unauthorized" }); 45 | return; 46 | } 47 | 48 | const rooms = []; 49 | for await (const [roomId, roomData] of db.rooms) { 50 | rooms.push({ roomId, ...roomData }); 51 | } 52 | 53 | return rooms; 54 | }); 55 | 56 | async function checkRoomKey(room: Room, key: string) { 57 | const roomInfo = await db.rooms.get(room.name); 58 | if (!roomInfo) { 59 | return false; 60 | } 61 | return roomInfo.roomKey === key; 62 | } 63 | 64 | async function validateRoomKey( 65 | req: FastifyRequest, 66 | room: Room 67 | ): Promise { 68 | const authHeader = req.headers["authorization"]; 69 | if (!authHeader) return false; 70 | 71 | const [bearer, key] = authHeader.split(" "); 72 | if (bearer !== "Bearer" || !key) return false; 73 | 74 | return checkRoomKey(room, key); 75 | } 76 | 77 | fastify.get( 78 | "/rooms/:room/audioIngest", 79 | { websocket: true }, 80 | async (connection, req) => { 81 | const key = (req.query as { key: string }).key; 82 | const room = new Room((req.params as { room: string }).room); 83 | if (!(await checkRoomKey(room, key))) { 84 | connection.send(JSON.stringify({ error: "Invalid token" })); 85 | connection.close(); 86 | return; 87 | } 88 | 89 | let currentUtterance: Utterance | undefined; 90 | connection.on("message", async (message: any) 
=> { 91 | try { 92 | const data = JSON.parse(message.toString()); 93 | // JSON-RPC messages: 94 | // - "start" - start audio stream. params.localTime is the local time when the audio stream started. 95 | // - "audio" - audio data. params.data is base64-encoded s16le audio data. 96 | // - "stop" - stop audio stream. 97 | // Send acknowledgement for each message. 98 | try { 99 | switch (data.method) { 100 | case "start": { 101 | currentUtterance = new Utterance(room, data.params.localTime); 102 | break; 103 | } 104 | case "audio": { 105 | currentUtterance?.addAudio(data.params.data); 106 | break; 107 | } 108 | case "stop": { 109 | currentUtterance?.finish(); 110 | break; 111 | } 112 | } 113 | connection.send(JSON.stringify({ id: data.id, result: null })); 114 | } catch (error) { 115 | connection.send( 116 | JSON.stringify({ id: data.id, error: String(error) }) 117 | ); 118 | req.log.error(error); 119 | } 120 | } catch (error) { 121 | req.log.error(error); 122 | } 123 | }); 124 | connection.send(JSON.stringify({ method: "welcome" })); 125 | } 126 | ); 127 | 128 | fastify.get( 129 | "/rooms/:room/audioEvents", 130 | { websocket: true }, 131 | async (connection, req) => { 132 | const key = (req.query as Record).key; 133 | const room = new Room((req.params as { room: string }).room); 134 | if (!(await checkRoomKey(room, key))) { 135 | connection.send(JSON.stringify({ error: "Invalid room key" })); 136 | connection.close(); 137 | return; 138 | } 139 | connection.on("message", async (message: any) => { 140 | try { 141 | const data = JSON.parse(message.toString()); 142 | if (data.method === "submit_partial_transcript") { 143 | const { id, transcript } = data.params; 144 | publicBroadcast(room, "partial_transcript", { 145 | id, 146 | transcript, 147 | }); 148 | connection.send( 149 | JSON.stringify({ id: data.id, result: { ok: true } }) 150 | ); 151 | } 152 | } catch (error) { 153 | req.log.error(error); 154 | } 155 | }); 156 | const unsubscribe = pubsub.subscribe(room.audioTopic, (message) => { 157 | connection.send(message); 158 | }); 159 | connection.on("close", unsubscribe); 160 | } 161 | ); 162 | 163 | fastify.get( 164 | "/rooms/:room/publicEvents", 165 | { websocket: true }, 166 | (connection, req) => { 167 | const room = new Room((req.params as { room: string }).room); 168 | const unsubscribe = pubsub.subscribe(room.publicTopic, (message) => { 169 | connection.send(message); 170 | }); 171 | connection.on("close", unsubscribe); 172 | } 173 | ); 174 | 175 | fastify.get("/rooms/:room/items", async (req) => { 176 | const room = new Room((req.params as { room: string }).room); 177 | const items = await getItems(room); 178 | return items; 179 | }); 180 | 181 | fastify.get("/rooms/:room/items/:id", async (req, reply) => { 182 | const room = new Room((req.params as { room: string }).room); 183 | const id = (req.params as { id: string }).id; 184 | const item = await getItem(room, id); 185 | if (!item) { 186 | reply.status(404).send({ error: "Not found" }); 187 | return; 188 | } 189 | return item; 190 | }); 191 | 192 | fastify.patch("/rooms/:room/items/:id", async (req, reply) => { 193 | const room = new Room((req.params as { room: string }).room); 194 | if (!(await validateRoomKey(req, room))) { 195 | reply.code(401).send({ error: "Invalid room key" }); 196 | return; 197 | } 198 | const id = (req.params as { id: string }).id; 199 | const body = req.body as any; 200 | const newValue = await updateItem(room, id, body); 201 | return newValue; 202 | }); 203 | 204 | fastify.get("/pcm/:id", async (req, 
reply) => { 205 | const id = (req.params as { id: string }).id; 206 | const buffer = Buffer.from((await db.audio.get(id)) as string, "base64"); 207 | // Generate wav file. Buffer is raw PCM, s16le, 1 channel. 208 | const sampleRate = 16000; // Assuming 16kHz sample rate 209 | const numChannels = 1; 210 | const bitsPerSample = 16; 211 | 212 | const dataSize = buffer.length; 213 | const wavBuffer = Buffer.alloc(44 + dataSize); 214 | 215 | // WAV header 216 | wavBuffer.write("RIFF", 0); 217 | wavBuffer.writeUInt32LE(36 + dataSize, 4); 218 | wavBuffer.write("WAVE", 8); 219 | wavBuffer.write("fmt ", 12); 220 | wavBuffer.writeUInt32LE(16, 16); 221 | wavBuffer.writeUInt16LE(1, 20); 222 | wavBuffer.writeUInt16LE(numChannels, 22); 223 | wavBuffer.writeUInt32LE(sampleRate, 24); 224 | wavBuffer.writeUInt32LE((sampleRate * numChannels * bitsPerSample) / 8, 28); 225 | wavBuffer.writeUInt16LE((numChannels * bitsPerSample) / 8, 32); 226 | wavBuffer.writeUInt16LE(bitsPerSample, 34); 227 | wavBuffer.write("data", 36); 228 | wavBuffer.writeUInt32LE(dataSize, 40); 229 | 230 | // Copy PCM data 231 | buffer.copy(wavBuffer, 44); 232 | 233 | reply 234 | .header("Content-Type", "audio/wav") 235 | .header("Content-Disposition", `inline; filename="${id}.wav"`) 236 | .send(wavBuffer); 237 | }); 238 | 239 | fastify.listen({ port: 10300 }); 240 | -------------------------------------------------------------------------------- /backend/scripts/partialTranscriber.ts: -------------------------------------------------------------------------------- 1 | import { protos, v2 } from "@google-cloud/speech"; 2 | import { spawn } from "child_process"; 3 | import { createInterface } from "node:readline"; 4 | import { PassThrough, Readable } from "node:stream"; 5 | import { pipeline } from "node:stream/promises"; 6 | import { ofetch } from "ofetch"; 7 | import ReconnectingWebSocket from "reconnecting-websocket"; 8 | import { getRoomConfig } from "../src/client"; 9 | 10 | const roomConfig = getRoomConfig(); 11 | 12 | const websocket = new ReconnectingWebSocket( 13 | `${process.env["SERVER_URL_BASE"]!.replace(/^http/, "ws")}/rooms/${ 14 | roomConfig.roomId 15 | }/audioEvents?key=${roomConfig.roomKey}` 16 | ); 17 | 18 | function isAbortError(e: any) { 19 | return e.name === "AbortError"; 20 | } 21 | 22 | function createTranscriber( 23 | language: string, 24 | requireOnDevice: boolean, 25 | signal: AbortSignal 26 | ) { 27 | const child = spawn("transcriber", [language], { 28 | stdio: ["pipe", "pipe", "inherit"], 29 | env: { 30 | ...process.env, 31 | ...(requireOnDevice ? 
{ TRANSCRIBE_ON_DEVICE_ONLY: "1" } : {}),
32 |     },
33 |     signal,
34 |   });
35 |   child.on("error", (error) => {
36 |     if (isAbortError(error)) return;
37 |     console.error("Transcriber process encountered error", error);
38 |   });
39 |   return async function* (source: AsyncIterable<Buffer>) {
40 |     Readable.from(source).pipe(child.stdin);
41 |     for await (const line of parseNdjson(child.stdout)) {
42 |       yield line;
43 |     }
44 |     child.kill();
45 |   };
46 | }
47 | 
48 | function createGoogleTranscriber(language: string, signal: AbortSignal) {
49 |   const client = new v2.SpeechClient();
50 |   const stream = client._streamingRecognize();
51 |   const createRequest = (
52 |     x: protos.google.cloud.speech.v2.IStreamingRecognizeRequest
53 |   ) => x;
54 |   return async function* (source: AsyncIterable<Buffer>) {
55 |     const inputStream = Readable.from(
56 |       (async function* () {
57 |         yield createRequest({
58 |           recognizer:
59 |             "projects/dtinth-audio-transcription/locations/global/recognizers/_",
60 |           streamingConfig: {
61 |             config: {
62 |               explicitDecodingConfig: {
63 |                 encoding: "LINEAR16",
64 |                 sampleRateHertz: 16000,
65 |                 audioChannelCount: 1,
66 |               },
67 |               languageCodes: [language],
68 |               model: "short",
69 |             },
70 |             streamingFeatures: {
71 |               interimResults: true,
72 |             },
73 |           },
74 |         });
75 |         for await (const chunk of source) {
76 |           yield createRequest({ audio: chunk });
77 |         }
78 |       })()
79 |     );
80 |     inputStream.pipe(stream);
81 |     for await (const event of stream) {
82 |       const text = event?.results?.[0]?.alternatives?.[0]?.transcript;
83 |       if (text) {
84 |         yield { text };
85 |       } else {
86 |         console.warn("No text in event", JSON.stringify(event));
87 |       }
88 |     }
89 |   };
90 | }
91 | 
92 | let cachedSpeechmaticsApiKey: string | undefined;
93 | async function obtainSpeechmaticsApiKey() {
94 |   if (cachedSpeechmaticsApiKey) return cachedSpeechmaticsApiKey;
95 | 
96 |   const apiKey = process.env.SPEECHMATICS_API_KEY;
97 |   if (!apiKey) {
98 |     throw new Error("SPEECHMATICS_API_KEY environment variable is not set");
99 |   }
100 | 
101 |   const refresh = async () => {
102 |     const response = await ofetch<{ key_value: string }>(
103 |       "https://mp.speechmatics.com/v1/api_keys?type=rt",
104 |       {
105 |         method: "POST",
106 |         headers: {
107 |           "Content-Type": "application/json",
108 |           Authorization: `Bearer ${apiKey}`,
109 |         },
110 |         body: JSON.stringify({ ttl: 3600 }),
111 |       }
112 |     );
113 |     cachedSpeechmaticsApiKey = response.key_value;
114 |     return response.key_value;
115 |   };
116 |   setInterval(refresh, 1800 * 1000);
117 |   return await refresh();
118 | }
119 | 
120 | function createSpeechmaticsTranscriber(language: string, signal: AbortSignal) {
121 |   const output = new PassThrough({ objectMode: true });
122 |   async function worker(source: AsyncIterable<Buffer>) {
123 |     const tempKey = await obtainSpeechmaticsApiKey();
124 |     const socket = new WebSocket(
125 |       `wss://eu2.rt.speechmatics.com/v2?jwt=${tempKey}`
126 |     );
127 |     const openPromise = new Promise<void>((resolve, reject) => {
128 |       socket.onopen = () => {
129 |         console.log("Connected to Speechmatics WebSocket");
130 |         const startMessage = {
131 |           message: "StartRecognition",
132 |           audio_format: {
133 |             type: "raw",
134 |             encoding: "pcm_s16le",
135 |             sample_rate: 16000,
136 |           },
137 |           transcription_config: {
138 |             language,
139 |             enable_partials: true,
140 |           },
141 |         };
142 |         socket.send(JSON.stringify(startMessage));
143 |         resolve();
144 |       };
145 |       socket.onmessage = (event) => {
146 |         const data = JSON.parse(event.data);
147 |         output.write(data);
148 |         if (data.message === "EndOfTranscript") {
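          // Speechmatics sends EndOfTranscript after the last final transcript;
          // closing the socket and ending the output stream lets the async
          // iterator over `output` finish cleanly.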
socket.close(); 150 | output.end(); 151 | } 152 | }; 153 | socket.onerror = (error) => { 154 | console.error("WebSocket error:", error); 155 | reject(); 156 | output.end(); 157 | }; 158 | socket.onclose = (event) => { 159 | console.log("WebSocket closed:", event.code, event.reason); 160 | output.end(); 161 | }; 162 | }); 163 | await openPromise; 164 | let nChunks = 0; 165 | for await (const chunk of source) { 166 | socket.send(chunk); 167 | nChunks += 1; 168 | } 169 | socket.send( 170 | JSON.stringify({ message: "EndOfStream", last_seq_no: nChunks }) 171 | ); 172 | } 173 | 174 | return async function* (source: AsyncIterable) { 175 | const promise = worker(source); 176 | for await (const item of output) { 177 | if ( 178 | item.message === "AddTranscript" || 179 | item.message === "AddPartialTranscript" 180 | ) { 181 | const text = String(item.metadata?.transcript || "") 182 | .replace(/<\w+>/g, "") 183 | .trim(); 184 | if (text) { 185 | yield { text }; 186 | } 187 | } 188 | } 189 | await promise; 190 | }; 191 | } 192 | 193 | async function* parseNdjson(source: any) { 194 | for await (const line of createInterface({ input: Readable.from(source) })) { 195 | if (line.trim()) { 196 | yield JSON.parse(line); 197 | } 198 | } 199 | } 200 | 201 | let currentTranscription: Transcription | undefined; 202 | 203 | class Transcription { 204 | abortController: AbortController; 205 | input = new PassThrough(); 206 | constructor(public id: string) { 207 | this.abortController = new AbortController(); 208 | this.worker(); 209 | console.log("*", id); 210 | } 211 | addAudio(buffer: Buffer) { 212 | this.input.write(buffer); 213 | } 214 | async worker() { 215 | try { 216 | await pipeline( 217 | this.input, 218 | new PassThrough(), 219 | process.env["PARTIAL_TRANSCRIBER_PROVIDER"] === "speechmatics" 220 | ? createSpeechmaticsTranscriber(process.env['TRANSCRIBER_LANG'] || "th", this.abortController.signal) 221 | : process.env["PARTIAL_TRANSCRIBER_PROVIDER"] === "local" 222 | ? 
createTranscriber("th", false, this.abortController.signal) 223 | : createGoogleTranscriber("th-TH", this.abortController.signal), 224 | async (source) => { 225 | for await (const { text } of source) { 226 | console.log(" -", text); 227 | websocket.send( 228 | JSON.stringify({ 229 | method: "submit_partial_transcript", 230 | params: { 231 | id: this.id, 232 | transcript: text, 233 | }, 234 | }) 235 | ); 236 | } 237 | } 238 | ); 239 | } catch (error) { 240 | console.error(`Worker ${this.id} error`, error); 241 | } 242 | } 243 | finish() { 244 | this.input.end(); 245 | setTimeout(() => { 246 | this.abortController.abort(); 247 | }, 3000); 248 | } 249 | } 250 | 251 | websocket.onopen = () => { 252 | console.log("Connected to backend"); 253 | }; 254 | websocket.onclose = () => { 255 | console.error("Disconnected from backend"); 256 | }; 257 | websocket.onmessage = (e) => { 258 | const data = JSON.parse(e.data); 259 | switch (data.method) { 260 | case "audio_start": { 261 | if (currentTranscription) { 262 | currentTranscription.finish(); 263 | currentTranscription = undefined; 264 | } 265 | currentTranscription = new Transcription(data.params.id); 266 | break; 267 | } 268 | case "audio_data": { 269 | if (currentTranscription && currentTranscription.id !== data.params.id) { 270 | currentTranscription.finish(); 271 | currentTranscription = undefined; 272 | } 273 | if (!currentTranscription) { 274 | currentTranscription = new Transcription(data.params.id); 275 | } 276 | currentTranscription.addAudio(Buffer.from(data.params.base64, "base64")); 277 | break; 278 | } 279 | case "audio_finish": { 280 | if (currentTranscription) { 281 | currentTranscription.finish(); 282 | currentTranscription = undefined; 283 | } 284 | break; 285 | } 286 | } 287 | }; 288 | -------------------------------------------------------------------------------- /backend/scripts/batchTranscriber.ts: -------------------------------------------------------------------------------- 1 | import { 2 | GoogleGenerativeAI, 3 | HarmBlockThreshold, 4 | HarmCategory, 5 | SchemaType, 6 | type GenerationConfig, 7 | type Part, 8 | type UsageMetadata, 9 | } from "@google/generative-ai"; 10 | import { createHash } from "crypto"; 11 | import { uuidv7 } from "uuidv7"; 12 | import { createRoomApi, getRoomConfig, publicApi } from "../src/client"; 13 | 14 | const api = createRoomApi(getRoomConfig()); 15 | 16 | const apiKey = process.env["GEMINI_API_KEY"]!; 17 | const genAI = new GoogleGenerativeAI(apiKey); 18 | const modelName = process.env['GEMINI_MODEL'] === 'pro' ? 
"gemini-1.5-pro-002" : "gemini-1.5-flash-002"; 19 | export const model = genAI.getGenerativeModel({ 20 | model: modelName, 21 | }); 22 | console.log('Using model:', modelName); 23 | 24 | interface HistoryItem { 25 | audio: ArrayBuffer; 26 | transcript: string; 27 | } 28 | 29 | let waiting = false; 30 | 31 | export interface TranscriptionItem { 32 | id: string; 33 | transcript: string; 34 | } 35 | 36 | export async function processAudio( 37 | audio: ArrayBuffer[], 38 | history: HistoryItem[] = [], 39 | prior: string[] = [] 40 | ) { 41 | const generationConfig: GenerationConfig = { 42 | maxOutputTokens: 300, 43 | responseMimeType: "application/json", 44 | responseSchema: { 45 | type: SchemaType.OBJECT, 46 | properties: { 47 | transcription: { 48 | type: SchemaType.ARRAY, 49 | items: { 50 | type: SchemaType.OBJECT, 51 | properties: { 52 | id: { type: SchemaType.STRING }, 53 | transcript: { type: SchemaType.STRING }, 54 | }, 55 | }, 56 | }, 57 | }, 58 | }, 59 | }; 60 | const historyParts: Part[] = [ 61 | { 62 | text: 63 | `You are a professional transcriber.` + 64 | (prior.length > 0 65 | ? ` 66 | For your context, here are the prior transcribed texts: ${JSON.stringify( 67 | prior 68 | )}\n\n` 69 | : "") + 70 | ` 71 | You will be given a series of audio files and their IDs in this format: 72 | 73 | id: 74 | 75 | 76 | Transcribe the speech in each audio file. Follow the style guide when transcribing: 77 | - For English words, if it is a common word, then spell it using lowercase (e.g. oscillator). If it is a proper noun, capitalize it properly (e.g. Google Chrome). If it's an API name or part of computer code, use verbatim capitalization (e.g. getElementById). 78 | - For Thai text, do not add a space between words. Only add spaces between sentences or when there is obvious pausing. 79 | - Add spaces between Thai words and foreign words. 80 | - For English sentences, add punctuation marks as appropriate. For example, add periods at the end of sentences (or a question mark if the speaker is asking a question), and add commas and hyphens where it should be used. Sometimes our speakers are not fluent in English, so please fix the disfluency (such as "um"'s and "uh"'s, stuttering and stammering). Also fix minor grammatical mistakes, for example, "everyone like" should be "everyone likes." (Only fix minor mistakes though!) 81 | - For English sentences, capitalize the first word of the sentence so it is easier to read. 82 | - For technical terms, in general, spell it in English (e.g. canvas, vertex, scene). Only transliterate it to Thai if it is a very common word and commonly spelled in Thai (e.g. ลิงก์, เคส, อัพเกรด, โปรแกรมเมอร์). 83 | - Remove filler words like "umm" and "ah". Also fix the transcript when the speaker corrects themselves or repeats themselves due to stuttering. 84 | - At the end of the audio file there may be beeping sound, do not include it in the transcript. 85 | - If there is no speech, return an empty string for the transcript. 
86 | 87 | Transcribe the following audio files.`, 88 | }, 89 | ]; 90 | const expected: TranscriptionItem[] = []; 91 | for (const item of history) { 92 | const buffer = Buffer.from(item.audio); 93 | const id = createHash("md5").update(buffer).digest("hex").slice(0, 6); 94 | historyParts.push({ text: "id: " + id }); 95 | historyParts.push({ 96 | inlineData: { 97 | mimeType: "audio/x-m4a", 98 | data: buffer.toString("base64"), 99 | }, 100 | }); 101 | expected.push({ id, transcript: item.transcript }); 102 | } 103 | const chatSession = model.startChat({ 104 | generationConfig: generationConfig, 105 | history: [ 106 | { 107 | role: "user", 108 | parts: historyParts, 109 | }, 110 | { 111 | role: "model", 112 | parts: [{ text: JSON.stringify({ transcription: expected }) }], 113 | }, 114 | ], 115 | safetySettings: [ 116 | { 117 | category: HarmCategory.HARM_CATEGORY_HARASSMENT, 118 | threshold: HarmBlockThreshold.BLOCK_NONE, 119 | }, 120 | { 121 | category: HarmCategory.HARM_CATEGORY_HATE_SPEECH, 122 | threshold: HarmBlockThreshold.BLOCK_NONE, 123 | }, 124 | { 125 | category: HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, 126 | threshold: HarmBlockThreshold.BLOCK_NONE, 127 | }, 128 | { 129 | category: HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, 130 | threshold: HarmBlockThreshold.BLOCK_NONE, 131 | }, 132 | ], 133 | }); 134 | 135 | const promptParts: Part[] = []; 136 | const ids: string[] = []; 137 | for (const item of audio) { 138 | const buffer = Buffer.from(item); 139 | const id = createHash("md5").update(buffer).digest("hex").slice(0, 6); 140 | ids.push(id); 141 | promptParts.push({ text: "id: " + id }); 142 | promptParts.push({ 143 | inlineData: { 144 | mimeType: "audio/x-m4a", 145 | data: buffer.toString("base64"), 146 | }, 147 | }); 148 | } 149 | 150 | const result = await chatSession.sendMessageStream(promptParts, { 151 | timeout: 15000, 152 | }); 153 | let usageMetadata: UsageMetadata | undefined; 154 | let text = ""; 155 | let error = ""; 156 | try { 157 | for await (const chunk of result.stream) { 158 | if (chunk.usageMetadata) { 159 | usageMetadata = chunk.usageMetadata; 160 | } 161 | text += chunk.text(); 162 | } 163 | } catch (e: any) { 164 | // Add emoji to signify error 165 | text += "❌"; 166 | console.error("[processAudio]", e); 167 | error = String(e?.stack || e); 168 | // ctx.log("error", { error }); 169 | } 170 | return { usageMetadata, text, error, ids }; 171 | } 172 | 173 | function postProcess(text: string) { 174 | return ( 175 | text 176 | .replace(/ปื๊ด\s*$/, "") 177 | .replace(/ปื้ด\s*$/, "") 178 | .replace(/ปี๊บๆ+\s*$/, "") 179 | .replace(/ๆ(?:ๆ+)\s*$/, "ๆ") 180 | 181 | // Add spaces between Thai words and foreign words. 
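      // e.g. "ใช้Reactทำเว็บ" becomes "ใช้ React ทำเว็บ" (both directions)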
182 | .replace(/([ก-๙])([a-zA-Z0-9])/g, "$1 $2") 183 | .replace(/([a-zA-Z0-9])([ก-๙])/g, "$1 $2") 184 | 185 | .trim() 186 | ); 187 | } 188 | 189 | async function main({ maxMessages }: { maxMessages: number }) { 190 | const list = await api< 191 | { 192 | id: string; 193 | start: string; 194 | finish: string; 195 | length: number; 196 | transcript?: string; 197 | }[] 198 | >(`/items`); 199 | 200 | const validItems = list.filter((item) => item.length > 0); 201 | validItems.sort((a, b) => a.start.localeCompare(b.start)); 202 | 203 | const untranscribed = validItems.filter((item) => item.transcript == null); 204 | if (!untranscribed.length) { 205 | if (!waiting) { 206 | waiting = true; 207 | process.stderr.write("Waiting for transcription..."); 208 | } else { 209 | process.stderr.write("."); 210 | } 211 | return false; 212 | } 213 | if (waiting) { 214 | process.stderr.write("\n"); 215 | waiting = false; 216 | } 217 | const allBefore = validItems.filter( 218 | (item) => item.start < untranscribed[0].start 219 | ); 220 | const before = allBefore.slice(-3); 221 | const prior = allBefore 222 | .slice(0, -3) 223 | .flatMap((r) => (r.transcript ? [r.transcript] : [])) 224 | .slice(-37); 225 | const history = await Promise.all( 226 | before.map(async (item) => { 227 | const audio = await loadAudio(item.id); 228 | return { audio, transcript: item.transcript! }; 229 | }) 230 | ); 231 | const audio = await Promise.all( 232 | untranscribed.slice(0, maxMessages).map((item) => loadAudio(item.id)) 233 | ); 234 | const result = await processAudio(audio, history, prior); 235 | console.debug('Gemini result', result); 236 | let { transcription } = JSON.parse(result.text) as { 237 | transcription: TranscriptionItem[]; 238 | }; 239 | const usageId = uuidv7(); 240 | for (const [i, item] of transcription.entries()) { 241 | if (result.ids[i] !== item.id) { 242 | console.warn( 243 | "Prompt ID mismatch, expected", 244 | item.id, 245 | "but received", 246 | result.ids[i] 247 | ); 248 | continue; 249 | } 250 | const { id } = untranscribed[i]; 251 | const transcript = postProcess(item.transcript); 252 | console.log(`${id} => ${JSON.stringify(transcript)}`); 253 | await api(`/items/${id}`, { 254 | method: "PATCH", 255 | body: { 256 | transcript, 257 | transcriptBy: modelName, 258 | usageMetadata: result.usageMetadata, 259 | usageId: usageId, 260 | }, 261 | }); 262 | } 263 | return true; 264 | } 265 | 266 | async function loadAudio(id: string) { 267 | return publicApi(`/pcm/${id}`, { responseType: "blob" }).then((r) => 268 | r.arrayBuffer() 269 | ); 270 | } 271 | 272 | const initialHp = 5; 273 | let hp = initialHp; 274 | for (;;) { 275 | try { 276 | if (!(await main({ maxMessages: hp }))) { 277 | await new Promise((r) => setTimeout(r, 1000)); 278 | } 279 | if (hp < initialHp) { 280 | hp = initialHp; 281 | console.error('HP has been restored to', hp); 282 | } 283 | } catch (error) { 284 | console.error(error); 285 | hp--; 286 | if (hp <= 0) { 287 | console.error('Giving up'); 288 | process.exit(1); 289 | break; 290 | } else { 291 | console.error('HP has been reduced to', hp); 292 | } 293 | } finally { 294 | await new Promise((r) => setTimeout(r, 100)); 295 | } 296 | } 297 | -------------------------------------------------------------------------------- /src/components/TranscriptViewer.tsx: -------------------------------------------------------------------------------- 1 | import { useStore } from "@nanostores/react"; 2 | import { atom, computed, type WritableAtom } from "nanostores"; 3 | import { ofetch } from "ofetch"; 
4 | import { useEffect, useMemo, useRef, useState } from "react"; 5 | import TextareaAutosize from "react-textarea-autosize"; 6 | import ReconnectingWebSocket from "reconnecting-websocket"; 7 | import type { BackendContext } from "../BackendContext"; 8 | import "./TranscriptViewer.css"; 9 | import { $autoCorrects, $autoScroll } from "./TranscriptViewerKnobs"; 10 | 11 | const $autocorrectables = atom[]>([]); 12 | 13 | const $autoCorrector = computed([$autoCorrects], (autoCorrects) => { 14 | const items = autoCorrects 15 | .split(",") 16 | .map((x) => x.trim()) 17 | .filter((x) => x) 18 | .flatMap((x) => { 19 | const [from, to] = x.split("=>").map((x) => x.trim()); 20 | if (!from || !to) return []; 21 | return [{ from, to }]; 22 | }); 23 | return { 24 | correct: (text: string) => { 25 | let correctedText = text; 26 | for (const { from, to } of items) { 27 | const regex = new RegExp(from, "gi"); 28 | correctedText = correctedText.replace(regex, to); 29 | } 30 | // Add spaces between Thai and non-Thai words. 31 | correctedText = correctedText 32 | .replace(/([ก-๙])([a-zA-Z0-9])/g, "$1 $2") 33 | .replace(/([a-zA-Z0-9])([ก-๙])/g, "$1 $2") 34 | .replace(/ๆ(?!ๆ|\s|$)/g, "ๆ ") 35 | .trim(); 36 | return correctedText; 37 | }, 38 | }; 39 | }); 40 | 41 | export function TranscriptViewer() { 42 | const params = new URLSearchParams(window.location.search); 43 | 44 | const backend = params.get("backend"); 45 | const room = params.get("room"); 46 | const key = params.get("key") || undefined; 47 | if (!backend || !room) { 48 | return Missing parameters; 49 | } 50 | 51 | const backendContext: BackendContext = { backend, room, key }; 52 | return ; 53 | } 54 | 55 | function createViewer(backendContext: BackendContext) { 56 | const ws = new ReconnectingWebSocket( 57 | `${backendContext.backend.replace(/^http/, "ws")}/rooms/${ 58 | backendContext.room 59 | }/publicEvents` 60 | ); 61 | const bufferedPartial = new Map(); 62 | ws.onmessage = async (e) => { 63 | const json = JSON.parse(e.data); 64 | console.log(json); 65 | if (json.method === "updated") { 66 | const state: ItemState = json.params; 67 | const id = state.id; 68 | const item = $items.get().find((item) => item.id === id); 69 | if (item) { 70 | item.$state.set(state); 71 | } else { 72 | $items.set([ 73 | ...$items.get(), 74 | { 75 | id, 76 | $state: atom(state), 77 | $partial: atom(bufferedPartial.get(id) || undefined), 78 | }, 79 | ]); 80 | } 81 | } else if (json.method === "partial_transcript") { 82 | const id = json.params.id; 83 | const item = $items.get().find((item) => item.id === id); 84 | bufferedPartial.set(id, json.params.transcript); 85 | if (item) { 86 | item.$partial.set(json.params.transcript); 87 | } 88 | } 89 | }; 90 | const $items = atom([]); 91 | async function init() { 92 | const items = await ofetch( 93 | `${backendContext.backend}/rooms/${backendContext.room}/items` 94 | ); 95 | $items.set( 96 | items.map((item): ViewerTranscriptItem => { 97 | return { 98 | id: item.id, 99 | $state: atom(item), 100 | $partial: atom(), 101 | }; 102 | }) 103 | ); 104 | } 105 | init(); 106 | return { 107 | $items, 108 | editable: !!backendContext.key, 109 | async updateTranscript(id: string, transcript: string) { 110 | await ofetch( 111 | `${backendContext.backend}/rooms/${backendContext.room}/items/${id}`, 112 | { 113 | method: "PATCH", 114 | body: JSON.stringify({ transcript, transcriptBy: "manual" }), 115 | headers: { 116 | "Content-Type": "application/json", 117 | Authorization: `Bearer ${backendContext.key}`, 118 | }, 119 | } 120 | ); 121 | }, 122 
    getAudioUrl(id: string) {
      return `${backendContext.backend}/pcm/${id}`;
    },
  };
}

interface ItemState {
  id: string;
  start: string;
  finish: string;
  length: number;
  transcript?: string;
}

interface ViewerTranscriptItem {
  id: string;
  $state: WritableAtom<ItemState>;
  $partial: WritableAtom<string | undefined>;
}

type Viewer = ReturnType<typeof createViewer>;
let _viewer: Viewer | undefined;

function TranscriptViewerView(props: { backendContext: BackendContext }) {
  const viewer = (_viewer ??= createViewer(props.backendContext));
  const items = useStore(viewer.$items);
  return (
    <div>
      <h1>Transcript for room {props.backendContext.room}</h1>
      {items.map((item) => {
        return (
          <TranscriptItem
            key={item.id}
            start={formatTime(new Date(item.$state.get().start))}
            item={item}
            viewer={viewer}
          />
        );
      })}
      <TranscriptViewerOptions viewer={viewer} />
    </div>
  );
}

const scroller = (() => {
  let toScroll = 0;
  let timeout: number | undefined;
  return {
    scrollBy(v: number) {
      console.log(v);
      if (!timeout) {
        timeout = setTimeout(() => {
          const amount = toScroll;
          toScroll = 0;
          timeout = undefined;
          if (amount < 0) return;
          smoothScroll(amount);
        }, 120) as unknown as number;
      }
      toScroll = Math.max(v, toScroll);
    },
  };
})();

function smoothScroll(amount: number) {
  console.log(amount);
  let last = 0;
  let current = 0;
  amount = Math.round(amount);
  const frame = () => {
    current += (amount - current) / 5;
    const nextValue = Math.round(current);
    if (nextValue > last) {
      window.scrollBy({
        top: nextValue - last,
        behavior: "instant",
      });
      last = nextValue;
    }
    if (nextValue < amount) {
      requestAnimationFrame(frame);
    }
  };
  requestAnimationFrame(frame);
}

function TranscriptItem(props: {
  start: string;
  item: ViewerTranscriptItem;
  viewer: Viewer;
}) {
  const div = useRef<HTMLDivElement>(null);
  const text = useRef<HTMLSpanElement>(null);
  const { item, viewer } = props;
  const state = useStore(item.$state);
  const partial = useStore(item.$partial);
  const [isEditing, setIsEditing] = useState<false | { width: number }>(false);
  const transcribed = state.transcript != null;
  const [wasUntranscribed] = useState(!transcribed);
  const corrector = useStore($autoCorrector);
  const corrected = useMemo(() => {
    if (!state.transcript || !viewer.editable) return state.transcript;
    return corrector.correct(state.transcript);
  }, [corrector, state.transcript, viewer.editable]);

  const needsCorrection = corrected !== state.transcript;
  const autoCorrectableAdded = useRef(false);
  useEffect(() => {
    if (needsCorrection && !autoCorrectableAdded.current) {
      $autocorrectables.set([...$autocorrectables.get(), div]);
      autoCorrectableAdded.current = true;
    } else if (!needsCorrection && autoCorrectableAdded.current) {
      $autocorrectables.set($autocorrectables.get().filter((x) => x !== div));
      autoCorrectableAdded.current = false;
    }
  }, [needsCorrection]);
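  // When an item first gains a transcript, scroll the page so the new line
  // sits just above the bottom of the viewport (minus ~140px of reserved
  // space), unless the operator is currently editing a textarea.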
  useEffect(() => {
    if (transcribed && wasUntranscribed && div.current && $autoScroll.get()) {
      const clientRect = div.current.getBoundingClientRect();
      // Do not scroll if focusing on a text area.
      if (document.activeElement instanceof HTMLTextAreaElement) return;
      scroller.scrollBy(
        clientRect.top + clientRect.height - (window.innerHeight - 140)
      );
    }
  }, [transcribed, wasUntranscribed]);

  const handleClick = (e: React.MouseEvent) => {
    if (e.altKey && needsCorrection && corrected) {
      viewer.updateTranscript(item.id, corrected);
      return;
    }
    if (viewer.editable && state.finish && !isEditing) {
      const width = text.current?.offsetWidth;
      setIsEditing({ width: width == null ? 0 : width + 2 });
    }
  };

  const handleSave = (newTranscript: string) => {
    viewer.updateTranscript(item.id, newTranscript);
    setIsEditing(false);
  };

  const handleCancel = () => {
    setIsEditing(false);
  };

  const listen = () => {
    const myWindow = window as { currentAudio?: HTMLAudioElement };
    const audio = (myWindow.currentAudio ??= new Audio());
    const src = viewer.getAudioUrl(item.id);
    if (!audio.paused && audio.src === src) {
      audio.pause();
    } else {
      document.body.append(audio);
      audio.src = src;
      audio.load();
      audio.currentTime = 0;
      audio.play();
    }
    const textarea = div.current?.querySelector("textarea");
    if (textarea) textarea.focus();
  };

  return (
    <div ref={div} onClick={handleClick}>
      {isEditing ? (
        <>
          <EditableTranscript
            initialValue={state.transcript ?? ""}
            width={isEditing.width}
            onSave={handleSave}
            onCancel={handleCancel}
          />
          <button onClick={listen}>👂</button>{" "}
          <button onClick={handleCancel}>❌</button>
        </>
      ) : (
        <span ref={text}>
          {state.transcript ?? (
            <span style={{ opacity: 0.5 }}>{partial ?? "…"}</span>
          )}{" "}
          <small>{props.start}</small>
        </span>
      )}
    </div>
  );
}

interface EditableTranscriptProps {
  initialValue: string;
  width: number;
  onSave: (newTranscript: string) => void;
  onCancel: () => void;
}

function formatTime(date: Date) {
  return `${date.getHours()}:${String(date.getMinutes()).padStart(2, "0")}`;
}

function EditableTranscript({
  initialValue,
  onSave,
  onCancel,
  width,
}: EditableTranscriptProps) {
  const [value, setValue] = useState(initialValue);

  const handleKeyDown = (e: React.KeyboardEvent) => {
    if (e.key === "Enter" && !e.shiftKey) {
      e.preventDefault();
      onSave(value);
    } else if (e.key === "Escape") {
      e.preventDefault();
      onCancel();
    }
  };

  return (
    <TextareaAutosize
      value={value}
      onChange={(e) => setValue(e.target.value)}
      onKeyDown={handleKeyDown}
      autoFocus
      style={{
        width: width || "100%",
        border: "none",
        outline: "none",
        resize: "none",
        padding: "0",
        fontFamily: "inherit",
        fontSize: "inherit",
        letterSpacing: "inherit",
        backgroundColor: "transparent",
      }}
    />
  );
}

function TranscriptViewerOptions({ viewer }: { viewer: Viewer }) {
  const autoScroll = useStore($autoScroll);
  const toCorrect = useStore($autocorrectables).length;

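  // Keyboard shortcuts: "s" toggles auto-scroll; "x" clicks the Autocorrect
  // button below, scrolling the first pending correction into view.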
  useEffect(() => {
    const onKeyDown = (e: KeyboardEvent) => {
      // Do not process keydown events when editing a text area.
      if (document.activeElement instanceof HTMLTextAreaElement) return;
      console.log(e.key);
      if (e.key === "s") {
        $autoScroll.set(!$autoScroll.get());
      }
      if (e.key === "x") {
        document.querySelector<HTMLElement>("#autoCorrectables")?.click();
      }
    };
    window.addEventListener("keydown", onKeyDown);
    return () => {
      window.removeEventListener("keydown", onKeyDown);
    };
  }, []);

  return (
    <div>
      <label>
        <input
          type="checkbox"
          checked={autoScroll}
          onChange={() => $autoScroll.set(!autoScroll)}
        />{" "}
        Auto-scroll
      </label>
      {viewer.editable && (
        <button
          id="autoCorrectables"
          onClick={(e) => {
            if (e.altKey) {
              const before = $autoCorrects.get();
              const after = prompt("Autocorrects", before);
              if (after != null) {
                $autoCorrects.set(after);
              }
            } else {
              $autocorrectables
                .get()[0]
                ?.current?.scrollIntoView({ behavior: "instant" });
            }
          }}
        >
          Autocorrect ({toCorrect})
        </button>
      )}
      <button
        onClick={() => {
          const tsvContent = exportTsv(viewer);
          navigator.clipboard.writeText(tsvContent);
        }}
      >
        Copy TSV
      </button>
    </div>
  );
}

function exportTsv(viewer: Viewer) {
  const items = viewer.$items.get();
  const tsvContent = items
    .map((item) => {
      const state = item.$state.get();
      return `${state.start}\t${state.finish}\t${state.transcript || ""}`;
    })
    .join("\n");
  return tsvContent;
}
--------------------------------------------------------------------------------
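A note on the protocol consumed above: /rooms/:room/publicEvents delivers JSON frames of the shape { method, params }, where "updated" carries a full ItemState and "partial_transcript" carries { id, transcript } (shapes inferred from the handler in createViewer; the backend may emit other methods). Under those assumptions, a minimal headless consumer could look like this sketch, using Node 22's global WebSocket; the backend URL and room name are hypothetical placeholders:

// tail-transcript.ts — hypothetical headless consumer of publicEvents.
const backend = process.env.BACKEND ?? "http://localhost:3000"; // placeholder
const room = process.env.ROOM ?? "demo"; // placeholder

const ws = new WebSocket(
  `${backend.replace(/^http/, "ws")}/rooms/${room}/publicEvents`
);
ws.onmessage = (e) => {
  const { method, params } = JSON.parse(String(e.data));
  if (method === "updated" && params.transcript) {
    // A finalized (or manually corrected) utterance.
    console.log(`[${params.start}] ${params.transcript}`);
  } else if (method === "partial_transcript") {
    // In-progress recognition for the utterance `params.id`.
    process.stdout.write(`… ${params.transcript}\n`);
  }
};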
/src/components/AudioSender.tsx:
--------------------------------------------------------------------------------
import { useStore } from "@nanostores/react";
import { encode } from "@stablelib/base64";
import { atom, computed } from "nanostores";
import ReconnectingWebSocket from "reconnecting-websocket";
import type { BackendContext } from "../BackendContext";
import {
  $activationThreshold,
  $deactivationThreshold,
  $decayEasing,
  $maxLength as $maxAudioLength,
  $minimumPeak as $minimumLevel,
} from "../knobs";
import { log } from "../logbus";
import { LogViewer } from "./LogViewer";

import { FrameProcessor, NonRealTimeVAD } from "@ricky0123/vad-web";

async function initVad() {
  const vad = await NonRealTimeVAD.new({
    modelURL:
      "https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.19/dist/silero_vad.onnx",
    ortConfig(ort) {
      ort.env.wasm.wasmPaths =
        "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.19.2/dist/";
    },
  });
  return vad;
}

let audioContext: AudioContext | null = null;
function getAudioContext() {
  return (audioContext ??= new AudioContext({ sampleRate: 16000 }));
}

export function AudioSender() {
  const params = new URLSearchParams(window.location.search);

  const backend = params.get("backend");
  const room = params.get("room");
  const key = params.get("key");
  if (!backend || !room || !key) {
    return <p>Missing parameters</p>;
  }

  const deviceId = params.get("deviceId");
  if (!deviceId) {
    return <AudioDeviceSelector />;
  }

  const backendContext: BackendContext = { backend, room, key };
  return (
    <AudioSenderView deviceId={deviceId} backendContext={backendContext} />
  );
}

const $devices = atom<MediaDeviceInfo[]>([]);

const getDeviceList = async () => {
  try {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    stream.getTracks().forEach((track) => track.stop());

    const deviceList = await navigator.mediaDevices.enumerateDevices();
    const audioInputDevices = deviceList.filter(
      (device) => device.kind === "audioinput"
    );

    $devices.set(audioInputDevices);
  } catch (error) {
    console.error("Error getting device list:", error);
  }
};

function AudioDeviceSelector() {
  const devices = useStore($devices);
  return (
    <>
      <h1>Select device</h1>
      <p>
        <button onClick={getDeviceList}>
          {devices.length > 0 ? "Refresh" : "Get"} device list
        </button>
      </p>
      <ul>
        {devices.map((device) => (
          <li key={device.deviceId}>
            <a href={`${location.search}&deviceId=${device.deviceId}`}>
              {device.label}
            </a>
          </li>
        ))}
      </ul>
    </>
  );
}

function createAudioSenderController(options: {
  backendContext: BackendContext;
  deviceId: string;
  log: (message: string) => void;
}) {
  const { log, backendContext } = options;
  const $level = atom(0);
  const $realMax = atom(0);
  const $effectiveMax = computed(
    [$realMax, $minimumLevel],
    (realMax, minimumLevel) => Math.max(realMax, minimumLevel / 100)
  );
  const $current = atom(0);
  const $active = atom<string | null>(null);
  const $socketStatus = atom<"disconnected" | "authenticating" | "connected">(
    "disconnected"
  );
  const unackedMessages = new Map<string, string>();
  const $pendingEventCount = atom(0);
  const $started = atom(false);

  const vadPromise = initVad();
  const $vad = atom<NonRealTimeVAD | null>(null);
  vadPromise.then((vad) => {
    $vad.set(vad);
    // With the VAD, `level` is a speech probability (0..1), so pin the
    // reference max at 1 (= 100 / 100) and switch to probability thresholds.
    $minimumLevel.set(100);
    $activationThreshold.set(0.5);
    $deactivationThreshold.set(0.35);
  });

  let currentBlockCount = 0;
  type SocketEvent =
    | { method: "start"; params: { localTime: string } }
    | { method: "audio"; params: { data: string } }
    | { method: "stop" };
  let onEvent: (event: SocketEvent) => void = () => {};

  async function start() {
    if ($started.get()) return;
    $started.set(true);
    await Promise.all([startAudio(), startWebsocket()]);
  }

  async function startAudio() {
    const audioContext = getAudioContext();

    const workletCode = `
      class AudioSenderProcessor extends AudioWorkletProcessor {
        constructor() {
          super();
          this.buffer = new Float32Array(1024);
          this.bufferIndex = 0;
        }

        process(inputs) {
          const input = inputs[0];
          if (input.length > 0) {
            const inputData = input[0];
            for (let i = 0; i < inputData.length; i++) {
              this.buffer[this.bufferIndex++] = inputData[i];

              if (this.bufferIndex === this.buffer.length) {
                const outputData = new Int16Array(this.buffer.length);
                for (let j = 0; j < this.buffer.length; j++) {
                  const s = Math.max(-1, Math.min(1, this.buffer[j]));
                  outputData[j] = s < 0 ? s * 0x8000 : s * 0x7FFF;
                }
                this.port.postMessage(outputData.buffer, [outputData.buffer]);
                this.bufferIndex = 0;
              }
            }
          }
          return true;
        }
      }
      registerProcessor('audio-sender-processor', AudioSenderProcessor);
    `;
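    // The worklet above batches mic input into 1024-sample blocks (64 ms at
    // 16 kHz) and converts float samples to 16-bit PCM before posting each
    // buffer to the main thread.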
    try {
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          deviceId: { exact: options.deviceId },
          echoCancellation: false,
          noiseSuppression: false,
          autoGainControl: false,
          channelCount: 1,
          sampleRate: 16000,
        },
      });

      // Add the AudioWorklet module
      const blob = new Blob([workletCode], { type: "application/javascript" });
      const workletUrl = URL.createObjectURL(blob);
      await audioContext.audioWorklet.addModule(workletUrl);

      const source = audioContext.createMediaStreamSource(stream);
      const workletNode = new AudioWorkletNode(
        audioContext,
        "audio-sender-processor"
      );
      source.connect(workletNode);

      workletNode.port.onmessage = async (event) => {
        const data = new Int16Array(event.data);
        let level = 0;
        if ($vad.get()) {
          const vad = $vad.get()!;
          // Convert Int16Array to Float32Array
          const floatData = new Float32Array(data.length);
          for (let i = 0; i < data.length; i++) {
            floatData[i] = data[i] / 32768;
          }
          const result = await (
            vad.frameProcessor as FrameProcessor
          ).modelProcessFunc(floatData);
          level = result.isSpeech;
        } else {
          // Calculate RMS
          let sum = 0;
          for (let i = 0; i < data.length; i++) {
            sum += (data[i] / 32768) ** 2;
          }
          level = Math.sqrt(sum / data.length) * Math.sqrt(2);
        }
        $level.set(level);
        // Track a slowly-decaying peak; the activation thresholds are
        // relative to this peak (or to the configured minimum, if higher).
        if (level > $realMax.get()) {
          $realMax.set(level);
        } else {
          $realMax.set($realMax.get() * 0.995);
        }
        if (level > $current.get()) {
          $current.set(level);
        } else if ($active.get()) {
          // While active, let the envelope decay faster as the utterance
          // approaches the maximum length, eventually forcing a cutoff.
          const maxSamples = $maxAudioLength.get() * 16000;
          const maxBlocks = maxSamples / 1024;
          const progress =
            Math.min(1, currentBlockCount / maxBlocks) ** $decayEasing.get();
          const decayRate = 0.99 - progress * 0.5;
          $current.set($current.get() * decayRate);
        } else {
          $current.set(level);
        }
        if (!$active.get()) {
          const threshold = $effectiveMax.get() * $activationThreshold.get();
          if ($current.get() > threshold) {
            const id = `au${Date.now()}`;
            $active.set(id);
            currentBlockCount = 0;
            onEvent({
              method: "start",
              params: { localTime: new Date().toISOString() },
            });
            log(`Utterance started`);
          }
        } else if ($active.get()) {
          const threshold = $effectiveMax.get() * $deactivationThreshold.get();
          if ($current.get() < threshold) {
            $active.set(null);
            onEvent({ method: "stop" });
            const samples = currentBlockCount * 1024;
            const duration = samples / 16000;
            log(`Utterance finished, duration: ${duration.toFixed(2)}s`);
          } else {
            currentBlockCount++;
          }
        }
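        // Only audio captured while an utterance is active is sent; silence
        // between utterances never leaves the page.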
        if ($active.get()) {
          // Convert data into base64-encoded string.
          const base64 = encode(new Uint8Array(event.data));
          onEvent({ method: "audio", params: { data: base64 } });
        }
      };
    } catch (error) {
      options.log(`Error in audio sender: ${error}`);
    }
  }

  async function startWebsocket() {
    const { backend, room, key } = backendContext;
    const socket = new ReconnectingWebSocket(
      `${backend.replace(/^http/, "ws")}/rooms/${room}/audioIngest?key=${key}`
    );
    socket.onopen = () => {
      log("WebSocket connected");
      $socketStatus.set("authenticating");
    };
    socket.onmessage = (event) => {
      const data = JSON.parse(event.data);
      if (data.method === "welcome") {
        log("Received welcome message");
        $socketStatus.set("connected");
        // Re-send anything the server did not acknowledge before the last
        // disconnect.
        for (const message of unackedMessages.values()) {
          socket.send(message);
        }
      }
      if (data.id && unackedMessages.has(data.id)) {
        unackedMessages.delete(data.id);
        $pendingEventCount.set(unackedMessages.size);
      }
    };
    onEvent = (event) => {
      const id = crypto.randomUUID();
      const payload = JSON.stringify({ id, ...event });
      socket.send(payload);
      unackedMessages.set(id, payload);
      $pendingEventCount.set(unackedMessages.size);
    };
    socket.onclose = (event) => {
      log(`WebSocket disconnected: ${event.reason}`);
      $socketStatus.set("disconnected");
    };
  }

  return {
    $level,
    $max: $effectiveMax,
    $current,
    $active,
    start,
    $pendingEventCount,
    $started,
    $socketStatus,
  };
}

const levelToDb = (level: number) => 20 * Math.log10(level);
const levelToX = (level: number) => {
  const db = levelToDb(level);
  const x = Math.max(0, Math.min(1, (db + 100) / 100));
  return x;
};

type AudioSenderController = ReturnType<typeof createAudioSenderController>;
let _sender: AudioSenderController | undefined;

function AudioSenderView(props: {
  deviceId: string;
  backendContext: BackendContext;
}) {
  const sender = (_sender ??= createAudioSenderController({
    backendContext: props.backendContext,
    deviceId: props.deviceId,
    log: log,
  }));
  return (
    <>
      <StartButton sender={sender} />
      <LevelMeter sender={sender} />
      <Knobs />
      <StatusInspector sender={sender} />
      <LogViewer />
    </>
  );
}

function StartButton(props: { sender: AudioSenderController }) {
  const sender = props.sender;
  const started = useStore(sender.$started);
  return (
    <p>
      <button disabled={started} onClick={() => sender.start()}>
        Start
      </button>
    </p>
  );
}

function StatusInspector(props: { sender: AudioSenderController }) {
  const status = useStore(props.sender.$socketStatus);
  const count = useStore(props.sender.$pendingEventCount);
  return (
    <p>
      Socket status: {status}
      <br />
      Pending events: {count}
    </p>
  );
}

function LevelMeter(props: { sender: AudioSenderController }) {
  const level = useStore(props.sender.$level);
  const max = useStore(props.sender.$max);
  const current = useStore(props.sender.$current);
  const active = useStore(props.sender.$active);
  const threshold =
    max * (active ? $deactivationThreshold.get() : $activationThreshold.get());
  return (
    <div style={{ position: "relative", height: "24px", background: "#222" }}>
      <div
        style={{
          position: "absolute",
          inset: 0,
          width: `${levelToX(level) * 100}%`,
          background: active ? "#385" : "#456",
        }}
      />
      <div
        style={{
          position: "absolute",
          top: 0,
          bottom: 0,
          left: `${levelToX(current) * 100}%`,
          width: "2px",
          background: "#fff",
        }}
      />
      <div
        style={{
          position: "absolute",
          top: 0,
          bottom: 0,
          left: `${levelToX(threshold) * 100}%`,
          width: "2px",
          background: "#fa0",
        }}
      />
      <div
        style={{
          position: "absolute",
          top: 0,
          bottom: 0,
          left: `${levelToX(max) * 100}%`,
          width: "2px",
          background: "#f55",
        }}
      />
    </div>
  );
}

function Knobs() {
  return (
    <div>
      <NumberKnob label="Minimum peak" step="0.1" $value={$minimumLevel} />
      <NumberKnob
        label="Activation threshold"
        step="0.05"
        $value={$activationThreshold}
      />
      <NumberKnob
        label="Deactivation threshold"
        step="0.05"
        $value={$deactivationThreshold}
      />
      <NumberKnob label="Max length" step="1" $value={$maxAudioLength} />
      <NumberKnob label="Decay easing" step="0.05" $value={$decayEasing} />
    </div>
  );
}

function NumberKnob({
  label,
  step,
  $value,
}: {
  label: string;
  step: string;
  $value: any;
}) {
  const value = useStore($value);

  const handleChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    $value.set(parseFloat(e.target.value));
  };

  return (
    <p>
      <label>
        {label}{" "}
        <input
          type="number"
          step={step}
          value={value}
          onChange={handleChange}
        />
      </label>
    </p>
  );
}
--------------------------------------------------------------------------------
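Aside: the audio pipeline above is raw 16-bit mono PCM at 16 kHz end to end, and the viewer plays utterances back from GET /pcm/:id. Assuming that endpoint returns the same raw PCM stream (only the route name is visible from the frontend code), a small script can wrap a saved response in the canonical 44-byte WAV header so it opens in ordinary audio tools. A sketch:

// pcm2wav.ts — hypothetical helper: wrap raw 16-bit/16 kHz/mono PCM in a
// WAV container. Usage: node pcm2wav.ts input.pcm output.wav
import { readFileSync, writeFileSync } from "node:fs";

const [input, output] = process.argv.slice(2);
const pcm = readFileSync(input);

const sampleRate = 16000; // matches the AudioContext above (assumed for /pcm)
const channels = 1;
const bytesPerSample = 2;

const header = Buffer.alloc(44);
header.write("RIFF", 0);
header.writeUInt32LE(36 + pcm.length, 4);
header.write("WAVE", 8);
header.write("fmt ", 12);
header.writeUInt32LE(16, 16); // fmt chunk size
header.writeUInt16LE(1, 20); // audio format: PCM
header.writeUInt16LE(channels, 22);
header.writeUInt32LE(sampleRate, 24);
header.writeUInt32LE(sampleRate * channels * bytesPerSample, 28); // byte rate
header.writeUInt16LE(channels * bytesPerSample, 32); // block align
header.writeUInt16LE(8 * bytesPerSample, 34); // bits per sample
header.write("data", 36);
header.writeUInt32LE(pcm.length, 40);

writeFileSync(output, Buffer.concat([header, pcm]));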
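And the ingest side, as implied by createAudioSenderController: the client connects to /rooms/:room/audioIngest?key=…, waits for a { method: "welcome" } frame, then sends { id, method, params } events (start, audio, stop), treating any frame echoing an id as an acknowledgement and re-sending unacknowledged events after reconnects. A minimal smoke-test sender under those assumptions (Node 22 global WebSocket; URL, room, and key are placeholders):

// send-silence.ts — hypothetical smoke test for the audioIngest endpoint.
const backend = process.env.BACKEND ?? "ws://localhost:3000"; // placeholder
const room = process.env.ROOM ?? "demo"; // placeholder
const key = process.env.KEY ?? "";

const ws = new WebSocket(`${backend}/rooms/${room}/audioIngest?key=${key}`);
const pending = new Map<string, string>();

function send(method: string, params?: unknown) {
  const id = crypto.randomUUID();
  const payload = JSON.stringify({ id, method, params });
  pending.set(id, payload);
  ws.send(payload);
}

ws.onmessage = (e) => {
  const data = JSON.parse(String(e.data));
  if (data.method === "welcome") {
    send("start", { localTime: new Date().toISOString() });
    // One second of 16 kHz silence, sent in the sender's 1024-sample blocks.
    const block = Buffer.alloc(1024 * 2).toString("base64");
    for (let i = 0; i < 16; i++) send("audio", { data: block });
    send("stop");
  } else if (data.id) {
    pending.delete(data.id); // acknowledged by the server
    if (pending.size === 0) ws.close();
  }
};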