├── .mise.toml
├── .npmrc
├── src
│   ├── env.d.ts
│   ├── BackendContext.tsx
│   ├── knobs.ts
│   ├── pages
│   │   ├── sender.astro
│   │   ├── view.astro
│   │   └── index.astro
│   ├── components
│   │   ├── TranscriptViewerKnobs.ts
│   │   ├── LogViewer.tsx
│   │   ├── TranscriptViewer.css
│   │   ├── TranscriptViewer.tsx
│   │   └── AudioSender.tsx
│   ├── logbus.ts
│   └── layouts
│       └── Layout.astro
├── .vscode
│   ├── extensions.json
│   └── launch.json
├── tsconfig.json
├── astro.config.mjs
├── backend
│   ├── scripts
│   │   ├── TEST_getItems.ts
│   │   ├── createRoom.ts
│   │   ├── whisperTranslator.ts
│   │   ├── server.ts
│   │   ├── partialTranscriber.ts
│   │   └── batchTranscriber.ts
│   └── src
│       ├── room.ts
│       ├── publicBroadcast.ts
│       ├── db.ts
│       ├── client.ts
│       ├── pubsub.ts
│       ├── utterance.ts
│       ├── itemOperations.ts
│       └── persistence.ts
├── .gitignore
├── public
│   └── favicon.svg
├── package.json
└── README.md

/.mise.toml:
--------------------------------------------------------------------------------
1 | [tools]
2 | node = "22.10.0"
--------------------------------------------------------------------------------
/.npmrc:
--------------------------------------------------------------------------------
1 | @jsr:registry=https://npm.jsr.io
--------------------------------------------------------------------------------
/src/env.d.ts:
--------------------------------------------------------------------------------
1 | /// <reference types="astro/client" />
--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 |   "recommendations": ["astro-build.astro-vscode"],
3 |   "unwantedRecommendations": []
4 | }
5 | 
--------------------------------------------------------------------------------
/src/BackendContext.tsx:
--------------------------------------------------------------------------------
1 | export interface BackendContext {
2 |   backend: string;
3 |   room: string;
4 |   key?: string;
5 | }
6 | 
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |   "extends": "astro/tsconfigs/strict",
3 |   "compilerOptions": {
4 |     "jsx": "react-jsx",
5 |     "jsxImportSource": "react"
6 |   }
7 | }
--------------------------------------------------------------------------------
/astro.config.mjs:
--------------------------------------------------------------------------------
1 | import { defineConfig } from 'astro/config';
2 | 
3 | import react from "@astrojs/react";
4 | 
5 | // https://astro.build/config
6 | export default defineConfig({
7 |   integrations: [react()]
8 | });
--------------------------------------------------------------------------------
/backend/scripts/TEST_getItems.ts:
--------------------------------------------------------------------------------
1 | import { getItems } from "../src/itemOperations";
2 | import { Room } from "../src/room";
3 | 
4 | console.log(await getItems(new Room("019296a2-3c00-7b5c-8913-6cfad0b97093")));
5 | 
--------------------------------------------------------------------------------
/backend/src/room.ts:
--------------------------------------------------------------------------------
1 | export class Room {
2 |   constructor(public name: string) {}
3 |   get audioTopic() {
4 |     return `${this.name}/audio`;
5 |   }
6 |   get publicTopic() {
7 |     return `${this.name}/public`;
8 |   }
9 | }
10 | 
--------------------------------------------------------------------------------
/.vscode/launch.json:
-------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "command": "./node_modules/.bin/astro dev", 6 | "name": "Development server", 7 | "request": "launch", 8 | "type": "node-terminal" 9 | } 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /src/knobs.ts: -------------------------------------------------------------------------------- 1 | import { atom } from "nanostores"; 2 | 3 | export const $minimumPeak = atom(2.5); 4 | export const $activationThreshold = atom(0.25); 5 | export const $deactivationThreshold = atom(0.2); 6 | export const $maxLength = atom(10); 7 | export const $decayEasing = atom(1.25); 8 | -------------------------------------------------------------------------------- /src/pages/sender.astro: -------------------------------------------------------------------------------- 1 | --- 2 | import { AudioSender } from "../components/AudioSender"; 3 | import Layout from "../layouts/Layout.astro"; 4 | --- 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/pages/view.astro: -------------------------------------------------------------------------------- 1 | --- 2 | import { TranscriptViewer } from "../components/TranscriptViewer"; 3 | import Layout from "../layouts/Layout.astro"; 4 | --- 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/pages/index.astro: -------------------------------------------------------------------------------- 1 | --- 2 | import Layout from "../layouts/Layout.astro"; 3 | --- 4 | 5 | 6 | 7 | Welcome to Live Speech frontend... 8 | 9 | For more information, check out the{ 10 | " " 11 | }https://github.com/dtinth/live-speech 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # build output 2 | dist/ 3 | 4 | # generated types 5 | .astro/ 6 | 7 | # dependencies 8 | node_modules/ 9 | 10 | # logs 11 | npm-debug.log* 12 | yarn-debug.log* 13 | yarn-error.log* 14 | pnpm-debug.log* 15 | 16 | # environment variables 17 | .env 18 | .env.production 19 | 20 | # macOS-specific files 21 | .DS_Store 22 | 23 | # jetbrains setting folder 24 | .idea/ 25 | pb_data/ 26 | .data 27 | outputs -------------------------------------------------------------------------------- /backend/src/publicBroadcast.ts: -------------------------------------------------------------------------------- 1 | import { uuidv7 } from "uuidv7"; 2 | import { db } from "./db"; 3 | import { pubsub } from "./pubsub"; 4 | import type { Room } from "./room"; 5 | 6 | export function publicBroadcast(room: Room, method: string, params: any) { 7 | pubsub.publish(room.publicTopic, method, params); 8 | db.roomLogs(room).set(uuidv7(), { 9 | time: new Date().toISOString(), 10 | method, 11 | params, 12 | }); 13 | } 14 | -------------------------------------------------------------------------------- /src/components/TranscriptViewerKnobs.ts: -------------------------------------------------------------------------------- 1 | import { atom } from "nanostores"; 2 | 3 | export const $autoScroll = atom(true); 4 | export const $autoCorrects = atom("โมนัด=>monad"); 5 | 6 | // Save $autoCorrects to sessionStorage 7 | $autoCorrects.subscribe((value) => { 8 | sessionStorage.setItem("autoCorrects", value); 9 | }); 10 | 
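// Restore the persisted value (if any) when this module is first loaded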
if (sessionStorage.getItem("autoCorrects")) {
11 |   $autoCorrects.set(sessionStorage.getItem("autoCorrects")!);
12 | }
13 | 
--------------------------------------------------------------------------------
/src/logbus.ts:
--------------------------------------------------------------------------------
1 | const logListeners = new Set<LogListener>();
2 | 
3 | export interface LogListener {
4 |   onLog(message: string): void;
5 | }
6 | 
7 | export function addLogListener(listener: LogListener) {
8 |   logListeners.add(listener);
9 |   return () => {
10 |     logListeners.delete(listener);
11 |   };
12 | }
13 | 
14 | export function log(message: string) {
15 |   for (const listener of logListeners) {
16 |     listener.onLog(message);
17 |   }
18 | }
19 | 
--------------------------------------------------------------------------------
/public/favicon.svg:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
9 | 
10 | 
--------------------------------------------------------------------------------
/backend/src/db.ts:
--------------------------------------------------------------------------------
1 | import { mkdirSync } from "fs";
2 | import { Partition, Persistence } from "./persistence";
3 | import type { Room } from "./room";
4 | 
5 | mkdirSync(".data", { recursive: true });
6 | const persistence = new Persistence(".data/database.sqlite");
7 | 
8 | export const db = {
9 |   get audio(): Partition {
10 |     return persistence.getPartition("audio");
11 |   },
12 |   get rooms(): Partition {
13 |     return persistence.getPartition("rooms");
14 |   },
15 |   roomItems(room: Room): Partition {
16 |     return persistence.getPartition(`room_${room.name}`);
17 |   },
18 |   roomPartials(room: Room): Partition {
19 |     return persistence.getPartition(`partials_${room.name}`);
20 |   },
21 |   roomLogs(room: Room): Partition {
22 |     return persistence.getPartition(`logs_${room.name}`);
23 |   },
24 | };
25 | 
--------------------------------------------------------------------------------
/src/layouts/Layout.astro:
--------------------------------------------------------------------------------
1 | ---
2 | interface Props {
3 |   title: string;
4 | }
5 | 
6 | const { title } = Astro.props;
7 | ---
8 | 
9 | 
10 | 
11 | 
12 | 
13 | 
14 | 
15 | 
16 | 
17 | {title}
18 | 
24 | 
25 | 
26 | 
27 | 
28 | 
29 | 
--------------------------------------------------------------------------------
/backend/scripts/createRoom.ts:
--------------------------------------------------------------------------------
1 | import chalk from "chalk";
2 | import { adminApi } from "../src/client";
3 | 
4 | const roomInfo = await adminApi<{
5 |   roomId: string;
6 |   roomKey: string;
7 | }>("/admin/rooms", { method: "POST" });
8 | 
9 | const webUrl = process.env["FRONTEND_URL_BASE"];
10 | const backendUrl = process.env["SERVER_URL_BASE"];
11 | 
12 | console.log(JSON.stringify(roomInfo, null, 2));
13 | 
14 | console.log(`
15 | ${chalk.yellow.bold("Viewer URL:")}
16 | ${webUrl}/view?backend=${backendUrl}&room=${roomInfo.roomId}
17 | 
18 | ${chalk.yellow.bold("Editor URL:")}
19 | ${webUrl}/view?backend=${backendUrl}&room=${roomInfo.roomId}&key=${
20 |   roomInfo.roomKey
21 | }
22 | 
23 | ${chalk.yellow.bold("Audio Sender URL:")}
24 | ${webUrl}/sender?backend=${backendUrl}&room=${roomInfo.roomId}&key=${
25 |   roomInfo.roomKey
26 | }
27 | 
28 | ${chalk.bold("env:")}
29 | SERVER_URL_BASE=${backendUrl}
30 | ROOM_ID=${roomInfo.roomId}
31 | ROOM_KEY=${roomInfo.roomKey}
32 | `);
33 | 
--------------------------------------------------------------------------------
/backend/src/client.ts:
--------------------------------------------------------------------------------
1 | import { ofetch } from "ofetch";
2 | 
3 | export const publicApi = ofetch.create({
4 |   baseURL: process.env["SERVER_URL_BASE"],
5 | });
6 | 
7 | export const adminApi = publicApi.create({
8 |   headers: {
9 |     authorization: `Bearer ${process.env["SERVICE_TOKEN"]}`,
10 |   },
11 | });
12 | 
13 | export function getRoomConfig(): RoomConfig {
14 |   const roomId = process.env["ROOM_ID"];
15 |   const roomKey = process.env["ROOM_KEY"];
16 | 
17 |   if (!roomId) {
18 |     throw new Error("Missing ROOM_ID");
19 |   }
20 |   if (!roomKey) {
21 |     throw new Error("Missing ROOM_KEY");
22 |   }
23 | 
24 |   return { roomId, roomKey };
25 | }
26 | 
27 | export type RoomConfig = { roomId: string; roomKey: string };
28 | 
29 | export function createRoomApi({ roomId, roomKey }: RoomConfig) {
30 |   return publicApi.create({
31 |     headers: {
32 |       authorization: `Bearer ${roomKey}`,
33 |     },
34 |     baseURL: `${process.env["SERVER_URL_BASE"]}/rooms/${roomId}`,
35 |   });
36 | }
37 | 
--------------------------------------------------------------------------------
/backend/src/pubsub.ts:
--------------------------------------------------------------------------------
1 | type Listener = (message: string) => void;
2 | 
3 | class PubSub {
4 |   private listenerSetMap = new Map<string, Set<Listener>>();
5 | 
6 |   getListenerSet(channel: string): Set<Listener> {
7 |     if (!this.listenerSetMap.has(channel)) {
8 |       this.listenerSetMap.set(channel, new Set());
9 |     }
10 |     return this.listenerSetMap.get(channel)!;
11 |   }
12 | 
13 |   subscribe(channel: string, listener: Listener): () => void {
14 |     const listeners = this.getListenerSet(channel);
15 |     listeners.add(listener);
16 |     return () => {
17 |       listeners.delete(listener);
18 |     };
19 |   }
20 | 
21 |   publish(channel: string, method: string, params: any): void {
22 |     const payload = JSON.stringify({ method, params });
23 |     const listeners = this.getListenerSet(channel);
24 |     for (const listener of listeners) {
25 |       try {
26 |         listener(payload);
27 |       } catch (error) {
28 |         console.error(error);
29 |       }
30 |     }
31 |   }
32 | }
33 | 
34 | export const pubsub = new PubSub();
35 | 
--------------------------------------------------------------------------------
/backend/src/utterance.ts:
--------------------------------------------------------------------------------
1 | import { uuidv7 } from "uuidv7";
2 | import { db } from "./db";
3 | import { updateItem } from "./itemOperations";
4 | import { pubsub } from "./pubsub";
5 | import type { Room } from "./room";
6 | 
7 | export class Utterance {
8 |   id = uuidv7();
9 |   start = new Date().toISOString();
10 |   buffers: Buffer[] = [];
11 | 
12 |   constructor(public room: Room, localTime: string) {
13 |     pubsub.publish(room.audioTopic, "audio_start", { id: this.id });
14 |     updateItem(room, this.id, { start: this.start, startLocalTime: localTime });
15 |   }
16 |   addAudio(base64: string) {
17 |     this.buffers.push(Buffer.from(base64, "base64"));
18 |     pubsub.publish(this.room.audioTopic, "audio_data", { id: this.id, base64 });
19 |   }
20 |   async finish() {
21 |     pubsub.publish(this.room.audioTopic, "audio_finish", { id: this.id });
22 |     const buffer = Buffer.concat(this.buffers);
23 |     await db.audio.set(this.id, buffer.toString("base64"));
24 |     await updateItem(this.room, this.id, {
25 |       finish: new Date().toISOString(),
26 |       length: buffer.length,
27 |     });
28 |   }
29 | }
30 | 
--------------------------------------------------------------------------------
/backend/src/itemOperations.ts:
-------------------------------------------------------------------------------- 1 | import { db } from "./db"; 2 | import { publicBroadcast } from "./publicBroadcast"; 3 | import { Room } from "./room"; 4 | 5 | export async function getItems(room: Room) { 6 | const output = []; 7 | for await (const [id, data] of db.roomItems(room)) { 8 | if (typeof data !== "object") { 9 | console.error("Invalid item", id); 10 | } else { 11 | output.push({ id, ...data }); 12 | } 13 | } 14 | return output; 15 | } 16 | 17 | export async function getItem(room: Room, id: string) { 18 | const item = await db.roomItems(room).get(id); 19 | return item ? { ...item, id } : null; 20 | } 21 | 22 | export async function updateItem(room: Room, id: string, changes: any) { 23 | const existingItem = (await db.roomItems(room).get(id)) || {}; 24 | const newValue = { 25 | ...existingItem, 26 | ...changes, 27 | changes: [ 28 | ...(existingItem?.changes ?? []), 29 | { payload: changes, time: new Date().toISOString() }, 30 | ], 31 | }; 32 | await db.roomItems(room).set(id, newValue); 33 | publicBroadcast(room, "updated", { ...newValue, id }); 34 | return newValue; 35 | } 36 | -------------------------------------------------------------------------------- /src/components/LogViewer.tsx: -------------------------------------------------------------------------------- 1 | import { useEffect, useRef, useState } from "react"; 2 | import { addLogListener } from "../logbus"; 3 | 4 | export function LogViewer() { 5 | const ref = useRef(null); 6 | const [autoScroll, setAutoScroll] = useState(true); 7 | const autoScrollRef = useRef(autoScroll); 8 | useEffect(() => { 9 | autoScrollRef.current = autoScroll; 10 | }, [autoScroll]); 11 | useEffect(() => { 12 | return addLogListener({ 13 | onLog(message) { 14 | if (ref.current && autoScrollRef.current) { 15 | ref.current.value += message + "\n"; 16 | ref.current.scrollTop = ref.current.scrollHeight; 17 | } 18 | }, 19 | }); 20 | }, []); 21 | return ( 22 | 23 | 29 | 30 | {/* auto scroll checkbox */} 31 | 32 | setAutoScroll(event.target.checked)} 38 | /> 39 | 40 | Auto scroll 41 | 42 | 43 | 44 | 45 | ); 46 | } 47 | -------------------------------------------------------------------------------- /src/components/TranscriptViewer.css: -------------------------------------------------------------------------------- 1 | .TranscriptViewer { 2 | font-family: "Sarabun", sans-serif; 3 | letter-spacing: 0.1ch; 4 | font-size: 20px; 5 | padding-bottom: 75vh; 6 | } 7 | 8 | .TranscriptItem { 9 | margin-bottom: 0.5rem; 10 | display: flex; 11 | } 12 | 13 | .TranscriptItem__content { 14 | padding: 0.75rem 1rem; 15 | border-radius: 1rem; 16 | border: 1px solid transparent; 17 | position: relative; 18 | } 19 | 20 | .TranscriptItem__content[data-transcribed="true"] { 21 | border-color: var(--bs-border-color); 22 | } 23 | 24 | .TranscriptItem__content[data-needs-correction="true"] { 25 | color: var(--bs-yellow); 26 | } 27 | 28 | .TranscriptItem__content[data-editing="true"] { 29 | border-color: var(--bs-yellow); 30 | } 31 | 32 | .TranscriptItem__time { 33 | display: block; 34 | color: var(--bs-gray); 35 | font-size: 0.5em; 36 | position: absolute; 37 | bottom: 0.2rem; 38 | right: 0.7rem; 39 | opacity: 0; 40 | user-select: none; 41 | } 42 | 43 | .TranscriptItem__content:hover .TranscriptItem__time { 44 | opacity: 1; 45 | } 46 | 47 | .TranscriptViewerOptions { 48 | position: fixed; 49 | bottom: 0; 50 | left: 0; 51 | right: 0; 52 | background: #000; 53 | padding: 0.5rem; 54 | transform: translateY(100%) 
translateY(-0.5rem); 55 | transition: all 0.3s; 56 | opacity: 0; 57 | } 58 | 59 | .TranscriptViewerOptions:hover, 60 | .TranscriptViewerOptions:focus-within, 61 | .TranscriptViewerOptions[data-editable="true"] { 62 | transform: translateY(0); 63 | opacity: 1; 64 | } 65 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "fresh-app", 3 | "type": "module", 4 | "version": "0.0.1", 5 | "scripts": { 6 | "dev": "astro dev", 7 | "start": "astro dev", 8 | "build": "astro check && astro build", 9 | "preview": "astro preview", 10 | "astro": "astro", 11 | "server": "tsx --env-file=.env backend/scripts/server.ts", 12 | "dev:server": "tsx --env-file=.env --watch backend/scripts/server.ts", 13 | "createRoom": "tsx --env-file=.env backend/scripts/createRoom.ts", 14 | "batchTranscriber": "tsx --env-file=.env backend/scripts/batchTranscriber.ts", 15 | "whisperTranslator": "tsx --env-file=.env backend/scripts/whisperTranslator.ts", 16 | "partialTranscriber": "tsx --env-file=.env backend/scripts/partialTranscriber.ts" 17 | }, 18 | "dependencies": { 19 | "@astrojs/check": "^0.9.2", 20 | "@astrojs/react": "^3.6.2", 21 | "@fastify/cors": "^10.0.1", 22 | "@fastify/websocket": "^11.0.1", 23 | "@google-cloud/speech": "^6.7.0", 24 | "@google/generative-ai": "^0.21.0", 25 | "@keyv/sqlite": "^4.0.1", 26 | "@nanostores/react": "^0.7.3", 27 | "@ricky0123/vad-web": "^0.0.19", 28 | "@stablelib/base64": "^2.0.0", 29 | "@thai/html": "npm:@jsr/thai__html@0.1.0-alpha.0", 30 | "@types/react": "^18.3.8", 31 | "@types/react-dom": "^18.3.0", 32 | "astro": "^4.14.2", 33 | "buffer-es6": "^4.9.3", 34 | "chalk": "^5.3.0", 35 | "fastify": "^5.0.0", 36 | "groq-sdk": "^0.7.0", 37 | "keyv": "^5.1.0", 38 | "nanostores": "^0.11.3", 39 | "ofetch": "^1.4.1", 40 | "react": "^18.3.1", 41 | "react-dom": "^18.3.1", 42 | "react-textarea-autosize": "^8.5.4", 43 | "reconnecting-websocket": "^4.4.0", 44 | "sqlite3": "^5.1.7", 45 | "tsx": "^4.19.1", 46 | "typescript": "^5.5.4", 47 | "uuidv7": "^1.0.2" 48 | }, 49 | "packageManager": "pnpm@9.7.1+sha512.faf344af2d6ca65c4c5c8c2224ea77a81a5e8859cbc4e06b1511ddce2f0151512431dd19e6aff31f2c6a8f5f2aced9bd2273e1fed7dd4de1868984059d2c4247", 50 | "devDependencies": { 51 | "@types/node": "^22.7.6" 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /backend/scripts/whisperTranslator.ts: -------------------------------------------------------------------------------- 1 | import Groq from "groq-sdk"; 2 | import { uuidv7 } from "uuidv7"; 3 | import { createRoomApi, getRoomConfig, publicApi } from "../src/client"; 4 | 5 | const groq = new Groq(); 6 | const api = createRoomApi(getRoomConfig()); 7 | 8 | let waiting = false; 9 | async function main() { 10 | const list = await api< 11 | { 12 | id: string; 13 | start: string; 14 | finish: string; 15 | length: number; 16 | transcript?: string; 17 | }[] 18 | >(`/items`); 19 | 20 | const validItems = list.filter((item) => item.length > 0); 21 | validItems.sort((a, b) => a.start.localeCompare(b.start)); 22 | 23 | const untranscribed = validItems.filter((item) => item.transcript == null); 24 | if (!untranscribed.length) { 25 | if (!waiting) { 26 | waiting = true; 27 | process.stderr.write("Waiting for transcription..."); 28 | } else { 29 | process.stderr.write("."); 30 | } 31 | return false; 32 | } 33 | if (waiting) { 34 | process.stderr.write("\n"); 35 | waiting = false; 36 | } 37 | 38 
| const modelName = "whisper-large-v3";
39 |   const transcription = await groq.audio.translations.create({
40 |     file: new File([await loadAudio(untranscribed[0].id)], "audio.wav"),
41 |     model: modelName,
42 |     response_format: "json",
43 |   });
44 |   console.log(transcription.text);
45 |   const usageId = uuidv7();
46 |   await api(`/items/${untranscribed[0].id}`, {
47 |     method: "PATCH",
48 |     body: {
49 |       transcript: transcription.text,
50 |       transcriptBy: modelName,
51 |       usageId,
52 |     },
53 |   });
54 |   return true;
55 | }
56 | 
57 | async function loadAudio(id: string) {
58 |   return publicApi(`/pcm/${id}`, { responseType: "blob" }).then((r) =>
59 |     r.arrayBuffer()
60 |   );
61 | }
62 | 
63 | const initialHp = 5;
64 | let hp = initialHp;
65 | for (;;) {
66 |   try {
67 |     if (!(await main())) {
68 |       await new Promise((r) => setTimeout(r, 1000));
69 |     }
70 |     if (hp < initialHp) {
71 |       hp = initialHp;
72 |       console.error("HP has been restored to", hp);
73 |     }
74 |   } catch (error) {
75 |     console.error(error);
76 |     hp--;
77 |     if (hp <= 0) {
78 |       console.error("Giving up");
79 |       process.exit(1);
80 |       break;
81 |     } else {
82 |       console.error("HP has been reduced to", hp);
83 |     }
84 |   } finally {
85 |     await new Promise((r) => setTimeout(r, 100));
86 |   }
87 | }
88 | 
--------------------------------------------------------------------------------
/backend/src/persistence.ts:
--------------------------------------------------------------------------------
1 | import sqlite3 from "sqlite3";
2 | 
3 | export class Partition {
4 |   private db: sqlite3.Database;
5 | 
6 |   constructor(db: sqlite3.Database, private partitionKey: string) {
7 |     this.db = db;
8 |   }
9 | 
10 |   async get(sortKey: string): Promise<any> {
11 |     return new Promise((resolve, reject) => {
12 |       this.db.get<{ value: string }>(
13 |         "SELECT value FROM keyvalue WHERE partition = ? AND key = ?",
14 |         [this.partitionKey, sortKey],
15 |         (err, row) => {
16 |           if (err) reject(err);
17 |           else resolve(row ? JSON.parse(row.value) : undefined);
18 |         }
19 |       );
20 |     });
21 |   }
22 | 
23 |   async set(sortKey: string, value: any): Promise<void> {
24 |     const serializedValue = JSON.stringify(value);
25 |     return new Promise((resolve, reject) => {
26 |       this.db.run(
27 |         "INSERT OR REPLACE INTO keyvalue (partition, key, value) VALUES (?, ?, ?)",
28 |         [this.partitionKey, sortKey, serializedValue],
29 |         function (err: Error | null) {
30 |           if (err) reject(err);
31 |           else resolve();
32 |         }
33 |       );
34 |     });
35 |   }
36 | 
37 |   async *[Symbol.asyncIterator]() {
38 |     const rows = await new Promise<any[]>((resolve, reject) => {
39 |       this.db.all(
40 |         "SELECT key, value FROM keyvalue WHERE partition = ?",
41 |         [this.partitionKey],
42 |         (err, rows) => {
43 |           if (err) reject(err);
44 |           else resolve(rows);
45 |         }
46 |       );
47 |     });
48 | 
49 |     for (const row of rows) {
50 |       yield [row.key, JSON.parse(row.value)];
51 |     }
52 |   }
53 | }
54 | 
55 | export class Persistence {
56 |   private db: sqlite3.Database;
57 |   private partitions: Map<string, Partition> = new Map();
58 | 
59 |   constructor(connectionString: string) {
60 |     this.db = new sqlite3.Database(connectionString, (err) => {
61 |       if (err) {
62 |         console.error("Error opening database:", err.message);
63 |       } else {
64 |         this.initializeDatabase();
65 |       }
66 |     });
67 |   }
68 | 
69 |   private initializeDatabase() {
70 |     this.db.run(`
71 |       CREATE TABLE IF NOT EXISTS keyvalue (
72 |         partition TEXT,
73 |         key TEXT,
74 |         value TEXT,
75 |         PRIMARY KEY (partition, key)
76 |       )
77 |     `);
78 |   }
79 | 
80 |   getPartition(partitionKey: string): Partition {
81 |     if (!this.partitions.has(partitionKey)) {
82 |       this.partitions.set(partitionKey, new Partition(this.db, partitionKey));
83 |     }
84 |     return this.partitions.get(partitionKey)!;
85 |   }
86 | }
87 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # live-speech
2 | 
3 | This project aims to provide live speech transcription for tech events, specifically designed to support Thai tech talks where there's a mixture of Thai words and technical terms. The system offers real-time transcription with post-processing capabilities for improved accuracy.
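
The components communicate with the server over a small JSON-RPC-style WebSocket protocol. As an illustration, here is a minimal sketch of the messages an audio sender produces (the `/rooms/:room/audioIngest` endpoint and message shapes come from `backend/scripts/server.ts`; the bootstrap code and the placeholder `pcmChunk` buffer are assumptions for illustration only — the real sender is `src/components/AudioSender.tsx`):

```ts
// Sketch only: assumes SERVER_URL_BASE, ROOM_ID and ROOM_KEY are set,
// and that `pcmChunk` holds 16 kHz mono s16le PCM captured elsewhere.
const url = `${process.env.SERVER_URL_BASE!.replace(/^http/, "ws")}/rooms/${
  process.env.ROOM_ID
}/audioIngest?key=${process.env.ROOM_KEY}`;
const ws = new WebSocket(url); // global WebSocket is available in Node 22

const pcmChunk = Buffer.alloc(3200); // placeholder: 100 ms of silence
ws.onopen = () => {
  // One utterance is bracketed by "start" and "stop"; the server
  // acknowledges each message with { id, result }.
  ws.send(JSON.stringify({ method: "start", params: { localTime: new Date().toISOString() } }));
  ws.send(JSON.stringify({ method: "audio", params: { data: pcmChunk.toString("base64") } }));
  ws.send(JSON.stringify({ method: "stop" }));
};
```

Everything derived from that audio then flows back out to viewers on the room's `publicEvents` socket.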
4 | 
5 | ## Components
6 | 
7 | 
8 | 
9 | ### Audio Sender
10 | 
11 | - Web-based, using the `getUserMedia` API
12 | - Responsible for capturing audio from the speaker's device, converting it to 16-bit linear PCM audio data, and sending it to the server using WebSockets
13 | 
14 | ### Server
15 | 
16 | - Acts as the central backend for the application
17 | - Handles database operations and pub/sub functionality
18 | - Manages communication between different components
19 | 
20 | ### Realtime Transcriber
21 | 
22 | - Performs streaming transcription in real-time
23 | - Provides quick, albeit less accurate, transcriptions
24 | - Useful for immediate feedback and live subtitles
25 | 
26 | ### Batch Transcriber
27 | 
28 | - Uses a more advanced ASR model (Gemini 1.5 Flash) for improved accuracy
29 | - Processes audio in batches for higher-quality transcriptions
30 | 
31 | ### Transcript Viewer
32 | 
33 | - Displays the transcribed text to the audience
34 | - Shows both real-time and refined transcriptions
35 | 
36 | ## Key Features
37 | 
38 | - Real-time audio capture and streaming
39 | - Live transcription with quick feedback
40 | - High-accuracy batch processing for refined transcripts
41 | - Support for mixed Thai and English technical content
42 | 
43 | This system is designed to enhance the accessibility and documentation of Thai tech talks by providing accurate transcriptions that can handle the unique challenges of mixed-language technical presentations.
44 | 
45 | ## Setup
46 | 
47 | ```sh
48 | # Install Node.js
49 | mise install
50 | 
51 | # Enable corepack
52 | corepack enable
53 | 
54 | # Install dependencies
55 | pnpm install
56 | ```
57 | 
58 | `.env`:
59 | 
60 | ```sh
61 | # For local development
62 | SERVER_URL_BASE=http://localhost:10300
63 | FRONTEND_URL_BASE=http://localhost:4321
64 | 
65 | # Generate a random string for the secret key, e.g. using `openssl rand -hex 32`
66 | SERVICE_TOKEN=
67 | 
68 | # For batch transcription
69 | GEMINI_API_KEY=
70 | 
71 | # Change to "pro" for better transcription quality at a higher cost
72 | GEMINI_MODEL=flash
73 | 
74 | # For partial transcription with Speechmatics
75 | PARTIAL_TRANSCRIBER_PROVIDER=speechmatics
76 | SPEECHMATICS_API_KEY=
77 | 
78 | # For partial transcription with Google
79 | # PARTIAL_TRANSCRIBER_PROVIDER=google
80 | # GOOGLE_APPLICATION_CREDENTIALS=
81 | 
82 | # For partial transcription with a local model (macOS only),
83 | # compile this CLI and set
84 | # PARTIAL_TRANSCRIBER_PROVIDER=local
85 | ```
86 | 
87 | ## How much does it cost?
88 | 
89 | The numbers are **approximate** and depend on which models you use.
90 | 
91 | The Google Speech-to-Text model has lower latency (from Thailand) and is cheaper, but performs worse than Speechmatics for Thai content.
92 | 
93 | | Partial transcription model | Price per hour |
94 | | --------------------------- | -------------- |
95 | | `local`                     | $0.00          |
96 | | `google`                    | $0.81          |
97 | | `speechmatics`              | $1.18          |
98 | 
99 | Gemini Flash works great for Thai content, but for English content [Gemini Pro is recommended for better punctuation insertion](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/audio-understanding#:~:text=Transcription%20punctuation%3A%20(if%20using%20Gemini%C2%A01.5%C2%A0Flash)%20The%20models%20might%20return%20transcriptions%20that%20don%27t%20include%20punctuation).
100 | 
101 | | Batch transcription model | Price per hour |
102 | | ------------------------- | -------------- |
103 | | Gemini Flash              | $0.18          |
104 | | Gemini Pro                | $2.97          |
105 | 
106 | ## Workflow
107 | 
108 | 1. 
Run the server: 109 | 110 | ```sh 111 | pnpm run server # or `pnpm run dev:server` to restart on file changes 112 | ``` 113 | 114 | 2. Run the frontend: 115 | 116 | ```sh 117 | pnpm run dev 118 | ``` 119 | 120 | 3. Create a room: 121 | 122 | ```sh 123 | pnpm run createRoom 124 | ``` 125 | 126 | 4. Run partial transcriber: 127 | 128 | ```sh 129 | pnpm run partialTranscriber 130 | ``` 131 | 132 | 5. Run batch transcriber: 133 | 134 | ```sh 135 | pnpm run batchTranscriber 136 | ``` 137 | 138 | 6. Navigate to audio sender. 139 | -------------------------------------------------------------------------------- /backend/scripts/server.ts: -------------------------------------------------------------------------------- 1 | import Cors from "@fastify/cors"; 2 | import Websocket from "@fastify/websocket"; 3 | import Fastify, { type FastifyRequest } from "fastify"; 4 | import { randomBytes } from "node:crypto"; 5 | import { uuidv7 } from "uuidv7"; 6 | import { db } from "../src/db"; 7 | import { getItem, getItems, updateItem } from "../src/itemOperations"; 8 | import { publicBroadcast } from "../src/publicBroadcast"; 9 | import { pubsub } from "../src/pubsub"; 10 | import { Room } from "../src/room"; 11 | import { Utterance } from "../src/utterance"; 12 | 13 | const fastify = Fastify({ 14 | logger: true, 15 | }); 16 | await fastify.register(Websocket); 17 | 18 | // Add `Access-Control-Allow-Private-Network: true` to all responses 19 | fastify.addHook("onSend", (request, reply, payload, done) => { 20 | reply.header("Access-Control-Allow-Private-Network", "true"); 21 | done(); 22 | }); 23 | 24 | await fastify.register(Cors); 25 | 26 | fastify.post("/admin/rooms", async (req, reply) => { 27 | const token = req.headers["authorization"]?.split(" ")[1]; 28 | if (token !== process.env["SERVICE_TOKEN"]) { 29 | reply.code(401).send({ error: "Unauthorized" }); 30 | return; 31 | } 32 | 33 | const roomId = uuidv7(); 34 | const roomKey = randomBytes(32).toString("hex"); 35 | 36 | await db.rooms.set(roomId, { roomKey }); 37 | 38 | return { roomId, roomKey }; 39 | }); 40 | 41 | fastify.get("/admin/rooms", async (req, reply) => { 42 | const token = req.headers["authorization"]?.split(" ")[1]; 43 | if (token !== process.env["SERVICE_TOKEN"]) { 44 | reply.code(401).send({ error: "Unauthorized" }); 45 | return; 46 | } 47 | 48 | const rooms = []; 49 | for await (const [roomId, roomData] of db.rooms) { 50 | rooms.push({ roomId, ...roomData }); 51 | } 52 | 53 | return rooms; 54 | }); 55 | 56 | async function checkRoomKey(room: Room, key: string) { 57 | const roomInfo = await db.rooms.get(room.name); 58 | if (!roomInfo) { 59 | return false; 60 | } 61 | return roomInfo.roomKey === key; 62 | } 63 | 64 | async function validateRoomKey( 65 | req: FastifyRequest, 66 | room: Room 67 | ): Promise { 68 | const authHeader = req.headers["authorization"]; 69 | if (!authHeader) return false; 70 | 71 | const [bearer, key] = authHeader.split(" "); 72 | if (bearer !== "Bearer" || !key) return false; 73 | 74 | return checkRoomKey(room, key); 75 | } 76 | 77 | fastify.get( 78 | "/rooms/:room/audioIngest", 79 | { websocket: true }, 80 | async (connection, req) => { 81 | const key = (req.query as { key: string }).key; 82 | const room = new Room((req.params as { room: string }).room); 83 | if (!(await checkRoomKey(room, key))) { 84 | connection.send(JSON.stringify({ error: "Invalid token" })); 85 | connection.close(); 86 | return; 87 | } 88 | 89 | let currentUtterance: Utterance | undefined; 90 | connection.on("message", async (message: any) 
=> { 91 | try { 92 | const data = JSON.parse(message.toString()); 93 | // JSON-RPC messages: 94 | // - "start" - start audio stream. params.localTime is the local time when the audio stream started. 95 | // - "audio" - audio data. params.data is base64-encoded s16le audio data. 96 | // - "stop" - stop audio stream. 97 | // Send acknowledgement for each message. 98 | try { 99 | switch (data.method) { 100 | case "start": { 101 | currentUtterance = new Utterance(room, data.params.localTime); 102 | break; 103 | } 104 | case "audio": { 105 | currentUtterance?.addAudio(data.params.data); 106 | break; 107 | } 108 | case "stop": { 109 | currentUtterance?.finish(); 110 | break; 111 | } 112 | } 113 | connection.send(JSON.stringify({ id: data.id, result: null })); 114 | } catch (error) { 115 | connection.send( 116 | JSON.stringify({ id: data.id, error: String(error) }) 117 | ); 118 | req.log.error(error); 119 | } 120 | } catch (error) { 121 | req.log.error(error); 122 | } 123 | }); 124 | connection.send(JSON.stringify({ method: "welcome" })); 125 | } 126 | ); 127 | 128 | fastify.get( 129 | "/rooms/:room/audioEvents", 130 | { websocket: true }, 131 | async (connection, req) => { 132 | const key = (req.query as Record).key; 133 | const room = new Room((req.params as { room: string }).room); 134 | if (!(await checkRoomKey(room, key))) { 135 | connection.send(JSON.stringify({ error: "Invalid room key" })); 136 | connection.close(); 137 | return; 138 | } 139 | connection.on("message", async (message: any) => { 140 | try { 141 | const data = JSON.parse(message.toString()); 142 | if (data.method === "submit_partial_transcript") { 143 | const { id, transcript } = data.params; 144 | publicBroadcast(room, "partial_transcript", { 145 | id, 146 | transcript, 147 | }); 148 | connection.send( 149 | JSON.stringify({ id: data.id, result: { ok: true } }) 150 | ); 151 | } 152 | } catch (error) { 153 | req.log.error(error); 154 | } 155 | }); 156 | const unsubscribe = pubsub.subscribe(room.audioTopic, (message) => { 157 | connection.send(message); 158 | }); 159 | connection.on("close", unsubscribe); 160 | } 161 | ); 162 | 163 | fastify.get( 164 | "/rooms/:room/publicEvents", 165 | { websocket: true }, 166 | (connection, req) => { 167 | const room = new Room((req.params as { room: string }).room); 168 | const unsubscribe = pubsub.subscribe(room.publicTopic, (message) => { 169 | connection.send(message); 170 | }); 171 | connection.on("close", unsubscribe); 172 | } 173 | ); 174 | 175 | fastify.get("/rooms/:room/items", async (req) => { 176 | const room = new Room((req.params as { room: string }).room); 177 | const items = await getItems(room); 178 | return items; 179 | }); 180 | 181 | fastify.get("/rooms/:room/items/:id", async (req, reply) => { 182 | const room = new Room((req.params as { room: string }).room); 183 | const id = (req.params as { id: string }).id; 184 | const item = await getItem(room, id); 185 | if (!item) { 186 | reply.status(404).send({ error: "Not found" }); 187 | return; 188 | } 189 | return item; 190 | }); 191 | 192 | fastify.patch("/rooms/:room/items/:id", async (req, reply) => { 193 | const room = new Room((req.params as { room: string }).room); 194 | if (!(await validateRoomKey(req, room))) { 195 | reply.code(401).send({ error: "Invalid room key" }); 196 | return; 197 | } 198 | const id = (req.params as { id: string }).id; 199 | const body = req.body as any; 200 | const newValue = await updateItem(room, id, body); 201 | return newValue; 202 | }); 203 | 204 | fastify.get("/pcm/:id", async (req, 
reply) => { 205 | const id = (req.params as { id: string }).id; 206 | const buffer = Buffer.from((await db.audio.get(id)) as string, "base64"); 207 | // Generate wav file. Buffer is raw PCM, s16le, 1 channel. 208 | const sampleRate = 16000; // Assuming 16kHz sample rate 209 | const numChannels = 1; 210 | const bitsPerSample = 16; 211 | 212 | const dataSize = buffer.length; 213 | const wavBuffer = Buffer.alloc(44 + dataSize); 214 | 215 | // WAV header 216 | wavBuffer.write("RIFF", 0); 217 | wavBuffer.writeUInt32LE(36 + dataSize, 4); 218 | wavBuffer.write("WAVE", 8); 219 | wavBuffer.write("fmt ", 12); 220 | wavBuffer.writeUInt32LE(16, 16); 221 | wavBuffer.writeUInt16LE(1, 20); 222 | wavBuffer.writeUInt16LE(numChannels, 22); 223 | wavBuffer.writeUInt32LE(sampleRate, 24); 224 | wavBuffer.writeUInt32LE((sampleRate * numChannels * bitsPerSample) / 8, 28); 225 | wavBuffer.writeUInt16LE((numChannels * bitsPerSample) / 8, 32); 226 | wavBuffer.writeUInt16LE(bitsPerSample, 34); 227 | wavBuffer.write("data", 36); 228 | wavBuffer.writeUInt32LE(dataSize, 40); 229 | 230 | // Copy PCM data 231 | buffer.copy(wavBuffer, 44); 232 | 233 | reply 234 | .header("Content-Type", "audio/wav") 235 | .header("Content-Disposition", `inline; filename="${id}.wav"`) 236 | .send(wavBuffer); 237 | }); 238 | 239 | fastify.listen({ port: 10300 }); 240 | -------------------------------------------------------------------------------- /backend/scripts/partialTranscriber.ts: -------------------------------------------------------------------------------- 1 | import { protos, v2 } from "@google-cloud/speech"; 2 | import { spawn } from "child_process"; 3 | import { createInterface } from "node:readline"; 4 | import { PassThrough, Readable } from "node:stream"; 5 | import { pipeline } from "node:stream/promises"; 6 | import { ofetch } from "ofetch"; 7 | import ReconnectingWebSocket from "reconnecting-websocket"; 8 | import { getRoomConfig } from "../src/client"; 9 | 10 | const roomConfig = getRoomConfig(); 11 | 12 | const websocket = new ReconnectingWebSocket( 13 | `${process.env["SERVER_URL_BASE"]!.replace(/^http/, "ws")}/rooms/${ 14 | roomConfig.roomId 15 | }/audioEvents?key=${roomConfig.roomKey}` 16 | ); 17 | 18 | function isAbortError(e: any) { 19 | return e.name === "AbortError"; 20 | } 21 | 22 | function createTranscriber( 23 | language: string, 24 | requireOnDevice: boolean, 25 | signal: AbortSignal 26 | ) { 27 | const child = spawn("transcriber", [language], { 28 | stdio: ["pipe", "pipe", "inherit"], 29 | env: { 30 | ...process.env, 31 | ...(requireOnDevice ? 
{ TRANSCRIBE_ON_DEVICE_ONLY: "1" } : {}),
32 |     },
33 |     signal,
34 |   });
35 |   child.on("error", (error) => {
36 |     if (isAbortError(error)) return;
37 |     console.error("Transcriber process encountered error", error);
38 |   });
39 |   return async function* (source: AsyncIterable<Buffer>) {
40 |     Readable.from(source).pipe(child.stdin);
41 |     for await (const line of parseNdjson(child.stdout)) {
42 |       yield line;
43 |     }
44 |     child.kill();
45 |   };
46 | }
47 | 
48 | function createGoogleTranscriber(language: string, signal: AbortSignal) {
49 |   const client = new v2.SpeechClient();
50 |   const stream = client._streamingRecognize();
51 |   const createRequest = (
52 |     x: protos.google.cloud.speech.v2.IStreamingRecognizeRequest
53 |   ) => x;
54 |   return async function* (source: AsyncIterable<Buffer>) {
55 |     const inputStream = Readable.from(
56 |       (async function* () {
57 |         yield createRequest({
58 |           recognizer:
59 |             "projects/dtinth-audio-transcription/locations/global/recognizers/_",
60 |           streamingConfig: {
61 |             config: {
62 |               explicitDecodingConfig: {
63 |                 encoding: "LINEAR16",
64 |                 sampleRateHertz: 16000,
65 |                 audioChannelCount: 1,
66 |               },
67 |               languageCodes: [language],
68 |               model: "short",
69 |             },
70 |             streamingFeatures: {
71 |               interimResults: true,
72 |             },
73 |           },
74 |         });
75 |         for await (const chunk of source) {
76 |           yield createRequest({ audio: chunk });
77 |         }
78 |       })()
79 |     );
80 |     inputStream.pipe(stream);
81 |     for await (const event of stream) {
82 |       const text = event?.results?.[0]?.alternatives?.[0]?.transcript;
83 |       if (text) {
84 |         yield { text };
85 |       } else {
86 |         console.warn("No text in event", JSON.stringify(event));
87 |       }
88 |     }
89 |   };
90 | }
91 | 
92 | let cachedSpeechmaticsApiKey: string | undefined;
93 | async function obtainSpeechmaticsApiKey() {
94 |   if (cachedSpeechmaticsApiKey) return cachedSpeechmaticsApiKey;
95 | 
96 |   const apiKey = process.env.SPEECHMATICS_API_KEY;
97 |   if (!apiKey) {
98 |     throw new Error("SPEECHMATICS_API_KEY environment variable is not set");
99 |   }
100 | 
101 |   const refresh = async () => {
102 |     const response = await ofetch<{ key_value: string }>(
103 |       "https://mp.speechmatics.com/v1/api_keys?type=rt",
104 |       {
105 |         method: "POST",
106 |         headers: {
107 |           "Content-Type": "application/json",
108 |           Authorization: `Bearer ${apiKey}`,
109 |         },
110 |         body: JSON.stringify({ ttl: 3600 }),
111 |       }
112 |     );
113 |     cachedSpeechmaticsApiKey = response.key_value;
114 |     return response.key_value;
115 |   };
116 |   setInterval(refresh, 1800 * 1000);
117 |   return await refresh();
118 | }
119 | 
120 | function createSpeechmaticsTranscriber(language: string, signal: AbortSignal) {
121 |   const output = new PassThrough({ objectMode: true });
122 |   async function worker(source: AsyncIterable<Buffer>) {
123 |     const tempKey = await obtainSpeechmaticsApiKey();
124 |     const socket = new WebSocket(
125 |       `wss://eu2.rt.speechmatics.com/v2?jwt=${tempKey}`
126 |     );
127 |     const openPromise = new Promise<void>((resolve, reject) => {
128 |       socket.onopen = () => {
129 |         console.log("Connected to Speechmatics WebSocket");
130 |         const startMessage = {
131 |           message: "StartRecognition",
132 |           audio_format: {
133 |             type: "raw",
134 |             encoding: "pcm_s16le",
135 |             sample_rate: 16000,
136 |           },
137 |           transcription_config: {
138 |             language,
139 |             enable_partials: true,
140 |           },
141 |         };
142 |         socket.send(JSON.stringify(startMessage));
143 |         resolve();
144 |       };
145 |       socket.onmessage = (event) => {
146 |         const data = JSON.parse(event.data);
147 |         output.write(data);
148 |         if (data.message === "EndOfTranscript") {
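          // Speechmatics sends EndOfTranscript after the last final transcript;
          // closing the socket and ending the output stream lets the async
          // iterator over `output` finish cleanly.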
socket.close(); 150 | output.end(); 151 | } 152 | }; 153 | socket.onerror = (error) => { 154 | console.error("WebSocket error:", error); 155 | reject(); 156 | output.end(); 157 | }; 158 | socket.onclose = (event) => { 159 | console.log("WebSocket closed:", event.code, event.reason); 160 | output.end(); 161 | }; 162 | }); 163 | await openPromise; 164 | let nChunks = 0; 165 | for await (const chunk of source) { 166 | socket.send(chunk); 167 | nChunks += 1; 168 | } 169 | socket.send( 170 | JSON.stringify({ message: "EndOfStream", last_seq_no: nChunks }) 171 | ); 172 | } 173 | 174 | return async function* (source: AsyncIterable) { 175 | const promise = worker(source); 176 | for await (const item of output) { 177 | if ( 178 | item.message === "AddTranscript" || 179 | item.message === "AddPartialTranscript" 180 | ) { 181 | const text = String(item.metadata?.transcript || "") 182 | .replace(/<\w+>/g, "") 183 | .trim(); 184 | if (text) { 185 | yield { text }; 186 | } 187 | } 188 | } 189 | await promise; 190 | }; 191 | } 192 | 193 | async function* parseNdjson(source: any) { 194 | for await (const line of createInterface({ input: Readable.from(source) })) { 195 | if (line.trim()) { 196 | yield JSON.parse(line); 197 | } 198 | } 199 | } 200 | 201 | let currentTranscription: Transcription | undefined; 202 | 203 | class Transcription { 204 | abortController: AbortController; 205 | input = new PassThrough(); 206 | constructor(public id: string) { 207 | this.abortController = new AbortController(); 208 | this.worker(); 209 | console.log("*", id); 210 | } 211 | addAudio(buffer: Buffer) { 212 | this.input.write(buffer); 213 | } 214 | async worker() { 215 | try { 216 | await pipeline( 217 | this.input, 218 | new PassThrough(), 219 | process.env["PARTIAL_TRANSCRIBER_PROVIDER"] === "speechmatics" 220 | ? createSpeechmaticsTranscriber(process.env['TRANSCRIBER_LANG'] || "th", this.abortController.signal) 221 | : process.env["PARTIAL_TRANSCRIBER_PROVIDER"] === "local" 222 | ? 
createTranscriber("th", false, this.abortController.signal) 223 | : createGoogleTranscriber("th-TH", this.abortController.signal), 224 | async (source) => { 225 | for await (const { text } of source) { 226 | console.log(" -", text); 227 | websocket.send( 228 | JSON.stringify({ 229 | method: "submit_partial_transcript", 230 | params: { 231 | id: this.id, 232 | transcript: text, 233 | }, 234 | }) 235 | ); 236 | } 237 | } 238 | ); 239 | } catch (error) { 240 | console.error(`Worker ${this.id} error`, error); 241 | } 242 | } 243 | finish() { 244 | this.input.end(); 245 | setTimeout(() => { 246 | this.abortController.abort(); 247 | }, 3000); 248 | } 249 | } 250 | 251 | websocket.onopen = () => { 252 | console.log("Connected to backend"); 253 | }; 254 | websocket.onclose = () => { 255 | console.error("Disconnected from backend"); 256 | }; 257 | websocket.onmessage = (e) => { 258 | const data = JSON.parse(e.data); 259 | switch (data.method) { 260 | case "audio_start": { 261 | if (currentTranscription) { 262 | currentTranscription.finish(); 263 | currentTranscription = undefined; 264 | } 265 | currentTranscription = new Transcription(data.params.id); 266 | break; 267 | } 268 | case "audio_data": { 269 | if (currentTranscription && currentTranscription.id !== data.params.id) { 270 | currentTranscription.finish(); 271 | currentTranscription = undefined; 272 | } 273 | if (!currentTranscription) { 274 | currentTranscription = new Transcription(data.params.id); 275 | } 276 | currentTranscription.addAudio(Buffer.from(data.params.base64, "base64")); 277 | break; 278 | } 279 | case "audio_finish": { 280 | if (currentTranscription) { 281 | currentTranscription.finish(); 282 | currentTranscription = undefined; 283 | } 284 | break; 285 | } 286 | } 287 | }; 288 | -------------------------------------------------------------------------------- /backend/scripts/batchTranscriber.ts: -------------------------------------------------------------------------------- 1 | import { 2 | GoogleGenerativeAI, 3 | HarmBlockThreshold, 4 | HarmCategory, 5 | SchemaType, 6 | type GenerationConfig, 7 | type Part, 8 | type UsageMetadata, 9 | } from "@google/generative-ai"; 10 | import { createHash } from "crypto"; 11 | import { uuidv7 } from "uuidv7"; 12 | import { createRoomApi, getRoomConfig, publicApi } from "../src/client"; 13 | 14 | const api = createRoomApi(getRoomConfig()); 15 | 16 | const apiKey = process.env["GEMINI_API_KEY"]!; 17 | const genAI = new GoogleGenerativeAI(apiKey); 18 | const modelName = process.env['GEMINI_MODEL'] === 'pro' ? 
"gemini-1.5-pro-002" : "gemini-1.5-flash-002"; 19 | export const model = genAI.getGenerativeModel({ 20 | model: modelName, 21 | }); 22 | console.log('Using model:', modelName); 23 | 24 | interface HistoryItem { 25 | audio: ArrayBuffer; 26 | transcript: string; 27 | } 28 | 29 | let waiting = false; 30 | 31 | export interface TranscriptionItem { 32 | id: string; 33 | transcript: string; 34 | } 35 | 36 | export async function processAudio( 37 | audio: ArrayBuffer[], 38 | history: HistoryItem[] = [], 39 | prior: string[] = [] 40 | ) { 41 | const generationConfig: GenerationConfig = { 42 | maxOutputTokens: 300, 43 | responseMimeType: "application/json", 44 | responseSchema: { 45 | type: SchemaType.OBJECT, 46 | properties: { 47 | transcription: { 48 | type: SchemaType.ARRAY, 49 | items: { 50 | type: SchemaType.OBJECT, 51 | properties: { 52 | id: { type: SchemaType.STRING }, 53 | transcript: { type: SchemaType.STRING }, 54 | }, 55 | }, 56 | }, 57 | }, 58 | }, 59 | }; 60 | const historyParts: Part[] = [ 61 | { 62 | text: 63 | `You are a professional transcriber.` + 64 | (prior.length > 0 65 | ? ` 66 | For your context, here are the prior transcribed texts: ${JSON.stringify( 67 | prior 68 | )}\n\n` 69 | : "") + 70 | ` 71 | You will be given a series of audio files and their IDs in this format: 72 | 73 | id: 74 | 75 | 76 | Transcribe the speech in each audio file. Follow the style guide when transcribing: 77 | - For English words, if it is a common word, then spell it using lowercase (e.g. oscillator). If it is a proper noun, capitalize it properly (e.g. Google Chrome). If it's an API name or part of computer code, use verbatim capitalization (e.g. getElementById). 78 | - For Thai text, do not add a space between words. Only add spaces between sentences or when there is obvious pausing. 79 | - Add spaces between Thai words and foreign words. 80 | - For English sentences, add punctuation marks as appropriate. For example, add periods at the end of sentences (or a question mark if the speaker is asking a question), and add commas and hyphens where it should be used. Sometimes our speakers are not fluent in English, so please fix the disfluency (such as "um"'s and "uh"'s, stuttering and stammering). Also fix minor grammatical mistakes, for example, "everyone like" should be "everyone likes." (Only fix minor mistakes though!) 81 | - For English sentences, capitalize the first word of the sentence so it is easier to read. 82 | - For technical terms, in general, spell it in English (e.g. canvas, vertex, scene). Only transliterate it to Thai if it is a very common word and commonly spelled in Thai (e.g. ลิงก์, เคส, อัพเกรด, โปรแกรมเมอร์). 83 | - Remove filler words like "umm" and "ah". Also fix the transcript when the speaker corrects themselves or repeats themselves due to stuttering. 84 | - At the end of the audio file there may be beeping sound, do not include it in the transcript. 85 | - If there is no speech, return an empty string for the transcript. 
86 | 87 | Transcribe the following audio files.`, 88 | }, 89 | ]; 90 | const expected: TranscriptionItem[] = []; 91 | for (const item of history) { 92 | const buffer = Buffer.from(item.audio); 93 | const id = createHash("md5").update(buffer).digest("hex").slice(0, 6); 94 | historyParts.push({ text: "id: " + id }); 95 | historyParts.push({ 96 | inlineData: { 97 | mimeType: "audio/x-m4a", 98 | data: buffer.toString("base64"), 99 | }, 100 | }); 101 | expected.push({ id, transcript: item.transcript }); 102 | } 103 | const chatSession = model.startChat({ 104 | generationConfig: generationConfig, 105 | history: [ 106 | { 107 | role: "user", 108 | parts: historyParts, 109 | }, 110 | { 111 | role: "model", 112 | parts: [{ text: JSON.stringify({ transcription: expected }) }], 113 | }, 114 | ], 115 | safetySettings: [ 116 | { 117 | category: HarmCategory.HARM_CATEGORY_HARASSMENT, 118 | threshold: HarmBlockThreshold.BLOCK_NONE, 119 | }, 120 | { 121 | category: HarmCategory.HARM_CATEGORY_HATE_SPEECH, 122 | threshold: HarmBlockThreshold.BLOCK_NONE, 123 | }, 124 | { 125 | category: HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, 126 | threshold: HarmBlockThreshold.BLOCK_NONE, 127 | }, 128 | { 129 | category: HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, 130 | threshold: HarmBlockThreshold.BLOCK_NONE, 131 | }, 132 | ], 133 | }); 134 | 135 | const promptParts: Part[] = []; 136 | const ids: string[] = []; 137 | for (const item of audio) { 138 | const buffer = Buffer.from(item); 139 | const id = createHash("md5").update(buffer).digest("hex").slice(0, 6); 140 | ids.push(id); 141 | promptParts.push({ text: "id: " + id }); 142 | promptParts.push({ 143 | inlineData: { 144 | mimeType: "audio/x-m4a", 145 | data: buffer.toString("base64"), 146 | }, 147 | }); 148 | } 149 | 150 | const result = await chatSession.sendMessageStream(promptParts, { 151 | timeout: 15000, 152 | }); 153 | let usageMetadata: UsageMetadata | undefined; 154 | let text = ""; 155 | let error = ""; 156 | try { 157 | for await (const chunk of result.stream) { 158 | if (chunk.usageMetadata) { 159 | usageMetadata = chunk.usageMetadata; 160 | } 161 | text += chunk.text(); 162 | } 163 | } catch (e: any) { 164 | // Add emoji to signify error 165 | text += "❌"; 166 | console.error("[processAudio]", e); 167 | error = String(e?.stack || e); 168 | // ctx.log("error", { error }); 169 | } 170 | return { usageMetadata, text, error, ids }; 171 | } 172 | 173 | function postProcess(text: string) { 174 | return ( 175 | text 176 | .replace(/ปื๊ด\s*$/, "") 177 | .replace(/ปื้ด\s*$/, "") 178 | .replace(/ปี๊บๆ+\s*$/, "") 179 | .replace(/ๆ(?:ๆ+)\s*$/, "ๆ") 180 | 181 | // Add spaces between Thai words and foreign words. 
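      // e.g. "ใช้Reactทำเว็บ" becomes "ใช้ React ทำเว็บ" (both directions)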
182 | .replace(/([ก-๙])([a-zA-Z0-9])/g, "$1 $2") 183 | .replace(/([a-zA-Z0-9])([ก-๙])/g, "$1 $2") 184 | 185 | .trim() 186 | ); 187 | } 188 | 189 | async function main({ maxMessages }: { maxMessages: number }) { 190 | const list = await api< 191 | { 192 | id: string; 193 | start: string; 194 | finish: string; 195 | length: number; 196 | transcript?: string; 197 | }[] 198 | >(`/items`); 199 | 200 | const validItems = list.filter((item) => item.length > 0); 201 | validItems.sort((a, b) => a.start.localeCompare(b.start)); 202 | 203 | const untranscribed = validItems.filter((item) => item.transcript == null); 204 | if (!untranscribed.length) { 205 | if (!waiting) { 206 | waiting = true; 207 | process.stderr.write("Waiting for transcription..."); 208 | } else { 209 | process.stderr.write("."); 210 | } 211 | return false; 212 | } 213 | if (waiting) { 214 | process.stderr.write("\n"); 215 | waiting = false; 216 | } 217 | const allBefore = validItems.filter( 218 | (item) => item.start < untranscribed[0].start 219 | ); 220 | const before = allBefore.slice(-3); 221 | const prior = allBefore 222 | .slice(0, -3) 223 | .flatMap((r) => (r.transcript ? [r.transcript] : [])) 224 | .slice(-37); 225 | const history = await Promise.all( 226 | before.map(async (item) => { 227 | const audio = await loadAudio(item.id); 228 | return { audio, transcript: item.transcript! }; 229 | }) 230 | ); 231 | const audio = await Promise.all( 232 | untranscribed.slice(0, maxMessages).map((item) => loadAudio(item.id)) 233 | ); 234 | const result = await processAudio(audio, history, prior); 235 | console.debug('Gemini result', result); 236 | let { transcription } = JSON.parse(result.text) as { 237 | transcription: TranscriptionItem[]; 238 | }; 239 | const usageId = uuidv7(); 240 | for (const [i, item] of transcription.entries()) { 241 | if (result.ids[i] !== item.id) { 242 | console.warn( 243 | "Prompt ID mismatch, expected", 244 | item.id, 245 | "but received", 246 | result.ids[i] 247 | ); 248 | continue; 249 | } 250 | const { id } = untranscribed[i]; 251 | const transcript = postProcess(item.transcript); 252 | console.log(`${id} => ${JSON.stringify(transcript)}`); 253 | await api(`/items/${id}`, { 254 | method: "PATCH", 255 | body: { 256 | transcript, 257 | transcriptBy: modelName, 258 | usageMetadata: result.usageMetadata, 259 | usageId: usageId, 260 | }, 261 | }); 262 | } 263 | return true; 264 | } 265 | 266 | async function loadAudio(id: string) { 267 | return publicApi(`/pcm/${id}`, { responseType: "blob" }).then((r) => 268 | r.arrayBuffer() 269 | ); 270 | } 271 | 272 | const initialHp = 5; 273 | let hp = initialHp; 274 | for (;;) { 275 | try { 276 | if (!(await main({ maxMessages: hp }))) { 277 | await new Promise((r) => setTimeout(r, 1000)); 278 | } 279 | if (hp < initialHp) { 280 | hp = initialHp; 281 | console.error('HP has been restored to', hp); 282 | } 283 | } catch (error) { 284 | console.error(error); 285 | hp--; 286 | if (hp <= 0) { 287 | console.error('Giving up'); 288 | process.exit(1); 289 | break; 290 | } else { 291 | console.error('HP has been reduced to', hp); 292 | } 293 | } finally { 294 | await new Promise((r) => setTimeout(r, 100)); 295 | } 296 | } 297 | -------------------------------------------------------------------------------- /src/components/TranscriptViewer.tsx: -------------------------------------------------------------------------------- 1 | import { useStore } from "@nanostores/react"; 2 | import { atom, computed, type WritableAtom } from "nanostores"; 3 | import { ofetch } from "ofetch"; 
4 | import { useEffect, useMemo, useRef, useState } from "react"; 5 | import TextareaAutosize from "react-textarea-autosize"; 6 | import ReconnectingWebSocket from "reconnecting-websocket"; 7 | import type { BackendContext } from "../BackendContext"; 8 | import "./TranscriptViewer.css"; 9 | import { $autoCorrects, $autoScroll } from "./TranscriptViewerKnobs"; 10 | 11 | const $autocorrectables = atom[]>([]); 12 | 13 | const $autoCorrector = computed([$autoCorrects], (autoCorrects) => { 14 | const items = autoCorrects 15 | .split(",") 16 | .map((x) => x.trim()) 17 | .filter((x) => x) 18 | .flatMap((x) => { 19 | const [from, to] = x.split("=>").map((x) => x.trim()); 20 | if (!from || !to) return []; 21 | return [{ from, to }]; 22 | }); 23 | return { 24 | correct: (text: string) => { 25 | let correctedText = text; 26 | for (const { from, to } of items) { 27 | const regex = new RegExp(from, "gi"); 28 | correctedText = correctedText.replace(regex, to); 29 | } 30 | // Add spaces between Thai and non-Thai words. 31 | correctedText = correctedText 32 | .replace(/([ก-๙])([a-zA-Z0-9])/g, "$1 $2") 33 | .replace(/([a-zA-Z0-9])([ก-๙])/g, "$1 $2") 34 | .replace(/ๆ(?!ๆ|\s|$)/g, "ๆ ") 35 | .trim(); 36 | return correctedText; 37 | }, 38 | }; 39 | }); 40 | 41 | export function TranscriptViewer() { 42 | const params = new URLSearchParams(window.location.search); 43 | 44 | const backend = params.get("backend"); 45 | const room = params.get("room"); 46 | const key = params.get("key") || undefined; 47 | if (!backend || !room) { 48 | return Missing parameters; 49 | } 50 | 51 | const backendContext: BackendContext = { backend, room, key }; 52 | return ; 53 | } 54 | 55 | function createViewer(backendContext: BackendContext) { 56 | const ws = new ReconnectingWebSocket( 57 | `${backendContext.backend.replace(/^http/, "ws")}/rooms/${ 58 | backendContext.room 59 | }/publicEvents` 60 | ); 61 | const bufferedPartial = new Map(); 62 | ws.onmessage = async (e) => { 63 | const json = JSON.parse(e.data); 64 | console.log(json); 65 | if (json.method === "updated") { 66 | const state: ItemState = json.params; 67 | const id = state.id; 68 | const item = $items.get().find((item) => item.id === id); 69 | if (item) { 70 | item.$state.set(state); 71 | } else { 72 | $items.set([ 73 | ...$items.get(), 74 | { 75 | id, 76 | $state: atom(state), 77 | $partial: atom(bufferedPartial.get(id) || undefined), 78 | }, 79 | ]); 80 | } 81 | } else if (json.method === "partial_transcript") { 82 | const id = json.params.id; 83 | const item = $items.get().find((item) => item.id === id); 84 | bufferedPartial.set(id, json.params.transcript); 85 | if (item) { 86 | item.$partial.set(json.params.transcript); 87 | } 88 | } 89 | }; 90 | const $items = atom([]); 91 | async function init() { 92 | const items = await ofetch( 93 | `${backendContext.backend}/rooms/${backendContext.room}/items` 94 | ); 95 | $items.set( 96 | items.map((item): ViewerTranscriptItem => { 97 | return { 98 | id: item.id, 99 | $state: atom(item), 100 | $partial: atom(), 101 | }; 102 | }) 103 | ); 104 | } 105 | init(); 106 | return { 107 | $items, 108 | editable: !!backendContext.key, 109 | async updateTranscript(id: string, transcript: string) { 110 | await ofetch( 111 | `${backendContext.backend}/rooms/${backendContext.room}/items/${id}`, 112 | { 113 | method: "PATCH", 114 | body: JSON.stringify({ transcript, transcriptBy: "manual" }), 115 | headers: { 116 | "Content-Type": "application/json", 117 | Authorization: `Bearer ${backendContext.key}`, 118 | }, 119 | } 120 | ); 121 | }, 122 
    getAudioUrl(id: string) {
      return `${backendContext.backend}/pcm/${id}`;
    },
  };
}

interface ItemState {
  id: string;
  start: string;
  finish: string;
  length: number;
  transcript?: string;
}

interface ViewerTranscriptItem {
  id: string;
  $state: WritableAtom<ItemState>;
  $partial: WritableAtom<string | undefined>;
}

type Viewer = ReturnType<typeof createViewer>;
let _viewer: Viewer | undefined;

function TranscriptViewerView(props: { backendContext: BackendContext }) {
  const viewer = (_viewer ??= createViewer(props.backendContext));
  const items = useStore(viewer.$items);
  return (
    <div>
      <h1>Transcript for room {props.backendContext.room}</h1>
      {items.map((item) => {
        return (
          <TranscriptItem
            key={item.id}
            start={formatTime(new Date(item.$state.get().start))}
            item={item}
            viewer={viewer}
          />
        );
      })}
      <TranscriptViewerOptions viewer={viewer} />
    </div>
  );
}

const scroller = (() => {
  let toScroll = 0;
  let timeout: number | undefined;
  return {
    scrollBy(v: number) {
      console.log(v);
      if (!timeout) {
        timeout = setTimeout(() => {
          const amount = toScroll;
          toScroll = 0;
          timeout = undefined;
          if (amount < 0) return;
          smoothScroll(amount);
        }, 120) as unknown as number;
      }
      toScroll = Math.max(v, toScroll);
    },
  };
})();

function smoothScroll(amount: number) {
  console.log(amount);
  let last = 0;
  let current = 0;
  amount = Math.round(amount);
  const frame = () => {
    current += (amount - current) / 5;
    const nextValue = Math.round(current);
    if (nextValue > last) {
      window.scrollBy({
        top: nextValue - last,
        behavior: "instant",
      });
      last = nextValue;
    }
    if (nextValue < amount) {
      requestAnimationFrame(frame);
    }
  };
  requestAnimationFrame(frame);
}

function TranscriptItem(props: {
  start: string;
  item: ViewerTranscriptItem;
  viewer: Viewer;
}) {
  const div = useRef<HTMLDivElement>(null);
  const text = useRef<HTMLSpanElement>(null);
  const { item, viewer } = props;
  const state = useStore(item.$state);
  const partial = useStore(item.$partial);
  const [isEditing, setIsEditing] = useState<false | { width: number }>(false);
  const transcribed = state.transcript != null;
  const [wasUntranscribed] = useState(!transcribed);
  const corrector = useStore($autoCorrector);
  const corrected = useMemo(() => {
    if (!state.transcript || !viewer.editable) return state.transcript;
    return corrector.correct(state.transcript);
  }, [corrector, state.transcript, viewer.editable]);

  const needsCorrection = corrected !== state.transcript;
  const autoCorrectableAdded = useRef(false);
  useEffect(() => {
    if (needsCorrection && !autoCorrectableAdded.current) {
      $autocorrectables.set([...$autocorrectables.get(), div]);
      autoCorrectableAdded.current = true;
    } else if (!needsCorrection && autoCorrectableAdded.current) {
      $autocorrectables.set($autocorrectables.get().filter((x) => x !== div));
      autoCorrectableAdded.current = false;
    }
  }, [needsCorrection]);
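  // When an item first gains a transcript, scroll the page so the new line
  // sits just above the bottom of the viewport (minus ~140px of reserved
  // space), unless the operator is currently editing a textarea.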
  useEffect(() => {
    if (transcribed && wasUntranscribed && div.current && $autoScroll.get()) {
      const clientRect = div.current.getBoundingClientRect();
      // Do not scroll if focusing on a text area.
      if (document.activeElement instanceof HTMLTextAreaElement) return;
      scroller.scrollBy(
        clientRect.top + clientRect.height - (window.innerHeight - 140)
      );
    }
  }, [transcribed, wasUntranscribed]);

  const handleClick = (e: React.MouseEvent) => {
    if (e.altKey && needsCorrection && corrected) {
      viewer.updateTranscript(item.id, corrected);
      return;
    }
    if (viewer.editable && state.finish && !isEditing) {
      const width = text.current?.offsetWidth;
      setIsEditing({ width: width == null ? 0 : width + 2 });
    }
  };

  const handleSave = (newTranscript: string) => {
    viewer.updateTranscript(item.id, newTranscript);
    setIsEditing(false);
  };

  const handleCancel = () => {
    setIsEditing(false);
  };

  const listen = () => {
    const myWindow = window as { currentAudio?: HTMLAudioElement };
    const audio = (myWindow.currentAudio ??= new Audio());
    const src = viewer.getAudioUrl(item.id);
    if (!audio.paused && audio.src === src) {
      audio.pause();
    } else {
      document.body.append(audio);
      audio.src = src;
      audio.load();
      audio.currentTime = 0;
      audio.play();
    }
    const textarea = div.current?.querySelector("textarea");
    if (textarea) textarea.focus();
  };

  return (
    <div ref={div} onClick={handleClick}>
      {isEditing ? (
        <>
          <EditableTranscript
            initialValue={state.transcript ?? ""}
            width={isEditing.width}
            onSave={handleSave}
            onCancel={handleCancel}
          />
          <button onClick={listen}>👂</button>{" "}
          <button onClick={handleCancel}>❌</button>
        </>
      ) : (
        <span ref={text}>
          {state.transcript ?? (
            <span style={{ opacity: 0.5 }}>{partial ?? "…"}</span>
          )}{" "}
          <small>{props.start}</small>
        </span>
      )}
    </div>
  );
}

interface EditableTranscriptProps {
  initialValue: string;
  width: number;
  onSave: (newTranscript: string) => void;
  onCancel: () => void;
}

function formatTime(date: Date) {
  return `${date.getHours()}:${String(date.getMinutes()).padStart(2, "0")}`;
}

function EditableTranscript({
  initialValue,
  onSave,
  onCancel,
  width,
}: EditableTranscriptProps) {
  const [value, setValue] = useState(initialValue);

  const handleKeyDown = (e: React.KeyboardEvent) => {
    if (e.key === "Enter" && !e.shiftKey) {
      e.preventDefault();
      onSave(value);
    } else if (e.key === "Escape") {
      e.preventDefault();
      onCancel();
    }
  };

  return (
    <TextareaAutosize
      value={value}
      onChange={(e) => setValue(e.target.value)}
      onKeyDown={handleKeyDown}
      autoFocus
      style={{
        width: width || "100%",
        border: "none",
        outline: "none",
        resize: "none",
        padding: "0",
        fontFamily: "inherit",
        fontSize: "inherit",
        letterSpacing: "inherit",
        backgroundColor: "transparent",
      }}
    />
  );
}

function TranscriptViewerOptions({ viewer }: { viewer: Viewer }) {
  const autoScroll = useStore($autoScroll);
  const toCorrect = useStore($autocorrectables).length;

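  // Keyboard shortcuts: "s" toggles auto-scroll; "x" clicks the Autocorrect
  // button below, scrolling the first pending correction into view.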
  useEffect(() => {
    const onKeyDown = (e: KeyboardEvent) => {
      // Do not process keydown events when editing a text area.
      if (document.activeElement instanceof HTMLTextAreaElement) return;
      console.log(e.key);
      if (e.key === "s") {
        $autoScroll.set(!$autoScroll.get());
      }
      if (e.key === "x") {
        document.querySelector<HTMLElement>("#autoCorrectables")?.click();
      }
    };
    window.addEventListener("keydown", onKeyDown);
    return () => {
      window.removeEventListener("keydown", onKeyDown);
    };
  }, []);

  return (
    <div>
      <label>
        <input
          type="checkbox"
          checked={autoScroll}
          onChange={() => $autoScroll.set(!autoScroll)}
        />{" "}
        Auto-scroll
      </label>
      {viewer.editable && (
        <button
          id="autoCorrectables"
          onClick={(e) => {
            if (e.altKey) {
              const before = $autoCorrects.get();
              const after = prompt("Autocorrects", before);
              if (after != null) {
                $autoCorrects.set(after);
              }
            } else {
              $autocorrectables
                .get()[0]
                ?.current?.scrollIntoView({ behavior: "instant" });
            }
          }}
        >
          Autocorrect ({toCorrect})
        </button>
      )}
      <button
        onClick={() => {
          const tsvContent = exportTsv(viewer);
          navigator.clipboard.writeText(tsvContent);
        }}
      >
        Copy TSV
      </button>
    </div>
  );
}

function exportTsv(viewer: Viewer) {
  const items = viewer.$items.get();
  const tsvContent = items
    .map((item) => {
      const state = item.$state.get();
      return `${state.start}\t${state.finish}\t${state.transcript || ""}`;
    })
    .join("\n");
  return tsvContent;
}
--------------------------------------------------------------------------------
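A note on the protocol consumed above: /rooms/:room/publicEvents delivers JSON frames of the shape { method, params }, where "updated" carries a full ItemState and "partial_transcript" carries { id, transcript } (shapes inferred from the handler in createViewer; the backend may emit other methods). Under those assumptions, a minimal headless consumer could look like this sketch, using Node 22's global WebSocket; the backend URL and room name are hypothetical placeholders:

// tail-transcript.ts — hypothetical headless consumer of publicEvents.
const backend = process.env.BACKEND ?? "http://localhost:3000"; // placeholder
const room = process.env.ROOM ?? "demo"; // placeholder

const ws = new WebSocket(
  `${backend.replace(/^http/, "ws")}/rooms/${room}/publicEvents`
);
ws.onmessage = (e) => {
  const { method, params } = JSON.parse(String(e.data));
  if (method === "updated" && params.transcript) {
    // A finalized (or manually corrected) utterance.
    console.log(`[${params.start}] ${params.transcript}`);
  } else if (method === "partial_transcript") {
    // In-progress recognition for the utterance `params.id`.
    process.stdout.write(`… ${params.transcript}\n`);
  }
};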
/src/components/AudioSender.tsx:
--------------------------------------------------------------------------------
import { useStore } from "@nanostores/react";
import { encode } from "@stablelib/base64";
import { atom, computed } from "nanostores";
import ReconnectingWebSocket from "reconnecting-websocket";
import type { BackendContext } from "../BackendContext";
import {
  $activationThreshold,
  $deactivationThreshold,
  $decayEasing,
  $maxLength as $maxAudioLength,
  $minimumPeak as $minimumLevel,
} from "../knobs";
import { log } from "../logbus";
import { LogViewer } from "./LogViewer";

import { FrameProcessor, NonRealTimeVAD } from "@ricky0123/vad-web";

async function initVad() {
  const vad = await NonRealTimeVAD.new({
    modelURL:
      "https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.19/dist/silero_vad.onnx",
    ortConfig(ort) {
      ort.env.wasm.wasmPaths =
        "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.19.2/dist/";
    },
  });
  return vad;
}

let audioContext: AudioContext | null = null;
function getAudioContext() {
  return (audioContext ??= new AudioContext({ sampleRate: 16000 }));
}

export function AudioSender() {
  const params = new URLSearchParams(window.location.search);

  const backend = params.get("backend");
  const room = params.get("room");
  const key = params.get("key");
  if (!backend || !room || !key) {
    return <p>Missing parameters</p>;
  }

  const deviceId = params.get("deviceId");
  if (!deviceId) {
    return <AudioDeviceSelector />;
  }

  const backendContext: BackendContext = { backend, room, key };
  return (
    <AudioSenderView deviceId={deviceId} backendContext={backendContext} />
  );
}

const $devices = atom<MediaDeviceInfo[]>([]);

const getDeviceList = async () => {
  try {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    stream.getTracks().forEach((track) => track.stop());

    const deviceList = await navigator.mediaDevices.enumerateDevices();
    const audioInputDevices = deviceList.filter(
      (device) => device.kind === "audioinput"
    );

    $devices.set(audioInputDevices);
  } catch (error) {
    console.error("Error getting device list:", error);
  }
};

function AudioDeviceSelector() {
  const devices = useStore($devices);
  return (
    <>
      <h1>Select device</h1>
      <p>
        <button onClick={getDeviceList}>
          {devices.length > 0 ? "Refresh" : "Get"} device list
        </button>
      </p>
      <ul>
        {devices.map((device) => (
          <li key={device.deviceId}>
            <a href={`${location.search}&deviceId=${device.deviceId}`}>
              {device.label}
            </a>
          </li>
        ))}
      </ul>
    </>
  );
}

function createAudioSenderController(options: {
  backendContext: BackendContext;
  deviceId: string;
  log: (message: string) => void;
}) {
  const { log, backendContext } = options;
  const $level = atom(0);
  const $realMax = atom(0);
  const $effectiveMax = computed(
    [$realMax, $minimumLevel],
    (realMax, minimumLevel) => Math.max(realMax, minimumLevel / 100)
  );
  const $current = atom(0);
  const $active = atom<string | null>(null);
  const $socketStatus = atom<"disconnected" | "authenticating" | "connected">(
    "disconnected"
  );
  const unackedMessages = new Map<string, string>();
  const $pendingEventCount = atom(0);
  const $started = atom(false);

  const vadPromise = initVad();
  const $vad = atom<NonRealTimeVAD | null>(null);
  vadPromise.then((vad) => {
    $vad.set(vad);
    // With the VAD, `level` is a speech probability (0..1), so pin the
    // reference max at 1 (= 100 / 100) and switch to probability thresholds.
    $minimumLevel.set(100);
    $activationThreshold.set(0.5);
    $deactivationThreshold.set(0.35);
  });

  let currentBlockCount = 0;
  type SocketEvent =
    | { method: "start"; params: { localTime: string } }
    | { method: "audio"; params: { data: string } }
    | { method: "stop" };
  let onEvent: (event: SocketEvent) => void = () => {};

  async function start() {
    if ($started.get()) return;
    $started.set(true);
    await Promise.all([startAudio(), startWebsocket()]);
  }

  async function startAudio() {
    const audioContext = getAudioContext();

    const workletCode = `
      class AudioSenderProcessor extends AudioWorkletProcessor {
        constructor() {
          super();
          this.buffer = new Float32Array(1024);
          this.bufferIndex = 0;
        }

        process(inputs) {
          const input = inputs[0];
          if (input.length > 0) {
            const inputData = input[0];
            for (let i = 0; i < inputData.length; i++) {
              this.buffer[this.bufferIndex++] = inputData[i];

              if (this.bufferIndex === this.buffer.length) {
                const outputData = new Int16Array(this.buffer.length);
                for (let j = 0; j < this.buffer.length; j++) {
                  const s = Math.max(-1, Math.min(1, this.buffer[j]));
                  outputData[j] = s < 0 ? s * 0x8000 : s * 0x7FFF;
                }
                this.port.postMessage(outputData.buffer, [outputData.buffer]);
                this.bufferIndex = 0;
              }
            }
          }
          return true;
        }
      }
      registerProcessor('audio-sender-processor', AudioSenderProcessor);
    `;
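    // The worklet above batches mic input into 1024-sample blocks (64 ms at
    // 16 kHz) and converts float samples to 16-bit PCM before posting each
    // buffer to the main thread.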
    try {
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          deviceId: { exact: options.deviceId },
          echoCancellation: false,
          noiseSuppression: false,
          autoGainControl: false,
          channelCount: 1,
          sampleRate: 16000,
        },
      });

      // Add the AudioWorklet module
      const blob = new Blob([workletCode], { type: "application/javascript" });
      const workletUrl = URL.createObjectURL(blob);
      await audioContext.audioWorklet.addModule(workletUrl);

      const source = audioContext.createMediaStreamSource(stream);
      const workletNode = new AudioWorkletNode(
        audioContext,
        "audio-sender-processor"
      );
      source.connect(workletNode);

      workletNode.port.onmessage = async (event) => {
        const data = new Int16Array(event.data);
        let level = 0;
        if ($vad.get()) {
          const vad = $vad.get()!;
          // Convert Int16Array to Float32Array
          const floatData = new Float32Array(data.length);
          for (let i = 0; i < data.length; i++) {
            floatData[i] = data[i] / 32768;
          }
          const result = await (
            vad.frameProcessor as FrameProcessor
          ).modelProcessFunc(floatData);
          level = result.isSpeech;
        } else {
          // Calculate RMS
          let sum = 0;
          for (let i = 0; i < data.length; i++) {
            sum += (data[i] / 32768) ** 2;
          }
          level = Math.sqrt(sum / data.length) * Math.sqrt(2);
        }
        $level.set(level);
        // Track a slowly-decaying peak; the activation thresholds are
        // relative to this peak (or to the configured minimum, if higher).
        if (level > $realMax.get()) {
          $realMax.set(level);
        } else {
          $realMax.set($realMax.get() * 0.995);
        }
        if (level > $current.get()) {
          $current.set(level);
        } else if ($active.get()) {
          // While active, let the envelope decay faster as the utterance
          // approaches the maximum length, eventually forcing a cutoff.
          const maxSamples = $maxAudioLength.get() * 16000;
          const maxBlocks = maxSamples / 1024;
          const progress =
            Math.min(1, currentBlockCount / maxBlocks) ** $decayEasing.get();
          const decayRate = 0.99 - progress * 0.5;
          $current.set($current.get() * decayRate);
        } else {
          $current.set(level);
        }
        if (!$active.get()) {
          const threshold = $effectiveMax.get() * $activationThreshold.get();
          if ($current.get() > threshold) {
            const id = `au${Date.now()}`;
            $active.set(id);
            currentBlockCount = 0;
            onEvent({
              method: "start",
              params: { localTime: new Date().toISOString() },
            });
            log(`Utterance started`);
          }
        } else if ($active.get()) {
          const threshold = $effectiveMax.get() * $deactivationThreshold.get();
          if ($current.get() < threshold) {
            $active.set(null);
            onEvent({ method: "stop" });
            const samples = currentBlockCount * 1024;
            const duration = samples / 16000;
            log(`Utterance finished, duration: ${duration.toFixed(2)}s`);
          } else {
            currentBlockCount++;
          }
        }
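        // Only audio captured while an utterance is active is sent; silence
        // between utterances never leaves the page.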
        if ($active.get()) {
          // Convert data into base64-encoded string.
          const base64 = encode(new Uint8Array(event.data));
          onEvent({ method: "audio", params: { data: base64 } });
        }
      };
    } catch (error) {
      options.log(`Error in audio sender: ${error}`);
    }
  }

  async function startWebsocket() {
    const { backend, room, key } = backendContext;
    const socket = new ReconnectingWebSocket(
      `${backend.replace(/^http/, "ws")}/rooms/${room}/audioIngest?key=${key}`
    );
    socket.onopen = () => {
      log("WebSocket connected");
      $socketStatus.set("authenticating");
    };
    socket.onmessage = (event) => {
      const data = JSON.parse(event.data);
      if (data.method === "welcome") {
        log("Received welcome message");
        $socketStatus.set("connected");
        // Re-send anything the server did not acknowledge before the last
        // disconnect.
        for (const message of unackedMessages.values()) {
          socket.send(message);
        }
      }
      if (data.id && unackedMessages.has(data.id)) {
        unackedMessages.delete(data.id);
        $pendingEventCount.set(unackedMessages.size);
      }
    };
    onEvent = (event) => {
      const id = crypto.randomUUID();
      const payload = JSON.stringify({ id, ...event });
      socket.send(payload);
      unackedMessages.set(id, payload);
      $pendingEventCount.set(unackedMessages.size);
    };
    socket.onclose = (event) => {
      log(`WebSocket disconnected: ${event.reason}`);
      $socketStatus.set("disconnected");
    };
  }

  return {
    $level,
    $max: $effectiveMax,
    $current,
    $active,
    start,
    $pendingEventCount,
    $started,
    $socketStatus,
  };
}

const levelToDb = (level: number) => 20 * Math.log10(level);
const levelToX = (level: number) => {
  const db = levelToDb(level);
  const x = Math.max(0, Math.min(1, (db + 100) / 100));
  return x;
};

type AudioSenderController = ReturnType<typeof createAudioSenderController>;
let _sender: AudioSenderController | undefined;

function AudioSenderView(props: {
  deviceId: string;
  backendContext: BackendContext;
}) {
  const sender = (_sender ??= createAudioSenderController({
    backendContext: props.backendContext,
    deviceId: props.deviceId,
    log: log,
  }));
  return (
    <>
      <StartButton sender={sender} />
      <LevelMeter sender={sender} />
      <Knobs />
      <StatusInspector sender={sender} />
      <LogViewer />
    </>
  );
}

function StartButton(props: { sender: AudioSenderController }) {
  const sender = props.sender;
  const started = useStore(sender.$started);
  return (
    <p>
      <button disabled={started} onClick={() => sender.start()}>
        Start
      </button>
    </p>
  );
}

function StatusInspector(props: { sender: AudioSenderController }) {
  const status = useStore(props.sender.$socketStatus);
  const count = useStore(props.sender.$pendingEventCount);
  return (
    <p>
      Socket status: {status}
      <br />
      Pending events: {count}
    </p>
  );
}

function LevelMeter(props: { sender: AudioSenderController }) {
  const level = useStore(props.sender.$level);
  const max = useStore(props.sender.$max);
  const current = useStore(props.sender.$current);
  const active = useStore(props.sender.$active);
  const threshold =
    max * (active ? $deactivationThreshold.get() : $activationThreshold.get());
  return (
    <div style={{ position: "relative", height: "24px", background: "#222" }}>
      <div
        style={{
          position: "absolute",
          inset: 0,
          width: `${levelToX(level) * 100}%`,
          background: active ? "#385" : "#456",
        }}
      />
      <div
        style={{
          position: "absolute",
          top: 0,
          bottom: 0,
          left: `${levelToX(current) * 100}%`,
          width: "2px",
          background: "#fff",
        }}
      />
      <div
        style={{
          position: "absolute",
          top: 0,
          bottom: 0,
          left: `${levelToX(threshold) * 100}%`,
          width: "2px",
          background: "#fa0",
        }}
      />
      <div
        style={{
          position: "absolute",
          top: 0,
          bottom: 0,
          left: `${levelToX(max) * 100}%`,
          width: "2px",
          background: "#f55",
        }}
      />
    </div>
  );
}

function Knobs() {
  return (
    <div>
      <NumberKnob label="Minimum peak" step="0.1" $value={$minimumLevel} />
      <NumberKnob
        label="Activation threshold"
        step="0.05"
        $value={$activationThreshold}
      />
      <NumberKnob
        label="Deactivation threshold"
        step="0.05"
        $value={$deactivationThreshold}
      />
      <NumberKnob label="Max length" step="1" $value={$maxAudioLength} />
      <NumberKnob label="Decay easing" step="0.05" $value={$decayEasing} />
    </div>
  );
}

function NumberKnob({
  label,
  step,
  $value,
}: {
  label: string;
  step: string;
  $value: any;
}) {
  const value = useStore($value);

  const handleChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    $value.set(parseFloat(e.target.value));
  };

  return (
    <p>
      <label>
        {label}{" "}
        <input
          type="number"
          step={step}
          value={value}
          onChange={handleChange}
        />
      </label>
    </p>
  );
}
--------------------------------------------------------------------------------
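Aside: the audio pipeline above is raw 16-bit mono PCM at 16 kHz end to end, and the viewer plays utterances back from GET /pcm/:id. Assuming that endpoint returns the same raw PCM stream (only the route name is visible from the frontend code), a small script can wrap a saved response in the canonical 44-byte WAV header so it opens in ordinary audio tools. A sketch:

// pcm2wav.ts — hypothetical helper: wrap raw 16-bit/16 kHz/mono PCM in a
// WAV container. Usage: node pcm2wav.ts input.pcm output.wav
import { readFileSync, writeFileSync } from "node:fs";

const [input, output] = process.argv.slice(2);
const pcm = readFileSync(input);

const sampleRate = 16000; // matches the AudioContext above (assumed for /pcm)
const channels = 1;
const bytesPerSample = 2;

const header = Buffer.alloc(44);
header.write("RIFF", 0);
header.writeUInt32LE(36 + pcm.length, 4);
header.write("WAVE", 8);
header.write("fmt ", 12);
header.writeUInt32LE(16, 16); // fmt chunk size
header.writeUInt16LE(1, 20); // audio format: PCM
header.writeUInt16LE(channels, 22);
header.writeUInt32LE(sampleRate, 24);
header.writeUInt32LE(sampleRate * channels * bytesPerSample, 28); // byte rate
header.writeUInt16LE(channels * bytesPerSample, 32); // block align
header.writeUInt16LE(8 * bytesPerSample, 34); // bits per sample
header.write("data", 36);
header.writeUInt32LE(pcm.length, 40);

writeFileSync(output, Buffer.concat([header, pcm]));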
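And the ingest side, as implied by createAudioSenderController: the client connects to /rooms/:room/audioIngest?key=…, waits for a { method: "welcome" } frame, then sends { id, method, params } events (start, audio, stop), treating any frame echoing an id as an acknowledgement and re-sending unacknowledged events after reconnects. A minimal smoke-test sender under those assumptions (Node 22 global WebSocket; URL, room, and key are placeholders):

// send-silence.ts — hypothetical smoke test for the audioIngest endpoint.
const backend = process.env.BACKEND ?? "ws://localhost:3000"; // placeholder
const room = process.env.ROOM ?? "demo"; // placeholder
const key = process.env.KEY ?? "";

const ws = new WebSocket(`${backend}/rooms/${room}/audioIngest?key=${key}`);
const pending = new Map<string, string>();

function send(method: string, params?: unknown) {
  const id = crypto.randomUUID();
  const payload = JSON.stringify({ id, method, params });
  pending.set(id, payload);
  ws.send(payload);
}

ws.onmessage = (e) => {
  const data = JSON.parse(String(e.data));
  if (data.method === "welcome") {
    send("start", { localTime: new Date().toISOString() });
    // One second of 16 kHz silence, sent in the sender's 1024-sample blocks.
    const block = Buffer.alloc(1024 * 2).toString("base64");
    for (let i = 0; i < 16; i++) send("audio", { data: block });
    send("stop");
  } else if (data.id) {
    pending.delete(data.id); // acknowledged by the server
    if (pending.size === 0) ws.close();
  }
};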