├── tsconfig.json
├── .gitignore
├── lib
    ├── utils.ts
    ├── types.ts
    ├── baseline-transcriber.ts
    ├── results-storage.ts
    ├── metrics-calculator.ts
    ├── audio-fetcher.ts
    ├── report-generator.ts
    └── streaming-transcribers.ts
├── package.json
├── README.md
└── index.ts


/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     "target": "ES2022",
 4 |     "module": "NodeNext",
 5 |     "moduleResolution": "NodeNext",
 6 |     "esModuleInterop": true,
 7 |     "types": ["node"],
 8 |     "lib": ["ES2022"],
 9 |     "forceConsistentCasingInFileNames": true,
10 |     "strict": true,
11 |     "skipLibCheck": true
12 |   }
13 | }
14 | 
15 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Dependencies
 2 | node_modules/
 3 | 
 4 | # Output
 5 | output/
 6 | dist/
 7 | build/
 8 | 
 9 | # Environment variables
10 | .env
11 | .env.local
12 | .env.*.local
13 | 
14 | # Logs
15 | logs
16 | *.log
17 | npm-debug.log*
18 | yarn-debug.log*
19 | yarn-error.log*
20 | 
21 | # IDE
22 | .vscode/
23 | .idea/
24 | *.swp
25 | *.swo
26 | *~
27 | 
28 | # OS
29 | .DS_Store
30 | Thumbs.db
31 | 
32 | # TypeScript
33 | *.tsbuildinfo
34 | 


--------------------------------------------------------------------------------
/lib/utils.ts:
--------------------------------------------------------------------------------
 1 | import dotenv from 'dotenv'
 2 | import path from 'path'
 3 | import fs from 'fs'
 4 | import { fileURLToPath } from 'url'
 5 | 
 6 | export function loadEnv() {
 7 |   // Try to load from .env.local in root first, then .env in root
 8 |   const moduleDir = path.dirname(fileURLToPath(import.meta.url))
 9 |   const rootDir = path.resolve(moduleDir, '../../')
10 |   const envLocalPath = path.join(rootDir, '.env.local')
11 |   const envPath = path.join(rootDir, '.env')
12 | 
13 |   if (fs.existsSync(envLocalPath)) {
14 |     dotenv.config({ path: envLocalPath })
15 |   } else if (fs.existsSync(envPath)) {
16 |     dotenv.config({ path: envPath })
17 |   }
18 | }
19 | 
20 | export function getEnvVar(key: string, required = false): string {
21 |   const value = process.env[key]
22 |   if (required && !value) {
23 |     throw new Error(`Missing required environment variable: ${key}`)
24 |   }
25 |   return value || ''
26 | }
27 | 
28 | export function sleep(ms: number) {
29 |   return new Promise((resolve) => setTimeout(resolve, ms))
30 | }
31 | 
32 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "asr-benchmark",
 3 |   "version": "1.0.0",
 4 |   "description": "ASR Benchmarking Tool for VAPI Calls",
 5 |   "type": "module",
 6 |   "main": "index.ts",
 7 |   "scripts": {
 8 |     "benchmark": "tsx index.ts"
 9 |   },
10 |   "dependencies": {
11 |     "@aws-sdk/client-transcribe-streaming": "^3.696.0",
12 |     "@deepgram/sdk": "^3.9.0",
13 |     "@google-cloud/speech": "^6.7.0",
14 |     "assemblyai": "^4.8.0",
15 |     "chalk": "^5.3.0",
16 |     "cli-progress": "^3.12.0",
17 |     "commander": "^12.1.0",
18 |     "csv-stringify": "^6.5.1",
19 |     "dotenv": "^16.4.5",
20 |     "fast-levenshtein": "^3.0.0",
21 |     "fs-extra": "^11.2.0",
22 |     "openai": "^4.73.0",
23 |     "pg": "^8.13.1",
24 |     "postgres": "^3.4.5",
25 |     "tsx": "^4.19.2",
26 |     "zod": "^3.23.8"
27 |   },
28 |   "devDependencies": {
29 |     "@types/cli-progress": "^3.11.6",
30 |     "@types/fast-levenshtein": "^0.0.2",
31 |     "@types/fs-extra": "^11.0.4",
32 |     "@types/node": "^22.10.1",
33 |     "@types/pg": "^8.11.10",
34 |     "typescript": "^5.6.3"
35 |   }
36 | }
37 | 
38 | 


--------------------------------------------------------------------------------
/lib/types.ts:
--------------------------------------------------------------------------------
 1 | export interface TranscriptionResult {
 2 |   transcript: string
 3 |   timeToFirstWord: number
 4 |   timeToCompletion: number
 5 |   metadata: Record<string, any>
 6 |   words?: Array<{
 7 |     word: string
 8 |     start: number
 9 |     end: number
10 |     confidence: number
11 |   }>
12 | }
13 | 
14 | export interface StreamingTranscriber {
15 |   name: string
16 |   transcribe(audioPath: string): Promise<TranscriptionResult>
17 | }
18 | 
19 | export interface MetricResult {
20 |   wer: number
21 |   cer: number
22 |   similarity: number
23 |   latency: {
24 |     firstWord: number
25 |     completion: number
26 |   }
27 | }
28 | 
29 | export interface CallResult {
30 |   callId: string
31 |   audioPath: string
32 |   baselineTranscript: string
33 |   results: Record<string, {
34 |     transcript: string
35 |     metrics: MetricResult
36 |   }>
37 | }
38 | 
39 | export interface BenchmarkResult {
40 |   timestamp: string
41 |   baselineModel: string
42 |   callsAnalyzed: number
43 |   models: Array<{
44 |     name: string
45 |     avgWER: number
46 |     avgCER: number
47 |     avgSimilarity: number
48 |     avgTimeToFirstWord: number
49 |     avgTimeToCompletion: number
50 |   }>
51 |   perCallResults: CallResult[]
52 | }
53 | 
54 | export interface ASRConfig {
55 |   openaiApiKey?: string
56 |   deepgramApiKey?: string
57 |   assemblyAiApiKey?: string
58 |   googleCredentialsPath?: string
59 |   awsAccessKeyId?: string
60 |   awsSecretAccessKey?: string
61 |   awsRegion?: string
62 | }
63 | 
64 | 


--------------------------------------------------------------------------------
/lib/baseline-transcriber.ts:
--------------------------------------------------------------------------------
 1 | import OpenAI from 'openai'
 2 | import fs from 'fs-extra'
 3 | import { getEnvVar } from './utils.js'
 4 | import { TranscriptionResult } from './types.js'
 5 | 
 6 | export class BaselineTranscriber {
 7 |   private openai: OpenAI
 8 | 
 9 |   constructor() {
10 |     const apiKey = getEnvVar('OPENAI_API_KEY', true)
11 |     this.openai = new OpenAI({ apiKey })
12 |   }
13 | 
14 |   async transcribe(audioPath: string): Promise<TranscriptionResult> {
15 |     const startTime = Date.now()
16 |     
17 |     const fileStream = fs.createReadStream(audioPath)
18 | 
19 |     const response = await this.openai.audio.transcriptions.create({
20 |       file: fileStream,
21 |       model: 'whisper-1', // Using 'whisper-1' which is typically Large v2/v3 on API
22 |       response_format: 'verbose_json',
23 |       timestamp_granularities: ['word'],
24 |     })
25 | 
26 |     const endTime = Date.now()
27 |     const timeToCompletion = endTime - startTime
28 | 
29 |     // Determine first word time from words if available
30 |     let timeToFirstWord = 0
31 |     if (response.words && response.words.length > 0) {
32 |        // API returns seconds, convert to ms
33 |        timeToFirstWord = response.words[0].start * 1000
34 |     } else {
35 |        // Fallback if no words (shouldn't happen with verbose_json + word granularity)
36 |        timeToFirstWord = timeToCompletion // rough estimate if not streaming
37 |     }
38 | 
39 |     return {
40 |       transcript: response.text,
41 |       timeToFirstWord,
42 |       timeToCompletion,
43 |       metadata: {
44 |         model: 'whisper-1',
45 |         duration: response.duration,
46 |       },
47 |       words: response.words?.map((w: any) => ({
48 |         word: w.word,
49 |         start: w.start,
50 |         end: w.end,
51 |         confidence: 1.0 // OpenAI doesn't always return confidence per word in all modes, assume 1 for baseline or check schema
52 |       }))
53 |     }
54 |   }
55 | }
56 | 
57 | 


--------------------------------------------------------------------------------
/lib/results-storage.ts:
--------------------------------------------------------------------------------
 1 | import fs from 'fs-extra'
 2 | import path from 'path'
 3 | import { stringify } from 'csv-stringify/sync'
 4 | import { BenchmarkResult, CallResult } from './types.js'
 5 | 
 6 | type CsvRow = {
 7 |   callId: string
 8 |   model: string
 9 |   wer: string
10 |   cer: string
11 |   similarity: string
12 |   timeToFirstWord: number
13 |   timeToCompletion: number
14 |   transcriptLength: number
15 | }
16 | 
17 | export class ResultsStorage {
18 |   private outputDir: string
19 | 
20 |   constructor(baseOutputDir: string) {
21 |     const timestamp = new Date().toISOString().replace(/[:.]/g, '-')
22 |     this.outputDir = path.join(baseOutputDir, timestamp)
23 |   }
24 | 
25 |   async init() {
26 |     await fs.ensureDir(this.outputDir)
27 |     await fs.ensureDir(path.join(this.outputDir, 'transcripts'))
28 |     return this.outputDir
29 |   }
30 | 
31 |   async saveResults(results: BenchmarkResult) {
32 |     // 1. Save full JSON
33 |     await fs.writeJSON(path.join(this.outputDir, 'results.json'), results, { spaces: 2 })
34 | 
35 |     // 2. Save individual transcripts
36 |     for (const callResult of results.perCallResults) {
37 |       await fs.writeJSON(
38 |         path.join(this.outputDir, 'transcripts', `${callResult.callId}.json`),
39 |         callResult,
40 |         { spaces: 2 }
41 |       )
42 |     }
43 | 
44 |     // 3. Save CSV Summary
45 |     const csvRows: CsvRow[] = results.perCallResults.flatMap((call: CallResult) => {
46 |       const modelEntries = Object.entries(call.results) as Array<
47 |         [string, CallResult['results'][string]]
48 |       >
49 | 
50 |       return modelEntries.map(([modelName, data]) => ({
51 |         callId: call.callId,
52 |         model: modelName,
53 |         wer: data.metrics.wer.toFixed(4),
54 |         cer: data.metrics.cer.toFixed(4),
55 |         similarity: data.metrics.similarity.toFixed(4),
56 |         timeToFirstWord: data.metrics.latency.firstWord,
57 |         timeToCompletion: data.metrics.latency.completion,
58 |         transcriptLength: data.transcript.length
59 |       }))
60 |     })
61 | 
62 |     const csvOutput = stringify(csvRows, {
63 |       header: true,
64 |       columns: ['callId', 'model', 'wer', 'cer', 'similarity', 'timeToFirstWord', 'timeToCompletion', 'transcriptLength']
65 |     })
66 | 
67 |     await fs.writeFile(path.join(this.outputDir, 'summary.csv'), csvOutput)
68 |   }
69 | 
70 |   getOutputDir() {
71 |     return this.outputDir
72 |   }
73 | }
74 | 
75 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ASR Benchmarking Tool
 2 | 
 3 | A tool to benchmark streaming ASR providers (Deepgram, AssemblyAI, Google, AWS) against OpenAI Whisper Large v3.
 4 | 
 5 | ## Features
 6 | 
 7 | - **Baseline Comparison**: Compares all models against Whisper Large v3 (via OpenAI API).
 8 | - **Metrics**: WER (Word Error Rate), CER (Character Error Rate), Semantic Similarity, and Latency (Time to First Word).
 9 | - **Visual Report**: Generates an HTML report with charts and transcript diffs.
10 | - **Parallel Processing**: Runs multiple models concurrently for efficiency.
11 | 
12 | ## Setup
13 | 
14 | 1. **Install Dependencies**:
15 |    ```bash
16 |    cd eval
17 |    npm install
18 |    ```
19 | 
20 | 2. **Environment Variables**:
21 |    Ensure your root `.env` or `.env.local` contains the following keys:
22 |    
23 |    ```env
24 |    # Database (to fetch call recordings)
25 |    DATABASE_URL=postgresql://...
26 | 
27 |    # ASR Providers
28 |    OPENAI_API_KEY=sk-...
29 |    DEEPGRAM_API_KEY=...
30 |    ASSEMBLYAI_API_KEY=...
31 |    
32 |    # AWS Transcribe
33 |    AWS_ACCESS_KEY_ID=...
34 |    AWS_SECRET_ACCESS_KEY=...
35 |    AWS_REGION=us-east-1
36 |    
37 |    # Google Cloud Speech
38 |    # Ensure GOOGLE_APPLICATION_CREDENTIALS points to your JSON key file
39 |    # GOOGLE_APPLICATION_CREDENTIALS=path/to/key.json
40 |    ```
41 | 
42 | ## Usage
43 | 
44 | Run the benchmark from the project root:
45 | 
46 | ```bash
47 | npm run benchmark:asr -- -w <WORKSPACE_ID> [options]
48 | ```
49 | 
50 | Or from the `eval` directory:
51 | 
52 | ```bash
53 | npm run benchmark -- -w <WORKSPACE_ID> [options]
54 | ```
55 | 
56 | ### Options
57 | 
58 | - `-w, --workspace-id <id>` (Required) Workspace ID to fetch calls from.
59 | - `-n, --count <number>` (Default: 10) Number of most recent calls to analyze.
60 | - `-a, --agent-id <id>` Filter by specific Agent ID.
61 | - `-o, --output <path>` Custom output directory (default: `eval/output`).
62 | - `--models <names>` Comma-separated list of models to test (default: all).
63 |   - Available: `Deepgram Nova 3`, `Deepgram Nova 2`, `AssemblyAI Streaming`, `Google Streaming`, `AWS Transcribe Streaming`
64 | 
65 | ### VAPI API Source
66 | 
67 | You can fetch calls directly from the VAPI API instead of the database:
68 | 
69 | ```bash
70 | npm run benchmark:asr -- --source vapi --agent-id <VAPI_AGENT_ID> --min-duration 120
71 | ```
72 | 
73 | - `--source vapi`: Switch to VAPI API source.
74 | - `--agent-id <id>`: Required for VAPI source.
75 | - `--min-duration <seconds>`: Filter calls shorter than this duration (e.g., 120 for 2 minutes).
76 | - `-w, --workspace-id` is NOT required when using VAPI source.
77 | 
78 | ### Example
79 | 
80 | ```bash
81 | # Benchmark last 5 calls for Deepgram and AssemblyAI
82 | npm run benchmark:asr -- -w workspace_123 -n 5 --models "Deepgram Nova 3,AssemblyAI Streaming"
83 | ```
84 | 
85 | ## Output
86 | 
87 | Results are saved in `eval/output/<timestamp>/`:
88 | - `report.html`: Interactive visualization.
89 | - `summary.csv`: Raw metrics for analysis.
90 | - `results.json`: Full detailed JSON data.
91 | - `transcripts/`: Individual JSON result per call.
92 | 
93 | 


--------------------------------------------------------------------------------
/lib/metrics-calculator.ts:
--------------------------------------------------------------------------------
  1 | import levenshtein from 'fast-levenshtein'
  2 | import OpenAI from 'openai'
  3 | import { getEnvVar } from './utils.js'
  4 | import { MetricResult } from './types.js'
  5 | 
  6 | export class MetricsCalculator {
  7 |   private openai: OpenAI
  8 | 
  9 |   constructor() {
 10 |     this.openai = new OpenAI({ apiKey: getEnvVar('OPENAI_API_KEY', true) })
 11 |   }
 12 | 
 13 |   normalizeText(text: string): string {
 14 |     return text
 15 |       .toLowerCase()
 16 |       .replace(/[^\w\s]/g, '') // Remove punctuation
 17 |       .replace(/\s+/g, ' ')    // Collapse whitespace
 18 |       .trim()
 19 |   }
 20 | 
 21 |   calculateWER(reference: string, hypothesis: string): number {
 22 |     const refWords = this.normalizeText(reference).split(' ')
 23 |     const hypWords = this.normalizeText(hypothesis).split(' ')
 24 |     
 25 |     if (refWords.length === 0) return hypWords.length > 0 ? 1.0 : 0.0
 26 | 
 27 |     const distance = this.calculateTokenDistance(refWords, hypWords)
 28 |     return distance / refWords.length
 29 |   }
 30 | 
 31 |   calculateCER(reference: string, hypothesis: string): number {
 32 |     const refNorm = this.normalizeText(reference)
 33 |     const hypNorm = this.normalizeText(hypothesis)
 34 |     
 35 |     if (refNorm.length === 0) return hypNorm.length > 0 ? 1.0 : 0.0
 36 | 
 37 |     const distance = levenshtein.get(refNorm, hypNorm)
 38 |     return distance / refNorm.length
 39 |   }
 40 | 
 41 |   private calculateTokenDistance(reference: string[], hypothesis: string[]): number {
 42 |     const rows = reference.length + 1
 43 |     const cols = hypothesis.length + 1
 44 |     const dp: number[][] = Array.from({ length: rows }, () => Array(cols).fill(0))
 45 | 
 46 |     for (let i = 0; i < rows; i++) dp[i][0] = i
 47 |     for (let j = 0; j < cols; j++) dp[0][j] = j
 48 | 
 49 |     for (let i = 1; i < rows; i++) {
 50 |       for (let j = 1; j < cols; j++) {
 51 |         const cost = reference[i - 1] === hypothesis[j - 1] ? 0 : 1
 52 |         dp[i][j] = Math.min(
 53 |           dp[i - 1][j] + 1,
 54 |           dp[i][j - 1] + 1,
 55 |           dp[i - 1][j - 1] + cost
 56 |         )
 57 |       }
 58 |     }
 59 | 
 60 |     return dp[rows - 1][cols - 1]
 61 |   }
 62 | 
 63 |   async calculateSimilarity(reference: string, hypothesis: string): Promise<number> {
 64 |     if (!reference || !hypothesis) return 0
 65 | 
 66 |     try {
 67 |       const response = await this.openai.embeddings.create({
 68 |         model: 'text-embedding-3-small',
 69 |         input: [reference, hypothesis],
 70 |       })
 71 | 
 72 |       const vecA = response.data[0].embedding
 73 |       const vecB = response.data[1].embedding
 74 | 
 75 |       return this.cosineSimilarity(vecA, vecB)
 76 |     } catch (error) {
 77 |       console.error('Error calculating similarity:', error)
 78 |       return 0
 79 |     }
 80 |   }
 81 | 
 82 |   private cosineSimilarity(vecA: number[], vecB: number[]): number {
 83 |     let dotProduct = 0
 84 |     let normA = 0
 85 |     let normB = 0
 86 |     for (let i = 0; i < vecA.length; i++) {
 87 |       dotProduct += vecA[i] * vecB[i]
 88 |       normA += vecA[i] * vecA[i]
 89 |       normB += vecB[i] * vecB[i]
 90 |     }
 91 |     return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB))
 92 |   }
 93 | 
 94 |   async calculateAll(
 95 |     reference: string, 
 96 |     hypothesis: string, 
 97 |     latency: { firstWord: number; completion: number }
 98 |   ): Promise<MetricResult> {
 99 |     return {
100 |       wer: this.calculateWER(reference, hypothesis),
101 |       cer: this.calculateCER(reference, hypothesis),
102 |       similarity: await this.calculateSimilarity(reference, hypothesis),
103 |       latency
104 |     }
105 |   }
106 | }
107 | 
108 | 


--------------------------------------------------------------------------------
/lib/audio-fetcher.ts:
--------------------------------------------------------------------------------
  1 | import { Client } from 'pg'
  2 | import fs from 'fs-extra'
  3 | import path from 'path'
  4 | import { loadEnv, getEnvVar } from './utils.js'
  5 | 
  6 | loadEnv()
  7 | 
  8 | export interface CallRecord {
  9 |   id: string
 10 |   recordingUrl: string
 11 |   agentId: string
 12 |   workspaceId: string
 13 |   transcript: string | null
 14 | }
 15 | 
 16 | export class AudioFetcher {
 17 |   private client: Client
 18 |   private outputDir: string
 19 | 
 20 |   constructor(outputDir: string) {
 21 |     this.outputDir = outputDir
 22 |     this.client = new Client({
 23 |       connectionString: getEnvVar('DATABASE_URL', false), // Make optional if only using VAPI API
 24 |     })
 25 |   }
 26 | 
 27 |   async connect() {
 28 |     // Only connect if DATABASE_URL is present
 29 |     if (getEnvVar('DATABASE_URL', false)) {
 30 |         await this.client.connect()
 31 |     }
 32 |   }
 33 | 
 34 |   async disconnect() {
 35 |     if (getEnvVar('DATABASE_URL', false)) {
 36 |         await this.client.end()
 37 |     }
 38 |   }
 39 | 
 40 |   async fetchCallsFromVapi(
 41 |     assistantId: string,
 42 |     limit: number,
 43 |     minDurationSeconds: number
 44 |   ): Promise<CallRecord[]> {
 45 |     const apiKey = getEnvVar('VAPI_PRIVATE_KEY', true)
 46 |     const records: CallRecord[] = []
 47 |     
 48 |     // Fetch calls from VAPI API
 49 |     // We'll fetch a bit more than limit to account for filtering
 50 |     const fetchLimit = Math.max(limit * 3, 50) 
 51 |     
 52 |     const url = new URL('https://api.vapi.ai/call')
 53 |     url.searchParams.append('assistantId', assistantId)
 54 |     url.searchParams.append('limit', fetchLimit.toString())
 55 |     url.searchParams.append('createdAtGt', new Date(Date.now() - 30 * 24 * 60 * 60 * 1000).toISOString()) // Last 30 days
 56 | 
 57 |     const response = await fetch(url.toString(), {
 58 |       headers: {
 59 |         'Authorization': `Bearer ${apiKey}`,
 60 |         'Content-Type': 'application/json'
 61 |       }
 62 |     })
 63 | 
 64 |     if (!response.ok) {
 65 |       throw new Error(`Failed to fetch calls from VAPI: ${response.statusText}`)
 66 |     }
 67 | 
 68 |     const data = await response.json() as any[]
 69 |     
 70 |     for (const call of data) {
 71 |       // Duration check
 72 |       // VAPI call object usually has durationSeconds or analysis.durationSeconds
 73 |       // We'll check top level duration or calculate from startedAt/endedAt if needed
 74 |       let duration = call.durationSeconds
 75 |       if (!duration && call.startedAt && call.endedAt) {
 76 |         duration = (new Date(call.endedAt).getTime() - new Date(call.startedAt).getTime()) / 1000
 77 |       }
 78 |       
 79 |       if ((duration || 0) < minDurationSeconds) continue
 80 |       
 81 |       if (!call.recordingUrl && !call.artifact?.recordingUrl) continue
 82 |       if (call.status !== 'ended') continue
 83 | 
 84 |       records.push({
 85 |         id: call.id,
 86 |         recordingUrl: call.recordingUrl || call.artifact?.recordingUrl,
 87 |         agentId: call.assistantId,
 88 |         workspaceId: 'vapi-api', // Placeholder
 89 |         transcript: call.transcript || call.artifact?.transcript || null
 90 |       })
 91 | 
 92 |       if (records.length >= limit) break
 93 |     }
 94 | 
 95 |     return records
 96 |   }
 97 | 
 98 |   async fetchRecentCalls(
 99 |     count: number,
100 |     workspaceId: string,
101 |     agentId?: string
102 |   ): Promise<CallRecord[]> {
103 |     let query = `
104 |       SELECT id, "recordingUrl", "agentId", "workspaceId", transcript
105 |       FROM calls
106 |       WHERE "workspaceId" = $1
107 |         AND "recordingUrl" IS NOT NULL
108 |         AND status = 'ended'
109 |         AND provider = 'vapi'
110 |     `
111 |     const params: any[] = [workspaceId]
112 | 
113 |     if (agentId) {
114 |       query += ` AND "agentId" = $2`
115 |       params.push(agentId)
116 |     }
117 | 
118 |     query += ` ORDER BY "createdAt" DESC LIMIT $${params.length + 1}`
119 |     params.push(count)
120 | 
121 |     try {
122 |       const res = await this.client.query(query, params)
123 |       return res.rows
124 |     } catch (error) {
125 |       console.error('Error fetching calls:', error)
126 |       throw error
127 |     }
128 |   }
129 | 
130 |   async downloadAudio(url: string, callId: string): Promise<string> {
131 |     const extension = path.extname(url.split('?')[0]) || '.wav'
132 |     const filename = `${callId}${extension}`
133 |     const filePath = path.join(this.outputDir, filename)
134 | 
135 |     if (await fs.pathExists(filePath)) {
136 |       return filePath
137 |     }
138 | 
139 |     const response = await fetch(url)
140 |     if (!response.ok) {
141 |       throw new Error(`Failed to download audio from ${url}: ${response.statusText}`)
142 |     }
143 | 
144 |     const buffer = Buffer.from(await response.arrayBuffer())
145 |     await fs.writeFile(filePath, buffer)
146 | 
147 |     return filePath
148 |   }
149 | }
150 | 
151 | 


--------------------------------------------------------------------------------
/lib/report-generator.ts:
--------------------------------------------------------------------------------
  1 | import fs from 'fs-extra'
  2 | import path from 'path'
  3 | import { BenchmarkResult } from './types.js'
  4 | 
  5 | export class ReportGenerator {
  6 |   async generateReport(results: BenchmarkResult, outputDir: string) {
  7 |     const html = this.generateHTML(results)
  8 |     await fs.writeFile(path.join(outputDir, 'report.html'), html)
  9 |   }
 10 | 
 11 |   private generateHTML(results: BenchmarkResult): string {
 12 |     const models = results.models.map((model) => model.name)
 13 |     const werData = results.models.map((model) => (model.avgWER * 100).toFixed(2))
 14 |     const latencyData = results.models.map((model) => model.avgTimeToFirstWord)
 15 |     const modelTableRows = results.models
 16 |       .map((model) => `
 17 |           <tr class="hover:bg-slate-50">
 18 |             <td class="p-3 font-medium">${model.name}</td>
 19 |             <td class="p-3 ${(model.avgWER * 100) < 10 ? 'text-green-600' : ''}">${(model.avgWER * 100).toFixed(2)}%</td>
 20 |             <td class="p-3">${(model.avgCER * 100).toFixed(2)}%</td>
 21 |             <td class="p-3">${model.avgSimilarity.toFixed(3)}</td>
 22 |             <td class="p-3">${Math.round(model.avgTimeToFirstWord)}</td>
 23 |             <td class="p-3">${Math.round(model.avgTimeToCompletion)}</td>
 24 |           </tr>
 25 |         `)
 26 |       .join('')
 27 |     const callOptions = results.perCallResults
 28 |       .map((call, index) => `<option value="${index}">Call ${call.callId}</option>`)
 29 |       .join('')
 30 |     
 31 |     const resultsJson = JSON.stringify(results).replace(/</g, '\\u003c')
 32 | 
 33 |     return `<!DOCTYPE html>
 34 | <html lang="en">
 35 | <head>
 36 |   <meta charset="UTF-8">
 37 |   <meta name="viewport" content="width=device-width, initial-scale=1.0">
 38 |   <title>ASR Benchmark Results - ${results.timestamp}</title>
 39 |   <script src="https://cdn.tailwindcss.com"></script>
 40 |   <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
 41 |   <style>
 42 |     body { font-family: system-ui, -apple-system, sans-serif; background: #f8fafc; }
 43 |     .card { background: white; border-radius: 0.5rem; padding: 1.5rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); margin-bottom: 1.5rem; }
 44 |     .diff-ins { background-color: #bbf7d0; text-decoration: none; }
 45 |     .diff-del { background-color: #fecaca; text-decoration: line-through; color: #991b1b; }
 46 |   </style>
 47 | </head>
 48 | <body class="p-6 max-w-7xl mx-auto">
 49 |   <header class="mb-8">
 50 |     <h1 class="text-3xl font-bold text-slate-900">ASR Benchmark Results</h1>
 51 |     <p class="text-slate-600">Baseline: ${results.baselineModel} | Calls: ${results.callsAnalyzed} | Date: ${results.timestamp}</p>
 52 |   </header>
 53 | 
 54 |   <div class="grid grid-cols-1 md:grid-cols-2 gap-6 mb-6">
 55 |     <div class="card">
 56 |       <h3 class="text-lg font-semibold mb-4">Average WER (%) - Lower is better</h3>
 57 |       <canvas id="werChart"></canvas>
 58 |     </div>
 59 |     <div class="card">
 60 |       <h3 class="text-lg font-semibold mb-4">Avg Time to First Word (ms) - Lower is better</h3>
 61 |       <canvas id="latencyChart"></canvas>
 62 |     </div>
 63 |   </div>
 64 | 
 65 |   <div class="card overflow-x-auto">
 66 |     <h3 class="text-lg font-semibold mb-4">Overall Model Ranking</h3>
 67 |     <table class="min-w-full divide-y divide-slate-200 text-sm">
 68 |       <thead>
 69 |         <tr class="text-left">
 70 |           <th class="p-3">Model</th>
 71 |           <th class="p-3">Avg WER</th>
 72 |           <th class="p-3">Avg CER</th>
 73 |           <th class="p-3">Similarity</th>
 74 |           <th class="p-3">TTFW (ms)</th>
 75 |           <th class="p-3">Completion (ms)</th>
 76 |         </tr>
 77 |       </thead>
 78 |       <tbody>
 79 |         ${modelTableRows}
 80 |       </tbody>
 81 |     </table>
 82 |   </div>
 83 | 
 84 |   <div class="card">
 85 |     <h3 class="text-lg font-semibold mb-4">Call Details & Transcripts</h3>
 86 |     <div class="mb-4">
 87 |         <label class="block text-sm font-medium text-slate-700 mb-2">Select Call</label>
 88 |         <select id="callSelect" class="w-full p-2 border rounded bg-slate-50" onchange="renderCallDetails()">
 89 |            ${callOptions}
 90 |         </select>
 91 |     </div>
 92 |     
 93 |     <div id="callDetails" class="space-y-6">
 94 |        <!-- Populated by JS -->
 95 |     </div>
 96 |   </div>
 97 | 
 98 |   <script>
 99 |     const results = ${resultsJson};
100 | 
101 |     // Charts
102 |     new Chart(document.getElementById('werChart'), {
103 |       type: 'bar',
104 |       data: {
105 |         labels: ${JSON.stringify(models)},
106 |         datasets: [{
107 |           label: 'WER (%)',
108 |           data: ${JSON.stringify(werData)},
109 |           backgroundColor: 'rgba(59, 130, 246, 0.6)',
110 |           borderColor: 'rgb(59, 130, 246)',
111 |           borderWidth: 1
112 |         }]
113 |       },
114 |       options: { scales: { y: { beginAtZero: true } } }
115 |     });
116 | 
117 |     new Chart(document.getElementById('latencyChart'), {
118 |       type: 'bar',
119 |       data: {
120 |         labels: ${JSON.stringify(models)},
121 |         datasets: [{
122 |           label: 'TTFW (ms)',
123 |           data: ${JSON.stringify(latencyData)},
124 |           backgroundColor: 'rgba(16, 185, 129, 0.6)',
125 |           borderColor: 'rgb(16, 185, 129)',
126 |           borderWidth: 1
127 |         }]
128 |       },
129 |       options: { scales: { y: { beginAtZero: true } } }
130 |     });
131 | 
132 |     // Call Details Renderer
133 |     function renderCallDetails() {
134 |         const select = document.getElementById('callSelect');
135 |         const index = select.value;
136 |         const call = results.perCallResults[index];
137 |         const container = document.getElementById('callDetails');
138 |         
139 |         if (!call) return;
140 | 
141 |         let html = '<div class="grid grid-cols-1 lg:grid-cols-2 gap-6">';
142 |         
143 |         // Baseline
144 |         html += \`
145 |           <div class="bg-slate-50 p-4 rounded border">
146 |             <h4 class="font-bold mb-2 text-blue-800">Baseline (Whisper V3)</h4>
147 |             <p class="text-slate-700 leading-relaxed text-sm whitespace-pre-wrap">\${escapeHtml(call.baselineTranscript)}</p>
148 |           </div>
149 |         \`;
150 | 
151 |         // Models
152 |         Object.entries(call.results).forEach(([name, res]) => {
153 |            html += \`
154 |              <div class="bg-slate-50 p-4 rounded border">
155 |                 <div class="flex justify-between items-center mb-2">
156 |                    <h4 class="font-bold text-slate-800">\${name}</h4>
157 |                    <span class="text-xs px-2 py-1 rounded bg-blue-100 text-blue-800">WER: \${(res.metrics.wer * 100).toFixed(1)}%</span>
158 |                 </div>
159 |                 <p class="text-slate-700 leading-relaxed text-sm whitespace-pre-wrap">\${escapeHtml(res.transcript)}</p>
160 |              </div>
161 |            \`;
162 |         });
163 | 
164 |         html += '</div>';
165 |         container.innerHTML = html;
166 |     }
167 |     
168 |     function escapeHtml(text) {
169 |       if (!text) return '';
170 |       return text
171 |           .replace(/&/g, "&amp;")
172 |           .replace(/</g, "&lt;")
173 |           .replace(/>/g, "&gt;")
174 |           .replace(/"/g, "&quot;")
175 |           .replace(/'/g, "&#039;");
176 |     }
177 | 
178 |     // Init
179 |     renderCallDetails();
180 | 
181 |   </script>
182 | </body>
183 | </html>`
184 |   }
185 | }
186 | 
187 | 


--------------------------------------------------------------------------------
/index.ts:
--------------------------------------------------------------------------------
  1 | import { Command } from 'commander'
  2 | import chalk from 'chalk'
  3 | import { MultiBar, Presets } from 'cli-progress'
  4 | import path from 'path'
  5 | import { fileURLToPath } from 'url'
  6 | import { AudioFetcher, CallRecord } from './lib/audio-fetcher.js'
  7 | import { BaselineTranscriber } from './lib/baseline-transcriber.js'
  8 | import { MetricsCalculator } from './lib/metrics-calculator.js'
  9 | import { ResultsStorage } from './lib/results-storage.js'
 10 | import { ReportGenerator } from './lib/report-generator.js'
 11 | import {
 12 |   DeepgramTranscriber,
 13 |   AssemblyAITranscriber,
 14 |   GoogleStreamingTranscriber,
 15 |   AwsTranscribeStreaming
 16 | } from './lib/streaming-transcribers.js'
 17 | import { BenchmarkResult, CallResult, StreamingTranscriber } from './lib/types.js'
 18 | 
 19 | const moduleDir = path.dirname(fileURLToPath(import.meta.url))
 20 | 
 21 | const program = new Command()
 22 | 
 23 | program
 24 |   .name('asr-benchmark')
 25 |   .description('Benchmark streaming ASR providers against Whisper baseline')
 26 |   .option('-w, --workspace-id <id>', 'Workspace ID to fetch calls from (required for db source)')
 27 |   .option('-n, --count <number>', 'Number of calls to analyze', '10')
 28 |   .option('-a, --agent-id <id>', 'Filter by specific Agent ID (required for vapi source)')
 29 |   .option(
 30 |     '-o, --output <path>',
 31 |     'Output directory',
 32 |     path.join(moduleDir, 'output')
 33 |   )
 34 |   .option('--source <type>', 'Source of calls: "db" or "vapi"', 'db')
 35 |   .option('--min-duration <seconds>', 'Minimum call duration in seconds', '0')
 36 |   .option('--skip-baseline', 'Skip baseline generation if transcript exists (not implemented yet fully)', false)
 37 |   .option('--models <names>', 'Comma-separated list of models to test', 'all')
 38 |   .parse(process.argv)
 39 | 
 40 | const options = program.opts()
 41 | 
 42 | async function main() {
 43 |   console.log(chalk.bold.blue('🎤 ASR Benchmark Tool'))
 44 |   if (options.workspaceId) console.log(chalk.gray(`Workspace: ${options.workspaceId}`))
 45 |   console.log(chalk.gray(`Output: ${options.output}`))
 46 | 
 47 |   const storage = new ResultsStorage(options.output)
 48 |   const outputDir = await storage.init()
 49 | 
 50 |   // 1. Fetch Calls
 51 |   const fetcher = new AudioFetcher(outputDir)
 52 |   await fetcher.connect()
 53 |   
 54 |   let calls: CallRecord[] = []
 55 | 
 56 |   if (options.source === 'vapi') {
 57 |       if (!options.agentId) {
 58 |           console.error(chalk.red('Error: --agent-id is required when using --source vapi'))
 59 |           process.exit(1)
 60 |       }
 61 |       console.log(chalk.yellow(`Fetching calls from VAPI API (Agent: ${options.agentId}, Min Duration: ${options.minDuration}s)...`))
 62 |       calls = await fetcher.fetchCallsFromVapi(
 63 |           options.agentId, 
 64 |           parseInt(options.count), 
 65 |           parseInt(options.minDuration)
 66 |       )
 67 |   } else {
 68 |       if (!options.workspaceId) {
 69 |           console.error(chalk.red('Error: --workspace-id is required when using --source db'))
 70 |           process.exit(1)
 71 |       }
 72 |       console.log(chalk.yellow('Fetching recent calls from DB...'))
 73 |       calls = await fetcher.fetchRecentCalls(
 74 |           parseInt(options.count), 
 75 |           options.workspaceId, 
 76 |           options.agentId
 77 |       )
 78 |   }
 79 | 
 80 |   await fetcher.disconnect()
 81 | 
 82 |   if (calls.length === 0) {
 83 |     console.log(chalk.red('No calls found matching criteria.'))
 84 |     process.exit(1)
 85 |   }
 86 |   console.log(chalk.green(`Found ${calls.length} calls.`))
 87 | 
 88 |   // 2. Setup Components
 89 |   const baselineTranscriber = new BaselineTranscriber()
 90 |   const metricsCalc = new MetricsCalculator()
 91 |   
 92 |   // Setup Streaming Models
 93 |   const allModels: StreamingTranscriber[] = [
 94 |     new DeepgramTranscriber('nova-3', 'Deepgram Nova 3'),
 95 |     new DeepgramTranscriber('nova-2', 'Deepgram Nova 2'),
 96 |     new AssemblyAITranscriber(),
 97 |     new GoogleStreamingTranscriber(),
 98 |     new AwsTranscribeStreaming()
 99 |   ]
100 | 
101 |   const requestedModels = options.models === 'all' 
102 |     ? allModels 
103 |     : allModels.filter(m => options.models.split(',').includes(m.name))
104 | 
105 |   // 3. Process Calls
106 |   const multiBar = new MultiBar({
107 |     clearOnComplete: false,
108 |     hideCursor: true,
109 |     format: '{bar} | {percentage}% | {value}/{total} | {task}'
110 |   }, Presets.shades_grey)
111 | 
112 |   const mainBar = multiBar.create(calls.length, 0, { task: 'Processing Calls' })
113 |   
114 |   const perCallResults: CallResult[] = []
115 |   const modelStats: Record<string, { wer: number[], cer: number[], sim: number[], ttfw: number[], ttc: number[] }> = {}
116 |   
117 |   requestedModels.forEach(m => {
118 |     modelStats[m.name] = { wer: [], cer: [], sim: [], ttfw: [], ttc: [] }
119 |   })
120 | 
121 |   for (const call of calls) {
122 |     mainBar.increment(0, { task: `Processing Call ${call.id}` })
123 | 
124 |     // Download Audio
125 |     const audioPath = await fetcher.downloadAudio(call.recordingUrl, call.id)
126 |     
127 |     // Baseline
128 |     // Use existing transcript from DB if available and reliable? 
129 |     // The prompt implies using Whisper Large v3 as baseline. VAPI might use a different model.
130 |     // We will re-transcribe with Whisper Large v3 locally/API to be sure of the baseline quality.
131 |     let baselineText = ''
132 |     try {
133 |         const res = await baselineTranscriber.transcribe(audioPath)
134 |         baselineText = res.transcript
135 |     } catch (err) {
136 |         console.error(chalk.red(`Baseline failed for ${call.id}:`), err)
137 |         baselineText = call.transcript || '' // Fallback to VAPI transcript
138 |     }
139 | 
140 |     const callResult: CallResult = {
141 |       callId: call.id,
142 |       audioPath,
143 |       baselineTranscript: baselineText,
144 |       results: {}
145 |     }
146 | 
147 |     // Run Models in Parallel
148 |     await Promise.all(requestedModels.map(async (model) => {
149 |         try {
150 |             const transRes = await model.transcribe(audioPath)
151 |             const metrics = await metricsCalc.calculateAll(
152 |                 baselineText, 
153 |                 transRes.transcript, 
154 |                 { firstWord: transRes.timeToFirstWord, completion: transRes.timeToCompletion }
155 |             )
156 |             
157 |             callResult.results[model.name] = {
158 |                 transcript: transRes.transcript,
159 |                 metrics
160 |             }
161 | 
162 |             // Accumulate stats
163 |             modelStats[model.name].wer.push(metrics.wer)
164 |             modelStats[model.name].cer.push(metrics.cer)
165 |             modelStats[model.name].sim.push(metrics.similarity)
166 |             modelStats[model.name].ttfw.push(metrics.latency.firstWord)
167 |             modelStats[model.name].ttc.push(metrics.latency.completion)
168 | 
169 |         } catch (err: any) {
170 |             // Log error but continue
171 |             // console.error(`Model ${model.name} failed for ${call.id}`, err.message)
172 |         }
173 |     }))
174 | 
175 |     perCallResults.push(callResult)
176 |     mainBar.increment()
177 |   }
178 | 
179 |   multiBar.stop()
180 | 
181 |   // 4. Aggregate Results
182 |   const benchmarkResult: BenchmarkResult = {
183 |     timestamp: new Date().toISOString(),
184 |     baselineModel: 'Whisper Large v3',
185 |     callsAnalyzed: calls.length,
186 |     models: requestedModels.map(m => {
187 |         const stats = modelStats[m.name]
188 |         const count = stats.wer.length || 1
189 |         return {
190 |             name: m.name,
191 |             avgWER: stats.wer.reduce((a,b) => a+b, 0) / count,
192 |             avgCER: stats.cer.reduce((a,b) => a+b, 0) / count,
193 |             avgSimilarity: stats.sim.reduce((a,b) => a+b, 0) / count,
194 |             avgTimeToFirstWord: stats.ttfw.reduce((a,b) => a+b, 0) / count,
195 |             avgTimeToCompletion: stats.ttc.reduce((a,b) => a+b, 0) / count
196 |         }
197 |     }),
198 |     perCallResults
199 |   }
200 | 
201 |   // 5. Save & Report
202 |   await storage.saveResults(benchmarkResult)
203 |   
204 |   const reporter = new ReportGenerator()
205 |   await reporter.generateReport(benchmarkResult, outputDir)
206 | 
207 |   console.log(chalk.bold.green('\n✅ Benchmark Complete!'))
208 |   console.log(`Results saved to: ${outputDir}`)
209 |   console.log(`Report: file://${path.join(outputDir, 'report.html')}`)
210 | }
211 | 
212 | main().catch(err => {
213 |   console.error(chalk.red('Fatal Error:'), err)
214 |   process.exit(1)
215 | })
216 | 
217 | 


--------------------------------------------------------------------------------
/lib/streaming-transcribers.ts:
--------------------------------------------------------------------------------
  1 | import { createClient, LiveTranscriptionEvents } from '@deepgram/sdk'
  2 | import { AssemblyAI } from 'assemblyai'
  3 | import { protos, SpeechClient } from '@google-cloud/speech'
  4 | import {
  5 |   TranscribeStreamingClient,
  6 |   StartStreamTranscriptionCommand,
  7 |   StartStreamTranscriptionCommandInput
  8 | } from '@aws-sdk/client-transcribe-streaming'
  9 | import fs from 'fs'
 10 | import { getEnvVar, sleep } from './utils.js'
 11 | import { StreamingTranscriber, TranscriptionResult } from './types.js'
 12 | 
 13 | type DeepgramTranscriptEvent = {
 14 |   channel: {
 15 |     alternatives: Array<{
 16 |       transcript: string
 17 |     }>
 18 |   }
 19 |   is_final: boolean
 20 | }
 21 | 
 22 | type AssemblyTranscriptEvent = {
 23 |   text: string
 24 |   message_type: 'PartialTranscript' | 'FinalTranscript'
 25 | }
 26 | 
 27 | const bufferToArrayBuffer = (buffer: Buffer): ArrayBufferLike =>
 28 |   buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
 29 | 
 30 | // --- Deepgram ---
 31 | 
 32 | export class DeepgramTranscriber implements StreamingTranscriber {
 33 |   name: string
 34 |   private model: string
 35 |   private apiKey: string
 36 | 
 37 |   constructor(model: string = 'nova-3', name: string) {
 38 |     this.model = model
 39 |     this.name = name
 40 |     this.apiKey = getEnvVar('DEEPGRAM_API_KEY', true)
 41 |   }
 42 | 
 43 |   async transcribe(audioPath: string): Promise<TranscriptionResult> {
 44 |     const deepgram = createClient(this.apiKey)
 45 |     const startTime = Date.now()
 46 |     let firstWordTime = 0
 47 |     const transcripts: string[] = []
 48 | 
 49 |     return new Promise(async (resolve, reject) => {
 50 |       const connection = deepgram.listen.live({
 51 |         model: this.model,
 52 |         smart_format: true,
 53 |         interim_results: true, 
 54 |       })
 55 | 
 56 |       connection.on(LiveTranscriptionEvents.Open, async () => {
 57 |       connection.on(LiveTranscriptionEvents.Transcript, (data: DeepgramTranscriptEvent) => {
 58 |             const transcript = data.channel.alternatives[0]?.transcript ?? ''
 59 |             if (transcript && !firstWordTime) {
 60 |                 firstWordTime = Date.now() - startTime
 61 |             }
 62 |             if (data.is_final && transcript) {
 63 |                 transcripts.push(transcript)
 64 |             }
 65 |         })
 66 | 
 67 |         connection.on(LiveTranscriptionEvents.Close, () => {
 68 |            const endTime = Date.now()
 69 |            resolve({
 70 |              transcript: transcripts.join(' '),
 71 |              timeToFirstWord: firstWordTime || (endTime - startTime),
 72 |              timeToCompletion: endTime - startTime,
 73 |              metadata: { model: this.model }
 74 |            })
 75 |         })
 76 | 
 77 |         connection.on(LiveTranscriptionEvents.Error, (err: unknown) => {
 78 |             reject(err instanceof Error ? err : new Error('Deepgram streaming error'))
 79 |         })
 80 | 
 81 |         // Stream audio
 82 |         const fileBuffer = await fs.promises.readFile(audioPath)
 83 |         connection.send(bufferToArrayBuffer(fileBuffer))
 84 |         // finish
 85 |         connection.finish()
 86 |       })
 87 |     })
 88 |   }
 89 | }
 90 | 
 91 | // --- AssemblyAI ---
 92 | 
 93 | export class AssemblyAITranscriber implements StreamingTranscriber {
 94 |   name: string = 'AssemblyAI Streaming'
 95 |   private client: AssemblyAI
 96 | 
 97 |   constructor() {
 98 |     this.client = new AssemblyAI({
 99 |       apiKey: getEnvVar('ASSEMBLYAI_API_KEY', true)
100 |     })
101 |   }
102 | 
103 |   async transcribe(audioPath: string): Promise<TranscriptionResult> {
104 |     const startTime = Date.now()
105 |     let firstWordTime = 0
106 |     const transcripts: string[] = []
107 |     
108 |     const rt = this.client.realtime.transcriber({
109 |       sampleRate: 16000
110 |     })
111 | 
112 |     return new Promise(async (resolve, reject) => {
113 |       rt.on('open', ({ sessionId }: { sessionId: string }) => {
114 |         // Session started
115 |       })
116 | 
117 |       rt.on('error', (error: unknown) => {
118 |         reject(error instanceof Error ? error : new Error('AssemblyAI streaming error'))
119 |       })
120 | 
121 |       rt.on('transcript', (transcript: AssemblyTranscriptEvent) => {
122 |         if (transcript.text && !firstWordTime) {
123 |              firstWordTime = Date.now() - startTime
124 |         }
125 |         if (transcript.message_type === 'FinalTranscript') {
126 |           transcripts.push(transcript.text)
127 |         }
128 |       })
129 | 
130 |       rt.on('close', (code: number, reason: string) => {
131 |         const endTime = Date.now()
132 |         resolve({
133 |             transcript: transcripts.join(' '),
134 |             timeToFirstWord: firstWordTime || (endTime - startTime),
135 |             timeToCompletion: endTime - startTime,
136 |             metadata: { model: 'assemblyai-streaming' }
137 |         })
138 |       })
139 | 
140 |       try {
141 |         await rt.connect()
142 |         const fileBuffer = await fs.promises.readFile(audioPath)
143 |         const CHUNK_SIZE = 8192 
144 |         for (let i = 0; i < fileBuffer.length; i += CHUNK_SIZE) {
145 |             const chunk = fileBuffer.subarray(i, i + CHUNK_SIZE)
146 |             rt.sendAudio(bufferToArrayBuffer(chunk))
147 |             await sleep(20)
148 |         }
149 |         await rt.close()
150 |       } catch (err) {
151 |         reject(err instanceof Error ? err : new Error('AssemblyAI streaming failure'))
152 |       }
153 |     })
154 |   }
155 | }
156 | 
157 | // --- Google Cloud Speech ---
158 | 
159 | export class GoogleStreamingTranscriber implements StreamingTranscriber {
160 |   name: string = 'Google Streaming'
161 |   private client: SpeechClient
162 | 
163 |   constructor() {
164 |     // Relies on GOOGLE_APPLICATION_CREDENTIALS env var or default auth
165 |     this.client = new SpeechClient()
166 |   }
167 | 
168 |   async transcribe(audioPath: string): Promise<TranscriptionResult> {
169 |     const startTime = Date.now()
170 |     let firstWordTime = 0
171 |     const transcripts: string[] = []
172 | 
173 |     return new Promise((resolve, reject) => {
174 |       const recognizeStream = this.client
175 |         .streamingRecognize({
176 |           config: {
177 |             encoding: 'LINEAR16', // Assumes WAV/PCM. Might need conversion if MP3.
178 |             sampleRateHertz: 16000,
179 |             languageCode: 'en-US',
180 |             enableAutomaticPunctuation: true,
181 |           },
182 |           interimResults: true,
183 |         })
184 |         .on('error', reject)
185 |         .on('data', (data: protos.google.cloud.speech.v1.StreamingRecognizeResponse) => {
186 |           const result = data.results?.[0]
187 |           const transcript = result?.alternatives?.[0]?.transcript
188 |           if (result && transcript) {
189 |             if (transcript && !firstWordTime) {
190 |                 firstWordTime = Date.now() - startTime
191 |             }
192 |             if (result.isFinal) {
193 |                 transcripts.push(transcript)
194 |             }
195 |           }
196 |         })
197 |         .on('end', () => {
198 |            const endTime = Date.now()
199 |            resolve({
200 |              transcript: transcripts.join(' '),
201 |              timeToFirstWord: firstWordTime || (endTime - startTime),
202 |              timeToCompletion: endTime - startTime,
203 |              metadata: { model: 'google-streaming' }
204 |            })
205 |         })
206 | 
207 |       const fileStream = fs.createReadStream(audioPath)
208 |       fileStream.pipe(recognizeStream)
209 |     })
210 |   }
211 | }
212 | 
213 | // --- AWS Transcribe ---
214 | 
215 | export class AwsTranscribeStreaming implements StreamingTranscriber {
216 |   name: string = 'AWS Transcribe Streaming'
217 |   private client: TranscribeStreamingClient
218 | 
219 |   constructor() {
220 |     this.client = new TranscribeStreamingClient({
221 |       region: getEnvVar('AWS_REGION', true),
222 |       credentials: {
223 |         accessKeyId: getEnvVar('AWS_ACCESS_KEY_ID', true),
224 |         secretAccessKey: getEnvVar('AWS_SECRET_ACCESS_KEY', true),
225 |       }
226 |     })
227 |   }
228 | 
229 |   async transcribe(audioPath: string): Promise<TranscriptionResult> {
230 |     const startTime = Date.now()
231 |     let firstWordTime = 0
232 |     const transcripts: string[] = []
233 | 
234 |     // Create an async generator for the audio stream
235 |     const audioStream = async function* () {
236 |       const fileStream = fs.createReadStream(audioPath, { highWaterMark: 1024 * 4 })
237 |       for await (const chunk of fileStream) {
238 |         yield { AudioEvent: { AudioChunk: chunk } }
239 |       }
240 |     }()
241 | 
242 |     const params: StartStreamTranscriptionCommandInput = {
243 |       LanguageCode: 'en-US',
244 |       MediaEncoding: 'pcm', // Assumes PCM/WAV. AWS Transcribe Streaming supports pcm, ogg-opus, flac.
245 |       MediaSampleRateHertz: 16000,
246 |       AudioStream: audioStream,
247 |     }
248 | 
249 |     try {
250 |       const command = new StartStreamTranscriptionCommand(params)
251 |       const response = await this.client.send(command)
252 |       
253 |       if (response.TranscriptResultStream) {
254 |         for await (const event of response.TranscriptResultStream) {
255 |           if (event.TranscriptEvent) {
256 |             const results = event.TranscriptEvent.Transcript?.Results
257 |             if (results && results.length > 0) {
258 |               const result = results[0]
259 |               if (result.Alternatives && result.Alternatives.length > 0) {
260 |                  const transcript = result.Alternatives[0].Transcript
261 |                  if (transcript && !firstWordTime) {
262 |                     firstWordTime = Date.now() - startTime
263 |                  }
264 |                  if (!result.IsPartial) {
265 |                     transcripts.push(transcript || '')
266 |                  }
267 |               }
268 |             }
269 |           }
270 |         }
271 |       }
272 | 
273 |       const endTime = Date.now()
274 |       return {
275 |          transcript: transcripts.join(' '),
276 |          timeToFirstWord: firstWordTime || (endTime - startTime),
277 |          timeToCompletion: endTime - startTime,
278 |          metadata: { model: 'aws-streaming' }
279 |       }
280 | 
281 |     } catch (err) {
282 |       throw err
283 |     }
284 |   }
285 | }
286 | 


--------------------------------------------------------------------------------