├── tsconfig.json ├── .gitignore ├── lib ├── utils.ts ├── types.ts ├── baseline-transcriber.ts ├── results-storage.ts ├── metrics-calculator.ts ├── audio-fetcher.ts ├── report-generator.ts └── streaming-transcribers.ts ├── package.json ├── README.md └── index.ts /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2022", 4 | "module": "NodeNext", 5 | "moduleResolution": "NodeNext", 6 | "esModuleInterop": true, 7 | "types": ["node"], 8 | "lib": ["ES2022"], 9 | "forceConsistentCasingInFileNames": true, 10 | "strict": true, 11 | "skipLibCheck": true 12 | } 13 | } 14 | 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | node_modules/ 3 | 4 | # Output 5 | output/ 6 | dist/ 7 | build/ 8 | 9 | # Environment variables 10 | .env 11 | .env.local 12 | .env.*.local 13 | 14 | # Logs 15 | logs 16 | *.log 17 | npm-debug.log* 18 | yarn-debug.log* 19 | yarn-error.log* 20 | 21 | # IDE 22 | .vscode/ 23 | .idea/ 24 | *.swp 25 | *.swo 26 | *~ 27 | 28 | # OS 29 | .DS_Store 30 | Thumbs.db 31 | 32 | # TypeScript 33 | *.tsbuildinfo 34 | -------------------------------------------------------------------------------- /lib/utils.ts: -------------------------------------------------------------------------------- 1 | import dotenv from 'dotenv' 2 | import path from 'path' 3 | import fs from 'fs' 4 | import { fileURLToPath } from 'url' 5 | 6 | export function loadEnv() { 7 | // Try to load from .env.local in root first, then .env in root 8 | const moduleDir = path.dirname(fileURLToPath(import.meta.url)) 9 | const rootDir = path.resolve(moduleDir, '../../') 10 | const envLocalPath = path.join(rootDir, '.env.local') 11 | const envPath = path.join(rootDir, '.env') 12 | 13 | if (fs.existsSync(envLocalPath)) { 14 | dotenv.config({ path: envLocalPath }) 15 | } else if (fs.existsSync(envPath)) { 16 | dotenv.config({ path: envPath }) 17 | } 18 | } 19 | 20 | export function getEnvVar(key: string, required = false): string { 21 | const value = process.env[key] 22 | if (required && !value) { 23 | throw new Error(`Missing required environment variable: ${key}`) 24 | } 25 | return value || '' 26 | } 27 | 28 | export function sleep(ms: number) { 29 | return new Promise((resolve) => setTimeout(resolve, ms)) 30 | } 31 | 32 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "asr-benchmark", 3 | "version": "1.0.0", 4 | "description": "ASR Benchmarking Tool for VAPI Calls", 5 | "type": "module", 6 | "main": "index.ts", 7 | "scripts": { 8 | "benchmark": "tsx index.ts" 9 | }, 10 | "dependencies": { 11 | "@aws-sdk/client-transcribe-streaming": "^3.696.0", 12 | "@deepgram/sdk": "^3.9.0", 13 | "@google-cloud/speech": "^6.7.0", 14 | "assemblyai": "^4.8.0", 15 | "chalk": "^5.3.0", 16 | "cli-progress": "^3.12.0", 17 | "commander": "^12.1.0", 18 | "csv-stringify": "^6.5.1", 19 | "dotenv": "^16.4.5", 20 | "fast-levenshtein": "^3.0.0", 21 | "fs-extra": "^11.2.0", 22 | "openai": "^4.73.0", 23 | "pg": "^8.13.1", 24 | "postgres": "^3.4.5", 25 | "tsx": "^4.19.2", 26 | "zod": "^3.23.8" 27 | }, 28 | "devDependencies": { 29 | "@types/cli-progress": "^3.11.6", 30 | "@types/fast-levenshtein": "^0.0.2", 31 | "@types/fs-extra": "^11.0.4", 32 | "@types/node": "^22.10.1", 33 | 
"@types/pg": "^8.11.10", 34 | "typescript": "^5.6.3" 35 | } 36 | } 37 | 38 | -------------------------------------------------------------------------------- /lib/types.ts: -------------------------------------------------------------------------------- 1 | export interface TranscriptionResult { 2 | transcript: string 3 | timeToFirstWord: number 4 | timeToCompletion: number 5 | metadata: Record 6 | words?: Array<{ 7 | word: string 8 | start: number 9 | end: number 10 | confidence: number 11 | }> 12 | } 13 | 14 | export interface StreamingTranscriber { 15 | name: string 16 | transcribe(audioPath: string): Promise 17 | } 18 | 19 | export interface MetricResult { 20 | wer: number 21 | cer: number 22 | similarity: number 23 | latency: { 24 | firstWord: number 25 | completion: number 26 | } 27 | } 28 | 29 | export interface CallResult { 30 | callId: string 31 | audioPath: string 32 | baselineTranscript: string 33 | results: Record 37 | } 38 | 39 | export interface BenchmarkResult { 40 | timestamp: string 41 | baselineModel: string 42 | callsAnalyzed: number 43 | models: Array<{ 44 | name: string 45 | avgWER: number 46 | avgCER: number 47 | avgSimilarity: number 48 | avgTimeToFirstWord: number 49 | avgTimeToCompletion: number 50 | }> 51 | perCallResults: CallResult[] 52 | } 53 | 54 | export interface ASRConfig { 55 | openaiApiKey?: string 56 | deepgramApiKey?: string 57 | assemblyAiApiKey?: string 58 | googleCredentialsPath?: string 59 | awsAccessKeyId?: string 60 | awsSecretAccessKey?: string 61 | awsRegion?: string 62 | } 63 | 64 | -------------------------------------------------------------------------------- /lib/baseline-transcriber.ts: -------------------------------------------------------------------------------- 1 | import OpenAI from 'openai' 2 | import fs from 'fs-extra' 3 | import { getEnvVar } from './utils.js' 4 | import { TranscriptionResult } from './types.js' 5 | 6 | export class BaselineTranscriber { 7 | private openai: OpenAI 8 | 9 | constructor() { 10 | const apiKey = getEnvVar('OPENAI_API_KEY', true) 11 | this.openai = new OpenAI({ apiKey }) 12 | } 13 | 14 | async transcribe(audioPath: string): Promise { 15 | const startTime = Date.now() 16 | 17 | const fileStream = fs.createReadStream(audioPath) 18 | 19 | const response = await this.openai.audio.transcriptions.create({ 20 | file: fileStream, 21 | model: 'whisper-1', // Using 'whisper-1' which is typically Large v2/v3 on API 22 | response_format: 'verbose_json', 23 | timestamp_granularities: ['word'], 24 | }) 25 | 26 | const endTime = Date.now() 27 | const timeToCompletion = endTime - startTime 28 | 29 | // Determine first word time from words if available 30 | let timeToFirstWord = 0 31 | if (response.words && response.words.length > 0) { 32 | // API returns seconds, convert to ms 33 | timeToFirstWord = response.words[0].start * 1000 34 | } else { 35 | // Fallback if no words (shouldn't happen with verbose_json + word granularity) 36 | timeToFirstWord = timeToCompletion // rough estimate if not streaming 37 | } 38 | 39 | return { 40 | transcript: response.text, 41 | timeToFirstWord, 42 | timeToCompletion, 43 | metadata: { 44 | model: 'whisper-1', 45 | duration: response.duration, 46 | }, 47 | words: response.words?.map((w: any) => ({ 48 | word: w.word, 49 | start: w.start, 50 | end: w.end, 51 | confidence: 1.0 // OpenAI doesn't always return confidence per word in all modes, assume 1 for baseline or check schema 52 | })) 53 | } 54 | } 55 | } 56 | 57 | 
-------------------------------------------------------------------------------- /lib/results-storage.ts: -------------------------------------------------------------------------------- 1 | import fs from 'fs-extra' 2 | import path from 'path' 3 | import { stringify } from 'csv-stringify/sync' 4 | import { BenchmarkResult, CallResult } from './types.js' 5 | 6 | type CsvRow = { 7 | callId: string 8 | model: string 9 | wer: string 10 | cer: string 11 | similarity: string 12 | timeToFirstWord: number 13 | timeToCompletion: number 14 | transcriptLength: number 15 | } 16 | 17 | export class ResultsStorage { 18 | private outputDir: string 19 | 20 | constructor(baseOutputDir: string) { 21 | const timestamp = new Date().toISOString().replace(/[:.]/g, '-') 22 | this.outputDir = path.join(baseOutputDir, timestamp) 23 | } 24 | 25 | async init() { 26 | await fs.ensureDir(this.outputDir) 27 | await fs.ensureDir(path.join(this.outputDir, 'transcripts')) 28 | return this.outputDir 29 | } 30 | 31 | async saveResults(results: BenchmarkResult) { 32 | // 1. Save full JSON 33 | await fs.writeJSON(path.join(this.outputDir, 'results.json'), results, { spaces: 2 }) 34 | 35 | // 2. Save individual transcripts 36 | for (const callResult of results.perCallResults) { 37 | await fs.writeJSON( 38 | path.join(this.outputDir, 'transcripts', `${callResult.callId}.json`), 39 | callResult, 40 | { spaces: 2 } 41 | ) 42 | } 43 | 44 | // 3. Save CSV Summary 45 | const csvRows: CsvRow[] = results.perCallResults.flatMap((call: CallResult) => { 46 | const modelEntries = Object.entries(call.results) as Array< 47 | [string, CallResult['results'][string]] 48 | > 49 | 50 | return modelEntries.map(([modelName, data]) => ({ 51 | callId: call.callId, 52 | model: modelName, 53 | wer: data.metrics.wer.toFixed(4), 54 | cer: data.metrics.cer.toFixed(4), 55 | similarity: data.metrics.similarity.toFixed(4), 56 | timeToFirstWord: data.metrics.latency.firstWord, 57 | timeToCompletion: data.metrics.latency.completion, 58 | transcriptLength: data.transcript.length 59 | })) 60 | }) 61 | 62 | const csvOutput = stringify(csvRows, { 63 | header: true, 64 | columns: ['callId', 'model', 'wer', 'cer', 'similarity', 'timeToFirstWord', 'timeToCompletion', 'transcriptLength'] 65 | }) 66 | 67 | await fs.writeFile(path.join(this.outputDir, 'summary.csv'), csvOutput) 68 | } 69 | 70 | getOutputDir() { 71 | return this.outputDir 72 | } 73 | } 74 | 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ASR Benchmarking Tool 2 | 3 | A tool to benchmark streaming ASR providers (Deepgram, AssemblyAI, Google, AWS) against OpenAI Whisper Large v3. 4 | 5 | ## Features 6 | 7 | - **Baseline Comparison**: Compares all models against Whisper Large v3 (via OpenAI API). 8 | - **Metrics**: WER (Word Error Rate), CER (Character Error Rate), Semantic Similarity, and Latency (Time to First Word). 9 | - **Visual Report**: Generates an HTML report with charts and transcript diffs. 10 | - **Parallel Processing**: Runs multiple models concurrently for efficiency. 11 | 12 | ## Setup 13 | 14 | 1. **Install Dependencies**: 15 | ```bash 16 | cd eval 17 | npm install 18 | ``` 19 | 20 | 2. **Environment Variables**: 21 | Ensure your root `.env` or `.env.local` contains the following keys: 22 | 23 | ```env 24 | # Database (to fetch call recordings) 25 | DATABASE_URL=postgresql://... 26 | 27 | # ASR Providers 28 | OPENAI_API_KEY=sk-... 
29 | DEEPGRAM_API_KEY=...
30 | ASSEMBLYAI_API_KEY=...
31 | 
32 | # AWS Transcribe
33 | AWS_ACCESS_KEY_ID=...
34 | AWS_SECRET_ACCESS_KEY=...
35 | AWS_REGION=us-east-1
36 | 
37 | # Google Cloud Speech
38 | # Ensure GOOGLE_APPLICATION_CREDENTIALS points to your JSON key file
39 | # GOOGLE_APPLICATION_CREDENTIALS=path/to/key.json
40 | ```
41 | 
42 | ## Usage
43 | 
44 | Run the benchmark from the project root:
45 | 
46 | ```bash
47 | npm run benchmark:asr -- -w <workspace-id> [options]
48 | ```
49 | 
50 | Or from the `eval` directory:
51 | 
52 | ```bash
53 | npm run benchmark -- -w <workspace-id> [options]
54 | ```
55 | 
56 | ### Options
57 | 
58 | - `-w, --workspace-id <id>` (Required) Workspace ID to fetch calls from.
59 | - `-n, --count <number>` (Default: 10) Number of most recent calls to analyze.
60 | - `-a, --agent-id <id>` Filter by specific Agent ID.
61 | - `-o, --output <dir>` Custom output directory (default: `eval/output`).
62 | - `--models <models>` Comma-separated list of models to test (default: all).
63 |   - Available: `Deepgram Nova 3`, `Deepgram Nova 2`, `AssemblyAI Streaming`, `Google Streaming`, `AWS Transcribe Streaming`
64 | 
65 | ### VAPI API Source
66 | 
67 | You can fetch calls directly from the VAPI API instead of the database:
68 | 
69 | ```bash
70 | npm run benchmark:asr -- --source vapi --agent-id <agent-id> --min-duration 120
71 | ```
72 | 
73 | - `--source vapi`: Switch to the VAPI API source.
74 | - `--agent-id <id>`: Required for the VAPI source.
75 | - `--min-duration <seconds>`: Exclude calls shorter than this many seconds (e.g., 120 for 2 minutes).
76 | - `-w, --workspace-id` is NOT required when using the VAPI source.
77 | 
78 | ### Example
79 | 
80 | ```bash
81 | # Benchmark last 5 calls for Deepgram and AssemblyAI
82 | npm run benchmark:asr -- -w workspace_123 -n 5 --models "Deepgram Nova 3,AssemblyAI Streaming"
83 | ```
84 | 
85 | ## Output
86 | 
87 | Results are saved in `eval/output/<timestamp>/`:
88 | - `report.html`: Interactive visualization.
89 | - `summary.csv`: Raw metrics for analysis.
90 | - `results.json`: Full detailed JSON data.
91 | - `transcripts/`: Individual JSON result per call.
92 | 
93 | 
--------------------------------------------------------------------------------
/lib/metrics-calculator.ts:
--------------------------------------------------------------------------------
 1 | import levenshtein from 'fast-levenshtein'
 2 | import OpenAI from 'openai'
 3 | import { getEnvVar } from './utils.js'
 4 | import { MetricResult } from './types.js'
 5 | 
 6 | export class MetricsCalculator {
 7 |   private openai: OpenAI
 8 | 
 9 |   constructor() {
10 |     this.openai = new OpenAI({ apiKey: getEnvVar('OPENAI_API_KEY', true) })
11 |   }
12 | 
13 |   normalizeText(text: string): string {
14 |     return text
15 |       .toLowerCase()
16 |       .replace(/[^\w\s]/g, '') // Remove punctuation
17 |       .replace(/\s+/g, ' ') // Collapse whitespace
18 |       .trim()
19 |   }
20 | 
21 |   calculateWER(reference: string, hypothesis: string): number {
22 |     const refWords = this.normalizeText(reference).split(' ')
23 |     const hypWords = this.normalizeText(hypothesis).split(' ')
24 | 
25 |     if (refWords.length === 0) return hypWords.length > 0 ? 1.0 : 0.0
26 | 
27 |     const distance = this.calculateTokenDistance(refWords, hypWords)
28 |     return distance / refWords.length
29 |   }
30 | 
31 |   calculateCER(reference: string, hypothesis: string): number {
32 |     const refNorm = this.normalizeText(reference)
33 |     const hypNorm = this.normalizeText(hypothesis)
34 | 
35 |     if (refNorm.length === 0) return hypNorm.length > 0 ?
1.0 : 0.0 36 | 37 | const distance = levenshtein.get(refNorm, hypNorm) 38 | return distance / refNorm.length 39 | } 40 | 41 | private calculateTokenDistance(reference: string[], hypothesis: string[]): number { 42 | const rows = reference.length + 1 43 | const cols = hypothesis.length + 1 44 | const dp: number[][] = Array.from({ length: rows }, () => Array(cols).fill(0)) 45 | 46 | for (let i = 0; i < rows; i++) dp[i][0] = i 47 | for (let j = 0; j < cols; j++) dp[0][j] = j 48 | 49 | for (let i = 1; i < rows; i++) { 50 | for (let j = 1; j < cols; j++) { 51 | const cost = reference[i - 1] === hypothesis[j - 1] ? 0 : 1 52 | dp[i][j] = Math.min( 53 | dp[i - 1][j] + 1, 54 | dp[i][j - 1] + 1, 55 | dp[i - 1][j - 1] + cost 56 | ) 57 | } 58 | } 59 | 60 | return dp[rows - 1][cols - 1] 61 | } 62 | 63 | async calculateSimilarity(reference: string, hypothesis: string): Promise { 64 | if (!reference || !hypothesis) return 0 65 | 66 | try { 67 | const response = await this.openai.embeddings.create({ 68 | model: 'text-embedding-3-small', 69 | input: [reference, hypothesis], 70 | }) 71 | 72 | const vecA = response.data[0].embedding 73 | const vecB = response.data[1].embedding 74 | 75 | return this.cosineSimilarity(vecA, vecB) 76 | } catch (error) { 77 | console.error('Error calculating similarity:', error) 78 | return 0 79 | } 80 | } 81 | 82 | private cosineSimilarity(vecA: number[], vecB: number[]): number { 83 | let dotProduct = 0 84 | let normA = 0 85 | let normB = 0 86 | for (let i = 0; i < vecA.length; i++) { 87 | dotProduct += vecA[i] * vecB[i] 88 | normA += vecA[i] * vecA[i] 89 | normB += vecB[i] * vecB[i] 90 | } 91 | return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)) 92 | } 93 | 94 | async calculateAll( 95 | reference: string, 96 | hypothesis: string, 97 | latency: { firstWord: number; completion: number } 98 | ): Promise { 99 | return { 100 | wer: this.calculateWER(reference, hypothesis), 101 | cer: this.calculateCER(reference, hypothesis), 102 | similarity: await this.calculateSimilarity(reference, hypothesis), 103 | latency 104 | } 105 | } 106 | } 107 | 108 | -------------------------------------------------------------------------------- /lib/audio-fetcher.ts: -------------------------------------------------------------------------------- 1 | import { Client } from 'pg' 2 | import fs from 'fs-extra' 3 | import path from 'path' 4 | import { loadEnv, getEnvVar } from './utils.js' 5 | 6 | loadEnv() 7 | 8 | export interface CallRecord { 9 | id: string 10 | recordingUrl: string 11 | agentId: string 12 | workspaceId: string 13 | transcript: string | null 14 | } 15 | 16 | export class AudioFetcher { 17 | private client: Client 18 | private outputDir: string 19 | 20 | constructor(outputDir: string) { 21 | this.outputDir = outputDir 22 | this.client = new Client({ 23 | connectionString: getEnvVar('DATABASE_URL', false), // Make optional if only using VAPI API 24 | }) 25 | } 26 | 27 | async connect() { 28 | // Only connect if DATABASE_URL is present 29 | if (getEnvVar('DATABASE_URL', false)) { 30 | await this.client.connect() 31 | } 32 | } 33 | 34 | async disconnect() { 35 | if (getEnvVar('DATABASE_URL', false)) { 36 | await this.client.end() 37 | } 38 | } 39 | 40 | async fetchCallsFromVapi( 41 | assistantId: string, 42 | limit: number, 43 | minDurationSeconds: number 44 | ): Promise { 45 | const apiKey = getEnvVar('VAPI_PRIVATE_KEY', true) 46 | const records: CallRecord[] = [] 47 | 48 | // Fetch calls from VAPI API 49 | // We'll fetch a bit more than limit to account for filtering 50 | const 
fetchLimit = Math.max(limit * 3, 50) 51 | 52 | const url = new URL('https://api.vapi.ai/call') 53 | url.searchParams.append('assistantId', assistantId) 54 | url.searchParams.append('limit', fetchLimit.toString()) 55 | url.searchParams.append('createdAtGt', new Date(Date.now() - 30 * 24 * 60 * 60 * 1000).toISOString()) // Last 30 days 56 | 57 | const response = await fetch(url.toString(), { 58 | headers: { 59 | 'Authorization': `Bearer ${apiKey}`, 60 | 'Content-Type': 'application/json' 61 | } 62 | }) 63 | 64 | if (!response.ok) { 65 | throw new Error(`Failed to fetch calls from VAPI: ${response.statusText}`) 66 | } 67 | 68 | const data = await response.json() as any[] 69 | 70 | for (const call of data) { 71 | // Duration check 72 | // VAPI call object usually has durationSeconds or analysis.durationSeconds 73 | // We'll check top level duration or calculate from startedAt/endedAt if needed 74 | let duration = call.durationSeconds 75 | if (!duration && call.startedAt && call.endedAt) { 76 | duration = (new Date(call.endedAt).getTime() - new Date(call.startedAt).getTime()) / 1000 77 | } 78 | 79 | if ((duration || 0) < minDurationSeconds) continue 80 | 81 | if (!call.recordingUrl && !call.artifact?.recordingUrl) continue 82 | if (call.status !== 'ended') continue 83 | 84 | records.push({ 85 | id: call.id, 86 | recordingUrl: call.recordingUrl || call.artifact?.recordingUrl, 87 | agentId: call.assistantId, 88 | workspaceId: 'vapi-api', // Placeholder 89 | transcript: call.transcript || call.artifact?.transcript || null 90 | }) 91 | 92 | if (records.length >= limit) break 93 | } 94 | 95 | return records 96 | } 97 | 98 | async fetchRecentCalls( 99 | count: number, 100 | workspaceId: string, 101 | agentId?: string 102 | ): Promise { 103 | let query = ` 104 | SELECT id, "recordingUrl", "agentId", "workspaceId", transcript 105 | FROM calls 106 | WHERE "workspaceId" = $1 107 | AND "recordingUrl" IS NOT NULL 108 | AND status = 'ended' 109 | AND provider = 'vapi' 110 | ` 111 | const params: any[] = [workspaceId] 112 | 113 | if (agentId) { 114 | query += ` AND "agentId" = $2` 115 | params.push(agentId) 116 | } 117 | 118 | query += ` ORDER BY "createdAt" DESC LIMIT $${params.length + 1}` 119 | params.push(count) 120 | 121 | try { 122 | const res = await this.client.query(query, params) 123 | return res.rows 124 | } catch (error) { 125 | console.error('Error fetching calls:', error) 126 | throw error 127 | } 128 | } 129 | 130 | async downloadAudio(url: string, callId: string): Promise { 131 | const extension = path.extname(url.split('?')[0]) || '.wav' 132 | const filename = `${callId}${extension}` 133 | const filePath = path.join(this.outputDir, filename) 134 | 135 | if (await fs.pathExists(filePath)) { 136 | return filePath 137 | } 138 | 139 | const response = await fetch(url) 140 | if (!response.ok) { 141 | throw new Error(`Failed to download audio from ${url}: ${response.statusText}`) 142 | } 143 | 144 | const buffer = Buffer.from(await response.arrayBuffer()) 145 | await fs.writeFile(filePath, buffer) 146 | 147 | return filePath 148 | } 149 | } 150 | 151 | -------------------------------------------------------------------------------- /lib/report-generator.ts: -------------------------------------------------------------------------------- 1 | import fs from 'fs-extra' 2 | import path from 'path' 3 | import { BenchmarkResult } from './types.js' 4 | 5 | export class ReportGenerator { 6 | async generateReport(results: BenchmarkResult, outputDir: string) { 7 | const html = this.generateHTML(results) 8 | 
    await fs.writeFile(path.join(outputDir, 'report.html'), html)
 9 |   }
10 | 
11 |   private generateHTML(results: BenchmarkResult): string {
12 |     const models = results.models.map((model) => model.name)
13 |     const werData = results.models.map((model) => (model.avgWER * 100).toFixed(2))
14 |     const latencyData = results.models.map((model) => model.avgTimeToFirstWord)
15 |     const modelTableRows = results.models
16 |       .map((model) => `
17 |         <tr>
18 |           <td>${model.name}</td>
19 |           <td>${(model.avgWER * 100).toFixed(2)}%</td>
20 |           <td>${(model.avgCER * 100).toFixed(2)}%</td>
21 |           <td>${model.avgSimilarity.toFixed(3)}</td>
22 |           <td>${Math.round(model.avgTimeToFirstWord)}</td>
23 |           <td>${Math.round(model.avgTimeToCompletion)}</td>
24 |         </tr>
25 |       `)
26 |       .join('')
27 |     const callOptions = results.perCallResults
28 |       .map((call, index) => `<option value="${index}">${call.callId}</option>`)
29 |       .join('')
30 | 
31 |     const resultsJson = JSON.stringify(results).replace(/</g, '\\u003c')
32 | 
33 |     return `<!DOCTYPE html>
34 | <html lang="en">
35 | <head>
36 |   <meta charset="UTF-8">
37 |   <meta name="viewport" content="width=device-width, initial-scale=1.0">
38 |   <title>ASR Benchmark Results - ${results.timestamp}</title>
39 |   <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
40 |   <style>
41 |     body { font-family: system-ui, sans-serif; margin: 2rem; color: #222; }
42 |     .meta { color: #666; margin-bottom: 2rem; }
43 |     .charts { display: flex; gap: 2rem; flex-wrap: wrap; }
44 |     .chart-box { flex: 1; min-width: 420px; }
45 |     table { border-collapse: collapse; width: 100%; margin-top: 1rem; }
46 |     th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
47 |   </style>
48 | </head>
49 | <body>
50 |   <h1>ASR Benchmark Results</h1>
51 |   <div class="meta">Baseline: ${results.baselineModel} | Calls: ${results.callsAnalyzed} | Date: ${results.timestamp}</div>
52 | 
53 |   <div class="charts">
54 |     <div class="chart-box">
55 | 
56 |       <h2>Average WER (%) - Lower is better</h2>
57 |       <canvas id="werChart"></canvas>
58 |     </div>
59 |     <div class="chart-box">
60 |       <h2>Avg Time to First Word (ms) - Lower is better</h2>
61 |       <canvas id="latencyChart"></canvas>
62 |     </div>
63 |   </div>
64 | 
65 |   <section>
66 |     <h2>Overall Model Ranking</h2>
67 |     <table>
68 |       <thead>
69 |         <tr>
70 |           <th>Model</th>
71 |           <th>Avg WER</th>
72 |           <th>Avg CER</th>
73 |           <th>Similarity</th>
74 |           <th>TTFW (ms)</th>
75 |           <th>Completion (ms)</th>
76 |         </tr>
77 |       </thead>
78 |       <tbody>
79 |         ${modelTableRows}
80 |       </tbody>
81 |     </table>
82 |   </section>
83 | 
84 |   <section>
85 |     <h2>Call Details & Transcripts</h2>
86 |     <select id="callSelect">
87 |       ${callOptions}
88 |     </select>
89 |     <div id="callDetails"></div>
90 |   </section>
91 | 
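The report's inline script is not shown above. As a rough, illustrative sketch of what such a script needs to do — render the two bar charts and wire the call selector — here is one way it could be appended inside the template string returned by `generateHTML()`. It assumes Chart.js is loaded from the CDN and uses the element IDs introduced above (`werChart`, `latencyChart`, `callSelect`, `callDetails`), all of which are assumptions rather than the original implementation:

```ts
// Illustrative sketch only — assumes Chart.js and the element IDs used above.
// Would live inside generateHTML(), where resultsJson, models, werData and latencyData are in scope.
const scriptSection = `
  <script>
    const results = ${resultsJson}
    new Chart(document.getElementById('werChart'), {
      type: 'bar',
      data: { labels: ${JSON.stringify(models)}, datasets: [{ label: 'Avg WER (%)', data: ${JSON.stringify(werData)} }] }
    })
    new Chart(document.getElementById('latencyChart'), {
      type: 'bar',
      data: { labels: ${JSON.stringify(models)}, datasets: [{ label: 'Avg TTFW (ms)', data: ${JSON.stringify(latencyData)} }] }
    })
    const select = document.getElementById('callSelect')
    const details = document.getElementById('callDetails')
    const renderCall = (i) => {
      const call = results.perCallResults[i]
      details.innerHTML = '<h3>Baseline</h3><p>' + call.baselineTranscript + '</p>' +
        Object.entries(call.results)
          .map(([name, r]) => '<h3>' + name + '</h3><p>' + r.transcript + '</p>')
          .join('')
    }
    select.addEventListener('change', (e) => renderCall(Number(e.target.value)))
    if (results.perCallResults.length > 0) renderCall(0)
  </script>
</body>
</html>`
```

The `${...}` placeholders are interpolated server-side when the report string is built, so the generated page itself needs nothing beyond Chart.js at view time.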
97 | 98 | 182 | 183 | ` 184 |   } 185 | } 186 | 187 | 
--------------------------------------------------------------------------------
/index.ts:
--------------------------------------------------------------------------------
 1 | import { Command } from 'commander'
 2 | import chalk from 'chalk'
 3 | import { MultiBar, Presets } from 'cli-progress'
 4 | import path from 'path'
 5 | import { fileURLToPath } from 'url'
 6 | import { AudioFetcher, CallRecord } from './lib/audio-fetcher.js'
 7 | import { BaselineTranscriber } from './lib/baseline-transcriber.js'
 8 | import { MetricsCalculator } from './lib/metrics-calculator.js'
 9 | import { ResultsStorage } from './lib/results-storage.js'
10 | import { ReportGenerator } from './lib/report-generator.js'
11 | import {
12 |   DeepgramTranscriber,
13 |   AssemblyAITranscriber,
14 |   GoogleStreamingTranscriber,
15 |   AwsTranscribeStreaming
16 | } from './lib/streaming-transcribers.js'
17 | import { BenchmarkResult, CallResult, StreamingTranscriber } from './lib/types.js'
18 | 
19 | const moduleDir = path.dirname(fileURLToPath(import.meta.url))
20 | 
21 | const program = new Command()
22 | 
23 | program
24 |   .name('asr-benchmark')
25 |   .description('Benchmark streaming ASR providers against Whisper baseline')
26 |   .option('-w, --workspace-id <id>', 'Workspace ID to fetch calls from (required for db source)')
27 |   .option('-n, --count <number>', 'Number of calls to analyze', '10')
28 |   .option('-a, --agent-id <id>', 'Filter by specific Agent ID (required for vapi source)')
29 |   .option(
30 |     '-o, --output <dir>',
31 |     'Output directory',
32 |     path.join(moduleDir, 'output')
33 |   )
34 |   .option('--source <source>', 'Source of calls: "db" or "vapi"', 'db')
35 |   .option('--min-duration <seconds>', 'Minimum call duration in seconds', '0')
36 |   .option('--skip-baseline', 'Skip baseline generation if a transcript already exists (not yet fully implemented)', false)
37 |   .option('--models <models>', 'Comma-separated list of models to test', 'all')
38 |   .parse(process.argv)
39 | 
40 | const options = program.opts()
41 | 
42 | async function main() {
43 |   console.log(chalk.bold.blue('🎤 ASR Benchmark Tool'))
44 |   if (options.workspaceId) console.log(chalk.gray(`Workspace: ${options.workspaceId}`))
45 |   console.log(chalk.gray(`Output: ${options.output}`))
46 | 
47 |   const storage = new ResultsStorage(options.output)
48 |   const outputDir = await storage.init()
49 | 
50 |   // 1.
Fetch Calls 51 | const fetcher = new AudioFetcher(outputDir) 52 | await fetcher.connect() 53 | 54 | let calls: CallRecord[] = [] 55 | 56 | if (options.source === 'vapi') { 57 | if (!options.agentId) { 58 | console.error(chalk.red('Error: --agent-id is required when using --source vapi')) 59 | process.exit(1) 60 | } 61 | console.log(chalk.yellow(`Fetching calls from VAPI API (Agent: ${options.agentId}, Min Duration: ${options.minDuration}s)...`)) 62 | calls = await fetcher.fetchCallsFromVapi( 63 | options.agentId, 64 | parseInt(options.count), 65 | parseInt(options.minDuration) 66 | ) 67 | } else { 68 | if (!options.workspaceId) { 69 | console.error(chalk.red('Error: --workspace-id is required when using --source db')) 70 | process.exit(1) 71 | } 72 | console.log(chalk.yellow('Fetching recent calls from DB...')) 73 | calls = await fetcher.fetchRecentCalls( 74 | parseInt(options.count), 75 | options.workspaceId, 76 | options.agentId 77 | ) 78 | } 79 | 80 | await fetcher.disconnect() 81 | 82 | if (calls.length === 0) { 83 | console.log(chalk.red('No calls found matching criteria.')) 84 | process.exit(1) 85 | } 86 | console.log(chalk.green(`Found ${calls.length} calls.`)) 87 | 88 | // 2. Setup Components 89 | const baselineTranscriber = new BaselineTranscriber() 90 | const metricsCalc = new MetricsCalculator() 91 | 92 | // Setup Streaming Models 93 | const allModels: StreamingTranscriber[] = [ 94 | new DeepgramTranscriber('nova-3', 'Deepgram Nova 3'), 95 | new DeepgramTranscriber('nova-2', 'Deepgram Nova 2'), 96 | new AssemblyAITranscriber(), 97 | new GoogleStreamingTranscriber(), 98 | new AwsTranscribeStreaming() 99 | ] 100 | 101 | const requestedModels = options.models === 'all' 102 | ? allModels 103 | : allModels.filter(m => options.models.split(',').includes(m.name)) 104 | 105 | // 3. Process Calls 106 | const multiBar = new MultiBar({ 107 | clearOnComplete: false, 108 | hideCursor: true, 109 | format: '{bar} | {percentage}% | {value}/{total} | {task}' 110 | }, Presets.shades_grey) 111 | 112 | const mainBar = multiBar.create(calls.length, 0, { task: 'Processing Calls' }) 113 | 114 | const perCallResults: CallResult[] = [] 115 | const modelStats: Record = {} 116 | 117 | requestedModels.forEach(m => { 118 | modelStats[m.name] = { wer: [], cer: [], sim: [], ttfw: [], ttc: [] } 119 | }) 120 | 121 | for (const call of calls) { 122 | mainBar.increment(0, { task: `Processing Call ${call.id}` }) 123 | 124 | // Download Audio 125 | const audioPath = await fetcher.downloadAudio(call.recordingUrl, call.id) 126 | 127 | // Baseline 128 | // Use existing transcript from DB if available and reliable? 129 | // The prompt implies using Whisper Large v3 as baseline. VAPI might use a different model. 130 | // We will re-transcribe with Whisper Large v3 locally/API to be sure of the baseline quality. 
131 | let baselineText = '' 132 | try { 133 | const res = await baselineTranscriber.transcribe(audioPath) 134 | baselineText = res.transcript 135 | } catch (err) { 136 | console.error(chalk.red(`Baseline failed for ${call.id}:`), err) 137 | baselineText = call.transcript || '' // Fallback to VAPI transcript 138 | } 139 | 140 | const callResult: CallResult = { 141 | callId: call.id, 142 | audioPath, 143 | baselineTranscript: baselineText, 144 | results: {} 145 | } 146 | 147 | // Run Models in Parallel 148 | await Promise.all(requestedModels.map(async (model) => { 149 | try { 150 | const transRes = await model.transcribe(audioPath) 151 | const metrics = await metricsCalc.calculateAll( 152 | baselineText, 153 | transRes.transcript, 154 | { firstWord: transRes.timeToFirstWord, completion: transRes.timeToCompletion } 155 | ) 156 | 157 | callResult.results[model.name] = { 158 | transcript: transRes.transcript, 159 | metrics 160 | } 161 | 162 | // Accumulate stats 163 | modelStats[model.name].wer.push(metrics.wer) 164 | modelStats[model.name].cer.push(metrics.cer) 165 | modelStats[model.name].sim.push(metrics.similarity) 166 | modelStats[model.name].ttfw.push(metrics.latency.firstWord) 167 | modelStats[model.name].ttc.push(metrics.latency.completion) 168 | 169 | } catch (err: any) { 170 | // Log error but continue 171 | // console.error(`Model ${model.name} failed for ${call.id}`, err.message) 172 | } 173 | })) 174 | 175 | perCallResults.push(callResult) 176 | mainBar.increment() 177 | } 178 | 179 | multiBar.stop() 180 | 181 | // 4. Aggregate Results 182 | const benchmarkResult: BenchmarkResult = { 183 | timestamp: new Date().toISOString(), 184 | baselineModel: 'Whisper Large v3', 185 | callsAnalyzed: calls.length, 186 | models: requestedModels.map(m => { 187 | const stats = modelStats[m.name] 188 | const count = stats.wer.length || 1 189 | return { 190 | name: m.name, 191 | avgWER: stats.wer.reduce((a,b) => a+b, 0) / count, 192 | avgCER: stats.cer.reduce((a,b) => a+b, 0) / count, 193 | avgSimilarity: stats.sim.reduce((a,b) => a+b, 0) / count, 194 | avgTimeToFirstWord: stats.ttfw.reduce((a,b) => a+b, 0) / count, 195 | avgTimeToCompletion: stats.ttc.reduce((a,b) => a+b, 0) / count 196 | } 197 | }), 198 | perCallResults 199 | } 200 | 201 | // 5. 
Save & Report 202 | await storage.saveResults(benchmarkResult) 203 | 204 | const reporter = new ReportGenerator() 205 | await reporter.generateReport(benchmarkResult, outputDir) 206 | 207 | console.log(chalk.bold.green('\n✅ Benchmark Complete!')) 208 | console.log(`Results saved to: ${outputDir}`) 209 | console.log(`Report: file://${path.join(outputDir, 'report.html')}`) 210 | } 211 | 212 | main().catch(err => { 213 | console.error(chalk.red('Fatal Error:'), err) 214 | process.exit(1) 215 | }) 216 | 217 | -------------------------------------------------------------------------------- /lib/streaming-transcribers.ts: -------------------------------------------------------------------------------- 1 | import { createClient, LiveTranscriptionEvents } from '@deepgram/sdk' 2 | import { AssemblyAI } from 'assemblyai' 3 | import { protos, SpeechClient } from '@google-cloud/speech' 4 | import { 5 | TranscribeStreamingClient, 6 | StartStreamTranscriptionCommand, 7 | StartStreamTranscriptionCommandInput 8 | } from '@aws-sdk/client-transcribe-streaming' 9 | import fs from 'fs' 10 | import { getEnvVar, sleep } from './utils.js' 11 | import { StreamingTranscriber, TranscriptionResult } from './types.js' 12 | 13 | type DeepgramTranscriptEvent = { 14 | channel: { 15 | alternatives: Array<{ 16 | transcript: string 17 | }> 18 | } 19 | is_final: boolean 20 | } 21 | 22 | type AssemblyTranscriptEvent = { 23 | text: string 24 | message_type: 'PartialTranscript' | 'FinalTranscript' 25 | } 26 | 27 | const bufferToArrayBuffer = (buffer: Buffer): ArrayBufferLike => 28 | buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength) 29 | 30 | // --- Deepgram --- 31 | 32 | export class DeepgramTranscriber implements StreamingTranscriber { 33 | name: string 34 | private model: string 35 | private apiKey: string 36 | 37 | constructor(model: string = 'nova-3', name: string) { 38 | this.model = model 39 | this.name = name 40 | this.apiKey = getEnvVar('DEEPGRAM_API_KEY', true) 41 | } 42 | 43 | async transcribe(audioPath: string): Promise { 44 | const deepgram = createClient(this.apiKey) 45 | const startTime = Date.now() 46 | let firstWordTime = 0 47 | const transcripts: string[] = [] 48 | 49 | return new Promise(async (resolve, reject) => { 50 | const connection = deepgram.listen.live({ 51 | model: this.model, 52 | smart_format: true, 53 | interim_results: true, 54 | }) 55 | 56 | connection.on(LiveTranscriptionEvents.Open, async () => { 57 | connection.on(LiveTranscriptionEvents.Transcript, (data: DeepgramTranscriptEvent) => { 58 | const transcript = data.channel.alternatives[0]?.transcript ?? '' 59 | if (transcript && !firstWordTime) { 60 | firstWordTime = Date.now() - startTime 61 | } 62 | if (data.is_final && transcript) { 63 | transcripts.push(transcript) 64 | } 65 | }) 66 | 67 | connection.on(LiveTranscriptionEvents.Close, () => { 68 | const endTime = Date.now() 69 | resolve({ 70 | transcript: transcripts.join(' '), 71 | timeToFirstWord: firstWordTime || (endTime - startTime), 72 | timeToCompletion: endTime - startTime, 73 | metadata: { model: this.model } 74 | }) 75 | }) 76 | 77 | connection.on(LiveTranscriptionEvents.Error, (err: unknown) => { 78 | reject(err instanceof Error ? 
err : new Error('Deepgram streaming error')) 79 | }) 80 | 81 | // Stream audio 82 | const fileBuffer = await fs.promises.readFile(audioPath) 83 | connection.send(bufferToArrayBuffer(fileBuffer)) 84 | // finish 85 | connection.finish() 86 | }) 87 | }) 88 | } 89 | } 90 | 91 | // --- AssemblyAI --- 92 | 93 | export class AssemblyAITranscriber implements StreamingTranscriber { 94 | name: string = 'AssemblyAI Streaming' 95 | private client: AssemblyAI 96 | 97 | constructor() { 98 | this.client = new AssemblyAI({ 99 | apiKey: getEnvVar('ASSEMBLYAI_API_KEY', true) 100 | }) 101 | } 102 | 103 | async transcribe(audioPath: string): Promise { 104 | const startTime = Date.now() 105 | let firstWordTime = 0 106 | const transcripts: string[] = [] 107 | 108 | const rt = this.client.realtime.transcriber({ 109 | sampleRate: 16000 110 | }) 111 | 112 | return new Promise(async (resolve, reject) => { 113 | rt.on('open', ({ sessionId }: { sessionId: string }) => { 114 | // Session started 115 | }) 116 | 117 | rt.on('error', (error: unknown) => { 118 | reject(error instanceof Error ? error : new Error('AssemblyAI streaming error')) 119 | }) 120 | 121 | rt.on('transcript', (transcript: AssemblyTranscriptEvent) => { 122 | if (transcript.text && !firstWordTime) { 123 | firstWordTime = Date.now() - startTime 124 | } 125 | if (transcript.message_type === 'FinalTranscript') { 126 | transcripts.push(transcript.text) 127 | } 128 | }) 129 | 130 | rt.on('close', (code: number, reason: string) => { 131 | const endTime = Date.now() 132 | resolve({ 133 | transcript: transcripts.join(' '), 134 | timeToFirstWord: firstWordTime || (endTime - startTime), 135 | timeToCompletion: endTime - startTime, 136 | metadata: { model: 'assemblyai-streaming' } 137 | }) 138 | }) 139 | 140 | try { 141 | await rt.connect() 142 | const fileBuffer = await fs.promises.readFile(audioPath) 143 | const CHUNK_SIZE = 8192 144 | for (let i = 0; i < fileBuffer.length; i += CHUNK_SIZE) { 145 | const chunk = fileBuffer.subarray(i, i + CHUNK_SIZE) 146 | rt.sendAudio(bufferToArrayBuffer(chunk)) 147 | await sleep(20) 148 | } 149 | await rt.close() 150 | } catch (err) { 151 | reject(err instanceof Error ? err : new Error('AssemblyAI streaming failure')) 152 | } 153 | }) 154 | } 155 | } 156 | 157 | // --- Google Cloud Speech --- 158 | 159 | export class GoogleStreamingTranscriber implements StreamingTranscriber { 160 | name: string = 'Google Streaming' 161 | private client: SpeechClient 162 | 163 | constructor() { 164 | // Relies on GOOGLE_APPLICATION_CREDENTIALS env var or default auth 165 | this.client = new SpeechClient() 166 | } 167 | 168 | async transcribe(audioPath: string): Promise { 169 | const startTime = Date.now() 170 | let firstWordTime = 0 171 | const transcripts: string[] = [] 172 | 173 | return new Promise((resolve, reject) => { 174 | const recognizeStream = this.client 175 | .streamingRecognize({ 176 | config: { 177 | encoding: 'LINEAR16', // Assumes WAV/PCM. Might need conversion if MP3. 
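// If recordings are MP3 rather than WAV/PCM, one possible pre-conversion sketch
// (illustrative, not part of this file; assumes ffmpeg is on PATH, and wavPath is a hypothetical output path):
//   import { execFile } from 'node:child_process'
//   import { promisify } from 'node:util'
//   await promisify(execFile)('ffmpeg', ['-y', '-i', audioPath, '-ar', '16000', '-ac', '1', wavPath])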
178 | sampleRateHertz: 16000, 179 | languageCode: 'en-US', 180 | enableAutomaticPunctuation: true, 181 | }, 182 | interimResults: true, 183 | }) 184 | .on('error', reject) 185 | .on('data', (data: protos.google.cloud.speech.v1.StreamingRecognizeResponse) => { 186 | const result = data.results?.[0] 187 | const transcript = result?.alternatives?.[0]?.transcript 188 | if (result && transcript) { 189 | if (transcript && !firstWordTime) { 190 | firstWordTime = Date.now() - startTime 191 | } 192 | if (result.isFinal) { 193 | transcripts.push(transcript) 194 | } 195 | } 196 | }) 197 | .on('end', () => { 198 | const endTime = Date.now() 199 | resolve({ 200 | transcript: transcripts.join(' '), 201 | timeToFirstWord: firstWordTime || (endTime - startTime), 202 | timeToCompletion: endTime - startTime, 203 | metadata: { model: 'google-streaming' } 204 | }) 205 | }) 206 | 207 | const fileStream = fs.createReadStream(audioPath) 208 | fileStream.pipe(recognizeStream) 209 | }) 210 | } 211 | } 212 | 213 | // --- AWS Transcribe --- 214 | 215 | export class AwsTranscribeStreaming implements StreamingTranscriber { 216 | name: string = 'AWS Transcribe Streaming' 217 | private client: TranscribeStreamingClient 218 | 219 | constructor() { 220 | this.client = new TranscribeStreamingClient({ 221 | region: getEnvVar('AWS_REGION', true), 222 | credentials: { 223 | accessKeyId: getEnvVar('AWS_ACCESS_KEY_ID', true), 224 | secretAccessKey: getEnvVar('AWS_SECRET_ACCESS_KEY', true), 225 | } 226 | }) 227 | } 228 | 229 | async transcribe(audioPath: string): Promise { 230 | const startTime = Date.now() 231 | let firstWordTime = 0 232 | const transcripts: string[] = [] 233 | 234 | // Create an async generator for the audio stream 235 | const audioStream = async function* () { 236 | const fileStream = fs.createReadStream(audioPath, { highWaterMark: 1024 * 4 }) 237 | for await (const chunk of fileStream) { 238 | yield { AudioEvent: { AudioChunk: chunk } } 239 | } 240 | }() 241 | 242 | const params: StartStreamTranscriptionCommandInput = { 243 | LanguageCode: 'en-US', 244 | MediaEncoding: 'pcm', // Assumes PCM/WAV. AWS Transcribe Streaming supports pcm, ogg-opus, flac. 245 | MediaSampleRateHertz: 16000, 246 | AudioStream: audioStream, 247 | } 248 | 249 | try { 250 | const command = new StartStreamTranscriptionCommand(params) 251 | const response = await this.client.send(command) 252 | 253 | if (response.TranscriptResultStream) { 254 | for await (const event of response.TranscriptResultStream) { 255 | if (event.TranscriptEvent) { 256 | const results = event.TranscriptEvent.Transcript?.Results 257 | if (results && results.length > 0) { 258 | const result = results[0] 259 | if (result.Alternatives && result.Alternatives.length > 0) { 260 | const transcript = result.Alternatives[0].Transcript 261 | if (transcript && !firstWordTime) { 262 | firstWordTime = Date.now() - startTime 263 | } 264 | if (!result.IsPartial) { 265 | transcripts.push(transcript || '') 266 | } 267 | } 268 | } 269 | } 270 | } 271 | } 272 | 273 | const endTime = Date.now() 274 | return { 275 | transcript: transcripts.join(' '), 276 | timeToFirstWord: firstWordTime || (endTime - startTime), 277 | timeToCompletion: endTime - startTime, 278 | metadata: { model: 'aws-streaming' } 279 | } 280 | 281 | } catch (err) { 282 | throw err 283 | } 284 | } 285 | } 286 | --------------------------------------------------------------------------------