├── tsconfig.json ├── src ├── constants.ts ├── download.ts └── index.ts ├── package.json └── README.md /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "module": "es6", 4 | "esModuleInterop": true, 5 | "target": "es6", 6 | "moduleResolution": "node", 7 | "sourceMap": true, 8 | "outDir": "dist", 9 | "lib": ["es2015"] 10 | } 11 | } -------------------------------------------------------------------------------- /src/constants.ts: -------------------------------------------------------------------------------- 1 | export const DEFAULT_MODEL = "base.en"; 2 | 3 | export const NODE_MODULES_MODELS_PATH = "node_modules/whisper-onnx-speech-to-text/models"; 4 | 5 | export const MODELS_LIST = { 6 | "tiny": "whisper-tiny", 7 | "tiny.en": "whisper-tiny.en", 8 | "base": "whisper-base", 9 | "base.en": "whisper-base.en", 10 | "small": "whisper-small", 11 | "small.en": "whisper-small.en", 12 | "medium": "whisper-medium", 13 | "medium.en": "whisper-medium.en" 14 | }; 15 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "whisper-onnx-speech-to-text", 3 | "version": "1.0.1", 4 | "description": "Node.js plugin for speech recognition that works with OpenAI's Whisper models using ONNX.", 5 | "homepage": "https://npmjs.com/whisper-onnx-speech-to-text", 6 | "author": "Alexandr Janashvili", 7 | "main": "dist/index.js", 8 | "type": "module", 9 | "bin": { 10 | "download": "dist/download.js" 11 | }, 12 | "scripts": { 13 | "build": "npx tsc && chmod +x dist/download.js" 14 | }, 15 | "devDependencies": { 16 | "@types/node": "^20.5.1", 17 | "ts-node": "^10.9.1", 18 | "typescript": "^5.1.6" 19 | }, 20 | "dependencies": { 21 | "@xenova/transformers": "^2.5.2", 22 | "readline-sync": "^1.4.10", 23 | "shelljs": "^0.8.5", 24 | "wavefile": "^11.0.0" 25 | }, 26 | "repository": { 
27 | "type": "git", 28 | "url": "https://github.com/Alexandr-Janashvili/whisper-onnx-speech-to-text.git" 29 | }, 30 | "license": "MIT", 31 | "keywords": [ 32 | "openai", 33 | "whisper", 34 | "onnx", 35 | "huggingface", 36 | "transformers", 37 | "transformers.js", 38 | "transcribe", 39 | "audio recognition", 40 | "speech", 41 | "speech-to-text", 42 | "wav2text", 43 | "stt", 44 | "tts" 45 | ] 46 | } 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # whisper-onnx-speech-to-text 2 | 3 | [![npm downloads](https://img.shields.io/npm/dm/whisper-onnx-speech-to-text)](https://npmjs.org/package/whisper-onnx-speech-to-text) 4 | [![license](https://img.shields.io/npm/l/whisper-onnx-speech-to-text)](https://npmjs.org/package/whisper-onnx-speech-to-text) 5 | 6 | Transcribe speech to text on node.js using OpenAI's Whisper models converted to cross-platform ONNX format 7 | 8 | ## Installation 9 | 10 | 1. Add dependency to project 11 | 12 | ```text 13 | npm install whisper-onnx-speech-to-text 14 | ``` 15 | 16 | 2. Download whisper model of choice 17 | 18 | ```text 19 | npx whisper-onnx-speech-to-text download 20 | ``` 21 | 22 | ## Usage 23 | 24 | ```js 25 | import { initWhisper } from 'whisper-onnx-speech-to-text'; 26 | 27 | const whisper = await initWhisper("base.en"); 28 | 29 | const transcript = await whisper.transcribe("example/sample.wav"); 30 | ``` 31 | 32 | ### Result (JSON) 33 | 34 | ```javascript 35 | [ 36 | { 37 | text: " And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.", 38 | chunks: [ 39 | { timestamp: [0, 8.18], text: " And so my fellow Americans ask not what your country can do for you" }, 40 | { timestamp: [8.18, 11.06], text: " ask what you can do for your country." 
} 41 | ] 42 | } 43 | ] 44 | ``` 45 | 46 | ## API 47 | 48 | ### initWhisper 49 | The `initWhisper()` takes the name of the model and returns an instance of the Whisper class initialized with the chosen model. 50 | 51 | ### Whisper 52 | 53 | The `Whisper` class has the following methods: 54 | 55 | - `transcribe(filePath: string, language?: string)` : transcribes speech from wav file. 56 | - `filePath`: path to wav file 57 | - `language`: target language for recognition. Name format - the full name in English like `'spanish'` 58 | - `disposeModel()` : dispose initialized model. 59 | 60 | ## Made with 61 | 62 | - [Transformers.js](https://www.npmjs.com/package/@xenova/transformers) 63 | - [ShellJS](https://www.npmjs.com/package/shelljs) 64 | -------------------------------------------------------------------------------- /src/download.ts: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env node 2 | 3 | import shell from 'shelljs'; 4 | import readlineSync from 'readline-sync'; 5 | import path from 'path'; 6 | import { MODELS_LIST, DEFAULT_MODEL, NODE_MODULES_MODELS_PATH } from './constants.js'; 7 | 8 | const MODEL_FILES = [ // every file fetched per model from its Hugging Face repo (see downloadModel) 9 | "added_tokens.json", 10 | "config.json", 11 | "generation_config.json", 12 | "merges.txt", 13 | "normalizer.json", 14 | "preprocessor_config.json", 15 | "quant_config.json", 16 | "special_tokens_map.json", 17 | "tokenizer.json", 18 | "tokenizer_config.json", 19 | "vocab.json", 20 | "onnx/encoder_model.onnx", 21 | "onnx/decoder_model_merged.onnx" 22 | ]; 23 | 24 | const src="https://huggingface.co/Xenova"; // base URL of the Hugging Face org hosting the converted models 25 | const pfx="resolve/main"; // HF path segment used to download raw files from the main branch 26 | 27 | const askModel = async () => { // prompt loop: resolves to a valid model name; ENTER -> DEFAULT_MODEL, 'cancel' -> exit 28 | const answer = await readlineSync.question(`\n[whisper-onnx-speech-to-text] Enter model name (e.g. 'base.en') or 'cancel' to exit\n(ENTER for base.en): `) // NOTE(review): readline-sync is synchronous, so this await is a no-op
 29 | 30 | if (answer === "cancel") { 31 | console.log("[whisper-onnx-speech-to-text] Exiting model downloader. 
Run again with: 'npx whisper-onnx-speech-to-text download'"); 32 | process.exit(0); 33 | } 34 | else if (answer === "") { 35 | console.log("[whisper-onnx-speech-to-text] Going with", DEFAULT_MODEL); 36 | return DEFAULT_MODEL; 37 | } 38 | else if (!MODELS_LIST[answer]) { // unknown name: report and re-prompt 39 | console.log("\n[whisper-onnx-speech-to-text] FAIL: Name not found."); 40 | 41 | return await askModel(); 42 | } 43 | 44 | return answer; 45 | } 46 | 47 | export default async function downloadModel() { // CLI entry: print the size table, ask for a model, fetch its files 48 | try { 49 | console.log(` 50 | | Model | Disk | 51 | |-----------|--------| 52 | | tiny | 235 MB | 53 | | tiny.en | 235 MB | 54 | | base | 400 MB | 55 | | base.en | 400 MB | 56 | | small | 1.1 GB | 57 | | small.en | 1.1 GB | 58 | | medium | 1.2 GB | 59 | | medium.en | 1.2 GB | 60 | `); 61 | 62 | const modelName = await askModel(); 63 | 64 | if (!!shell.which("wget")) { // prefer wget when available, else fall back to curl 65 | MODEL_FILES.forEach(fileName => { 66 | shell.exec(`wget --quiet --show-progress -P ./${NODE_MODULES_MODELS_PATH}/${MODELS_LIST[modelName]}/${path.dirname(fileName)} ${src}/${MODELS_LIST[modelName]}/${pfx}/${fileName}`); // -P writes each file into the per-model directory 67 | }); 68 | } 69 | else if (!!shell.which("curl")) { 70 | MODEL_FILES.forEach(fileName => { 71 | shell.exec(`curl -L ${src}/${MODELS_LIST[modelName]}/${pfx}/${fileName} -o ${NODE_MODULES_MODELS_PATH}/${MODELS_LIST[modelName]}/${fileName} --create-dirs`); // -L follows redirects; --create-dirs builds the target path 72 | }); 73 | } 74 | else { 75 | console.log("[whisper-onnx-speech-to-text] Either wget or curl is required to download models."); 76 | } 77 | 78 | process.exit(0); // explicit exit after the download pass (also reached when neither tool exists) 79 | } catch (error) { 80 | console.log("ERROR Caught in download model"); 81 | console.log(error); 82 | return error; 83 | } 84 | } 85 | 86 | downloadModel(); // run on import: this file is the package's 'download' bin script 87 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | import wavefile from 'wavefile'; 2 | import fs from 'fs'; 3 | import path from 'path'; 4 | import util from 'util'; 5 | import { Pipeline, pipeline, env 
} from '@xenova/transformers'; 6 | import { DEFAULT_MODEL, MODELS_LIST, NODE_MODULES_MODELS_PATH } from './constants.js'; 7 | 8 | env.local_files_only = true; // load models from disk only; never fetch from the Hub at runtime 9 | env.localModelPath = NODE_MODULES_MODELS_PATH; // model names resolve inside this package's models directory 10 | env.backends.onnx.wasm.numThreads = 1; // single-threaded ONNX WASM backend 11 | 12 | const readFile = util.promisify(fs.readFile); 13 | 14 | const modelPromise = (modelName: string): Promise<Pipeline> => { // resolves the ASR pipeline for modelName; rejects if unknown or not downloaded 15 | return new Promise<Pipeline>(async (resolve, reject) => { 16 | try { 17 | 18 | if (!MODELS_LIST[modelName]) 19 | throw `[whisper-onnx-speech-to-text] modelName "${modelName}" not found in list of models.\n`; 20 | 21 | if (!fs.existsSync(`${NODE_MODULES_MODELS_PATH}/${MODELS_LIST[modelName]}`)) 22 | throw `[whisper-onnx-speech-to-text] '${modelName}' not downloaded! Run 'npx whisper-onnx-speech-to-text download'\n`; 23 | 24 | resolve(await pipeline("automatic-speech-recognition", MODELS_LIST[modelName], { quantized: false })); // full-precision (non-quantized) ONNX weights 25 | 26 | } catch (err) { 27 | reject(err); 28 | } 29 | }); 30 | }; 31 | 32 | const prepareAudio = async (filePath: string): Promise<Float64Array> => { // reads a wav file and returns mono samples resampled to 16 kHz 33 | const wav = new wavefile.WaveFile(await readFile(path.normalize(filePath))); 34 | wav.toBitDepth('32f'); // convert samples to 32-bit float 35 | wav.toSampleRate(16000); // resample to 16 kHz, the rate the pipeline is fed 36 | let audioData = wav.getSamples(); 37 | 38 | if (Array.isArray(audioData)) { // multi-channel file: downmix to mono 39 | if (audioData.length > 1) { 40 | const SCALING_FACTOR = Math.sqrt(2); // compensates loudness lost when averaging two channels 41 | 42 | for (let i = 0; i < audioData[0].length; ++i) { 43 | audioData[0][i] = SCALING_FACTOR * (audioData[0][i] + audioData[1][i]) / 2; 44 | } 45 | } 46 | 47 | audioData = audioData[0]; 48 | } 49 | 50 | return audioData; 51 | }; 52 | 53 | class Whisper { 54 | private model: Pipeline; 55 | 56 | constructor(model: Pipeline) { 57 | this.model = model; 58 | } 59 | 60 | public async transcribe(filePath: string, language?: string) { // transcribes a wav file; 'language' is a full English name like 'spanish' 61 | try { 62 | const audioData = await prepareAudio(filePath); 63 | 64 | const lang = language ? 
{ language } : {}; 65 | 66 | return this.model(audioData, { 67 | chunk_length_s: 30, 68 | stride_length_s: 5, 69 | return_timestamps: true, 70 | ...lang 71 | }); 72 | } catch (error) { 73 | console.log("[whisper-onnx-speech-to-text] Problem:", error); 74 | } 75 | } 76 | 77 | public async disposeModel() { 78 | return this.model.dispose(); 79 | } 80 | } 81 | 82 | export const initWhisper = async (modelName?: string): Promise => { 83 | if (!modelName) 84 | console.log("[whisper-onnx-speech-to-text] No 'modelName' provided. Trying default model:", DEFAULT_MODEL, "\n"); 85 | 86 | const model = await modelPromise(modelName || DEFAULT_MODEL); 87 | return new Whisper(model); 88 | }; 89 | --------------------------------------------------------------------------------