├── tsconfig.json ├── src ├── constants.ts ├── download.ts └── index.ts ├── package.json └── README.md /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "module": "es6", 4 | "esModuleInterop": true, 5 | "target": "es6", 6 | "moduleResolution": "node", 7 | "sourceMap": true, 8 | "outDir": "dist", 9 | "lib": ["es2015"] 10 | } 11 | } -------------------------------------------------------------------------------- /src/constants.ts: -------------------------------------------------------------------------------- 1 | export const DEFAULT_MODEL = "base.en"; 2 | 3 | export const NODE_MODULES_MODELS_PATH = "node_modules/whisper-onnx-speech-to-text/models"; 4 | 5 | export const MODELS_LIST = { 6 | "tiny": "whisper-tiny", 7 | "tiny.en": "whisper-tiny.en", 8 | "base": "whisper-base", 9 | "base.en": "whisper-base.en", 10 | "small": "whisper-small", 11 | "small.en": "whisper-small.en", 12 | "medium": "whisper-medium", 13 | "medium.en": "whisper-medium.en" 14 | }; 15 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "whisper-onnx-speech-to-text", 3 | "version": "1.0.1", 4 | "description": "Node.js plugin for speech recognition that works with OpenAI's Whisper models using ONNX.", 5 | "homepage": "https://npmjs.com/whisper-onnx-speech-to-text", 6 | "author": "Alexandr Janashvili", 7 | "main": "dist/index.js", 8 | "type": "module", 9 | "bin": { 10 | "download": "dist/download.js" 11 | }, 12 | "scripts": { 13 | "build": "npx tsc && chmod +x dist/download.js" 14 | }, 15 | "devDependencies": { 16 | "@types/node": "^20.5.1", 17 | "ts-node": "^10.9.1", 18 | "typescript": "^5.1.6" 19 | }, 20 | "dependencies": { 21 | "@xenova/transformers": "^2.5.2", 22 | "readline-sync": "^1.4.10", 23 | "shelljs": "^0.8.5", 24 | "wavefile": "^11.0.0" 25 | }, 26 | "repository": { 
27 | "type": "git", 28 | "url": "https://github.com/Alexandr-Janashvili/whisper-onnx-speech-to-text.git" 29 | }, 30 | "license": "MIT", 31 | "keywords": [ 32 | "openai", 33 | "whisper", 34 | "onnx", 35 | "huggingface", 36 | "transformers", 37 | "transformers.js", 38 | "transcribe", 39 | "audio recognition", 40 | "speech", 41 | "speech-to-text", 42 | "wav2text", 43 | "stt", 44 | "tts" 45 | ] 46 | } 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # whisper-onnx-speech-to-text 2 | 3 | [![npm downloads](https://img.shields.io/npm/dm/whisper-onnx-speech-to-text)](https://npmjs.org/package/whisper-onnx-speech-to-text) 4 | [![license](https://img.shields.io/npm/l/whisper-onnx-speech-to-text)](https://npmjs.org/package/whisper-onnx-speech-to-text) 5 | 6 | Transcribe speech to text on node.js using OpenAI's Whisper models converted to cross-platform ONNX format 7 | 8 | ## Installation 9 | 10 | 1. Add dependency to project 11 | 12 | ```text 13 | npm install whisper-onnx-speech-to-text 14 | ``` 15 | 16 | 2. Download whisper model of choice 17 | 18 | ```text 19 | npx whisper-onnx-speech-to-text download 20 | ``` 21 | 22 | ## Usage 23 | 24 | ```js 25 | import { initWhisper } from 'whisper-onnx-speech-to-text'; 26 | 27 | const whisper = await initWhisper("base.en"); 28 | 29 | const transcript = await whisper.transcribe("example/sample.wav"); 30 | ``` 31 | 32 | ### Result (JSON) 33 | 34 | ```javascript 35 | [ 36 | { 37 | text: " And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.", 38 | chunks: [ 39 | { timestamp: [0, 8.18], text: " And so my fellow Americans ask not what your country can do for you" }, 40 | { timestamp: [8.18, 11.06], text: " ask what you can do for your country." 
} 41 | ] 42 | } 43 | ] 44 | ``` 45 | 46 | ## API 47 | 48 | ### initWhisper 49 | The `initWhisper()` takes the name of the model and returns an instance of the Whisper class initialized with the chosen model. 50 | 51 | ### Whisper 52 | 53 | The `Whisper` class has the following methods: 54 | 55 | - `transcribe(filePath: string, language?: string)` : transcribes speech from wav file. 56 | - `filePath`: path to wav file 57 | - `language`: target language for recognition. Name format - the full name in English like `'spanish'` 58 | - `disposeModel()` : dispose initialized model. 59 | 60 | ## Made with 61 | 62 | - [Transformers.js](https://www.npmjs.com/package/@xenova/transformers) 63 | - [ShellJS](https://www.npmjs.com/package/shelljs) 64 | -------------------------------------------------------------------------------- /src/download.ts: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env node 2 | 3 | import shell from 'shelljs'; 4 | import readlineSync from 'readline-sync'; 5 | import path from 'path'; 6 | import { MODELS_LIST, DEFAULT_MODEL, NODE_MODULES_MODELS_PATH } from './constants.js'; 7 | 8 | const MODEL_FILES = [ // every file fetched per model from its Hugging Face repo (see downloadModel) 9 | "added_tokens.json", 10 | "config.json", 11 | "generation_config.json", 12 | "merges.txt", 13 | "normalizer.json", 14 | "preprocessor_config.json", 15 | "quant_config.json", 16 | "special_tokens_map.json", 17 | "tokenizer.json", 18 | "tokenizer_config.json", 19 | "vocab.json", 20 | "onnx/encoder_model.onnx", 21 | "onnx/decoder_model_merged.onnx" 22 | ]; 23 | 24 | const src="https://huggingface.co/Xenova"; // base URL of the Hugging Face org hosting the converted models 25 | const pfx="resolve/main"; // HF path segment used to download raw files from the main branch 26 | 27 | const askModel = async () => { // prompt loop: resolves to a valid model name; ENTER -> DEFAULT_MODEL, 'cancel' -> exit 28 | const answer = await readlineSync.question(`\n[whisper-onnx-speech-to-text] Enter model name (e.g. 'base.en') or 'cancel' to exit\n(ENTER for base.en): `) // NOTE(review): readline-sync is synchronous, so this await is a no-op
 29 | 30 | if (answer === "cancel") { 31 | console.log("[whisper-onnx-speech-to-text] Exiting model downloader. 
Run again with: 'npx whisper-onnx-speech-to-text download'"); 32 | process.exit(0); 33 | } 34 | else if (answer === "") { 35 | console.log("[whisper-onnx-speech-to-text] Going with", DEFAULT_MODEL); 36 | return DEFAULT_MODEL; 37 | } 38 | else if (!MODELS_LIST[answer]) { // unknown name: report and re-prompt 39 | console.log("\n[whisper-onnx-speech-to-text] FAIL: Name not found."); 40 | 41 | return await askModel(); 42 | } 43 | 44 | return answer; 45 | } 46 | 47 | export default async function downloadModel() { // CLI entry: print the size table, ask for a model, fetch its files 48 | try { 49 | console.log(` 50 | | Model | Disk | 51 | |-----------|--------| 52 | | tiny | 235 MB | 53 | | tiny.en | 235 MB | 54 | | base | 400 MB | 55 | | base.en | 400 MB | 56 | | small | 1.1 GB | 57 | | small.en | 1.1 GB | 58 | | medium | 1.2 GB | 59 | | medium.en | 1.2 GB | 60 | `); 61 | 62 | const modelName = await askModel(); 63 | 64 | if (!!shell.which("wget")) { // prefer wget when available, else fall back to curl 65 | MODEL_FILES.forEach(fileName => { 66 | shell.exec(`wget --quiet --show-progress -P ./${NODE_MODULES_MODELS_PATH}/${MODELS_LIST[modelName]}/${path.dirname(fileName)} ${src}/${MODELS_LIST[modelName]}/${pfx}/${fileName}`); // -P writes each file into the per-model directory 67 | }); 68 | } 69 | else if (!!shell.which("curl")) { 70 | MODEL_FILES.forEach(fileName => { 71 | shell.exec(`curl -L ${src}/${MODELS_LIST[modelName]}/${pfx}/${fileName} -o ${NODE_MODULES_MODELS_PATH}/${MODELS_LIST[modelName]}/${fileName} --create-dirs`); // -L follows redirects; --create-dirs builds the target path 72 | }); 73 | } 74 | else { 75 | console.log("[whisper-onnx-speech-to-text] Either wget or curl is required to download models."); 76 | } 77 | 78 | process.exit(0); // explicit exit after the download pass (also reached when neither tool exists) 79 | } catch (error) { 80 | console.log("ERROR Caught in download model"); 81 | console.log(error); 82 | return error; 83 | } 84 | } 85 | 86 | downloadModel(); // run on import: this file is the package's 'download' bin script 87 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | import wavefile from 'wavefile'; 2 | import fs from 'fs'; 3 | import path from 'path'; 4 | import util from 'util'; 5 | import { Pipeline, pipeline, env 
} from '@xenova/transformers'; 6 | import { DEFAULT_MODEL, MODELS_LIST, NODE_MODULES_MODELS_PATH } from './constants.js'; 7 | 8 | env.local_files_only = true; // load models from disk only; never fetch from the Hub at runtime 9 | env.localModelPath = NODE_MODULES_MODELS_PATH; // model names resolve inside this package's models directory 10 | env.backends.onnx.wasm.numThreads = 1; // single-threaded ONNX WASM backend 11 | 12 | const readFile = util.promisify(fs.readFile); 13 | 14 | const modelPromise = (modelName: string): Promise<Pipeline> => { // resolves the ASR pipeline for modelName; rejects if unknown or not downloaded 15 | return new Promise<Pipeline>(async (resolve, reject) => { 16 | try { 17 | 18 | if (!MODELS_LIST[modelName]) 19 | throw `[whisper-onnx-speech-to-text] modelName "${modelName}" not found in list of models.\n`; 20 | 21 | if (!fs.existsSync(`${NODE_MODULES_MODELS_PATH}/${MODELS_LIST[modelName]}`)) 22 | throw `[whisper-onnx-speech-to-text] '${modelName}' not downloaded! Run 'npx whisper-onnx-speech-to-text download'\n`; 23 | 24 | resolve(await pipeline("automatic-speech-recognition", MODELS_LIST[modelName], { quantized: false })); // full-precision (non-quantized) ONNX weights 25 | 26 | } catch (err) { 27 | reject(err); 28 | } 29 | }); 30 | }; 31 | 32 | const prepareAudio = async (filePath: string): Promise<Float64Array> => { // reads a wav file and returns mono samples resampled to 16 kHz 33 | const wav = new wavefile.WaveFile(await readFile(path.normalize(filePath))); 34 | wav.toBitDepth('32f'); // convert samples to 32-bit float 35 | wav.toSampleRate(16000); // resample to 16 kHz, the rate the pipeline is fed 36 | let audioData = wav.getSamples(); 37 | 38 | if (Array.isArray(audioData)) { // multi-channel file: downmix to mono 39 | if (audioData.length > 1) { 40 | const SCALING_FACTOR = Math.sqrt(2); // compensates loudness lost when averaging two channels 41 | 42 | for (let i = 0; i < audioData[0].length; ++i) { 43 | audioData[0][i] = SCALING_FACTOR * (audioData[0][i] + audioData[1][i]) / 2; 44 | } 45 | } 46 | 47 | audioData = audioData[0]; 48 | } 49 | 50 | return audioData; 51 | }; 52 | 53 | class Whisper { 54 | private model: Pipeline; 55 | 56 | constructor(model: Pipeline) { 57 | this.model = model; 58 | } 59 | 60 | public async transcribe(filePath: string, language?: string) { // transcribes a wav file; 'language' is a full English name like 'spanish' 61 | try { 62 | const audioData = await prepareAudio(filePath); 63 | 64 | const lang = language ? 
{ language } : {}; 65 | 66 | return this.model(audioData, { 67 | chunk_length_s: 30, 68 | stride_length_s: 5, 69 | return_timestamps: true, 70 | ...lang 71 | }); 72 | } catch (error) { 73 | console.log("[whisper-onnx-speech-to-text] Problem:", error); 74 | } 75 | } 76 | 77 | public async disposeModel() { 78 | return this.model.dispose(); 79 | } 80 | } 81 | 82 | export const initWhisper = async (modelName?: string): Promise => { 83 | if (!modelName) 84 | console.log("[whisper-onnx-speech-to-text] No 'modelName' provided. Trying default model:", DEFAULT_MODEL, "\n"); 85 | 86 | const model = await modelPromise(modelName || DEFAULT_MODEL); 87 | return new Whisper(model); 88 | }; 89 | --------------------------------------------------------------------------------