├── .github └── workflows │ └── npm-publish-github-packages.yml ├── .gitignore ├── LICENSE ├── README.md ├── package-lock.json ├── package.json ├── src ├── index.ts ├── services │ ├── image-classification │ │ └── image-classification.ts │ ├── ocr │ │ └── ocr.ts │ ├── rag │ │ └── rag.ts │ ├── speech-recognition │ │ └── recognition.ts │ ├── summarization │ │ └── summarization.ts │ └── text-to-speech │ │ └── tts.ts └── utils.ts ├── test.html ├── tests └── main.test.ts ├── tsconfig.json ├── vite.config.ts └── vitest.workspace.ts /.github/workflows/npm-publish-github-packages.yml: -------------------------------------------------------------------------------- 1 | # This workflow will run tests using node and then publish a package to GitHub Packages when a release is created 2 | # For more information see: https://docs.github.com/en/actions/publishing-packages/publishing-nodejs-packages 3 | 4 | name: Node.js Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | - uses: actions/setup-node@v3 16 | with: 17 | node-version: 16 18 | - run: npm ci 19 | - run: npm run build 20 | 21 | publish-gpr: 22 | needs: build 23 | runs-on: ubuntu-latest 24 | permissions: 25 | contents: read 26 | packages: write 27 | steps: 28 | - uses: actions/checkout@v4 29 | - uses: actions/setup-node@v3 30 | with: 31 | node-version: 16 32 | registry-url: https://npm.pkg.github.com/ 33 | - run: npm ci 34 | - run: npm run build 35 | - run: npm publish 36 | env: 37 | NODE_AUTH_TOKEN: ${{secrets.GITHUB_TOKEN}} 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Justin Willis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Web AI Toolkit 3 | 4 | The Web AI Toolkit simplifies the integration of AI features, such as OCR, speech-to-text, text summarization and more into your application. 
It ensures data privacy and offline capability by running all AI workloads locally, leveraging WebNN when available, with a fallback to WebGPU. 5 | 6 | ## Installation 7 | 8 | To install the Web AI Toolkit, run: 9 | 10 | ```sh 11 | npm install web-ai-toolkit 12 | ``` 13 | 14 | ## Available Functions 15 | 16 | *Note: Supported hardware is listed in priority of device selection. For example, for transcribing an audio file, 17 | the code will attempt to choose the GPU first and then the CPU otherwise.* 18 | 19 | | Function Name | Parameter | Type | Default Value | Supported Hardware | 20 | |-----------------------|----------------|------------------------|---------------|--------------------| 21 | | transcribeAudioFile | audioFile | Blob | - | GPU / CPU | 22 | | | model | string | "Xenova/whisper-tiny"| | 23 | | | timestamps | boolean | false | | 24 | | | language | string | "en-US" | | 25 | | textToSpeech | text | string | - | GPU / CPU | 26 | | | model | string | "Xenova/mms-tts-eng"| | 27 | | summarize | text | string | - | GPU / CPU | 28 | | | model | string | "Xenova/distilbart-cnn-6-6"| | 29 | | ocr | image | Blob | - | GPU / CPU | 30 | | | model | string | "Xenova/trocr-small-printed"| | 31 | | classifyImage | image | Blob | - | NPU / GPU / CPU | 32 | | | model | string | "Xenova/resnet-50"| | 33 | | doRAGSearch | texts | Array | [] | GPU 34 | | | query | string | "" | | 35 | 36 | ## Usage 37 | 38 | Here are examples of how to use each function: 39 | 40 | ### RAG (Retrieval-Augmented Generation) 41 | 42 | ```javascript 43 | import { doRAGSearch } from 'web-ai-toolkit'; 44 | 45 | window.showOpenFilePicker().then(async (file) => { 46 | const fileBlob = await file[0].getFile(); 47 | const text = await fileBlob.text(); 48 | 49 | // text can be derived from anything 50 | // this sample is just meant to be extremely simple 51 | // for example, your text could be an array of text that you have OCR'ed 52 | // from some photos 53 | 54 | const query = "My Search Query"; 55 | const ragQuery = await doRAGSearch([text], query); 56 | console.log(ragQuery); 57 | }); 58 | ``` 59 | 60 | ### Transcribe Audio File 61 | 62 | ```javascript 63 | import { transcribeAudioFile } from 'web-ai-toolkit'; 64 | 65 | const audioFile = ...; // Your audio file Blob 66 | const transcription = await transcribeAudioFile(audioFile, "Xenova/whisper-tiny", true, "en-US"); 67 | console.log(transcription); 68 | ``` 69 | 70 | ### Text to Speech 71 | 72 | ```javascript 73 | import { textToSpeech } from 'web-ai-toolkit'; 74 | 75 | const text = "Hello, world!"; 76 | const audio = await textToSpeech(text); 77 | console.log(audio); 78 | ``` 79 | 80 | ### Summarize Text 81 | 82 | ```javascript 83 | import { summarize } from 'web-ai-toolkit'; 84 | 85 | const text = "Long text to be summarized..."; 86 | const summary = await summarize(text); 87 | console.log(summary); 88 | ``` 89 | 90 | ### Optical Character Recognition (OCR) 91 | 92 | ```javascript 93 | import { ocr } from 'web-ai-toolkit'; 94 | 95 | const image = ...; // Your image Blob 96 | const text = await ocr(image); 97 | console.log(text); 98 | ``` 99 | 100 | ### Image Classification 101 | 102 | ```javascript 103 | import { classifyImage } from 'web-ai-toolkit'; 104 | 105 | const image = ...; // Your image Blob 106 | const text = await classifyImage(image); 107 | console.log(text); 108 | ``` 109 | 110 | ## Technical Details 111 | 112 | The Web AI Toolkit utilizes the [transformers.js project](https://huggingface.co/docs/transformers.js/index) to run AI workloads. 
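In practice, each exported function lazily creates a transformers.js `pipeline()` for its task and points it at the best backend the browser exposes. Below is a minimal sketch of that selection; the `pickDevice` helper is hypothetical and shown only for illustration — the real checks live in the individual service modules and in `src/utils.ts`:

```javascript
import { pipeline } from '@huggingface/transformers';

// Hypothetical helper, for illustration only:
// prefer WebNN, then WebGPU (if an adapter can be acquired), then WebAssembly on the CPU.
async function pickDevice() {
  if ('ml' in navigator) return 'webnn';
  if (navigator.gpu && await navigator.gpu.requestAdapter()) return 'webgpu';
  return 'wasm';
}

const summarizer = await pipeline('summarization', 'Xenova/distilbart-cnn-6-6', {
  device: await pickDevice(),
});
```
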
All AI processing is performed locally on the device, ensuring data privacy and reducing latency. AI workloads are run using the [WebNN API](https://learn.microsoft.com/en-us/windows/ai/directml/webnn-overview) when available, otherwise falling back to the WebGPU API, or even to the CPU with WebAssembly. Choosing the correct hardware to target is handled by the library. 113 | 114 | ## Contribution 115 | 116 | We welcome contributions to the Web AI Toolkit. Please fork the repository and submit a pull request with your changes. For major changes, please open an issue first to discuss what you would like to change. 117 | 118 | ## License 119 | 120 | The Web AI Toolkit is licensed under the MIT License. See the [LICENSE](LICENSE) file for more details. 121 | 122 | ## Contact 123 | 124 | For questions or support, please open an issue here on GitHub 125 | 126 | --- 127 | 128 | Thank you for using the Web AI Toolkit! We hope it makes integrating AI into your applications easier and more efficient. 129 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "web-ai-toolkit", 3 | "version": "0.3.3", 4 | "repository": "https://github.com/jgw96/web-ai-toolkit", 5 | "keywords": [ 6 | "ai", 7 | "AI", 8 | "Web AI", 9 | "ONNX Runtime", 10 | "ai toolkit", 11 | "WebNN", 12 | "webnn" 13 | ], 14 | "homepage": "https://jgw96.github.io/web-ai-toolkit/", 15 | "bugs": { 16 | "url": "https://github.com/jgw96/web-ai-toolkit/issues" 17 | }, 18 | "publishConfig": { 19 | "registry": "https://registry.npmjs.org/" 20 | }, 21 | "description": "AI powered features on the web made easy", 22 | "main": "dist/index.js", 23 | "type": "module", 24 | "scripts": { 25 | "test": "vitest", 26 | "build": "vite build", 27 | "start": "npm run build && npx httpster test.html", 28 | "test:browser": "vitest --workspace=vitest.workspace.ts" 29 | }, 30 | "author": "", 31 | "license": "ISC", 32 | "devDependencies": { 33 | "@vitest/browser": "^2.1.2", 34 | "@webgpu/types": "^0.1.52", 35 | "playwright": "^1.47.2", 36 | "typescript": "^5.7.3", 37 | "vite": "^5.4.11", 38 | "vite-plugin-dts": "^4.2.2", 39 | "vitest": "^2.1.2" 40 | }, 41 | "dependencies": { 42 | "@huggingface/transformers": "^3.2.4", 43 | "@langchain/core": "^0.3.29", 44 | "@mlc-ai/web-llm": "^0.2.77", 45 | "@xenova/transformers": "^2.17.2", 46 | "langchain": "^0.3.11" 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | export async function transcribeAudioFile(audioFile: Blob, model: string = "Xenova/whisper-tiny", timestamps: boolean = false, language: string = "en-US") { 2 | try { 3 | const { loadTranscriber, doLocalWhisper } = await import("./services/speech-recognition/recognition"); 4 | await loadTranscriber(model, timestamps, language); 5 | return doLocalWhisper(audioFile, model); 6 | } 7 | catch (err) { 8 | console.error(err); 9 | return err; 10 | } 11 | } 12 | 13 | export async function textToSpeech(text: string, model: string = "Xenova/mms-tts-eng") { 14 | try { 15 | const { runSynthesizer } = await import("./services/text-to-speech/tts"); 16 | return runSynthesizer(text, model); 17 | } 18 | catch (err) { 19 | console.error(err); 20 | return err; 21 | } 22 | } 23 | 24 | export async function summarize(text: string, model: string = "Xenova/distilbart-cnn-6-6") { 25 | try { 26 | const { 
runSummarizer } = await import("./services/summarization/summarization"); 27 | return runSummarizer(text, model); 28 | } 29 | catch (err) { 30 | console.error(err); 31 | return err; 32 | } 33 | } 34 | 35 | export async function ocr(image: Blob, model: string = "Xenova/trocr-small-printed") { 36 | try { 37 | const { runOCR } = await import("./services/ocr/ocr"); 38 | return runOCR(image, model); 39 | } 40 | catch (err) { 41 | console.error(err); 42 | return err; 43 | } 44 | } 45 | 46 | export async function classifyImage(image: Blob, model: string = "Xenova/resnet-50") { 47 | try { 48 | const { runClassifier } = await import("./services/image-classification/image-classification"); 49 | return runClassifier(image, model); 50 | } 51 | catch (err) { 52 | console.error(err); 53 | return err; 54 | } 55 | } 56 | 57 | export async function doRAGSearch(texts: string[], query: string) { 58 | try { 59 | const { simpleRAG } = await import("./services/rag/rag"); 60 | return simpleRAG(texts, query); 61 | } 62 | catch (err) { 63 | console.error(err); 64 | return err; 65 | } 66 | } -------------------------------------------------------------------------------- /src/services/image-classification/image-classification.ts: -------------------------------------------------------------------------------- 1 | import { pipeline, env } from '@huggingface/transformers'; 2 | import { webGPUCheck } from '../../utils'; 3 | 4 | let classifier: any = undefined; 5 | 6 | export async function runClassifier(image: Blob | string, model: string = "Xenova/resnet-50") { 7 | return new Promise(async (resolve, reject) => { 8 | try { 9 | if (!classifier) { 10 | await loadClassifier(model); 11 | }; 12 | 13 | if (typeof image !== "string") { 14 | image = URL.createObjectURL(image); 15 | } 16 | 17 | const out = await classifier(image); 18 | resolve(out); 19 | } 20 | catch (err) { 21 | reject(err); 22 | } 23 | }); 24 | } 25 | 26 | async function loadClassifier(model: string): Promise { 27 | return new Promise(async (resolve) => { 28 | if (!classifier) { 29 | env.allowLocalModels = false; 30 | env.useBrowserCache = false; 31 | 32 | classifier = await pipeline("image-classification", model || "Xenova/resnet-50", { 33 | device: (navigator as any).ml ? "webnn-npu" : await webGPUCheck() ? "webgpu" : "wasm" 34 | }); 35 | 36 | resolve(); 37 | } 38 | else { 39 | resolve(); 40 | } 41 | }); 42 | } -------------------------------------------------------------------------------- /src/services/ocr/ocr.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-async-promise-executor */ 2 | import { pipeline, env } from '@huggingface/transformers'; 3 | import { webGPUCheck } from '../../utils'; 4 | 5 | let ocr: any = undefined; 6 | 7 | export async function runOCR(image: Blob | string, model: string = "Xenova/trocr-small-printed") { 8 | return new Promise(async (resolve, reject) => { 9 | try { 10 | if (!ocr) { 11 | await loadOCR(model); 12 | } 13 | 14 | if (typeof image !== "string") { 15 | image = URL.createObjectURL(image); 16 | } 17 | 18 | const out = await ocr(image); 19 | resolve(out); 20 | } 21 | catch(err) { 22 | reject(err); 23 | } 24 | }); 25 | } 26 | 27 | async function loadOCR(model: string): Promise { 28 | return new Promise(async (resolve) => { 29 | if (!ocr) { 30 | env.allowLocalModels = false; 31 | env.useBrowserCache = false; 32 | ocr = await pipeline('image-to-text', model || 'Xenova/trocr-small-printed', { 33 | device: (navigator as any).ml ? "webnn" : await webGPUCheck() ? 
"webgpu" : "wasm" 34 | }); 35 | 36 | resolve(); 37 | } 38 | else { 39 | resolve(); 40 | } 41 | }); 42 | } -------------------------------------------------------------------------------- /src/services/rag/rag.ts: -------------------------------------------------------------------------------- 1 | import * as webllm from "@mlc-ai/web-llm"; 2 | import type { EmbeddingsInterface } from "@langchain/core/embeddings"; 3 | import { MemoryVectorStore } from "langchain/vectorstores/memory"; 4 | import { formatDocumentsAsString } from "langchain/util/document"; 5 | import { PromptTemplate } from "@langchain/core/prompts"; 6 | import { 7 | RunnableSequence, 8 | RunnablePassthrough, 9 | } from "@langchain/core/runnables"; 10 | 11 | class WebLLMEmbeddings implements EmbeddingsInterface { 12 | engine: webllm.MLCEngineInterface; 13 | modelId: string; 14 | constructor(engine: webllm.MLCEngineInterface, modelId: string) { 15 | this.engine = engine; 16 | this.modelId = modelId; 17 | } 18 | 19 | async _embed(texts: string[]): Promise { 20 | const reply = await this.engine.embeddings.create({ 21 | input: texts, 22 | model: this.modelId, 23 | }); 24 | const result: number[][] = []; 25 | for (let i = 0; i < texts.length; i++) { 26 | result.push(reply.data[i].embedding); 27 | } 28 | return result; 29 | } 30 | 31 | async embedQuery(document: string): Promise { 32 | return this._embed([document]).then((embeddings) => embeddings[0]); 33 | } 34 | 35 | async embedDocuments(documents: string[]): Promise { 36 | return this._embed(documents); 37 | } 38 | } 39 | 40 | const initProgressCallback = (report: webllm.InitProgressReport) => { 41 | console.log('Progress:', report); 42 | 43 | window.dispatchEvent(new CustomEvent('model-loading', { 44 | detail: report, 45 | })); 46 | }; 47 | 48 | let vectorStore: MemoryVectorStore; 49 | let engine: webllm.MLCEngineInterface; 50 | let llmModelId: string; 51 | 52 | export async function loadUpDocuments(texts: string[]): Promise { 53 | const embeddingModelId = "snowflake-arctic-embed-s-q0f32-MLC-b4"; 54 | llmModelId = "Llama-3.2-1B-Instruct-q4f16_1-MLC"; 55 | 56 | if (!engine) { 57 | engine = await webllm.CreateMLCEngine( 58 | [embeddingModelId, llmModelId], 59 | { 60 | initProgressCallback: initProgressCallback, 61 | logLevel: "INFO", // specify the log level 62 | }, 63 | ); 64 | } 65 | 66 | vectorStore = await MemoryVectorStore.fromTexts( 67 | [...texts], 68 | [{ id: 1 }], 69 | new WebLLMEmbeddings(engine, embeddingModelId), 70 | ); 71 | return vectorStore; 72 | } 73 | 74 | export async function simpleRAG(texts: string[], query: string): Promise { 75 | if (!navigator.gpu) { 76 | Promise.reject("WebGPU not supported"); 77 | } 78 | 79 | const vectorStore = await loadUpDocuments(texts); 80 | const retriever = vectorStore.asRetriever(); 81 | 82 | const prompt = 83 | PromptTemplate.fromTemplate(`Answer the question based only on the following context: 84 | {context} 85 | 86 | Question: {question}`); 87 | 88 | const chain = RunnableSequence.from([ 89 | { 90 | context: retriever.pipe(formatDocumentsAsString), 91 | question: new RunnablePassthrough(), 92 | }, 93 | prompt, 94 | ]); 95 | 96 | const formattedPrompt = ( 97 | await chain.invoke(query) 98 | ).toString(); 99 | const reply = await engine.chat.completions.create({ 100 | messages: [{ role: "user", content: formattedPrompt }], 101 | model: llmModelId, 102 | }); 103 | 104 | return reply || ""; 105 | } 106 | -------------------------------------------------------------------------------- /src/services/speech-recognition/recognition.ts: 
-------------------------------------------------------------------------------- 1 | /* eslint-disable no-async-promise-executor */ 2 | import { AutomaticSpeechRecognitionPipeline, pipeline, env } from '@huggingface/transformers'; 3 | import { webGPUCheck } from '../../utils'; 4 | 5 | let transcriber: AutomaticSpeechRecognitionPipeline | undefined = undefined; 6 | 7 | export function doLocalWhisper(audioFile: Blob, model: string = "Xenova/whisper-tiny") { 8 | return new Promise(async (resolve, reject) => { 9 | try { 10 | if (!transcriber) { 11 | await loadTranscriber(model || 'Xenova/whisper-tiny', false, 'en'); 12 | } 13 | 14 | const fileReader = new FileReader(); 15 | fileReader.onloadend = async () => { 16 | const audioCTX = new AudioContext({ 17 | sampleRate: 16000, 18 | }); 19 | const arrayBuffer = fileReader.result as ArrayBuffer; 20 | const audioData = await audioCTX.decodeAudioData(arrayBuffer); 21 | 22 | let audio; 23 | if (audioData.numberOfChannels === 2) { 24 | const SCALING_FACTOR = Math.sqrt(2); 25 | 26 | const left = audioData.getChannelData(0); 27 | const right = audioData.getChannelData(1); 28 | 29 | audio = new Float32Array(left.length); 30 | for (let i = 0; i < audioData.length; ++i) { 31 | audio[i] = SCALING_FACTOR * (left[i] + right[i]) / 2; 32 | } 33 | } else { 34 | // If the audio is not stereo, we can just use the first channel: 35 | audio = audioData.getChannelData(0); 36 | } 37 | 38 | const output = await localTranscribe(audio); 39 | resolve(output); 40 | 41 | 42 | 43 | }; 44 | fileReader.readAsArrayBuffer(audioFile); 45 | } 46 | catch (err) { 47 | reject(err); 48 | } 49 | }) 50 | } 51 | 52 | export async function loadTranscriber(model: string = "Xenova/whisper-tiny", timestamps: boolean, language: string): Promise { 53 | return new Promise(async (resolve) => { 54 | if (!transcriber) { 55 | env.allowLocalModels = false; 56 | env.useBrowserCache = false; 57 | transcriber = await pipeline('automatic-speech-recognition', model || 'Xenova/whisper-tiny', { 58 | // @ts-ignore 59 | return_timestamps: timestamps, 60 | language, 61 | // @ts-ignore 62 | device: await webGPUCheck() ? "webgpu" : "wasm" 63 | }); 64 | 65 | 66 | resolve(); 67 | } 68 | else { 69 | resolve(); 70 | } 71 | }) 72 | } 73 | 74 | export async function localTranscribe(audio: Float32Array): Promise { 75 | return new Promise(async (resolve, reject) => { 76 | if (transcriber) { 77 | // @ts-ignore 78 | const output = await transcriber(audio, { 79 | chunk_length_s: 30, 80 | stride_length_s: 5, 81 | // @ts-ignore 82 | callback_function: callback_function, // after each generation step 83 | chunk_callback: chunk_callback, // after each chunk is processed 84 | }); 85 | 86 | // @ts-ignore 87 | resolve(output.text); 88 | } 89 | else { 90 | reject(); 91 | } 92 | }) 93 | } 94 | 95 | // Storage for chunks to be processed. Initialise with an empty chunk. 
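// How the pieces below fit together:
// - `chunk_callback` fires after each audio chunk has been processed and marks that chunk as finalised.
// - `callback_function` fires after each generation step, re-decodes all chunks processed so far via
//   `tokenizer._decode_asr`, and broadcasts the interim transcription as a 'transcribe-interim' message.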
96 | const chunks_to_process = [ 97 | { 98 | tokens: [], 99 | finalised: false, 100 | }, 101 | ]; 102 | 103 | // TODO: Storage for fully-processed and merged chunks 104 | // let decoded_chunks = []; 105 | 106 | function chunk_callback(chunk: any) { 107 | const last = chunks_to_process[chunks_to_process.length - 1]; 108 | 109 | // Overwrite last chunk with new info 110 | Object.assign(last, chunk); 111 | last.finalised = true; 112 | 113 | // Create an empty chunk after, if it not the last chunk 114 | if (!chunk.is_last) { 115 | chunks_to_process.push({ 116 | tokens: [], 117 | finalised: false, 118 | }); 119 | } 120 | } 121 | 122 | // Inject custom callback function to handle merging of chunks 123 | function callback_function(item: any) { 124 | // @ts-ignore 125 | const time_precision = transcriber!.processor.feature_extractor.config.chunk_length / transcriber!.model.config.max_source_positions; 126 | 127 | const last: any = chunks_to_process[chunks_to_process.length - 1]; 128 | 129 | // Update tokens of last chunk 130 | last.tokens = [...item[0].output_token_ids]; 131 | 132 | // Merge text chunks 133 | // TODO optimise so we don't have to decode all chunks every time 134 | // @ts-ignore 135 | const data = transcriber!.tokenizer._decode_asr(chunks_to_process, { 136 | time_precision: time_precision, 137 | return_timestamps: true, 138 | force_full_sequences: false, 139 | }); 140 | 141 | 142 | self.postMessage({ 143 | type: 'transcribe-interim', 144 | transcription: data[0] 145 | }); 146 | } -------------------------------------------------------------------------------- /src/services/summarization/summarization.ts: -------------------------------------------------------------------------------- 1 | import { pipeline, env } from '@huggingface/transformers'; 2 | import { webGPUCheck } from '../../utils'; 3 | 4 | let summarizer: any = undefined; 5 | 6 | export async function runSummarizer(text: string, model: string = "Xenova/distilbart-cnn-6-6") { 7 | return new Promise(async (resolve, reject) => { 8 | try { 9 | if (!summarizer) { 10 | await loadSummarizer(model); 11 | }; 12 | 13 | const out = await summarizer(text); 14 | resolve(out); 15 | } 16 | catch (err) { 17 | reject(err); 18 | } 19 | }); 20 | } 21 | 22 | async function loadSummarizer(model: string): Promise { 23 | return new Promise(async (resolve) => { 24 | if (!summarizer) { 25 | env.allowLocalModels = false; 26 | env.useBrowserCache = false; 27 | 28 | summarizer = await pipeline('summarization', model || 'Xenova/distilbart-cnn-6-6', { 29 | dtype: "fp32", 30 | device: (navigator as any).ml ? "webnn" : await webGPUCheck() ? 
"webgpu" : "wasm" 31 | }); 32 | 33 | resolve(); 34 | } 35 | else { 36 | resolve(); 37 | } 38 | }); 39 | } -------------------------------------------------------------------------------- /src/services/text-to-speech/tts.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-async-promise-executor */ 2 | import { pipeline, env } from '@huggingface/transformers'; 3 | 4 | let synthesizer: any = undefined; 5 | 6 | export async function runSynthesizer(text: string, model: string = "Xenova/mms-tts-eng") { 7 | return new Promise(async (resolve, reject) => { 8 | try { 9 | if (!synthesizer) { 10 | await loadSynthesizer(model); 11 | }; 12 | const out = await synthesizer(text); 13 | resolve(out); 14 | } 15 | catch (err) { 16 | reject(err); 17 | } 18 | }); 19 | } 20 | 21 | async function loadSynthesizer(model: string): Promise { 22 | return new Promise(async (resolve) => { 23 | if (!synthesizer) { 24 | env.allowLocalModels = false; 25 | env.useBrowserCache = false; 26 | synthesizer = await pipeline('text-to-speech', model || 'Xenova/mms-tts-eng'); 27 | resolve(); 28 | } 29 | else { 30 | resolve(); 31 | } 32 | }); 33 | } -------------------------------------------------------------------------------- /src/utils.ts: -------------------------------------------------------------------------------- 1 | export async function webGPUCheck(): Promise { 2 | // check to see if navigator.gpu exists and if we can create a device 3 | if ((navigator as any).gpu) { 4 | try { 5 | const gpu = await (navigator as any).gpu.requestAdapter(); 6 | return gpu ? true : false; 7 | } 8 | catch (err) { 9 | console.error(err); 10 | return false; 11 | } 12 | } 13 | else { 14 | return false; 15 | } 16 | } -------------------------------------------------------------------------------- /test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
33 | 34 | 35 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /tests/main.test.ts: -------------------------------------------------------------------------------- 1 | import { expect, test } from 'vitest'; 2 | 3 | test('text-to-speech', async () => { 4 | return new Promise(async (resolve) => { 5 | const { textToSpeech } = await import("../src/index"); 6 | 7 | const audio = await textToSpeech("Hello, World!"); 8 | expect(audio).toBeDefined(); 9 | 10 | resolve(true); 11 | }); 12 | }); 13 | 14 | test('speech-to-text', async () => { 15 | return new Promise(async (resolve) => { 16 | const { transcribeAudioFile } = await import("../src/index"); 17 | 18 | const response = await fetch("https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/ted_60_16k.wav"); 19 | const blob = await response.blob(); 20 | 21 | const text = await transcribeAudioFile(blob); 22 | expect(text).toBeDefined(); 23 | 24 | resolve(true); 25 | }); 26 | }); 27 | 28 | test('ocr', async () => { 29 | return new Promise(async (resolve) => { 30 | const { ocr } = await import("../src/index"); 31 | 32 | const response = await fetch("https://picsum.photos/200/300"); 33 | const blob = await response.blob(); 34 | 35 | const text = await ocr(blob); 36 | 37 | expect(text).toBeDefined(); 38 | resolve(true); 39 | }); 40 | }); 41 | 42 | test('image-classification', async () => { 43 | return new Promise(async (resolve) => { 44 | const { classifyImage } = await import("../src/index"); 45 | 46 | const response = await fetch("https://picsum.photos/200/300"); 47 | const blob = await response.blob(); 48 | 49 | const text = await classifyImage(blob); 50 | 51 | expect(text).toBeDefined(); 52 | resolve(true); 53 | }); 54 | }); 55 | 56 | test('summarize', async () => { 57 | return new Promise(async (resolve) => { 58 | const { summarize } = await import("../src/index"); 59 | 60 | const text = "the red fox is a small fox that lives in the forest. it has a red coat and a bushy tail. the red fox is a carnivore, which means it eats meat. 
it hunts small animals like rabbits"; 61 | const summary = await summarize(text); 62 | 63 | expect(summary).toBeDefined(); 64 | resolve(true); 65 | }); 66 | }); -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "esModuleInterop": true, 4 | "forceConsistentCasingInFileNames": true, 5 | "outDir": "dist", 6 | "module": "esnext", 7 | "target": "esnext", 8 | "lib": [ 9 | "es2017", 10 | "esnext", 11 | "dom", 12 | "dom.iterable" 13 | ], 14 | "skipLibCheck": true, 15 | "declaration": true, 16 | "strict": true, 17 | "noUnusedLocals": true, 18 | "noUnusedParameters": true, 19 | "noImplicitReturns": true, 20 | "removeComments": false, 21 | "noFallthroughCasesInSwitch": true, 22 | "moduleResolution": "node", 23 | "types": ["@webgpu/types"], 24 | }, 25 | "include": [ 26 | "src/**/*.ts" 27 | ], 28 | "exclude": [] 29 | } -------------------------------------------------------------------------------- /vite.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from 'vite'; 2 | import dts from 'vite-plugin-dts'; 3 | // import basicSsl from '@vitejs/plugin-basic-ssl'; 4 | 5 | // https://vitejs.dev/config/ 6 | export default defineConfig({ 7 | worker: { 8 | format: "es" 9 | }, 10 | optimizeDeps: { 11 | esbuildOptions: { 12 | target: "es2022", 13 | } 14 | }, 15 | build: { 16 | sourcemap: false, 17 | assetsDir: "", 18 | cssMinify: true, 19 | target: "esnext", 20 | lib: { 21 | entry: "src/index.ts", 22 | formats: ["es"], 23 | fileName: "index", 24 | }, 25 | rollupOptions: { 26 | output: { 27 | format: "es", 28 | }, 29 | }, 30 | }, 31 | plugins: [ 32 | dts({ 33 | rollupTypes: true 34 | }) 35 | ] 36 | }) -------------------------------------------------------------------------------- /vitest.workspace.ts: -------------------------------------------------------------------------------- 1 | import { defineWorkspace } from 'vitest/config' 2 | 3 | export default defineWorkspace([ 4 | // If you want to keep running your existing tests in Node.js, uncomment the next line. 5 | // 'vite.config.ts', 6 | { 7 | extends: 'vite.config.ts', 8 | test: { 9 | testTimeout: 2000000, 10 | browser: { 11 | enabled: true, 12 | name: 'edge', 13 | provider: 'preview', 14 | }, 15 | }, 16 | }, 17 | ]) 18 | --------------------------------------------------------------------------------