├── .github └── workflows │ └── npm-publish-github-packages.yml ├── .gitignore ├── LICENSE ├── README.md ├── package-lock.json ├── package.json ├── src ├── index.ts ├── services │ ├── image-classification │ │ └── image-classification.ts │ ├── ocr │ │ └── ocr.ts │ ├── rag │ │ └── rag.ts │ ├── speech-recognition │ │ └── recognition.ts │ ├── summarization │ │ └── summarization.ts │ └── text-to-speech │ │ └── tts.ts └── utils.ts ├── test.html ├── tests └── main.test.ts ├── tsconfig.json ├── vite.config.ts └── vitest.workspace.ts /.github/workflows/npm-publish-github-packages.yml: -------------------------------------------------------------------------------- 1 | # This workflow will run tests using node and then publish a package to GitHub Packages when a release is created 2 | # For more information see: https://docs.github.com/en/actions/publishing-packages/publishing-nodejs-packages 3 | 4 | name: Node.js Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | - uses: actions/setup-node@v3 16 | with: 17 | node-version: 16 18 | - run: npm ci 19 | - run: npm run build 20 | 21 | publish-gpr: 22 | needs: build 23 | runs-on: ubuntu-latest 24 | permissions: 25 | contents: read 26 | packages: write 27 | steps: 28 | - uses: actions/checkout@v4 29 | - uses: actions/setup-node@v3 30 | with: 31 | node-version: 16 32 | registry-url: https://npm.pkg.github.com/ 33 | - run: npm ci 34 | - run: npm run build 35 | - run: npm publish 36 | env: 37 | NODE_AUTH_TOKEN: ${{secrets.GITHUB_TOKEN}} 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Justin Willis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Web AI Toolkit 3 | 4 | The Web AI Toolkit simplifies the integration of AI features, such as OCR, speech-to-text, text summarization and more into your application. 
It ensures data privacy and offline capability by running all AI workloads locally, leveraging WebNN when available, with a fallback to WebGPU. 5 | 6 | ## Installation 7 | 8 | To install the Web AI Toolkit, run: 9 | 10 | ```sh 11 | npm install web-ai-toolkit 12 | ``` 13 | 14 | ## Available Functions 15 | 16 | *Note: Supported hardware is listed in priority of device selection. For example, for transcribing an audio file, 17 | the code will attempt to choose the GPU first and then the CPU otherwise.* 18 | 19 | | Function Name | Parameter | Type | Default Value | Supported Hardware | 20 | |-----------------------|----------------|------------------------|---------------|--------------------| 21 | | transcribeAudioFile | audioFile | Blob | - | GPU / CPU | 22 | | | model | string | "Xenova/whisper-tiny"| | 23 | | | timestamps | boolean | false | | 24 | | | language | string | "en-US" | | 25 | | textToSpeech | text | string | - | GPU / CPU | 26 | | | model | string | "Xenova/mms-tts-eng"| | 27 | | summarize | text | string | - | GPU / CPU | 28 | | | model | string | "Xenova/distilbart-cnn-6-6"| | 29 | | ocr | image | Blob | - | GPU / CPU | 30 | | | model | string | "Xenova/trocr-small-printed"| | 31 | | classifyImage | image | Blob | - | NPU / GPU / CPU | 32 | | | model | string | "Xenova/resnet-50"| | 33 | | doRAGSearch | texts | Array | [] | GPU 34 | | | query | string | "" | | 35 | 36 | ## Usage 37 | 38 | Here are examples of how to use each function: 39 | 40 | ### RAG (Retrieval-Augmented Generation) 41 | 42 | ```javascript 43 | import { doRAGSearch } from 'web-ai-toolkit'; 44 | 45 | window.showOpenFilePicker().then(async (file) => { 46 | const fileBlob = await file[0].getFile(); 47 | const text = await fileBlob.text(); 48 | 49 | // text can be derived from anything 50 | // this sample is just meant to be extremely simple 51 | // for example, your text could be an array of text that you have OCR'ed 52 | // from some photos 53 | 54 | const query = "My Search Query"; 55 | const ragQuery = await doRAGSearch([text], query); 56 | console.log(ragQuery); 57 | }); 58 | ``` 59 | 60 | ### Transcribe Audio File 61 | 62 | ```javascript 63 | import { transcribeAudioFile } from 'web-ai-toolkit'; 64 | 65 | const audioFile = ...; // Your audio file Blob 66 | const transcription = await transcribeAudioFile(audioFile, "Xenova/whisper-tiny", true, "en-US"); 67 | console.log(transcription); 68 | ``` 69 | 70 | ### Text to Speech 71 | 72 | ```javascript 73 | import { textToSpeech } from 'web-ai-toolkit'; 74 | 75 | const text = "Hello, world!"; 76 | const audio = await textToSpeech(text); 77 | console.log(audio); 78 | ``` 79 | 80 | ### Summarize Text 81 | 82 | ```javascript 83 | import { summarize } from 'web-ai-toolkit'; 84 | 85 | const text = "Long text to be summarized..."; 86 | const summary = await summarize(text); 87 | console.log(summary); 88 | ``` 89 | 90 | ### Optical Character Recognition (OCR) 91 | 92 | ```javascript 93 | import { ocr } from 'web-ai-toolkit'; 94 | 95 | const image = ...; // Your image Blob 96 | const text = await ocr(image); 97 | console.log(text); 98 | ``` 99 | 100 | ### Image Classification 101 | 102 | ```javascript 103 | import { classifyImage } from 'web-ai-toolkit'; 104 | 105 | const image = ...; // Your image Blob 106 | const text = await classifyImage(image); 107 | console.log(text); 108 | ``` 109 | 110 | ## Technical Details 111 | 112 | The Web AI Toolkit utilizes the [transformers.js project](https://huggingface.co/docs/transformers.js/index) to run AI workloads. 
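In practice, each exported function lazily creates a transformers.js `pipeline()` for its task and points it at the best backend the browser exposes. Below is a minimal sketch of that selection; the `pickDevice` helper is hypothetical and shown only for illustration — the real checks live in the individual service modules and in `src/utils.ts`:

```javascript
import { pipeline } from '@huggingface/transformers';

// Hypothetical helper, for illustration only:
// prefer WebNN, then WebGPU (if an adapter can be acquired), then WebAssembly on the CPU.
async function pickDevice() {
  if ('ml' in navigator) return 'webnn';
  if (navigator.gpu && await navigator.gpu.requestAdapter()) return 'webgpu';
  return 'wasm';
}

const summarizer = await pipeline('summarization', 'Xenova/distilbart-cnn-6-6', {
  device: await pickDevice(),
});
```
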
All AI processing is performed locally on the device, ensuring data privacy and reducing latency. AI workloads are run using the [WebNN API](https://learn.microsoft.com/en-us/windows/ai/directml/webnn-overview) when available, otherwise falling back to the WebGPU API, or even to the CPU with WebAssembly. Choosing the correct hardware to target is handled by the library. 113 | 114 | ## Contribution 115 | 116 | We welcome contributions to the Web AI Toolkit. Please fork the repository and submit a pull request with your changes. For major changes, please open an issue first to discuss what you would like to change. 117 | 118 | ## License 119 | 120 | The Web AI Toolkit is licensed under the MIT License. See the [LICENSE](LICENSE) file for more details. 121 | 122 | ## Contact 123 | 124 | For questions or support, please open an issue here on GitHub 125 | 126 | --- 127 | 128 | Thank you for using the Web AI Toolkit! We hope it makes integrating AI into your applications easier and more efficient. 129 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "web-ai-toolkit", 3 | "version": "0.3.3", 4 | "repository": "https://github.com/jgw96/web-ai-toolkit", 5 | "keywords": [ 6 | "ai", 7 | "AI", 8 | "Web AI", 9 | "ONNX Runtime", 10 | "ai toolkit", 11 | "WebNN", 12 | "webnn" 13 | ], 14 | "homepage": "https://jgw96.github.io/web-ai-toolkit/", 15 | "bugs": { 16 | "url": "https://github.com/jgw96/web-ai-toolkit/issues" 17 | }, 18 | "publishConfig": { 19 | "registry": "https://registry.npmjs.org/" 20 | }, 21 | "description": "AI powered features on the web made easy", 22 | "main": "dist/index.js", 23 | "type": "module", 24 | "scripts": { 25 | "test": "vitest", 26 | "build": "vite build", 27 | "start": "npm run build && npx httpster test.html", 28 | "test:browser": "vitest --workspace=vitest.workspace.ts" 29 | }, 30 | "author": "", 31 | "license": "ISC", 32 | "devDependencies": { 33 | "@vitest/browser": "^2.1.2", 34 | "@webgpu/types": "^0.1.52", 35 | "playwright": "^1.47.2", 36 | "typescript": "^5.7.3", 37 | "vite": "^5.4.11", 38 | "vite-plugin-dts": "^4.2.2", 39 | "vitest": "^2.1.2" 40 | }, 41 | "dependencies": { 42 | "@huggingface/transformers": "^3.2.4", 43 | "@langchain/core": "^0.3.29", 44 | "@mlc-ai/web-llm": "^0.2.77", 45 | "@xenova/transformers": "^2.17.2", 46 | "langchain": "^0.3.11" 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | export async function transcribeAudioFile(audioFile: Blob, model: string = "Xenova/whisper-tiny", timestamps: boolean = false, language: string = "en-US") { 2 | try { 3 | const { loadTranscriber, doLocalWhisper } = await import("./services/speech-recognition/recognition"); 4 | await loadTranscriber(model, timestamps, language); 5 | return doLocalWhisper(audioFile, model); 6 | } 7 | catch (err) { 8 | console.error(err); 9 | return err; 10 | } 11 | } 12 | 13 | export async function textToSpeech(text: string, model: string = "Xenova/mms-tts-eng") { 14 | try { 15 | const { runSynthesizer } = await import("./services/text-to-speech/tts"); 16 | return runSynthesizer(text, model); 17 | } 18 | catch (err) { 19 | console.error(err); 20 | return err; 21 | } 22 | } 23 | 24 | export async function summarize(text: string, model: string = "Xenova/distilbart-cnn-6-6") { 25 | try { 26 | const { 
runSummarizer } = await import("./services/summarization/summarization"); 27 | return runSummarizer(text, model); 28 | } 29 | catch (err) { 30 | console.error(err); 31 | return err; 32 | } 33 | } 34 | 35 | export async function ocr(image: Blob, model: string = "Xenova/trocr-small-printed") { 36 | try { 37 | const { runOCR } = await import("./services/ocr/ocr"); 38 | return runOCR(image, model); 39 | } 40 | catch (err) { 41 | console.error(err); 42 | return err; 43 | } 44 | } 45 | 46 | export async function classifyImage(image: Blob, model: string = "Xenova/resnet-50") { 47 | try { 48 | const { runClassifier } = await import("./services/image-classification/image-classification"); 49 | return runClassifier(image, model); 50 | } 51 | catch (err) { 52 | console.error(err); 53 | return err; 54 | } 55 | } 56 | 57 | export async function doRAGSearch(texts: string[], query: string) { 58 | try { 59 | const { simpleRAG } = await import("./services/rag/rag"); 60 | return simpleRAG(texts, query); 61 | } 62 | catch (err) { 63 | console.error(err); 64 | return err; 65 | } 66 | } -------------------------------------------------------------------------------- /src/services/image-classification/image-classification.ts: -------------------------------------------------------------------------------- 1 | import { pipeline, env } from '@huggingface/transformers'; 2 | import { webGPUCheck } from '../../utils'; 3 | 4 | let classifier: any = undefined; 5 | 6 | export async function runClassifier(image: Blob | string, model: string = "Xenova/resnet-50") { 7 | return new Promise(async (resolve, reject) => { 8 | try { 9 | if (!classifier) { 10 | await loadClassifier(model); 11 | }; 12 | 13 | if (typeof image !== "string") { 14 | image = URL.createObjectURL(image); 15 | } 16 | 17 | const out = await classifier(image); 18 | resolve(out); 19 | } 20 | catch (err) { 21 | reject(err); 22 | } 23 | }); 24 | } 25 | 26 | async function loadClassifier(model: string): Promise { 27 | return new Promise(async (resolve) => { 28 | if (!classifier) { 29 | env.allowLocalModels = false; 30 | env.useBrowserCache = false; 31 | 32 | classifier = await pipeline("image-classification", model || "Xenova/resnet-50", { 33 | device: (navigator as any).ml ? "webnn-npu" : await webGPUCheck() ? "webgpu" : "wasm" 34 | }); 35 | 36 | resolve(); 37 | } 38 | else { 39 | resolve(); 40 | } 41 | }); 42 | } -------------------------------------------------------------------------------- /src/services/ocr/ocr.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-async-promise-executor */ 2 | import { pipeline, env } from '@huggingface/transformers'; 3 | import { webGPUCheck } from '../../utils'; 4 | 5 | let ocr: any = undefined; 6 | 7 | export async function runOCR(image: Blob | string, model: string = "Xenova/trocr-small-printed") { 8 | return new Promise(async (resolve, reject) => { 9 | try { 10 | if (!ocr) { 11 | await loadOCR(model); 12 | } 13 | 14 | if (typeof image !== "string") { 15 | image = URL.createObjectURL(image); 16 | } 17 | 18 | const out = await ocr(image); 19 | resolve(out); 20 | } 21 | catch(err) { 22 | reject(err); 23 | } 24 | }); 25 | } 26 | 27 | async function loadOCR(model: string): Promise { 28 | return new Promise(async (resolve) => { 29 | if (!ocr) { 30 | env.allowLocalModels = false; 31 | env.useBrowserCache = false; 32 | ocr = await pipeline('image-to-text', model || 'Xenova/trocr-small-printed', { 33 | device: (navigator as any).ml ? "webnn" : await webGPUCheck() ? 
"webgpu" : "wasm" 34 | }); 35 | 36 | resolve(); 37 | } 38 | else { 39 | resolve(); 40 | } 41 | }); 42 | } -------------------------------------------------------------------------------- /src/services/rag/rag.ts: -------------------------------------------------------------------------------- 1 | import * as webllm from "@mlc-ai/web-llm"; 2 | import type { EmbeddingsInterface } from "@langchain/core/embeddings"; 3 | import { MemoryVectorStore } from "langchain/vectorstores/memory"; 4 | import { formatDocumentsAsString } from "langchain/util/document"; 5 | import { PromptTemplate } from "@langchain/core/prompts"; 6 | import { 7 | RunnableSequence, 8 | RunnablePassthrough, 9 | } from "@langchain/core/runnables"; 10 | 11 | class WebLLMEmbeddings implements EmbeddingsInterface { 12 | engine: webllm.MLCEngineInterface; 13 | modelId: string; 14 | constructor(engine: webllm.MLCEngineInterface, modelId: string) { 15 | this.engine = engine; 16 | this.modelId = modelId; 17 | } 18 | 19 | async _embed(texts: string[]): Promise { 20 | const reply = await this.engine.embeddings.create({ 21 | input: texts, 22 | model: this.modelId, 23 | }); 24 | const result: number[][] = []; 25 | for (let i = 0; i < texts.length; i++) { 26 | result.push(reply.data[i].embedding); 27 | } 28 | return result; 29 | } 30 | 31 | async embedQuery(document: string): Promise { 32 | return this._embed([document]).then((embeddings) => embeddings[0]); 33 | } 34 | 35 | async embedDocuments(documents: string[]): Promise { 36 | return this._embed(documents); 37 | } 38 | } 39 | 40 | const initProgressCallback = (report: webllm.InitProgressReport) => { 41 | console.log('Progress:', report); 42 | 43 | window.dispatchEvent(new CustomEvent('model-loading', { 44 | detail: report, 45 | })); 46 | }; 47 | 48 | let vectorStore: MemoryVectorStore; 49 | let engine: webllm.MLCEngineInterface; 50 | let llmModelId: string; 51 | 52 | export async function loadUpDocuments(texts: string[]): Promise { 53 | const embeddingModelId = "snowflake-arctic-embed-s-q0f32-MLC-b4"; 54 | llmModelId = "Llama-3.2-1B-Instruct-q4f16_1-MLC"; 55 | 56 | if (!engine) { 57 | engine = await webllm.CreateMLCEngine( 58 | [embeddingModelId, llmModelId], 59 | { 60 | initProgressCallback: initProgressCallback, 61 | logLevel: "INFO", // specify the log level 62 | }, 63 | ); 64 | } 65 | 66 | vectorStore = await MemoryVectorStore.fromTexts( 67 | [...texts], 68 | [{ id: 1 }], 69 | new WebLLMEmbeddings(engine, embeddingModelId), 70 | ); 71 | return vectorStore; 72 | } 73 | 74 | export async function simpleRAG(texts: string[], query: string): Promise { 75 | if (!navigator.gpu) { 76 | Promise.reject("WebGPU not supported"); 77 | } 78 | 79 | const vectorStore = await loadUpDocuments(texts); 80 | const retriever = vectorStore.asRetriever(); 81 | 82 | const prompt = 83 | PromptTemplate.fromTemplate(`Answer the question based only on the following context: 84 | {context} 85 | 86 | Question: {question}`); 87 | 88 | const chain = RunnableSequence.from([ 89 | { 90 | context: retriever.pipe(formatDocumentsAsString), 91 | question: new RunnablePassthrough(), 92 | }, 93 | prompt, 94 | ]); 95 | 96 | const formattedPrompt = ( 97 | await chain.invoke(query) 98 | ).toString(); 99 | const reply = await engine.chat.completions.create({ 100 | messages: [{ role: "user", content: formattedPrompt }], 101 | model: llmModelId, 102 | }); 103 | 104 | return reply || ""; 105 | } 106 | -------------------------------------------------------------------------------- /src/services/speech-recognition/recognition.ts: 
-------------------------------------------------------------------------------- 1 | /* eslint-disable no-async-promise-executor */ 2 | import { AutomaticSpeechRecognitionPipeline, pipeline, env } from '@huggingface/transformers'; 3 | import { webGPUCheck } from '../../utils'; 4 | 5 | let transcriber: AutomaticSpeechRecognitionPipeline | undefined = undefined; 6 | 7 | export function doLocalWhisper(audioFile: Blob, model: string = "Xenova/whisper-tiny") { 8 | return new Promise(async (resolve, reject) => { 9 | try { 10 | if (!transcriber) { 11 | await loadTranscriber(model || 'Xenova/whisper-tiny', false, 'en'); 12 | } 13 | 14 | const fileReader = new FileReader(); 15 | fileReader.onloadend = async () => { 16 | const audioCTX = new AudioContext({ 17 | sampleRate: 16000, 18 | }); 19 | const arrayBuffer = fileReader.result as ArrayBuffer; 20 | const audioData = await audioCTX.decodeAudioData(arrayBuffer); 21 | 22 | let audio; 23 | if (audioData.numberOfChannels === 2) { 24 | const SCALING_FACTOR = Math.sqrt(2); 25 | 26 | const left = audioData.getChannelData(0); 27 | const right = audioData.getChannelData(1); 28 | 29 | audio = new Float32Array(left.length); 30 | for (let i = 0; i < audioData.length; ++i) { 31 | audio[i] = SCALING_FACTOR * (left[i] + right[i]) / 2; 32 | } 33 | } else { 34 | // If the audio is not stereo, we can just use the first channel: 35 | audio = audioData.getChannelData(0); 36 | } 37 | 38 | const output = await localTranscribe(audio); 39 | resolve(output); 40 | 41 | 42 | 43 | }; 44 | fileReader.readAsArrayBuffer(audioFile); 45 | } 46 | catch (err) { 47 | reject(err); 48 | } 49 | }) 50 | } 51 | 52 | export async function loadTranscriber(model: string = "Xenova/whisper-tiny", timestamps: boolean, language: string): Promise { 53 | return new Promise(async (resolve) => { 54 | if (!transcriber) { 55 | env.allowLocalModels = false; 56 | env.useBrowserCache = false; 57 | transcriber = await pipeline('automatic-speech-recognition', model || 'Xenova/whisper-tiny', { 58 | // @ts-ignore 59 | return_timestamps: timestamps, 60 | language, 61 | // @ts-ignore 62 | device: await webGPUCheck() ? "webgpu" : "wasm" 63 | }); 64 | 65 | 66 | resolve(); 67 | } 68 | else { 69 | resolve(); 70 | } 71 | }) 72 | } 73 | 74 | export async function localTranscribe(audio: Float32Array): Promise { 75 | return new Promise(async (resolve, reject) => { 76 | if (transcriber) { 77 | // @ts-ignore 78 | const output = await transcriber(audio, { 79 | chunk_length_s: 30, 80 | stride_length_s: 5, 81 | // @ts-ignore 82 | callback_function: callback_function, // after each generation step 83 | chunk_callback: chunk_callback, // after each chunk is processed 84 | }); 85 | 86 | // @ts-ignore 87 | resolve(output.text); 88 | } 89 | else { 90 | reject(); 91 | } 92 | }) 93 | } 94 | 95 | // Storage for chunks to be processed. Initialise with an empty chunk. 
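// How the pieces below fit together:
// - `chunk_callback` fires after each audio chunk has been processed and marks that chunk as finalised.
// - `callback_function` fires after each generation step, re-decodes all chunks processed so far via
//   `tokenizer._decode_asr`, and broadcasts the interim transcription as a 'transcribe-interim' message.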
96 | const chunks_to_process = [ 97 | { 98 | tokens: [], 99 | finalised: false, 100 | }, 101 | ]; 102 | 103 | // TODO: Storage for fully-processed and merged chunks 104 | // let decoded_chunks = []; 105 | 106 | function chunk_callback(chunk: any) { 107 | const last = chunks_to_process[chunks_to_process.length - 1]; 108 | 109 | // Overwrite last chunk with new info 110 | Object.assign(last, chunk); 111 | last.finalised = true; 112 | 113 | // Create an empty chunk after, if it not the last chunk 114 | if (!chunk.is_last) { 115 | chunks_to_process.push({ 116 | tokens: [], 117 | finalised: false, 118 | }); 119 | } 120 | } 121 | 122 | // Inject custom callback function to handle merging of chunks 123 | function callback_function(item: any) { 124 | // @ts-ignore 125 | const time_precision = transcriber!.processor.feature_extractor.config.chunk_length / transcriber!.model.config.max_source_positions; 126 | 127 | const last: any = chunks_to_process[chunks_to_process.length - 1]; 128 | 129 | // Update tokens of last chunk 130 | last.tokens = [...item[0].output_token_ids]; 131 | 132 | // Merge text chunks 133 | // TODO optimise so we don't have to decode all chunks every time 134 | // @ts-ignore 135 | const data = transcriber!.tokenizer._decode_asr(chunks_to_process, { 136 | time_precision: time_precision, 137 | return_timestamps: true, 138 | force_full_sequences: false, 139 | }); 140 | 141 | 142 | self.postMessage({ 143 | type: 'transcribe-interim', 144 | transcription: data[0] 145 | }); 146 | } -------------------------------------------------------------------------------- /src/services/summarization/summarization.ts: -------------------------------------------------------------------------------- 1 | import { pipeline, env } from '@huggingface/transformers'; 2 | import { webGPUCheck } from '../../utils'; 3 | 4 | let summarizer: any = undefined; 5 | 6 | export async function runSummarizer(text: string, model: string = "Xenova/distilbart-cnn-6-6") { 7 | return new Promise(async (resolve, reject) => { 8 | try { 9 | if (!summarizer) { 10 | await loadSummarizer(model); 11 | }; 12 | 13 | const out = await summarizer(text); 14 | resolve(out); 15 | } 16 | catch (err) { 17 | reject(err); 18 | } 19 | }); 20 | } 21 | 22 | async function loadSummarizer(model: string): Promise { 23 | return new Promise(async (resolve) => { 24 | if (!summarizer) { 25 | env.allowLocalModels = false; 26 | env.useBrowserCache = false; 27 | 28 | summarizer = await pipeline('summarization', model || 'Xenova/distilbart-cnn-6-6', { 29 | dtype: "fp32", 30 | device: (navigator as any).ml ? "webnn" : await webGPUCheck() ? 
"webgpu" : "wasm" 31 | }); 32 | 33 | resolve(); 34 | } 35 | else { 36 | resolve(); 37 | } 38 | }); 39 | } -------------------------------------------------------------------------------- /src/services/text-to-speech/tts.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-async-promise-executor */ 2 | import { pipeline, env } from '@huggingface/transformers'; 3 | 4 | let synthesizer: any = undefined; 5 | 6 | export async function runSynthesizer(text: string, model: string = "Xenova/mms-tts-eng") { 7 | return new Promise(async (resolve, reject) => { 8 | try { 9 | if (!synthesizer) { 10 | await loadSynthesizer(model); 11 | }; 12 | const out = await synthesizer(text); 13 | resolve(out); 14 | } 15 | catch (err) { 16 | reject(err); 17 | } 18 | }); 19 | } 20 | 21 | async function loadSynthesizer(model: string): Promise { 22 | return new Promise(async (resolve) => { 23 | if (!synthesizer) { 24 | env.allowLocalModels = false; 25 | env.useBrowserCache = false; 26 | synthesizer = await pipeline('text-to-speech', model || 'Xenova/mms-tts-eng'); 27 | resolve(); 28 | } 29 | else { 30 | resolve(); 31 | } 32 | }); 33 | } -------------------------------------------------------------------------------- /src/utils.ts: -------------------------------------------------------------------------------- 1 | export async function webGPUCheck(): Promise { 2 | // check to see if navigator.gpu exists and if we can create a device 3 | if ((navigator as any).gpu) { 4 | try { 5 | const gpu = await (navigator as any).gpu.requestAdapter(); 6 | return gpu ? true : false; 7 | } 8 | catch (err) { 9 | console.error(err); 10 | return false; 11 | } 12 | } 13 | else { 14 | return false; 15 | } 16 | } -------------------------------------------------------------------------------- /test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
33 | 34 | 35 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /tests/main.test.ts: -------------------------------------------------------------------------------- 1 | import { expect, test } from 'vitest'; 2 | 3 | test('text-to-speech', async () => { 4 | return new Promise(async (resolve) => { 5 | const { textToSpeech } = await import("../src/index"); 6 | 7 | const audio = await textToSpeech("Hello, World!"); 8 | expect(audio).toBeDefined(); 9 | 10 | resolve(true); 11 | }); 12 | }); 13 | 14 | test('speech-to-text', async () => { 15 | return new Promise(async (resolve) => { 16 | const { transcribeAudioFile } = await import("../src/index"); 17 | 18 | const response = await fetch("https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/ted_60_16k.wav"); 19 | const blob = await response.blob(); 20 | 21 | const text = await transcribeAudioFile(blob); 22 | expect(text).toBeDefined(); 23 | 24 | resolve(true); 25 | }); 26 | }); 27 | 28 | test('ocr', async () => { 29 | return new Promise(async (resolve) => { 30 | const { ocr } = await import("../src/index"); 31 | 32 | const response = await fetch("https://picsum.photos/200/300"); 33 | const blob = await response.blob(); 34 | 35 | const text = await ocr(blob); 36 | 37 | expect(text).toBeDefined(); 38 | resolve(true); 39 | }); 40 | }); 41 | 42 | test('image-classification', async () => { 43 | return new Promise(async (resolve) => { 44 | const { classifyImage } = await import("../src/index"); 45 | 46 | const response = await fetch("https://picsum.photos/200/300"); 47 | const blob = await response.blob(); 48 | 49 | const text = await classifyImage(blob); 50 | 51 | expect(text).toBeDefined(); 52 | resolve(true); 53 | }); 54 | }); 55 | 56 | test('summarize', async () => { 57 | return new Promise(async (resolve) => { 58 | const { summarize } = await import("../src/index"); 59 | 60 | const text = "the red fox is a small fox that lives in the forest. it has a red coat and a bushy tail. the red fox is a carnivore, which means it eats meat. 
it hunts small animals like rabbits"; 61 | const summary = await summarize(text); 62 | 63 | expect(summary).toBeDefined(); 64 | resolve(true); 65 | }); 66 | }); -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "esModuleInterop": true, 4 | "forceConsistentCasingInFileNames": true, 5 | "outDir": "dist", 6 | "module": "esnext", 7 | "target": "esnext", 8 | "lib": [ 9 | "es2017", 10 | "esnext", 11 | "dom", 12 | "dom.iterable" 13 | ], 14 | "skipLibCheck": true, 15 | "declaration": true, 16 | "strict": true, 17 | "noUnusedLocals": true, 18 | "noUnusedParameters": true, 19 | "noImplicitReturns": true, 20 | "removeComments": false, 21 | "noFallthroughCasesInSwitch": true, 22 | "moduleResolution": "node", 23 | "types": ["@webgpu/types"], 24 | }, 25 | "include": [ 26 | "src/**/*.ts" 27 | ], 28 | "exclude": [] 29 | } -------------------------------------------------------------------------------- /vite.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from 'vite'; 2 | import dts from 'vite-plugin-dts'; 3 | // import basicSsl from '@vitejs/plugin-basic-ssl'; 4 | 5 | // https://vitejs.dev/config/ 6 | export default defineConfig({ 7 | worker: { 8 | format: "es" 9 | }, 10 | optimizeDeps: { 11 | esbuildOptions: { 12 | target: "es2022", 13 | } 14 | }, 15 | build: { 16 | sourcemap: false, 17 | assetsDir: "", 18 | cssMinify: true, 19 | target: "esnext", 20 | lib: { 21 | entry: "src/index.ts", 22 | formats: ["es"], 23 | fileName: "index", 24 | }, 25 | rollupOptions: { 26 | output: { 27 | format: "es", 28 | }, 29 | }, 30 | }, 31 | plugins: [ 32 | dts({ 33 | rollupTypes: true 34 | }) 35 | ] 36 | }) -------------------------------------------------------------------------------- /vitest.workspace.ts: -------------------------------------------------------------------------------- 1 | import { defineWorkspace } from 'vitest/config' 2 | 3 | export default defineWorkspace([ 4 | // If you want to keep running your existing tests in Node.js, uncomment the next line. 5 | // 'vite.config.ts', 6 | { 7 | extends: 'vite.config.ts', 8 | test: { 9 | testTimeout: 2000000, 10 | browser: { 11 | enabled: true, 12 | name: 'edge', 13 | provider: 'preview', 14 | }, 15 | }, 16 | }, 17 | ]) 18 | --------------------------------------------------------------------------------