├── .github └── workflows │ └── ci.yml ├── Audio-Transcription-Chrome ├── README.md ├── background.js ├── content.js ├── icon128.png ├── manifest.json ├── options.html ├── options.js ├── popup.html ├── popup.js └── style.css ├── Audio-Transcription-Firefox ├── README.md ├── background.js ├── content.js ├── icon128.png ├── manifest.json ├── popup.html ├── popup.js └── style.css ├── LICENSE ├── README.md ├── TensorRT_whisper.md ├── assets └── jfk.flac ├── docker ├── Dockerfile.cpu ├── Dockerfile.gpu ├── Dockerfile.openvino └── Dockerfile.tensorrt ├── docs ├── .nojekyll ├── doctrees │ ├── environment.pickle │ └── index.doctree ├── html │ ├── .buildinfo │ ├── _sources │ │ └── index.rst.txt │ ├── _static │ │ ├── alabaster.css │ │ ├── basic.css │ │ ├── custom.css │ │ ├── doctools.js │ │ ├── documentation_options.js │ │ ├── file.png │ │ ├── language_data.js │ │ ├── minus.png │ │ ├── plus.png │ │ ├── pygments.css │ │ ├── searchtools.js │ │ └── sphinx_highlight.js │ ├── genindex.html │ ├── index.html │ ├── objects.inv │ ├── py-modindex.html │ ├── search.html │ └── searchindex.js └── index.html ├── requirements ├── client.txt └── server.txt ├── run_server.py ├── scripts ├── build_whisper_tensorrt.sh └── setup.sh ├── setup.py ├── tests ├── __init__.py ├── test_client.py ├── test_server.py └── test_vad.py └── whisper_live ├── __init__.py ├── __version__.py ├── backend ├── __init__.py ├── base.py ├── faster_whisper_backend.py ├── openvino_backend.py └── trt_backend.py ├── client.py ├── server.py ├── transcriber ├── __init__.py ├── tensorrt_utils.py ├── transcriber_faster_whisper.py ├── transcriber_openvino.py └── transcriber_tensorrt.py ├── utils.py └── vad.py /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Test & Build CI/CD 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | tags: 8 | - v* 9 | pull_request: 10 | branches: [ main ] 11 | types: [opened, synchronize, reopened] 12 | 13 | jobs: 14 | run-tests: 15 | runs-on: ubuntu-22.04 16 | strategy: 17 | matrix: 18 | python-version: [3.9, '3.10', 3.11, 3.12] 19 | steps: 20 | - uses: actions/checkout@v2 21 | 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | 27 | - name: Cache Python dependencies 28 | uses: actions/cache@v4 29 | with: 30 | path: | 31 | ~/.cache/pip 32 | !~/.cache/pip/log 33 | key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('requirements/server.txt', 'requirements/client.txt') }} 34 | restore-keys: | 35 | ${{ runner.os }}-pip-${{ matrix.python-version }}- 36 | 37 | - name: Install system dependencies 38 | run: sudo apt-get update && sudo apt-get install -y portaudio19-dev 39 | 40 | - name: Install Python dependencies 41 | run: | 42 | python -m pip install --upgrade pip 43 | pip install -r requirements/server.txt --extra-index-url https://download.pytorch.org/whl/cpu 44 | pip install -r requirements/client.txt 45 | 46 | - name: Run tests 47 | run: | 48 | echo "Running tests with Python ${{ matrix.python-version }}" 49 | python -m unittest discover -s tests 50 | 51 | check-code-format: 52 | runs-on: ubuntu-22.04 53 | strategy: 54 | matrix: 55 | python-version: [3.9, '3.10', 3.11, 3.12] 56 | 57 | steps: 58 | - uses: actions/checkout@v2 59 | 60 | - name: Set up Python ${{ matrix.python-version }} 61 | uses: actions/setup-python@v2 62 | with: 63 | python-version: ${{ matrix.python-version }} 64 | 65 | - name: Install dependencies 66 | run: | 67 | 
python -m pip install --upgrade pip 68 | python -m pip install flake8 69 | 70 | - name: Lint with flake8 71 | run: | 72 | # stop the build if there are Python syntax errors or undefined names 73 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 74 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 75 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 76 | 77 | build-and-push-docker-cpu: 78 | needs: [run-tests, check-code-format] 79 | runs-on: ubuntu-22.04 80 | if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')) 81 | steps: 82 | - uses: actions/checkout@v2 83 | 84 | - name: Log in to GitHub Container Registry 85 | uses: docker/login-action@v1 86 | with: 87 | registry: ghcr.io 88 | username: ${{ github.repository_owner }} 89 | password: ${{ secrets.GHCR_TOKEN }} 90 | 91 | - name: Set up Docker Buildx 92 | uses: docker/setup-buildx-action@v1 93 | 94 | - name: Build and push Docker image 95 | uses: docker/build-push-action@v2 96 | with: 97 | context: . 98 | file: docker/Dockerfile.cpu 99 | push: true 100 | tags: ghcr.io/collabora/whisperlive-cpu:latest 101 | 102 | build-and-push-docker-gpu: 103 | needs: [run-tests, check-code-format, build-and-push-docker-cpu] 104 | timeout-minutes: 20 105 | runs-on: ubuntu-22.04 106 | if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')) 107 | steps: 108 | - uses: actions/checkout@v2 109 | 110 | - name: Log in to GitHub Container Registry 111 | uses: docker/login-action@v1 112 | with: 113 | registry: ghcr.io 114 | username: ${{ github.repository_owner }} 115 | password: ${{ secrets.GHCR_TOKEN }} 116 | 117 | - name: Docker Prune 118 | run: docker system prune -af 119 | 120 | - name: Set up Docker Buildx 121 | uses: docker/setup-buildx-action@v1 122 | 123 | - name: Build and push Docker GPU image 124 | uses: docker/build-push-action@v2 125 | with: 126 | context: . 127 | file: docker/Dockerfile.gpu 128 | push: true 129 | tags: ghcr.io/collabora/whisperlive-gpu:latest 130 | 131 | build-and-push-docker-openvino: 132 | needs: [run-tests, check-code-format, build-and-push-docker-cpu] 133 | timeout-minutes: 20 134 | runs-on: ubuntu-22.04 135 | if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')) 136 | steps: 137 | - uses: actions/checkout@v2 138 | 139 | - name: Log in to GitHub Container Registry 140 | uses: docker/login-action@v1 141 | with: 142 | registry: ghcr.io 143 | username: ${{ github.repository_owner }} 144 | password: ${{ secrets.GHCR_TOKEN }} 145 | 146 | - name: Docker Prune 147 | run: docker system prune -af 148 | 149 | - name: Set up Docker Buildx 150 | uses: docker/setup-buildx-action@v1 151 | 152 | - name: Build and push Docker GPU image 153 | uses: docker/build-push-action@v2 154 | with: 155 | context: . 
156 | file: docker/Dockerfile.openvino 157 | push: true 158 | tags: ghcr.io/collabora/whisperlive-openvino:latest 159 | 160 | publish-to-pypi: 161 | needs: [run-tests, check-code-format] 162 | runs-on: ubuntu-22.04 163 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') 164 | steps: 165 | - uses: actions/checkout@v2 166 | 167 | - name: Set up Python 3.9 168 | uses: actions/setup-python@v2 169 | with: 170 | python-version: 3.9 171 | 172 | - name: Cache Python dependencies 173 | uses: actions/cache@v4 174 | with: 175 | path: | 176 | ~/.cache/pip 177 | !~/.cache/pip/log 178 | key: ubuntu-latest-pip-3.9-${{ hashFiles('requirements/server.txt', 'requirements/client.txt') }} 179 | restore-keys: | 180 | ubuntu-latest-pip-3.9- 181 | 182 | - name: Install system dependencies 183 | run: sudo apt-get update && sudo apt-get install -y portaudio19-dev 184 | 185 | - name: Install Python dependencies 186 | run: | 187 | pip install -r requirements/server.txt 188 | pip install -r requirements/client.txt 189 | pip install wheel 190 | 191 | - name: Build package 192 | run: python setup.py sdist bdist_wheel 193 | 194 | - name: Publish package to PyPI 195 | uses: pypa/gh-action-pypi-publish@release/v1 196 | with: 197 | user: __token__ 198 | password: ${{ secrets.PYPI_API_TOKEN }} 199 | -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/README.md: -------------------------------------------------------------------------------- 1 | # Audio Transcription 2 | 3 | Audio Transcription is a Chrome extension that allows users to capture any audio playing on the current tab and transcribe it using OpenAI-whisper in real time. Users can also enable voice activity detection so that no audio is sent to the server when there is no speech. 4 | 5 | We use the OpenAI-whisper model to process the audio continuously and send the transcription back to the client. We apply a few optimizations on top of OpenAI's implementation to improve performance and run it in real time. To this end, we use [faster-whisper](https://github.com/guillaumekln/faster-whisper), which is 4x faster than OpenAI's implementation. 6 | 7 | ## Loading the Extension 8 | - Open the Google Chrome browser. 9 | - Type chrome://extensions in the address bar and press Enter. 10 | - Enable the Developer mode toggle switch located in the top right corner. 11 | - Clone this repository. 12 | - Click the Load unpacked button. 13 | - Browse to the location where you cloned the repository files and select the ```Audio-Transcription-Chrome``` folder. 14 | - The extension should now be loaded and visible on the extensions page. 15 | 16 | 17 | ## Real time transcription with OpenAI-whisper 18 | This Chrome extension allows you to send audio from your browser to a server that transcribes it in real time. It can also run voice activity detection on the client side to detect when speech is present, and it continuously receives transcriptions of the spoken content from the server. You can choose from the options menu whether to run the speech recognition. 19 | 20 | 21 | ## Implementation Details 22 | 23 | ### Capturing Audio 24 | To capture the audio in the current tab, we use the Chrome `tabCapture` API to obtain a `MediaStream` object for the current tab. 25 | 26 | ### Options 27 | When using the Audio Transcription extension, you have the following options: 28 | - **Use Collabora Server**: We provide a demo server which runs the whisper small model.
29 | - **Language**: Select the target language for transcription or translation. You can choose from a variety of languages supported by OpenAI-whisper. 30 | - **Task**: Choose the specific task to perform on the audio. You can select either "transcribe" for transcription or "translate" to translate the audio to English. 31 | - **Model Size**: Select the whisper model size to run the server with. 32 | 33 | ### Getting Started 34 | - Make sure the transcription server is running properly. For details on how to start the server, see the [documentation here](https://github.com/collabora/whisper-live). 35 | - Click the Chrome extension icon, which shows two options: 36 | - **Start Capture**: Starts capturing the audio in the current tab and sends the captured audio to the server for transcription. This also creates an element to show the transcriptions received from the server on the current tab. 37 | - **Stop Capture**: Stops capturing the audio. 38 | 39 | 40 | ## Limitations 41 | This extension requires an internet connection to stream audio and receive transcriptions. The accuracy of the transcriptions may vary depending on the audio quality and the performance of the server-side transcription service. The extension may consume additional system resources while running, especially when streaming audio. 42 | 43 | ## Note 44 | The extension relies on a properly running transcription server with multilingual support. Please follow the server documentation for setup and configuration. 45 | 46 | -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/background.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Removes a tab with the specified tab ID in Google Chrome. 3 | * @param {number} tabId - The ID of the tab to be removed. 4 | * @returns {Promise} A promise that resolves when the tab is successfully removed or fails to remove. 5 | */ 6 | function removeChromeTab(tabId) { 7 | return new Promise((resolve) => { 8 | chrome.tabs.remove(tabId) 9 | .then(resolve) 10 | .catch(resolve); 11 | }); 12 | } 13 | 14 | 15 | /** 16 | * Executes a script file in a specific tab in Google Chrome. 17 | * @param {number} tabId - The ID of the tab where the script should be executed. 18 | * @param {string} file - The file path or URL of the script to be executed. 19 | * @returns {Promise} A promise that resolves when the script is successfully executed or fails to execute. 20 | */ 21 | function executeScriptInTab(tabId, file) { 22 | return new Promise((resolve) => { 23 | chrome.scripting.executeScript( 24 | { 25 | target: { tabId }, 26 | files: [file], 27 | }, () => { 28 | resolve(); 29 | } 30 | ); 31 | }); 32 | } 33 | 34 | 35 | /** 36 | * Opens the options page of the Chrome extension in a new pinned tab. 37 | * @returns {Promise} A promise that resolves with the created tab object. 38 | */ 39 | function openExtensionOptions() { 40 | return new Promise((resolve) => { 41 | chrome.tabs.create( 42 | { 43 | pinned: true, 44 | active: false, 45 | url: `chrome-extension://${chrome.runtime.id}/options.html`, 46 | }, 47 | (tab) => { 48 | resolve(tab); 49 | } 50 | ); 51 | }); 52 | } 53 | 54 | 55 | /** 56 | * Retrieves the value associated with the specified key from the local storage in Google Chrome. 57 | * @param {string} key - The key of the value to retrieve from the local storage. 58 | * @returns {Promise} A promise that resolves with the retrieved value from the local storage.
59 | */ 60 | function getLocalStorageValue(key) { 61 | return new Promise((resolve) => { 62 | chrome.storage.local.get([key], (result) => { 63 | resolve(result[key]); 64 | }); 65 | }); 66 | } 67 | 68 | 69 | /** 70 | * Sends a message to a specific tab in Google Chrome. 71 | * @param {number} tabId - The ID of the tab to send the message to. 72 | * @param {any} data - The data to be sent as the message. 73 | * @returns {Promise} A promise that resolves with the response from the tab. 74 | */ 75 | function sendMessageToTab(tabId, data) { 76 | return new Promise((resolve) => { 77 | chrome.tabs.sendMessage(tabId, data, (response) => { 78 | resolve(response); 79 | }); 80 | }); 81 | } 82 | 83 | 84 | /** 85 | * Delays the execution for a specified duration. 86 | * @param {number} ms - The duration to sleep in milliseconds (default: 0). 87 | * @returns {Promise} A promise that resolves after the specified duration. 88 | */ 89 | function delayExecution(ms = 0) { 90 | return new Promise((resolve) => setTimeout(resolve, ms)); 91 | } 92 | 93 | 94 | /** 95 | * Sets a value associated with the specified key in the local storage of Google Chrome. 96 | * @param {string} key - The key to set in the local storage. 97 | * @param {any} value - The value to associate with the key in the local storage. 98 | * @returns {Promise} A promise that resolves with the value that was set in the local storage. 99 | */ 100 | function setLocalStorageValue(key, value) { 101 | return new Promise((resolve) => { 102 | chrome.storage.local.set( 103 | { 104 | [key]: value, 105 | }, () => { 106 | resolve(value); 107 | } 108 | ); 109 | }); 110 | } 111 | 112 | 113 | /** 114 | * Retrieves the tab object with the specified tabId. 115 | * @param {number} tabId - The ID of the tab to retrieve. 116 | * @returns {Promise} - A Promise that resolves to the tab object. 117 | */ 118 | async function getTab(tabId) { 119 | return new Promise((resolve) => { 120 | chrome.tabs.get(tabId, (tab) => { 121 | resolve(tab); 122 | }); 123 | }); 124 | } 125 | 126 | 127 | /** 128 | * Starts the capture process for the specified tab. 129 | * @param {number} tabId - The ID of the tab to start capturing. 130 | * @returns {Promise} - A Promise that resolves when the capture process is started successfully. 131 | */ 132 | async function startCapture(options) { 133 | const { tabId } = options; 134 | const optionTabId = await getLocalStorageValue("optionTabId"); 135 | if (optionTabId) { 136 | await removeChromeTab(optionTabId); 137 | } 138 | 139 | try { 140 | const currentTab = await getTab(tabId); 141 | if (currentTab.audible) { 142 | await setLocalStorageValue("currentTabId", currentTab.id); 143 | await executeScriptInTab(currentTab.id, "content.js"); 144 | await delayExecution(500); 145 | 146 | const optionTab = await openExtensionOptions(); 147 | 148 | await setLocalStorageValue("optionTabId", optionTab.id); 149 | await delayExecution(500); 150 | 151 | await sendMessageToTab(optionTab.id, { 152 | type: "start_capture", 153 | data: { 154 | currentTabId: currentTab.id, 155 | host: options.host, 156 | port: options.port, 157 | multilingual: options.useMultilingual, 158 | language: options.language, 159 | task: options.task, 160 | modelSize: options.modelSize, 161 | useVad: options.useVad, 162 | }, 163 | }); 164 | } else { 165 | console.log("No Audio"); 166 | } 167 | } catch (error) { 168 | console.error("Error occurred while starting capture:", error); 169 | } 170 | } 171 | 172 | 173 | /** 174 | * Stops the capture process and performs cleanup. 
175 | * @returns {Promise} - A Promise that resolves when the capture process is stopped successfully. 176 | */ 177 | async function stopCapture() { 178 | const optionTabId = await getLocalStorageValue("optionTabId"); 179 | const currentTabId = await getLocalStorageValue("currentTabId"); 180 | 181 | if (optionTabId) { 182 | res = await sendMessageToTab(currentTabId, { 183 | type: "STOP", 184 | data: { currentTabId: currentTabId }, 185 | }); 186 | await removeChromeTab(optionTabId); 187 | } 188 | } 189 | 190 | 191 | /** 192 | * Listens for messages from the runtime and performs corresponding actions. 193 | * @param {Object} message - The message received from the runtime. 194 | */ 195 | chrome.runtime.onMessage.addListener(async (message) => { 196 | if (message.action === "startCapture") { 197 | startCapture(message); 198 | } else if (message.action === "stopCapture") { 199 | stopCapture(); 200 | } else if (message.action === "updateSelectedLanguage") { 201 | const detectedLanguage = message.detectedLanguage; 202 | chrome.runtime.sendMessage({ action: "updateSelectedLanguage", detectedLanguage }); 203 | chrome.storage.local.set({ selectedLanguage: detectedLanguage }); 204 | } else if (message.action === "toggleCaptureButtons") { 205 | chrome.runtime.sendMessage({ action: "toggleCaptureButtons", data: false }); 206 | chrome.storage.local.set({ capturingState: { isCapturing: false } }) 207 | stopCapture(); 208 | } 209 | }); 210 | 211 | 212 | -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/content.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | var elem_container = null; 4 | var elem_text = null; 5 | 6 | var segments = []; 7 | var text_segments = []; 8 | 9 | function initPopupElement() { 10 | if (document.getElementById('popupElement')) { 11 | return; 12 | } 13 | 14 | const popupContainer = document.createElement('div'); 15 | popupContainer.id = 'popupElement'; 16 | popupContainer.style.cssText = 'position: fixed; top: 50%; left: 50%; transform: translate(-50%, -50%); background: white; color: black; padding: 16px; border-radius: 10px; box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.5); display: none; text-align: center;'; 17 | 18 | const popupText = document.createElement('span'); 19 | popupText.textContent = 'Default Text'; 20 | popupText.className = 'popupText'; 21 | popupText.style.fontSize = '24px'; 22 | popupContainer.appendChild(popupText); 23 | 24 | const buttonContainer = document.createElement('div'); 25 | buttonContainer.style.marginTop = '8px'; 26 | const closePopupButton = document.createElement('button'); 27 | closePopupButton.textContent = 'Close'; 28 | closePopupButton.style.backgroundColor = '#65428A'; 29 | closePopupButton.style.color = 'white'; 30 | closePopupButton.style.border = 'none'; 31 | closePopupButton.style.padding = '8px 16px'; // Add padding for better click area 32 | closePopupButton.style.cursor = 'pointer'; 33 | closePopupButton.addEventListener('click', async () => { 34 | popupContainer.style.display = 'none'; 35 | await browser.runtime.sendMessage({ action: 'toggleCaptureButtons', data: false }); 36 | }); 37 | buttonContainer.appendChild(closePopupButton); 38 | popupContainer.appendChild(buttonContainer); 39 | 40 | document.body.appendChild(popupContainer); 41 | } 42 | 43 | 44 | function showPopup(customText) { 45 | const popup = document.getElementById('popupElement'); 46 | const popupText = popup.querySelector('.popupText'); 47 | 48 | if (popup && 
popupText) { 49 | popupText.textContent = customText || 'Default Text'; // Set default text if custom text is not provided 50 | popup.style.display = 'block'; 51 | } 52 | } 53 | 54 | 55 | function init_element() { 56 | if (document.getElementById('transcription')) { 57 | return; 58 | } 59 | 60 | elem_container = document.createElement('div'); 61 | elem_container.id = "transcription"; 62 | elem_container.style.cssText = 'padding-top:16px;font-size:18px;position: fixed; top: 85%; left: 50%; transform: translate(-50%, -50%);line-height:18px;width:500px;height:90px;opacity:0.9;z-index:100;background:black;border-radius:10px;color:white;'; 63 | 64 | for (var i = 0; i < 4; i++) { 65 | elem_text = document.createElement('span'); 66 | elem_text.style.cssText = 'position: absolute;padding-left:16px;padding-right:16px;'; 67 | elem_text.id = "t" + i; 68 | elem_container.appendChild(elem_text); 69 | 70 | if (i == 3) { 71 | elem_text.style.top = "-1000px" 72 | } 73 | } 74 | 75 | document.body.appendChild(elem_container); 76 | 77 | let x = 0; 78 | let y = 0; 79 | 80 | // Query the element 81 | const ele = elem_container; 82 | 83 | // Handle the mousedown event 84 | // that's triggered when user drags the element 85 | const mouseDownHandler = function (e) { 86 | // Get the current mouse position 87 | x = e.clientX; 88 | y = e.clientY; 89 | 90 | // Attach the listeners to `document` 91 | document.addEventListener('mousemove', mouseMoveHandler); 92 | document.addEventListener('mouseup', mouseUpHandler); 93 | }; 94 | 95 | const mouseMoveHandler = function (e) { 96 | // How far the mouse has been moved 97 | const dx = e.clientX - x; 98 | const dy = e.clientY - y; 99 | 100 | // Set the position of element 101 | ele.style.top = `${ele.offsetTop + dy}px`; 102 | ele.style.left = `${ele.offsetLeft + dx}px`; 103 | 104 | // Reassign the position of mouse 105 | x = e.clientX; 106 | y = e.clientY; 107 | }; 108 | 109 | const mouseUpHandler = function () { 110 | // Remove the handlers of `mousemove` and `mouseup` 111 | document.removeEventListener('mousemove', mouseMoveHandler); 112 | document.removeEventListener('mouseup', mouseUpHandler); 113 | }; 114 | 115 | ele.addEventListener('mousedown', mouseDownHandler); 116 | } 117 | 118 | function getStyle(el,styleProp) 119 | { 120 | var x = document.getElementById(el); 121 | if (x.currentStyle) 122 | var y = x.currentStyle[styleProp]; 123 | else if (window.getComputedStyle) 124 | var y = document.defaultView.getComputedStyle(x,null).getPropertyValue(styleProp); 125 | return y; 126 | } 127 | 128 | function get_lines(elem, line_height) { 129 | var divHeight = elem.offsetHeight; 130 | var lines = divHeight / line_height; 131 | 132 | var original_text = elem.innerHTML; 133 | 134 | var words = original_text.split(' '); 135 | var segments = []; 136 | var current_lines = 1; 137 | var segment = ''; 138 | var segment_len = 0; 139 | for (var i = 0; i < words.length; i++) 140 | { 141 | segment += words[i] + ' '; 142 | elem.innerHTML = segment; 143 | divHeight = elem.offsetHeight; 144 | 145 | if ((divHeight / line_height) > current_lines) { 146 | var line_segment = segment.substring(segment_len, segment.length - 1 - words[i].length - 1); 147 | segments.push(line_segment); 148 | segment_len += line_segment.length + 1; 149 | current_lines++; 150 | } 151 | } 152 | 153 | var line_segment = segment.substring(segment_len, segment.length - 1) 154 | segments.push(line_segment); 155 | 156 | elem.innerHTML = original_text; 157 | 158 | return segments; 159 | 160 | } 161 | 162 | function 
remove_element() { 163 | var elem = document.getElementById('transcription') 164 | for (var i = 0; i < 4; i++) { 165 | document.getElementById("t" + i).remove(); 166 | } 167 | elem.remove() 168 | } 169 | 170 | chrome.runtime.onMessage.addListener((request, sender, sendResponse) => { 171 | const { type, data } = request; 172 | 173 | if (type === "STOP") { 174 | remove_element(); 175 | sendResponse({data: "STOPPED"}); 176 | return true; 177 | } else if (type === "showWaitPopup"){ 178 | initPopupElement(); 179 | 180 | showPopup(`Estimated wait time ~ ${Math.round(data)} minutes`); 181 | sendResponse({data: "popup"}); 182 | return true; 183 | } 184 | 185 | init_element(); 186 | 187 | message = JSON.parse(data); 188 | message = message["segments"]; 189 | 190 | var text = ''; 191 | for (var i = 0; i < message.length; i++) { 192 | text += message[i].text + ' '; 193 | } 194 | text = text.replace(/(\r\n|\n|\r)/gm, ""); 195 | 196 | var elem = document.getElementById('t3'); 197 | elem.innerHTML = text; 198 | 199 | var line_height_style = getStyle('t3', 'line-height'); 200 | var line_height = parseInt(line_height_style.substring(0, line_height_style.length - 2)); 201 | var divHeight = elem.offsetHeight; 202 | var lines = divHeight / line_height; 203 | 204 | text_segments = []; 205 | text_segments = get_lines(elem, line_height); 206 | 207 | elem.innerHTML = ''; 208 | 209 | if (text_segments.length > 2) { 210 | for (var i = 0; i < 3; i++) { 211 | document.getElementById('t' + i).innerHTML = text_segments[text_segments.length - 3 + i]; 212 | } 213 | } else { 214 | for (var i = 0; i < 3; i++) { 215 | document.getElementById('t' + i).innerHTML = ''; 216 | } 217 | } 218 | 219 | if (text_segments.length <= 2) { 220 | for (var i = 0; i < text_segments.length; i++) { 221 | document.getElementById('t' + i).innerHTML = text_segments[i]; 222 | } 223 | } else { 224 | for (var i = 0; i < 3; i++) { 225 | document.getElementById('t' + i).innerHTML = text_segments[text_segments.length - 3 + i]; 226 | } 227 | } 228 | 229 | for (var i = 1; i < 3; i++) 230 | { 231 | var parent_elem = document.getElementById('t' + (i - 1)); 232 | var elem = document.getElementById('t' + i); 233 | elem.style.top = parent_elem.offsetHeight + parent_elem.offsetTop + 'px'; 234 | } 235 | 236 | sendResponse({}); 237 | return true; 238 | }); 239 | -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/icon128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/Audio-Transcription-Chrome/icon128.png -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest_version": 3, 3 | 4 | "name": "Audio Transcription", 5 | "version": "1.0.0", 6 | "description": "This extension captures the audio on the current tab, sends it to a server for transcription and shows the transcription in Real-time.", 7 | 8 | "options_page": "options.html", 9 | "background": { 10 | "service_worker": "background.js" 11 | }, 12 | "permissions": [ 13 | "storage", 14 | "activeTab", 15 | "tabCapture", 16 | "scripting" 17 | ], 18 | "icons": { 19 | "128":"icon128.png" 20 | }, 21 | "action": { 22 | "default_popup": "popup.html", 23 | "default_icon": "icon128.png" 24 | } 25 | } 26 | 
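The README above describes the real-time flow only at a high level; concretely, the extension opens a WebSocket to the transcription server, sends a single JSON configuration message, and then streams raw 16 kHz Float32 audio while the server pushes back JSON segment updates (see options.js further down). The sketch below is a minimal, standalone illustration of that handshake, not the extension's actual code: the default host/port, the `onSegments` callback, and the use of `crypto.randomUUID()` (instead of the extension's own `generateUUID` helper) are assumptions made for the example.

```javascript
// Minimal sketch of the WhisperLive client handshake used by the extension.
// Assumes a server reachable at ws://localhost:9090 (placeholder) and a caller
// that supplies Float32Array chunks already resampled to 16 kHz.
function createTranscriptionSocket({
  host = "localhost", port = "9090",            // placeholders, not the extension defaults
  language = null, task = "transcribe", model = "small", useVad = true,
  onSegments = () => {},                        // placeholder callback for transcript segments
} = {}) {
  const uid = crypto.randomUUID();              // unique client id, echoed back by the server
  const socket = new WebSocket(`ws://${host}:${port}/`);
  let serverReady = false;

  socket.onopen = () => {
    // First message is the JSON config; everything after it is raw audio.
    socket.send(JSON.stringify({ uid, language, task, model, use_vad: useVad }));
  };

  socket.onmessage = (event) => {
    const data = JSON.parse(event.data);
    if (data.uid !== uid) return;                       // ignore messages for other clients
    if (data.status === "WAIT") return;                 // server busy; data.message holds the wait estimate
    if (!serverReady) { serverReady = true; return; }   // first reply after the config = ready signal
    if (data.message === "DISCONNECT") { socket.close(); return; }
    if (data.segments) onSegments(data.segments);       // [{ text, ... }, ...]
  };

  return {
    // chunk: Float32Array at 16 kHz (see resampleTo16kHZ in options.js)
    sendAudio(chunk) {
      if (serverReady && socket.readyState === WebSocket.OPEN) socket.send(chunk);
    },
    close() { socket.close(); },
  };
}
```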
-------------------------------------------------------------------------------- /Audio-Transcription-Chrome/options.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Audio Transcription Options 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/options.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Captures audio from the active tab in Google Chrome. 3 | * @returns {Promise} A promise that resolves with the captured audio stream. 4 | */ 5 | function captureTabAudio() { 6 | return new Promise((resolve) => { 7 | chrome.tabCapture.capture( 8 | { 9 | audio: true, 10 | video: false, 11 | }, 12 | (stream) => { 13 | resolve(stream); 14 | } 15 | ); 16 | }); 17 | } 18 | 19 | 20 | /** 21 | * Sends a message to a specific tab in Google Chrome. 22 | * @param {number} tabId - The ID of the tab to send the message to. 23 | * @param {any} data - The data to be sent as the message. 24 | * @returns {Promise} A promise that resolves with the response from the tab. 25 | */ 26 | function sendMessageToTab(tabId, data) { 27 | return new Promise((resolve) => { 28 | chrome.tabs.sendMessage(tabId, data, (response) => { 29 | resolve(response); 30 | }); 31 | }); 32 | } 33 | 34 | 35 | /** 36 | * Resamples the audio data to a target sample rate of 16kHz. 37 | * @param {Array|ArrayBuffer|TypedArray} audioData - The input audio data. 38 | * @param {number} [origSampleRate=44100] - The original sample rate of the audio data. 39 | * @returns {Float32Array} The resampled audio data at 16kHz. 40 | */ 41 | function resampleTo16kHZ(audioData, origSampleRate = 44100) { 42 | // Convert the audio data to a Float32Array 43 | const data = new Float32Array(audioData); 44 | 45 | // Calculate the desired length of the resampled data 46 | const targetLength = Math.round(data.length * (16000 / origSampleRate)); 47 | 48 | // Create a new Float32Array for the resampled data 49 | const resampledData = new Float32Array(targetLength); 50 | 51 | // Calculate the spring factor and initialize the first and last values 52 | const springFactor = (data.length - 1) / (targetLength - 1); 53 | resampledData[0] = data[0]; 54 | resampledData[targetLength - 1] = data[data.length - 1]; 55 | 56 | // Resample the audio data 57 | for (let i = 1; i < targetLength - 1; i++) { 58 | const index = i * springFactor; 59 | const leftIndex = Math.floor(index).toFixed(); 60 | const rightIndex = Math.ceil(index).toFixed(); 61 | const fraction = index - leftIndex; 62 | resampledData[i] = data[leftIndex] + (data[rightIndex] - data[leftIndex]) * fraction; 63 | } 64 | 65 | // Return the resampled data 66 | return resampledData; 67 | } 68 | 69 | function generateUUID() { 70 | let dt = new Date().getTime(); 71 | const uuid = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) { 72 | const r = (dt + Math.random() * 16) % 16 | 0; 73 | dt = Math.floor(dt / 16); 74 | return (c === 'x' ? r : (r & 0x3 | 0x8)).toString(16); 75 | }); 76 | return uuid; 77 | } 78 | 79 | 80 | /** 81 | * Starts recording audio from the captured tab. 82 | * @param {Object} option - The options object containing the currentTabId. 
83 | */ 84 | async function startRecord(option) { 85 | const stream = await captureTabAudio(); 86 | const uuid = generateUUID(); 87 | 88 | if (stream) { 89 | // call when the stream inactive 90 | stream.oninactive = () => { 91 | window.close(); 92 | }; 93 | const socket = new WebSocket(`ws://${option.host}:${option.port}/`); 94 | let isServerReady = false; 95 | let language = option.language; 96 | socket.onopen = function(e) { 97 | socket.send( 98 | JSON.stringify({ 99 | uid: uuid, 100 | language: option.language, 101 | task: option.task, 102 | model: option.modelSize, 103 | use_vad: option.useVad 104 | }) 105 | ); 106 | }; 107 | 108 | socket.onmessage = async (event) => { 109 | const data = JSON.parse(event.data); 110 | if (data["uid"] !== uuid) 111 | return; 112 | 113 | if (data["status"] === "WAIT"){ 114 | await sendMessageToTab(option.currentTabId, { 115 | type: "showWaitPopup", 116 | data: data["message"], 117 | }); 118 | chrome.runtime.sendMessage({ action: "toggleCaptureButtons", data: false }) 119 | chrome.runtime.sendMessage({ action: "stopCapture" }) 120 | return; 121 | } 122 | 123 | if (isServerReady === false){ 124 | isServerReady = true; 125 | return; 126 | } 127 | 128 | if (language === null) { 129 | language = data["language"]; 130 | 131 | // send message to popup.js to update dropdown 132 | // console.log(language); 133 | chrome.runtime.sendMessage({ 134 | action: "updateSelectedLanguage", 135 | detectedLanguage: language, 136 | }); 137 | 138 | return; 139 | } 140 | 141 | if (data["message"] === "DISCONNECT"){ 142 | chrome.runtime.sendMessage({ action: "toggleCaptureButtons", data: false }) 143 | return; 144 | } 145 | 146 | res = await sendMessageToTab(option.currentTabId, { 147 | type: "transcript", 148 | data: event.data, 149 | }); 150 | }; 151 | 152 | 153 | const audioDataCache = []; 154 | const context = new AudioContext(); 155 | const mediaStream = context.createMediaStreamSource(stream); 156 | const recorder = context.createScriptProcessor(4096, 1, 1); 157 | 158 | recorder.onaudioprocess = async (event) => { 159 | if (!context || !isServerReady) return; 160 | 161 | const inputData = event.inputBuffer.getChannelData(0); 162 | const audioData16kHz = resampleTo16kHZ(inputData, context.sampleRate); 163 | 164 | audioDataCache.push(inputData); 165 | 166 | socket.send(audioData16kHz); 167 | }; 168 | 169 | // Prevent page mute 170 | mediaStream.connect(recorder); 171 | recorder.connect(context.destination); 172 | mediaStream.connect(context.destination); 173 | // } 174 | } else { 175 | window.close(); 176 | } 177 | } 178 | 179 | /** 180 | * Listener for incoming messages from the extension's background script. 181 | * @param {Object} request - The message request object. 182 | * @param {Object} sender - The sender object containing information about the message sender. 183 | * @param {Function} sendResponse - The function to send a response back to the message sender. 184 | */ 185 | chrome.runtime.onMessage.addListener((request, sender, sendResponse) => { 186 | const { type, data } = request; 187 | 188 | switch (type) { 189 | case "start_capture": 190 | startRecord(data); 191 | break; 192 | default: 193 | break; 194 | } 195 | 196 | sendResponse({}); 197 | return true; 198 | }); 199 | -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/popup.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Audio Capture 5 | 6 | 7 | 8 | 9 |

[popup.html markup is not preserved in this dump; the recoverable text is the "Audio Transcription" header and the "Start Capture" / "Stop Capture" buttons, along with the checkbox and dropdown controls that popup.js queries by ID.]
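Because the popup markup itself could not be recovered, the sketch below shows, in plain DOM calls, the minimal set of elements that popup.js (the next file) expects to find. Only the element IDs are taken from popup.js; the tags, labels, and option values are illustrative assumptions, not the real popup.html contents.

```javascript
// Illustrative only: IDs match what popup.js queries; layout and option lists
// of the real popup.html are not reproduced here.
function buildMinimalPopup(root = document.body) {
  const make = (tag, id, text) => {
    const el = document.createElement(tag);
    el.id = id;
    if (text) el.textContent = text;
    root.appendChild(el);
    return el;
  };

  make("button", "startCapture", "Start Capture");
  make("button", "stopCapture", "Stop Capture");

  const useServer = make("input", "useServerCheckbox");
  useServer.type = "checkbox";
  const useVad = make("input", "useVadCheckbox");
  useVad.type = "checkbox";

  // The real popup lists many Whisper languages and model sizes; a couple of
  // placeholder options per dropdown are enough for popup.js to read a value.
  const addOptions = (select, values) => values.forEach((v) => {
    const opt = document.createElement("option");
    opt.value = v;
    opt.textContent = v || "auto-detect";
    select.appendChild(opt);
  });
  addOptions(make("select", "languageDropdown"), ["", "en"]);
  addOptions(make("select", "taskDropdown"), ["transcribe", "translate"]);
  addOptions(make("select", "modelSizeDropdown"), ["small", "tiny"]);
}
```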
22 | 127 | 135 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/popup.js: -------------------------------------------------------------------------------- 1 | // Wait for the DOM content to be fully loaded 2 | document.addEventListener("DOMContentLoaded", function () { 3 | const startButton = document.getElementById("startCapture"); 4 | const stopButton = document.getElementById("stopCapture"); 5 | 6 | const useServerCheckbox = document.getElementById("useServerCheckbox"); 7 | const useVadCheckbox = document.getElementById("useVadCheckbox"); 8 | const languageDropdown = document.getElementById('languageDropdown'); 9 | const taskDropdown = document.getElementById('taskDropdown'); 10 | const modelSizeDropdown = document.getElementById('modelSizeDropdown'); 11 | let selectedLanguage = null; 12 | let selectedTask = taskDropdown.value; 13 | let selectedModelSize = modelSizeDropdown.value; 14 | 15 | // Add click event listeners to the buttons 16 | startButton.addEventListener("click", startCapture); 17 | stopButton.addEventListener("click", stopCapture); 18 | 19 | // Retrieve capturing state from storage on popup open 20 | chrome.storage.local.get("capturingState", ({ capturingState }) => { 21 | if (capturingState && capturingState.isCapturing) { 22 | toggleCaptureButtons(true); 23 | } else { 24 | toggleCaptureButtons(false); 25 | } 26 | }); 27 | 28 | // Retrieve checkbox state from storage on popup open 29 | chrome.storage.local.get("useServerState", ({ useServerState }) => { 30 | if (useServerState !== undefined) { 31 | useServerCheckbox.checked = useServerState; 32 | } 33 | }); 34 | 35 | chrome.storage.local.get("useVadState", ({ useVadState }) => { 36 | if (useVadState !== undefined) { 37 | useVadCheckbox.checked = useVadState; 38 | } 39 | }); 40 | 41 | chrome.storage.local.get("selectedLanguage", ({ selectedLanguage: storedLanguage }) => { 42 | if (storedLanguage !== undefined) { 43 | languageDropdown.value = storedLanguage; 44 | selectedLanguage = storedLanguage; 45 | } 46 | }); 47 | 48 | chrome.storage.local.get("selectedTask", ({ selectedTask: storedTask }) => { 49 | if (storedTask !== undefined) { 50 | taskDropdown.value = storedTask; 51 | selectedTask = storedTask; 52 | } 53 | }); 54 | 55 | chrome.storage.local.get("selectedModelSize", ({ selectedModelSize: storedModelSize }) => { 56 | if (storedModelSize !== undefined) { 57 | modelSizeDropdown.value = storedModelSize; 58 | selectedModelSize = storedModelSize; 59 | } 60 | }); 61 | 62 | // Function to handle the start capture button click event 63 | async function startCapture() { 64 | // Ignore click if the button is disabled 65 | if (startButton.disabled) { 66 | return; 67 | } 68 | 69 | // Get the current active tab 70 | const currentTab = await getCurrentTab(); 71 | 72 | // Send a message to the background script to start capturing 73 | let host = "localhost"; 74 | let port = "9090"; 75 | const useCollaboraServer = useServerCheckbox.checked; 76 | if (useCollaboraServer){ 77 | host = "transcription.kurg.org" 78 | port = "7090" 79 | } 80 | 81 | chrome.runtime.sendMessage( 82 | { 83 | action: "startCapture", 84 | tabId: currentTab.id, 85 | host: host, 86 | port: port, 87 | language: selectedLanguage, 88 | task: selectedTask, 89 | modelSize: selectedModelSize, 90 | useVad: useVadCheckbox.checked, 91 | }, () => { 92 | // Update capturing state in storage and toggle the buttons 93 | chrome.storage.local.set({ capturingState: { isCapturing: true } }, () => { 94 
| toggleCaptureButtons(true); 95 | }); 96 | } 97 | ); 98 | } 99 | 100 | // Function to handle the stop capture button click event 101 | function stopCapture() { 102 | // Ignore click if the button is disabled 103 | if (stopButton.disabled) { 104 | return; 105 | } 106 | 107 | // Send a message to the background script to stop capturing 108 | chrome.runtime.sendMessage({ action: "stopCapture" }, () => { 109 | // Update capturing state in storage and toggle the buttons 110 | chrome.storage.local.set({ capturingState: { isCapturing: false } }, () => { 111 | toggleCaptureButtons(false); 112 | }); 113 | }); 114 | } 115 | 116 | // Function to get the current active tab 117 | async function getCurrentTab() { 118 | return new Promise((resolve) => { 119 | chrome.tabs.query({ active: true, currentWindow: true }, (tabs) => { 120 | resolve(tabs[0]); 121 | }); 122 | }); 123 | } 124 | 125 | // Function to toggle the capture buttons based on the capturing state 126 | function toggleCaptureButtons(isCapturing) { 127 | startButton.disabled = isCapturing; 128 | stopButton.disabled = !isCapturing; 129 | useServerCheckbox.disabled = isCapturing; 130 | useVadCheckbox.disabled = isCapturing; 131 | modelSizeDropdown.disabled = isCapturing; 132 | languageDropdown.disabled = isCapturing; 133 | taskDropdown.disabled = isCapturing; 134 | startButton.classList.toggle("disabled", isCapturing); 135 | stopButton.classList.toggle("disabled", !isCapturing); 136 | } 137 | 138 | // Save the checkbox state when it's toggled 139 | useServerCheckbox.addEventListener("change", () => { 140 | const useServerState = useServerCheckbox.checked; 141 | chrome.storage.local.set({ useServerState }); 142 | }); 143 | 144 | useVadCheckbox.addEventListener("change", () => { 145 | const useVadState = useVadCheckbox.checked; 146 | chrome.storage.local.set({ useVadState }); 147 | }); 148 | 149 | languageDropdown.addEventListener('change', function() { 150 | if (languageDropdown.value === "") { 151 | selectedLanguage = null; 152 | } else { 153 | selectedLanguage = languageDropdown.value; 154 | } 155 | chrome.storage.local.set({ selectedLanguage }); 156 | }); 157 | 158 | taskDropdown.addEventListener('change', function() { 159 | selectedTask = taskDropdown.value; 160 | chrome.storage.local.set({ selectedTask }); 161 | }); 162 | 163 | modelSizeDropdown.addEventListener('change', function() { 164 | selectedModelSize = modelSizeDropdown.value; 165 | chrome.storage.local.set({ selectedModelSize }); 166 | }); 167 | 168 | chrome.runtime.onMessage.addListener(async (request, sender, sendResponse) => { 169 | if (request.action === "updateSelectedLanguage") { 170 | const detectedLanguage = request.detectedLanguage; 171 | 172 | if (detectedLanguage) { 173 | languageDropdown.value = detectedLanguage; 174 | chrome.storage.local.set({ selectedLanguage: detectedLanguage }); 175 | } 176 | } 177 | }); 178 | 179 | chrome.runtime.onMessage.addListener(async (request, sender, sendResponse) => { 180 | if (request.action === "toggleCaptureButtons") { 181 | toggleCaptureButtons(false); 182 | chrome.storage.local.set({ capturingState: { isCapturing: false } }) 183 | } 184 | }); 185 | 186 | }); 187 | -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/style.css: -------------------------------------------------------------------------------- 1 | .header { 2 | display: flex; 3 | align-items: center; 4 | padding-bottom: 15px; 5 | padding-left: 20px; 6 | border-bottom: 2px solid darkred; 7 | } 8 | 9 | .header-title { 
10 | padding: 0 5px; 11 | } 12 | 13 | h1 { 14 | font-size: 36px; 15 | } 16 | 17 | img { 18 | height: 64px; 19 | margin: 0 20px 0 0; 20 | } 21 | 22 | h2 { 23 | font-size: 26px; 24 | } 25 | 26 | label { 27 | font-size: 16px; 28 | } 29 | 30 | .inner { 31 | margin-left: 40px; 32 | } 33 | 34 | .options-list { 35 | padding: 0; 36 | list-style: none; 37 | } 38 | 39 | .options-list li { 40 | padding: 10px; 41 | } 42 | 43 | .time { 44 | font-size: 16px; 45 | } 46 | 47 | .limit { 48 | display: inline-block; 49 | margin: 0; 50 | font-size: 12px; 51 | } 52 | 53 | .radioChoice { 54 | margin-left: 15px; 55 | } 56 | 57 | .button-container { 58 | display: flex; 59 | justify-content: space-between; 60 | padding: 10px; 61 | } 62 | 63 | .button { 64 | padding: 10px; 65 | border: 2px solid darkred; 66 | font-size: 16px; 67 | font-weight: bold; 68 | cursor: pointer; 69 | white-space: nowrap; 70 | width: 150px; 71 | border-radius: 5px; 72 | } 73 | 74 | .disabled { 75 | opacity: 0.6; 76 | cursor: not-allowed; 77 | } 78 | 79 | .button:hover:not(:disabled) { 80 | color: red; 81 | background-color: darkred; 82 | } 83 | 84 | #save { 85 | font-size: 16px; 86 | margin-left: 50px; 87 | } 88 | 89 | #status { 90 | color: red; 91 | margin-top: 8px; 92 | margin-left: 50px; 93 | font-size: 14px; 94 | } 95 | 96 | #qualityLi { 97 | display: none; 98 | } 99 | 100 | #maxTime { 101 | width: 30px; 102 | text-align: center; 103 | } 104 | 105 | .checkbox-container { 106 | padding: 10px; 107 | } 108 | 109 | .dropdown-container { 110 | padding: 10px; 111 | } 112 | -------------------------------------------------------------------------------- /Audio-Transcription-Firefox/README.md: -------------------------------------------------------------------------------- 1 | # Audio Transcription Firefox 2 | 3 | Audio Transcription is a Firefox extension that allows users to capture any audio playing on the current tab and transcribe it using OpenAI-whisper in real time. Users can also enable voice activity detection so that no audio is sent to the server when there is no speech. 4 | 5 | We use the OpenAI-whisper model to process the audio continuously and send the transcription back to the client. We apply a few optimizations on top of OpenAI's implementation to improve performance and run it in real time. To this end, we use [faster-whisper](https://github.com/guillaumekln/faster-whisper), which is 4x faster than OpenAI's implementation. 6 | 7 | ## Loading the Extension 8 | - Open the Mozilla Firefox browser. 9 | - Type ```about:debugging#/runtime/this-firefox``` in the address bar and press Enter. 10 | - Clone this repository. 11 | - Click the Load Temporary Add-on button. 12 | - Browse to the location where you cloned the repository files and select the ```Audio-Transcription-Firefox``` folder. 13 | - The extension should now be loaded and visible on the extensions page. 14 | 15 | 16 | ## Real time transcription with OpenAI-whisper 17 | This Firefox extension allows you to send audio from your browser to a server that transcribes it in real time. 18 | 19 | ## Implementation Details 20 | 21 | ### Capturing Audio 22 | To capture audio, the extension uses `navigator.mediaDevices.getUserMedia` to obtain a `MediaStream` (see content.js); the Chrome `tabCapture` API is not available in Firefox. 23 | 24 | ### Options 25 | When using the Audio Transcription extension, you have the following options: 26 | - **Use Collabora Server**: We provide a demo server which runs the whisper small model. 27 | - **Language**: Select the target language for transcription or translation.
You can choose from a variety of languages supported by OpenAI-whisper. 28 | - **Task**: Choose the specific task to perform on the audio. You can select either "transcribe" for transcription or "translate" to translate the audio to English. 29 | - **Model Size**: Select the whisper model size to run the server with. 30 | 31 | ### Getting Started 32 | - Make sure the transcription server is running properly. For details on how to start the server, see the [documentation here](https://github.com/collabora/whisper-live). 33 | - Click the Firefox extension icon, which shows two options: 34 | - **Start Capture**: Starts capturing the audio in the current tab and sends the captured audio to the server for transcription. This also creates an element to show the transcriptions received from the server on the current tab. 35 | - **Stop Capture**: Stops capturing the audio. 36 | 37 | 38 | ## Limitations 39 | This extension requires an internet connection to stream audio and receive transcriptions. The accuracy of the transcriptions may vary depending on the audio quality and the performance of the server-side transcription service. The extension may consume additional system resources while running, especially when streaming audio. 40 | 41 | ## Note 42 | The extension relies on a properly running transcription server with multilingual support. Please follow the server documentation for setup and configuration. 43 | -------------------------------------------------------------------------------- /Audio-Transcription-Firefox/background.js: -------------------------------------------------------------------------------- 1 | browser.runtime.onMessage.addListener(async function(request, sender, sendResponse) { 2 | const { action, data } = request; 3 | if (action === "transcript") { 4 | await browser.tabs.query({ active: true, currentWindow: true }) 5 | .then((tabs) => { 6 | const tabId = tabs[0].id; 7 | browser.tabs.sendMessage(tabId, { action: "show_transcript", data }); 8 | }) 9 | .catch((error) => { 10 | console.error("Error retrieving active tab:", error); 11 | }); 12 | } 13 | if (action === "updateSelectedLanguage") { 14 | const detectedLanguage = data; 15 | try { 16 | await browser.storage.local.set({ selectedLanguage: detectedLanguage }); 17 | browser.tabs.query({ active: true, currentWindow: true }).then((tabs) => { 18 | const tabId = tabs[0].id; 19 | browser.tabs.sendMessage(tabId, { action: "updateSelectedLanguage", detectedLanguage }); 20 | }); 21 | } catch (error) { 22 | console.error("Error updateSelectedLanguage:", error); 23 | } 24 | } 25 | if (action === "toggleCaptureButtons") { 26 | try { 27 | await browser.storage.local.set({ capturingState: { isCapturing: false } }); 28 | browser.tabs.query({ active: true, currentWindow: true }).then((tabs) => { 29 | const tabId = tabs[0].id; 30 | browser.tabs.sendMessage(tabId, { action: "toggleCaptureButtons", data: false }); 31 | }); 32 | } catch (error) { 33 | console.error("Error updating capturing state:", error); 34 | } 35 | 36 | try{ 37 | await browser.tabs.query({ active: true, currentWindow: true }) 38 | .then((tabs) => { 39 | const tabId = tabs[0].id; 40 | browser.tabs.sendMessage(tabId, { action: "stopCapture", data }); 41 | }) 42 | .catch((error) => { 43 | console.error("Error retrieving active tab:", error); 44 | }); 45 | } catch (error) { 46 | console.error(error); 47 | } 48 | } 49 | 50 | if (action === "showPopup") { 51 | try{ 52 | await browser.tabs.query({ active: true, currentWindow: true }) 53 | .then((tabs) => { 54 |
const tabId = tabs[0].id; 55 | browser.tabs.sendMessage(tabId, { action: "showWaitPopup", data }); 56 | }) 57 | .catch((error) => { 58 | console.error(error); 59 | }); 60 | } catch (error) { 61 | console.error(error); 62 | } 63 | } 64 | }); 65 | 66 | -------------------------------------------------------------------------------- /Audio-Transcription-Firefox/content.js: -------------------------------------------------------------------------------- 1 | let socket = null; 2 | let isCapturing = false; 3 | let mediaStream = null; 4 | let audioContext = null; 5 | let scriptProcessor = null; 6 | let language = null; 7 | 8 | let isPaused = false; 9 | 10 | const mediaElements = document.querySelectorAll('video, audio'); 11 | mediaElements.forEach((mediaElement) => { 12 | mediaElement.addEventListener('play', handlePlaybackStateChange); 13 | mediaElement.addEventListener('pause', handlePlaybackStateChange); 14 | }); 15 | 16 | 17 | function handlePlaybackStateChange(event) { 18 | isPaused = event.target.paused; 19 | } 20 | 21 | function generateUUID() { 22 | let dt = new Date().getTime(); 23 | const uuid = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) { 24 | const r = (dt + Math.random() * 16) % 16 | 0; 25 | dt = Math.floor(dt / 16); 26 | return (c === 'x' ? r : (r & 0x3 | 0x8)).toString(16); 27 | }); 28 | return uuid; 29 | } 30 | 31 | 32 | /** 33 | * Resamples the audio data to a target sample rate of 16kHz. 34 | * @param {Array|ArrayBuffer|TypedArray} audioData - The input audio data. 35 | * @param {number} [origSampleRate=44100] - The original sample rate of the audio data. 36 | * @returns {Float32Array} The resampled audio data at 16kHz. 37 | */ 38 | function resampleTo16kHZ(audioData, origSampleRate = 44100) { 39 | // Convert the audio data to a Float32Array 40 | const data = new Float32Array(audioData); 41 | 42 | // Calculate the desired length of the resampled data 43 | const targetLength = Math.round(data.length * (16000 / origSampleRate)); 44 | 45 | // Create a new Float32Array for the resampled data 46 | const resampledData = new Float32Array(targetLength); 47 | 48 | // Calculate the spring factor and initialize the first and last values 49 | const springFactor = (data.length - 1) / (targetLength - 1); 50 | resampledData[0] = data[0]; 51 | resampledData[targetLength - 1] = data[data.length - 1]; 52 | 53 | // Resample the audio data 54 | for (let i = 1; i < targetLength - 1; i++) { 55 | const index = i * springFactor; 56 | const leftIndex = Math.floor(index).toFixed(); 57 | const rightIndex = Math.ceil(index).toFixed(); 58 | const fraction = index - leftIndex; 59 | resampledData[i] = data[leftIndex] + (data[rightIndex] - data[leftIndex]) * fraction; 60 | } 61 | 62 | // Return the resampled data 63 | return resampledData; 64 | } 65 | 66 | function startRecording(data) { 67 | socket = new WebSocket(`ws://${data.host}:${data.port}/`); 68 | language = data.language; 69 | 70 | const uuid = generateUUID(); 71 | socket.onopen = function(e) { 72 | socket.send( 73 | JSON.stringify({ 74 | uid: uuid, 75 | language: data.language, 76 | task: data.task, 77 | model: data.modelSize, 78 | use_vad: data.useVad 79 | }) 80 | ); 81 | }; 82 | 83 | let isServerReady = false; 84 | socket.onmessage = async (event) => { 85 | const data = JSON.parse(event.data); 86 | if (data["uid"] !== uuid) 87 | return; 88 | 89 | if (data["status"] === "WAIT"){ 90 | await browser.runtime.sendMessage({ action: "showPopup", data: data["message"] }) 91 | return; 92 | } 93 | 94 | if (!isServerReady && 
data["message"] === "SERVER_READY"){ 95 | isServerReady = true; 96 | return; 97 | } 98 | 99 | if (language === null ){ 100 | language = data["language"]; 101 | await browser.runtime.sendMessage({ action: "updateSelectedLanguage", data: language }) 102 | return 103 | } 104 | 105 | if (data["message"] === "DISCONNECT"){ 106 | await browser.runtime.sendMessage({ action: "toggleCaptureButtons", data: false }) 107 | return 108 | } 109 | 110 | await browser.runtime.sendMessage({ action: "transcript", data: event.data }) 111 | .catch(function(error) { 112 | console.error("Error sending message:", error); 113 | }); 114 | }; 115 | 116 | // Access the audio stream from the current tab 117 | navigator.mediaDevices.getUserMedia({ audio: true }) 118 | .then(function(stream) { 119 | // Create a new MediaRecorder instance 120 | const audioDataCache = []; 121 | audioContext = new AudioContext(); 122 | mediaStream = audioContext.createMediaStreamSource(stream); 123 | recorder = audioContext.createScriptProcessor(4096, 1, 1); 124 | 125 | recorder.onaudioprocess = async (event) => { 126 | if (!audioContext || !isCapturing || !isServerReady || isPaused) return; 127 | 128 | const inputData = event.inputBuffer.getChannelData(0); 129 | const audioData16kHz = resampleTo16kHZ(inputData, audioContext.sampleRate); 130 | 131 | audioDataCache.push(inputData); 132 | 133 | socket.send(audioData16kHz); 134 | }; 135 | 136 | // Prevent page mute 137 | mediaStream.connect(recorder); 138 | recorder.connect(audioContext.destination); 139 | }) 140 | } 141 | 142 | var elem_container = null; 143 | var elem_text = null; 144 | 145 | var segments = []; 146 | var text_segments = []; 147 | 148 | function initPopupElement() { 149 | if (document.getElementById('popupElement')) { 150 | return; 151 | } 152 | 153 | const popupContainer = document.createElement('div'); 154 | popupContainer.id = 'popupElement'; 155 | popupContainer.style.cssText = 'position: fixed; top: 50%; left: 50%; transform: translate(-50%, -50%); background: white; color: black; padding: 16px; border-radius: 10px; box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.5); display: none; text-align: center;'; 156 | 157 | const popupText = document.createElement('span'); 158 | popupText.textContent = 'Default Text'; 159 | popupText.className = 'popupText'; 160 | popupText.style.fontSize = '24px'; 161 | popupContainer.appendChild(popupText); 162 | 163 | const buttonContainer = document.createElement('div'); 164 | buttonContainer.style.marginTop = '8px'; 165 | const closePopupButton = document.createElement('button'); 166 | closePopupButton.textContent = 'Close'; 167 | closePopupButton.style.backgroundColor = '#65428A'; 168 | closePopupButton.style.color = 'white'; 169 | closePopupButton.style.border = 'none'; 170 | closePopupButton.style.padding = '8px 16px'; // Add padding for better click area 171 | closePopupButton.style.cursor = 'pointer'; 172 | closePopupButton.addEventListener('click', async () => { 173 | popupContainer.style.display = 'none'; 174 | await browser.runtime.sendMessage({ action: 'toggleCaptureButtons', data: false }); 175 | }); 176 | buttonContainer.appendChild(closePopupButton); 177 | popupContainer.appendChild(buttonContainer); 178 | 179 | document.body.appendChild(popupContainer); 180 | } 181 | 182 | 183 | function showPopup(customText) { 184 | const popup = document.getElementById('popupElement'); 185 | const popupText = popup.querySelector('.popupText'); 186 | 187 | if (popup && popupText) { 188 | popupText.textContent = customText || 'Default Text'; // Set 
default text if custom text is not provided 189 | popup.style.display = 'block'; 190 | } 191 | } 192 | 193 | 194 | function init_element() { 195 | if (document.getElementById('transcription')) { 196 | return; 197 | } 198 | 199 | elem_container = document.createElement('div'); 200 | elem_container.id = "transcription"; 201 | elem_container.style.cssText = 'padding-top:16px;font-size:18px;line-height:18px;position:fixed;top:85%;left:50%;transform:translate(-50%,-50%);width:500px;height:90px;opacity:0.9;z-index:100;background:black;border-radius:10px;color:white;'; 202 | 203 | for (var i = 0; i < 4; i++) { 204 | elem_text = document.createElement('span'); 205 | elem_text.style.cssText = 'position: absolute;padding-left:16px;padding-right:16px;'; 206 | elem_text.id = "t" + i; 207 | elem_container.appendChild(elem_text); 208 | 209 | if (i == 3) { 210 | elem_text.style.top = "-1000px" 211 | } 212 | } 213 | 214 | document.body.appendChild(elem_container); 215 | 216 | let x = 0; 217 | let y = 0; 218 | 219 | // Query the element 220 | const ele = elem_container; 221 | 222 | // Handle the mousedown event 223 | // that's triggered when user drags the element 224 | const mouseDownHandler = function (e) { 225 | // Get the current mouse position 226 | x = e.clientX; 227 | y = e.clientY; 228 | 229 | // Attach the listeners to `document` 230 | document.addEventListener('mousemove', mouseMoveHandler); 231 | document.addEventListener('mouseup', mouseUpHandler); 232 | }; 233 | 234 | const mouseMoveHandler = function (e) { 235 | // How far the mouse has been moved 236 | const dx = e.clientX - x; 237 | const dy = e.clientY - y; 238 | 239 | // Set the position of element 240 | ele.style.top = `${ele.offsetTop + dy}px`; 241 | ele.style.left = `${ele.offsetLeft + dx}px`; 242 | 243 | // Reassign the position of mouse 244 | x = e.clientX; 245 | y = e.clientY; 246 | }; 247 | 248 | const mouseUpHandler = function () { 249 | // Remove the handlers of `mousemove` and `mouseup` 250 | document.removeEventListener('mousemove', mouseMoveHandler); 251 | document.removeEventListener('mouseup', mouseUpHandler); 252 | }; 253 | 254 | ele.addEventListener('mousedown', mouseDownHandler); 255 | } 256 | 257 | function getStyle(el,styleProp) 258 | { 259 | var x = document.getElementById(el); 260 | if (x.currentStyle) 261 | var y = x.currentStyle[styleProp]; 262 | else if (window.getComputedStyle) 263 | var y = document.defaultView.getComputedStyle(x,null).getPropertyValue(styleProp); 264 | return y; 265 | } 266 | 267 | function get_lines(elem, line_height) { 268 | var divHeight = elem.offsetHeight; 269 | var lines = divHeight / line_height; 270 | 271 | var original_text = elem.innerHTML; 272 | 273 | var words = original_text.split(' '); 274 | var segments = []; 275 | var current_lines = 1; 276 | var segment = ''; 277 | var segment_len = 0; 278 | for (var i = 0; i < words.length; i++) 279 | { 280 | segment += words[i] + ' '; 281 | elem.innerHTML = segment; 282 | divHeight = elem.offsetHeight; 283 | 284 | if ((divHeight / line_height) > current_lines) { 285 | var line_segment = segment.substring(segment_len, segment.length - 1 - words[i].length - 1); 286 | segments.push(line_segment); 287 | segment_len += line_segment.length + 1; 288 | current_lines++; 289 | } 290 | } 291 | 292 | var line_segment = segment.substring(segment_len, segment.length - 1) 293 | segments.push(line_segment); 294 | 295 | elem.innerHTML = original_text; 296 | 297 | return segments; 298 | 299 | } 300 | 301 | function remove_element() { 302 | var elem = 
document.getElementById('transcription') 303 | for (var i = 0; i < 4; i++) { 304 | document.getElementById("t" + i).remove(); 305 | } 306 | elem.remove() 307 | } 308 | 309 | browser.runtime.onMessage.addListener((request, sender, sendResponse) => { 310 | const { action, data } = request; 311 | if (action === "startCapture") { 312 | isCapturing = true; 313 | startRecording(data); 314 | } else if (action === "stopCapture") { 315 | 316 | isCapturing = false; 317 | if (socket) { 318 | socket.close(); 319 | socket = null; 320 | } 321 | 322 | if (audioContext) { 323 | audioContext.close(); 324 | audioContext = null; 325 | mediaStream = null; 326 | recorder = null; 327 | } 328 | 329 | remove_element(); 330 | 331 | } else if (action === "showWaitPopup") { 332 | 333 | initPopupElement(); 334 | 335 | showPopup(`Estimated wait time ~ ${Math.round(data)} minutes`); 336 | 337 | } else if (action === "show_transcript"){ 338 | if (!isCapturing) return; 339 | init_element(); 340 | message = JSON.parse(data); 341 | message = message["segments"]; 342 | 343 | var text = ''; 344 | for (var i = 0; i < message.length; i++) { 345 | text += message[i].text + ' '; 346 | } 347 | text = text.replace(/(\r\n|\n|\r)/gm, ""); 348 | 349 | var elem = document.getElementById('t3'); 350 | elem.innerHTML = text; 351 | 352 | var line_height_style = getStyle('t3', 'line-height'); 353 | var line_height = parseInt(line_height_style.substring(0, line_height_style.length - 2)); 354 | var divHeight = elem.offsetHeight; 355 | var lines = divHeight / line_height; 356 | 357 | text_segments = []; 358 | text_segments = get_lines(elem, line_height); 359 | 360 | elem.innerHTML = ''; 361 | 362 | if (text_segments.length > 2) { 363 | for (var i = 0; i < 3; i++) { 364 | document.getElementById('t' + i).innerHTML = text_segments[text_segments.length - 3 + i]; 365 | } 366 | } else { 367 | for (var i = 0; i < 3; i++) { 368 | document.getElementById('t' + i).innerHTML = ''; 369 | } 370 | } 371 | 372 | if (text_segments.length <= 2) { 373 | for (var i = 0; i < text_segments.length; i++) { 374 | document.getElementById('t' + i).innerHTML = text_segments[i]; 375 | } 376 | } else { 377 | for (var i = 0; i < 3; i++) { 378 | document.getElementById('t' + i).innerHTML = text_segments[text_segments.length - 3 + i]; 379 | } 380 | } 381 | 382 | for (var i = 1; i < 3; i++) 383 | { 384 | var parent_elem = document.getElementById('t' + (i - 1)); 385 | var elem = document.getElementById('t' + i); 386 | elem.style.top = parent_elem.offsetHeight + parent_elem.offsetTop + 'px'; 387 | } 388 | } 389 | sendResponse({}); 390 | }); 391 | -------------------------------------------------------------------------------- /Audio-Transcription-Firefox/icon128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/Audio-Transcription-Firefox/icon128.png -------------------------------------------------------------------------------- /Audio-Transcription-Firefox/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest_version": 2, 3 | "name": "Audio Transcription", 4 | "version": "1.0", 5 | "description": "Transcribe audio from any webpage.", 6 | "permissions": [ 7 | "storage", 8 | "activeTab", 9 | "" 10 | ], 11 | "background": { 12 | "scripts": ["background.js"], 13 | "persistent": false 14 | }, 15 | "browser_action": { 16 | "default_popup": "popup.html", 17 | "default_icon": "icon128.png" 
18 | }, 19 | "icons": { 20 | "128":"icon128.png" 21 | }, 22 | "content_scripts": [ 23 | { 24 | "matches": [""], 25 | "js": ["content.js"] 26 | } 27 | ] 28 | } -------------------------------------------------------------------------------- /Audio-Transcription-Firefox/popup.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Audio Transcription 5 | 6 | 7 | 8 | 9 |

[popup.html rendered text only; the HTML markup was stripped in this dump. The popup shows an "Audio Transcription" header, "Start Capture" and "Stop Capture" buttons, and the checkboxes and dropdowns (use server, use VAD, language, task, model size) that popup.js below looks up by id.]
22 | 23 | 128 | 136 | 152 | 153 | 154 | -------------------------------------------------------------------------------- /Audio-Transcription-Firefox/popup.js: -------------------------------------------------------------------------------- 1 | document.addEventListener("DOMContentLoaded", function() { 2 | const startButton = document.getElementById("startCapture"); 3 | const stopButton = document.getElementById("stopCapture"); 4 | 5 | const useServerCheckbox = document.getElementById("useServerCheckbox"); 6 | const useVadCheckbox = document.getElementById("useVadCheckbox"); 7 | const languageDropdown = document.getElementById('languageDropdown'); 8 | const taskDropdown = document.getElementById('taskDropdown'); 9 | const modelSizeDropdown = document.getElementById('modelSizeDropdown'); 10 | let selectedLanguage = null; 11 | let selectedTask = taskDropdown.value; 12 | let selectedModelSize = modelSizeDropdown.value; 13 | 14 | 15 | browser.storage.local.get("capturingState") 16 | .then(function(result) { 17 | const capturingState = result.capturingState; 18 | if (capturingState && capturingState.isCapturing) { 19 | toggleCaptureButtons(true); 20 | } else { 21 | toggleCaptureButtons(false); 22 | } 23 | // Enable the startButton 24 | startButton.disabled = false; 25 | }) 26 | .catch(function(error) { 27 | console.error("Error retrieving capturing state:", error); 28 | // Enable the startButton 29 | startButton.disabled = false; 30 | }); 31 | 32 | browser.storage.local.get("useServerState", ({ useServerState }) => { 33 | if (useServerState !== undefined) { 34 | useServerCheckbox.checked = useServerState; 35 | } 36 | }); 37 | 38 | browser.storage.local.get("useVadState", ({ useVadState }) => { 39 | if (useVadState !== undefined) { 40 | useVadCheckbox.checked = useVadState; 41 | } 42 | }); 43 | 44 | browser.storage.local.get("selectedLanguage", ({ selectedLanguage: storedLanguage }) => { 45 | if (storedLanguage !== undefined) { 46 | languageDropdown.value = storedLanguage; 47 | selectedLanguage = storedLanguage; 48 | } 49 | }); 50 | 51 | browser.storage.local.get("selectedTask", ({ selectedTask: storedTask }) => { 52 | if (storedTask !== undefined) { 53 | taskDropdown.value = storedTask; 54 | selectedTask = storedTask; 55 | } 56 | }); 57 | 58 | browser.storage.local.get("selectedModelSize", ({ selectedModelSize: storedModelSize }) => { 59 | if (storedModelSize !== undefined) { 60 | modelSizeDropdown.value = storedModelSize; 61 | selectedModelSize = storedModelSize; 62 | } 63 | }); 64 | 65 | startButton.addEventListener("click", function() { 66 | let host = "localhost"; 67 | let port = "9090"; 68 | const useCollaboraServer = useServerCheckbox.checked; 69 | 70 | if (useCollaboraServer){ 71 | host = "transcription.kurg.org" 72 | port = "7090" 73 | } 74 | 75 | browser.tabs.query({ active: true, currentWindow: true }) 76 | .then(function(tabs) { 77 | browser.tabs.sendMessage( 78 | tabs[0].id, 79 | { 80 | action: "startCapture", 81 | data: { 82 | host: host, 83 | port: port, 84 | language: selectedLanguage, 85 | task: selectedTask, 86 | modelSize: selectedModelSize, 87 | useVad: useVadCheckbox.checked, 88 | } 89 | }); 90 | toggleCaptureButtons(true); 91 | browser.storage.local.set({ capturingState: { isCapturing: true } }) 92 | .catch(function(error) { 93 | console.error("Error storing capturing state:", error); 94 | }); 95 | }) 96 | .catch(function(error) { 97 | console.error("Error sending startCapture message:", error); 98 | }); 99 | }); 100 | 101 | stopButton.addEventListener("click", function() { 
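// Ask the content script in the active tab to stop capturing, then reset the stored capturing state and re-enable the controls.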
102 | browser.tabs.query({ active: true, currentWindow: true }) 103 | .then(function(tabs) { 104 | browser.tabs.sendMessage(tabs[0].id, { action: "stopCapture" }) 105 | .then(function(response) { 106 | toggleCaptureButtons(false); 107 | browser.storage.local.set({ capturingState: { isCapturing: false } }) 108 | .catch(function(error) { 109 | console.error("Error storing capturing state:", error); 110 | }); 111 | }) 112 | .catch(function(error) { 113 | console.error("Error sending stopCapture message:", error); 114 | }); 115 | }) 116 | .catch(function(error) { 117 | console.error("Error querying active tab:", error); 118 | }); 119 | }); 120 | 121 | // Function to toggle the capture buttons 122 | function toggleCaptureButtons(isCapturing) { 123 | startButton.disabled = isCapturing; 124 | stopButton.disabled = !isCapturing; 125 | useServerCheckbox.disabled = isCapturing; 126 | useVadCheckbox.disabled = isCapturing; 127 | modelSizeDropdown.disabled = isCapturing; 128 | languageDropdown.disabled = isCapturing; 129 | taskDropdown.disabled = isCapturing; 130 | startButton.classList.toggle("disabled", isCapturing); 131 | stopButton.classList.toggle("disabled", !isCapturing); 132 | } 133 | 134 | // Save the checkbox state when it's toggled 135 | useServerCheckbox.addEventListener("change", () => { 136 | const useServerState = useServerCheckbox.checked; 137 | browser.storage.local.set({ useServerState }); 138 | }); 139 | 140 | useVadCheckbox.addEventListener("change", () => { 141 | const useVadState = useVadCheckbox.checked; 142 | browser.storage.local.set({ useVadState }); 143 | }); 144 | 145 | languageDropdown.addEventListener('change', function() { 146 | if (languageDropdown.value === "") { 147 | selectedLanguage = null; 148 | } else { 149 | selectedLanguage = languageDropdown.value; 150 | } 151 | browser.storage.local.set({ selectedLanguage }); 152 | }); 153 | 154 | taskDropdown.addEventListener('change', function() { 155 | selectedTask = taskDropdown.value; 156 | browser.storage.local.set({ selectedTask }); 157 | }); 158 | 159 | modelSizeDropdown.addEventListener('change', function() { 160 | selectedModelSize = modelSizeDropdown.value; 161 | browser.storage.local.set({ selectedModelSize }); 162 | }); 163 | 164 | browser.runtime.onMessage.addListener((request, sender, sendResponse) => { 165 | if (request.action === "updateSelectedLanguage") { 166 | const detectedLanguage = request.data; 167 | 168 | if (detectedLanguage) { 169 | languageDropdown.value = detectedLanguage; 170 | selectedLanguage = detectedLanguage; 171 | browser.storage.local.set({ selectedLanguage }); 172 | } 173 | } 174 | }); 175 | 176 | browser.runtime.onMessage.addListener((request, sender, sendResponse) => { 177 | if (request.action === "toggleCaptureButtons") { 178 | toggleCaptureButtons(false); 179 | browser.storage.local.set({ capturingState: { isCapturing: false } }) 180 | .catch(function(error) { 181 | console.error("Error storing capturing state:", error); 182 | }); 183 | } 184 | }); 185 | }); 186 | -------------------------------------------------------------------------------- /Audio-Transcription-Firefox/style.css: -------------------------------------------------------------------------------- 1 | .header { 2 | display: flex; 3 | align-items: center; 4 | padding-bottom: 15px; 5 | padding-left: 20px; 6 | border-bottom: 2px solid darkred; 7 | } 8 | 9 | .header-title { 10 | padding: 0 5px; 11 | } 12 | 13 | h1 { 14 | font-size: 36px; 15 | } 16 | 17 | img { 18 | height: 64px; 19 | margin: 0 20px 0 0; 20 | } 21 | 22 | h2 { 
23 | font-size: 26px; 24 | } 25 | 26 | label { 27 | font-size: 16px; 28 | } 29 | 30 | .inner { 31 | margin-left: 40px; 32 | } 33 | 34 | .options-list { 35 | padding: 0; 36 | list-style: none; 37 | } 38 | 39 | .options-list li { 40 | padding: 10px; 41 | } 42 | 43 | .time { 44 | font-size: 16px; 45 | } 46 | 47 | .limit { 48 | display: inline-block; 49 | margin: 0; 50 | font-size: 12px; 51 | } 52 | 53 | .radioChoice { 54 | margin-left: 15px; 55 | } 56 | 57 | .button-container { 58 | display: flex; 59 | justify-content: space-between; 60 | padding: 10px; 61 | } 62 | 63 | .button { 64 | padding: 10px; 65 | border: 2px solid darkred; 66 | font-size: 16px; 67 | font-weight: bold; 68 | cursor: pointer; 69 | white-space: nowrap; 70 | width: 150px; 71 | border-radius: 5px; 72 | } 73 | 74 | .disabled { 75 | opacity: 0.6; 76 | cursor: not-allowed; 77 | } 78 | 79 | .button:hover:not(:disabled) { 80 | color: red; 81 | background-color: darkred; 82 | } 83 | 84 | #save { 85 | font-size: 16px; 86 | margin-left: 50px; 87 | } 88 | 89 | #status { 90 | color: red; 91 | margin-top: 8px; 92 | margin-left: 50px; 93 | font-size: 14px; 94 | } 95 | 96 | #qualityLi { 97 | display: none; 98 | } 99 | 100 | #maxTime { 101 | width: 30px; 102 | text-align: center; 103 | } 104 | 105 | .checkbox-container { 106 | padding: 10px; 107 | } 108 | 109 | .dropdown-container { 110 | padding: 10px; 111 | } 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Vineet Suryan, Collabora Ltd. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WhisperLive 2 | 3 |

[README header (lines 4-10): centered WhisperLive logo images; HTML markup stripped in this dump]

A nearly-live implementation of OpenAI's Whisper.
11 | 12 | This project is a real-time transcription application that uses the OpenAI Whisper model 13 | to convert speech input into text output. It can be used to transcribe both live audio 14 | input from microphone and pre-recorded audio files. 15 | 16 | - [Installation](#installation) 17 | - [Getting Started](#getting-started) 18 | - [Running the Server](#running-the-server) 19 | - [Running the Client](#running-the-client) 20 | - [Browser Extensions](#browser-extensions) 21 | - [Whisper Live Server in Docker](#whisper-live-server-in-docker) 22 | - [Future Work](#future-work) 23 | - [Blog Posts](#blog-posts) 24 | - [Contact](#contact) 25 | - [Citations](#citations) 26 | 27 | ## Installation 28 | - Install PyAudio 29 | ```bash 30 | bash scripts/setup.sh 31 | ``` 32 | 33 | - Install whisper-live from pip 34 | ```bash 35 | pip install whisper-live 36 | ``` 37 | 38 | ### Setting up NVIDIA/TensorRT-LLM for TensorRT backend 39 | - Please follow [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md) for setup of [NVIDIA/TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) and for building Whisper-TensorRT engine. 40 | 41 | ## Getting Started 42 | The server supports 3 backends `faster_whisper`, `tensorrt` and `openvino`. If running `tensorrt` backend follow [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md) 43 | 44 | ### Running the Server 45 | - [Faster Whisper](https://github.com/SYSTRAN/faster-whisper) backend 46 | ```bash 47 | python3 run_server.py --port 9090 \ 48 | --backend faster_whisper 49 | 50 | # running with custom model and cache_dir to save auto-converted ctranslate2 models 51 | python3 run_server.py --port 9090 \ 52 | --backend faster_whisper \ 53 | -fw "/path/to/custom/faster/whisper/model" 54 | -c ~/.cache/whisper-live/ 55 | ``` 56 | 57 | - TensorRT backend. Currently, we recommend to only use the docker setup for TensorRT. Follow [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md) which works as expected. Make sure to build your TensorRT Engines before running the server with TensorRT backend. 58 | ```bash 59 | # Run English only model 60 | python3 run_server.py -p 9090 \ 61 | -b tensorrt \ 62 | -trt /home/TensorRT-LLM/examples/whisper/whisper_small_en 63 | 64 | # Run Multilingual model 65 | python3 run_server.py -p 9090 \ 66 | -b tensorrt \ 67 | -trt /home/TensorRT-LLM/examples/whisper/whisper_small \ 68 | -m 69 | ``` 70 | 71 | - WhisperLive now supports the [OpenVINO](https://github.com/openvinotoolkit/openvino) backend for efficient inference on Intel CPUs, iGPU and dGPUs. Currently, we tested the models uploaded to [huggingface by OpenVINO](https://huggingface.co/OpenVINO?search_models=whisper). 72 | - > **Docker Recommended:** Running WhisperLive with OpenVINO inside Docker automatically enables GPU support (iGPU/dGPU) without requiring additional host setup. 73 | - > **Native (non-Docker) Use:** If you prefer running outside Docker, ensure the Intel drivers and OpenVINO runtime are installed and properly configured on your system. Refer to the documentation for [installing OpenVINO](https://docs.openvino.ai/2025/get-started/install-openvino.html?PACKAGE=OPENVINO_BASE&VERSION=v_2025_0_0&OP_SYSTEM=LINUX&DISTRIBUTION=PIP#). 
74 | 75 | ``` 76 | python3 run_server.py -p 9090 -b openvino 77 | ``` 78 | 79 | 80 | #### Controlling OpenMP Threads 81 | To control the number of threads used by OpenMP, you can set the `OMP_NUM_THREADS` environment variable. This is useful for managing CPU resources and ensuring consistent performance. If not specified, `OMP_NUM_THREADS` is set to `1` by default. You can change this by using the `--omp_num_threads` argument: 82 | ```bash 83 | python3 run_server.py --port 9090 \ 84 | --backend faster_whisper \ 85 | --omp_num_threads 4 86 | ``` 87 | 88 | #### Single model mode 89 | By default, when running the server without specifying a model, the server will instantiate a new whisper model for every client connection. This has the advantage, that the server can use different model sizes, based on the client's requested model size. On the other hand, it also means you have to wait for the model to be loaded upon client connection and you will have increased (V)RAM usage. 90 | 91 | When serving a custom TensorRT model using the `-trt` or a custom faster_whisper model using the `-fw` option, the server will instead only instantiate the custom model once and then reuse it for all client connections. 92 | 93 | If you don't want this, set `--no_single_model`. 94 | 95 | 96 | ### Running the Client 97 | - Initializing the client with below parameters: 98 | - `lang`: Language of the input audio, applicable only if using a multilingual model. 99 | - `translate`: If set to `True` then translate from any language to `en`. 100 | - `model`: Whisper model size. 101 | - `use_vad`: Whether to use `Voice Activity Detection` on the server. 102 | - `save_output_recording`: Set to True to save the microphone input as a `.wav` file during live transcription. This option is helpful for recording sessions for later playback or analysis. Defaults to `False`. 103 | - `output_recording_filename`: Specifies the `.wav` file path where the microphone input will be saved if `save_output_recording` is set to `True`. 104 | - `max_clients`: Specifies the maximum number of clients the server should allow. Defaults to 4. 105 | - `max_connection_time`: Maximum connection time for each client in seconds. Defaults to 600. 106 | - `mute_audio_playback`: Whether to mute audio playback when transcribing an audio file. Defaults to False. 107 | 108 | ```python 109 | from whisper_live.client import TranscriptionClient 110 | client = TranscriptionClient( 111 | "localhost", 112 | 9090, 113 | lang="en", 114 | translate=False, 115 | model="small", # also support hf_model => `Systran/faster-whisper-small` 116 | use_vad=False, 117 | save_output_recording=True, # Only used for microphone input, False by Default 118 | output_recording_filename="./output_recording.wav", # Only used for microphone input 119 | max_clients=4, 120 | max_connection_time=600, 121 | mute_audio_playback=False, # Only used for file input, False by Default 122 | ) 123 | ``` 124 | It connects to the server running on localhost at port 9090. Using a multilingual model, language for the transcription will be automatically detected. You can also use the language option to specify the target language for the transcription, in this case, English ("en"). The translate option should be set to `True` if we want to translate from the source language to English and `False` if we want to transcribe in the source language. 
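For example, a client set up for translation rather than plain transcription only needs `translate=True`. The snippet below is a minimal sketch that assumes a server is already listening on localhost:9090 and reuses the parameters documented above; it is then invoked exactly as in the examples that follow.

```python
from whisper_live.client import TranscriptionClient

# Translate the detected source language to English instead of transcribing it verbatim.
client = TranscriptionClient(
    "localhost",
    9090,
    translate=True,  # the target language is always English when translating
    model="small",
    use_vad=True,
)
```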
125 | 126 | - Transcribe an audio file: 127 | ```python 128 | client("tests/jfk.wav") 129 | ``` 130 | 131 | - To transcribe from microphone: 132 | ```python 133 | client() 134 | ``` 135 | 136 | - To transcribe from a RTSP stream: 137 | ```python 138 | client(rtsp_url="rtsp://admin:admin@192.168.0.1/rtsp") 139 | ``` 140 | 141 | - To transcribe from a HLS stream: 142 | ```python 143 | client(hls_url="http://as-hls-ww-live.akamaized.net/pool_904/live/ww/bbc_1xtra/bbc_1xtra.isml/bbc_1xtra-audio%3d96000.norewind.m3u8") 144 | ``` 145 | 146 | ## Browser Extensions 147 | - Run the server with your desired backend as shown [here](https://github.com/collabora/WhisperLive?tab=readme-ov-file#running-the-server). 148 | - Transcribe audio directly from your browser using our Chrome or Firefox extensions. Refer to [Audio-Transcription-Chrome](https://github.com/collabora/whisper-live/tree/main/Audio-Transcription-Chrome#readme) and https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md 149 | 150 | ## Whisper Live Server in Docker 151 | - GPU 152 | - Faster-Whisper 153 | ```bash 154 | docker run -it --gpus all -p 9090:9090 ghcr.io/collabora/whisperlive-gpu:latest 155 | ``` 156 | 157 | - TensorRT. Refer to [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md) for setup and more tensorrt backend configurations. 158 | ```bash 159 | docker build . -f docker/Dockerfile.tensorrt -t whisperlive-tensorrt 160 | docker run -p 9090:9090 --runtime=nvidia --entrypoint /bin/bash -it whisperlive-tensorrt 161 | 162 | # Build small.en engine 163 | bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en # float16 164 | bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int8 # int8 weight only quantization 165 | bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int4 # int4 weight only quantization 166 | 167 | # Run server with small.en 168 | python3 run_server.py --port 9090 \ 169 | --backend tensorrt \ 170 | --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_float16" 171 | --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_int8" 172 | --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_int4" 173 | ``` 174 | 175 | - OpenVINO 176 | ``` 177 | docker run -it --device=/dev/dri -p 9090:9090 ghcr.io/collabora/whisperlive-openvino 178 | ``` 179 | 180 | - CPU 181 | - Faster-whisper 182 | ```bash 183 | docker run -it -p 9090:9090 ghcr.io/collabora/whisperlive-cpu:latest 184 | ``` 185 | 186 | ## Future Work 187 | - [ ] Add translation to other languages on top of transcription. 188 | 189 | ## Blog Posts 190 | - [Transforming speech technology with WhisperLive](https://www.collabora.com/news-and-blog/blog/2024/05/28/transforming-speech-technology-with-whisperlive/) 191 | - [WhisperFusion: Ultra-low latency conversations with an AI chatbot](https://www.collabora.com/news-and-blog/news-and-events/whisperfusion-ultra-low-latency-conversations-with-an-ai-chatbot.html) powered by WhisperLive 192 | - [Breaking language barriers 2.0: Moving closer towards fully reliable, production-ready Hindi ASR](https://www.collabora.com/news-and-blog/news-and-events/breaking-language-barriers-20-moving-closer-production-ready-hindi-asr.html) which is used in WhisperLive for hindi. 193 | 194 | ## Contact 195 | 196 | We are available to help you with both Open Source and proprietary AI projects. 
You can reach us via the Collabora website or [vineet.suryan@collabora.com](mailto:vineet.suryan@collabora.com) and [marcus.edel@collabora.com](mailto:marcus.edel@collabora.com). 197 | 198 | 199 | ## Citations 200 | ```bibtex 201 | @article{Whisper 202 | title = {Robust Speech Recognition via Large-Scale Weak Supervision}, 203 | url = {https://arxiv.org/abs/2212.04356}, 204 | author = {Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya}, 205 | publisher = {arXiv}, 206 | year = {2022}, 207 | } 208 | ``` 209 | 210 | ```bibtex 211 | @misc{Silero VAD, 212 | author = {Silero Team}, 213 | title = {Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier}, 214 | year = {2021}, 215 | publisher = {GitHub}, 216 | journal = {GitHub repository}, 217 | howpublished = {\url{https://github.com/snakers4/silero-vad}}, 218 | email = {hello@silero.ai} 219 | } 220 | -------------------------------------------------------------------------------- /TensorRT_whisper.md: -------------------------------------------------------------------------------- 1 | # WhisperLive-TensorRT 2 | We have only tested the TensorRT backend in docker so, we recommend docker for a smooth TensorRT backend setup. 3 | **Note**: We use `tensorrt_llm==0.18.2` 4 | 5 | ## Installation 6 | - Install [docker](https://docs.docker.com/engine/install/) 7 | - Install [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) 8 | 9 | - Run WhisperLive TensorRT in docker 10 | ```bash 11 | docker build . -f docker/Dockerfile.tensorrt -t whisperlive-tensorrt 12 | docker run -p 9090:9090 --runtime=nvidia --gpus all --entrypoint /bin/bash -it whisperlive-tensorrt 13 | ``` 14 | 15 | ## Whisper TensorRT Engine 16 | - We build `small.en` and `small` multilingual TensorRT engine as examples below. The script logs the path of the directory with Whisper TensorRT engine. We need that model_path to run the server. 
17 | ```bash 18 | # convert small.en 19 | bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en # float16 20 | bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int8 # int8 weight only quantization 21 | bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int4 # int4 weight only quantization 22 | 23 | # convert small multilingual model 24 | bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small 25 | ``` 26 | 27 | ## Run WhisperLive Server with TensorRT Backend 28 | ```bash 29 | # Run English only model 30 | python3 run_server.py --port 9090 \ 31 | --backend tensorrt \ 32 | --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_float16" 33 | 34 | # Run Multilingual model 35 | python3 run_server.py --port 9090 \ 36 | --backend tensorrt \ 37 | --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_float16" \ 38 | --trt_multilingual 39 | ``` 40 | 41 | By default trt_backend uses cpp_session, to use python session pass `--trt_py_session` to run_server.py 42 | ```bash 43 | python3 run_server.py --port 9090 \ 44 | --backend tensorrt \ 45 | --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_float16" \ 46 | --trt_py_session 47 | ``` -------------------------------------------------------------------------------- /assets/jfk.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/assets/jfk.flac -------------------------------------------------------------------------------- /docker/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM python:3.10-bookworm 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | 5 | # install lib required for pyaudio 6 | RUN apt update && apt install -y portaudio19-dev && apt-get clean && rm -rf /var/lib/apt/lists/* 7 | 8 | # update pip to support for whl.metadata -> less downloading 9 | RUN pip install --no-cache-dir -U "pip>=24" 10 | 11 | # create a working directory 12 | RUN mkdir /app 13 | WORKDIR /app 14 | 15 | # install pytorch, but without the nvidia-libs that are only necessary for gpu 16 | RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu 17 | 18 | # install the requirements for running the whisper-live server 19 | COPY requirements/server.txt /app/ 20 | RUN pip install --no-cache-dir -r server.txt && rm server.txt 21 | 22 | COPY whisper_live /app/whisper_live 23 | COPY run_server.py /app 24 | 25 | CMD ["python", "run_server.py"] 26 | -------------------------------------------------------------------------------- /docker/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM python:3.10-bookworm 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | 5 | # install lib required for pyaudio 6 | RUN apt update && apt install -y portaudio19-dev && apt-get clean && rm -rf /var/lib/apt/lists/* 7 | 8 | # update pip to support for whl.metadata -> less downloading 9 | RUN pip install --no-cache-dir -U "pip>=24" 10 | 11 | # create a working directory 12 | RUN mkdir /app 13 | WORKDIR /app 14 | 15 | # install the requirements for running the whisper-live server 16 | COPY requirements/server.txt /app/ 17 | RUN pip install --no-cache-dir -r server.txt && rm server.txt 18 | 19 | # make the paths of the nvidia libs installed as wheels visible. 
equivalent to: 20 | # export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'` 21 | ENV LD_LIBRARY_PATH="/usr/local/lib/python3.10/site-packages/nvidia/cublas/lib:/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib" 22 | 23 | COPY whisper_live /app/whisper_live 24 | COPY run_server.py /app 25 | 26 | CMD ["python", "run_server.py"] 27 | -------------------------------------------------------------------------------- /docker/Dockerfile.openvino: -------------------------------------------------------------------------------- 1 | FROM openvino/ubuntu22_runtime:latest 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | 5 | USER root 6 | 7 | RUN apt update && apt install -y portaudio19-dev python-is-python3 && apt-get clean && rm -rf /var/lib/apt/lists/* 8 | 9 | RUN pip install --no-cache-dir -U "pip>=24" 10 | 11 | RUN mkdir /app 12 | WORKDIR /app 13 | 14 | COPY requirements/server.txt /app/ 15 | RUN pip install --no-cache-dir -r server.txt && rm server.txt 16 | 17 | COPY whisper_live /app/whisper_live 18 | COPY run_server.py /app 19 | CMD ["python", "run_server.py", "--backend", "openvino"] 20 | -------------------------------------------------------------------------------- /docker/Dockerfile.tensorrt: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.8.1-base-ubuntu22.04 AS base 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | 5 | RUN apt-get update && apt-get install -y \ 6 | python3.10 python3-pip openmpi-bin libopenmpi-dev git git-lfs wget \ 7 | && apt install python-is-python3 \ 8 | && pip install --upgrade pip setuptools \ 9 | && rm -rf /var/lib/apt/lists/* 10 | 11 | FROM base AS devel 12 | RUN pip install --no-cache-dir -U tensorrt_llm==0.18.2 --extra-index-url https://pypi.nvidia.com 13 | WORKDIR /app 14 | RUN git clone -b v0.18.2 https://github.com/NVIDIA/TensorRT-LLM.git \ 15 | && mv TensorRT-LLM/examples ./TensorRT-LLM-examples \ 16 | && rm -rf TensorRT-LLM 17 | 18 | FROM devel AS release 19 | WORKDIR /app 20 | COPY assets/ ./assets 21 | RUN wget -nc -P assets/ https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz 22 | 23 | COPY scripts/setup.sh ./ 24 | RUN apt update && bash setup.sh && rm setup.sh 25 | 26 | COPY requirements/server.txt . 27 | RUN pip install --no-cache-dir -r server.txt && rm server.txt 28 | COPY whisper_live ./whisper_live 29 | COPY scripts/build_whisper_tensorrt.sh . 30 | COPY run_server.py . 
-------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/docs/.nojekyll -------------------------------------------------------------------------------- /docs/doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/docs/doctrees/environment.pickle -------------------------------------------------------------------------------- /docs/doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/docs/doctrees/index.doctree -------------------------------------------------------------------------------- /docs/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: 7b818b47e6f359b937e5a2517f120d43 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/html/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. whisper_live documentation master file, created by 2 | sphinx-quickstart on Fri Sep 22 11:39:30 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Whisper Live documentation! 7 | ======================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | 12 | 13 | .. automodule:: whisper_live.server 14 | :members: 15 | 16 | .. automodule:: whisper_live.client 17 | :members: 18 | 19 | 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | -------------------------------------------------------------------------------- /docs/html/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* This file intentionally left blank. */ 2 | -------------------------------------------------------------------------------- /docs/html/_static/doctools.js: -------------------------------------------------------------------------------- 1 | /* 2 | * doctools.js 3 | * ~~~~~~~~~~~ 4 | * 5 | * Base JavaScript utilities for all Sphinx HTML documentation. 6 | * 7 | * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 9 | * 10 | */ 11 | "use strict"; 12 | 13 | const BLACKLISTED_KEY_CONTROL_ELEMENTS = new Set([ 14 | "TEXTAREA", 15 | "INPUT", 16 | "SELECT", 17 | "BUTTON", 18 | ]); 19 | 20 | const _ready = (callback) => { 21 | if (document.readyState !== "loading") { 22 | callback(); 23 | } else { 24 | document.addEventListener("DOMContentLoaded", callback); 25 | } 26 | }; 27 | 28 | /** 29 | * Small JavaScript module for the documentation. 30 | */ 31 | const Documentation = { 32 | init: () => { 33 | Documentation.initDomainIndexTable(); 34 | Documentation.initOnKeyListeners(); 35 | }, 36 | 37 | /** 38 | * i18n support 39 | */ 40 | TRANSLATIONS: {}, 41 | PLURAL_EXPR: (n) => (n === 1 ? 
0 : 1), 42 | LOCALE: "unknown", 43 | 44 | // gettext and ngettext don't access this so that the functions 45 | // can safely bound to a different name (_ = Documentation.gettext) 46 | gettext: (string) => { 47 | const translated = Documentation.TRANSLATIONS[string]; 48 | switch (typeof translated) { 49 | case "undefined": 50 | return string; // no translation 51 | case "string": 52 | return translated; // translation exists 53 | default: 54 | return translated[0]; // (singular, plural) translation tuple exists 55 | } 56 | }, 57 | 58 | ngettext: (singular, plural, n) => { 59 | const translated = Documentation.TRANSLATIONS[singular]; 60 | if (typeof translated !== "undefined") 61 | return translated[Documentation.PLURAL_EXPR(n)]; 62 | return n === 1 ? singular : plural; 63 | }, 64 | 65 | addTranslations: (catalog) => { 66 | Object.assign(Documentation.TRANSLATIONS, catalog.messages); 67 | Documentation.PLURAL_EXPR = new Function( 68 | "n", 69 | `return (${catalog.plural_expr})` 70 | ); 71 | Documentation.LOCALE = catalog.locale; 72 | }, 73 | 74 | /** 75 | * helper function to focus on search bar 76 | */ 77 | focusSearchBar: () => { 78 | document.querySelectorAll("input[name=q]")[0]?.focus(); 79 | }, 80 | 81 | /** 82 | * Initialise the domain index toggle buttons 83 | */ 84 | initDomainIndexTable: () => { 85 | const toggler = (el) => { 86 | const idNumber = el.id.substr(7); 87 | const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`); 88 | if (el.src.substr(-9) === "minus.png") { 89 | el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`; 90 | toggledRows.forEach((el) => (el.style.display = "none")); 91 | } else { 92 | el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`; 93 | toggledRows.forEach((el) => (el.style.display = "")); 94 | } 95 | }; 96 | 97 | const togglerElements = document.querySelectorAll("img.toggler"); 98 | togglerElements.forEach((el) => 99 | el.addEventListener("click", (event) => toggler(event.currentTarget)) 100 | ); 101 | togglerElements.forEach((el) => (el.style.display = "")); 102 | if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler); 103 | }, 104 | 105 | initOnKeyListeners: () => { 106 | // only install a listener if it is really needed 107 | if ( 108 | !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && 109 | !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS 110 | ) 111 | return; 112 | 113 | document.addEventListener("keydown", (event) => { 114 | // bail for input elements 115 | if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; 116 | // bail with special keys 117 | if (event.altKey || event.ctrlKey || event.metaKey) return; 118 | 119 | if (!event.shiftKey) { 120 | switch (event.key) { 121 | case "ArrowLeft": 122 | if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; 123 | 124 | const prevLink = document.querySelector('link[rel="prev"]'); 125 | if (prevLink && prevLink.href) { 126 | window.location.href = prevLink.href; 127 | event.preventDefault(); 128 | } 129 | break; 130 | case "ArrowRight": 131 | if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; 132 | 133 | const nextLink = document.querySelector('link[rel="next"]'); 134 | if (nextLink && nextLink.href) { 135 | window.location.href = nextLink.href; 136 | event.preventDefault(); 137 | } 138 | break; 139 | } 140 | } 141 | 142 | // some keyboard layouts may need Shift to get / 143 | switch (event.key) { 144 | case "/": 145 | if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; 146 | Documentation.focusSearchBar(); 147 | 
event.preventDefault(); 148 | } 149 | }); 150 | }, 151 | }; 152 | 153 | // quick alias for translations 154 | const _ = Documentation.gettext; 155 | 156 | _ready(Documentation.init); 157 | -------------------------------------------------------------------------------- /docs/html/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | const DOCUMENTATION_OPTIONS = { 2 | VERSION: '', 3 | LANGUAGE: 'en', 4 | COLLAPSE_INDEX: false, 5 | BUILDER: 'html', 6 | FILE_SUFFIX: '.html', 7 | LINK_SUFFIX: '.html', 8 | HAS_SOURCE: true, 9 | SOURCELINK_SUFFIX: '.txt', 10 | NAVIGATION_WITH_KEYS: false, 11 | SHOW_SEARCH_SUMMARY: true, 12 | ENABLE_SEARCH_SHORTCUTS: true, 13 | }; -------------------------------------------------------------------------------- /docs/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/docs/html/_static/file.png -------------------------------------------------------------------------------- /docs/html/_static/language_data.js: -------------------------------------------------------------------------------- 1 | /* 2 | * language_data.js 3 | * ~~~~~~~~~~~~~~~~ 4 | * 5 | * This script contains the language-specific data used by searchtools.js, 6 | * namely the list of stopwords, stemmer, scorer and splitter. 7 | * 8 | * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. 9 | * :license: BSD, see LICENSE for details. 10 | * 11 | */ 12 | 13 | var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; 14 | 15 | 16 | /* Non-minified version is copied as a separate JS file, is available */ 17 | 18 | /** 19 | * Porter Stemmer 20 | */ 21 | var Stemmer = function() { 22 | 23 | var step2list = { 24 | ational: 'ate', 25 | tional: 'tion', 26 | enci: 'ence', 27 | anci: 'ance', 28 | izer: 'ize', 29 | bli: 'ble', 30 | alli: 'al', 31 | entli: 'ent', 32 | eli: 'e', 33 | ousli: 'ous', 34 | ization: 'ize', 35 | ation: 'ate', 36 | ator: 'ate', 37 | alism: 'al', 38 | iveness: 'ive', 39 | fulness: 'ful', 40 | ousness: 'ous', 41 | aliti: 'al', 42 | iviti: 'ive', 43 | biliti: 'ble', 44 | logi: 'log' 45 | }; 46 | 47 | var step3list = { 48 | icate: 'ic', 49 | ative: '', 50 | alize: 'al', 51 | iciti: 'ic', 52 | ical: 'ic', 53 | ful: '', 54 | ness: '' 55 | }; 56 | 57 | var c = "[^aeiou]"; // consonant 58 | var v = "[aeiouy]"; // vowel 59 | var C = c + "[^aeiouy]*"; // consonant sequence 60 | var V = v + "[aeiou]*"; // vowel sequence 61 | 62 | var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0 63 | var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 64 | var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 65 | var s_v = "^(" + C + ")?" 
+ v; // vowel in stem 66 | 67 | this.stemWord = function (w) { 68 | var stem; 69 | var suffix; 70 | var firstch; 71 | var origword = w; 72 | 73 | if (w.length < 3) 74 | return w; 75 | 76 | var re; 77 | var re2; 78 | var re3; 79 | var re4; 80 | 81 | firstch = w.substr(0,1); 82 | if (firstch == "y") 83 | w = firstch.toUpperCase() + w.substr(1); 84 | 85 | // Step 1a 86 | re = /^(.+?)(ss|i)es$/; 87 | re2 = /^(.+?)([^s])s$/; 88 | 89 | if (re.test(w)) 90 | w = w.replace(re,"$1$2"); 91 | else if (re2.test(w)) 92 | w = w.replace(re2,"$1$2"); 93 | 94 | // Step 1b 95 | re = /^(.+?)eed$/; 96 | re2 = /^(.+?)(ed|ing)$/; 97 | if (re.test(w)) { 98 | var fp = re.exec(w); 99 | re = new RegExp(mgr0); 100 | if (re.test(fp[1])) { 101 | re = /.$/; 102 | w = w.replace(re,""); 103 | } 104 | } 105 | else if (re2.test(w)) { 106 | var fp = re2.exec(w); 107 | stem = fp[1]; 108 | re2 = new RegExp(s_v); 109 | if (re2.test(stem)) { 110 | w = stem; 111 | re2 = /(at|bl|iz)$/; 112 | re3 = new RegExp("([^aeiouylsz])\\1$"); 113 | re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 114 | if (re2.test(w)) 115 | w = w + "e"; 116 | else if (re3.test(w)) { 117 | re = /.$/; 118 | w = w.replace(re,""); 119 | } 120 | else if (re4.test(w)) 121 | w = w + "e"; 122 | } 123 | } 124 | 125 | // Step 1c 126 | re = /^(.+?)y$/; 127 | if (re.test(w)) { 128 | var fp = re.exec(w); 129 | stem = fp[1]; 130 | re = new RegExp(s_v); 131 | if (re.test(stem)) 132 | w = stem + "i"; 133 | } 134 | 135 | // Step 2 136 | re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; 137 | if (re.test(w)) { 138 | var fp = re.exec(w); 139 | stem = fp[1]; 140 | suffix = fp[2]; 141 | re = new RegExp(mgr0); 142 | if (re.test(stem)) 143 | w = stem + step2list[suffix]; 144 | } 145 | 146 | // Step 3 147 | re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; 148 | if (re.test(w)) { 149 | var fp = re.exec(w); 150 | stem = fp[1]; 151 | suffix = fp[2]; 152 | re = new RegExp(mgr0); 153 | if (re.test(stem)) 154 | w = stem + step3list[suffix]; 155 | } 156 | 157 | // Step 4 158 | re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; 159 | re2 = /^(.+?)(s|t)(ion)$/; 160 | if (re.test(w)) { 161 | var fp = re.exec(w); 162 | stem = fp[1]; 163 | re = new RegExp(mgr1); 164 | if (re.test(stem)) 165 | w = stem; 166 | } 167 | else if (re2.test(w)) { 168 | var fp = re2.exec(w); 169 | stem = fp[1] + fp[2]; 170 | re2 = new RegExp(mgr1); 171 | if (re2.test(stem)) 172 | w = stem; 173 | } 174 | 175 | // Step 5 176 | re = /^(.+?)e$/; 177 | if (re.test(w)) { 178 | var fp = re.exec(w); 179 | stem = fp[1]; 180 | re = new RegExp(mgr1); 181 | re2 = new RegExp(meq1); 182 | re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 183 | if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) 184 | w = stem; 185 | } 186 | re = /ll$/; 187 | re2 = new RegExp(mgr1); 188 | if (re.test(w) && re2.test(w)) { 189 | re = /.$/; 190 | w = w.replace(re,""); 191 | } 192 | 193 | // and turn initial Y back to y 194 | if (firstch == "y") 195 | w = firstch.toLowerCase() + w.substr(1); 196 | return w; 197 | } 198 | } 199 | 200 | -------------------------------------------------------------------------------- /docs/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/docs/html/_static/minus.png 
-------------------------------------------------------------------------------- /docs/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/docs/html/_static/plus.png -------------------------------------------------------------------------------- /docs/html/_static/pygments.css: -------------------------------------------------------------------------------- 1 | pre { line-height: 125%; } 2 | td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } 3 | span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } 4 | td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } 5 | span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } 6 | .highlight .hll { background-color: #ffffcc } 7 | .highlight { background: #f8f8f8; } 8 | .highlight .c { color: #8f5902; font-style: italic } /* Comment */ 9 | .highlight .err { color: #a40000; border: 1px solid #ef2929 } /* Error */ 10 | .highlight .g { color: #000000 } /* Generic */ 11 | .highlight .k { color: #004461; font-weight: bold } /* Keyword */ 12 | .highlight .l { color: #000000 } /* Literal */ 13 | .highlight .n { color: #000000 } /* Name */ 14 | .highlight .o { color: #582800 } /* Operator */ 15 | .highlight .x { color: #000000 } /* Other */ 16 | .highlight .p { color: #000000; font-weight: bold } /* Punctuation */ 17 | .highlight .ch { color: #8f5902; font-style: italic } /* Comment.Hashbang */ 18 | .highlight .cm { color: #8f5902; font-style: italic } /* Comment.Multiline */ 19 | .highlight .cp { color: #8f5902 } /* Comment.Preproc */ 20 | .highlight .cpf { color: #8f5902; font-style: italic } /* Comment.PreprocFile */ 21 | .highlight .c1 { color: #8f5902; font-style: italic } /* Comment.Single */ 22 | .highlight .cs { color: #8f5902; font-style: italic } /* Comment.Special */ 23 | .highlight .gd { color: #a40000 } /* Generic.Deleted */ 24 | .highlight .ge { color: #000000; font-style: italic } /* Generic.Emph */ 25 | .highlight .ges { color: #000000 } /* Generic.EmphStrong */ 26 | .highlight .gr { color: #ef2929 } /* Generic.Error */ 27 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ 28 | .highlight .gi { color: #00A000 } /* Generic.Inserted */ 29 | .highlight .go { color: #888888 } /* Generic.Output */ 30 | .highlight .gp { color: #745334 } /* Generic.Prompt */ 31 | .highlight .gs { color: #000000; font-weight: bold } /* Generic.Strong */ 32 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ 33 | .highlight .gt { color: #a40000; font-weight: bold } /* Generic.Traceback */ 34 | .highlight .kc { color: #004461; font-weight: bold } /* Keyword.Constant */ 35 | .highlight .kd { color: #004461; font-weight: bold } /* Keyword.Declaration */ 36 | .highlight .kn { color: #004461; font-weight: bold } /* Keyword.Namespace */ 37 | .highlight .kp { color: #004461; font-weight: bold } /* Keyword.Pseudo */ 38 | .highlight .kr { color: #004461; font-weight: bold } /* Keyword.Reserved */ 39 | .highlight .kt { color: #004461; font-weight: bold } /* Keyword.Type */ 40 | .highlight .ld { color: #000000 } /* Literal.Date */ 41 | .highlight .m { color: #990000 } /* Literal.Number */ 42 | .highlight .s { color: #4e9a06 } /* Literal.String */ 43 | .highlight .na { color: 
#c4a000 } /* Name.Attribute */ 44 | .highlight .nb { color: #004461 } /* Name.Builtin */ 45 | .highlight .nc { color: #000000 } /* Name.Class */ 46 | .highlight .no { color: #000000 } /* Name.Constant */ 47 | .highlight .nd { color: #888888 } /* Name.Decorator */ 48 | .highlight .ni { color: #ce5c00 } /* Name.Entity */ 49 | .highlight .ne { color: #cc0000; font-weight: bold } /* Name.Exception */ 50 | .highlight .nf { color: #000000 } /* Name.Function */ 51 | .highlight .nl { color: #f57900 } /* Name.Label */ 52 | .highlight .nn { color: #000000 } /* Name.Namespace */ 53 | .highlight .nx { color: #000000 } /* Name.Other */ 54 | .highlight .py { color: #000000 } /* Name.Property */ 55 | .highlight .nt { color: #004461; font-weight: bold } /* Name.Tag */ 56 | .highlight .nv { color: #000000 } /* Name.Variable */ 57 | .highlight .ow { color: #004461; font-weight: bold } /* Operator.Word */ 58 | .highlight .pm { color: #000000; font-weight: bold } /* Punctuation.Marker */ 59 | .highlight .w { color: #f8f8f8; text-decoration: underline } /* Text.Whitespace */ 60 | .highlight .mb { color: #990000 } /* Literal.Number.Bin */ 61 | .highlight .mf { color: #990000 } /* Literal.Number.Float */ 62 | .highlight .mh { color: #990000 } /* Literal.Number.Hex */ 63 | .highlight .mi { color: #990000 } /* Literal.Number.Integer */ 64 | .highlight .mo { color: #990000 } /* Literal.Number.Oct */ 65 | .highlight .sa { color: #4e9a06 } /* Literal.String.Affix */ 66 | .highlight .sb { color: #4e9a06 } /* Literal.String.Backtick */ 67 | .highlight .sc { color: #4e9a06 } /* Literal.String.Char */ 68 | .highlight .dl { color: #4e9a06 } /* Literal.String.Delimiter */ 69 | .highlight .sd { color: #8f5902; font-style: italic } /* Literal.String.Doc */ 70 | .highlight .s2 { color: #4e9a06 } /* Literal.String.Double */ 71 | .highlight .se { color: #4e9a06 } /* Literal.String.Escape */ 72 | .highlight .sh { color: #4e9a06 } /* Literal.String.Heredoc */ 73 | .highlight .si { color: #4e9a06 } /* Literal.String.Interpol */ 74 | .highlight .sx { color: #4e9a06 } /* Literal.String.Other */ 75 | .highlight .sr { color: #4e9a06 } /* Literal.String.Regex */ 76 | .highlight .s1 { color: #4e9a06 } /* Literal.String.Single */ 77 | .highlight .ss { color: #4e9a06 } /* Literal.String.Symbol */ 78 | .highlight .bp { color: #3465a4 } /* Name.Builtin.Pseudo */ 79 | .highlight .fm { color: #000000 } /* Name.Function.Magic */ 80 | .highlight .vc { color: #000000 } /* Name.Variable.Class */ 81 | .highlight .vg { color: #000000 } /* Name.Variable.Global */ 82 | .highlight .vi { color: #000000 } /* Name.Variable.Instance */ 83 | .highlight .vm { color: #000000 } /* Name.Variable.Magic */ 84 | .highlight .il { color: #990000 } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /docs/html/_static/sphinx_highlight.js: -------------------------------------------------------------------------------- 1 | /* Highlighting utilities for Sphinx HTML documentation. */ 2 | "use strict"; 3 | 4 | const SPHINX_HIGHLIGHT_ENABLED = true 5 | 6 | /** 7 | * highlight a given string on a node by wrapping it in 8 | * span elements with the given class name. 
9 | */ 10 | const _highlight = (node, addItems, text, className) => { 11 | if (node.nodeType === Node.TEXT_NODE) { 12 | const val = node.nodeValue; 13 | const parent = node.parentNode; 14 | const pos = val.toLowerCase().indexOf(text); 15 | if ( 16 | pos >= 0 && 17 | !parent.classList.contains(className) && 18 | !parent.classList.contains("nohighlight") 19 | ) { 20 | let span; 21 | 22 | const closestNode = parent.closest("body, svg, foreignObject"); 23 | const isInSVG = closestNode && closestNode.matches("svg"); 24 | if (isInSVG) { 25 | span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); 26 | } else { 27 | span = document.createElement("span"); 28 | span.classList.add(className); 29 | } 30 | 31 | span.appendChild(document.createTextNode(val.substr(pos, text.length))); 32 | const rest = document.createTextNode(val.substr(pos + text.length)); 33 | parent.insertBefore( 34 | span, 35 | parent.insertBefore( 36 | rest, 37 | node.nextSibling 38 | ) 39 | ); 40 | node.nodeValue = val.substr(0, pos); 41 | /* There may be more occurrences of search term in this node. So call this 42 | * function recursively on the remaining fragment. 43 | */ 44 | _highlight(rest, addItems, text, className); 45 | 46 | if (isInSVG) { 47 | const rect = document.createElementNS( 48 | "http://www.w3.org/2000/svg", 49 | "rect" 50 | ); 51 | const bbox = parent.getBBox(); 52 | rect.x.baseVal.value = bbox.x; 53 | rect.y.baseVal.value = bbox.y; 54 | rect.width.baseVal.value = bbox.width; 55 | rect.height.baseVal.value = bbox.height; 56 | rect.setAttribute("class", className); 57 | addItems.push({ parent: parent, target: rect }); 58 | } 59 | } 60 | } else if (node.matches && !node.matches("button, select, textarea")) { 61 | node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); 62 | } 63 | }; 64 | const _highlightText = (thisNode, text, className) => { 65 | let addItems = []; 66 | _highlight(thisNode, addItems, text, className); 67 | addItems.forEach((obj) => 68 | obj.parent.insertAdjacentElement("beforebegin", obj.target) 69 | ); 70 | }; 71 | 72 | /** 73 | * Small JavaScript module for the documentation. 74 | */ 75 | const SphinxHighlight = { 76 | 77 | /** 78 | * highlight the search words provided in localstorage in the text 79 | */ 80 | highlightSearchWords: () => { 81 | if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight 82 | 83 | // get and clear terms from localstorage 84 | const url = new URL(window.location); 85 | const highlight = 86 | localStorage.getItem("sphinx_highlight_terms") 87 | || url.searchParams.get("highlight") 88 | || ""; 89 | localStorage.removeItem("sphinx_highlight_terms") 90 | url.searchParams.delete("highlight"); 91 | window.history.replaceState({}, "", url); 92 | 93 | // get individual terms from highlight string 94 | const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); 95 | if (terms.length === 0) return; // nothing to do 96 | 97 | // There should never be more than one element matching "div.body" 98 | const divBody = document.querySelectorAll("div.body"); 99 | const body = divBody.length ? 
divBody[0] : document.querySelector("body"); 100 | window.setTimeout(() => { 101 | terms.forEach((term) => _highlightText(body, term, "highlighted")); 102 | }, 10); 103 | 104 | const searchBox = document.getElementById("searchbox"); 105 | if (searchBox === null) return; 106 | searchBox.appendChild( 107 | document 108 | .createRange() 109 | .createContextualFragment( 110 | '" 114 | ) 115 | ); 116 | }, 117 | 118 | /** 119 | * helper function to hide the search marks again 120 | */ 121 | hideSearchWords: () => { 122 | document 123 | .querySelectorAll("#searchbox .highlight-link") 124 | .forEach((el) => el.remove()); 125 | document 126 | .querySelectorAll("span.highlighted") 127 | .forEach((el) => el.classList.remove("highlighted")); 128 | localStorage.removeItem("sphinx_highlight_terms") 129 | }, 130 | 131 | initEscapeListener: () => { 132 | // only install a listener if it is really needed 133 | if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; 134 | 135 | document.addEventListener("keydown", (event) => { 136 | // bail for input elements 137 | if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; 138 | // bail with special keys 139 | if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; 140 | if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { 141 | SphinxHighlight.hideSearchWords(); 142 | event.preventDefault(); 143 | } 144 | }); 145 | }, 146 | }; 147 | 148 | _ready(() => { 149 | /* Do not call highlightSearchWords() when we are on the search page. 150 | * It will highlight words from the *previous* search query. 151 | */ 152 | if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords(); 153 | SphinxHighlight.initEscapeListener(); 154 | }); 155 | -------------------------------------------------------------------------------- /docs/html/genindex.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index — whisper_live documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
[Remainder of genindex.html: a Sphinx-generated alphabetical index whose markup was not preserved by this dump; its entries cover the documented classes and methods of whisper_live.client and whisper_live.server.]
--------------------------------------------------------------------------------
/docs/html/objects.inv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/docs/html/objects.inv
--------------------------------------------------------------------------------
/docs/html/py-modindex.html:
--------------------------------------------------------------------------------
[Sphinx-generated page "Python Module Index — whisper_live documentation"; markup not preserved. It lists the modules whisper_live, whisper_live.client and whisper_live.server.]
--------------------------------------------------------------------------------
/docs/html/search.html:
--------------------------------------------------------------------------------
[Sphinx-generated page "Search — whisper_live documentation"; markup not preserved. The only recoverable text is the hint: "Searching for multiple words only shows matches that contain all words."]
--------------------------------------------------------------------------------
/docs/html/searchindex.js:
--------------------------------------------------------------------------------
[Auto-generated, minified Sphinx search index (a single Search.setIndex({...}) call) for the one document "index". It enumerates stemmed terms from the docstrings plus object entries for whisper_live.client (Client, TranscriptionClient, resample) and whisper_live.server (ServeClient, TranscriptionServer) and their methods; it duplicates information already present in the source docstrings.]
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/requirements/client.txt:
--------------------------------------------------------------------------------
1 | PyAudio
2 | av
3 | scipy
4 | websocket-client
-------------------------------------------------------------------------------- /requirements/server.txt: -------------------------------------------------------------------------------- 1 | faster-whisper==1.1.0 2 | websockets 3 | onnxruntime==1.17.0 4 | numba 5 | kaldialign 6 | soundfile 7 | scipy 8 | av 9 | jiwer 10 | evaluate 11 | numpy<2 12 | openai-whisper==20240930 13 | tokenizers==0.20.3 14 | transformers[torch] 15 | 16 | # openvino 17 | librosa 18 | openvino 19 | openvino-genai 20 | openvino-tokenizers 21 | optimum 22 | optimum-intel -------------------------------------------------------------------------------- /run_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | if __name__ == "__main__": 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--port', '-p', 7 | type=int, 8 | default=9090, 9 | help="Websocket port to run the server on.") 10 | parser.add_argument('--backend', '-b', 11 | type=str, 12 | default='faster_whisper', 13 | help='Backends from ["tensorrt", "faster_whisper", "openvino"]') 14 | parser.add_argument('--faster_whisper_custom_model_path', '-fw', 15 | type=str, default=None, 16 | help="Custom Faster Whisper Model") 17 | parser.add_argument('--trt_model_path', '-trt', 18 | type=str, 19 | default=None, 20 | help='Whisper TensorRT model path') 21 | parser.add_argument('--trt_multilingual', '-m', 22 | action="store_true", 23 | help='Boolean only for TensorRT model. True if multilingual.') 24 | parser.add_argument('--trt_py_session', 25 | action="store_true", 26 | help='Boolean only for TensorRT model. Use python session or cpp session, By default uses Cpp.') 27 | parser.add_argument('--omp_num_threads', '-omp', 28 | type=int, 29 | default=1, 30 | help="Number of threads to use for OpenMP") 31 | parser.add_argument('--no_single_model', '-nsm', 32 | action='store_true', 33 | help='Set this if every connection should instantiate its own model. 
Only relevant for custom model, passed using -trt or -fw.') 34 | parser.add_argument('--cache_path', '-c', 35 | type=str, 36 | default="~/.cache/whisper-live/", 37 | help='Path to cache the converted ctranslate2 models.') 38 | args = parser.parse_args() 39 | 40 | if args.backend == "tensorrt": 41 | if args.trt_model_path is None: 42 | raise ValueError("Please Provide a valid tensorrt model path") 43 | 44 | if "OMP_NUM_THREADS" not in os.environ: 45 | os.environ["OMP_NUM_THREADS"] = str(args.omp_num_threads) 46 | 47 | from whisper_live.server import TranscriptionServer 48 | server = TranscriptionServer() 49 | server.run( 50 | "0.0.0.0", 51 | port=args.port, 52 | backend=args.backend, 53 | faster_whisper_custom_model_path=args.faster_whisper_custom_model_path, 54 | whisper_tensorrt_path=args.trt_model_path, 55 | trt_multilingual=args.trt_multilingual, 56 | trt_py_session=args.trt_py_session, 57 | single_model=not args.no_single_model, 58 | cache_path=args.cache_path 59 | ) 60 | -------------------------------------------------------------------------------- /scripts/build_whisper_tensorrt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | download_and_build_model() { 4 | local model_name="$1" 5 | local model_url="" 6 | 7 | case "$model_name" in 8 | "tiny.en") 9 | model_url="https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt" 10 | ;; 11 | "tiny") 12 | model_url="https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt" 13 | ;; 14 | "base.en") 15 | model_url="https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt" 16 | ;; 17 | "base") 18 | model_url="https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt" 19 | ;; 20 | "small.en") 21 | model_url="https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt" 22 | ;; 23 | "small") 24 | model_url="https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt" 25 | ;; 26 | "medium.en") 27 | model_url="https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt" 28 | ;; 29 | "medium") 30 | model_url="https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt" 31 | ;; 32 | "large-v1") 33 | model_url="https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt" 34 | ;; 35 | "large-v2") 36 | model_url="https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt" 37 | ;; 38 | "large-v3" | "large") 39 | model_url="https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt" 40 | ;; 41 | "large-v3-turbo" | "turbo") 42 | model_url="https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt" 43 | ;; 44 | *) 45 | echo "Invalid model name: $model_name" 46 | exit 1 47 | ;; 48 | esac 49 | 50 | if [ "$model_name" == "turbo" ]; then 51 | 
model_name="large-v3-turbo" 52 | fi 53 | 54 | local inference_precision="float16" 55 | local weight_only_precision="${2:-float16}" 56 | local max_beam_width=4 57 | local max_batch_size=4 58 | 59 | echo "Downloading $model_name..." 60 | # wget --directory-prefix=assets "$model_url" 61 | # echo "Download completed: ${model_name}.pt" 62 | if [ ! -f "assets/${model_name}.pt" ]; then 63 | wget --directory-prefix=assets "$model_url" 64 | echo "Download completed: ${model_name}.pt" 65 | else 66 | echo "${model_name}.pt already exists in assets directory." 67 | fi 68 | 69 | local sanitized_model_name="${model_name//./_}" 70 | local checkpoint_dir="whisper_${sanitized_model_name}_weights_${weight_only_precision}" 71 | local output_dir="whisper_${sanitized_model_name}_${weight_only_precision}" 72 | echo "$output_dir" 73 | echo "Converting model weights for $model_name..." 74 | python3 convert_checkpoint.py \ 75 | $( [[ "$weight_only_precision" == "int8" || "$weight_only_precision" == "int4" ]] && echo "--use_weight_only --weight_only_precision $weight_only_precision" ) \ 76 | --output_dir "$checkpoint_dir" --model_name "$model_name" 77 | 78 | echo "Building encoder for $model_name..." 79 | trtllm-build \ 80 | --checkpoint_dir "${checkpoint_dir}/encoder" \ 81 | --output_dir "${output_dir}/encoder" \ 82 | --moe_plugin disable \ 83 | --max_batch_size "$max_batch_size" \ 84 | --gemm_plugin disable \ 85 | --bert_attention_plugin "$inference_precision" \ 86 | --max_input_len 3000 \ 87 | --max_seq_len 3000 88 | 89 | echo "Building decoder for $model_name..." 90 | trtllm-build \ 91 | --checkpoint_dir "${checkpoint_dir}/decoder" \ 92 | --output_dir "${output_dir}/decoder" \ 93 | --moe_plugin disable \ 94 | --max_beam_width "$max_beam_width" \ 95 | --max_batch_size "$max_batch_size" \ 96 | --max_seq_len 225 \ 97 | --max_input_len 32 \ 98 | --max_encoder_input_len 3000 \ 99 | --gemm_plugin "$inference_precision" \ 100 | --bert_attention_plugin "$inference_precision" \ 101 | --gpt_attention_plugin "$inference_precision" 102 | 103 | echo "TensorRT LLM engine built for $model_name." 104 | echo "=========================================" 105 | echo "Model is located at: $(pwd)/$output_dir" 106 | } 107 | 108 | if [ "$#" -lt 1 ]; then 109 | echo "Usage: $0 [model-name]" 110 | exit 1 111 | fi 112 | 113 | tensorrt_examples_dir="$1" 114 | model_name="${2:-small.en}" 115 | weight_only_precision="${3:-float16}" # Default to float16 if not provided 116 | 117 | cd $tensorrt_examples_dir/whisper 118 | pip install --no-deps -r requirements.txt 119 | 120 | download_and_build_model "$model_name" "$weight_only_precision" 121 | -------------------------------------------------------------------------------- /scripts/setup.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Detect the operating system 4 | if [[ "$OSTYPE" == "darwin"* ]]; then 5 | # macOS 6 | echo "Detected macOS, using Homebrew for installation" 7 | 8 | # Check if Homebrew is installed 9 | if ! command -v brew &> /dev/null; then 10 | echo "Homebrew not found. 
Please install Homebrew first: https://brew.sh/" 11 | exit 1 12 | fi 13 | 14 | # Install packages using Homebrew 15 | brew install portaudio wget 16 | else 17 | # Linux (Debian/Ubuntu) 18 | echo "Detected Linux, using apt-get for installation" 19 | apt-get install portaudio19-dev wget -y 20 | fi 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from setuptools import find_packages, setup 3 | from whisper_live.__version__ import __version__ 4 | 5 | 6 | # The directory containing this file 7 | HERE = pathlib.Path(__file__).parent 8 | 9 | # The text of the README file 10 | README = (HERE / "README.md").read_text() 11 | 12 | # This call to setup() does all the work 13 | setup( 14 | name="whisper_live", 15 | version=__version__, 16 | description="A nearly-live implementation of OpenAI's Whisper.", 17 | long_description=README, 18 | long_description_content_type="text/markdown", 19 | include_package_data=True, 20 | url="https://github.com/collabora/WhisperLive", 21 | author="Collabora Ltd", 22 | author_email="vineet.suryan@collabora.com", 23 | license="MIT", 24 | classifiers=[ 25 | "Development Status :: 4 - Beta", 26 | "Intended Audience :: Developers", 27 | "Intended Audience :: Science/Research", 28 | "License :: OSI Approved :: MIT License", 29 | "Programming Language :: Python :: 3", 30 | "Programming Language :: Python :: 3 :: Only", 31 | "Programming Language :: Python :: 3.8", 32 | "Programming Language :: Python :: 3.9", 33 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 34 | ], 35 | packages=find_packages( 36 | exclude=( 37 | "examples", 38 | "Audio-Transcription-Chrome", 39 | "Audio-Transcription-Firefox", 40 | "requirements", 41 | "whisper-finetuning" 42 | ) 43 | ), 44 | install_requires=[ 45 | "PyAudio", 46 | "faster-whisper==1.1.0", 47 | "torch", 48 | "torchaudio", 49 | "websockets", 50 | "onnxruntime==1.17.0", 51 | "scipy", 52 | "websocket-client", 53 | "numba", 54 | "openai-whisper==20240930", 55 | "kaldialign", 56 | "soundfile", 57 | "tokenizers==0.20.3", 58 | "librosa", 59 | "numpy==1.26.4", 60 | "openvino", 61 | "openvino-genai", 62 | "openvino-tokenizers", 63 | "optimum", 64 | "optimum-intel", 65 | ], 66 | python_requires=">=3.9" 67 | ) 68 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_client.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import scipy 4 | import websocket 5 | import copy 6 | import unittest 7 | from unittest.mock import patch, MagicMock 8 | from whisper_live.client import Client, TranscriptionClient, TranscriptionTeeClient 9 | from whisper_live.utils import resample 10 | from pathlib import Path 11 | 12 | 13 | class BaseTestCase(unittest.TestCase): 14 | @patch('whisper_live.client.websocket.WebSocketApp') 15 | @patch('whisper_live.client.pyaudio.PyAudio') 16 | def setUp(self, mock_pyaudio, mock_websocket): 17 | self.mock_pyaudio_instance = MagicMock() 18 | mock_pyaudio.return_value = self.mock_pyaudio_instance 19 | self.mock_stream = MagicMock() 20 | 
self.mock_pyaudio_instance.open.return_value = self.mock_stream 21 | 22 | self.mock_ws_app = mock_websocket.return_value 23 | self.mock_ws_app.send = MagicMock() 24 | 25 | self.client = TranscriptionClient(host='localhost', port=9090, lang="en").client 26 | 27 | self.mock_pyaudio = mock_pyaudio 28 | self.mock_websocket = mock_websocket 29 | self.mock_audio_packet = b'\x00\x01\x02\x03' 30 | 31 | def tearDown(self): 32 | self.client.close_websocket() 33 | self.mock_pyaudio.stop() 34 | self.mock_websocket.stop() 35 | del self.client 36 | 37 | class TestClientWebSocketCommunication(BaseTestCase): 38 | def test_websocket_communication(self): 39 | expected_url = 'ws://localhost:9090' 40 | self.mock_websocket.assert_called() 41 | self.assertEqual(self.mock_websocket.call_args[0][0], expected_url) 42 | 43 | 44 | class TestClientCallbacks(BaseTestCase): 45 | def test_on_open(self): 46 | expected_message = json.dumps({ 47 | "uid": self.client.uid, 48 | "language": self.client.language, 49 | "task": self.client.task, 50 | "model": self.client.model, 51 | "use_vad": True, 52 | "max_clients": 4, 53 | "max_connection_time": 600, 54 | "send_last_n_segments": 10, 55 | "no_speech_thresh": 0.45, 56 | "clip_audio": False, 57 | "same_output_threshold": 10, 58 | }) 59 | self.client.on_open(self.mock_ws_app) 60 | self.mock_ws_app.send.assert_called_with(expected_message) 61 | 62 | def test_on_message(self): 63 | message = json.dumps( 64 | { 65 | "uid": self.client.uid, 66 | "message": "SERVER_READY", 67 | "backend": "faster_whisper" 68 | } 69 | ) 70 | self.client.on_message(self.mock_ws_app, message) 71 | 72 | message = json.dumps({ 73 | "uid": self.client.uid, 74 | "segments": [ 75 | {"start": 0, "end": 1, "text": "Test transcript", "completed": True}, 76 | {"start": 1, "end": 2, "text": "Test transcript 2", "completed": True}, 77 | {"start": 2, "end": 3, "text": "Test transcript 3", "completed": True} 78 | ] 79 | }) 80 | self.client.on_message(self.mock_ws_app, message) 81 | 82 | # Assert that the transcript was updated correctly 83 | self.assertEqual(len(self.client.transcript), 3) 84 | self.assertEqual(self.client.transcript[1]['text'], "Test transcript 2") 85 | 86 | def test_on_close(self): 87 | close_status_code = 1000 88 | close_msg = "Normal closure" 89 | self.client.on_close(self.mock_ws_app, close_status_code, close_msg) 90 | 91 | self.assertFalse(self.client.recording) 92 | self.assertFalse(self.client.server_error) 93 | self.assertFalse(self.client.waiting) 94 | 95 | def test_on_error(self): 96 | error_message = "Test Error" 97 | self.client.on_error(self.mock_ws_app, error_message) 98 | 99 | self.assertTrue(self.client.server_error) 100 | self.assertEqual(self.client.error_message, error_message) 101 | 102 | 103 | class TestAudioResampling(unittest.TestCase): 104 | def test_resample_audio(self): 105 | original_audio = "assets/jfk.flac" 106 | expected_sr = 16000 107 | resampled_audio = resample(original_audio, expected_sr) 108 | 109 | sr, _ = scipy.io.wavfile.read(resampled_audio) 110 | self.assertEqual(sr, expected_sr) 111 | 112 | os.remove(resampled_audio) 113 | 114 | 115 | class TestSendingAudioPacket(BaseTestCase): 116 | def test_send_packet(self): 117 | self.client.send_packet_to_server(self.mock_audio_packet) 118 | self.client.client_socket.send.assert_called_with(self.mock_audio_packet, websocket.ABNF.OPCODE_BINARY) 119 | 120 | class TestTee(BaseTestCase): 121 | @patch('whisper_live.client.websocket.WebSocketApp') 122 | @patch('whisper_live.client.pyaudio.PyAudio') 123 | def setUp(self, 
mock_audio, mock_websocket): 124 | super().setUp() 125 | self.client2 = Client(host='localhost', port=9090, lang="es", translate=False, srt_file_path="transcript.srt") 126 | self.client3 = Client(host='localhost', port=9090, lang="es", translate=True, srt_file_path="translation.srt") 127 | # need a separate mock for each websocket 128 | self.client3.client_socket = copy.deepcopy(self.client3.client_socket) 129 | self.tee = TranscriptionTeeClient([self.client2, self.client3]) 130 | 131 | def tearDown(self): 132 | self.tee.close_all_clients() 133 | del self.tee 134 | super().tearDown() 135 | 136 | def test_invalid_constructor(self): 137 | with self.assertRaises(Exception) as context: 138 | TranscriptionTeeClient([]) 139 | 140 | def test_multicast_unconditional(self): 141 | self.tee.multicast_packet(self.mock_audio_packet, True) 142 | for client in self.tee.clients: 143 | client.client_socket.send.assert_called_with(self.mock_audio_packet, websocket.ABNF.OPCODE_BINARY) 144 | 145 | def test_multicast_conditional(self): 146 | self.client2.recording = False 147 | self.client3.recording = True 148 | self.tee.multicast_packet(self.mock_audio_packet, False) 149 | self.client2.client_socket.send.assert_not_called() 150 | self.client3.client_socket.send.assert_called_with(self.mock_audio_packet, websocket.ABNF.OPCODE_BINARY) 151 | 152 | def test_close_all(self): 153 | self.tee.close_all_clients() 154 | for client in self.tee.clients: 155 | client.client_socket.close.assert_called() 156 | 157 | def test_write_all_srt(self): 158 | for client in self.tee.clients: 159 | client.server_backend = "faster_whisper" 160 | self.tee.write_all_clients_srt() 161 | self.assertTrue(Path("transcript.srt").is_file()) 162 | self.assertTrue(Path("translation.srt").is_file()) 163 | -------------------------------------------------------------------------------- /tests/test_server.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | import json 4 | import unittest 5 | from unittest import mock 6 | 7 | import numpy as np 8 | import jiwer 9 | 10 | from websockets.exceptions import ConnectionClosed 11 | from whisper_live.server import TranscriptionServer, BackendType, ClientManager 12 | from whisper_live.client import Client, TranscriptionClient, TranscriptionTeeClient 13 | from whisper.normalizers import EnglishTextNormalizer 14 | 15 | 16 | class TestTranscriptionServerInitialization(unittest.TestCase): 17 | def test_initialization(self): 18 | server = TranscriptionServer() 19 | server.client_manager = ClientManager(max_clients=4, max_connection_time=600) 20 | self.assertEqual(server.client_manager.max_clients, 4) 21 | self.assertEqual(server.client_manager.max_connection_time, 600) 22 | self.assertDictEqual(server.client_manager.clients, {}) 23 | self.assertDictEqual(server.client_manager.start_times, {}) 24 | 25 | 26 | class TestGetWaitTime(unittest.TestCase): 27 | def setUp(self): 28 | self.server = TranscriptionServer() 29 | self.server.client_manager = ClientManager(max_clients=4, max_connection_time=600) 30 | self.server.client_manager.start_times = { 31 | 'client1': time.time() - 120, 32 | 'client2': time.time() - 300 33 | } 34 | self.server.client_manager.max_connection_time = 600 35 | 36 | def test_get_wait_time(self): 37 | expected_wait_time = (600 - (time.time() - self.server.client_manager.start_times['client2'])) / 60 38 | print(self.server.client_manager.get_wait_time(), expected_wait_time) 39 | 
self.assertAlmostEqual(self.server.client_manager.get_wait_time(), expected_wait_time, places=2) 40 | 41 | 42 | class TestServerConnection(unittest.TestCase): 43 | def setUp(self): 44 | self.server = TranscriptionServer() 45 | 46 | @mock.patch('websockets.WebSocketCommonProtocol') 47 | def test_connection(self, mock_websocket): 48 | mock_websocket.recv.return_value = json.dumps({ 49 | 'uid': 'test_client', 50 | 'language': 'en', 51 | 'task': 'transcribe', 52 | 'model': 'tiny.en' 53 | }) 54 | self.server.recv_audio(mock_websocket, BackendType("faster_whisper")) 55 | 56 | @mock.patch('websockets.WebSocketCommonProtocol') 57 | def test_recv_audio_exception_handling(self, mock_websocket): 58 | mock_websocket.recv.side_effect = [json.dumps({ 59 | 'uid': 'test_client', 60 | 'language': 'en', 61 | 'task': 'transcribe', 62 | 'model': 'tiny.en' 63 | }), np.array([1, 2, 3]).tobytes()] 64 | 65 | with self.assertLogs(level="ERROR"): 66 | self.server.recv_audio(mock_websocket, BackendType("faster_whisper")) 67 | 68 | self.assertNotIn(mock_websocket, self.server.client_manager.clients) 69 | 70 | 71 | class TestServerInferenceAccuracy(unittest.TestCase): 72 | @classmethod 73 | def setUpClass(cls): 74 | cls.mock_pyaudio_patch = mock.patch('pyaudio.PyAudio') 75 | cls.mock_pyaudio = cls.mock_pyaudio_patch.start() 76 | cls.mock_pyaudio.return_value.open.return_value = mock.MagicMock() 77 | 78 | cls.server_process = subprocess.Popen(["python", "run_server.py"]) 79 | time.sleep(2) 80 | 81 | @classmethod 82 | def tearDownClass(cls): 83 | cls.server_process.terminate() 84 | cls.server_process.wait() 85 | 86 | def setUp(self): 87 | self.normalizer = EnglishTextNormalizer() 88 | 89 | def check_prediction(self, srt_path): 90 | gt = "And so my fellow Americans, ask not, what your country can do for you. Ask what you can do for your country!" 
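        # SRT output is written in four-line blocks (index, timestamps, text, blank line),
        # so lines[2::4] below keeps only the text lines; both prediction and ground truth
        # are normalized before asserting a word error rate below 5%.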
91 | with open(srt_path, "r") as f: 92 | lines = f.readlines() 93 | prediction = " ".join([line.strip() for line in lines[2::4]]) 94 | prediction_normalized = self.normalizer(prediction) 95 | gt_normalized = self.normalizer(gt) 96 | 97 | # calculate WER 98 | wer_score = jiwer.wer(gt_normalized, prediction_normalized) 99 | self.assertLess(wer_score, 0.05) 100 | 101 | def test_inference(self): 102 | client = TranscriptionClient( 103 | "localhost", "9090", model="base.en", lang="en", 104 | ) 105 | client("assets/jfk.flac") 106 | self.check_prediction("output.srt") 107 | 108 | def test_simultaneous_inference(self): 109 | client1 = Client( 110 | "localhost", "9090", model="base.en", lang="en", srt_file_path="transcript1.srt") 111 | client2 = Client( 112 | "localhost", "9090", model="base.en", lang="en", srt_file_path="transcript2.srt") 113 | tee = TranscriptionTeeClient([client1, client2]) 114 | tee("assets/jfk.flac") 115 | self.check_prediction("transcript1.srt") 116 | self.check_prediction("transcript2.srt") 117 | 118 | 119 | class TestExceptionHandling(unittest.TestCase): 120 | def setUp(self): 121 | self.server = TranscriptionServer() 122 | 123 | @mock.patch('websockets.WebSocketCommonProtocol') 124 | def test_connection_closed_exception(self, mock_websocket): 125 | mock_websocket.recv.side_effect = ConnectionClosed(1001, "testing connection closed", rcvd_then_sent=mock.Mock()) 126 | 127 | with self.assertLogs(level="INFO") as log: 128 | self.server.recv_audio(mock_websocket, BackendType("faster_whisper")) 129 | self.assertTrue(any("Connection closed by client" in message for message in log.output)) 130 | 131 | @mock.patch('websockets.WebSocketCommonProtocol') 132 | def test_json_decode_exception(self, mock_websocket): 133 | mock_websocket.recv.return_value = "invalid json" 134 | 135 | with self.assertLogs(level="ERROR") as log: 136 | self.server.recv_audio(mock_websocket, BackendType("faster_whisper")) 137 | self.assertTrue(any("Failed to decode JSON from client" in message for message in log.output)) 138 | 139 | @mock.patch('websockets.WebSocketCommonProtocol') 140 | def test_unexpected_exception_handling(self, mock_websocket): 141 | mock_websocket.recv.side_effect = RuntimeError("Unexpected error") 142 | 143 | with self.assertLogs(level="ERROR") as log: 144 | self.server.recv_audio(mock_websocket, BackendType("faster_whisper")) 145 | for message in log.output: 146 | print(message) 147 | print() 148 | self.assertTrue(any("Unexpected error" in message for message in log.output)) 149 | -------------------------------------------------------------------------------- /tests/test_vad.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from whisper_live.transcriber.tensorrt_utils import load_audio 4 | from whisper_live.vad import VoiceActivityDetector 5 | 6 | 7 | class TestVoiceActivityDetection(unittest.TestCase): 8 | def setUp(self): 9 | self.vad = VoiceActivityDetector() 10 | self.sample_rate = 16000 11 | 12 | def generate_silence(self, duration_seconds): 13 | return np.zeros(int(self.sample_rate * duration_seconds), dtype=np.float32) 14 | 15 | def load_speech_segment(self, filepath): 16 | return load_audio(filepath) 17 | 18 | def test_vad_silence_detection(self): 19 | silence = self.generate_silence(3) 20 | is_speech_present = self.vad(silence.copy()) 21 | self.assertFalse(is_speech_present, "VAD incorrectly identified silence as speech.") 22 | 23 | def test_vad_speech_detection(self): 24 | audio_tensor = 
load_audio("assets/jfk.flac") 25 | is_speech_present = self.vad(audio_tensor) 26 | self.assertTrue(is_speech_present, "VAD failed to identify speech segment.") 27 | -------------------------------------------------------------------------------- /whisper_live/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/whisper_live/__init__.py -------------------------------------------------------------------------------- /whisper_live/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.7.1" 2 | -------------------------------------------------------------------------------- /whisper_live/backend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/whisper_live/backend/__init__.py -------------------------------------------------------------------------------- /whisper_live/backend/faster_whisper_backend.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | import threading 5 | import time 6 | import torch 7 | import ctranslate2 8 | from huggingface_hub import snapshot_download 9 | 10 | from whisper_live.transcriber.transcriber_faster_whisper import WhisperModel 11 | from whisper_live.backend.base import ServeClientBase 12 | 13 | 14 | class ServeClientFasterWhisper(ServeClientBase): 15 | SINGLE_MODEL = None 16 | SINGLE_MODEL_LOCK = threading.Lock() 17 | 18 | def __init__( 19 | self, 20 | websocket, 21 | task="transcribe", 22 | device=None, 23 | language=None, 24 | client_uid=None, 25 | model="small.en", 26 | initial_prompt=None, 27 | vad_parameters=None, 28 | use_vad=True, 29 | single_model=False, 30 | send_last_n_segments=10, 31 | no_speech_thresh=0.45, 32 | clip_audio=False, 33 | same_output_threshold=10, 34 | cache_path="~/.cache/whisper-live/" 35 | ): 36 | """ 37 | Initialize a ServeClient instance. 38 | The Whisper model is initialized based on the client's language and device availability. 39 | The transcription thread is started upon initialization. A "SERVER_READY" message is sent 40 | to the client to indicate that the server is ready. 41 | 42 | Args: 43 | websocket (WebSocket): The WebSocket connection for the client. 44 | task (str, optional): The task type, e.g., "transcribe". Defaults to "transcribe". 45 | device (str, optional): The device type for Whisper, "cuda" or "cpu". Defaults to None. 46 | language (str, optional): The language for transcription. Defaults to None. 47 | client_uid (str, optional): A unique identifier for the client. Defaults to None. 48 | model (str, optional): The whisper model size. Defaults to 'small.en' 49 | initial_prompt (str, optional): Prompt for whisper inference. Defaults to None. 50 | single_model (bool, optional): Whether to instantiate a new model for each client connection. Defaults to False. 51 | send_last_n_segments (int, optional): Number of most recent segments to send to the client. Defaults to 10. 52 | no_speech_thresh (float, optional): Segments with no speech probability above this threshold will be discarded. Defaults to 0.45. 53 | clip_audio (bool, optional): Whether to clip audio with no valid segments. Defaults to False. 
54 | same_output_threshold (int, optional): Number of repeated outputs before considering it as a valid segment. Defaults to 10. 55 | 56 | """ 57 | super().__init__( 58 | client_uid, 59 | websocket, 60 | send_last_n_segments, 61 | no_speech_thresh, 62 | clip_audio, 63 | same_output_threshold, 64 | ) 65 | self.cache_path = cache_path 66 | self.model_sizes = [ 67 | "tiny", "tiny.en", "base", "base.en", "small", "small.en", 68 | "medium", "medium.en", "large-v2", "large-v3", "distil-small.en", 69 | "distil-medium.en", "distil-large-v2", "distil-large-v3", 70 | "large-v3-turbo", "turbo" 71 | ] 72 | 73 | self.model_size_or_path = model 74 | self.language = "en" if self.model_size_or_path.endswith("en") else language 75 | self.task = task 76 | self.initial_prompt = initial_prompt 77 | self.vad_parameters = vad_parameters or {"onset": 0.5} 78 | 79 | device = "cuda" if torch.cuda.is_available() else "cpu" 80 | if device == "cuda": 81 | major, _ = torch.cuda.get_device_capability(device) 82 | self.compute_type = "float16" if major >= 7 else "float32" 83 | else: 84 | self.compute_type = "int8" 85 | 86 | if self.model_size_or_path is None: 87 | return 88 | logging.info(f"Using Device={device} with precision {self.compute_type}") 89 | 90 | try: 91 | if single_model: 92 | if ServeClientFasterWhisper.SINGLE_MODEL is None: 93 | self.create_model(device) 94 | ServeClientFasterWhisper.SINGLE_MODEL = self.transcriber 95 | else: 96 | self.transcriber = ServeClientFasterWhisper.SINGLE_MODEL 97 | else: 98 | self.create_model(device) 99 | except Exception as e: 100 | logging.error(f"Failed to load model: {e}") 101 | self.websocket.send(json.dumps({ 102 | "uid": self.client_uid, 103 | "status": "ERROR", 104 | "message": f"Failed to load model: {str(self.model_size_or_path)}" 105 | })) 106 | self.websocket.close() 107 | return 108 | 109 | self.use_vad = use_vad 110 | 111 | # threading 112 | self.trans_thread = threading.Thread(target=self.speech_to_text) 113 | self.trans_thread.start() 114 | self.websocket.send( 115 | json.dumps( 116 | { 117 | "uid": self.client_uid, 118 | "message": self.SERVER_READY, 119 | "backend": "faster_whisper" 120 | } 121 | ) 122 | ) 123 | 124 | def create_model(self, device): 125 | """ 126 | Instantiates a new model, sets it as the transcriber. If model is a huggingface model_id 127 | then it is automatically converted to ctranslate2(faster_whisper) format. 
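        For example, a hypothetical Hugging Face id such as "myorg/whisper-small-finetuned"
        is snapshot-downloaded first; if the snapshot is not already a CTranslate2 model, it is
        converted once and cached under "<cache_path>/whisper-ct2-models/myorg--whisper-small-finetuned",
        and that directory is what gets loaded.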
128 | """ 129 | model_ref = self.model_size_or_path 130 | 131 | if model_ref in self.model_sizes: 132 | model_to_load = model_ref 133 | else: 134 | logging.info(f"Model not in model_sizes") 135 | if os.path.isdir(model_ref) and ctranslate2.contains_model(model_ref): 136 | model_to_load = model_ref 137 | else: 138 | local_snapshot = snapshot_download( 139 | repo_id = model_ref, 140 | repo_type = "model", 141 | ) 142 | if ctranslate2.contains_model(local_snapshot): 143 | model_to_load = local_snapshot 144 | else: 145 | cache_root = os.path.expanduser(os.path.join(self.cache_path, "whisper-ct2-models/")) 146 | os.makedirs(cache_root, exist_ok=True) 147 | safe_name = model_ref.replace("/", "--") 148 | ct2_dir = os.path.join(cache_root, safe_name) 149 | 150 | if not ctranslate2.contains_model(ct2_dir): 151 | logging.info(f"Converting '{model_ref}' to CTranslate2 @ {ct2_dir}") 152 | ct2_converter = ctranslate2.converters.TransformersConverter( 153 | local_snapshot, 154 | copy_files=["tokenizer.json", "preprocessor_config.json"] 155 | ) 156 | ct2_converter.convert( 157 | output_dir=ct2_dir, 158 | quantization=self.compute_type, 159 | force=False, # skip if already up-to-date 160 | ) 161 | model_to_load = ct2_dir 162 | 163 | logging.info(f"Loading model: {model_to_load}") 164 | self.transcriber = WhisperModel( 165 | model_to_load, 166 | device=device, 167 | compute_type=self.compute_type, 168 | local_files_only=False, 169 | ) 170 | 171 | def set_language(self, info): 172 | """ 173 | Updates the language attribute based on the detected language information. 174 | 175 | Args: 176 | info (object): An object containing the detected language and its probability. This object 177 | must have at least two attributes: `language`, a string indicating the detected 178 | language, and `language_probability`, a float representing the confidence level 179 | of the language detection. 180 | """ 181 | if info.language_probability > 0.5: 182 | self.language = info.language 183 | logging.info(f"Detected language {self.language} with probability {info.language_probability}") 184 | self.websocket.send(json.dumps( 185 | {"uid": self.client_uid, "language": self.language, "language_prob": info.language_probability})) 186 | 187 | def transcribe_audio(self, input_sample): 188 | """ 189 | Transcribes the provided audio sample using the configured transcriber instance. 190 | 191 | If the language has not been set, it updates the session's language based on the transcription 192 | information. 193 | 194 | Args: 195 | input_sample (np.array): The audio chunk to be transcribed. This should be a NumPy 196 | array representing the audio data. 197 | 198 | Returns: 199 | The transcription result from the transcriber. The exact format of this result 200 | depends on the implementation of the `transcriber.transcribe` method but typically 201 | includes the transcribed text. 
202 | """ 203 | if ServeClientFasterWhisper.SINGLE_MODEL: 204 | ServeClientFasterWhisper.SINGLE_MODEL_LOCK.acquire() 205 | result, info = self.transcriber.transcribe( 206 | input_sample, 207 | initial_prompt=self.initial_prompt, 208 | language=self.language, 209 | task=self.task, 210 | vad_filter=self.use_vad, 211 | vad_parameters=self.vad_parameters if self.use_vad else None) 212 | if ServeClientFasterWhisper.SINGLE_MODEL: 213 | ServeClientFasterWhisper.SINGLE_MODEL_LOCK.release() 214 | 215 | if self.language is None and info is not None: 216 | self.set_language(info) 217 | return result 218 | 219 | def handle_transcription_output(self, result, duration): 220 | """ 221 | Handle the transcription output, updating the transcript and sending data to the client. 222 | 223 | Args: 224 | result (str): The result from whisper inference i.e. the list of segments. 225 | duration (float): Duration of the transcribed audio chunk. 226 | """ 227 | segments = [] 228 | if len(result): 229 | self.t_start = None 230 | last_segment = self.update_segments(result, duration) 231 | segments = self.prepare_segments(last_segment) 232 | 233 | if len(segments): 234 | self.send_transcription_to_client(segments) 235 | -------------------------------------------------------------------------------- /whisper_live/backend/openvino_backend.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import threading 4 | import time 5 | 6 | from openvino import Core 7 | from whisper_live.backend.base import ServeClientBase 8 | from whisper_live.transcriber.transcriber_openvino import WhisperOpenVINO 9 | 10 | 11 | class ServeClientOpenVINO(ServeClientBase): 12 | SINGLE_MODEL = None 13 | SINGLE_MODEL_LOCK = threading.Lock() 14 | 15 | def __init__( 16 | self, 17 | websocket, 18 | task="transcribe", 19 | device=None, 20 | language=None, 21 | client_uid=None, 22 | model="small.en", 23 | initial_prompt=None, 24 | vad_parameters=None, 25 | use_vad=True, 26 | single_model=False, 27 | send_last_n_segments=10, 28 | no_speech_thresh=0.45, 29 | clip_audio=False, 30 | same_output_threshold=10, 31 | ): 32 | """ 33 | Initialize a ServeClient instance. 34 | The Whisper model is initialized based on the client's language and device availability. 35 | The transcription thread is started upon initialization. A "SERVER_READY" message is sent 36 | to the client to indicate that the server is ready. 37 | 38 | Args: 39 | websocket (WebSocket): The WebSocket connection for the client. 40 | task (str, optional): The task type, e.g., "transcribe." Defaults to "transcribe". 41 | device (str, optional): The device type for Whisper, "cuda" or "cpu". Defaults to None. 42 | language (str, optional): The language for transcription. Defaults to None. 43 | client_uid (str, optional): A unique identifier for the client. Defaults to None. 44 | model (str, optional): Huggingface model_id for a valid OpenVINO model. 45 | initial_prompt (str, optional): Prompt for whisper inference. Defaults to None. 46 | single_model (bool, optional): Whether to instantiate a new model for each client connection. Defaults to False. 47 | send_last_n_segments (int, optional): Number of most recent segments to send to the client. Defaults to 10. 48 | no_speech_thresh (float, optional): Segments with no speech probability above this threshold will be discarded. Defaults to 0.45. 49 | clip_audio (bool, optional): Whether to clip audio with no valid segments. Defaults to False. 
50 | same_output_threshold (int, optional): Number of repeated outputs before considering it as a valid segment. Defaults to 10. 51 | """ 52 | super().__init__( 53 | client_uid, 54 | websocket, 55 | send_last_n_segments, 56 | no_speech_thresh, 57 | clip_audio, 58 | same_output_threshold, 59 | ) 60 | self.language = "en" if language is None else language 61 | if not self.language.startswith("<|"): 62 | self.language = f"<|{self.language}|>" 63 | 64 | self.task = "transcribe" if task is None else task 65 | 66 | self.clip_audio = True 67 | 68 | core = Core() 69 | available_devices = core.available_devices 70 | if 'GPU' in available_devices: 71 | selected_device = 'GPU' 72 | else: 73 | gpu_devices = [d for d in available_devices if d.startswith('GPU')] 74 | selected_device = gpu_devices[0] if gpu_devices else 'CPU' 75 | self.device = selected_device 76 | 77 | 78 | if single_model: 79 | if ServeClientOpenVINO.SINGLE_MODEL is None: 80 | self.create_model(model) 81 | ServeClientOpenVINO.SINGLE_MODEL = self.transcriber 82 | else: 83 | self.transcriber = ServeClientOpenVINO.SINGLE_MODEL 84 | else: 85 | self.create_model(model) 86 | 87 | # threading 88 | self.trans_thread = threading.Thread(target=self.speech_to_text) 89 | self.trans_thread.start() 90 | 91 | self.websocket.send(json.dumps({ 92 | "uid": self.client_uid, 93 | "message": self.SERVER_READY, 94 | "backend": "openvino" 95 | })) 96 | logging.info(f"Using OpenVINO device: {self.device}") 97 | logging.info(f"Running OpenVINO backend with language: {self.language} and task: {self.task}") 98 | 99 | def create_model(self, model_id): 100 | """ 101 | Instantiates a new model, sets it as the transcriber. 102 | """ 103 | self.transcriber = WhisperOpenVINO( 104 | model_id, 105 | device=self.device, 106 | language=self.language, 107 | task=self.task 108 | ) 109 | 110 | def transcribe_audio(self, input_sample): 111 | """ 112 | Transcribes the provided audio sample using the configured transcriber instance. 113 | 114 | If the language has not been set, it updates the session's language based on the transcription 115 | information. 116 | 117 | Args: 118 | input_sample (np.array): The audio chunk to be transcribed. This should be a NumPy 119 | array representing the audio data. 120 | 121 | Returns: 122 | The transcription result from the transcriber. The exact format of this result 123 | depends on the implementation of the `transcriber.transcribe` method but typically 124 | includes the transcribed text. 125 | """ 126 | if ServeClientOpenVINO.SINGLE_MODEL: 127 | ServeClientOpenVINO.SINGLE_MODEL_LOCK.acquire() 128 | result = self.transcriber.transcribe(input_sample) 129 | if ServeClientOpenVINO.SINGLE_MODEL: 130 | ServeClientOpenVINO.SINGLE_MODEL_LOCK.release() 131 | return result 132 | 133 | def handle_transcription_output(self, result, duration): 134 | """ 135 | Handle the transcription output, updating the transcript and sending data to the client. 136 | 137 | Args: 138 | result (str): The result from whisper inference i.e. the list of segments. 139 | duration (float): Duration of the transcribed audio chunk. 
140 | """ 141 | segments = [] 142 | if len(result): 143 | self.t_start = None 144 | last_segment = self.update_segments(result, duration) 145 | segments = self.prepare_segments(last_segment) 146 | 147 | if len(segments): 148 | self.send_transcription_to_client(segments) 149 | -------------------------------------------------------------------------------- /whisper_live/backend/trt_backend.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import threading 4 | import time 5 | 6 | from whisper_live.backend.base import ServeClientBase 7 | from whisper_live.transcriber.transcriber_tensorrt import WhisperTRTLLM 8 | 9 | 10 | class ServeClientTensorRT(ServeClientBase): 11 | SINGLE_MODEL = None 12 | SINGLE_MODEL_LOCK = threading.Lock() 13 | 14 | def __init__( 15 | self, 16 | websocket, 17 | task="transcribe", 18 | multilingual=False, 19 | language=None, 20 | client_uid=None, 21 | model=None, 22 | single_model=False, 23 | use_py_session=False, 24 | max_new_tokens=225, 25 | send_last_n_segments=10, 26 | no_speech_thresh=0.45, 27 | clip_audio=False, 28 | same_output_threshold=10, 29 | ): 30 | """ 31 | Initialize a ServeClient instance. 32 | The Whisper model is initialized based on the client's language and device availability. 33 | The transcription thread is started upon initialization. A "SERVER_READY" message is sent 34 | to the client to indicate that the server is ready. 35 | 36 | Args: 37 | websocket (WebSocket): The WebSocket connection for the client. 38 | task (str, optional): The task type, e.g., "transcribe." Defaults to "transcribe". 39 | device (str, optional): The device type for Whisper, "cuda" or "cpu". Defaults to None. 40 | multilingual (bool, optional): Whether the client supports multilingual transcription. Defaults to False. 41 | language (str, optional): The language for transcription. Defaults to None. 42 | client_uid (str, optional): A unique identifier for the client. Defaults to None. 43 | single_model (bool, optional): Whether to instantiate a new model for each client connection. Defaults to False. 44 | use_py_session (bool, optional): Use python session or cpp session. Defaults to Cpp Session. 45 | max_new_tokens (int, optional): Max number of tokens to generate. 46 | send_last_n_segments (int, optional): Number of most recent segments to send to the client. Defaults to 10. 47 | no_speech_thresh (float, optional): Segments with no speech probability above this threshold will be discarded. Defaults to 0.45. 48 | clip_audio (bool, optional): Whether to clip audio with no valid segments. Defaults to False. 49 | same_output_threshold (int, optional): Number of repeated outputs before considering it as a valid segment. Defaults to 10. 
50 | """ 51 | super().__init__( 52 | client_uid, 53 | websocket, 54 | send_last_n_segments, 55 | no_speech_thresh, 56 | clip_audio, 57 | same_output_threshold, 58 | ) 59 | 60 | self.language = language if multilingual else "en" 61 | self.task = task 62 | self.eos = False 63 | self.max_new_tokens = max_new_tokens 64 | 65 | if single_model: 66 | if ServeClientTensorRT.SINGLE_MODEL is None: 67 | self.create_model(model, multilingual, use_py_session=use_py_session) 68 | ServeClientTensorRT.SINGLE_MODEL = self.transcriber 69 | else: 70 | self.transcriber = ServeClientTensorRT.SINGLE_MODEL 71 | else: 72 | self.create_model(model, multilingual, use_py_session=use_py_session) 73 | 74 | # threading 75 | self.trans_thread = threading.Thread(target=self.speech_to_text) 76 | self.trans_thread.start() 77 | 78 | self.websocket.send(json.dumps({ 79 | "uid": self.client_uid, 80 | "message": self.SERVER_READY, 81 | "backend": "tensorrt" 82 | })) 83 | 84 | def create_model(self, model, multilingual, warmup=True, use_py_session=False): 85 | """ 86 | Instantiates a new model, sets it as the transcriber and does warmup if desired. 87 | """ 88 | self.transcriber = WhisperTRTLLM( 89 | model, 90 | assets_dir="assets", 91 | device="cuda", 92 | is_multilingual=multilingual, 93 | language=self.language, 94 | task=self.task, 95 | use_py_session=use_py_session, 96 | max_output_len=self.max_new_tokens, 97 | ) 98 | if warmup: 99 | self.warmup() 100 | 101 | def warmup(self, warmup_steps=10): 102 | """ 103 | Warmup TensorRT since first few inferences are slow. 104 | 105 | Args: 106 | warmup_steps (int): Number of steps to warm up the model for. 107 | """ 108 | logging.info("[INFO:] Warming up TensorRT engine..") 109 | mel, _ = self.transcriber.log_mel_spectrogram("assets/jfk.flac") 110 | for i in range(warmup_steps): 111 | self.transcriber.transcribe(mel) 112 | 113 | def set_eos(self, eos): 114 | """ 115 | Sets the End of Speech (EOS) flag. 116 | 117 | Args: 118 | eos (bool): The value to set for the EOS flag. 119 | """ 120 | self.lock.acquire() 121 | self.eos = eos 122 | self.lock.release() 123 | 124 | def handle_transcription_output(self, last_segment, duration): 125 | """ 126 | Handle the transcription output, updating the transcript and sending data to the client. 127 | 128 | Args: 129 | last_segment (str): The last segment from the whisper output which is considered to be incomplete because 130 | of the possibility of word being truncated. 131 | duration (float): Duration of the transcribed audio chunk. 132 | """ 133 | segments = self.prepare_segments({"text": last_segment}) 134 | self.send_transcription_to_client(segments) 135 | if self.eos: 136 | self.update_timestamp_offset(last_segment, duration) 137 | 138 | def transcribe_audio(self, input_bytes): 139 | """ 140 | Transcribe the audio chunk and send the results to the client. 141 | 142 | Args: 143 | input_bytes (np.array): The audio chunk to transcribe. 
144 | """ 145 | if ServeClientTensorRT.SINGLE_MODEL: 146 | ServeClientTensorRT.SINGLE_MODEL_LOCK.acquire() 147 | logging.info(f"[WhisperTensorRT:] Processing audio with duration: {input_bytes.shape[0] / self.RATE}") 148 | mel, duration = self.transcriber.log_mel_spectrogram(input_bytes) 149 | last_segment = self.transcriber.transcribe( 150 | mel, 151 | text_prefix=f"<|startoftranscript|><|{self.language}|><|{self.task}|><|notimestamps|>", 152 | ) 153 | if ServeClientTensorRT.SINGLE_MODEL: 154 | ServeClientTensorRT.SINGLE_MODEL_LOCK.release() 155 | if last_segment: 156 | self.handle_transcription_output(last_segment, duration) 157 | 158 | def update_timestamp_offset(self, last_segment, duration): 159 | """ 160 | Update timestamp offset and transcript. 161 | 162 | Args: 163 | last_segment (str): Last transcribed audio from the whisper model. 164 | duration (float): Duration of the last audio chunk. 165 | """ 166 | if not len(self.transcript): 167 | self.transcript.append({"text": last_segment + " "}) 168 | elif self.transcript[-1]["text"].strip() != last_segment: 169 | self.transcript.append({"text": last_segment + " "}) 170 | 171 | with self.lock: 172 | self.timestamp_offset += duration 173 | 174 | def speech_to_text(self): 175 | """ 176 | Process an audio stream in an infinite loop, continuously transcribing the speech. 177 | 178 | This method continuously receives audio frames, performs real-time transcription, and sends 179 | transcribed segments to the client via a WebSocket connection. 180 | 181 | If the client's language is not detected, it waits for 30 seconds of audio input to make a language prediction. 182 | It utilizes the Whisper ASR model to transcribe the audio, continuously processing and streaming results. Segments 183 | are sent to the client in real-time, and a history of segments is maintained to provide context. 184 | 185 | Raises: 186 | Exception: If there is an issue with audio processing or WebSocket communication. 
187 | 188 | """ 189 | while True: 190 | if self.exit: 191 | logging.info("Exiting speech to text thread") 192 | break 193 | 194 | if self.frames_np is None: 195 | time.sleep(0.02) # wait for any audio to arrive 196 | continue 197 | 198 | self.clip_audio_if_no_valid_segment() 199 | 200 | input_bytes, duration = self.get_audio_chunk_for_processing() 201 | if duration < 0.4: 202 | continue 203 | 204 | try: 205 | input_sample = input_bytes.copy() 206 | logging.info(f"[WhisperTensorRT:] Processing audio with duration: {duration}") 207 | self.transcribe_audio(input_sample) 208 | 209 | except Exception as e: 210 | logging.error(f"[ERROR]: {e}") 211 | -------------------------------------------------------------------------------- /whisper_live/transcriber/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/whisper_live/transcriber/__init__.py -------------------------------------------------------------------------------- /whisper_live/transcriber/transcriber_openvino.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import os 3 | 4 | import openvino_genai as ov_genai 5 | import huggingface_hub as hf_hub 6 | 7 | 8 | class WhisperOpenVINO(object): 9 | def __init__(self, model_id="OpenVINO/whisper-tiny-fp16-ov", device="CPU", language="en", task="transcribe"): 10 | model_path = model_id.split('/')[-1] 11 | cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "openvino_whisper_models") 12 | os.makedirs(cache_dir, exist_ok=True) 13 | model_path = os.path.join(cache_dir, model_path) 14 | if not os.path.exists(model_path): 15 | hf_hub.snapshot_download(model_id, local_dir=model_path) 16 | self.model = ov_genai.WhisperPipeline(str(model_path), device=device) 17 | self.language = language 18 | self.task = task 19 | 20 | def transcribe(self, input_audio): 21 | outputs = self.model.generate(input_audio, return_timestamps=True, language=self.language, task=self.task) 22 | outputs = [seg for seg in outputs.chunks] 23 | return outputs 24 | -------------------------------------------------------------------------------- /whisper_live/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import textwrap 3 | import scipy 4 | import numpy as np 5 | import av 6 | from pathlib import Path 7 | 8 | 9 | def clear_screen(): 10 | """Clears the console screen.""" 11 | os.system("cls" if os.name == "nt" else "clear") 12 | 13 | 14 | def print_transcript(text): 15 | """Prints formatted transcript text.""" 16 | wrapper = textwrap.TextWrapper(width=60) 17 | for line in wrapper.wrap(text="".join(text)): 18 | print(line) 19 | 20 | 21 | def format_time(s): 22 | """Convert seconds (float) to SRT time format.""" 23 | hours = int(s // 3600) 24 | minutes = int((s % 3600) // 60) 25 | seconds = int(s % 60) 26 | milliseconds = int((s - int(s)) * 1000) 27 | return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}" 28 | 29 | 30 | def create_srt_file(segments, resampled_file): 31 | with open(resampled_file, 'w', encoding='utf-8') as srt_file: 32 | segment_number = 1 33 | for segment in segments: 34 | start_time = format_time(float(segment['start'])) 35 | end_time = format_time(float(segment['end'])) 36 | text = segment['text'] 37 | 38 | srt_file.write(f"{segment_number}\n") 39 | srt_file.write(f"{start_time} --> {end_time}\n") 40 | srt_file.write(f"{text}\n\n") 41 | 42 | 
segment_number += 1 43 | 44 | 45 | def resample(file: str, sr: int = 16000): 46 | """ 47 | Resample the audio file to a 16 kHz mono WAV file. 48 | 49 | Args: 50 | file (str): The audio file to open. 51 | sr (int): The target sample rate to resample the audio to, if necessary. Defaults to 16000. 52 | 53 | Returns: 54 | resampled_file (str): Path to the resampled audio file. 55 | """ 56 | container = av.open(file) 57 | stream = next(s for s in container.streams if s.type == 'audio') 58 | 59 | resampler = av.AudioResampler( 60 | format='s16', 61 | layout='mono', 62 | rate=sr, 63 | ) 64 | 65 | resampled_file = Path(file).stem + "_resampled.wav" 66 | output_container = av.open(resampled_file, mode='w') 67 | output_stream = output_container.add_stream('pcm_s16le', rate=sr) 68 | output_stream.layout = 'mono' 69 | 70 | for frame in container.decode(audio=0): 71 | frame.pts = None 72 | resampled_frames = resampler.resample(frame) 73 | if resampled_frames is not None: 74 | for resampled_frame in resampled_frames: 75 | for packet in output_stream.encode(resampled_frame): 76 | output_container.mux(packet) 77 | 78 | for packet in output_stream.encode(None): 79 | output_container.mux(packet) 80 | 81 | output_container.close() 82 | return resampled_file 83 | -------------------------------------------------------------------------------- /whisper_live/vad.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import torch 4 | import numpy as np 5 | import onnxruntime 6 | import warnings 7 | 8 | 9 | class VoiceActivityDetection: 10 | 11 | def __init__(self, force_onnx_cpu=True): 12 | path = self.download() 13 | 14 | opts = onnxruntime.SessionOptions() 15 | opts.log_severity_level = 3 16 | 17 | opts.inter_op_num_threads = 1 18 | opts.intra_op_num_threads = 1 19 | 20 | if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers(): 21 | self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'], sess_options=opts) 22 | else: 23 | self.session = onnxruntime.InferenceSession(path, providers=['CUDAExecutionProvider'], sess_options=opts) 24 | 25 | self.reset_states() 26 | if '16k' in path: 27 | warnings.warn('This model supports only a 16000 Hz sampling rate!') 28 | self.sample_rates = [16000] 29 | else: 30 | self.sample_rates = [8000, 16000] 31 | 32 | def _validate_input(self, x, sr: int): 33 | if x.dim() == 1: 34 | x = x.unsqueeze(0) 35 | if x.dim() > 2: 36 | raise ValueError(f"Too many dimensions for input audio chunk {x.dim()}") 37 | 38 | if sr != 16000 and (sr % 16000 == 0): 39 | step = sr // 16000 40 | x = x[:, ::step] 41 | sr = 16000 42 | 43 | if sr not in self.sample_rates: 44 | raise ValueError(f"Supported sampling rates: {self.sample_rates} (or a multiple of 16000)") 45 | if sr / x.shape[1] > 31.25: 46 | raise ValueError("Input audio chunk is too short") 47 | 48 | return x, sr 49 | 50 | def reset_states(self, batch_size=1): 51 | self._state = torch.zeros((2, batch_size, 128)).float() 52 | self._context = torch.zeros(0) 53 | self._last_sr = 0 54 | self._last_batch_size = 0 55 | 56 | def __call__(self, x, sr: int): 57 | 58 | x, sr = self._validate_input(x, sr) 59 | num_samples = 512 if sr == 16000 else 256 60 | 61 | if x.shape[-1] != num_samples: 62 | raise ValueError(f"Provided number of samples is {x.shape[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)") 63 | 64 | batch_size = x.shape[0] 65 | context_size = 64 if sr == 16000 else 32 66 | 67 | if not self._last_batch_size: 68 | self.reset_states(batch_size) 69 | if (self._last_sr)
and (self._last_sr != sr): 70 | self.reset_states(batch_size) 71 | if (self._last_batch_size) and (self._last_batch_size != batch_size): 72 | self.reset_states(batch_size) 73 | 74 | if not len(self._context): 75 | self._context = torch.zeros(batch_size, context_size) 76 | 77 | x = torch.cat([self._context, x], dim=1) 78 | if sr in [8000, 16000]: 79 | ort_inputs = {'input': x.numpy(), 'state': self._state.numpy(), 'sr': np.array(sr, dtype='int64')} 80 | ort_outs = self.session.run(None, ort_inputs) 81 | out, state = ort_outs 82 | self._state = torch.from_numpy(state) 83 | else: 84 | raise ValueError() 85 | 86 | self._context = x[..., -context_size:] 87 | self._last_sr = sr 88 | self._last_batch_size = batch_size 89 | 90 | out = torch.from_numpy(out) 91 | return out 92 | 93 | def audio_forward(self, x, sr: int): 94 | outs = [] 95 | x, sr = self._validate_input(x, sr) 96 | self.reset_states() 97 | num_samples = 512 if sr == 16000 else 256 98 | 99 | if x.shape[1] % num_samples: 100 | pad_num = num_samples - (x.shape[1] % num_samples) 101 | x = torch.nn.functional.pad(x, (0, pad_num), 'constant', value=0.0) 102 | 103 | for i in range(0, x.shape[1], num_samples): 104 | wavs_batch = x[:, i:i+num_samples] 105 | out_chunk = self.__call__(wavs_batch, sr) 106 | outs.append(out_chunk) 107 | 108 | stacked = torch.cat(outs, dim=1) 109 | return stacked.cpu() 110 | 111 | @staticmethod 112 | def download(model_url="https://github.com/snakers4/silero-vad/raw/v5.0/files/silero_vad.onnx"): 113 | target_dir = os.path.expanduser("~/.cache/whisper-live/") 114 | 115 | # Ensure the target directory exists 116 | os.makedirs(target_dir, exist_ok=True) 117 | 118 | # Define the target file path 119 | model_filename = os.path.join(target_dir, "silero_vad.onnx") 120 | 121 | # Check if the model file already exists 122 | if not os.path.exists(model_filename): 123 | # If it doesn't exist, download the model using wget 124 | try: 125 | subprocess.run(["wget", "-O", model_filename, model_url], check=True) 126 | except subprocess.CalledProcessError: 127 | print("Failed to download the model using wget.") 128 | return model_filename 129 | 130 | 131 | class VoiceActivityDetector: 132 | def __init__(self, threshold=0.5, frame_rate=16000): 133 | """ 134 | Initializes the VoiceActivityDetector with a voice activity detection model and a threshold. 135 | 136 | Args: 137 | threshold (float, optional): The probability threshold for detecting voice activity. Defaults to 0.5. 138 | """ 139 | self.model = VoiceActivityDetection() 140 | self.threshold = threshold 141 | self.frame_rate = frame_rate 142 | 143 | def __call__(self, audio_frame): 144 | """ 145 | Determines if the given audio frame contains speech by comparing the detected speech probability against 146 | the threshold. 147 | 148 | Args: 149 | audio_frame (np.ndarray): The audio frame to be analyzed for voice activity. It is expected to be a 150 | NumPy array of audio samples. 151 | 152 | Returns: 153 | bool: True if the speech probability exceeds the threshold, indicating the presence of voice activity; 154 | False otherwise. 155 | """ 156 | speech_probs = self.model.audio_forward(torch.from_numpy(audio_frame.copy()), self.frame_rate)[0] 157 | return torch.any(speech_probs > self.threshold).item() 158 | --------------------------------------------------------------------------------
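The two VAD classes above are easiest to understand from the caller's side. The following is a minimal usage sketch, not part of the repository, showing how VoiceActivityDetector can gate incoming audio before it is handed to a transcription backend; the one-second, 16 kHz, mono float32 chunk is an assumed input format chosen to match the frame_rate default and the model's supported sample rates. Note that constructing the detector downloads the Silero VAD ONNX model into ~/.cache/whisper-live/ on first use.

import numpy as np
from whisper_live.vad import VoiceActivityDetector

# Probability threshold for speech and the expected sample rate of incoming chunks.
vad = VoiceActivityDetector(threshold=0.5, frame_rate=16000)

# Stand-in for a one-second, 16 kHz, mono float32 chunk captured from a microphone.
audio_chunk = np.zeros(16000, dtype=np.float32)

if vad(audio_chunk):
    print("Speech detected, forward the chunk to the transcriber.")
else:
    print("No speech detected, skip this chunk.")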
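Similarly, here is a short sketch of how the helpers in whisper_live/utils.py fit together, with hypothetical file names and segment values: resample() decodes the input file and writes "<stem>_resampled.wav" as 16 kHz mono PCM in the current working directory, and create_srt_file() serializes a list of segments with 'start', 'end', and 'text' keys into an SRT file.

from whisper_live.utils import resample, create_srt_file

# Produces "meeting_recording_resampled.wav" at 16 kHz mono (the input file is hypothetical).
wav_path = resample("meeting_recording.mp3")
print(wav_path)

# Hypothetical segments in the shape create_srt_file() expects.
segments = [
    {"start": "0.00", "end": "2.40", "text": "Hello and welcome."},
    {"start": "2.40", "end": "5.10", "text": "Let's get started."},
]
create_srt_file(segments, "meeting_recording.srt")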