├── .github └── workflows │ └── ci.yml ├── Audio-Transcription-Chrome ├── README.md ├── background.js ├── content.js ├── icon128.png ├── manifest.json ├── options.html ├── options.js ├── popup.html ├── popup.js └── style.css ├── Audio-Transcription-Firefox ├── README.md ├── background.js ├── content.js ├── icon128.png ├── manifest.json ├── popup.html ├── popup.js └── style.css ├── LICENSE ├── README.md ├── TensorRT_whisper.md ├── assets └── jfk.flac ├── docker ├── Dockerfile.cpu ├── Dockerfile.gpu ├── Dockerfile.openvino └── Dockerfile.tensorrt ├── docs ├── .nojekyll ├── doctrees │ ├── environment.pickle │ └── index.doctree ├── html │ ├── .buildinfo │ ├── _sources │ │ └── index.rst.txt │ ├── _static │ │ ├── alabaster.css │ │ ├── basic.css │ │ ├── custom.css │ │ ├── doctools.js │ │ ├── documentation_options.js │ │ ├── file.png │ │ ├── language_data.js │ │ ├── minus.png │ │ ├── plus.png │ │ ├── pygments.css │ │ ├── searchtools.js │ │ └── sphinx_highlight.js │ ├── genindex.html │ ├── index.html │ ├── objects.inv │ ├── py-modindex.html │ ├── search.html │ └── searchindex.js └── index.html ├── requirements ├── client.txt └── server.txt ├── run_server.py ├── scripts ├── build_whisper_tensorrt.sh └── setup.sh ├── setup.py ├── tests ├── __init__.py ├── test_client.py ├── test_server.py └── test_vad.py └── whisper_live ├── __init__.py ├── __version__.py ├── backend ├── __init__.py ├── base.py ├── faster_whisper_backend.py ├── openvino_backend.py └── trt_backend.py ├── client.py ├── server.py ├── transcriber ├── __init__.py ├── tensorrt_utils.py ├── transcriber_faster_whisper.py ├── transcriber_openvino.py └── transcriber_tensorrt.py ├── utils.py └── vad.py /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Test & Build CI/CD 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | tags: 8 | - v* 9 | pull_request: 10 | branches: [ main ] 11 | types: [opened, synchronize, reopened] 12 | 13 | jobs: 14 | run-tests: 15 | runs-on: ubuntu-22.04 16 | strategy: 17 | matrix: 18 | python-version: [3.9, '3.10', 3.11, 3.12] 19 | steps: 20 | - uses: actions/checkout@v2 21 | 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | 27 | - name: Cache Python dependencies 28 | uses: actions/cache@v4 29 | with: 30 | path: | 31 | ~/.cache/pip 32 | !~/.cache/pip/log 33 | key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('requirements/server.txt', 'requirements/client.txt') }} 34 | restore-keys: | 35 | ${{ runner.os }}-pip-${{ matrix.python-version }}- 36 | 37 | - name: Install system dependencies 38 | run: sudo apt-get update && sudo apt-get install -y portaudio19-dev 39 | 40 | - name: Install Python dependencies 41 | run: | 42 | python -m pip install --upgrade pip 43 | pip install -r requirements/server.txt --extra-index-url https://download.pytorch.org/whl/cpu 44 | pip install -r requirements/client.txt 45 | 46 | - name: Run tests 47 | run: | 48 | echo "Running tests with Python ${{ matrix.python-version }}" 49 | python -m unittest discover -s tests 50 | 51 | check-code-format: 52 | runs-on: ubuntu-22.04 53 | strategy: 54 | matrix: 55 | python-version: [3.9, '3.10', 3.11, 3.12] 56 | 57 | steps: 58 | - uses: actions/checkout@v2 59 | 60 | - name: Set up Python ${{ matrix.python-version }} 61 | uses: actions/setup-python@v2 62 | with: 63 | python-version: ${{ matrix.python-version }} 64 | 65 | - name: Install dependencies 66 | run: | 67 | 
python -m pip install --upgrade pip 68 | python -m pip install flake8 69 | 70 | - name: Lint with flake8 71 | run: | 72 | # stop the build if there are Python syntax errors or undefined names 73 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 74 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 75 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 76 | 77 | build-and-push-docker-cpu: 78 | needs: [run-tests, check-code-format] 79 | runs-on: ubuntu-22.04 80 | if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')) 81 | steps: 82 | - uses: actions/checkout@v2 83 | 84 | - name: Log in to GitHub Container Registry 85 | uses: docker/login-action@v1 86 | with: 87 | registry: ghcr.io 88 | username: ${{ github.repository_owner }} 89 | password: ${{ secrets.GHCR_TOKEN }} 90 | 91 | - name: Set up Docker Buildx 92 | uses: docker/setup-buildx-action@v1 93 | 94 | - name: Build and push Docker image 95 | uses: docker/build-push-action@v2 96 | with: 97 | context: . 98 | file: docker/Dockerfile.cpu 99 | push: true 100 | tags: ghcr.io/collabora/whisperlive-cpu:latest 101 | 102 | build-and-push-docker-gpu: 103 | needs: [run-tests, check-code-format, build-and-push-docker-cpu] 104 | timeout-minutes: 20 105 | runs-on: ubuntu-22.04 106 | if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')) 107 | steps: 108 | - uses: actions/checkout@v2 109 | 110 | - name: Log in to GitHub Container Registry 111 | uses: docker/login-action@v1 112 | with: 113 | registry: ghcr.io 114 | username: ${{ github.repository_owner }} 115 | password: ${{ secrets.GHCR_TOKEN }} 116 | 117 | - name: Docker Prune 118 | run: docker system prune -af 119 | 120 | - name: Set up Docker Buildx 121 | uses: docker/setup-buildx-action@v1 122 | 123 | - name: Build and push Docker GPU image 124 | uses: docker/build-push-action@v2 125 | with: 126 | context: . 127 | file: docker/Dockerfile.gpu 128 | push: true 129 | tags: ghcr.io/collabora/whisperlive-gpu:latest 130 | 131 | build-and-push-docker-openvino: 132 | needs: [run-tests, check-code-format, build-and-push-docker-cpu] 133 | timeout-minutes: 20 134 | runs-on: ubuntu-22.04 135 | if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')) 136 | steps: 137 | - uses: actions/checkout@v2 138 | 139 | - name: Log in to GitHub Container Registry 140 | uses: docker/login-action@v1 141 | with: 142 | registry: ghcr.io 143 | username: ${{ github.repository_owner }} 144 | password: ${{ secrets.GHCR_TOKEN }} 145 | 146 | - name: Docker Prune 147 | run: docker system prune -af 148 | 149 | - name: Set up Docker Buildx 150 | uses: docker/setup-buildx-action@v1 151 | 152 | - name: Build and push Docker GPU image 153 | uses: docker/build-push-action@v2 154 | with: 155 | context: . 
156 | file: docker/Dockerfile.openvino 157 | push: true 158 | tags: ghcr.io/collabora/whisperlive-openvino:latest 159 | 160 | publish-to-pypi: 161 | needs: [run-tests, check-code-format] 162 | runs-on: ubuntu-22.04 163 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') 164 | steps: 165 | - uses: actions/checkout@v2 166 | 167 | - name: Set up Python 3.9 168 | uses: actions/setup-python@v2 169 | with: 170 | python-version: 3.9 171 | 172 | - name: Cache Python dependencies 173 | uses: actions/cache@v4 174 | with: 175 | path: | 176 | ~/.cache/pip 177 | !~/.cache/pip/log 178 | key: ubuntu-latest-pip-3.9-${{ hashFiles('requirements/server.txt', 'requirements/client.txt') }} 179 | restore-keys: | 180 | ubuntu-latest-pip-3.9- 181 | 182 | - name: Install system dependencies 183 | run: sudo apt-get update && sudo apt-get install -y portaudio19-dev 184 | 185 | - name: Install Python dependencies 186 | run: | 187 | pip install -r requirements/server.txt 188 | pip install -r requirements/client.txt 189 | pip install wheel 190 | 191 | - name: Build package 192 | run: python setup.py sdist bdist_wheel 193 | 194 | - name: Publish package to PyPI 195 | uses: pypa/gh-action-pypi-publish@release/v1 196 | with: 197 | user: __token__ 198 | password: ${{ secrets.PYPI_API_TOKEN }} 199 | -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/README.md: -------------------------------------------------------------------------------- 1 | # Audio Transcription 2 | 3 | Audio Transcription is a Chrome extension that allows users to capture any audio playing on the current tab and transcribe it using OpenAI-whisper in real time. Users can also enable voice activity detection so that no audio is sent to the server when there is no speech. 4 | 5 | We use the OpenAI-whisper model to process the audio continuously and send the transcription back to the client. We apply a few optimizations on top of OpenAI's implementation to improve performance and run it in real time. To this end, we use [faster-whisper](https://github.com/guillaumekln/faster-whisper), which is 4x faster than OpenAI's implementation. 6 | 7 | ## Loading the Extension 8 | - Open the Google Chrome browser. 9 | - Type chrome://extensions in the address bar and press Enter. 10 | - Enable the Developer mode toggle switch located in the top right corner. 11 | - Clone this repository. 12 | - Click the Load unpacked button. 13 | - Browse to the location where you cloned the repository files and select the ```Audio-Transcription-Chrome``` folder. 14 | - The extension should now be loaded and visible on the extensions page. 15 | 16 | 17 | ## Real time transcription with OpenAI-whisper 18 | This Chrome extension allows you to send audio from your browser to a server that transcribes it in real time. It can also run voice activity detection on the client side to detect when speech is present, and it continuously receives transcriptions of the spoken content from the server. You can choose from the options menu whether to run the speech recognition. 19 | 20 | 21 | ## Implementation Details 22 | 23 | ### Capturing Audio 24 | To capture the audio in the current tab, we use the Chrome `tabCapture` API to obtain a `MediaStream` object for the current tab. 25 | 26 | ### Options 27 | When using the Audio Transcription extension, you have the following options: 28 | - **Use Collabora Server**: We provide a demo server which runs the whisper small model.
29 | - **Language**: Select the target language for transcription or translation. You can choose from a variety of languages supported by OpenAI-whisper. 30 | - **Task**: Choose the specific task to perform on the audio. You can select either "transcribe" for transcription or "translate" to translate the audio to English. 31 | - **Model Size**: Select the whisper model size to run the server with. 32 | 33 | ### Getting Started 34 | - Make sure the transcription server is running properly. For details on how to start the server, see the [documentation here](https://github.com/collabora/whisper-live). 35 | - Click the Chrome extension icon, which shows two options: 36 | - **Start Capture**: Starts capturing the audio in the current tab and sends the captured audio to the server for transcription. This also creates an element to show the transcriptions received from the server on the current tab. 37 | - **Stop Capture**: Stops capturing the audio. 38 | 39 | 40 | ## Limitations 41 | This extension requires an internet connection to stream audio and receive transcriptions. The accuracy of the transcriptions may vary depending on the audio quality and the performance of the server-side transcription service. The extension may consume additional system resources while running, especially when streaming audio. 42 | 43 | ## Note 44 | The extension relies on a properly running transcription server with multilingual support. Please follow the server documentation for setup and configuration. 45 | 46 | -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/background.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Removes a tab with the specified tab ID in Google Chrome. 3 | * @param {number} tabId - The ID of the tab to be removed. 4 | * @returns {Promise} A promise that resolves when the tab is successfully removed or fails to remove. 5 | */ 6 | function removeChromeTab(tabId) { 7 | return new Promise((resolve) => { 8 | chrome.tabs.remove(tabId) 9 | .then(resolve) 10 | .catch(resolve); 11 | }); 12 | } 13 | 14 | 15 | /** 16 | * Executes a script file in a specific tab in Google Chrome. 17 | * @param {number} tabId - The ID of the tab where the script should be executed. 18 | * @param {string} file - The file path or URL of the script to be executed. 19 | * @returns {Promise} A promise that resolves when the script is successfully executed or fails to execute. 20 | */ 21 | function executeScriptInTab(tabId, file) { 22 | return new Promise((resolve) => { 23 | chrome.scripting.executeScript( 24 | { 25 | target: { tabId }, 26 | files: [file], 27 | }, () => { 28 | resolve(); 29 | } 30 | ); 31 | }); 32 | } 33 | 34 | 35 | /** 36 | * Opens the options page of the Chrome extension in a new pinned tab. 37 | * @returns {Promise} A promise that resolves with the created tab object. 38 | */ 39 | function openExtensionOptions() { 40 | return new Promise((resolve) => { 41 | chrome.tabs.create( 42 | { 43 | pinned: true, 44 | active: false, 45 | url: `chrome-extension://${chrome.runtime.id}/options.html`, 46 | }, 47 | (tab) => { 48 | resolve(tab); 49 | } 50 | ); 51 | }); 52 | } 53 | 54 | 55 | /** 56 | * Retrieves the value associated with the specified key from the local storage in Google Chrome. 57 | * @param {string} key - The key of the value to retrieve from the local storage. 58 | * @returns {Promise} A promise that resolves with the retrieved value from the local storage.
59 | */ 60 | function getLocalStorageValue(key) { 61 | return new Promise((resolve) => { 62 | chrome.storage.local.get([key], (result) => { 63 | resolve(result[key]); 64 | }); 65 | }); 66 | } 67 | 68 | 69 | /** 70 | * Sends a message to a specific tab in Google Chrome. 71 | * @param {number} tabId - The ID of the tab to send the message to. 72 | * @param {any} data - The data to be sent as the message. 73 | * @returns {Promise} A promise that resolves with the response from the tab. 74 | */ 75 | function sendMessageToTab(tabId, data) { 76 | return new Promise((resolve) => { 77 | chrome.tabs.sendMessage(tabId, data, (response) => { 78 | resolve(response); 79 | }); 80 | }); 81 | } 82 | 83 | 84 | /** 85 | * Delays the execution for a specified duration. 86 | * @param {number} ms - The duration to sleep in milliseconds (default: 0). 87 | * @returns {Promise} A promise that resolves after the specified duration. 88 | */ 89 | function delayExecution(ms = 0) { 90 | return new Promise((resolve) => setTimeout(resolve, ms)); 91 | } 92 | 93 | 94 | /** 95 | * Sets a value associated with the specified key in the local storage of Google Chrome. 96 | * @param {string} key - The key to set in the local storage. 97 | * @param {any} value - The value to associate with the key in the local storage. 98 | * @returns {Promise} A promise that resolves with the value that was set in the local storage. 99 | */ 100 | function setLocalStorageValue(key, value) { 101 | return new Promise((resolve) => { 102 | chrome.storage.local.set( 103 | { 104 | [key]: value, 105 | }, () => { 106 | resolve(value); 107 | } 108 | ); 109 | }); 110 | } 111 | 112 | 113 | /** 114 | * Retrieves the tab object with the specified tabId. 115 | * @param {number} tabId - The ID of the tab to retrieve. 116 | * @returns {Promise} - A Promise that resolves to the tab object. 117 | */ 118 | async function getTab(tabId) { 119 | return new Promise((resolve) => { 120 | chrome.tabs.get(tabId, (tab) => { 121 | resolve(tab); 122 | }); 123 | }); 124 | } 125 | 126 | 127 | /** 128 | * Starts the capture process for the specified tab. 129 | * @param {number} tabId - The ID of the tab to start capturing. 130 | * @returns {Promise} - A Promise that resolves when the capture process is started successfully. 131 | */ 132 | async function startCapture(options) { 133 | const { tabId } = options; 134 | const optionTabId = await getLocalStorageValue("optionTabId"); 135 | if (optionTabId) { 136 | await removeChromeTab(optionTabId); 137 | } 138 | 139 | try { 140 | const currentTab = await getTab(tabId); 141 | if (currentTab.audible) { 142 | await setLocalStorageValue("currentTabId", currentTab.id); 143 | await executeScriptInTab(currentTab.id, "content.js"); 144 | await delayExecution(500); 145 | 146 | const optionTab = await openExtensionOptions(); 147 | 148 | await setLocalStorageValue("optionTabId", optionTab.id); 149 | await delayExecution(500); 150 | 151 | await sendMessageToTab(optionTab.id, { 152 | type: "start_capture", 153 | data: { 154 | currentTabId: currentTab.id, 155 | host: options.host, 156 | port: options.port, 157 | multilingual: options.useMultilingual, 158 | language: options.language, 159 | task: options.task, 160 | modelSize: options.modelSize, 161 | useVad: options.useVad, 162 | }, 163 | }); 164 | } else { 165 | console.log("No Audio"); 166 | } 167 | } catch (error) { 168 | console.error("Error occurred while starting capture:", error); 169 | } 170 | } 171 | 172 | 173 | /** 174 | * Stops the capture process and performs cleanup. 
175 | * @returns {Promise} - A Promise that resolves when the capture process is stopped successfully. 176 | */ 177 | async function stopCapture() { 178 | const optionTabId = await getLocalStorageValue("optionTabId"); 179 | const currentTabId = await getLocalStorageValue("currentTabId"); 180 | 181 | if (optionTabId) { 182 | res = await sendMessageToTab(currentTabId, { 183 | type: "STOP", 184 | data: { currentTabId: currentTabId }, 185 | }); 186 | await removeChromeTab(optionTabId); 187 | } 188 | } 189 | 190 | 191 | /** 192 | * Listens for messages from the runtime and performs corresponding actions. 193 | * @param {Object} message - The message received from the runtime. 194 | */ 195 | chrome.runtime.onMessage.addListener(async (message) => { 196 | if (message.action === "startCapture") { 197 | startCapture(message); 198 | } else if (message.action === "stopCapture") { 199 | stopCapture(); 200 | } else if (message.action === "updateSelectedLanguage") { 201 | const detectedLanguage = message.detectedLanguage; 202 | chrome.runtime.sendMessage({ action: "updateSelectedLanguage", detectedLanguage }); 203 | chrome.storage.local.set({ selectedLanguage: detectedLanguage }); 204 | } else if (message.action === "toggleCaptureButtons") { 205 | chrome.runtime.sendMessage({ action: "toggleCaptureButtons", data: false }); 206 | chrome.storage.local.set({ capturingState: { isCapturing: false } }) 207 | stopCapture(); 208 | } 209 | }); 210 | 211 | 212 | -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/content.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | var elem_container = null; 4 | var elem_text = null; 5 | 6 | var segments = []; 7 | var text_segments = []; 8 | 9 | function initPopupElement() { 10 | if (document.getElementById('popupElement')) { 11 | return; 12 | } 13 | 14 | const popupContainer = document.createElement('div'); 15 | popupContainer.id = 'popupElement'; 16 | popupContainer.style.cssText = 'position: fixed; top: 50%; left: 50%; transform: translate(-50%, -50%); background: white; color: black; padding: 16px; border-radius: 10px; box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.5); display: none; text-align: center;'; 17 | 18 | const popupText = document.createElement('span'); 19 | popupText.textContent = 'Default Text'; 20 | popupText.className = 'popupText'; 21 | popupText.style.fontSize = '24px'; 22 | popupContainer.appendChild(popupText); 23 | 24 | const buttonContainer = document.createElement('div'); 25 | buttonContainer.style.marginTop = '8px'; 26 | const closePopupButton = document.createElement('button'); 27 | closePopupButton.textContent = 'Close'; 28 | closePopupButton.style.backgroundColor = '#65428A'; 29 | closePopupButton.style.color = 'white'; 30 | closePopupButton.style.border = 'none'; 31 | closePopupButton.style.padding = '8px 16px'; // Add padding for better click area 32 | closePopupButton.style.cursor = 'pointer'; 33 | closePopupButton.addEventListener('click', async () => { 34 | popupContainer.style.display = 'none'; 35 | await browser.runtime.sendMessage({ action: 'toggleCaptureButtons', data: false }); 36 | }); 37 | buttonContainer.appendChild(closePopupButton); 38 | popupContainer.appendChild(buttonContainer); 39 | 40 | document.body.appendChild(popupContainer); 41 | } 42 | 43 | 44 | function showPopup(customText) { 45 | const popup = document.getElementById('popupElement'); 46 | const popupText = popup.querySelector('.popupText'); 47 | 48 | if (popup && 
popupText) { 49 | popupText.textContent = customText || 'Default Text'; // Set default text if custom text is not provided 50 | popup.style.display = 'block'; 51 | } 52 | } 53 | 54 | 55 | function init_element() { 56 | if (document.getElementById('transcription')) { 57 | return; 58 | } 59 | 60 | elem_container = document.createElement('div'); 61 | elem_container.id = "transcription"; 62 | elem_container.style.cssText = 'padding-top:16px;font-size:18px;position: fixed; top: 85%; left: 50%; transform: translate(-50%, -50%);line-height:18px;width:500px;height:90px;opacity:0.9;z-index:100;background:black;border-radius:10px;color:white;'; 63 | 64 | for (var i = 0; i < 4; i++) { 65 | elem_text = document.createElement('span'); 66 | elem_text.style.cssText = 'position: absolute;padding-left:16px;padding-right:16px;'; 67 | elem_text.id = "t" + i; 68 | elem_container.appendChild(elem_text); 69 | 70 | if (i == 3) { 71 | elem_text.style.top = "-1000px" 72 | } 73 | } 74 | 75 | document.body.appendChild(elem_container); 76 | 77 | let x = 0; 78 | let y = 0; 79 | 80 | // Query the element 81 | const ele = elem_container; 82 | 83 | // Handle the mousedown event 84 | // that's triggered when user drags the element 85 | const mouseDownHandler = function (e) { 86 | // Get the current mouse position 87 | x = e.clientX; 88 | y = e.clientY; 89 | 90 | // Attach the listeners to `document` 91 | document.addEventListener('mousemove', mouseMoveHandler); 92 | document.addEventListener('mouseup', mouseUpHandler); 93 | }; 94 | 95 | const mouseMoveHandler = function (e) { 96 | // How far the mouse has been moved 97 | const dx = e.clientX - x; 98 | const dy = e.clientY - y; 99 | 100 | // Set the position of element 101 | ele.style.top = `${ele.offsetTop + dy}px`; 102 | ele.style.left = `${ele.offsetLeft + dx}px`; 103 | 104 | // Reassign the position of mouse 105 | x = e.clientX; 106 | y = e.clientY; 107 | }; 108 | 109 | const mouseUpHandler = function () { 110 | // Remove the handlers of `mousemove` and `mouseup` 111 | document.removeEventListener('mousemove', mouseMoveHandler); 112 | document.removeEventListener('mouseup', mouseUpHandler); 113 | }; 114 | 115 | ele.addEventListener('mousedown', mouseDownHandler); 116 | } 117 | 118 | function getStyle(el,styleProp) 119 | { 120 | var x = document.getElementById(el); 121 | if (x.currentStyle) 122 | var y = x.currentStyle[styleProp]; 123 | else if (window.getComputedStyle) 124 | var y = document.defaultView.getComputedStyle(x,null).getPropertyValue(styleProp); 125 | return y; 126 | } 127 | 128 | function get_lines(elem, line_height) { 129 | var divHeight = elem.offsetHeight; 130 | var lines = divHeight / line_height; 131 | 132 | var original_text = elem.innerHTML; 133 | 134 | var words = original_text.split(' '); 135 | var segments = []; 136 | var current_lines = 1; 137 | var segment = ''; 138 | var segment_len = 0; 139 | for (var i = 0; i < words.length; i++) 140 | { 141 | segment += words[i] + ' '; 142 | elem.innerHTML = segment; 143 | divHeight = elem.offsetHeight; 144 | 145 | if ((divHeight / line_height) > current_lines) { 146 | var line_segment = segment.substring(segment_len, segment.length - 1 - words[i].length - 1); 147 | segments.push(line_segment); 148 | segment_len += line_segment.length + 1; 149 | current_lines++; 150 | } 151 | } 152 | 153 | var line_segment = segment.substring(segment_len, segment.length - 1) 154 | segments.push(line_segment); 155 | 156 | elem.innerHTML = original_text; 157 | 158 | return segments; 159 | 160 | } 161 | 162 | function 
remove_element() { 163 | var elem = document.getElementById('transcription') 164 | for (var i = 0; i < 4; i++) { 165 | document.getElementById("t" + i).remove(); 166 | } 167 | elem.remove() 168 | } 169 | 170 | chrome.runtime.onMessage.addListener((request, sender, sendResponse) => { 171 | const { type, data } = request; 172 | 173 | if (type === "STOP") { 174 | remove_element(); 175 | sendResponse({data: "STOPPED"}); 176 | return true; 177 | } else if (type === "showWaitPopup"){ 178 | initPopupElement(); 179 | 180 | showPopup(`Estimated wait time ~ ${Math.round(data)} minutes`); 181 | sendResponse({data: "popup"}); 182 | return true; 183 | } 184 | 185 | init_element(); 186 | 187 | message = JSON.parse(data); 188 | message = message["segments"]; 189 | 190 | var text = ''; 191 | for (var i = 0; i < message.length; i++) { 192 | text += message[i].text + ' '; 193 | } 194 | text = text.replace(/(\r\n|\n|\r)/gm, ""); 195 | 196 | var elem = document.getElementById('t3'); 197 | elem.innerHTML = text; 198 | 199 | var line_height_style = getStyle('t3', 'line-height'); 200 | var line_height = parseInt(line_height_style.substring(0, line_height_style.length - 2)); 201 | var divHeight = elem.offsetHeight; 202 | var lines = divHeight / line_height; 203 | 204 | text_segments = []; 205 | text_segments = get_lines(elem, line_height); 206 | 207 | elem.innerHTML = ''; 208 | 209 | if (text_segments.length > 2) { 210 | for (var i = 0; i < 3; i++) { 211 | document.getElementById('t' + i).innerHTML = text_segments[text_segments.length - 3 + i]; 212 | } 213 | } else { 214 | for (var i = 0; i < 3; i++) { 215 | document.getElementById('t' + i).innerHTML = ''; 216 | } 217 | } 218 | 219 | if (text_segments.length <= 2) { 220 | for (var i = 0; i < text_segments.length; i++) { 221 | document.getElementById('t' + i).innerHTML = text_segments[i]; 222 | } 223 | } else { 224 | for (var i = 0; i < 3; i++) { 225 | document.getElementById('t' + i).innerHTML = text_segments[text_segments.length - 3 + i]; 226 | } 227 | } 228 | 229 | for (var i = 1; i < 3; i++) 230 | { 231 | var parent_elem = document.getElementById('t' + (i - 1)); 232 | var elem = document.getElementById('t' + i); 233 | elem.style.top = parent_elem.offsetHeight + parent_elem.offsetTop + 'px'; 234 | } 235 | 236 | sendResponse({}); 237 | return true; 238 | }); 239 | -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/icon128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/Audio-Transcription-Chrome/icon128.png -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest_version": 3, 3 | 4 | "name": "Audio Transcription", 5 | "version": "1.0.0", 6 | "description": "This extension captures the audio on the current tab, sends it to a server for transcription and shows the transcription in Real-time.", 7 | 8 | "options_page": "options.html", 9 | "background": { 10 | "service_worker": "background.js" 11 | }, 12 | "permissions": [ 13 | "storage", 14 | "activeTab", 15 | "tabCapture", 16 | "scripting" 17 | ], 18 | "icons": { 19 | "128":"icon128.png" 20 | }, 21 | "action": { 22 | "default_popup": "popup.html", 23 | "default_icon": "icon128.png" 24 | } 25 | } 26 | 
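The README above describes the real-time flow only at a high level; concretely, the extension opens a WebSocket to the transcription server, sends a single JSON configuration message, and then streams raw 16 kHz Float32 audio while the server pushes back JSON segment updates (see options.js further down). The sketch below is a minimal, standalone illustration of that handshake, not the extension's actual code: the default host/port, the `onSegments` callback, and the use of `crypto.randomUUID()` (instead of the extension's own `generateUUID` helper) are assumptions made for the example.

```javascript
// Minimal sketch of the WhisperLive client handshake used by the extension.
// Assumes a server reachable at ws://localhost:9090 (placeholder) and a caller
// that supplies Float32Array chunks already resampled to 16 kHz.
function createTranscriptionSocket({
  host = "localhost", port = "9090",            // placeholders, not the extension defaults
  language = null, task = "transcribe", model = "small", useVad = true,
  onSegments = () => {},                        // placeholder callback for transcript segments
} = {}) {
  const uid = crypto.randomUUID();              // unique client id, echoed back by the server
  const socket = new WebSocket(`ws://${host}:${port}/`);
  let serverReady = false;

  socket.onopen = () => {
    // First message is the JSON config; everything after it is raw audio.
    socket.send(JSON.stringify({ uid, language, task, model, use_vad: useVad }));
  };

  socket.onmessage = (event) => {
    const data = JSON.parse(event.data);
    if (data.uid !== uid) return;                       // ignore messages for other clients
    if (data.status === "WAIT") return;                 // server busy; data.message holds the wait estimate
    if (!serverReady) { serverReady = true; return; }   // first reply after the config = ready signal
    if (data.message === "DISCONNECT") { socket.close(); return; }
    if (data.segments) onSegments(data.segments);       // [{ text, ... }, ...]
  };

  return {
    // chunk: Float32Array at 16 kHz (see resampleTo16kHZ in options.js)
    sendAudio(chunk) {
      if (serverReady && socket.readyState === WebSocket.OPEN) socket.send(chunk);
    },
    close() { socket.close(); },
  };
}
```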
-------------------------------------------------------------------------------- /Audio-Transcription-Chrome/options.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Audio Transcription Options 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/options.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Captures audio from the active tab in Google Chrome. 3 | * @returns {Promise} A promise that resolves with the captured audio stream. 4 | */ 5 | function captureTabAudio() { 6 | return new Promise((resolve) => { 7 | chrome.tabCapture.capture( 8 | { 9 | audio: true, 10 | video: false, 11 | }, 12 | (stream) => { 13 | resolve(stream); 14 | } 15 | ); 16 | }); 17 | } 18 | 19 | 20 | /** 21 | * Sends a message to a specific tab in Google Chrome. 22 | * @param {number} tabId - The ID of the tab to send the message to. 23 | * @param {any} data - The data to be sent as the message. 24 | * @returns {Promise} A promise that resolves with the response from the tab. 25 | */ 26 | function sendMessageToTab(tabId, data) { 27 | return new Promise((resolve) => { 28 | chrome.tabs.sendMessage(tabId, data, (response) => { 29 | resolve(response); 30 | }); 31 | }); 32 | } 33 | 34 | 35 | /** 36 | * Resamples the audio data to a target sample rate of 16kHz. 37 | * @param {Array|ArrayBuffer|TypedArray} audioData - The input audio data. 38 | * @param {number} [origSampleRate=44100] - The original sample rate of the audio data. 39 | * @returns {Float32Array} The resampled audio data at 16kHz. 40 | */ 41 | function resampleTo16kHZ(audioData, origSampleRate = 44100) { 42 | // Convert the audio data to a Float32Array 43 | const data = new Float32Array(audioData); 44 | 45 | // Calculate the desired length of the resampled data 46 | const targetLength = Math.round(data.length * (16000 / origSampleRate)); 47 | 48 | // Create a new Float32Array for the resampled data 49 | const resampledData = new Float32Array(targetLength); 50 | 51 | // Calculate the spring factor and initialize the first and last values 52 | const springFactor = (data.length - 1) / (targetLength - 1); 53 | resampledData[0] = data[0]; 54 | resampledData[targetLength - 1] = data[data.length - 1]; 55 | 56 | // Resample the audio data 57 | for (let i = 1; i < targetLength - 1; i++) { 58 | const index = i * springFactor; 59 | const leftIndex = Math.floor(index).toFixed(); 60 | const rightIndex = Math.ceil(index).toFixed(); 61 | const fraction = index - leftIndex; 62 | resampledData[i] = data[leftIndex] + (data[rightIndex] - data[leftIndex]) * fraction; 63 | } 64 | 65 | // Return the resampled data 66 | return resampledData; 67 | } 68 | 69 | function generateUUID() { 70 | let dt = new Date().getTime(); 71 | const uuid = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) { 72 | const r = (dt + Math.random() * 16) % 16 | 0; 73 | dt = Math.floor(dt / 16); 74 | return (c === 'x' ? r : (r & 0x3 | 0x8)).toString(16); 75 | }); 76 | return uuid; 77 | } 78 | 79 | 80 | /** 81 | * Starts recording audio from the captured tab. 82 | * @param {Object} option - The options object containing the currentTabId. 
83 | */ 84 | async function startRecord(option) { 85 | const stream = await captureTabAudio(); 86 | const uuid = generateUUID(); 87 | 88 | if (stream) { 89 | // call when the stream inactive 90 | stream.oninactive = () => { 91 | window.close(); 92 | }; 93 | const socket = new WebSocket(`ws://${option.host}:${option.port}/`); 94 | let isServerReady = false; 95 | let language = option.language; 96 | socket.onopen = function(e) { 97 | socket.send( 98 | JSON.stringify({ 99 | uid: uuid, 100 | language: option.language, 101 | task: option.task, 102 | model: option.modelSize, 103 | use_vad: option.useVad 104 | }) 105 | ); 106 | }; 107 | 108 | socket.onmessage = async (event) => { 109 | const data = JSON.parse(event.data); 110 | if (data["uid"] !== uuid) 111 | return; 112 | 113 | if (data["status"] === "WAIT"){ 114 | await sendMessageToTab(option.currentTabId, { 115 | type: "showWaitPopup", 116 | data: data["message"], 117 | }); 118 | chrome.runtime.sendMessage({ action: "toggleCaptureButtons", data: false }) 119 | chrome.runtime.sendMessage({ action: "stopCapture" }) 120 | return; 121 | } 122 | 123 | if (isServerReady === false){ 124 | isServerReady = true; 125 | return; 126 | } 127 | 128 | if (language === null) { 129 | language = data["language"]; 130 | 131 | // send message to popup.js to update dropdown 132 | // console.log(language); 133 | chrome.runtime.sendMessage({ 134 | action: "updateSelectedLanguage", 135 | detectedLanguage: language, 136 | }); 137 | 138 | return; 139 | } 140 | 141 | if (data["message"] === "DISCONNECT"){ 142 | chrome.runtime.sendMessage({ action: "toggleCaptureButtons", data: false }) 143 | return; 144 | } 145 | 146 | res = await sendMessageToTab(option.currentTabId, { 147 | type: "transcript", 148 | data: event.data, 149 | }); 150 | }; 151 | 152 | 153 | const audioDataCache = []; 154 | const context = new AudioContext(); 155 | const mediaStream = context.createMediaStreamSource(stream); 156 | const recorder = context.createScriptProcessor(4096, 1, 1); 157 | 158 | recorder.onaudioprocess = async (event) => { 159 | if (!context || !isServerReady) return; 160 | 161 | const inputData = event.inputBuffer.getChannelData(0); 162 | const audioData16kHz = resampleTo16kHZ(inputData, context.sampleRate); 163 | 164 | audioDataCache.push(inputData); 165 | 166 | socket.send(audioData16kHz); 167 | }; 168 | 169 | // Prevent page mute 170 | mediaStream.connect(recorder); 171 | recorder.connect(context.destination); 172 | mediaStream.connect(context.destination); 173 | // } 174 | } else { 175 | window.close(); 176 | } 177 | } 178 | 179 | /** 180 | * Listener for incoming messages from the extension's background script. 181 | * @param {Object} request - The message request object. 182 | * @param {Object} sender - The sender object containing information about the message sender. 183 | * @param {Function} sendResponse - The function to send a response back to the message sender. 184 | */ 185 | chrome.runtime.onMessage.addListener((request, sender, sendResponse) => { 186 | const { type, data } = request; 187 | 188 | switch (type) { 189 | case "start_capture": 190 | startRecord(data); 191 | break; 192 | default: 193 | break; 194 | } 195 | 196 | sendResponse({}); 197 | return true; 198 | }); 199 | -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/popup.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Audio Capture 5 | 6 | 7 | 8 | 9 |

[popup.html markup is not preserved in this dump; the recoverable text is the "Audio Transcription" header and the "Start Capture" / "Stop Capture" buttons, along with the checkbox and dropdown controls that popup.js queries by ID.]
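Because the popup markup itself could not be recovered, the sketch below shows, in plain DOM calls, the minimal set of elements that popup.js (the next file) expects to find. Only the element IDs are taken from popup.js; the tags, labels, and option values are illustrative assumptions, not the real popup.html contents.

```javascript
// Illustrative only: IDs match what popup.js queries; layout and option lists
// of the real popup.html are not reproduced here.
function buildMinimalPopup(root = document.body) {
  const make = (tag, id, text) => {
    const el = document.createElement(tag);
    el.id = id;
    if (text) el.textContent = text;
    root.appendChild(el);
    return el;
  };

  make("button", "startCapture", "Start Capture");
  make("button", "stopCapture", "Stop Capture");

  const useServer = make("input", "useServerCheckbox");
  useServer.type = "checkbox";
  const useVad = make("input", "useVadCheckbox");
  useVad.type = "checkbox";

  // The real popup lists many Whisper languages and model sizes; a couple of
  // placeholder options per dropdown are enough for popup.js to read a value.
  const addOptions = (select, values) => values.forEach((v) => {
    const opt = document.createElement("option");
    opt.value = v;
    opt.textContent = v || "auto-detect";
    select.appendChild(opt);
  });
  addOptions(make("select", "languageDropdown"), ["", "en"]);
  addOptions(make("select", "taskDropdown"), ["transcribe", "translate"]);
  addOptions(make("select", "modelSizeDropdown"), ["small", "tiny"]);
}
```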
22 | 127 | 135 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/popup.js: -------------------------------------------------------------------------------- 1 | // Wait for the DOM content to be fully loaded 2 | document.addEventListener("DOMContentLoaded", function () { 3 | const startButton = document.getElementById("startCapture"); 4 | const stopButton = document.getElementById("stopCapture"); 5 | 6 | const useServerCheckbox = document.getElementById("useServerCheckbox"); 7 | const useVadCheckbox = document.getElementById("useVadCheckbox"); 8 | const languageDropdown = document.getElementById('languageDropdown'); 9 | const taskDropdown = document.getElementById('taskDropdown'); 10 | const modelSizeDropdown = document.getElementById('modelSizeDropdown'); 11 | let selectedLanguage = null; 12 | let selectedTask = taskDropdown.value; 13 | let selectedModelSize = modelSizeDropdown.value; 14 | 15 | // Add click event listeners to the buttons 16 | startButton.addEventListener("click", startCapture); 17 | stopButton.addEventListener("click", stopCapture); 18 | 19 | // Retrieve capturing state from storage on popup open 20 | chrome.storage.local.get("capturingState", ({ capturingState }) => { 21 | if (capturingState && capturingState.isCapturing) { 22 | toggleCaptureButtons(true); 23 | } else { 24 | toggleCaptureButtons(false); 25 | } 26 | }); 27 | 28 | // Retrieve checkbox state from storage on popup open 29 | chrome.storage.local.get("useServerState", ({ useServerState }) => { 30 | if (useServerState !== undefined) { 31 | useServerCheckbox.checked = useServerState; 32 | } 33 | }); 34 | 35 | chrome.storage.local.get("useVadState", ({ useVadState }) => { 36 | if (useVadState !== undefined) { 37 | useVadCheckbox.checked = useVadState; 38 | } 39 | }); 40 | 41 | chrome.storage.local.get("selectedLanguage", ({ selectedLanguage: storedLanguage }) => { 42 | if (storedLanguage !== undefined) { 43 | languageDropdown.value = storedLanguage; 44 | selectedLanguage = storedLanguage; 45 | } 46 | }); 47 | 48 | chrome.storage.local.get("selectedTask", ({ selectedTask: storedTask }) => { 49 | if (storedTask !== undefined) { 50 | taskDropdown.value = storedTask; 51 | selectedTask = storedTask; 52 | } 53 | }); 54 | 55 | chrome.storage.local.get("selectedModelSize", ({ selectedModelSize: storedModelSize }) => { 56 | if (storedModelSize !== undefined) { 57 | modelSizeDropdown.value = storedModelSize; 58 | selectedModelSize = storedModelSize; 59 | } 60 | }); 61 | 62 | // Function to handle the start capture button click event 63 | async function startCapture() { 64 | // Ignore click if the button is disabled 65 | if (startButton.disabled) { 66 | return; 67 | } 68 | 69 | // Get the current active tab 70 | const currentTab = await getCurrentTab(); 71 | 72 | // Send a message to the background script to start capturing 73 | let host = "localhost"; 74 | let port = "9090"; 75 | const useCollaboraServer = useServerCheckbox.checked; 76 | if (useCollaboraServer){ 77 | host = "transcription.kurg.org" 78 | port = "7090" 79 | } 80 | 81 | chrome.runtime.sendMessage( 82 | { 83 | action: "startCapture", 84 | tabId: currentTab.id, 85 | host: host, 86 | port: port, 87 | language: selectedLanguage, 88 | task: selectedTask, 89 | modelSize: selectedModelSize, 90 | useVad: useVadCheckbox.checked, 91 | }, () => { 92 | // Update capturing state in storage and toggle the buttons 93 | chrome.storage.local.set({ capturingState: { isCapturing: true } }, () => { 94 
| toggleCaptureButtons(true); 95 | }); 96 | } 97 | ); 98 | } 99 | 100 | // Function to handle the stop capture button click event 101 | function stopCapture() { 102 | // Ignore click if the button is disabled 103 | if (stopButton.disabled) { 104 | return; 105 | } 106 | 107 | // Send a message to the background script to stop capturing 108 | chrome.runtime.sendMessage({ action: "stopCapture" }, () => { 109 | // Update capturing state in storage and toggle the buttons 110 | chrome.storage.local.set({ capturingState: { isCapturing: false } }, () => { 111 | toggleCaptureButtons(false); 112 | }); 113 | }); 114 | } 115 | 116 | // Function to get the current active tab 117 | async function getCurrentTab() { 118 | return new Promise((resolve) => { 119 | chrome.tabs.query({ active: true, currentWindow: true }, (tabs) => { 120 | resolve(tabs[0]); 121 | }); 122 | }); 123 | } 124 | 125 | // Function to toggle the capture buttons based on the capturing state 126 | function toggleCaptureButtons(isCapturing) { 127 | startButton.disabled = isCapturing; 128 | stopButton.disabled = !isCapturing; 129 | useServerCheckbox.disabled = isCapturing; 130 | useVadCheckbox.disabled = isCapturing; 131 | modelSizeDropdown.disabled = isCapturing; 132 | languageDropdown.disabled = isCapturing; 133 | taskDropdown.disabled = isCapturing; 134 | startButton.classList.toggle("disabled", isCapturing); 135 | stopButton.classList.toggle("disabled", !isCapturing); 136 | } 137 | 138 | // Save the checkbox state when it's toggled 139 | useServerCheckbox.addEventListener("change", () => { 140 | const useServerState = useServerCheckbox.checked; 141 | chrome.storage.local.set({ useServerState }); 142 | }); 143 | 144 | useVadCheckbox.addEventListener("change", () => { 145 | const useVadState = useVadCheckbox.checked; 146 | chrome.storage.local.set({ useVadState }); 147 | }); 148 | 149 | languageDropdown.addEventListener('change', function() { 150 | if (languageDropdown.value === "") { 151 | selectedLanguage = null; 152 | } else { 153 | selectedLanguage = languageDropdown.value; 154 | } 155 | chrome.storage.local.set({ selectedLanguage }); 156 | }); 157 | 158 | taskDropdown.addEventListener('change', function() { 159 | selectedTask = taskDropdown.value; 160 | chrome.storage.local.set({ selectedTask }); 161 | }); 162 | 163 | modelSizeDropdown.addEventListener('change', function() { 164 | selectedModelSize = modelSizeDropdown.value; 165 | chrome.storage.local.set({ selectedModelSize }); 166 | }); 167 | 168 | chrome.runtime.onMessage.addListener(async (request, sender, sendResponse) => { 169 | if (request.action === "updateSelectedLanguage") { 170 | const detectedLanguage = request.detectedLanguage; 171 | 172 | if (detectedLanguage) { 173 | languageDropdown.value = detectedLanguage; 174 | chrome.storage.local.set({ selectedLanguage: detectedLanguage }); 175 | } 176 | } 177 | }); 178 | 179 | chrome.runtime.onMessage.addListener(async (request, sender, sendResponse) => { 180 | if (request.action === "toggleCaptureButtons") { 181 | toggleCaptureButtons(false); 182 | chrome.storage.local.set({ capturingState: { isCapturing: false } }) 183 | } 184 | }); 185 | 186 | }); 187 | -------------------------------------------------------------------------------- /Audio-Transcription-Chrome/style.css: -------------------------------------------------------------------------------- 1 | .header { 2 | display: flex; 3 | align-items: center; 4 | padding-bottom: 15px; 5 | padding-left: 20px; 6 | border-bottom: 2px solid darkred; 7 | } 8 | 9 | .header-title { 
10 | padding: 0 5px; 11 | } 12 | 13 | h1 { 14 | font-size: 36px; 15 | } 16 | 17 | img { 18 | height: 64px; 19 | margin: 0 20px 0 0; 20 | } 21 | 22 | h2 { 23 | font-size: 26px; 24 | } 25 | 26 | label { 27 | font-size: 16px; 28 | } 29 | 30 | .inner { 31 | margin-left: 40px; 32 | } 33 | 34 | .options-list { 35 | padding: 0; 36 | list-style: none; 37 | } 38 | 39 | .options-list li { 40 | padding: 10px; 41 | } 42 | 43 | .time { 44 | font-size: 16px; 45 | } 46 | 47 | .limit { 48 | display: inline-block; 49 | margin: 0; 50 | font-size: 12px; 51 | } 52 | 53 | .radioChoice { 54 | margin-left: 15px; 55 | } 56 | 57 | .button-container { 58 | display: flex; 59 | justify-content: space-between; 60 | padding: 10px; 61 | } 62 | 63 | .button { 64 | padding: 10px; 65 | border: 2px solid darkred; 66 | font-size: 16px; 67 | font-weight: bold; 68 | cursor: pointer; 69 | white-space: nowrap; 70 | width: 150px; 71 | border-radius: 5px; 72 | } 73 | 74 | .disabled { 75 | opacity: 0.6; 76 | cursor: not-allowed; 77 | } 78 | 79 | .button:hover:not(:disabled) { 80 | color: red; 81 | background-color: darkred; 82 | } 83 | 84 | #save { 85 | font-size: 16px; 86 | margin-left: 50px; 87 | } 88 | 89 | #status { 90 | color: red; 91 | margin-top: 8px; 92 | margin-left: 50px; 93 | font-size: 14px; 94 | } 95 | 96 | #qualityLi { 97 | display: none; 98 | } 99 | 100 | #maxTime { 101 | width: 30px; 102 | text-align: center; 103 | } 104 | 105 | .checkbox-container { 106 | padding: 10px; 107 | } 108 | 109 | .dropdown-container { 110 | padding: 10px; 111 | } 112 | -------------------------------------------------------------------------------- /Audio-Transcription-Firefox/README.md: -------------------------------------------------------------------------------- 1 | # Audio Transcription Firefox 2 | 3 | Audio Transcription is a Firefox extension that allows users to capture any audio playing on the current tab and transcribe it using OpenAI-whisper in real time. Users can also enable voice activity detection so that no audio is sent to the server when there is no speech. 4 | 5 | We use the OpenAI-whisper model to process the audio continuously and send the transcription back to the client. We apply a few optimizations on top of OpenAI's implementation to improve performance and run it in real time. To this end, we use [faster-whisper](https://github.com/guillaumekln/faster-whisper), which is 4x faster than OpenAI's implementation. 6 | 7 | ## Loading the Extension 8 | - Open the Mozilla Firefox browser. 9 | - Type ```about:debugging#/runtime/this-firefox``` in the address bar and press Enter. 10 | - Clone this repository. 11 | - Click the Load Temporary Add-on button. 12 | - Browse to the location where you cloned the repository files and select the ```Audio-Transcription-Firefox``` folder. 13 | - The extension should now be loaded and visible on the extensions page. 14 | 15 | 16 | ## Real time transcription with OpenAI-whisper 17 | This Firefox extension allows you to send audio from your browser to a server that transcribes it in real time. 18 | 19 | ## Implementation Details 20 | 21 | ### Capturing Audio 22 | To capture audio, the extension uses `navigator.mediaDevices.getUserMedia` to obtain a `MediaStream` (see content.js); the Chrome `tabCapture` API is not available in Firefox. 23 | 24 | ### Options 25 | When using the Audio Transcription extension, you have the following options: 26 | - **Use Collabora Server**: We provide a demo server which runs the whisper small model. 27 | - **Language**: Select the target language for transcription or translation.
You can choose from a variety of languages supported by OpenAI-whisper. 28 | - **Task**: Choose the specific task to perform on the audio. You can select either "transcribe" for transcription or "translate" to translate the audio to English. 29 | - **Model Size**: Select the whisper model size to run the server with. 30 | 31 | ### Getting Started 32 | - Make sure the transcription server is running properly. For details on how to start the server, see the [documentation here](https://github.com/collabora/whisper-live). 33 | - Click the Firefox extension icon, which shows two options: 34 | - **Start Capture**: Starts capturing the audio in the current tab and sends the captured audio to the server for transcription. This also creates an element to show the transcriptions received from the server on the current tab. 35 | - **Stop Capture**: Stops capturing the audio. 36 | 37 | 38 | ## Limitations 39 | This extension requires an internet connection to stream audio and receive transcriptions. The accuracy of the transcriptions may vary depending on the audio quality and the performance of the server-side transcription service. The extension may consume additional system resources while running, especially when streaming audio. 40 | 41 | ## Note 42 | The extension relies on a properly running transcription server with multilingual support. Please follow the server documentation for setup and configuration. 43 | -------------------------------------------------------------------------------- /Audio-Transcription-Firefox/background.js: -------------------------------------------------------------------------------- 1 | browser.runtime.onMessage.addListener(async function(request, sender, sendResponse) { 2 | const { action, data } = request; 3 | if (action === "transcript") { 4 | await browser.tabs.query({ active: true, currentWindow: true }) 5 | .then((tabs) => { 6 | const tabId = tabs[0].id; 7 | browser.tabs.sendMessage(tabId, { action: "show_transcript", data }); 8 | }) 9 | .catch((error) => { 10 | console.error("Error retrieving active tab:", error); 11 | }); 12 | } 13 | if (action === "updateSelectedLanguage") { 14 | const detectedLanguage = data; 15 | try { 16 | await browser.storage.local.set({ selectedLanguage: detectedLanguage }); 17 | browser.tabs.query({ active: true, currentWindow: true }).then((tabs) => { 18 | const tabId = tabs[0].id; 19 | browser.tabs.sendMessage(tabId, { action: "updateSelectedLanguage", detectedLanguage }); 20 | }); 21 | } catch (error) { 22 | console.error("Error updateSelectedLanguage:", error); 23 | } 24 | } 25 | if (action === "toggleCaptureButtons") { 26 | try { 27 | await browser.storage.local.set({ capturingState: { isCapturing: false } }); 28 | browser.tabs.query({ active: true, currentWindow: true }).then((tabs) => { 29 | const tabId = tabs[0].id; 30 | browser.tabs.sendMessage(tabId, { action: "toggleCaptureButtons", data: false }); 31 | }); 32 | } catch (error) { 33 | console.error("Error updating capturing state:", error); 34 | } 35 | 36 | try{ 37 | await browser.tabs.query({ active: true, currentWindow: true }) 38 | .then((tabs) => { 39 | const tabId = tabs[0].id; 40 | browser.tabs.sendMessage(tabId, { action: "stopCapture", data }); 41 | }) 42 | .catch((error) => { 43 | console.error("Error retrieving active tab:", error); 44 | }); 45 | } catch (error) { 46 | console.error(error); 47 | } 48 | } 49 | 50 | if (action === "showPopup") { 51 | try{ 52 | await browser.tabs.query({ active: true, currentWindow: true }) 53 | .then((tabs) => { 54 |
const tabId = tabs[0].id; 55 | browser.tabs.sendMessage(tabId, { action: "showWaitPopup", data }); 56 | }) 57 | .catch((error) => { 58 | console.error(error); 59 | }); 60 | } catch (error) { 61 | console.error(error); 62 | } 63 | } 64 | }); 65 | 66 | -------------------------------------------------------------------------------- /Audio-Transcription-Firefox/content.js: -------------------------------------------------------------------------------- 1 | let socket = null; 2 | let isCapturing = false; 3 | let mediaStream = null; 4 | let audioContext = null; 5 | let scriptProcessor = null; 6 | let language = null; 7 | 8 | let isPaused = false; 9 | 10 | const mediaElements = document.querySelectorAll('video, audio'); 11 | mediaElements.forEach((mediaElement) => { 12 | mediaElement.addEventListener('play', handlePlaybackStateChange); 13 | mediaElement.addEventListener('pause', handlePlaybackStateChange); 14 | }); 15 | 16 | 17 | function handlePlaybackStateChange(event) { 18 | isPaused = event.target.paused; 19 | } 20 | 21 | function generateUUID() { 22 | let dt = new Date().getTime(); 23 | const uuid = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) { 24 | const r = (dt + Math.random() * 16) % 16 | 0; 25 | dt = Math.floor(dt / 16); 26 | return (c === 'x' ? r : (r & 0x3 | 0x8)).toString(16); 27 | }); 28 | return uuid; 29 | } 30 | 31 | 32 | /** 33 | * Resamples the audio data to a target sample rate of 16kHz. 34 | * @param {Array|ArrayBuffer|TypedArray} audioData - The input audio data. 35 | * @param {number} [origSampleRate=44100] - The original sample rate of the audio data. 36 | * @returns {Float32Array} The resampled audio data at 16kHz. 37 | */ 38 | function resampleTo16kHZ(audioData, origSampleRate = 44100) { 39 | // Convert the audio data to a Float32Array 40 | const data = new Float32Array(audioData); 41 | 42 | // Calculate the desired length of the resampled data 43 | const targetLength = Math.round(data.length * (16000 / origSampleRate)); 44 | 45 | // Create a new Float32Array for the resampled data 46 | const resampledData = new Float32Array(targetLength); 47 | 48 | // Calculate the spring factor and initialize the first and last values 49 | const springFactor = (data.length - 1) / (targetLength - 1); 50 | resampledData[0] = data[0]; 51 | resampledData[targetLength - 1] = data[data.length - 1]; 52 | 53 | // Resample the audio data 54 | for (let i = 1; i < targetLength - 1; i++) { 55 | const index = i * springFactor; 56 | const leftIndex = Math.floor(index).toFixed(); 57 | const rightIndex = Math.ceil(index).toFixed(); 58 | const fraction = index - leftIndex; 59 | resampledData[i] = data[leftIndex] + (data[rightIndex] - data[leftIndex]) * fraction; 60 | } 61 | 62 | // Return the resampled data 63 | return resampledData; 64 | } 65 | 66 | function startRecording(data) { 67 | socket = new WebSocket(`ws://${data.host}:${data.port}/`); 68 | language = data.language; 69 | 70 | const uuid = generateUUID(); 71 | socket.onopen = function(e) { 72 | socket.send( 73 | JSON.stringify({ 74 | uid: uuid, 75 | language: data.language, 76 | task: data.task, 77 | model: data.modelSize, 78 | use_vad: data.useVad 79 | }) 80 | ); 81 | }; 82 | 83 | let isServerReady = false; 84 | socket.onmessage = async (event) => { 85 | const data = JSON.parse(event.data); 86 | if (data["uid"] !== uuid) 87 | return; 88 | 89 | if (data["status"] === "WAIT"){ 90 | await browser.runtime.sendMessage({ action: "showPopup", data: data["message"] }) 91 | return; 92 | } 93 | 94 | if (!isServerReady && 
data["message"] === "SERVER_READY"){ 95 | isServerReady = true; 96 | return; 97 | } 98 | 99 | if (language === null ){ 100 | language = data["language"]; 101 | await browser.runtime.sendMessage({ action: "updateSelectedLanguage", data: language }) 102 | return 103 | } 104 | 105 | if (data["message"] === "DISCONNECT"){ 106 | await browser.runtime.sendMessage({ action: "toggleCaptureButtons", data: false }) 107 | return 108 | } 109 | 110 | await browser.runtime.sendMessage({ action: "transcript", data: event.data }) 111 | .catch(function(error) { 112 | console.error("Error sending message:", error); 113 | }); 114 | }; 115 | 116 | // Access the audio stream from the current tab 117 | navigator.mediaDevices.getUserMedia({ audio: true }) 118 | .then(function(stream) { 119 | // Create a new MediaRecorder instance 120 | const audioDataCache = []; 121 | audioContext = new AudioContext(); 122 | mediaStream = audioContext.createMediaStreamSource(stream); 123 | recorder = audioContext.createScriptProcessor(4096, 1, 1); 124 | 125 | recorder.onaudioprocess = async (event) => { 126 | if (!audioContext || !isCapturing || !isServerReady || isPaused) return; 127 | 128 | const inputData = event.inputBuffer.getChannelData(0); 129 | const audioData16kHz = resampleTo16kHZ(inputData, audioContext.sampleRate); 130 | 131 | audioDataCache.push(inputData); 132 | 133 | socket.send(audioData16kHz); 134 | }; 135 | 136 | // Prevent page mute 137 | mediaStream.connect(recorder); 138 | recorder.connect(audioContext.destination); 139 | }) 140 | } 141 | 142 | var elem_container = null; 143 | var elem_text = null; 144 | 145 | var segments = []; 146 | var text_segments = []; 147 | 148 | function initPopupElement() { 149 | if (document.getElementById('popupElement')) { 150 | return; 151 | } 152 | 153 | const popupContainer = document.createElement('div'); 154 | popupContainer.id = 'popupElement'; 155 | popupContainer.style.cssText = 'position: fixed; top: 50%; left: 50%; transform: translate(-50%, -50%); background: white; color: black; padding: 16px; border-radius: 10px; box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.5); display: none; text-align: center;'; 156 | 157 | const popupText = document.createElement('span'); 158 | popupText.textContent = 'Default Text'; 159 | popupText.className = 'popupText'; 160 | popupText.style.fontSize = '24px'; 161 | popupContainer.appendChild(popupText); 162 | 163 | const buttonContainer = document.createElement('div'); 164 | buttonContainer.style.marginTop = '8px'; 165 | const closePopupButton = document.createElement('button'); 166 | closePopupButton.textContent = 'Close'; 167 | closePopupButton.style.backgroundColor = '#65428A'; 168 | closePopupButton.style.color = 'white'; 169 | closePopupButton.style.border = 'none'; 170 | closePopupButton.style.padding = '8px 16px'; // Add padding for better click area 171 | closePopupButton.style.cursor = 'pointer'; 172 | closePopupButton.addEventListener('click', async () => { 173 | popupContainer.style.display = 'none'; 174 | await browser.runtime.sendMessage({ action: 'toggleCaptureButtons', data: false }); 175 | }); 176 | buttonContainer.appendChild(closePopupButton); 177 | popupContainer.appendChild(buttonContainer); 178 | 179 | document.body.appendChild(popupContainer); 180 | } 181 | 182 | 183 | function showPopup(customText) { 184 | const popup = document.getElementById('popupElement'); 185 | const popupText = popup.querySelector('.popupText'); 186 | 187 | if (popup && popupText) { 188 | popupText.textContent = customText || 'Default Text'; // Set 
default text if custom text is not provided 189 | popup.style.display = 'block'; 190 | } 191 | } 192 | 193 | 194 | function init_element() { 195 | if (document.getElementById('transcription')) { 196 | return; 197 | } 198 | 199 | elem_container = document.createElement('div'); 200 | elem_container.id = "transcription"; 201 | elem_container.style.cssText = 'padding-top:16px;font-size:18px;line-height:18px;position:fixed;top:85%;left:50%;transform:translate(-50%,-50%);width:500px;height:90px;opacity:0.9;z-index:100;background:black;border-radius:10px;color:white;'; 202 | 203 | for (var i = 0; i < 4; i++) { 204 | elem_text = document.createElement('span'); 205 | elem_text.style.cssText = 'position: absolute;padding-left:16px;padding-right:16px;'; 206 | elem_text.id = "t" + i; 207 | elem_container.appendChild(elem_text); 208 | 209 | if (i == 3) { 210 | elem_text.style.top = "-1000px" 211 | } 212 | } 213 | 214 | document.body.appendChild(elem_container); 215 | 216 | let x = 0; 217 | let y = 0; 218 | 219 | // Query the element 220 | const ele = elem_container; 221 | 222 | // Handle the mousedown event 223 | // that's triggered when user drags the element 224 | const mouseDownHandler = function (e) { 225 | // Get the current mouse position 226 | x = e.clientX; 227 | y = e.clientY; 228 | 229 | // Attach the listeners to `document` 230 | document.addEventListener('mousemove', mouseMoveHandler); 231 | document.addEventListener('mouseup', mouseUpHandler); 232 | }; 233 | 234 | const mouseMoveHandler = function (e) { 235 | // How far the mouse has been moved 236 | const dx = e.clientX - x; 237 | const dy = e.clientY - y; 238 | 239 | // Set the position of element 240 | ele.style.top = `${ele.offsetTop + dy}px`; 241 | ele.style.left = `${ele.offsetLeft + dx}px`; 242 | 243 | // Reassign the position of mouse 244 | x = e.clientX; 245 | y = e.clientY; 246 | }; 247 | 248 | const mouseUpHandler = function () { 249 | // Remove the handlers of `mousemove` and `mouseup` 250 | document.removeEventListener('mousemove', mouseMoveHandler); 251 | document.removeEventListener('mouseup', mouseUpHandler); 252 | }; 253 | 254 | ele.addEventListener('mousedown', mouseDownHandler); 255 | } 256 | 257 | function getStyle(el,styleProp) 258 | { 259 | var x = document.getElementById(el); 260 | if (x.currentStyle) 261 | var y = x.currentStyle[styleProp]; 262 | else if (window.getComputedStyle) 263 | var y = document.defaultView.getComputedStyle(x,null).getPropertyValue(styleProp); 264 | return y; 265 | } 266 | 267 | function get_lines(elem, line_height) { 268 | var divHeight = elem.offsetHeight; 269 | var lines = divHeight / line_height; 270 | 271 | var original_text = elem.innerHTML; 272 | 273 | var words = original_text.split(' '); 274 | var segments = []; 275 | var current_lines = 1; 276 | var segment = ''; 277 | var segment_len = 0; 278 | for (var i = 0; i < words.length; i++) 279 | { 280 | segment += words[i] + ' '; 281 | elem.innerHTML = segment; 282 | divHeight = elem.offsetHeight; 283 | 284 | if ((divHeight / line_height) > current_lines) { 285 | var line_segment = segment.substring(segment_len, segment.length - 1 - words[i].length - 1); 286 | segments.push(line_segment); 287 | segment_len += line_segment.length + 1; 288 | current_lines++; 289 | } 290 | } 291 | 292 | var line_segment = segment.substring(segment_len, segment.length - 1) 293 | segments.push(line_segment); 294 | 295 | elem.innerHTML = original_text; 296 | 297 | return segments; 298 | 299 | } 300 | 301 | function remove_element() { 302 | var elem = 
document.getElementById('transcription') 303 | for (var i = 0; i < 4; i++) { 304 | document.getElementById("t" + i).remove(); 305 | } 306 | elem.remove() 307 | } 308 | 309 | browser.runtime.onMessage.addListener((request, sender, sendResponse) => { 310 | const { action, data } = request; 311 | if (action === "startCapture") { 312 | isCapturing = true; 313 | startRecording(data); 314 | } else if (action === "stopCapture") { 315 | 316 | isCapturing = false; 317 | if (socket) { 318 | socket.close(); 319 | socket = null; 320 | } 321 | 322 | if (audioContext) { 323 | audioContext.close(); 324 | audioContext = null; 325 | mediaStream = null; 326 | recorder = null; 327 | } 328 | 329 | remove_element(); 330 | 331 | } else if (action === "showWaitPopup") { 332 | 333 | initPopupElement(); 334 | 335 | showPopup(`Estimated wait time ~ ${Math.round(data)} minutes`); 336 | 337 | } else if (action === "show_transcript"){ 338 | if (!isCapturing) return; 339 | init_element(); 340 | message = JSON.parse(data); 341 | message = message["segments"]; 342 | 343 | var text = ''; 344 | for (var i = 0; i < message.length; i++) { 345 | text += message[i].text + ' '; 346 | } 347 | text = text.replace(/(\r\n|\n|\r)/gm, ""); 348 | 349 | var elem = document.getElementById('t3'); 350 | elem.innerHTML = text; 351 | 352 | var line_height_style = getStyle('t3', 'line-height'); 353 | var line_height = parseInt(line_height_style.substring(0, line_height_style.length - 2)); 354 | var divHeight = elem.offsetHeight; 355 | var lines = divHeight / line_height; 356 | 357 | text_segments = []; 358 | text_segments = get_lines(elem, line_height); 359 | 360 | elem.innerHTML = ''; 361 | 362 | if (text_segments.length > 2) { 363 | for (var i = 0; i < 3; i++) { 364 | document.getElementById('t' + i).innerHTML = text_segments[text_segments.length - 3 + i]; 365 | } 366 | } else { 367 | for (var i = 0; i < 3; i++) { 368 | document.getElementById('t' + i).innerHTML = ''; 369 | } 370 | } 371 | 372 | if (text_segments.length <= 2) { 373 | for (var i = 0; i < text_segments.length; i++) { 374 | document.getElementById('t' + i).innerHTML = text_segments[i]; 375 | } 376 | } else { 377 | for (var i = 0; i < 3; i++) { 378 | document.getElementById('t' + i).innerHTML = text_segments[text_segments.length - 3 + i]; 379 | } 380 | } 381 | 382 | for (var i = 1; i < 3; i++) 383 | { 384 | var parent_elem = document.getElementById('t' + (i - 1)); 385 | var elem = document.getElementById('t' + i); 386 | elem.style.top = parent_elem.offsetHeight + parent_elem.offsetTop + 'px'; 387 | } 388 | } 389 | sendResponse({}); 390 | }); 391 | -------------------------------------------------------------------------------- /Audio-Transcription-Firefox/icon128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/Audio-Transcription-Firefox/icon128.png -------------------------------------------------------------------------------- /Audio-Transcription-Firefox/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest_version": 2, 3 | "name": "Audio Transcription", 4 | "version": "1.0", 5 | "description": "Transcribe audio from any webpage.", 6 | "permissions": [ 7 | "storage", 8 | "activeTab", 9 | "" 10 | ], 11 | "background": { 12 | "scripts": ["background.js"], 13 | "persistent": false 14 | }, 15 | "browser_action": { 16 | "default_popup": "popup.html", 17 | "default_icon": "icon128.png" 
18 | }, 19 | "icons": { 20 | "128":"icon128.png" 21 | }, 22 | "content_scripts": [ 23 | { 24 | "matches": [""], 25 | "js": ["content.js"] 26 | } 27 | ] 28 | } -------------------------------------------------------------------------------- /Audio-Transcription-Firefox/popup.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Audio Transcription 5 | 6 | 7 | 8 | 9 |

[popup.html rendered text only; the HTML markup was stripped in this dump. The popup shows an "Audio Transcription" header, "Start Capture" and "Stop Capture" buttons, and the checkboxes and dropdowns (use server, use VAD, language, task, model size) that popup.js below looks up by id.]
22 | 23 | 128 | 136 | 152 | 153 | 154 | -------------------------------------------------------------------------------- /Audio-Transcription-Firefox/popup.js: -------------------------------------------------------------------------------- 1 | document.addEventListener("DOMContentLoaded", function() { 2 | const startButton = document.getElementById("startCapture"); 3 | const stopButton = document.getElementById("stopCapture"); 4 | 5 | const useServerCheckbox = document.getElementById("useServerCheckbox"); 6 | const useVadCheckbox = document.getElementById("useVadCheckbox"); 7 | const languageDropdown = document.getElementById('languageDropdown'); 8 | const taskDropdown = document.getElementById('taskDropdown'); 9 | const modelSizeDropdown = document.getElementById('modelSizeDropdown'); 10 | let selectedLanguage = null; 11 | let selectedTask = taskDropdown.value; 12 | let selectedModelSize = modelSizeDropdown.value; 13 | 14 | 15 | browser.storage.local.get("capturingState") 16 | .then(function(result) { 17 | const capturingState = result.capturingState; 18 | if (capturingState && capturingState.isCapturing) { 19 | toggleCaptureButtons(true); 20 | } else { 21 | toggleCaptureButtons(false); 22 | } 23 | // Enable the startButton 24 | startButton.disabled = false; 25 | }) 26 | .catch(function(error) { 27 | console.error("Error retrieving capturing state:", error); 28 | // Enable the startButton 29 | startButton.disabled = false; 30 | }); 31 | 32 | browser.storage.local.get("useServerState", ({ useServerState }) => { 33 | if (useServerState !== undefined) { 34 | useServerCheckbox.checked = useServerState; 35 | } 36 | }); 37 | 38 | browser.storage.local.get("useVadState", ({ useVadState }) => { 39 | if (useVadState !== undefined) { 40 | useVadCheckbox.checked = useVadState; 41 | } 42 | }); 43 | 44 | browser.storage.local.get("selectedLanguage", ({ selectedLanguage: storedLanguage }) => { 45 | if (storedLanguage !== undefined) { 46 | languageDropdown.value = storedLanguage; 47 | selectedLanguage = storedLanguage; 48 | } 49 | }); 50 | 51 | browser.storage.local.get("selectedTask", ({ selectedTask: storedTask }) => { 52 | if (storedTask !== undefined) { 53 | taskDropdown.value = storedTask; 54 | selectedTask = storedTask; 55 | } 56 | }); 57 | 58 | browser.storage.local.get("selectedModelSize", ({ selectedModelSize: storedModelSize }) => { 59 | if (storedModelSize !== undefined) { 60 | modelSizeDropdown.value = storedModelSize; 61 | selectedModelSize = storedModelSize; 62 | } 63 | }); 64 | 65 | startButton.addEventListener("click", function() { 66 | let host = "localhost"; 67 | let port = "9090"; 68 | const useCollaboraServer = useServerCheckbox.checked; 69 | 70 | if (useCollaboraServer){ 71 | host = "transcription.kurg.org" 72 | port = "7090" 73 | } 74 | 75 | browser.tabs.query({ active: true, currentWindow: true }) 76 | .then(function(tabs) { 77 | browser.tabs.sendMessage( 78 | tabs[0].id, 79 | { 80 | action: "startCapture", 81 | data: { 82 | host: host, 83 | port: port, 84 | language: selectedLanguage, 85 | task: selectedTask, 86 | modelSize: selectedModelSize, 87 | useVad: useVadCheckbox.checked, 88 | } 89 | }); 90 | toggleCaptureButtons(true); 91 | browser.storage.local.set({ capturingState: { isCapturing: true } }) 92 | .catch(function(error) { 93 | console.error("Error storing capturing state:", error); 94 | }); 95 | }) 96 | .catch(function(error) { 97 | console.error("Error sending startCapture message:", error); 98 | }); 99 | }); 100 | 101 | stopButton.addEventListener("click", function() { 
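// Ask the content script in the active tab to stop capturing, then reset the stored capturing state and re-enable the controls.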
102 | browser.tabs.query({ active: true, currentWindow: true }) 103 | .then(function(tabs) { 104 | browser.tabs.sendMessage(tabs[0].id, { action: "stopCapture" }) 105 | .then(function(response) { 106 | toggleCaptureButtons(false); 107 | browser.storage.local.set({ capturingState: { isCapturing: false } }) 108 | .catch(function(error) { 109 | console.error("Error storing capturing state:", error); 110 | }); 111 | }) 112 | .catch(function(error) { 113 | console.error("Error sending stopCapture message:", error); 114 | }); 115 | }) 116 | .catch(function(error) { 117 | console.error("Error querying active tab:", error); 118 | }); 119 | }); 120 | 121 | // Function to toggle the capture buttons 122 | function toggleCaptureButtons(isCapturing) { 123 | startButton.disabled = isCapturing; 124 | stopButton.disabled = !isCapturing; 125 | useServerCheckbox.disabled = isCapturing; 126 | useVadCheckbox.disabled = isCapturing; 127 | modelSizeDropdown.disabled = isCapturing; 128 | languageDropdown.disabled = isCapturing; 129 | taskDropdown.disabled = isCapturing; 130 | startButton.classList.toggle("disabled", isCapturing); 131 | stopButton.classList.toggle("disabled", !isCapturing); 132 | } 133 | 134 | // Save the checkbox state when it's toggled 135 | useServerCheckbox.addEventListener("change", () => { 136 | const useServerState = useServerCheckbox.checked; 137 | browser.storage.local.set({ useServerState }); 138 | }); 139 | 140 | useVadCheckbox.addEventListener("change", () => { 141 | const useVadState = useVadCheckbox.checked; 142 | browser.storage.local.set({ useVadState }); 143 | }); 144 | 145 | languageDropdown.addEventListener('change', function() { 146 | if (languageDropdown.value === "") { 147 | selectedLanguage = null; 148 | } else { 149 | selectedLanguage = languageDropdown.value; 150 | } 151 | browser.storage.local.set({ selectedLanguage }); 152 | }); 153 | 154 | taskDropdown.addEventListener('change', function() { 155 | selectedTask = taskDropdown.value; 156 | browser.storage.local.set({ selectedTask }); 157 | }); 158 | 159 | modelSizeDropdown.addEventListener('change', function() { 160 | selectedModelSize = modelSizeDropdown.value; 161 | browser.storage.local.set({ selectedModelSize }); 162 | }); 163 | 164 | browser.runtime.onMessage.addListener((request, sender, sendResponse) => { 165 | if (request.action === "updateSelectedLanguage") { 166 | const detectedLanguage = request.data; 167 | 168 | if (detectedLanguage) { 169 | languageDropdown.value = detectedLanguage; 170 | selectedLanguage = detectedLanguage; 171 | browser.storage.local.set({ selectedLanguage }); 172 | } 173 | } 174 | }); 175 | 176 | browser.runtime.onMessage.addListener((request, sender, sendResponse) => { 177 | if (request.action === "toggleCaptureButtons") { 178 | toggleCaptureButtons(false); 179 | browser.storage.local.set({ capturingState: { isCapturing: false } }) 180 | .catch(function(error) { 181 | console.error("Error storing capturing state:", error); 182 | }); 183 | } 184 | }); 185 | }); 186 | -------------------------------------------------------------------------------- /Audio-Transcription-Firefox/style.css: -------------------------------------------------------------------------------- 1 | .header { 2 | display: flex; 3 | align-items: center; 4 | padding-bottom: 15px; 5 | padding-left: 20px; 6 | border-bottom: 2px solid darkred; 7 | } 8 | 9 | .header-title { 10 | padding: 0 5px; 11 | } 12 | 13 | h1 { 14 | font-size: 36px; 15 | } 16 | 17 | img { 18 | height: 64px; 19 | margin: 0 20px 0 0; 20 | } 21 | 22 | h2 { 
23 | font-size: 26px; 24 | } 25 | 26 | label { 27 | font-size: 16px; 28 | } 29 | 30 | .inner { 31 | margin-left: 40px; 32 | } 33 | 34 | .options-list { 35 | padding: 0; 36 | list-style: none; 37 | } 38 | 39 | .options-list li { 40 | padding: 10px; 41 | } 42 | 43 | .time { 44 | font-size: 16px; 45 | } 46 | 47 | .limit { 48 | display: inline-block; 49 | margin: 0; 50 | font-size: 12px; 51 | } 52 | 53 | .radioChoice { 54 | margin-left: 15px; 55 | } 56 | 57 | .button-container { 58 | display: flex; 59 | justify-content: space-between; 60 | padding: 10px; 61 | } 62 | 63 | .button { 64 | padding: 10px; 65 | border: 2px solid darkred; 66 | font-size: 16px; 67 | font-weight: bold; 68 | cursor: pointer; 69 | white-space: nowrap; 70 | width: 150px; 71 | border-radius: 5px; 72 | } 73 | 74 | .disabled { 75 | opacity: 0.6; 76 | cursor: not-allowed; 77 | } 78 | 79 | .button:hover:not(:disabled) { 80 | color: red; 81 | background-color: darkred; 82 | } 83 | 84 | #save { 85 | font-size: 16px; 86 | margin-left: 50px; 87 | } 88 | 89 | #status { 90 | color: red; 91 | margin-top: 8px; 92 | margin-left: 50px; 93 | font-size: 14px; 94 | } 95 | 96 | #qualityLi { 97 | display: none; 98 | } 99 | 100 | #maxTime { 101 | width: 30px; 102 | text-align: center; 103 | } 104 | 105 | .checkbox-container { 106 | padding: 10px; 107 | } 108 | 109 | .dropdown-container { 110 | padding: 10px; 111 | } 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Vineet Suryan, Collabora Ltd. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WhisperLive 2 | 3 |

[README header (lines 4-10): centered WhisperLive logo images; HTML markup stripped in this dump]

A nearly-live implementation of OpenAI's Whisper.
11 | 12 | This project is a real-time transcription application that uses the OpenAI Whisper model 13 | to convert speech input into text output. It can be used to transcribe both live audio 14 | input from microphone and pre-recorded audio files. 15 | 16 | - [Installation](#installation) 17 | - [Getting Started](#getting-started) 18 | - [Running the Server](#running-the-server) 19 | - [Running the Client](#running-the-client) 20 | - [Browser Extensions](#browser-extensions) 21 | - [Whisper Live Server in Docker](#whisper-live-server-in-docker) 22 | - [Future Work](#future-work) 23 | - [Blog Posts](#blog-posts) 24 | - [Contact](#contact) 25 | - [Citations](#citations) 26 | 27 | ## Installation 28 | - Install PyAudio 29 | ```bash 30 | bash scripts/setup.sh 31 | ``` 32 | 33 | - Install whisper-live from pip 34 | ```bash 35 | pip install whisper-live 36 | ``` 37 | 38 | ### Setting up NVIDIA/TensorRT-LLM for TensorRT backend 39 | - Please follow [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md) for setup of [NVIDIA/TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) and for building Whisper-TensorRT engine. 40 | 41 | ## Getting Started 42 | The server supports 3 backends `faster_whisper`, `tensorrt` and `openvino`. If running `tensorrt` backend follow [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md) 43 | 44 | ### Running the Server 45 | - [Faster Whisper](https://github.com/SYSTRAN/faster-whisper) backend 46 | ```bash 47 | python3 run_server.py --port 9090 \ 48 | --backend faster_whisper 49 | 50 | # running with custom model and cache_dir to save auto-converted ctranslate2 models 51 | python3 run_server.py --port 9090 \ 52 | --backend faster_whisper \ 53 | -fw "/path/to/custom/faster/whisper/model" 54 | -c ~/.cache/whisper-live/ 55 | ``` 56 | 57 | - TensorRT backend. Currently, we recommend to only use the docker setup for TensorRT. Follow [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md) which works as expected. Make sure to build your TensorRT Engines before running the server with TensorRT backend. 58 | ```bash 59 | # Run English only model 60 | python3 run_server.py -p 9090 \ 61 | -b tensorrt \ 62 | -trt /home/TensorRT-LLM/examples/whisper/whisper_small_en 63 | 64 | # Run Multilingual model 65 | python3 run_server.py -p 9090 \ 66 | -b tensorrt \ 67 | -trt /home/TensorRT-LLM/examples/whisper/whisper_small \ 68 | -m 69 | ``` 70 | 71 | - WhisperLive now supports the [OpenVINO](https://github.com/openvinotoolkit/openvino) backend for efficient inference on Intel CPUs, iGPU and dGPUs. Currently, we tested the models uploaded to [huggingface by OpenVINO](https://huggingface.co/OpenVINO?search_models=whisper). 72 | - > **Docker Recommended:** Running WhisperLive with OpenVINO inside Docker automatically enables GPU support (iGPU/dGPU) without requiring additional host setup. 73 | - > **Native (non-Docker) Use:** If you prefer running outside Docker, ensure the Intel drivers and OpenVINO runtime are installed and properly configured on your system. Refer to the documentation for [installing OpenVINO](https://docs.openvino.ai/2025/get-started/install-openvino.html?PACKAGE=OPENVINO_BASE&VERSION=v_2025_0_0&OP_SYSTEM=LINUX&DISTRIBUTION=PIP#). 
74 | 75 | ``` 76 | python3 run_server.py -p 9090 -b openvino 77 | ``` 78 | 79 | 80 | #### Controlling OpenMP Threads 81 | To control the number of threads used by OpenMP, you can set the `OMP_NUM_THREADS` environment variable. This is useful for managing CPU resources and ensuring consistent performance. If not specified, `OMP_NUM_THREADS` is set to `1` by default. You can change this by using the `--omp_num_threads` argument: 82 | ```bash 83 | python3 run_server.py --port 9090 \ 84 | --backend faster_whisper \ 85 | --omp_num_threads 4 86 | ``` 87 | 88 | #### Single model mode 89 | By default, when running the server without specifying a model, the server will instantiate a new whisper model for every client connection. This has the advantage, that the server can use different model sizes, based on the client's requested model size. On the other hand, it also means you have to wait for the model to be loaded upon client connection and you will have increased (V)RAM usage. 90 | 91 | When serving a custom TensorRT model using the `-trt` or a custom faster_whisper model using the `-fw` option, the server will instead only instantiate the custom model once and then reuse it for all client connections. 92 | 93 | If you don't want this, set `--no_single_model`. 94 | 95 | 96 | ### Running the Client 97 | - Initializing the client with below parameters: 98 | - `lang`: Language of the input audio, applicable only if using a multilingual model. 99 | - `translate`: If set to `True` then translate from any language to `en`. 100 | - `model`: Whisper model size. 101 | - `use_vad`: Whether to use `Voice Activity Detection` on the server. 102 | - `save_output_recording`: Set to True to save the microphone input as a `.wav` file during live transcription. This option is helpful for recording sessions for later playback or analysis. Defaults to `False`. 103 | - `output_recording_filename`: Specifies the `.wav` file path where the microphone input will be saved if `save_output_recording` is set to `True`. 104 | - `max_clients`: Specifies the maximum number of clients the server should allow. Defaults to 4. 105 | - `max_connection_time`: Maximum connection time for each client in seconds. Defaults to 600. 106 | - `mute_audio_playback`: Whether to mute audio playback when transcribing an audio file. Defaults to False. 107 | 108 | ```python 109 | from whisper_live.client import TranscriptionClient 110 | client = TranscriptionClient( 111 | "localhost", 112 | 9090, 113 | lang="en", 114 | translate=False, 115 | model="small", # also support hf_model => `Systran/faster-whisper-small` 116 | use_vad=False, 117 | save_output_recording=True, # Only used for microphone input, False by Default 118 | output_recording_filename="./output_recording.wav", # Only used for microphone input 119 | max_clients=4, 120 | max_connection_time=600, 121 | mute_audio_playback=False, # Only used for file input, False by Default 122 | ) 123 | ``` 124 | It connects to the server running on localhost at port 9090. Using a multilingual model, language for the transcription will be automatically detected. You can also use the language option to specify the target language for the transcription, in this case, English ("en"). The translate option should be set to `True` if we want to translate from the source language to English and `False` if we want to transcribe in the source language. 
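For example, a client set up for translation rather than plain transcription only needs `translate=True`. The snippet below is a minimal sketch that assumes a server is already listening on localhost:9090 and reuses the parameters documented above; it is then invoked exactly as in the examples that follow.

```python
from whisper_live.client import TranscriptionClient

# Translate the detected source language to English instead of transcribing it verbatim.
client = TranscriptionClient(
    "localhost",
    9090,
    translate=True,  # the target language is always English when translating
    model="small",
    use_vad=True,
)
```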
125 | 126 | - Transcribe an audio file: 127 | ```python 128 | client("tests/jfk.wav") 129 | ``` 130 | 131 | - To transcribe from microphone: 132 | ```python 133 | client() 134 | ``` 135 | 136 | - To transcribe from a RTSP stream: 137 | ```python 138 | client(rtsp_url="rtsp://admin:admin@192.168.0.1/rtsp") 139 | ``` 140 | 141 | - To transcribe from a HLS stream: 142 | ```python 143 | client(hls_url="http://as-hls-ww-live.akamaized.net/pool_904/live/ww/bbc_1xtra/bbc_1xtra.isml/bbc_1xtra-audio%3d96000.norewind.m3u8") 144 | ``` 145 | 146 | ## Browser Extensions 147 | - Run the server with your desired backend as shown [here](https://github.com/collabora/WhisperLive?tab=readme-ov-file#running-the-server). 148 | - Transcribe audio directly from your browser using our Chrome or Firefox extensions. Refer to [Audio-Transcription-Chrome](https://github.com/collabora/whisper-live/tree/main/Audio-Transcription-Chrome#readme) and https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md 149 | 150 | ## Whisper Live Server in Docker 151 | - GPU 152 | - Faster-Whisper 153 | ```bash 154 | docker run -it --gpus all -p 9090:9090 ghcr.io/collabora/whisperlive-gpu:latest 155 | ``` 156 | 157 | - TensorRT. Refer to [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md) for setup and more tensorrt backend configurations. 158 | ```bash 159 | docker build . -f docker/Dockerfile.tensorrt -t whisperlive-tensorrt 160 | docker run -p 9090:9090 --runtime=nvidia --entrypoint /bin/bash -it whisperlive-tensorrt 161 | 162 | # Build small.en engine 163 | bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en # float16 164 | bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int8 # int8 weight only quantization 165 | bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int4 # int4 weight only quantization 166 | 167 | # Run server with small.en 168 | python3 run_server.py --port 9090 \ 169 | --backend tensorrt \ 170 | --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_float16" 171 | --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_int8" 172 | --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_int4" 173 | ``` 174 | 175 | - OpenVINO 176 | ``` 177 | docker run -it --device=/dev/dri -p 9090:9090 ghcr.io/collabora/whisperlive-openvino 178 | ``` 179 | 180 | - CPU 181 | - Faster-whisper 182 | ```bash 183 | docker run -it -p 9090:9090 ghcr.io/collabora/whisperlive-cpu:latest 184 | ``` 185 | 186 | ## Future Work 187 | - [ ] Add translation to other languages on top of transcription. 188 | 189 | ## Blog Posts 190 | - [Transforming speech technology with WhisperLive](https://www.collabora.com/news-and-blog/blog/2024/05/28/transforming-speech-technology-with-whisperlive/) 191 | - [WhisperFusion: Ultra-low latency conversations with an AI chatbot](https://www.collabora.com/news-and-blog/news-and-events/whisperfusion-ultra-low-latency-conversations-with-an-ai-chatbot.html) powered by WhisperLive 192 | - [Breaking language barriers 2.0: Moving closer towards fully reliable, production-ready Hindi ASR](https://www.collabora.com/news-and-blog/news-and-events/breaking-language-barriers-20-moving-closer-production-ready-hindi-asr.html) which is used in WhisperLive for hindi. 193 | 194 | ## Contact 195 | 196 | We are available to help you with both Open Source and proprietary AI projects. 
You can reach us via the Collabora website or [vineet.suryan@collabora.com](mailto:vineet.suryan@collabora.com) and [marcus.edel@collabora.com](mailto:marcus.edel@collabora.com). 197 | 198 | 199 | ## Citations 200 | ```bibtex 201 | @article{Whisper 202 | title = {Robust Speech Recognition via Large-Scale Weak Supervision}, 203 | url = {https://arxiv.org/abs/2212.04356}, 204 | author = {Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya}, 205 | publisher = {arXiv}, 206 | year = {2022}, 207 | } 208 | ``` 209 | 210 | ```bibtex 211 | @misc{Silero VAD, 212 | author = {Silero Team}, 213 | title = {Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier}, 214 | year = {2021}, 215 | publisher = {GitHub}, 216 | journal = {GitHub repository}, 217 | howpublished = {\url{https://github.com/snakers4/silero-vad}}, 218 | email = {hello@silero.ai} 219 | } 220 | -------------------------------------------------------------------------------- /TensorRT_whisper.md: -------------------------------------------------------------------------------- 1 | # WhisperLive-TensorRT 2 | We have only tested the TensorRT backend in docker so, we recommend docker for a smooth TensorRT backend setup. 3 | **Note**: We use `tensorrt_llm==0.18.2` 4 | 5 | ## Installation 6 | - Install [docker](https://docs.docker.com/engine/install/) 7 | - Install [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) 8 | 9 | - Run WhisperLive TensorRT in docker 10 | ```bash 11 | docker build . -f docker/Dockerfile.tensorrt -t whisperlive-tensorrt 12 | docker run -p 9090:9090 --runtime=nvidia --gpus all --entrypoint /bin/bash -it whisperlive-tensorrt 13 | ``` 14 | 15 | ## Whisper TensorRT Engine 16 | - We build `small.en` and `small` multilingual TensorRT engine as examples below. The script logs the path of the directory with Whisper TensorRT engine. We need that model_path to run the server. 
17 | ```bash 18 | # convert small.en 19 | bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en # float16 20 | bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int8 # int8 weight only quantization 21 | bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int4 # int4 weight only quantization 22 | 23 | # convert small multilingual model 24 | bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small 25 | ``` 26 | 27 | ## Run WhisperLive Server with TensorRT Backend 28 | ```bash 29 | # Run English only model 30 | python3 run_server.py --port 9090 \ 31 | --backend tensorrt \ 32 | --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_float16" 33 | 34 | # Run Multilingual model 35 | python3 run_server.py --port 9090 \ 36 | --backend tensorrt \ 37 | --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_float16" \ 38 | --trt_multilingual 39 | ``` 40 | 41 | By default trt_backend uses cpp_session, to use python session pass `--trt_py_session` to run_server.py 42 | ```bash 43 | python3 run_server.py --port 9090 \ 44 | --backend tensorrt \ 45 | --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_float16" \ 46 | --trt_py_session 47 | ``` -------------------------------------------------------------------------------- /assets/jfk.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/assets/jfk.flac -------------------------------------------------------------------------------- /docker/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM python:3.10-bookworm 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | 5 | # install lib required for pyaudio 6 | RUN apt update && apt install -y portaudio19-dev && apt-get clean && rm -rf /var/lib/apt/lists/* 7 | 8 | # update pip to support for whl.metadata -> less downloading 9 | RUN pip install --no-cache-dir -U "pip>=24" 10 | 11 | # create a working directory 12 | RUN mkdir /app 13 | WORKDIR /app 14 | 15 | # install pytorch, but without the nvidia-libs that are only necessary for gpu 16 | RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu 17 | 18 | # install the requirements for running the whisper-live server 19 | COPY requirements/server.txt /app/ 20 | RUN pip install --no-cache-dir -r server.txt && rm server.txt 21 | 22 | COPY whisper_live /app/whisper_live 23 | COPY run_server.py /app 24 | 25 | CMD ["python", "run_server.py"] 26 | -------------------------------------------------------------------------------- /docker/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM python:3.10-bookworm 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | 5 | # install lib required for pyaudio 6 | RUN apt update && apt install -y portaudio19-dev && apt-get clean && rm -rf /var/lib/apt/lists/* 7 | 8 | # update pip to support for whl.metadata -> less downloading 9 | RUN pip install --no-cache-dir -U "pip>=24" 10 | 11 | # create a working directory 12 | RUN mkdir /app 13 | WORKDIR /app 14 | 15 | # install the requirements for running the whisper-live server 16 | COPY requirements/server.txt /app/ 17 | RUN pip install --no-cache-dir -r server.txt && rm server.txt 18 | 19 | # make the paths of the nvidia libs installed as wheels visible. 
equivalent to: 20 | # export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'` 21 | ENV LD_LIBRARY_PATH="/usr/local/lib/python3.10/site-packages/nvidia/cublas/lib:/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib" 22 | 23 | COPY whisper_live /app/whisper_live 24 | COPY run_server.py /app 25 | 26 | CMD ["python", "run_server.py"] 27 | -------------------------------------------------------------------------------- /docker/Dockerfile.openvino: -------------------------------------------------------------------------------- 1 | FROM openvino/ubuntu22_runtime:latest 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | 5 | USER root 6 | 7 | RUN apt update && apt install -y portaudio19-dev python-is-python3 && apt-get clean && rm -rf /var/lib/apt/lists/* 8 | 9 | RUN pip install --no-cache-dir -U "pip>=24" 10 | 11 | RUN mkdir /app 12 | WORKDIR /app 13 | 14 | COPY requirements/server.txt /app/ 15 | RUN pip install --no-cache-dir -r server.txt && rm server.txt 16 | 17 | COPY whisper_live /app/whisper_live 18 | COPY run_server.py /app 19 | CMD ["python", "run_server.py", "--backend", "openvino"] 20 | -------------------------------------------------------------------------------- /docker/Dockerfile.tensorrt: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.8.1-base-ubuntu22.04 AS base 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | 5 | RUN apt-get update && apt-get install -y \ 6 | python3.10 python3-pip openmpi-bin libopenmpi-dev git git-lfs wget \ 7 | && apt install python-is-python3 \ 8 | && pip install --upgrade pip setuptools \ 9 | && rm -rf /var/lib/apt/lists/* 10 | 11 | FROM base AS devel 12 | RUN pip install --no-cache-dir -U tensorrt_llm==0.18.2 --extra-index-url https://pypi.nvidia.com 13 | WORKDIR /app 14 | RUN git clone -b v0.18.2 https://github.com/NVIDIA/TensorRT-LLM.git \ 15 | && mv TensorRT-LLM/examples ./TensorRT-LLM-examples \ 16 | && rm -rf TensorRT-LLM 17 | 18 | FROM devel AS release 19 | WORKDIR /app 20 | COPY assets/ ./assets 21 | RUN wget -nc -P assets/ https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz 22 | 23 | COPY scripts/setup.sh ./ 24 | RUN apt update && bash setup.sh && rm setup.sh 25 | 26 | COPY requirements/server.txt . 27 | RUN pip install --no-cache-dir -r server.txt && rm server.txt 28 | COPY whisper_live ./whisper_live 29 | COPY scripts/build_whisper_tensorrt.sh . 30 | COPY run_server.py . 
-------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/docs/.nojekyll -------------------------------------------------------------------------------- /docs/doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/docs/doctrees/environment.pickle -------------------------------------------------------------------------------- /docs/doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/docs/doctrees/index.doctree -------------------------------------------------------------------------------- /docs/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: 7b818b47e6f359b937e5a2517f120d43 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/html/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. whisper_live documentation master file, created by 2 | sphinx-quickstart on Fri Sep 22 11:39:30 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Whisper Live documentation! 7 | ======================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | 12 | 13 | .. automodule:: whisper_live.server 14 | :members: 15 | 16 | .. automodule:: whisper_live.client 17 | :members: 18 | 19 | 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | -------------------------------------------------------------------------------- /docs/html/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* This file intentionally left blank. */ 2 | -------------------------------------------------------------------------------- /docs/html/_static/doctools.js: -------------------------------------------------------------------------------- 1 | /* 2 | * doctools.js 3 | * ~~~~~~~~~~~ 4 | * 5 | * Base JavaScript utilities for all Sphinx HTML documentation. 6 | * 7 | * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 9 | * 10 | */ 11 | "use strict"; 12 | 13 | const BLACKLISTED_KEY_CONTROL_ELEMENTS = new Set([ 14 | "TEXTAREA", 15 | "INPUT", 16 | "SELECT", 17 | "BUTTON", 18 | ]); 19 | 20 | const _ready = (callback) => { 21 | if (document.readyState !== "loading") { 22 | callback(); 23 | } else { 24 | document.addEventListener("DOMContentLoaded", callback); 25 | } 26 | }; 27 | 28 | /** 29 | * Small JavaScript module for the documentation. 30 | */ 31 | const Documentation = { 32 | init: () => { 33 | Documentation.initDomainIndexTable(); 34 | Documentation.initOnKeyListeners(); 35 | }, 36 | 37 | /** 38 | * i18n support 39 | */ 40 | TRANSLATIONS: {}, 41 | PLURAL_EXPR: (n) => (n === 1 ? 
0 : 1), 42 | LOCALE: "unknown", 43 | 44 | // gettext and ngettext don't access this so that the functions 45 | // can safely bound to a different name (_ = Documentation.gettext) 46 | gettext: (string) => { 47 | const translated = Documentation.TRANSLATIONS[string]; 48 | switch (typeof translated) { 49 | case "undefined": 50 | return string; // no translation 51 | case "string": 52 | return translated; // translation exists 53 | default: 54 | return translated[0]; // (singular, plural) translation tuple exists 55 | } 56 | }, 57 | 58 | ngettext: (singular, plural, n) => { 59 | const translated = Documentation.TRANSLATIONS[singular]; 60 | if (typeof translated !== "undefined") 61 | return translated[Documentation.PLURAL_EXPR(n)]; 62 | return n === 1 ? singular : plural; 63 | }, 64 | 65 | addTranslations: (catalog) => { 66 | Object.assign(Documentation.TRANSLATIONS, catalog.messages); 67 | Documentation.PLURAL_EXPR = new Function( 68 | "n", 69 | `return (${catalog.plural_expr})` 70 | ); 71 | Documentation.LOCALE = catalog.locale; 72 | }, 73 | 74 | /** 75 | * helper function to focus on search bar 76 | */ 77 | focusSearchBar: () => { 78 | document.querySelectorAll("input[name=q]")[0]?.focus(); 79 | }, 80 | 81 | /** 82 | * Initialise the domain index toggle buttons 83 | */ 84 | initDomainIndexTable: () => { 85 | const toggler = (el) => { 86 | const idNumber = el.id.substr(7); 87 | const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`); 88 | if (el.src.substr(-9) === "minus.png") { 89 | el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`; 90 | toggledRows.forEach((el) => (el.style.display = "none")); 91 | } else { 92 | el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`; 93 | toggledRows.forEach((el) => (el.style.display = "")); 94 | } 95 | }; 96 | 97 | const togglerElements = document.querySelectorAll("img.toggler"); 98 | togglerElements.forEach((el) => 99 | el.addEventListener("click", (event) => toggler(event.currentTarget)) 100 | ); 101 | togglerElements.forEach((el) => (el.style.display = "")); 102 | if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler); 103 | }, 104 | 105 | initOnKeyListeners: () => { 106 | // only install a listener if it is really needed 107 | if ( 108 | !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && 109 | !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS 110 | ) 111 | return; 112 | 113 | document.addEventListener("keydown", (event) => { 114 | // bail for input elements 115 | if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; 116 | // bail with special keys 117 | if (event.altKey || event.ctrlKey || event.metaKey) return; 118 | 119 | if (!event.shiftKey) { 120 | switch (event.key) { 121 | case "ArrowLeft": 122 | if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; 123 | 124 | const prevLink = document.querySelector('link[rel="prev"]'); 125 | if (prevLink && prevLink.href) { 126 | window.location.href = prevLink.href; 127 | event.preventDefault(); 128 | } 129 | break; 130 | case "ArrowRight": 131 | if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; 132 | 133 | const nextLink = document.querySelector('link[rel="next"]'); 134 | if (nextLink && nextLink.href) { 135 | window.location.href = nextLink.href; 136 | event.preventDefault(); 137 | } 138 | break; 139 | } 140 | } 141 | 142 | // some keyboard layouts may need Shift to get / 143 | switch (event.key) { 144 | case "/": 145 | if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; 146 | Documentation.focusSearchBar(); 147 | 
event.preventDefault(); 148 | } 149 | }); 150 | }, 151 | }; 152 | 153 | // quick alias for translations 154 | const _ = Documentation.gettext; 155 | 156 | _ready(Documentation.init); 157 | -------------------------------------------------------------------------------- /docs/html/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | const DOCUMENTATION_OPTIONS = { 2 | VERSION: '', 3 | LANGUAGE: 'en', 4 | COLLAPSE_INDEX: false, 5 | BUILDER: 'html', 6 | FILE_SUFFIX: '.html', 7 | LINK_SUFFIX: '.html', 8 | HAS_SOURCE: true, 9 | SOURCELINK_SUFFIX: '.txt', 10 | NAVIGATION_WITH_KEYS: false, 11 | SHOW_SEARCH_SUMMARY: true, 12 | ENABLE_SEARCH_SHORTCUTS: true, 13 | }; -------------------------------------------------------------------------------- /docs/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/docs/html/_static/file.png -------------------------------------------------------------------------------- /docs/html/_static/language_data.js: -------------------------------------------------------------------------------- 1 | /* 2 | * language_data.js 3 | * ~~~~~~~~~~~~~~~~ 4 | * 5 | * This script contains the language-specific data used by searchtools.js, 6 | * namely the list of stopwords, stemmer, scorer and splitter. 7 | * 8 | * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. 9 | * :license: BSD, see LICENSE for details. 10 | * 11 | */ 12 | 13 | var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; 14 | 15 | 16 | /* Non-minified version is copied as a separate JS file, is available */ 17 | 18 | /** 19 | * Porter Stemmer 20 | */ 21 | var Stemmer = function() { 22 | 23 | var step2list = { 24 | ational: 'ate', 25 | tional: 'tion', 26 | enci: 'ence', 27 | anci: 'ance', 28 | izer: 'ize', 29 | bli: 'ble', 30 | alli: 'al', 31 | entli: 'ent', 32 | eli: 'e', 33 | ousli: 'ous', 34 | ization: 'ize', 35 | ation: 'ate', 36 | ator: 'ate', 37 | alism: 'al', 38 | iveness: 'ive', 39 | fulness: 'ful', 40 | ousness: 'ous', 41 | aliti: 'al', 42 | iviti: 'ive', 43 | biliti: 'ble', 44 | logi: 'log' 45 | }; 46 | 47 | var step3list = { 48 | icate: 'ic', 49 | ative: '', 50 | alize: 'al', 51 | iciti: 'ic', 52 | ical: 'ic', 53 | ful: '', 54 | ness: '' 55 | }; 56 | 57 | var c = "[^aeiou]"; // consonant 58 | var v = "[aeiouy]"; // vowel 59 | var C = c + "[^aeiouy]*"; // consonant sequence 60 | var V = v + "[aeiou]*"; // vowel sequence 61 | 62 | var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0 63 | var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 64 | var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 65 | var s_v = "^(" + C + ")?" 
+ v; // vowel in stem 66 | 67 | this.stemWord = function (w) { 68 | var stem; 69 | var suffix; 70 | var firstch; 71 | var origword = w; 72 | 73 | if (w.length < 3) 74 | return w; 75 | 76 | var re; 77 | var re2; 78 | var re3; 79 | var re4; 80 | 81 | firstch = w.substr(0,1); 82 | if (firstch == "y") 83 | w = firstch.toUpperCase() + w.substr(1); 84 | 85 | // Step 1a 86 | re = /^(.+?)(ss|i)es$/; 87 | re2 = /^(.+?)([^s])s$/; 88 | 89 | if (re.test(w)) 90 | w = w.replace(re,"$1$2"); 91 | else if (re2.test(w)) 92 | w = w.replace(re2,"$1$2"); 93 | 94 | // Step 1b 95 | re = /^(.+?)eed$/; 96 | re2 = /^(.+?)(ed|ing)$/; 97 | if (re.test(w)) { 98 | var fp = re.exec(w); 99 | re = new RegExp(mgr0); 100 | if (re.test(fp[1])) { 101 | re = /.$/; 102 | w = w.replace(re,""); 103 | } 104 | } 105 | else if (re2.test(w)) { 106 | var fp = re2.exec(w); 107 | stem = fp[1]; 108 | re2 = new RegExp(s_v); 109 | if (re2.test(stem)) { 110 | w = stem; 111 | re2 = /(at|bl|iz)$/; 112 | re3 = new RegExp("([^aeiouylsz])\\1$"); 113 | re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 114 | if (re2.test(w)) 115 | w = w + "e"; 116 | else if (re3.test(w)) { 117 | re = /.$/; 118 | w = w.replace(re,""); 119 | } 120 | else if (re4.test(w)) 121 | w = w + "e"; 122 | } 123 | } 124 | 125 | // Step 1c 126 | re = /^(.+?)y$/; 127 | if (re.test(w)) { 128 | var fp = re.exec(w); 129 | stem = fp[1]; 130 | re = new RegExp(s_v); 131 | if (re.test(stem)) 132 | w = stem + "i"; 133 | } 134 | 135 | // Step 2 136 | re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; 137 | if (re.test(w)) { 138 | var fp = re.exec(w); 139 | stem = fp[1]; 140 | suffix = fp[2]; 141 | re = new RegExp(mgr0); 142 | if (re.test(stem)) 143 | w = stem + step2list[suffix]; 144 | } 145 | 146 | // Step 3 147 | re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; 148 | if (re.test(w)) { 149 | var fp = re.exec(w); 150 | stem = fp[1]; 151 | suffix = fp[2]; 152 | re = new RegExp(mgr0); 153 | if (re.test(stem)) 154 | w = stem + step3list[suffix]; 155 | } 156 | 157 | // Step 4 158 | re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; 159 | re2 = /^(.+?)(s|t)(ion)$/; 160 | if (re.test(w)) { 161 | var fp = re.exec(w); 162 | stem = fp[1]; 163 | re = new RegExp(mgr1); 164 | if (re.test(stem)) 165 | w = stem; 166 | } 167 | else if (re2.test(w)) { 168 | var fp = re2.exec(w); 169 | stem = fp[1] + fp[2]; 170 | re2 = new RegExp(mgr1); 171 | if (re2.test(stem)) 172 | w = stem; 173 | } 174 | 175 | // Step 5 176 | re = /^(.+?)e$/; 177 | if (re.test(w)) { 178 | var fp = re.exec(w); 179 | stem = fp[1]; 180 | re = new RegExp(mgr1); 181 | re2 = new RegExp(meq1); 182 | re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 183 | if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) 184 | w = stem; 185 | } 186 | re = /ll$/; 187 | re2 = new RegExp(mgr1); 188 | if (re.test(w) && re2.test(w)) { 189 | re = /.$/; 190 | w = w.replace(re,""); 191 | } 192 | 193 | // and turn initial Y back to y 194 | if (firstch == "y") 195 | w = firstch.toLowerCase() + w.substr(1); 196 | return w; 197 | } 198 | } 199 | 200 | -------------------------------------------------------------------------------- /docs/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/docs/html/_static/minus.png 
-------------------------------------------------------------------------------- /docs/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/docs/html/_static/plus.png -------------------------------------------------------------------------------- /docs/html/_static/pygments.css: -------------------------------------------------------------------------------- 1 | pre { line-height: 125%; } 2 | td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } 3 | span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } 4 | td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } 5 | span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } 6 | .highlight .hll { background-color: #ffffcc } 7 | .highlight { background: #f8f8f8; } 8 | .highlight .c { color: #8f5902; font-style: italic } /* Comment */ 9 | .highlight .err { color: #a40000; border: 1px solid #ef2929 } /* Error */ 10 | .highlight .g { color: #000000 } /* Generic */ 11 | .highlight .k { color: #004461; font-weight: bold } /* Keyword */ 12 | .highlight .l { color: #000000 } /* Literal */ 13 | .highlight .n { color: #000000 } /* Name */ 14 | .highlight .o { color: #582800 } /* Operator */ 15 | .highlight .x { color: #000000 } /* Other */ 16 | .highlight .p { color: #000000; font-weight: bold } /* Punctuation */ 17 | .highlight .ch { color: #8f5902; font-style: italic } /* Comment.Hashbang */ 18 | .highlight .cm { color: #8f5902; font-style: italic } /* Comment.Multiline */ 19 | .highlight .cp { color: #8f5902 } /* Comment.Preproc */ 20 | .highlight .cpf { color: #8f5902; font-style: italic } /* Comment.PreprocFile */ 21 | .highlight .c1 { color: #8f5902; font-style: italic } /* Comment.Single */ 22 | .highlight .cs { color: #8f5902; font-style: italic } /* Comment.Special */ 23 | .highlight .gd { color: #a40000 } /* Generic.Deleted */ 24 | .highlight .ge { color: #000000; font-style: italic } /* Generic.Emph */ 25 | .highlight .ges { color: #000000 } /* Generic.EmphStrong */ 26 | .highlight .gr { color: #ef2929 } /* Generic.Error */ 27 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ 28 | .highlight .gi { color: #00A000 } /* Generic.Inserted */ 29 | .highlight .go { color: #888888 } /* Generic.Output */ 30 | .highlight .gp { color: #745334 } /* Generic.Prompt */ 31 | .highlight .gs { color: #000000; font-weight: bold } /* Generic.Strong */ 32 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ 33 | .highlight .gt { color: #a40000; font-weight: bold } /* Generic.Traceback */ 34 | .highlight .kc { color: #004461; font-weight: bold } /* Keyword.Constant */ 35 | .highlight .kd { color: #004461; font-weight: bold } /* Keyword.Declaration */ 36 | .highlight .kn { color: #004461; font-weight: bold } /* Keyword.Namespace */ 37 | .highlight .kp { color: #004461; font-weight: bold } /* Keyword.Pseudo */ 38 | .highlight .kr { color: #004461; font-weight: bold } /* Keyword.Reserved */ 39 | .highlight .kt { color: #004461; font-weight: bold } /* Keyword.Type */ 40 | .highlight .ld { color: #000000 } /* Literal.Date */ 41 | .highlight .m { color: #990000 } /* Literal.Number */ 42 | .highlight .s { color: #4e9a06 } /* Literal.String */ 43 | .highlight .na { color: 
#c4a000 } /* Name.Attribute */ 44 | .highlight .nb { color: #004461 } /* Name.Builtin */ 45 | .highlight .nc { color: #000000 } /* Name.Class */ 46 | .highlight .no { color: #000000 } /* Name.Constant */ 47 | .highlight .nd { color: #888888 } /* Name.Decorator */ 48 | .highlight .ni { color: #ce5c00 } /* Name.Entity */ 49 | .highlight .ne { color: #cc0000; font-weight: bold } /* Name.Exception */ 50 | .highlight .nf { color: #000000 } /* Name.Function */ 51 | .highlight .nl { color: #f57900 } /* Name.Label */ 52 | .highlight .nn { color: #000000 } /* Name.Namespace */ 53 | .highlight .nx { color: #000000 } /* Name.Other */ 54 | .highlight .py { color: #000000 } /* Name.Property */ 55 | .highlight .nt { color: #004461; font-weight: bold } /* Name.Tag */ 56 | .highlight .nv { color: #000000 } /* Name.Variable */ 57 | .highlight .ow { color: #004461; font-weight: bold } /* Operator.Word */ 58 | .highlight .pm { color: #000000; font-weight: bold } /* Punctuation.Marker */ 59 | .highlight .w { color: #f8f8f8; text-decoration: underline } /* Text.Whitespace */ 60 | .highlight .mb { color: #990000 } /* Literal.Number.Bin */ 61 | .highlight .mf { color: #990000 } /* Literal.Number.Float */ 62 | .highlight .mh { color: #990000 } /* Literal.Number.Hex */ 63 | .highlight .mi { color: #990000 } /* Literal.Number.Integer */ 64 | .highlight .mo { color: #990000 } /* Literal.Number.Oct */ 65 | .highlight .sa { color: #4e9a06 } /* Literal.String.Affix */ 66 | .highlight .sb { color: #4e9a06 } /* Literal.String.Backtick */ 67 | .highlight .sc { color: #4e9a06 } /* Literal.String.Char */ 68 | .highlight .dl { color: #4e9a06 } /* Literal.String.Delimiter */ 69 | .highlight .sd { color: #8f5902; font-style: italic } /* Literal.String.Doc */ 70 | .highlight .s2 { color: #4e9a06 } /* Literal.String.Double */ 71 | .highlight .se { color: #4e9a06 } /* Literal.String.Escape */ 72 | .highlight .sh { color: #4e9a06 } /* Literal.String.Heredoc */ 73 | .highlight .si { color: #4e9a06 } /* Literal.String.Interpol */ 74 | .highlight .sx { color: #4e9a06 } /* Literal.String.Other */ 75 | .highlight .sr { color: #4e9a06 } /* Literal.String.Regex */ 76 | .highlight .s1 { color: #4e9a06 } /* Literal.String.Single */ 77 | .highlight .ss { color: #4e9a06 } /* Literal.String.Symbol */ 78 | .highlight .bp { color: #3465a4 } /* Name.Builtin.Pseudo */ 79 | .highlight .fm { color: #000000 } /* Name.Function.Magic */ 80 | .highlight .vc { color: #000000 } /* Name.Variable.Class */ 81 | .highlight .vg { color: #000000 } /* Name.Variable.Global */ 82 | .highlight .vi { color: #000000 } /* Name.Variable.Instance */ 83 | .highlight .vm { color: #000000 } /* Name.Variable.Magic */ 84 | .highlight .il { color: #990000 } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /docs/html/_static/sphinx_highlight.js: -------------------------------------------------------------------------------- 1 | /* Highlighting utilities for Sphinx HTML documentation. */ 2 | "use strict"; 3 | 4 | const SPHINX_HIGHLIGHT_ENABLED = true 5 | 6 | /** 7 | * highlight a given string on a node by wrapping it in 8 | * span elements with the given class name. 
9 | */ 10 | const _highlight = (node, addItems, text, className) => { 11 | if (node.nodeType === Node.TEXT_NODE) { 12 | const val = node.nodeValue; 13 | const parent = node.parentNode; 14 | const pos = val.toLowerCase().indexOf(text); 15 | if ( 16 | pos >= 0 && 17 | !parent.classList.contains(className) && 18 | !parent.classList.contains("nohighlight") 19 | ) { 20 | let span; 21 | 22 | const closestNode = parent.closest("body, svg, foreignObject"); 23 | const isInSVG = closestNode && closestNode.matches("svg"); 24 | if (isInSVG) { 25 | span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); 26 | } else { 27 | span = document.createElement("span"); 28 | span.classList.add(className); 29 | } 30 | 31 | span.appendChild(document.createTextNode(val.substr(pos, text.length))); 32 | const rest = document.createTextNode(val.substr(pos + text.length)); 33 | parent.insertBefore( 34 | span, 35 | parent.insertBefore( 36 | rest, 37 | node.nextSibling 38 | ) 39 | ); 40 | node.nodeValue = val.substr(0, pos); 41 | /* There may be more occurrences of search term in this node. So call this 42 | * function recursively on the remaining fragment. 43 | */ 44 | _highlight(rest, addItems, text, className); 45 | 46 | if (isInSVG) { 47 | const rect = document.createElementNS( 48 | "http://www.w3.org/2000/svg", 49 | "rect" 50 | ); 51 | const bbox = parent.getBBox(); 52 | rect.x.baseVal.value = bbox.x; 53 | rect.y.baseVal.value = bbox.y; 54 | rect.width.baseVal.value = bbox.width; 55 | rect.height.baseVal.value = bbox.height; 56 | rect.setAttribute("class", className); 57 | addItems.push({ parent: parent, target: rect }); 58 | } 59 | } 60 | } else if (node.matches && !node.matches("button, select, textarea")) { 61 | node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); 62 | } 63 | }; 64 | const _highlightText = (thisNode, text, className) => { 65 | let addItems = []; 66 | _highlight(thisNode, addItems, text, className); 67 | addItems.forEach((obj) => 68 | obj.parent.insertAdjacentElement("beforebegin", obj.target) 69 | ); 70 | }; 71 | 72 | /** 73 | * Small JavaScript module for the documentation. 74 | */ 75 | const SphinxHighlight = { 76 | 77 | /** 78 | * highlight the search words provided in localstorage in the text 79 | */ 80 | highlightSearchWords: () => { 81 | if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight 82 | 83 | // get and clear terms from localstorage 84 | const url = new URL(window.location); 85 | const highlight = 86 | localStorage.getItem("sphinx_highlight_terms") 87 | || url.searchParams.get("highlight") 88 | || ""; 89 | localStorage.removeItem("sphinx_highlight_terms") 90 | url.searchParams.delete("highlight"); 91 | window.history.replaceState({}, "", url); 92 | 93 | // get individual terms from highlight string 94 | const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); 95 | if (terms.length === 0) return; // nothing to do 96 | 97 | // There should never be more than one element matching "div.body" 98 | const divBody = document.querySelectorAll("div.body"); 99 | const body = divBody.length ? 
divBody[0] : document.querySelector("body"); 100 | window.setTimeout(() => { 101 | terms.forEach((term) => _highlightText(body, term, "highlighted")); 102 | }, 10); 103 | 104 | const searchBox = document.getElementById("searchbox"); 105 | if (searchBox === null) return; 106 | searchBox.appendChild( 107 | document 108 | .createRange() 109 | .createContextualFragment( 110 | '" 114 | ) 115 | ); 116 | }, 117 | 118 | /** 119 | * helper function to hide the search marks again 120 | */ 121 | hideSearchWords: () => { 122 | document 123 | .querySelectorAll("#searchbox .highlight-link") 124 | .forEach((el) => el.remove()); 125 | document 126 | .querySelectorAll("span.highlighted") 127 | .forEach((el) => el.classList.remove("highlighted")); 128 | localStorage.removeItem("sphinx_highlight_terms") 129 | }, 130 | 131 | initEscapeListener: () => { 132 | // only install a listener if it is really needed 133 | if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; 134 | 135 | document.addEventListener("keydown", (event) => { 136 | // bail for input elements 137 | if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; 138 | // bail with special keys 139 | if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; 140 | if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { 141 | SphinxHighlight.hideSearchWords(); 142 | event.preventDefault(); 143 | } 144 | }); 145 | }, 146 | }; 147 | 148 | _ready(() => { 149 | /* Do not call highlightSearchWords() when we are on the search page. 150 | * It will highlight words from the *previous* search query. 151 | */ 152 | if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords(); 153 | SphinxHighlight.initEscapeListener(); 154 | }); 155 | -------------------------------------------------------------------------------- /docs/html/genindex.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index — whisper_live documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
[Remainder of genindex.html: a Sphinx-generated alphabetical index whose markup was not preserved by this dump; its entries cover the documented classes and methods of whisper_live.client and whisper_live.server.]
--------------------------------------------------------------------------------
/docs/html/objects.inv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/docs/html/objects.inv
--------------------------------------------------------------------------------
/docs/html/py-modindex.html:
--------------------------------------------------------------------------------
[Sphinx-generated page "Python Module Index — whisper_live documentation"; markup not preserved. It lists the modules whisper_live, whisper_live.client and whisper_live.server.]
--------------------------------------------------------------------------------
/docs/html/search.html:
--------------------------------------------------------------------------------
[Sphinx-generated page "Search — whisper_live documentation"; markup not preserved. The only recoverable text is the hint: "Searching for multiple words only shows matches that contain all words."]
--------------------------------------------------------------------------------
/docs/html/searchindex.js:
--------------------------------------------------------------------------------
[Auto-generated, minified Sphinx search index (a single Search.setIndex({...}) call) for the one document "index". It enumerates stemmed terms from the docstrings plus object entries for whisper_live.client (Client, TranscriptionClient, resample) and whisper_live.server (ServeClient, TranscriptionServer) and their methods; it duplicates information already present in the source docstrings.]
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/requirements/client.txt:
--------------------------------------------------------------------------------
1 | PyAudio
2 | av
3 | scipy
4 | websocket-client
-------------------------------------------------------------------------------- /requirements/server.txt: -------------------------------------------------------------------------------- 1 | faster-whisper==1.1.0 2 | websockets 3 | onnxruntime==1.17.0 4 | numba 5 | kaldialign 6 | soundfile 7 | scipy 8 | av 9 | jiwer 10 | evaluate 11 | numpy<2 12 | openai-whisper==20240930 13 | tokenizers==0.20.3 14 | transformers[torch] 15 | 16 | # openvino 17 | librosa 18 | openvino 19 | openvino-genai 20 | openvino-tokenizers 21 | optimum 22 | optimum-intel -------------------------------------------------------------------------------- /run_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | if __name__ == "__main__": 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--port', '-p', 7 | type=int, 8 | default=9090, 9 | help="Websocket port to run the server on.") 10 | parser.add_argument('--backend', '-b', 11 | type=str, 12 | default='faster_whisper', 13 | help='Backends from ["tensorrt", "faster_whisper", "openvino"]') 14 | parser.add_argument('--faster_whisper_custom_model_path', '-fw', 15 | type=str, default=None, 16 | help="Custom Faster Whisper Model") 17 | parser.add_argument('--trt_model_path', '-trt', 18 | type=str, 19 | default=None, 20 | help='Whisper TensorRT model path') 21 | parser.add_argument('--trt_multilingual', '-m', 22 | action="store_true", 23 | help='Boolean only for TensorRT model. True if multilingual.') 24 | parser.add_argument('--trt_py_session', 25 | action="store_true", 26 | help='Boolean only for TensorRT model. Use python session or cpp session, By default uses Cpp.') 27 | parser.add_argument('--omp_num_threads', '-omp', 28 | type=int, 29 | default=1, 30 | help="Number of threads to use for OpenMP") 31 | parser.add_argument('--no_single_model', '-nsm', 32 | action='store_true', 33 | help='Set this if every connection should instantiate its own model. 
Only relevant for custom model, passed using -trt or -fw.') 34 | parser.add_argument('--cache_path', '-c', 35 | type=str, 36 | default="~/.cache/whisper-live/", 37 | help='Path to cache the converted ctranslate2 models.') 38 | args = parser.parse_args() 39 | 40 | if args.backend == "tensorrt": 41 | if args.trt_model_path is None: 42 | raise ValueError("Please Provide a valid tensorrt model path") 43 | 44 | if "OMP_NUM_THREADS" not in os.environ: 45 | os.environ["OMP_NUM_THREADS"] = str(args.omp_num_threads) 46 | 47 | from whisper_live.server import TranscriptionServer 48 | server = TranscriptionServer() 49 | server.run( 50 | "0.0.0.0", 51 | port=args.port, 52 | backend=args.backend, 53 | faster_whisper_custom_model_path=args.faster_whisper_custom_model_path, 54 | whisper_tensorrt_path=args.trt_model_path, 55 | trt_multilingual=args.trt_multilingual, 56 | trt_py_session=args.trt_py_session, 57 | single_model=not args.no_single_model, 58 | cache_path=args.cache_path 59 | ) 60 | -------------------------------------------------------------------------------- /scripts/build_whisper_tensorrt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | download_and_build_model() { 4 | local model_name="$1" 5 | local model_url="" 6 | 7 | case "$model_name" in 8 | "tiny.en") 9 | model_url="https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt" 10 | ;; 11 | "tiny") 12 | model_url="https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt" 13 | ;; 14 | "base.en") 15 | model_url="https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt" 16 | ;; 17 | "base") 18 | model_url="https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt" 19 | ;; 20 | "small.en") 21 | model_url="https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt" 22 | ;; 23 | "small") 24 | model_url="https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt" 25 | ;; 26 | "medium.en") 27 | model_url="https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt" 28 | ;; 29 | "medium") 30 | model_url="https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt" 31 | ;; 32 | "large-v1") 33 | model_url="https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt" 34 | ;; 35 | "large-v2") 36 | model_url="https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt" 37 | ;; 38 | "large-v3" | "large") 39 | model_url="https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt" 40 | ;; 41 | "large-v3-turbo" | "turbo") 42 | model_url="https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt" 43 | ;; 44 | *) 45 | echo "Invalid model name: $model_name" 46 | exit 1 47 | ;; 48 | esac 49 | 50 | if [ "$model_name" == "turbo" ]; then 51 | 
model_name="large-v3-turbo" 52 | fi 53 | 54 | local inference_precision="float16" 55 | local weight_only_precision="${2:-float16}" 56 | local max_beam_width=4 57 | local max_batch_size=4 58 | 59 | echo "Downloading $model_name..." 60 | # wget --directory-prefix=assets "$model_url" 61 | # echo "Download completed: ${model_name}.pt" 62 | if [ ! -f "assets/${model_name}.pt" ]; then 63 | wget --directory-prefix=assets "$model_url" 64 | echo "Download completed: ${model_name}.pt" 65 | else 66 | echo "${model_name}.pt already exists in assets directory." 67 | fi 68 | 69 | local sanitized_model_name="${model_name//./_}" 70 | local checkpoint_dir="whisper_${sanitized_model_name}_weights_${weight_only_precision}" 71 | local output_dir="whisper_${sanitized_model_name}_${weight_only_precision}" 72 | echo "$output_dir" 73 | echo "Converting model weights for $model_name..." 74 | python3 convert_checkpoint.py \ 75 | $( [[ "$weight_only_precision" == "int8" || "$weight_only_precision" == "int4" ]] && echo "--use_weight_only --weight_only_precision $weight_only_precision" ) \ 76 | --output_dir "$checkpoint_dir" --model_name "$model_name" 77 | 78 | echo "Building encoder for $model_name..." 79 | trtllm-build \ 80 | --checkpoint_dir "${checkpoint_dir}/encoder" \ 81 | --output_dir "${output_dir}/encoder" \ 82 | --moe_plugin disable \ 83 | --max_batch_size "$max_batch_size" \ 84 | --gemm_plugin disable \ 85 | --bert_attention_plugin "$inference_precision" \ 86 | --max_input_len 3000 \ 87 | --max_seq_len 3000 88 | 89 | echo "Building decoder for $model_name..." 90 | trtllm-build \ 91 | --checkpoint_dir "${checkpoint_dir}/decoder" \ 92 | --output_dir "${output_dir}/decoder" \ 93 | --moe_plugin disable \ 94 | --max_beam_width "$max_beam_width" \ 95 | --max_batch_size "$max_batch_size" \ 96 | --max_seq_len 225 \ 97 | --max_input_len 32 \ 98 | --max_encoder_input_len 3000 \ 99 | --gemm_plugin "$inference_precision" \ 100 | --bert_attention_plugin "$inference_precision" \ 101 | --gpt_attention_plugin "$inference_precision" 102 | 103 | echo "TensorRT LLM engine built for $model_name." 104 | echo "=========================================" 105 | echo "Model is located at: $(pwd)/$output_dir" 106 | } 107 | 108 | if [ "$#" -lt 1 ]; then 109 | echo "Usage: $0 [model-name]" 110 | exit 1 111 | fi 112 | 113 | tensorrt_examples_dir="$1" 114 | model_name="${2:-small.en}" 115 | weight_only_precision="${3:-float16}" # Default to float16 if not provided 116 | 117 | cd $tensorrt_examples_dir/whisper 118 | pip install --no-deps -r requirements.txt 119 | 120 | download_and_build_model "$model_name" "$weight_only_precision" 121 | -------------------------------------------------------------------------------- /scripts/setup.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Detect the operating system 4 | if [[ "$OSTYPE" == "darwin"* ]]; then 5 | # macOS 6 | echo "Detected macOS, using Homebrew for installation" 7 | 8 | # Check if Homebrew is installed 9 | if ! command -v brew &> /dev/null; then 10 | echo "Homebrew not found. 
Please install Homebrew first: https://brew.sh/" 11 | exit 1 12 | fi 13 | 14 | # Install packages using Homebrew 15 | brew install portaudio wget 16 | else 17 | # Linux (Debian/Ubuntu) 18 | echo "Detected Linux, using apt-get for installation" 19 | apt-get install portaudio19-dev wget -y 20 | fi 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from setuptools import find_packages, setup 3 | from whisper_live.__version__ import __version__ 4 | 5 | 6 | # The directory containing this file 7 | HERE = pathlib.Path(__file__).parent 8 | 9 | # The text of the README file 10 | README = (HERE / "README.md").read_text() 11 | 12 | # This call to setup() does all the work 13 | setup( 14 | name="whisper_live", 15 | version=__version__, 16 | description="A nearly-live implementation of OpenAI's Whisper.", 17 | long_description=README, 18 | long_description_content_type="text/markdown", 19 | include_package_data=True, 20 | url="https://github.com/collabora/WhisperLive", 21 | author="Collabora Ltd", 22 | author_email="vineet.suryan@collabora.com", 23 | license="MIT", 24 | classifiers=[ 25 | "Development Status :: 4 - Beta", 26 | "Intended Audience :: Developers", 27 | "Intended Audience :: Science/Research", 28 | "License :: OSI Approved :: MIT License", 29 | "Programming Language :: Python :: 3", 30 | "Programming Language :: Python :: 3 :: Only", 31 | "Programming Language :: Python :: 3.8", 32 | "Programming Language :: Python :: 3.9", 33 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 34 | ], 35 | packages=find_packages( 36 | exclude=( 37 | "examples", 38 | "Audio-Transcription-Chrome", 39 | "Audio-Transcription-Firefox", 40 | "requirements", 41 | "whisper-finetuning" 42 | ) 43 | ), 44 | install_requires=[ 45 | "PyAudio", 46 | "faster-whisper==1.1.0", 47 | "torch", 48 | "torchaudio", 49 | "websockets", 50 | "onnxruntime==1.17.0", 51 | "scipy", 52 | "websocket-client", 53 | "numba", 54 | "openai-whisper==20240930", 55 | "kaldialign", 56 | "soundfile", 57 | "tokenizers==0.20.3", 58 | "librosa", 59 | "numpy==1.26.4", 60 | "openvino", 61 | "openvino-genai", 62 | "openvino-tokenizers", 63 | "optimum", 64 | "optimum-intel", 65 | ], 66 | python_requires=">=3.9" 67 | ) 68 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_client.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import scipy 4 | import websocket 5 | import copy 6 | import unittest 7 | from unittest.mock import patch, MagicMock 8 | from whisper_live.client import Client, TranscriptionClient, TranscriptionTeeClient 9 | from whisper_live.utils import resample 10 | from pathlib import Path 11 | 12 | 13 | class BaseTestCase(unittest.TestCase): 14 | @patch('whisper_live.client.websocket.WebSocketApp') 15 | @patch('whisper_live.client.pyaudio.PyAudio') 16 | def setUp(self, mock_pyaudio, mock_websocket): 17 | self.mock_pyaudio_instance = MagicMock() 18 | mock_pyaudio.return_value = self.mock_pyaudio_instance 19 | self.mock_stream = MagicMock() 20 | 
self.mock_pyaudio_instance.open.return_value = self.mock_stream 21 | 22 | self.mock_ws_app = mock_websocket.return_value 23 | self.mock_ws_app.send = MagicMock() 24 | 25 | self.client = TranscriptionClient(host='localhost', port=9090, lang="en").client 26 | 27 | self.mock_pyaudio = mock_pyaudio 28 | self.mock_websocket = mock_websocket 29 | self.mock_audio_packet = b'\x00\x01\x02\x03' 30 | 31 | def tearDown(self): 32 | self.client.close_websocket() 33 | self.mock_pyaudio.stop() 34 | self.mock_websocket.stop() 35 | del self.client 36 | 37 | class TestClientWebSocketCommunication(BaseTestCase): 38 | def test_websocket_communication(self): 39 | expected_url = 'ws://localhost:9090' 40 | self.mock_websocket.assert_called() 41 | self.assertEqual(self.mock_websocket.call_args[0][0], expected_url) 42 | 43 | 44 | class TestClientCallbacks(BaseTestCase): 45 | def test_on_open(self): 46 | expected_message = json.dumps({ 47 | "uid": self.client.uid, 48 | "language": self.client.language, 49 | "task": self.client.task, 50 | "model": self.client.model, 51 | "use_vad": True, 52 | "max_clients": 4, 53 | "max_connection_time": 600, 54 | "send_last_n_segments": 10, 55 | "no_speech_thresh": 0.45, 56 | "clip_audio": False, 57 | "same_output_threshold": 10, 58 | }) 59 | self.client.on_open(self.mock_ws_app) 60 | self.mock_ws_app.send.assert_called_with(expected_message) 61 | 62 | def test_on_message(self): 63 | message = json.dumps( 64 | { 65 | "uid": self.client.uid, 66 | "message": "SERVER_READY", 67 | "backend": "faster_whisper" 68 | } 69 | ) 70 | self.client.on_message(self.mock_ws_app, message) 71 | 72 | message = json.dumps({ 73 | "uid": self.client.uid, 74 | "segments": [ 75 | {"start": 0, "end": 1, "text": "Test transcript", "completed": True}, 76 | {"start": 1, "end": 2, "text": "Test transcript 2", "completed": True}, 77 | {"start": 2, "end": 3, "text": "Test transcript 3", "completed": True} 78 | ] 79 | }) 80 | self.client.on_message(self.mock_ws_app, message) 81 | 82 | # Assert that the transcript was updated correctly 83 | self.assertEqual(len(self.client.transcript), 3) 84 | self.assertEqual(self.client.transcript[1]['text'], "Test transcript 2") 85 | 86 | def test_on_close(self): 87 | close_status_code = 1000 88 | close_msg = "Normal closure" 89 | self.client.on_close(self.mock_ws_app, close_status_code, close_msg) 90 | 91 | self.assertFalse(self.client.recording) 92 | self.assertFalse(self.client.server_error) 93 | self.assertFalse(self.client.waiting) 94 | 95 | def test_on_error(self): 96 | error_message = "Test Error" 97 | self.client.on_error(self.mock_ws_app, error_message) 98 | 99 | self.assertTrue(self.client.server_error) 100 | self.assertEqual(self.client.error_message, error_message) 101 | 102 | 103 | class TestAudioResampling(unittest.TestCase): 104 | def test_resample_audio(self): 105 | original_audio = "assets/jfk.flac" 106 | expected_sr = 16000 107 | resampled_audio = resample(original_audio, expected_sr) 108 | 109 | sr, _ = scipy.io.wavfile.read(resampled_audio) 110 | self.assertEqual(sr, expected_sr) 111 | 112 | os.remove(resampled_audio) 113 | 114 | 115 | class TestSendingAudioPacket(BaseTestCase): 116 | def test_send_packet(self): 117 | self.client.send_packet_to_server(self.mock_audio_packet) 118 | self.client.client_socket.send.assert_called_with(self.mock_audio_packet, websocket.ABNF.OPCODE_BINARY) 119 | 120 | class TestTee(BaseTestCase): 121 | @patch('whisper_live.client.websocket.WebSocketApp') 122 | @patch('whisper_live.client.pyaudio.PyAudio') 123 | def setUp(self, 
mock_audio, mock_websocket): 124 | super().setUp() 125 | self.client2 = Client(host='localhost', port=9090, lang="es", translate=False, srt_file_path="transcript.srt") 126 | self.client3 = Client(host='localhost', port=9090, lang="es", translate=True, srt_file_path="translation.srt") 127 | # need a separate mock for each websocket 128 | self.client3.client_socket = copy.deepcopy(self.client3.client_socket) 129 | self.tee = TranscriptionTeeClient([self.client2, self.client3]) 130 | 131 | def tearDown(self): 132 | self.tee.close_all_clients() 133 | del self.tee 134 | super().tearDown() 135 | 136 | def test_invalid_constructor(self): 137 | with self.assertRaises(Exception) as context: 138 | TranscriptionTeeClient([]) 139 | 140 | def test_multicast_unconditional(self): 141 | self.tee.multicast_packet(self.mock_audio_packet, True) 142 | for client in self.tee.clients: 143 | client.client_socket.send.assert_called_with(self.mock_audio_packet, websocket.ABNF.OPCODE_BINARY) 144 | 145 | def test_multicast_conditional(self): 146 | self.client2.recording = False 147 | self.client3.recording = True 148 | self.tee.multicast_packet(self.mock_audio_packet, False) 149 | self.client2.client_socket.send.assert_not_called() 150 | self.client3.client_socket.send.assert_called_with(self.mock_audio_packet, websocket.ABNF.OPCODE_BINARY) 151 | 152 | def test_close_all(self): 153 | self.tee.close_all_clients() 154 | for client in self.tee.clients: 155 | client.client_socket.close.assert_called() 156 | 157 | def test_write_all_srt(self): 158 | for client in self.tee.clients: 159 | client.server_backend = "faster_whisper" 160 | self.tee.write_all_clients_srt() 161 | self.assertTrue(Path("transcript.srt").is_file()) 162 | self.assertTrue(Path("translation.srt").is_file()) 163 | -------------------------------------------------------------------------------- /tests/test_server.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | import json 4 | import unittest 5 | from unittest import mock 6 | 7 | import numpy as np 8 | import jiwer 9 | 10 | from websockets.exceptions import ConnectionClosed 11 | from whisper_live.server import TranscriptionServer, BackendType, ClientManager 12 | from whisper_live.client import Client, TranscriptionClient, TranscriptionTeeClient 13 | from whisper.normalizers import EnglishTextNormalizer 14 | 15 | 16 | class TestTranscriptionServerInitialization(unittest.TestCase): 17 | def test_initialization(self): 18 | server = TranscriptionServer() 19 | server.client_manager = ClientManager(max_clients=4, max_connection_time=600) 20 | self.assertEqual(server.client_manager.max_clients, 4) 21 | self.assertEqual(server.client_manager.max_connection_time, 600) 22 | self.assertDictEqual(server.client_manager.clients, {}) 23 | self.assertDictEqual(server.client_manager.start_times, {}) 24 | 25 | 26 | class TestGetWaitTime(unittest.TestCase): 27 | def setUp(self): 28 | self.server = TranscriptionServer() 29 | self.server.client_manager = ClientManager(max_clients=4, max_connection_time=600) 30 | self.server.client_manager.start_times = { 31 | 'client1': time.time() - 120, 32 | 'client2': time.time() - 300 33 | } 34 | self.server.client_manager.max_connection_time = 600 35 | 36 | def test_get_wait_time(self): 37 | expected_wait_time = (600 - (time.time() - self.server.client_manager.start_times['client2'])) / 60 38 | print(self.server.client_manager.get_wait_time(), expected_wait_time) 39 | 
self.assertAlmostEqual(self.server.client_manager.get_wait_time(), expected_wait_time, places=2) 40 | 41 | 42 | class TestServerConnection(unittest.TestCase): 43 | def setUp(self): 44 | self.server = TranscriptionServer() 45 | 46 | @mock.patch('websockets.WebSocketCommonProtocol') 47 | def test_connection(self, mock_websocket): 48 | mock_websocket.recv.return_value = json.dumps({ 49 | 'uid': 'test_client', 50 | 'language': 'en', 51 | 'task': 'transcribe', 52 | 'model': 'tiny.en' 53 | }) 54 | self.server.recv_audio(mock_websocket, BackendType("faster_whisper")) 55 | 56 | @mock.patch('websockets.WebSocketCommonProtocol') 57 | def test_recv_audio_exception_handling(self, mock_websocket): 58 | mock_websocket.recv.side_effect = [json.dumps({ 59 | 'uid': 'test_client', 60 | 'language': 'en', 61 | 'task': 'transcribe', 62 | 'model': 'tiny.en' 63 | }), np.array([1, 2, 3]).tobytes()] 64 | 65 | with self.assertLogs(level="ERROR"): 66 | self.server.recv_audio(mock_websocket, BackendType("faster_whisper")) 67 | 68 | self.assertNotIn(mock_websocket, self.server.client_manager.clients) 69 | 70 | 71 | class TestServerInferenceAccuracy(unittest.TestCase): 72 | @classmethod 73 | def setUpClass(cls): 74 | cls.mock_pyaudio_patch = mock.patch('pyaudio.PyAudio') 75 | cls.mock_pyaudio = cls.mock_pyaudio_patch.start() 76 | cls.mock_pyaudio.return_value.open.return_value = mock.MagicMock() 77 | 78 | cls.server_process = subprocess.Popen(["python", "run_server.py"]) 79 | time.sleep(2) 80 | 81 | @classmethod 82 | def tearDownClass(cls): 83 | cls.server_process.terminate() 84 | cls.server_process.wait() 85 | 86 | def setUp(self): 87 | self.normalizer = EnglishTextNormalizer() 88 | 89 | def check_prediction(self, srt_path): 90 | gt = "And so my fellow Americans, ask not, what your country can do for you. Ask what you can do for your country!" 
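        # SRT output is written in four-line blocks (index, timestamps, text, blank line),
        # so lines[2::4] below keeps only the text lines; both prediction and ground truth
        # are normalized before asserting a word error rate below 5%.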
91 | with open(srt_path, "r") as f: 92 | lines = f.readlines() 93 | prediction = " ".join([line.strip() for line in lines[2::4]]) 94 | prediction_normalized = self.normalizer(prediction) 95 | gt_normalized = self.normalizer(gt) 96 | 97 | # calculate WER 98 | wer_score = jiwer.wer(gt_normalized, prediction_normalized) 99 | self.assertLess(wer_score, 0.05) 100 | 101 | def test_inference(self): 102 | client = TranscriptionClient( 103 | "localhost", "9090", model="base.en", lang="en", 104 | ) 105 | client("assets/jfk.flac") 106 | self.check_prediction("output.srt") 107 | 108 | def test_simultaneous_inference(self): 109 | client1 = Client( 110 | "localhost", "9090", model="base.en", lang="en", srt_file_path="transcript1.srt") 111 | client2 = Client( 112 | "localhost", "9090", model="base.en", lang="en", srt_file_path="transcript2.srt") 113 | tee = TranscriptionTeeClient([client1, client2]) 114 | tee("assets/jfk.flac") 115 | self.check_prediction("transcript1.srt") 116 | self.check_prediction("transcript2.srt") 117 | 118 | 119 | class TestExceptionHandling(unittest.TestCase): 120 | def setUp(self): 121 | self.server = TranscriptionServer() 122 | 123 | @mock.patch('websockets.WebSocketCommonProtocol') 124 | def test_connection_closed_exception(self, mock_websocket): 125 | mock_websocket.recv.side_effect = ConnectionClosed(1001, "testing connection closed", rcvd_then_sent=mock.Mock()) 126 | 127 | with self.assertLogs(level="INFO") as log: 128 | self.server.recv_audio(mock_websocket, BackendType("faster_whisper")) 129 | self.assertTrue(any("Connection closed by client" in message for message in log.output)) 130 | 131 | @mock.patch('websockets.WebSocketCommonProtocol') 132 | def test_json_decode_exception(self, mock_websocket): 133 | mock_websocket.recv.return_value = "invalid json" 134 | 135 | with self.assertLogs(level="ERROR") as log: 136 | self.server.recv_audio(mock_websocket, BackendType("faster_whisper")) 137 | self.assertTrue(any("Failed to decode JSON from client" in message for message in log.output)) 138 | 139 | @mock.patch('websockets.WebSocketCommonProtocol') 140 | def test_unexpected_exception_handling(self, mock_websocket): 141 | mock_websocket.recv.side_effect = RuntimeError("Unexpected error") 142 | 143 | with self.assertLogs(level="ERROR") as log: 144 | self.server.recv_audio(mock_websocket, BackendType("faster_whisper")) 145 | for message in log.output: 146 | print(message) 147 | print() 148 | self.assertTrue(any("Unexpected error" in message for message in log.output)) 149 | -------------------------------------------------------------------------------- /tests/test_vad.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from whisper_live.transcriber.tensorrt_utils import load_audio 4 | from whisper_live.vad import VoiceActivityDetector 5 | 6 | 7 | class TestVoiceActivityDetection(unittest.TestCase): 8 | def setUp(self): 9 | self.vad = VoiceActivityDetector() 10 | self.sample_rate = 16000 11 | 12 | def generate_silence(self, duration_seconds): 13 | return np.zeros(int(self.sample_rate * duration_seconds), dtype=np.float32) 14 | 15 | def load_speech_segment(self, filepath): 16 | return load_audio(filepath) 17 | 18 | def test_vad_silence_detection(self): 19 | silence = self.generate_silence(3) 20 | is_speech_present = self.vad(silence.copy()) 21 | self.assertFalse(is_speech_present, "VAD incorrectly identified silence as speech.") 22 | 23 | def test_vad_speech_detection(self): 24 | audio_tensor = 
load_audio("assets/jfk.flac") 25 | is_speech_present = self.vad(audio_tensor) 26 | self.assertTrue(is_speech_present, "VAD failed to identify speech segment.") 27 | -------------------------------------------------------------------------------- /whisper_live/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/whisper_live/__init__.py -------------------------------------------------------------------------------- /whisper_live/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.7.1" 2 | -------------------------------------------------------------------------------- /whisper_live/backend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/whisper_live/backend/__init__.py -------------------------------------------------------------------------------- /whisper_live/backend/faster_whisper_backend.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | import threading 5 | import time 6 | import torch 7 | import ctranslate2 8 | from huggingface_hub import snapshot_download 9 | 10 | from whisper_live.transcriber.transcriber_faster_whisper import WhisperModel 11 | from whisper_live.backend.base import ServeClientBase 12 | 13 | 14 | class ServeClientFasterWhisper(ServeClientBase): 15 | SINGLE_MODEL = None 16 | SINGLE_MODEL_LOCK = threading.Lock() 17 | 18 | def __init__( 19 | self, 20 | websocket, 21 | task="transcribe", 22 | device=None, 23 | language=None, 24 | client_uid=None, 25 | model="small.en", 26 | initial_prompt=None, 27 | vad_parameters=None, 28 | use_vad=True, 29 | single_model=False, 30 | send_last_n_segments=10, 31 | no_speech_thresh=0.45, 32 | clip_audio=False, 33 | same_output_threshold=10, 34 | cache_path="~/.cache/whisper-live/" 35 | ): 36 | """ 37 | Initialize a ServeClient instance. 38 | The Whisper model is initialized based on the client's language and device availability. 39 | The transcription thread is started upon initialization. A "SERVER_READY" message is sent 40 | to the client to indicate that the server is ready. 41 | 42 | Args: 43 | websocket (WebSocket): The WebSocket connection for the client. 44 | task (str, optional): The task type, e.g., "transcribe". Defaults to "transcribe". 45 | device (str, optional): The device type for Whisper, "cuda" or "cpu". Defaults to None. 46 | language (str, optional): The language for transcription. Defaults to None. 47 | client_uid (str, optional): A unique identifier for the client. Defaults to None. 48 | model (str, optional): The whisper model size. Defaults to 'small.en' 49 | initial_prompt (str, optional): Prompt for whisper inference. Defaults to None. 50 | single_model (bool, optional): Whether to instantiate a new model for each client connection. Defaults to False. 51 | send_last_n_segments (int, optional): Number of most recent segments to send to the client. Defaults to 10. 52 | no_speech_thresh (float, optional): Segments with no speech probability above this threshold will be discarded. Defaults to 0.45. 53 | clip_audio (bool, optional): Whether to clip audio with no valid segments. Defaults to False. 
54 | same_output_threshold (int, optional): Number of repeated outputs before considering it as a valid segment. Defaults to 10. 55 | 56 | """ 57 | super().__init__( 58 | client_uid, 59 | websocket, 60 | send_last_n_segments, 61 | no_speech_thresh, 62 | clip_audio, 63 | same_output_threshold, 64 | ) 65 | self.cache_path = cache_path 66 | self.model_sizes = [ 67 | "tiny", "tiny.en", "base", "base.en", "small", "small.en", 68 | "medium", "medium.en", "large-v2", "large-v3", "distil-small.en", 69 | "distil-medium.en", "distil-large-v2", "distil-large-v3", 70 | "large-v3-turbo", "turbo" 71 | ] 72 | 73 | self.model_size_or_path = model 74 | self.language = "en" if self.model_size_or_path.endswith("en") else language 75 | self.task = task 76 | self.initial_prompt = initial_prompt 77 | self.vad_parameters = vad_parameters or {"onset": 0.5} 78 | 79 | device = "cuda" if torch.cuda.is_available() else "cpu" 80 | if device == "cuda": 81 | major, _ = torch.cuda.get_device_capability(device) 82 | self.compute_type = "float16" if major >= 7 else "float32" 83 | else: 84 | self.compute_type = "int8" 85 | 86 | if self.model_size_or_path is None: 87 | return 88 | logging.info(f"Using Device={device} with precision {self.compute_type}") 89 | 90 | try: 91 | if single_model: 92 | if ServeClientFasterWhisper.SINGLE_MODEL is None: 93 | self.create_model(device) 94 | ServeClientFasterWhisper.SINGLE_MODEL = self.transcriber 95 | else: 96 | self.transcriber = ServeClientFasterWhisper.SINGLE_MODEL 97 | else: 98 | self.create_model(device) 99 | except Exception as e: 100 | logging.error(f"Failed to load model: {e}") 101 | self.websocket.send(json.dumps({ 102 | "uid": self.client_uid, 103 | "status": "ERROR", 104 | "message": f"Failed to load model: {str(self.model_size_or_path)}" 105 | })) 106 | self.websocket.close() 107 | return 108 | 109 | self.use_vad = use_vad 110 | 111 | # threading 112 | self.trans_thread = threading.Thread(target=self.speech_to_text) 113 | self.trans_thread.start() 114 | self.websocket.send( 115 | json.dumps( 116 | { 117 | "uid": self.client_uid, 118 | "message": self.SERVER_READY, 119 | "backend": "faster_whisper" 120 | } 121 | ) 122 | ) 123 | 124 | def create_model(self, device): 125 | """ 126 | Instantiates a new model, sets it as the transcriber. If model is a huggingface model_id 127 | then it is automatically converted to ctranslate2(faster_whisper) format. 
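        For example, a hypothetical Hugging Face id such as "myorg/whisper-small-finetuned"
        is snapshot-downloaded first; if the snapshot is not already a CTranslate2 model, it is
        converted once and cached under "<cache_path>/whisper-ct2-models/myorg--whisper-small-finetuned",
        and that directory is what gets loaded.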
128 | """ 129 | model_ref = self.model_size_or_path 130 | 131 | if model_ref in self.model_sizes: 132 | model_to_load = model_ref 133 | else: 134 | logging.info(f"Model not in model_sizes") 135 | if os.path.isdir(model_ref) and ctranslate2.contains_model(model_ref): 136 | model_to_load = model_ref 137 | else: 138 | local_snapshot = snapshot_download( 139 | repo_id = model_ref, 140 | repo_type = "model", 141 | ) 142 | if ctranslate2.contains_model(local_snapshot): 143 | model_to_load = local_snapshot 144 | else: 145 | cache_root = os.path.expanduser(os.path.join(self.cache_path, "whisper-ct2-models/")) 146 | os.makedirs(cache_root, exist_ok=True) 147 | safe_name = model_ref.replace("/", "--") 148 | ct2_dir = os.path.join(cache_root, safe_name) 149 | 150 | if not ctranslate2.contains_model(ct2_dir): 151 | logging.info(f"Converting '{model_ref}' to CTranslate2 @ {ct2_dir}") 152 | ct2_converter = ctranslate2.converters.TransformersConverter( 153 | local_snapshot, 154 | copy_files=["tokenizer.json", "preprocessor_config.json"] 155 | ) 156 | ct2_converter.convert( 157 | output_dir=ct2_dir, 158 | quantization=self.compute_type, 159 | force=False, # skip if already up-to-date 160 | ) 161 | model_to_load = ct2_dir 162 | 163 | logging.info(f"Loading model: {model_to_load}") 164 | self.transcriber = WhisperModel( 165 | model_to_load, 166 | device=device, 167 | compute_type=self.compute_type, 168 | local_files_only=False, 169 | ) 170 | 171 | def set_language(self, info): 172 | """ 173 | Updates the language attribute based on the detected language information. 174 | 175 | Args: 176 | info (object): An object containing the detected language and its probability. This object 177 | must have at least two attributes: `language`, a string indicating the detected 178 | language, and `language_probability`, a float representing the confidence level 179 | of the language detection. 180 | """ 181 | if info.language_probability > 0.5: 182 | self.language = info.language 183 | logging.info(f"Detected language {self.language} with probability {info.language_probability}") 184 | self.websocket.send(json.dumps( 185 | {"uid": self.client_uid, "language": self.language, "language_prob": info.language_probability})) 186 | 187 | def transcribe_audio(self, input_sample): 188 | """ 189 | Transcribes the provided audio sample using the configured transcriber instance. 190 | 191 | If the language has not been set, it updates the session's language based on the transcription 192 | information. 193 | 194 | Args: 195 | input_sample (np.array): The audio chunk to be transcribed. This should be a NumPy 196 | array representing the audio data. 197 | 198 | Returns: 199 | The transcription result from the transcriber. The exact format of this result 200 | depends on the implementation of the `transcriber.transcribe` method but typically 201 | includes the transcribed text. 
202 | """ 203 | if ServeClientFasterWhisper.SINGLE_MODEL: 204 | ServeClientFasterWhisper.SINGLE_MODEL_LOCK.acquire() 205 | result, info = self.transcriber.transcribe( 206 | input_sample, 207 | initial_prompt=self.initial_prompt, 208 | language=self.language, 209 | task=self.task, 210 | vad_filter=self.use_vad, 211 | vad_parameters=self.vad_parameters if self.use_vad else None) 212 | if ServeClientFasterWhisper.SINGLE_MODEL: 213 | ServeClientFasterWhisper.SINGLE_MODEL_LOCK.release() 214 | 215 | if self.language is None and info is not None: 216 | self.set_language(info) 217 | return result 218 | 219 | def handle_transcription_output(self, result, duration): 220 | """ 221 | Handle the transcription output, updating the transcript and sending data to the client. 222 | 223 | Args: 224 | result (str): The result from whisper inference i.e. the list of segments. 225 | duration (float): Duration of the transcribed audio chunk. 226 | """ 227 | segments = [] 228 | if len(result): 229 | self.t_start = None 230 | last_segment = self.update_segments(result, duration) 231 | segments = self.prepare_segments(last_segment) 232 | 233 | if len(segments): 234 | self.send_transcription_to_client(segments) 235 | -------------------------------------------------------------------------------- /whisper_live/backend/openvino_backend.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import threading 4 | import time 5 | 6 | from openvino import Core 7 | from whisper_live.backend.base import ServeClientBase 8 | from whisper_live.transcriber.transcriber_openvino import WhisperOpenVINO 9 | 10 | 11 | class ServeClientOpenVINO(ServeClientBase): 12 | SINGLE_MODEL = None 13 | SINGLE_MODEL_LOCK = threading.Lock() 14 | 15 | def __init__( 16 | self, 17 | websocket, 18 | task="transcribe", 19 | device=None, 20 | language=None, 21 | client_uid=None, 22 | model="small.en", 23 | initial_prompt=None, 24 | vad_parameters=None, 25 | use_vad=True, 26 | single_model=False, 27 | send_last_n_segments=10, 28 | no_speech_thresh=0.45, 29 | clip_audio=False, 30 | same_output_threshold=10, 31 | ): 32 | """ 33 | Initialize a ServeClient instance. 34 | The Whisper model is initialized based on the client's language and device availability. 35 | The transcription thread is started upon initialization. A "SERVER_READY" message is sent 36 | to the client to indicate that the server is ready. 37 | 38 | Args: 39 | websocket (WebSocket): The WebSocket connection for the client. 40 | task (str, optional): The task type, e.g., "transcribe." Defaults to "transcribe". 41 | device (str, optional): The device type for Whisper, "cuda" or "cpu". Defaults to None. 42 | language (str, optional): The language for transcription. Defaults to None. 43 | client_uid (str, optional): A unique identifier for the client. Defaults to None. 44 | model (str, optional): Huggingface model_id for a valid OpenVINO model. 45 | initial_prompt (str, optional): Prompt for whisper inference. Defaults to None. 46 | single_model (bool, optional): Whether to instantiate a new model for each client connection. Defaults to False. 47 | send_last_n_segments (int, optional): Number of most recent segments to send to the client. Defaults to 10. 48 | no_speech_thresh (float, optional): Segments with no speech probability above this threshold will be discarded. Defaults to 0.45. 49 | clip_audio (bool, optional): Whether to clip audio with no valid segments. Defaults to False. 
50 | same_output_threshold (int, optional): Number of repeated outputs before considering it as a valid segment. Defaults to 10. 51 | """ 52 | super().__init__( 53 | client_uid, 54 | websocket, 55 | send_last_n_segments, 56 | no_speech_thresh, 57 | clip_audio, 58 | same_output_threshold, 59 | ) 60 | self.language = "en" if language is None else language 61 | if not self.language.startswith("<|"): 62 | self.language = f"<|{self.language}|>" 63 | 64 | self.task = "transcribe" if task is None else task 65 | 66 | self.clip_audio = True 67 | 68 | core = Core() 69 | available_devices = core.available_devices 70 | if 'GPU' in available_devices: 71 | selected_device = 'GPU' 72 | else: 73 | gpu_devices = [d for d in available_devices if d.startswith('GPU')] 74 | selected_device = gpu_devices[0] if gpu_devices else 'CPU' 75 | self.device = selected_device 76 | 77 | 78 | if single_model: 79 | if ServeClientOpenVINO.SINGLE_MODEL is None: 80 | self.create_model(model) 81 | ServeClientOpenVINO.SINGLE_MODEL = self.transcriber 82 | else: 83 | self.transcriber = ServeClientOpenVINO.SINGLE_MODEL 84 | else: 85 | self.create_model(model) 86 | 87 | # threading 88 | self.trans_thread = threading.Thread(target=self.speech_to_text) 89 | self.trans_thread.start() 90 | 91 | self.websocket.send(json.dumps({ 92 | "uid": self.client_uid, 93 | "message": self.SERVER_READY, 94 | "backend": "openvino" 95 | })) 96 | logging.info(f"Using OpenVINO device: {self.device}") 97 | logging.info(f"Running OpenVINO backend with language: {self.language} and task: {self.task}") 98 | 99 | def create_model(self, model_id): 100 | """ 101 | Instantiates a new model, sets it as the transcriber. 102 | """ 103 | self.transcriber = WhisperOpenVINO( 104 | model_id, 105 | device=self.device, 106 | language=self.language, 107 | task=self.task 108 | ) 109 | 110 | def transcribe_audio(self, input_sample): 111 | """ 112 | Transcribes the provided audio sample using the configured transcriber instance. 113 | 114 | If the language has not been set, it updates the session's language based on the transcription 115 | information. 116 | 117 | Args: 118 | input_sample (np.array): The audio chunk to be transcribed. This should be a NumPy 119 | array representing the audio data. 120 | 121 | Returns: 122 | The transcription result from the transcriber. The exact format of this result 123 | depends on the implementation of the `transcriber.transcribe` method but typically 124 | includes the transcribed text. 125 | """ 126 | if ServeClientOpenVINO.SINGLE_MODEL: 127 | ServeClientOpenVINO.SINGLE_MODEL_LOCK.acquire() 128 | result = self.transcriber.transcribe(input_sample) 129 | if ServeClientOpenVINO.SINGLE_MODEL: 130 | ServeClientOpenVINO.SINGLE_MODEL_LOCK.release() 131 | return result 132 | 133 | def handle_transcription_output(self, result, duration): 134 | """ 135 | Handle the transcription output, updating the transcript and sending data to the client. 136 | 137 | Args: 138 | result (str): The result from whisper inference i.e. the list of segments. 139 | duration (float): Duration of the transcribed audio chunk. 
140 | """ 141 | segments = [] 142 | if len(result): 143 | self.t_start = None 144 | last_segment = self.update_segments(result, duration) 145 | segments = self.prepare_segments(last_segment) 146 | 147 | if len(segments): 148 | self.send_transcription_to_client(segments) 149 | -------------------------------------------------------------------------------- /whisper_live/backend/trt_backend.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import threading 4 | import time 5 | 6 | from whisper_live.backend.base import ServeClientBase 7 | from whisper_live.transcriber.transcriber_tensorrt import WhisperTRTLLM 8 | 9 | 10 | class ServeClientTensorRT(ServeClientBase): 11 | SINGLE_MODEL = None 12 | SINGLE_MODEL_LOCK = threading.Lock() 13 | 14 | def __init__( 15 | self, 16 | websocket, 17 | task="transcribe", 18 | multilingual=False, 19 | language=None, 20 | client_uid=None, 21 | model=None, 22 | single_model=False, 23 | use_py_session=False, 24 | max_new_tokens=225, 25 | send_last_n_segments=10, 26 | no_speech_thresh=0.45, 27 | clip_audio=False, 28 | same_output_threshold=10, 29 | ): 30 | """ 31 | Initialize a ServeClient instance. 32 | The Whisper model is initialized based on the client's language and device availability. 33 | The transcription thread is started upon initialization. A "SERVER_READY" message is sent 34 | to the client to indicate that the server is ready. 35 | 36 | Args: 37 | websocket (WebSocket): The WebSocket connection for the client. 38 | task (str, optional): The task type, e.g., "transcribe." Defaults to "transcribe". 39 | device (str, optional): The device type for Whisper, "cuda" or "cpu". Defaults to None. 40 | multilingual (bool, optional): Whether the client supports multilingual transcription. Defaults to False. 41 | language (str, optional): The language for transcription. Defaults to None. 42 | client_uid (str, optional): A unique identifier for the client. Defaults to None. 43 | single_model (bool, optional): Whether to instantiate a new model for each client connection. Defaults to False. 44 | use_py_session (bool, optional): Use python session or cpp session. Defaults to Cpp Session. 45 | max_new_tokens (int, optional): Max number of tokens to generate. 46 | send_last_n_segments (int, optional): Number of most recent segments to send to the client. Defaults to 10. 47 | no_speech_thresh (float, optional): Segments with no speech probability above this threshold will be discarded. Defaults to 0.45. 48 | clip_audio (bool, optional): Whether to clip audio with no valid segments. Defaults to False. 49 | same_output_threshold (int, optional): Number of repeated outputs before considering it as a valid segment. Defaults to 10. 
50 | """ 51 | super().__init__( 52 | client_uid, 53 | websocket, 54 | send_last_n_segments, 55 | no_speech_thresh, 56 | clip_audio, 57 | same_output_threshold, 58 | ) 59 | 60 | self.language = language if multilingual else "en" 61 | self.task = task 62 | self.eos = False 63 | self.max_new_tokens = max_new_tokens 64 | 65 | if single_model: 66 | if ServeClientTensorRT.SINGLE_MODEL is None: 67 | self.create_model(model, multilingual, use_py_session=use_py_session) 68 | ServeClientTensorRT.SINGLE_MODEL = self.transcriber 69 | else: 70 | self.transcriber = ServeClientTensorRT.SINGLE_MODEL 71 | else: 72 | self.create_model(model, multilingual, use_py_session=use_py_session) 73 | 74 | # threading 75 | self.trans_thread = threading.Thread(target=self.speech_to_text) 76 | self.trans_thread.start() 77 | 78 | self.websocket.send(json.dumps({ 79 | "uid": self.client_uid, 80 | "message": self.SERVER_READY, 81 | "backend": "tensorrt" 82 | })) 83 | 84 | def create_model(self, model, multilingual, warmup=True, use_py_session=False): 85 | """ 86 | Instantiates a new model, sets it as the transcriber and does warmup if desired. 87 | """ 88 | self.transcriber = WhisperTRTLLM( 89 | model, 90 | assets_dir="assets", 91 | device="cuda", 92 | is_multilingual=multilingual, 93 | language=self.language, 94 | task=self.task, 95 | use_py_session=use_py_session, 96 | max_output_len=self.max_new_tokens, 97 | ) 98 | if warmup: 99 | self.warmup() 100 | 101 | def warmup(self, warmup_steps=10): 102 | """ 103 | Warmup TensorRT since first few inferences are slow. 104 | 105 | Args: 106 | warmup_steps (int): Number of steps to warm up the model for. 107 | """ 108 | logging.info("[INFO:] Warming up TensorRT engine..") 109 | mel, _ = self.transcriber.log_mel_spectrogram("assets/jfk.flac") 110 | for i in range(warmup_steps): 111 | self.transcriber.transcribe(mel) 112 | 113 | def set_eos(self, eos): 114 | """ 115 | Sets the End of Speech (EOS) flag. 116 | 117 | Args: 118 | eos (bool): The value to set for the EOS flag. 119 | """ 120 | self.lock.acquire() 121 | self.eos = eos 122 | self.lock.release() 123 | 124 | def handle_transcription_output(self, last_segment, duration): 125 | """ 126 | Handle the transcription output, updating the transcript and sending data to the client. 127 | 128 | Args: 129 | last_segment (str): The last segment from the whisper output which is considered to be incomplete because 130 | of the possibility of word being truncated. 131 | duration (float): Duration of the transcribed audio chunk. 132 | """ 133 | segments = self.prepare_segments({"text": last_segment}) 134 | self.send_transcription_to_client(segments) 135 | if self.eos: 136 | self.update_timestamp_offset(last_segment, duration) 137 | 138 | def transcribe_audio(self, input_bytes): 139 | """ 140 | Transcribe the audio chunk and send the results to the client. 141 | 142 | Args: 143 | input_bytes (np.array): The audio chunk to transcribe. 
144 | """ 145 | if ServeClientTensorRT.SINGLE_MODEL: 146 | ServeClientTensorRT.SINGLE_MODEL_LOCK.acquire() 147 | logging.info(f"[WhisperTensorRT:] Processing audio with duration: {input_bytes.shape[0] / self.RATE}") 148 | mel, duration = self.transcriber.log_mel_spectrogram(input_bytes) 149 | last_segment = self.transcriber.transcribe( 150 | mel, 151 | text_prefix=f"<|startoftranscript|><|{self.language}|><|{self.task}|><|notimestamps|>", 152 | ) 153 | if ServeClientTensorRT.SINGLE_MODEL: 154 | ServeClientTensorRT.SINGLE_MODEL_LOCK.release() 155 | if last_segment: 156 | self.handle_transcription_output(last_segment, duration) 157 | 158 | def update_timestamp_offset(self, last_segment, duration): 159 | """ 160 | Update timestamp offset and transcript. 161 | 162 | Args: 163 | last_segment (str): Last transcribed audio from the whisper model. 164 | duration (float): Duration of the last audio chunk. 165 | """ 166 | if not len(self.transcript): 167 | self.transcript.append({"text": last_segment + " "}) 168 | elif self.transcript[-1]["text"].strip() != last_segment: 169 | self.transcript.append({"text": last_segment + " "}) 170 | 171 | with self.lock: 172 | self.timestamp_offset += duration 173 | 174 | def speech_to_text(self): 175 | """ 176 | Process an audio stream in an infinite loop, continuously transcribing the speech. 177 | 178 | This method continuously receives audio frames, performs real-time transcription, and sends 179 | transcribed segments to the client via a WebSocket connection. 180 | 181 | If the client's language is not detected, it waits for 30 seconds of audio input to make a language prediction. 182 | It utilizes the Whisper ASR model to transcribe the audio, continuously processing and streaming results. Segments 183 | are sent to the client in real-time, and a history of segments is maintained to provide context. 184 | 185 | Raises: 186 | Exception: If there is an issue with audio processing or WebSocket communication. 
187 | 188 | """ 189 | while True: 190 | if self.exit: 191 | logging.info("Exiting speech to text thread") 192 | break 193 | 194 | if self.frames_np is None: 195 | time.sleep(0.02) # wait for any audio to arrive 196 | continue 197 | 198 | self.clip_audio_if_no_valid_segment() 199 | 200 | input_bytes, duration = self.get_audio_chunk_for_processing() 201 | if duration < 0.4: 202 | continue 203 | 204 | try: 205 | input_sample = input_bytes.copy() 206 | logging.info(f"[WhisperTensorRT:] Processing audio with duration: {duration}") 207 | self.transcribe_audio(input_sample) 208 | 209 | except Exception as e: 210 | logging.error(f"[ERROR]: {e}") 211 | -------------------------------------------------------------------------------- /whisper_live/transcriber/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collabora/WhisperLive/4ae38256611dbf1b9f9b3fc009cc37c2bf0f0b00/whisper_live/transcriber/__init__.py -------------------------------------------------------------------------------- /whisper_live/transcriber/transcriber_openvino.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import os 3 | 4 | import openvino_genai as ov_genai 5 | import huggingface_hub as hf_hub 6 | 7 | 8 | class WhisperOpenVINO(object): 9 | def __init__(self, model_id="OpenVINO/whisper-tiny-fp16-ov", device="CPU", language="en", task="transcribe"): 10 | model_path = model_id.split('/')[-1] 11 | cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "openvino_whisper_models") 12 | os.makedirs(cache_dir, exist_ok=True) 13 | model_path = os.path.join(cache_dir, model_path) 14 | if not os.path.exists(model_path): 15 | hf_hub.snapshot_download(model_id, local_dir=model_path) 16 | self.model = ov_genai.WhisperPipeline(str(model_path), device=device) 17 | self.language = language 18 | self.task = task 19 | 20 | def transcribe(self, input_audio): 21 | outputs = self.model.generate(input_audio, return_timestamps=True, language=self.language, task=self.task) 22 | outputs = [seg for seg in outputs.chunks] 23 | return outputs 24 | -------------------------------------------------------------------------------- /whisper_live/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import textwrap 3 | import scipy 4 | import numpy as np 5 | import av 6 | from pathlib import Path 7 | 8 | 9 | def clear_screen(): 10 | """Clears the console screen.""" 11 | os.system("cls" if os.name == "nt" else "clear") 12 | 13 | 14 | def print_transcript(text): 15 | """Prints formatted transcript text.""" 16 | wrapper = textwrap.TextWrapper(width=60) 17 | for line in wrapper.wrap(text="".join(text)): 18 | print(line) 19 | 20 | 21 | def format_time(s): 22 | """Convert seconds (float) to SRT time format.""" 23 | hours = int(s // 3600) 24 | minutes = int((s % 3600) // 60) 25 | seconds = int(s % 60) 26 | milliseconds = int((s - int(s)) * 1000) 27 | return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}" 28 | 29 | 30 | def create_srt_file(segments, resampled_file): 31 | with open(resampled_file, 'w', encoding='utf-8') as srt_file: 32 | segment_number = 1 33 | for segment in segments: 34 | start_time = format_time(float(segment['start'])) 35 | end_time = format_time(float(segment['end'])) 36 | text = segment['text'] 37 | 38 | srt_file.write(f"{segment_number}\n") 39 | srt_file.write(f"{start_time} --> {end_time}\n") 40 | srt_file.write(f"{text}\n\n") 41 | 42 | 
segment_number += 1 43 | 44 | 45 | def resample(file: str, sr: int = 16000): 46 | """ 47 | Resample the audio file to a 16 kHz mono WAV file. 48 | 49 | Args: 50 | file (str): The audio file to open. 51 | sr (int): The target sample rate to resample the audio to, if necessary. Defaults to 16000. 52 | 53 | Returns: 54 | resampled_file (str): Path to the resampled audio file. 55 | """ 56 | container = av.open(file) 57 | stream = next(s for s in container.streams if s.type == 'audio') 58 | 59 | resampler = av.AudioResampler( 60 | format='s16', 61 | layout='mono', 62 | rate=sr, 63 | ) 64 | 65 | resampled_file = Path(file).stem + "_resampled.wav" 66 | output_container = av.open(resampled_file, mode='w') 67 | output_stream = output_container.add_stream('pcm_s16le', rate=sr) 68 | output_stream.layout = 'mono' 69 | 70 | for frame in container.decode(audio=0): 71 | frame.pts = None 72 | resampled_frames = resampler.resample(frame) 73 | if resampled_frames is not None: 74 | for resampled_frame in resampled_frames: 75 | for packet in output_stream.encode(resampled_frame): 76 | output_container.mux(packet) 77 | 78 | for packet in output_stream.encode(None): 79 | output_container.mux(packet) 80 | 81 | output_container.close() 82 | return resampled_file 83 | -------------------------------------------------------------------------------- /whisper_live/vad.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import torch 4 | import numpy as np 5 | import onnxruntime 6 | import warnings 7 | 8 | 9 | class VoiceActivityDetection: 10 | 11 | def __init__(self, force_onnx_cpu=True): 12 | path = self.download() 13 | 14 | opts = onnxruntime.SessionOptions() 15 | opts.log_severity_level = 3 16 | 17 | opts.inter_op_num_threads = 1 18 | opts.intra_op_num_threads = 1 19 | 20 | if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers(): 21 | self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'], sess_options=opts) 22 | else: 23 | self.session = onnxruntime.InferenceSession(path, providers=['CUDAExecutionProvider'], sess_options=opts) 24 | 25 | self.reset_states() 26 | if '16k' in path: 27 | warnings.warn('This model supports only a 16000 Hz sampling rate!') 28 | self.sample_rates = [16000] 29 | else: 30 | self.sample_rates = [8000, 16000] 31 | 32 | def _validate_input(self, x, sr: int): 33 | if x.dim() == 1: 34 | x = x.unsqueeze(0) 35 | if x.dim() > 2: 36 | raise ValueError(f"Too many dimensions for input audio chunk {x.dim()}") 37 | 38 | if sr != 16000 and (sr % 16000 == 0): 39 | step = sr // 16000 40 | x = x[:, ::step] 41 | sr = 16000 42 | 43 | if sr not in self.sample_rates: 44 | raise ValueError(f"Supported sampling rates: {self.sample_rates} (or a multiple of 16000)") 45 | if sr / x.shape[1] > 31.25: 46 | raise ValueError("Input audio chunk is too short") 47 | 48 | return x, sr 49 | 50 | def reset_states(self, batch_size=1): 51 | self._state = torch.zeros((2, batch_size, 128)).float() 52 | self._context = torch.zeros(0) 53 | self._last_sr = 0 54 | self._last_batch_size = 0 55 | 56 | def __call__(self, x, sr: int): 57 | 58 | x, sr = self._validate_input(x, sr) 59 | num_samples = 512 if sr == 16000 else 256 60 | 61 | if x.shape[-1] != num_samples: 62 | raise ValueError(f"Provided number of samples is {x.shape[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)") 63 | 64 | batch_size = x.shape[0] 65 | context_size = 64 if sr == 16000 else 32 66 | 67 | if not self._last_batch_size: 68 | self.reset_states(batch_size) 69 | if (self._last_sr)
and (self._last_sr != sr): 70 | self.reset_states(batch_size) 71 | if (self._last_batch_size) and (self._last_batch_size != batch_size): 72 | self.reset_states(batch_size) 73 | 74 | if not len(self._context): 75 | self._context = torch.zeros(batch_size, context_size) 76 | 77 | x = torch.cat([self._context, x], dim=1) 78 | if sr in [8000, 16000]: 79 | ort_inputs = {'input': x.numpy(), 'state': self._state.numpy(), 'sr': np.array(sr, dtype='int64')} 80 | ort_outs = self.session.run(None, ort_inputs) 81 | out, state = ort_outs 82 | self._state = torch.from_numpy(state) 83 | else: 84 | raise ValueError() 85 | 86 | self._context = x[..., -context_size:] 87 | self._last_sr = sr 88 | self._last_batch_size = batch_size 89 | 90 | out = torch.from_numpy(out) 91 | return out 92 | 93 | def audio_forward(self, x, sr: int): 94 | outs = [] 95 | x, sr = self._validate_input(x, sr) 96 | self.reset_states() 97 | num_samples = 512 if sr == 16000 else 256 98 | 99 | if x.shape[1] % num_samples: 100 | pad_num = num_samples - (x.shape[1] % num_samples) 101 | x = torch.nn.functional.pad(x, (0, pad_num), 'constant', value=0.0) 102 | 103 | for i in range(0, x.shape[1], num_samples): 104 | wavs_batch = x[:, i:i+num_samples] 105 | out_chunk = self.__call__(wavs_batch, sr) 106 | outs.append(out_chunk) 107 | 108 | stacked = torch.cat(outs, dim=1) 109 | return stacked.cpu() 110 | 111 | @staticmethod 112 | def download(model_url="https://github.com/snakers4/silero-vad/raw/v5.0/files/silero_vad.onnx"): 113 | target_dir = os.path.expanduser("~/.cache/whisper-live/") 114 | 115 | # Ensure the target directory exists 116 | os.makedirs(target_dir, exist_ok=True) 117 | 118 | # Define the target file path 119 | model_filename = os.path.join(target_dir, "silero_vad.onnx") 120 | 121 | # Check if the model file already exists 122 | if not os.path.exists(model_filename): 123 | # If it doesn't exist, download the model using wget 124 | try: 125 | subprocess.run(["wget", "-O", model_filename, model_url], check=True) 126 | except subprocess.CalledProcessError: 127 | print("Failed to download the model using wget.") 128 | return model_filename 129 | 130 | 131 | class VoiceActivityDetector: 132 | def __init__(self, threshold=0.5, frame_rate=16000): 133 | """ 134 | Initializes the VoiceActivityDetector with a voice activity detection model and a threshold. 135 | 136 | Args: 137 | threshold (float, optional): The probability threshold for detecting voice activity. Defaults to 0.5. 138 | """ 139 | self.model = VoiceActivityDetection() 140 | self.threshold = threshold 141 | self.frame_rate = frame_rate 142 | 143 | def __call__(self, audio_frame): 144 | """ 145 | Determines if the given audio frame contains speech by comparing the detected speech probability against 146 | the threshold. 147 | 148 | Args: 149 | audio_frame (np.ndarray): The audio frame to be analyzed for voice activity. It is expected to be a 150 | NumPy array of audio samples. 151 | 152 | Returns: 153 | bool: True if the speech probability exceeds the threshold, indicating the presence of voice activity; 154 | False otherwise. 155 | """ 156 | speech_probs = self.model.audio_forward(torch.from_numpy(audio_frame.copy()), self.frame_rate)[0] 157 | return torch.any(speech_probs > self.threshold).item() 158 | --------------------------------------------------------------------------------
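The two VAD classes above are easiest to understand from the caller's side. The following is a minimal usage sketch, not part of the repository, showing how VoiceActivityDetector can gate incoming audio before it is handed to a transcription backend; the one-second, 16 kHz, mono float32 chunk is an assumed input format chosen to match the frame_rate default and the model's supported sample rates. Note that constructing the detector downloads the Silero VAD ONNX model into ~/.cache/whisper-live/ on first use.

import numpy as np
from whisper_live.vad import VoiceActivityDetector

# Probability threshold for speech and the expected sample rate of incoming chunks.
vad = VoiceActivityDetector(threshold=0.5, frame_rate=16000)

# Stand-in for a one-second, 16 kHz, mono float32 chunk captured from a microphone.
audio_chunk = np.zeros(16000, dtype=np.float32)

if vad(audio_chunk):
    print("Speech detected, forward the chunk to the transcriber.")
else:
    print("No speech detected, skip this chunk.")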
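Similarly, here is a short sketch of how the helpers in whisper_live/utils.py fit together, with hypothetical file names and segment values: resample() decodes the input file and writes "<stem>_resampled.wav" as 16 kHz mono PCM in the current working directory, and create_srt_file() serializes a list of segments with 'start', 'end', and 'text' keys into an SRT file.

from whisper_live.utils import resample, create_srt_file

# Produces "meeting_recording_resampled.wav" at 16 kHz mono (the input file is hypothetical).
wav_path = resample("meeting_recording.mp3")
print(wav_path)

# Hypothetical segments in the shape create_srt_file() expects.
segments = [
    {"start": "0.00", "end": "2.40", "text": "Hello and welcome."},
    {"start": "2.40", "end": "5.10", "text": "Let's get started."},
]
create_srt_file(segments, "meeting_recording.srt")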