├── speech_to_text ├── __init__.py ├── utils │ ├── __init__.py │ ├── file_utils.py │ └── audio_utils.py ├── assets │ ├── silero_vad.onnx │ ├── compute_types.json │ ├── model_sizes.json │ └── languages.json ├── settings │ └── user_settings.json ├── openai_api.py ├── vad.py ├── websoket_server.py ├── __main__.py └── audio_transcriber.py ├── docs ├── demo.gif ├── architecture.png └── transcription_speed.png ├── requirements.txt ├── websocket_client ├── websocket_client.html └── websocket_client_scripts.js ├── setup.py ├── LICENSE ├── run.bat ├── .gitignore ├── README.md ├── web ├── styles.css ├── scripts.js └── index.html └── ThirdPartyNotices.txt /speech_to_text/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /speech_to_text/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reriiasu/speech-to-text/HEAD/docs/demo.gif -------------------------------------------------------------------------------- /docs/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reriiasu/speech-to-text/HEAD/docs/architecture.png -------------------------------------------------------------------------------- /docs/transcription_speed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reriiasu/speech-to-text/HEAD/docs/transcription_speed.png -------------------------------------------------------------------------------- /speech_to_text/assets/silero_vad.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reriiasu/speech-to-text/HEAD/speech_to_text/assets/silero_vad.onnx -------------------------------------------------------------------------------- /speech_to_text/settings/user_settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "app_settings": {}, 3 | "model_settings": {}, 4 | "transcribe_settings": {} 5 | } 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | faster-whisper >= 1.0.3 2 | sounddevice >= 0.4.7 3 | eel >= 0.17.0 4 | soundfile >= 0.12.1 5 | websockets >= 12.0 6 | librosa >= 0.10.2.post1 7 | openai >= 1.37.0 -------------------------------------------------------------------------------- /speech_to_text/assets/compute_types.json: -------------------------------------------------------------------------------- 1 | { 2 | "compute_types": { 3 | "default": "default", 4 | "auto": "auto", 5 | "int8": "int8", 6 | "int8_float16": "int8_float16", 7 | "int16": "int16", 8 | "float16": "float16", 9 | "float32": "float32" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /websocket_client/websocket_client.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | WebScoketClienetSample 6 | 7 | 8 | 9 | 10 | 11 | 12 |
13 |
14 |
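<!-- websocket_client_scripts.js opens a WebSocket to ws://localhost:8765 and writes each received transcription into this page's element with id="message". -->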
15 | 16 | 17 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | def read_requirements(): 5 | with open("requirements.txt") as req: 6 | return [i.strip() for i in req] 7 | 8 | 9 | setup( 10 | name="speech-to-text", 11 | version="0.4.1", 12 | description="Real-time transcription using faster-whisper", 13 | author="reriiasu", 14 | url="https://github.com/reriiasu/speech-to-text", 15 | packages=find_packages(), 16 | install_requires=read_requirements(), 17 | ) 18 | -------------------------------------------------------------------------------- /speech_to_text/assets/model_sizes.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_sizes": { 3 | "tiny": "tiny", 4 | "tiny.en": "tiny.en", 5 | "base": "base", 6 | "base.en": "base.en", 7 | "small": "small", 8 | "small.en": "small.en", 9 | "medium": "medium", 10 | "medium.en": "medium.en", 11 | "large-v1": "large-v1", 12 | "large-v2": "large-v2", 13 | "large-v3": "large-v3", 14 | "distil-large-v2": "faster-distil-whisper-large-v2", 15 | "distil-medium.en": "faster-distil-whisper-medium.en", 16 | "distil-small.en": "faster-distil-whisper-small.en", 17 | "distil-large-v3": "faster-distil-whisper-large-v3", 18 | "local_model": "local_model" 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /websocket_client/websocket_client_scripts.js: -------------------------------------------------------------------------------- 1 | document.addEventListener("DOMContentLoaded", function () { 2 | const webSocket = new WebSocket("ws://localhost:8765"); 3 | 4 | webSocket.onopen = function () { 5 | console.log("WebSocket connection opened"); 6 | }; 7 | 8 | webSocket.onmessage = function (event) { 9 | console.log("WebSocket message received:", event.data); 10 | displayMessage(event.data); 11 | }; 12 | 13 | webSocket.onclose = function () { 14 | console.log("WebSocket connection closed"); 15 | }; 16 | 17 | webSocket.onerror = function (event) { 18 | console.error("WebSocket error:", event); 19 | }; 20 | 21 | function displayMessage(message) { 22 | const el = document.querySelector("#message"); 23 | el.textContent = message; 24 | } 25 | }); 26 | -------------------------------------------------------------------------------- /speech_to_text/openai_api.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import os 3 | 4 | 5 | class OpenAIAPI: 6 | def __init__(self): 7 | openai.api_key = os.getenv("OPENAI_API_KEY") 8 | self.MODEL_NAME = "gpt-3.5-turbo" 9 | self.MAX_TOKENS = 2000 10 | 11 | def text_proofreading(self, text: str): 12 | response = openai.ChatCompletion.create( 13 | model=self.MODEL_NAME, 14 | max_tokens=self.MAX_TOKENS, 15 | messages=[ 16 | { 17 | "role": "system", 18 | "content": "Please proofread. 
Please return only the proofreading results.", 19 | }, 20 | {"role": "user", "content": text}, 21 | ], 22 | ) 23 | return response.choices[0]["message"]["content"].strip() 24 | -------------------------------------------------------------------------------- /speech_to_text/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import soundfile as sf 4 | 5 | script_dir = os.path.dirname(os.path.abspath(__file__)) 6 | python_root_dir = os.path.dirname(script_dir) 7 | app_root_dir = os.path.dirname(python_root_dir) 8 | 9 | 10 | def read_json(dir_name: str, json_name: str): 11 | file_path = os.path.join(python_root_dir, dir_name, json_name + ".json") 12 | with open(file_path, "r") as f: 13 | data = json.load(f) 14 | return data 15 | 16 | 17 | def write_json(dir_name: str, json_name: str, data: dict): 18 | file_path = os.path.join(python_root_dir, dir_name, json_name + ".json") 19 | with open(file_path, "w") as f: 20 | json.dump(data, f) 21 | 22 | 23 | def write_audio(dir_name: str, file_name: str, data): 24 | file_path = os.path.join(app_root_dir, dir_name, file_name + ".wav") 25 | 26 | # If a file with the same name already exists, remove it to forcefully write 27 | if os.path.exists(file_path): 28 | os.remove(file_path) 29 | 30 | sf.write(file_path, data, 16000) 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 reriiasu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /speech_to_text/vad.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import onnxruntime 4 | 5 | current_dir = os.path.dirname(os.path.abspath(__file__)) 6 | 7 | 8 | class Vad: 9 | def __init__(self, threshold: float = 0.1): 10 | model_path = os.path.join(current_dir, "assets", "silero_vad.onnx") 11 | 12 | options = onnxruntime.SessionOptions() 13 | options.log_severity_level = 4 14 | 15 | self.inference_session = onnxruntime.InferenceSession( 16 | model_path, sess_options=options 17 | ) 18 | self.SAMPLING_RATE = 16000 19 | self.threshold = threshold 20 | self.h = np.zeros((2, 1, 64), dtype=np.float32) 21 | self.c = np.zeros((2, 1, 64), dtype=np.float32) 22 | 23 | def is_speech(self, audio_data: np.ndarray) -> bool: 24 | input_data = { 25 | "input": audio_data.reshape(1, -1), 26 | "sr": np.array([self.SAMPLING_RATE], dtype=np.int64), 27 | "h": self.h, 28 | "c": self.c, 29 | } 30 | out, h, c = self.inference_session.run(None, input_data) 31 | self.h, self.c = h, c 32 | return out > self.threshold 33 | -------------------------------------------------------------------------------- /run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | REM Name of the virtual environment directory 3 | set VENV_DIR=myenv 4 | 5 | REM Check if the virtual environment already exists 6 | if exist %VENV_DIR% ( 7 | echo Virtual environment '%VENV_DIR%' already exists. 8 | 9 | REM Activate the virtual environment 10 | call %VENV_DIR%\Scripts\activate 11 | 12 | REM Check if requirements.txt exists and update packages 13 | if exist requirements.txt ( 14 | echo Updating packages from requirements.txt... 15 | pip install -r requirements.txt 16 | echo Packages updated. 17 | ) else ( 18 | echo requirements.txt not found. 19 | ) 20 | ) else ( 21 | REM Create the virtual environment 22 | python -m venv %VENV_DIR% 23 | echo Virtual environment '%VENV_DIR%' created. 24 | 25 | REM Activate the virtual environment and install local packages 26 | call %VENV_DIR%\Scripts\activate 27 | echo Installing local packages... 28 | pip install . 29 | echo Local packages installed. 30 | ) 31 | 32 | REM Run python -m speech_to_text 33 | echo Running python -m speech_to_text... 
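REM Note: "python -m speech_to_text" executes speech_to_text/__main__.py, which starts the Eel GUI served from the web/ folder (index.html).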
34 | python -m speech_to_text 35 | 36 | REM Keep the command prompt open 37 | cmd /K -------------------------------------------------------------------------------- /speech_to_text/utils/audio_utils.py: -------------------------------------------------------------------------------- 1 | import sounddevice as sd 2 | import io 3 | import soundfile as sf 4 | import numpy as np 5 | import librosa 6 | 7 | 8 | # get a list of valid input devices 9 | def get_valid_input_devices(): 10 | valid_devices = [] 11 | devices = sd.query_devices() 12 | hostapis = sd.query_hostapis() 13 | 14 | for device in devices: 15 | if device["max_input_channels"] > 0: 16 | device["host_api_name"] = hostapis[device["hostapi"]]["name"] 17 | valid_devices.append(device) 18 | return valid_devices 19 | 20 | 21 | # create an audio stream 22 | def create_audio_stream(selected_device, callback): 23 | RATE = 16000 24 | CHUNK = 512 25 | CHANNELS = 1 26 | DTYPE = "float32" 27 | 28 | stream = sd.InputStream( 29 | device=selected_device, 30 | channels=CHANNELS, 31 | samplerate=RATE, 32 | callback=callback, 33 | dtype=DTYPE, 34 | blocksize=CHUNK, 35 | ) 36 | 37 | return stream 38 | 39 | 40 | def base64_to_audio(audio_data): 41 | audio_bytes = bytes(audio_data) 42 | audio_file = io.BytesIO(audio_bytes) 43 | data, samplerate = sf.read(audio_file) 44 | # whisper samplerate is 16k 45 | resample_data = librosa.resample(y=data, orig_sr=samplerate, target_sr=16000) 46 | 47 | return resample_data.astype(np.float32) 48 | -------------------------------------------------------------------------------- /speech_to_text/websoket_server.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import websockets 3 | import webbrowser 4 | import os 5 | from typing import Optional 6 | 7 | python_root_dir = os.path.dirname(os.path.abspath(__file__)) 8 | app_root_dir = os.path.dirname(python_root_dir) 9 | 10 | 11 | class WebSocketServer: 12 | def __init__(self, loop): 13 | self.websocket: Optional[websockets.WebSocketServerProtocol] = None 14 | self.loop = loop 15 | self.server = None 16 | 17 | async def start_server(self): 18 | self.server = await websockets.serve(self.handler, "localhost", 8765) 19 | self.call_websocket_client() 20 | 21 | def call_websocket_client(self): 22 | path = os.path.join(app_root_dir, "websocket_client", "websocket_client.html") 23 | webbrowser.open("file://" + path) 24 | 25 | async def handler(self, ws: websockets.WebSocketServerProtocol, path): 26 | self.websocket = ws 27 | try: 28 | await ws.wait_closed() 29 | finally: 30 | if self.websocket is ws: 31 | self.websocket = None 32 | 33 | async def stop_server(self): 34 | if self.server is not None: 35 | self.server.close() 36 | await self.server.wait_closed() 37 | 38 | async def send_message(self, message: str): 39 | if self.websocket is not None: 40 | await self.websocket.send(message) 41 | 42 | def send_message_threadsafe(self, message: str): 43 | if self.websocket is not None: 44 | asyncio.run_coroutine_threadsafe(self.send_message(message), self.loop) 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 
21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /speech_to_text/assets/languages.json: -------------------------------------------------------------------------------- 1 | { 2 | "languages": { 3 | "": "", 4 | "en": "english", 5 | "zh": "chinese", 6 | "de": "german", 7 | "es": "spanish", 8 | "ru": "russian", 9 | "ko": "korean", 10 | "fr": "french", 11 | "ja": "japanese", 12 | "pt": "portuguese", 13 | "tr": "turkish", 14 | "pl": "polish", 15 | "ca": "catalan", 16 | "nl": "dutch", 17 | "ar": "arabic", 18 | "sv": "swedish", 19 | "it": "italian", 20 | "id": "indonesian", 21 | "hi": "hindi", 22 | "fi": "finnish", 23 | "vi": "vietnamese", 24 | "he": "hebrew", 25 | "uk": "ukrainian", 26 | "el": "greek", 27 | "ms": "malay", 28 | "cs": "czech", 29 | "ro": "romanian", 30 | "da": "danish", 31 | "hu": "hungarian", 32 | "ta": "tamil", 33 | "no": "norwegian", 34 | "th": "thai", 35 | "ur": "urdu", 36 | "hr": "croatian", 37 | "bg": "bulgarian", 38 | "lt": "lithuanian", 39 | "la": "latin", 40 | "mi": "maori", 41 | "ml": "malayalam", 42 | "cy": "welsh", 43 | "sk": "slovak", 44 | "te": "telugu", 45 | "fa": "persian", 46 | "lv": "latvian", 47 | "bn": "bengali", 48 | "sr": "serbian", 49 | "az": "azerbaijani", 50 | "sl": "slovenian", 51 | "kn": "kannada", 52 | "et": "estonian", 53 | "mk": "macedonian", 54 | "br": "breton", 55 | "eu": "basque", 56 | "is": "icelandic", 57 | "hy": 
"armenian", 58 | "ne": "nepali", 59 | "mn": "mongolian", 60 | "bs": "bosnian", 61 | "kk": "kazakh", 62 | "sq": "albanian", 63 | "sw": "swahili", 64 | "gl": "galician", 65 | "mr": "marathi", 66 | "pa": "punjabi", 67 | "si": "sinhala", 68 | "km": "khmer", 69 | "sn": "shona", 70 | "yo": "yoruba", 71 | "so": "somali", 72 | "af": "afrikaans", 73 | "oc": "occitan", 74 | "ka": "georgian", 75 | "be": "belarusian", 76 | "tg": "tajik", 77 | "sd": "sindhi", 78 | "gu": "gujarati", 79 | "am": "amharic", 80 | "yi": "yiddish", 81 | "lo": "lao", 82 | "uz": "uzbek", 83 | "fo": "faroese", 84 | "ht": "haitian creole", 85 | "ps": "pashto", 86 | "tk": "turkmen", 87 | "nn": "nynorsk", 88 | "mt": "maltese", 89 | "sa": "sanskrit", 90 | "lb": "luxembourgish", 91 | "my": "myanmar", 92 | "bo": "tibetan", 93 | "tl": "tagalog", 94 | "mg": "malagasy", 95 | "as": "assamese", 96 | "tt": "tatar", 97 | "haw": "hawaiian", 98 | "ln": "lingala", 99 | "ha": "hausa", 100 | "ba": "bashkir", 101 | "jw": "javanese", 102 | "su": "sundanese" 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # speech-to-text 2 | 3 | Real-time transcription using [faster-whisper](https://github.com/guillaumekln/faster-whisper) 4 | 5 | ![architecture](docs/architecture.png) 6 | 7 | Accepts audio input from a microphone using a [Sounddevice](https://github.com/spatialaudio/python-sounddevice). By using [Silero VAD](https://github.com/snakers4/silero-vad)(Voice Activity Detection), silent parts are detected and recognized as one voice data. This audio data is converted to text using Faster-Whisper. 8 | 9 | The HTML-based GUI allows you to check the transcription results and make detailed settings for the faster-whisper. 10 | 11 | ## Transcription speed 12 | 13 | If the sentences are well separated, the transcription takes less than a second. 14 | ![TranscriptionSpeed](docs/transcription_speed.png) 15 | 16 | Large-v2 model 17 | Executed with CUDA 11.7 on a NVIDIA GeForce RTX 3060 12GB. 18 | 19 | ## Installation 20 | 21 | 1. pip install . 22 | 23 | ### for Windows 24 | 25 | Please execute "run.bat." It will perform the following actions: 26 | 27 | 1. Create a Python virtual environment. 28 | 1. Install pip packages. 29 | 1. Run speech_to_text. 30 | 31 | ## Usage 32 | 33 | 1. python -m speech_to_text 34 | 1. Select "App Settings" and configure the settings. 35 | 1. Select "Model Settings" and configure the settings. 36 | 1. Select "Transcribe Settings" and configure the settings. 37 | 1. Select "VAD Settings" and configure the settings. 38 | 1. Start Transcription 39 | 40 | If you use the OpenAI API for text proofreading, set OPENAI_API_KEY as an environment variable. 41 | 42 | ## Notes 43 | 44 | - If you select local_model in "Model size or path", the model with the same name in the local folder will be referenced. 45 | 46 | ## Demo 47 | 48 | ![demo](docs/demo.gif) 49 | 50 | ## News 51 | 52 | ### 2023-06-26 53 | 54 | - Add generate audio files from input sound. 55 | - Add synchronize audio files with transcription. 56 | Audio and text highlighting are linked. 57 | 58 | ### 2023-06-29 59 | 60 | - Add transcription from audio files.(only wav format) 61 | 62 | ### 2023-07-03 63 | 64 | - Add Send transcription results from a WebSocket server to a WebSocket client. 65 | Example of use: Display subtitles in live streaming. 66 | 67 | ### 2023-07-05 68 | 69 | - Add generate SRT files from transcription result. 
70 | 71 | ### 2023-07-08 72 | 73 | - Add support for mp3, ogg, and other audio files. 74 | Depends on Soundfile support. 75 | - Add setting to include non-speech data in buffer. 76 | While this will increase memory usage, it will improve transcription accuracy. 77 | 78 | ### 2023-07-09 79 | 80 | - Add non-speech threshold setting. 81 | 82 | ### 2023-07-11 83 | 84 | - Add Text proofreading option via OpenAI API. 85 | Transcription results can be proofread. 86 | 87 | ### 2023-07-12 88 | 89 | - Add feature where audio and word highlighting are synchronized. 90 | if Word Timestamps is true. 91 | 92 | ### 2023-10-01 93 | 94 | - Support for repetition_penalty and no_repeat_ngram_size in transcribe_settings. 95 | - Updating packages. 96 | 97 | ### 2023-11-27 98 | 99 | - Support "large-v3" model. 100 | - Update faster-whisper requirement to include the latest version "0.10.0". 101 | 102 | ### 2024-07-23 103 | 104 | - Support "Faster Distil-Whisper" model. 105 | - Update faster-whisper requirement to include the latest version "1.0.3". 106 | - Updating packages. 107 | - Add run.bat for Windows. 108 | 109 | ## Todo 110 | 111 | - [x] Save and load previous settings. 112 | 113 | - [x] Use Silero VAD 114 | 115 | - [x] Allow local parameters to be set from the GUI. 116 | 117 | - [x] Supports additional options in faster-whisper 0.8.0 118 | -------------------------------------------------------------------------------- /speech_to_text/__main__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import eel 3 | import sys 4 | import threading 5 | 6 | from faster_whisper import WhisperModel 7 | from .audio_transcriber import AppOptions 8 | from .audio_transcriber import AudioTranscriber 9 | from .utils.audio_utils import get_valid_input_devices, base64_to_audio 10 | from .utils.file_utils import read_json, write_json, write_audio 11 | from .websoket_server import WebSocketServer 12 | from .openai_api import OpenAIAPI 13 | 14 | eel.init("web") 15 | 16 | transcriber: AudioTranscriber = None 17 | event_loop: asyncio.AbstractEventLoop = None 18 | thread: threading.Thread = None 19 | websocket_server: WebSocketServer = None 20 | openai_api: OpenAIAPI = None 21 | 22 | 23 | @eel.expose 24 | def get_valid_devices(): 25 | devices = get_valid_input_devices() 26 | return [ 27 | { 28 | "index": d["index"], 29 | "name": f"{d['name']} {d['host_api_name']} ({d['max_input_channels']} in)", 30 | } 31 | for d in devices 32 | ] 33 | 34 | 35 | @eel.expose 36 | def get_dropdown_options(): 37 | data_types = ["model_sizes", "compute_types", "languages"] 38 | 39 | dropdown_options = {} 40 | for data_type in data_types: 41 | data = read_json("assets", data_type) 42 | dropdown_options[data_type] = data[data_type] 43 | 44 | return dropdown_options 45 | 46 | 47 | @eel.expose 48 | def get_user_settings(): 49 | data_types = ["app_settings", "model_settings", "transcribe_settings"] 50 | user_settings = {} 51 | 52 | try: 53 | data = read_json("settings", "user_settings") 54 | for data_type in data_types: 55 | user_settings[data_type] = data[data_type] 56 | except Exception as e: 57 | eel.on_recive_message(str(e)) 58 | 59 | return user_settings 60 | 61 | 62 | @eel.expose 63 | def start_transcription(user_settings): 64 | global transcriber, event_loop, thread, websocket_server, openai_api 65 | try: 66 | ( 67 | filtered_app_settings, 68 | filtered_model_settings, 69 | filtered_transcribe_settings, 70 | ) = extracting_each_setting(user_settings) 71 | 72 | whisper_model = 
WhisperModel(**filtered_model_settings) 73 | app_settings = AppOptions(**filtered_app_settings) 74 | event_loop = asyncio.new_event_loop() 75 | 76 | if app_settings.use_websocket_server: 77 | websocket_server = WebSocketServer(event_loop) 78 | asyncio.run_coroutine_threadsafe( 79 | websocket_server.start_server(), event_loop 80 | ) 81 | 82 | if app_settings.use_openai_api: 83 | openai_api = OpenAIAPI() 84 | 85 | transcriber = AudioTranscriber( 86 | event_loop, 87 | whisper_model, 88 | filtered_transcribe_settings, 89 | app_settings, 90 | websocket_server, 91 | openai_api, 92 | ) 93 | asyncio.set_event_loop(event_loop) 94 | thread = threading.Thread(target=event_loop.run_forever, daemon=True) 95 | thread.start() 96 | 97 | asyncio.run_coroutine_threadsafe(transcriber.start_transcription(), event_loop) 98 | except Exception as e: 99 | eel.on_recive_message(str(e)) 100 | 101 | 102 | @eel.expose 103 | def stop_transcription(): 104 | global transcriber, event_loop, thread, websocket_server, openai_api 105 | if transcriber is None: 106 | eel.transcription_stoppd() 107 | return 108 | transcriber_future = asyncio.run_coroutine_threadsafe( 109 | transcriber.stop_transcription(), event_loop 110 | ) 111 | transcriber_future.result() 112 | 113 | if websocket_server is not None: 114 | websocket_server_future = asyncio.run_coroutine_threadsafe( 115 | websocket_server.stop_server(), event_loop 116 | ) 117 | websocket_server_future.result() 118 | 119 | if thread.is_alive(): 120 | event_loop.call_soon_threadsafe(event_loop.stop) 121 | thread.join() 122 | event_loop.close() 123 | transcriber = None 124 | event_loop = None 125 | thread = None 126 | websocket_server = None 127 | openai_api = None 128 | 129 | eel.transcription_stoppd() 130 | 131 | 132 | @eel.expose 133 | def audio_transcription(user_settings, base64data): 134 | global transcriber, openai_api 135 | try: 136 | ( 137 | filtered_app_settings, 138 | filtered_model_settings, 139 | filtered_transcribe_settings, 140 | ) = extracting_each_setting(user_settings) 141 | 142 | whisper_model = WhisperModel(**filtered_model_settings) 143 | app_settings = AppOptions(**filtered_app_settings) 144 | 145 | if app_settings.use_openai_api: 146 | openai_api = OpenAIAPI() 147 | 148 | transcriber = AudioTranscriber( 149 | event_loop, 150 | whisper_model, 151 | filtered_transcribe_settings, 152 | app_settings, 153 | None, 154 | openai_api, 155 | ) 156 | 157 | audio_data = base64_to_audio(base64data) 158 | if len(audio_data) > 0: 159 | write_audio("web", "voice", audio_data) 160 | transcriber.batch_transcribe_audio(audio_data) 161 | 162 | except Exception as e: 163 | eel.on_recive_message(str(e)) 164 | 165 | openai_api = None 166 | 167 | 168 | def get_filtered_app_settings(settings): 169 | valid_keys = AppOptions.__annotations__.keys() 170 | return {k: v for k, v in settings.items() if k in valid_keys} 171 | 172 | 173 | def get_filtered_model_settings(settings): 174 | valid_keys = WhisperModel.__init__.__annotations__.keys() 175 | return {k: v for k, v in settings.items() if k in valid_keys} 176 | 177 | 178 | def get_filtered_transcribe_settings(settings): 179 | valid_keys = WhisperModel.transcribe.__annotations__.keys() 180 | return {k: v for k, v in settings.items() if k in valid_keys} 181 | 182 | 183 | def extracting_each_setting(user_settings): 184 | filtered_app_settings = get_filtered_app_settings(user_settings["app_settings"]) 185 | filtered_model_settings = get_filtered_model_settings( 186 | user_settings["model_settings"] 187 | ) 188 | 
filtered_transcribe_settings = get_filtered_transcribe_settings( 189 | user_settings["transcribe_settings"] 190 | ) 191 | 192 | write_json( 193 | "settings", 194 | "user_settings", 195 | { 196 | "app_settings": filtered_app_settings, 197 | "model_settings": filtered_model_settings, 198 | "transcribe_settings": filtered_transcribe_settings, 199 | }, 200 | ) 201 | 202 | return filtered_app_settings, filtered_model_settings, filtered_transcribe_settings 203 | 204 | 205 | def on_close(page, sockets): 206 | print(page, "was closed") 207 | 208 | if transcriber and transcriber.transcribing: 209 | stop_transcription() 210 | sys.exit() 211 | 212 | 213 | if __name__ == "__main__": 214 | eel.start("index.html", size=(1024, 1024), close_callback=on_close) 215 | -------------------------------------------------------------------------------- /speech_to_text/audio_transcriber.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import functools 3 | import eel 4 | import queue 5 | import numpy as np 6 | 7 | from typing import NamedTuple 8 | from faster_whisper import WhisperModel 9 | from concurrent.futures import ThreadPoolExecutor 10 | 11 | from .utils.audio_utils import create_audio_stream 12 | from .vad import Vad 13 | from .utils.file_utils import write_audio 14 | from .websoket_server import WebSocketServer 15 | from .openai_api import OpenAIAPI 16 | 17 | 18 | class AppOptions(NamedTuple): 19 | audio_device: int 20 | silence_limit: int = 8 21 | noise_threshold: int = 5 22 | non_speech_threshold: float = 0.1 23 | include_non_speech: bool = False 24 | create_audio_file: bool = True 25 | use_websocket_server: bool = False 26 | use_openai_api: bool = False 27 | 28 | 29 | class AudioTranscriber: 30 | def __init__( 31 | self, 32 | event_loop: asyncio.AbstractEventLoop, 33 | whisper_model: WhisperModel, 34 | transcribe_settings: dict, 35 | app_options: AppOptions, 36 | websocket_server: WebSocketServer, 37 | openai_api: OpenAIAPI, 38 | ): 39 | self.event_loop = event_loop 40 | self.whisper_model: WhisperModel = whisper_model 41 | self.transcribe_settings = transcribe_settings 42 | self.app_options = app_options 43 | self.websocket_server = websocket_server 44 | self.openai_api = openai_api 45 | self.vad = Vad(app_options.non_speech_threshold) 46 | self.silence_counter: int = 0 47 | self.audio_data_list = [] 48 | self.all_audio_data_list = [] 49 | self.audio_queue = queue.Queue() 50 | self.transcribing = False 51 | self.stream = None 52 | self._running = asyncio.Event() 53 | self._transcribe_task = None 54 | 55 | async def transcribe_audio(self): 56 | # Ignore parameters that affect performance 57 | transcribe_settings = self.transcribe_settings.copy() 58 | transcribe_settings["without_timestamps"] = True 59 | transcribe_settings["word_timestamps"] = False 60 | 61 | with ThreadPoolExecutor() as executor: 62 | while self.transcribing: 63 | try: 64 | # Get audio data from queue with a timeout 65 | audio_data = await self.event_loop.run_in_executor( 66 | executor, functools.partial(self.audio_queue.get, timeout=3.0) 67 | ) 68 | 69 | # Create a partial function for the model's transcribe method 70 | func = functools.partial( 71 | self.whisper_model.transcribe, 72 | audio=audio_data, 73 | **transcribe_settings, 74 | ) 75 | 76 | # Run the transcribe method in a thread 77 | segments, _ = await self.event_loop.run_in_executor(executor, func) 78 | 79 | for segment in segments: 80 | eel.display_transcription(segment.text) 81 | if self.websocket_server is not 
None: 82 | await self.websocket_server.send_message(segment.text) 83 | 84 | except queue.Empty: 85 | # Skip to the next iteration if a timeout occurs 86 | continue 87 | except Exception as e: 88 | eel.on_recive_message(str(e)) 89 | 90 | def process_audio(self, audio_data: np.ndarray, frames: int, time, status): 91 | is_speech = self.vad.is_speech(audio_data) 92 | if is_speech: 93 | self.silence_counter = 0 94 | self.audio_data_list.append(audio_data.flatten()) 95 | else: 96 | self.silence_counter += 1 97 | if self.app_options.include_non_speech: 98 | self.audio_data_list.append(audio_data.flatten()) 99 | 100 | if not is_speech and self.silence_counter > self.app_options.silence_limit: 101 | self.silence_counter = 0 102 | 103 | if self.app_options.create_audio_file: 104 | self.all_audio_data_list.extend(self.audio_data_list) 105 | 106 | if len(self.audio_data_list) > self.app_options.noise_threshold: 107 | concatenate_audio_data = np.concatenate(self.audio_data_list) 108 | self.audio_data_list.clear() 109 | self.audio_queue.put(concatenate_audio_data) 110 | else: 111 | # noise clear 112 | self.audio_data_list.clear() 113 | 114 | def batch_transcribe_audio(self, audio_data: np.ndarray): 115 | segment_list = [] 116 | segments, _ = self.whisper_model.transcribe( 117 | audio=audio_data, **self.transcribe_settings 118 | ) 119 | 120 | for segment in segments: 121 | word_list = [] 122 | if self.transcribe_settings["word_timestamps"] == True: 123 | for word in segment.words: 124 | word_list.append( 125 | { 126 | "start": word.start, 127 | "end": word.end, 128 | "text": word.word, 129 | } 130 | ) 131 | segment_list.append( 132 | { 133 | "start": segment.start, 134 | "end": segment.end, 135 | "text": segment.text, 136 | "words": word_list, 137 | } 138 | ) 139 | 140 | eel.transcription_clear() 141 | 142 | if self.openai_api is not None: 143 | self.text_proofreading(segment_list) 144 | else: 145 | eel.on_recive_segments(segment_list) 146 | 147 | def text_proofreading(self, segment_list: list): 148 | # Use [#] as a separator 149 | combined_text = "[#]" + "[#]".join(segment["text"] for segment in segment_list) 150 | result = self.openai_api.text_proofreading(combined_text) 151 | split_text = result.split("[#]") 152 | 153 | del split_text[0] 154 | 155 | eel.display_transcription("Before text proofreading.") 156 | eel.on_recive_segments(segment_list) 157 | 158 | if len(split_text) == len(segment_list): 159 | for i, segment in enumerate(segment_list): 160 | segment["text"] = split_text[i] 161 | segment["words"] = [] 162 | eel.on_recive_message("proofread success.") 163 | eel.display_transcription("After text proofreading.") 164 | eel.on_recive_segments(segment_list) 165 | else: 166 | eel.on_recive_message("proofread failure.") 167 | eel.on_recive_message(result) 168 | 169 | async def start_transcription(self): 170 | try: 171 | self.transcribing = True 172 | self.stream = create_audio_stream( 173 | self.app_options.audio_device, self.process_audio 174 | ) 175 | self.stream.start() 176 | self._running.set() 177 | self._transcribe_task = asyncio.run_coroutine_threadsafe( 178 | self.transcribe_audio(), self.event_loop 179 | ) 180 | eel.on_recive_message("Transcription started.") 181 | while self._running.is_set(): 182 | await asyncio.sleep(1) 183 | except Exception as e: 184 | eel.on_recive_message(str(e)) 185 | 186 | async def stop_transcription(self): 187 | try: 188 | self.transcribing = False 189 | if self._transcribe_task is not None: 190 | self.event_loop.call_soon_threadsafe(self._transcribe_task.cancel) 
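# _transcribe_task is the concurrent.futures.Future returned by
# asyncio.run_coroutine_threadsafe() in start_transcription(); cancelling it on the
# event-loop thread stops transcribe_audio(), whose loop also exits on its own
# once self.transcribing has been set to False above.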
191 | self._transcribe_task = None 192 | 193 | if self.app_options.create_audio_file and len(self.all_audio_data_list) > 0: 194 | audio_data = np.concatenate(self.all_audio_data_list) 195 | self.all_audio_data_list.clear() 196 | write_audio("web", "voice", audio_data) 197 | self.batch_transcribe_audio(audio_data) 198 | 199 | if self.stream is not None: 200 | self._running.clear() 201 | self.stream.stop() 202 | self.stream.close() 203 | self.stream = None 204 | eel.on_recive_message("Transcription stopped.") 205 | else: 206 | eel.on_recive_message("No active stream to stop.") 207 | except Exception as e: 208 | eel.on_recive_message(str(e)) 209 | -------------------------------------------------------------------------------- /web/styles.css: -------------------------------------------------------------------------------- 1 | html { 2 | height: 100%; 3 | margin: 0; 4 | padding: 0; 5 | } 6 | 7 | body { 8 | height: 100%; 9 | min-width: 510px; 10 | min-height: 400px; 11 | font-family: Arial, sans-serif; 12 | margin: 0; 13 | padding: 0; 14 | background-color: #f4f4f4; 15 | } 16 | 17 | #loading-screen { 18 | display: none; 19 | position: fixed; 20 | width: 100%; 21 | height: 100%; 22 | top: 0; 23 | left: 0; 24 | background: rgba(0, 0, 0, 0.6); 25 | -webkit-background-size: cover; 26 | -moz-background-size: cover; 27 | -o-background-size: cover; 28 | background-size: cover; 29 | z-index: 1000; 30 | align-items: center; 31 | justify-content: center; 32 | } 33 | 34 | #loading-screen.show { 35 | display: flex; 36 | } 37 | 38 | .spinner { 39 | border: 16px solid #f3f3f3; 40 | border-top: 16px solid #3498db; 41 | border-radius: 50%; 42 | width: 120px; 43 | height: 120px; 44 | animation: spin 2s linear infinite; 45 | } 46 | 47 | .menus { 48 | display: flex; 49 | justify-content: flex-end; 50 | list-style: none; 51 | margin: 0; 52 | padding: 5px 0; 53 | background-color: #333; 54 | } 55 | 56 | .menu { 57 | color: #fff; 58 | cursor: pointer; 59 | padding: 10px 10px; 60 | border-radius: 5px; 61 | transition: background-color 0.3s ease; 62 | width: 160px; 63 | text-align: center; 64 | margin-right: 10px; 65 | } 66 | 67 | .menu:hover { 68 | background-color: #555; 69 | } 70 | 71 | .menu.active { 72 | background-color: #888; 73 | color: #000; 74 | } 75 | 76 | #main-content { 77 | display: flex; 78 | flex-direction: column; 79 | height: calc(100% - 48px); /* Adjust for the height of the menu */ 80 | } 81 | 82 | .content-inner { 83 | display: flex; 84 | flex-direction: column; 85 | height: 100%; 86 | margin: 10px 10px; 87 | padding: 5px; 88 | border-radius: 5px; 89 | background-color: #fff; 90 | box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.1); 91 | overflow: hidden; 92 | } 93 | 94 | .mode-button { 95 | display: inline-block; 96 | margin-top: 8px; 97 | margin-right: 0; 98 | margin-bottom: 8px; 99 | margin-left: 4px; 100 | padding: 8px 12px; 101 | color: rgba(51, 51, 51, 0.7); 102 | font-size: 16px; 103 | box-sizing: border-box; 104 | cursor: pointer; 105 | transition: color 0.3s ease, background-color 0.3s ease; 106 | border-radius: 4px; 107 | border: none; 108 | background: none; 109 | } 110 | 111 | .mode-button:hover { 112 | color: #333; 113 | background-color: rgba(51, 51, 51, 0.2); 114 | } 115 | 116 | .mode-button.selected { 117 | border-bottom: 2px solid #333; 118 | background-color: rgba(51, 51, 51, 0.1); 119 | color: #333; 120 | } 121 | 122 | .main-content-label { 123 | display: inline-block; 124 | margin-left: 2px; 125 | padding: 2px 5px; 126 | background-color: transparent; 127 | color: #333; 128 | 
font-size: 14px; 129 | box-sizing: border-box; 130 | } 131 | 132 | .setting-item { 133 | display: flex; 134 | align-items: center; 135 | margin-bottom: 10px; 136 | } 137 | 138 | .setting-item.audio-device { 139 | display: block; 140 | } 141 | 142 | .setting-label { 143 | width: 150px; 144 | text-align: right; 145 | margin-right: 10px; 146 | } 147 | 148 | input[type="text"].setting-control { 149 | width: 250px; 150 | } 151 | 152 | input[type="number"].setting-control { 153 | width: 50px; 154 | } 155 | 156 | #audio_device { 157 | width: 258px; 158 | } 159 | 160 | .menu-window { 161 | position: absolute; 162 | top: 48px; 163 | right: 0; 164 | width: 448px; 165 | transition: max-height 0.5s ease; 166 | z-index: 1; 167 | } 168 | 169 | .close-icon { 170 | position: absolute; 171 | top: 20px; 172 | right: 25px; 173 | width: 32px; 174 | height: 32px; 175 | cursor: pointer; 176 | color: #333; 177 | } 178 | 179 | .menu-window-inner { 180 | max-height: 0; 181 | margin: 10px 10px; 182 | padding: 5px; 183 | border-radius: 5px; 184 | background-color: #fff; 185 | box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.1); 186 | overflow: hidden; 187 | } 188 | 189 | .menu-window-inner.open { 190 | background-color: rgba(255, 255, 255, 0.6); 191 | max-height: 1000px; 192 | transition: max-height 0.8s ease-out; 193 | } 194 | 195 | .menu-window-inner .note { 196 | font-size: 0.9em; 197 | color: #555; 198 | } 199 | 200 | .transcription-controls { 201 | display: flex; 202 | justify-content: flex-start; 203 | } 204 | 205 | .transcription-button { 206 | height: 32px; 207 | width: 180px; 208 | color: white; 209 | padding: 0 16px; 210 | text-align: center; 211 | text-decoration: none; 212 | display: inline-block; 213 | font-size: 16px; 214 | margin: 4px 4px; 215 | cursor: pointer; 216 | transition-duration: 0.4s; 217 | border-radius: 4px; 218 | } 219 | 220 | .transcription-button.start { 221 | background-color: #4caf50; 222 | border: 2px solid #4caf50; 223 | } 224 | 225 | .transcription-button.start:hover { 226 | background-color: white; 227 | color: #4caf50; 228 | } 229 | 230 | .transcription-button.stop { 231 | background-color: #f44336; 232 | border: 2px solid #f44336; 233 | } 234 | 235 | .transcription-button.stop:hover { 236 | background-color: white; 237 | color: #f44336; 238 | } 239 | 240 | .transcription-button.audio { 241 | background-color: #50a8d1; 242 | border: 2px solid #50a8d1; 243 | } 244 | 245 | .transcription-button.audio:hover { 246 | background-color: white; 247 | color: #50a8d1; 248 | } 249 | 250 | .transcription-button.hidden { 251 | display: none; 252 | } 253 | 254 | #create-srt { 255 | left: 185px; 256 | } 257 | 258 | #create-srt.hidden { 259 | display: none; 260 | } 261 | 262 | #audio-file { 263 | height: 32px; 264 | line-height: 32px; 265 | font-size: 12px; 266 | margin: 4px 2px; 267 | } 268 | 269 | #audio-file.hidden { 270 | display: none; 271 | } 272 | 273 | #audio-control { 274 | position: absolute; 275 | top: 0px; 276 | left: 285px; 277 | margin: 4px 4px; 278 | height: 16px; 279 | } 280 | 281 | .recive-message-area { 282 | position: relative; 283 | } 284 | 285 | .recive-message-area.transcription { 286 | flex-grow: 1; 287 | overflow: auto; 288 | } 289 | 290 | .recive-message-area.console { 291 | height: 165px; 292 | } 293 | 294 | .message-area { 295 | padding: 5px; 296 | margin: 10px 5px; 297 | background: #f8f8f8; 298 | border-radius: 10px; 299 | border: 1px solid #ddd; 300 | font-size: 12px; 301 | line-height: 1.4; 302 | overflow: auto; 303 | height: 120px; 304 | scrollbar-width: thin; 305 | 
scrollbar-color: #888 #f1f1f1; 306 | } 307 | 308 | .segment-container { 309 | width: 100%; 310 | word-wrap: break-word; 311 | white-space: normal; 312 | box-sizing: border-box; 313 | } 314 | 315 | .segment-container::after { 316 | content: "\a"; 317 | white-space: pre; 318 | } 319 | 320 | .time-label { 321 | padding-right: 5px; 322 | } 323 | 324 | #transcription.message-area { 325 | height: calc(100% - 40px); 326 | box-sizing: border-box; 327 | } 328 | 329 | .highlight { 330 | color: white; 331 | background-color: rgba(0, 0, 0, 0.7); 332 | text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); 333 | } 334 | 335 | .control-button { 336 | position: absolute; 337 | top: 0px; 338 | padding: 5px 10px; 339 | font-size: 12px; 340 | color: #fff; 341 | background-color: rgba(0, 0, 0, 0.7); 342 | border: none; 343 | border-radius: 5px; 344 | cursor: pointer; 345 | opacity: 0.4; 346 | transition: opacity 0.3s; 347 | } 348 | 349 | .copy-button { 350 | background-color: #008cba; 351 | right: 85px; 352 | } 353 | 354 | .clear-button { 355 | background-color: #f44336; 356 | right: 25px; 357 | } 358 | 359 | .control-button:hover { 360 | opacity: 1; 361 | } 362 | 363 | ::-webkit-scrollbar { 364 | height: 8px; 365 | width: 8px; 366 | } 367 | 368 | ::-webkit-scrollbar-track { 369 | background: #f1f1f1; 370 | } 371 | 372 | ::-webkit-scrollbar-thumb { 373 | background: #888; 374 | } 375 | 376 | ::-webkit-scrollbar-thumb:hover { 377 | background: #555; 378 | } 379 | 380 | .toast { 381 | visibility: hidden; 382 | min-width: 250px; 383 | color: #fff; 384 | text-align: center; 385 | border-radius: 15px; 386 | padding: 10px; 387 | position: fixed; 388 | z-index: 1; 389 | left: 50%; 390 | transform: translateX(-50%); 391 | bottom: 30px; 392 | font-size: 17px; 393 | background-color: rgba(0, 0, 0, 0.7); 394 | opacity: 0; 395 | box-shadow: 0px 2px 5px rgba(0, 0, 0, 0.3); 396 | } 397 | 398 | .toast.show { 399 | visibility: visible; 400 | animation: fadeout 3s; 401 | animation-fill-mode: forwards; 402 | } 403 | 404 | @keyframes fadeout { 405 | 0% { 406 | opacity: 1; 407 | } 408 | 66.66% { 409 | opacity: 1; 410 | } 411 | 100% { 412 | opacity: 0; 413 | } 414 | } 415 | 416 | @keyframes spin { 417 | 0% { 418 | transform: rotate(0deg); 419 | } 420 | 100% { 421 | transform: rotate(360deg); 422 | } 423 | } 424 | -------------------------------------------------------------------------------- /ThirdPartyNotices.txt: -------------------------------------------------------------------------------- 1 | NOTICES 2 | 3 | --------------------------------------------------------- 4 | 5 | guillaumekln/faster-whisper - MIT 6 | https://github.com/guillaumekln/faster-whisper 7 | 8 | MIT License 9 | 10 | Copyright (c) 2023 Guillaume Klein 11 | 12 | Permission is hereby granted, free of charge, to any person obtaining a copy 13 | of this software and associated documentation files (the "Software"), to deal 14 | in the Software without restriction, including without limitation the rights 15 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 16 | copies of the Software, and to permit persons to whom the Software is 17 | furnished to do so, subject to the following conditions: 18 | 19 | The above copyright notice and this permission notice shall be included in all 20 | copies or substantial portions of the Software. 
21 | 22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 23 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 24 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 25 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 26 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 27 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 28 | SOFTWARE. 29 | 30 | --------------------------------------------------------- 31 | 32 | python-sounddevice - MIT 33 | https://github.com/spatialaudio/python-sounddevice 34 | 35 | Copyright (c) 2015-2023 Matthias Geier 36 | 37 | Permission is hereby granted, free of charge, to any person obtaining a copy 38 | of this software and associated documentation files (the "Software"), to deal 39 | in the Software without restriction, including without limitation the rights 40 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 41 | copies of the Software, and to permit persons to whom the Software is 42 | furnished to do so, subject to the following conditions: 43 | 44 | The above copyright notice and this permission notice shall be included in 45 | all copies or substantial portions of the Software. 46 | 47 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 48 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 49 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 50 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 51 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 52 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 53 | THE SOFTWARE. 54 | 55 | --------------------------------------------------------- 56 | 57 | Eel - MIT 58 | https://github.com/spatialaudio/python-sounddevice 59 | 60 | MIT License 61 | 62 | Copyright (c) 2018 Chris Knott 63 | 64 | Permission is hereby granted, free of charge, to any person obtaining a copy 65 | of this software and associated documentation files (the "Software"), to deal 66 | in the Software without restriction, including without limitation the rights 67 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 68 | copies of the Software, and to permit persons to whom the Software is 69 | furnished to do so, subject to the following conditions: 70 | 71 | The above copyright notice and this permission notice shall be included in all 72 | copies or substantial portions of the Software. 73 | 74 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 75 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 76 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 77 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 78 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 79 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 80 | SOFTWARE. 
81 | --------------------------------------------------------- 82 | 83 | silero-vad - MIT 84 | https://github.com/snakers4/silero-vad 85 | 86 | MIT License 87 | 88 | Copyright (c) 2020-present Silero Team 89 | 90 | Permission is hereby granted, free of charge, to any person obtaining a copy 91 | of this software and associated documentation files (the "Software"), to deal 92 | in the Software without restriction, including without limitation the rights 93 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 94 | copies of the Software, and to permit persons to whom the Software is 95 | furnished to do so, subject to the following conditions: 96 | 97 | The above copyright notice and this permission notice shall be included in all 98 | copies or substantial portions of the Software. 99 | 100 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 101 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 102 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 103 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 104 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 105 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 106 | SOFTWARE. 107 | --------------------------------------------------------- 108 | 109 | python-soundfile - BSD-3-Clause license 110 | https://github.com/bastibe/python-soundfile 111 | 112 | Copyright (c) 2013, Bastian Bechtold 113 | All rights reserved. 114 | 115 | Redistribution and use in source and binary forms, with or without 116 | modification, are permitted provided that the following conditions are 117 | met: 118 | 119 | * Redistributions of source code must retain the above copyright 120 | notice, this list of conditions and the following disclaimer. 121 | * Redistributions in binary form must reproduce the above copyright 122 | notice, this list of conditions and the following disclaimer in 123 | the documentation and/or other materials provided with the 124 | distribution. 125 | * Neither the name of python-soundfile nor the names 126 | of its contributors may be used to endorse or promote products 127 | derived from this software without specific prior written 128 | permission. 129 | 130 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 131 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 132 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 133 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 134 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 135 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 136 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 137 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 138 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 139 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 140 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
141 | --------------------------------------------------------- 142 | 143 | websockets - BSD-3-Clause license 144 | https://github.com/python-websockets/websockets 145 | 146 | Copyright (c) Aymeric Augustin and contributors 147 | 148 | Redistribution and use in source and binary forms, with or without 149 | modification, are permitted provided that the following conditions are met: 150 | 151 | * Redistributions of source code must retain the above copyright notice, 152 | this list of conditions and the following disclaimer. 153 | * Redistributions in binary form must reproduce the above copyright notice, 154 | this list of conditions and the following disclaimer in the documentation 155 | and/or other materials provided with the distribution. 156 | * Neither the name of the copyright holder nor the names of its contributors 157 | may be used to endorse or promote products derived from this software 158 | without specific prior written permission. 159 | 160 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 161 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 162 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 163 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 164 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 165 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 166 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 167 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 168 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 169 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 170 | --------------------------------------------------------- 171 | 172 | librosa - ISC License 173 | https://github.com/librosa/librosa 174 | 175 | ## ISC License 176 | 177 | Copyright (c) 2013--2023, librosa development team. 178 | 179 | Permission to use, copy, modify, and/or distribute this software for any 180 | purpose with or without fee is hereby granted, provided that the above 181 | copyright notice and this permission notice appear in all copies. 182 | 183 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 184 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 185 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 186 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 187 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 188 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 189 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
190 | --------------------------------------------------------- 191 | 192 | openai-python - MIT License 193 | https://github.com/openai/openai-python 194 | 195 | The MIT License 196 | 197 | Copyright (c) OpenAI (https://openai.com) 198 | 199 | Permission is hereby granted, free of charge, to any person obtaining a copy 200 | of this software and associated documentation files (the "Software"), to deal 201 | in the Software without restriction, including without limitation the rights 202 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 203 | copies of the Software, and to permit persons to whom the Software is 204 | furnished to do so, subject to the following conditions: 205 | 206 | The above copyright notice and this permission notice shall be included in 207 | all copies or substantial portions of the Software. 208 | 209 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 210 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 211 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 212 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 213 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 214 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 215 | THE SOFTWARE. 216 | --------------------------------------------------------- -------------------------------------------------------------------------------- /web/scripts.js: -------------------------------------------------------------------------------- 1 | eel.expose(on_recive_message); 2 | function on_recive_message(message) { 3 | document.querySelector("#loading-screen").classList.remove("show"); 4 | addMessage("console-message", message); 5 | } 6 | 7 | eel.expose(display_transcription); 8 | function display_transcription(transcript) { 9 | addMessage("transcription", transcript); 10 | } 11 | 12 | eel.expose(transcription_clear); 13 | function transcription_clear() { 14 | clearMessage("transcription"); 15 | } 16 | 17 | eel.expose(on_recive_segments); 18 | function on_recive_segments(segments) { 19 | document.querySelector("#loading-screen").classList.remove("show"); 20 | 21 | const audio = document.querySelector("#audio-control"); 22 | 23 | const appSettings = getAppSettings(); 24 | if (appSettings["create_audio_file"]) { 25 | audio.src = "voice.wav" + "?v=" + new Date().getTime(); 26 | audio.hidden = false; 27 | audio.load(); 28 | } 29 | 30 | const srt = document.querySelector("#create-srt"); 31 | srt.classList.remove("hidden"); 32 | 33 | const transcription = document.querySelector(`#transcription`); 34 | for (let i = 0; i < segments.length; i++) { 35 | const words = segments[i]["words"]; 36 | const start = segments[i]["start"]; 37 | const end = segments[i]["end"]; 38 | 39 | const block = document.createElement("div"); 40 | block.classList.add("segment-container"); 41 | 42 | const label = document.createElement("label"); 43 | label.classList.add("time-label"); 44 | label.textContent = `[${formatTime(start)} --> ${formatTime(end)}]`; 45 | block.appendChild(label); 46 | 47 | if (words.length !== 0) { 48 | for (let j = 0; j < words.length; j++) { 49 | const text = words[j]["text"]; 50 | const wordStart = words[j]["start"]; 51 | const wordEnd = words[j]["end"]; 52 | 53 | const span = document.createElement("span"); 54 | span.textContent = text; 55 | span.setAttribute("data-start", wordStart); 56 | span.setAttribute("data-end", wordEnd); 57 | 
span.addEventListener("click", onClickSegment); 58 | block.appendChild(span); 59 | } 60 | } else { 61 | const text = segments[i]["text"]; 62 | 63 | const span = document.createElement("span"); 64 | span.textContent = text; 65 | span.setAttribute("data-start", start); 66 | span.setAttribute("data-end", end); 67 | span.addEventListener("click", onClickSegment); 68 | block.appendChild(span); 69 | } 70 | 71 | transcription.appendChild(block); 72 | } 73 | } 74 | 75 | eel.expose(transcription_stoppd); 76 | function transcription_stoppd() { 77 | document.querySelector("#start-button").classList.remove("hidden"); 78 | document.querySelector("#stop-button").classList.add("hidden"); 79 | enableSettingControle(); 80 | enableModeControle(); 81 | } 82 | 83 | eel.expose(transcription_stoppd2); 84 | function transcription_stoppd2() { 85 | document.querySelector("#start-button").classList.remove("hidden"); 86 | document.querySelector("#stop-button").classList.add("hidden"); 87 | enableSettingControle(); 88 | } 89 | 90 | function addMessage(elementId, message) { 91 | const el = document.querySelector(`#${elementId}`); 92 | const newel = document.createElement("div"); 93 | newel.classList.add("segment-container"); 94 | newel.textContent = message; 95 | el.appendChild(newel); 96 | 97 | el.scrollTop = el.scrollHeight; 98 | } 99 | 100 | function onClickSegment(event) { 101 | const audio = document.querySelector("#audio-control"); 102 | audio.currentTime = event.target.getAttribute("data-start"); 103 | audio.play(); 104 | } 105 | 106 | async function updateDevices() { 107 | let devices = await eel.get_valid_devices()(); 108 | let select = document.querySelector("#audio_device"); 109 | select.innerHTML = ""; 110 | for (let i = 0; i < devices.length; i++) { 111 | let opt = document.createElement("option"); 112 | opt.value = devices[i].index; 113 | opt.innerHTML = devices[i].name; 114 | select.appendChild(opt); 115 | } 116 | } 117 | 118 | function getContentSettings(elementid) { 119 | let elements = Array.from( 120 | document.querySelector(elementid).querySelectorAll(".setting-control") 121 | ); 122 | 123 | const json = elements.reduce((obj, element) => { 124 | let value; 125 | if (element.tagName === "SELECT") { 126 | value = element.options[element.selectedIndex].value; 127 | } else if (element.tagName === "INPUT" && element.type === "checkbox") { 128 | value = element.checked; 129 | } else if (element.tagName === "INPUT" && element.type === "number") { 130 | value = Number(element.value); 131 | } else { 132 | value = element.value; 133 | } 134 | 135 | // If the value is empty and optional 136 | if (value === "" && document.querySelector(".optional") !== null) { 137 | return obj; 138 | } 139 | obj[element.id] = value; 140 | return obj; 141 | }, {}); 142 | 143 | return json; 144 | } 145 | 146 | function getAppSettings() { 147 | const settings = getContentSettings("#app-settings-window"); 148 | settings["audio_device"] = 149 | document.querySelector("#audio_device").selectedIndex; 150 | 151 | return settings; 152 | } 153 | 154 | function getModelSettings() { 155 | const settings = getContentSettings("#model-settings-window"); 156 | 157 | const deviceIndex = settings["device_index"]; 158 | if (/^(\d+|(\d+,)+\d+)$/.test(deviceIndex)) { 159 | let numbers = 0; 160 | if (deviceIndex.includes(",")) { 161 | numbers = deviceIndex.split(",").map(Number); 162 | } else { 163 | numbers = Number(deviceIndex); 164 | } 165 | settings["device_index"] = numbers; 166 | } 167 | 168 | return settings; 169 | } 170 | 171 | function 
getTranscribeSettings() { 172 | const transcribeSettings = getContentSettings("#transcribe-settings-window"); 173 | const vadSettings = getContentSettings("#vad-settings-window"); 174 | 175 | const temperature = transcribeSettings["temperature"]; 176 | if (/^(\d*\.?\d+|((\d*\.?\d+,)+\d*\.?\d+))$/.test(temperature)) { 177 | let numbers = 0; 178 | if (temperature.includes(",")) { 179 | numbers = temperature.split(",").map(Number); 180 | } else { 181 | numbers = Number(temperature); 182 | } 183 | transcribeSettings["temperature"] = numbers; 184 | } 185 | 186 | const suppress_tokens = transcribeSettings["suppress_tokens"]; 187 | if (/^(-?\d+|(-?\d+,)+-?\d+)$/.test(suppress_tokens)) { 188 | let numbers = 0; 189 | if (suppress_tokens.includes(",")) { 190 | numbers = suppress_tokens.split(",").map(Number); 191 | } else { 192 | numbers = [Number(suppress_tokens)]; 193 | } 194 | transcribeSettings["suppress_tokens"] = numbers; 195 | } 196 | transcribeSettings["vad_filter"] = vadSettings["vad_filter"]; 197 | delete vadSettings["vad_filter"]; 198 | transcribeSettings["vad_parameters"] = vadSettings; 199 | 200 | return transcribeSettings; 201 | } 202 | 203 | function startTranscription() { 204 | document.querySelector("#loading-screen").classList.add("show"); 205 | menuClose(); 206 | disableModeControle(); 207 | disableSettingControle(); 208 | 209 | document.querySelector("#start-button").classList.add("hidden"); 210 | document.querySelector("#stop-button").classList.remove("hidden"); 211 | hideCreateSrt(); 212 | hideAudioControl(); 213 | clearMessage("transcription"); 214 | 215 | const appSettings = getAppSettings(); 216 | const modelSettings = getModelSettings(); 217 | const transcribeSettings = getTranscribeSettings(); 218 | 219 | eel.start_transcription({ 220 | app_settings: appSettings, 221 | model_settings: modelSettings, 222 | transcribe_settings: transcribeSettings, 223 | }); 224 | } 225 | 226 | async function stopTranscription() { 227 | document.querySelector("#loading-screen").classList.add("show"); 228 | await eel.stop_transcription(); 229 | } 230 | 231 | function audioTranscription() { 232 | const fileInput = document.querySelector("#audio-file"); 233 | const file = fileInput.files[0]; 234 | 235 | if (!fileValidation(file)) { 236 | return; 237 | } 238 | document.querySelector("#loading-screen").classList.add("show"); 239 | menuClose(); 240 | 241 | hideCreateSrt(); 242 | hideAudioControl(); 243 | clearMessage("transcription"); 244 | 245 | const appSettings = getAppSettings(); 246 | const modelSettings = getModelSettings(); 247 | const transcribeSettings = getTranscribeSettings(); 248 | 249 | const reader = new FileReader(); 250 | reader.onload = function (e) { 251 | const data = new Uint8Array(e.target.result); 252 | eel.audio_transcription( 253 | { 254 | app_settings: appSettings, 255 | model_settings: modelSettings, 256 | transcribe_settings: transcribeSettings, 257 | }, 258 | Array.from(data) 259 | ); 260 | }; 261 | reader.readAsArrayBuffer(file); 262 | } 263 | 264 | function fileValidation(file) { 265 | if (!file) { 266 | on_recive_message("No file chosen"); 267 | return false; 268 | } 269 | 270 | const maxSizeInMB = 10; 271 | const maxSizeInBytes = maxSizeInMB * 1024 * 1024; 272 | if (file.size > maxSizeInBytes) { 273 | on_recive_message(`File size must be under ${maxSizeInMB}MB`); 274 | return false; 275 | } 276 | return true; 277 | } 278 | 279 | function realTimeMode() { 280 | document.querySelector("#real-time-mode").classList.add("selected"); 281 | 
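/* Real-time mode: mark this mode button as selected, show the start button, and hide the audio-file transcription controls. */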
document.querySelector("#audio-mode").classList.remove("selected"); 282 | 283 | document.querySelector("#start-button").classList.remove("hidden"); 284 | 285 | document.querySelector("#audio-transcription").classList.add("hidden"); 286 | document.querySelector("#audio-file").classList.add("hidden"); 287 | } 288 | 289 | function audioMode() { 290 | document.querySelector("#real-time-mode").classList.remove("selected"); 291 | document.querySelector("#audio-mode").classList.add("selected"); 292 | 293 | document.querySelector("#start-button").classList.add("hidden"); 294 | 295 | document.querySelector("#audio-transcription").classList.remove("hidden"); 296 | document.querySelector("#audio-file").classList.remove("hidden"); 297 | } 298 | 299 | function createDropdownOptions(options, elementId) { 300 | const select = document.querySelector(`#${elementId}`); 301 | for (const key in options) { 302 | const option = document.createElement("option"); 303 | option.value = key; 304 | option.text = options[key]; 305 | select.appendChild(option); 306 | } 307 | } 308 | 309 | function setContentSettings(settings, elementid) { 310 | if (settings === undefined) { 311 | return; 312 | } 313 | 314 | const elements = Array.from( 315 | document.querySelector(elementid).querySelectorAll(".setting-control") 316 | ); 317 | 318 | for (let element of elements) { 319 | if (!(element.id in settings)) { 320 | continue; 321 | } 322 | 323 | if (Array.isArray(settings[element.id])) { 324 | element.value = settings[element.id].join(","); 325 | } else if (element.tagName === "INPUT" && element.type === "checkbox") { 326 | element.checked = settings[element.id]; 327 | } else { 328 | element.value = settings[element.id]; 329 | } 330 | } 331 | } 332 | 333 | function setDropdownOptions() { 334 | eel.get_dropdown_options()(function (dropdownOptions) { 335 | createDropdownOptions(dropdownOptions["model_sizes"], "model_size_or_path"); 336 | createDropdownOptions(dropdownOptions["compute_types"], "compute_type"); 337 | createDropdownOptions(dropdownOptions["languages"], "language"); 338 | }); 339 | } 340 | 341 | function setUserSettings() { 342 | eel.get_user_settings()(function (userSettings) { 343 | setContentSettings(userSettings["app_settings"], "#app-settings-window"); 344 | setContentSettings( 345 | userSettings["model_settings"], 346 | "#model-settings-window" 347 | ); 348 | setContentSettings( 349 | userSettings["transcribe_settings"], 350 | "#transcribe-settings-window" 351 | ); 352 | setContentSettings( 353 | Object.assign( 354 | {}, 355 | userSettings["transcribe_settings"], 356 | userSettings["transcribe_settings"]["vad_parameters"] 357 | ), 358 | "#vad-settings-window" 359 | ); 360 | }); 361 | } 362 | 363 | function onClickMenu(el) { 364 | if (el.classList.contains("active")) { 365 | menuClose(); 366 | return; 367 | } 368 | menuClose(); 369 | 370 | el.classList.add("active"); 371 | 372 | const targetWindow = document.querySelector(`#${el.id}-window`); 373 | targetWindow.hidden = false; 374 | const inner = targetWindow.querySelector(".menu-window-inner"); 375 | requestAnimationFrame(() => { 376 | inner.classList.add("open"); 377 | }); 378 | } 379 | 380 | function menuClose() { 381 | const menus = document.querySelectorAll(".menu"); 382 | const menuWindows = document.querySelectorAll(".menu-window"); 383 | menuWindows.forEach((w) => { 384 | w.hidden = true; 385 | const inner = w.querySelector(".menu-window-inner"); 386 | inner.classList.remove("open"); 387 | }); 388 | menus.forEach((t) => t.classList.remove("active")); 389 
| } 390 | 391 | function addButtonClickEventListener() { 392 | const menus = document.querySelectorAll(".menu"); 393 | menus.forEach((menu) => { 394 | menu.addEventListener("click", () => { 395 | onClickMenu(menu); 396 | }); 397 | }); 398 | 399 | const closeIcons = document.querySelectorAll(".close-icon"); 400 | closeIcons.forEach((icon) => { 401 | icon.addEventListener("click", () => { 402 | menuClose(); 403 | }); 404 | }); 405 | 406 | document 407 | .querySelector("#real-time-mode") 408 | .addEventListener("click", function () { 409 | realTimeMode(); 410 | }); 411 | 412 | document.querySelector("#audio-mode").addEventListener("click", function () { 413 | audioMode(); 414 | }); 415 | 416 | document.querySelector("#start-button").addEventListener("click", () => { 417 | startTranscription(); 418 | }); 419 | document.querySelector("#stop-button").addEventListener("click", () => { 420 | stopTranscription(); 421 | }); 422 | document 423 | .querySelector("#audio-transcription") 424 | .addEventListener("click", () => { 425 | audioTranscription(); 426 | }); 427 | 428 | document.querySelector("#create-srt").addEventListener("click", () => { 429 | createSrt(); 430 | }); 431 | 432 | document 433 | .querySelector("#transcription-copy") 434 | .addEventListener("click", () => { 435 | copyToClipboard("transcription"); 436 | }); 437 | document 438 | .querySelector("#transcription-clear") 439 | .addEventListener("click", () => { 440 | hideCreateSrt(); 441 | hideAudioControl(); 442 | clearMessage("transcription"); 443 | }); 444 | 445 | document 446 | .querySelector("#console-message-copy") 447 | .addEventListener("click", () => { 448 | copyToClipboard("console-message"); 449 | }); 450 | document 451 | .querySelector("#console-message-clear") 452 | .addEventListener("click", () => { 453 | clearMessage("console-message"); 454 | }); 455 | } 456 | 457 | function addTimeupdateEventListener() { 458 | const audio = document.querySelector(`#audio-control`); 459 | // Define the time extension to avoid skipping subtitle highlighting 460 | const timeExtension = 0.2; 461 | 462 | audio.addEventListener("timeupdate", (event) => { 463 | const currentTime = event.target.currentTime; 464 | const subtitles = document.querySelectorAll("#transcription span"); 465 | 466 | subtitles.forEach((subtitle) => { 467 | const start = parseFloat(subtitle.getAttribute("data-start")); 468 | const end = parseFloat(subtitle.getAttribute("data-end")); 469 | 470 | // Add the time extension to the end time 471 | if (currentTime >= start && currentTime <= end + timeExtension) { 472 | subtitle.classList.add("highlight"); 473 | } else { 474 | subtitle.classList.remove("highlight"); 475 | } 476 | }); 477 | }); 478 | } 479 | 480 | window.addEventListener("load", (event) => { 481 | updateDevices(); 482 | setDropdownOptions(); 483 | setUserSettings(); 484 | addButtonClickEventListener(); 485 | addTimeupdateEventListener(); 486 | }); 487 | 488 | function copyToClipboard(elementId) { 489 | const transcriptionElement = document.querySelector(`#${elementId}`); 490 | const text = transcriptionElement.innerText; 491 | navigator.clipboard.writeText(text).then( 492 | function () { 493 | showToast(); 494 | }, 495 | function (err) { 496 | console.error("Could not copy text: ", err); 497 | } 498 | ); 499 | } 500 | 501 | function downloadSRTFile(content, filename) { 502 | const blob = new Blob([content], { type: "text/srt" }); 503 | const url = URL.createObjectURL(blob); 504 | 505 | const link = document.createElement("a"); 506 | link.download = filename; 507 | 
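/* The SRT download is triggered by programmatically clicking a temporary anchor that points at the Blob object URL. */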
link.href = url; 508 | 509 | document.body.appendChild(link); 510 | link.click(); 511 | document.body.removeChild(link); 512 | } 513 | 514 | function getSegmentsFromHTML() { 515 | const regex = /\[(\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3})\](.+)/; 516 | const segmentContainers = document.querySelectorAll(".segment-container"); 517 | const segments = new Map(); 518 | 519 | segmentContainers.forEach((container) => { 520 | const match = container.innerText.match(regex); 521 | 522 | if (match) { 523 | const timestamp = match[1]; 524 | const text = match[2].trim(); 525 | segments.set(timestamp, text); 526 | } 527 | }); 528 | 529 | return segments; 530 | } 531 | 532 | function createSrt() { 533 | const segments = getSegmentsFromHTML(); 534 | const srtContent = createSRTContent(segments); 535 | downloadSRTFile(srtContent, "subtitles.srt"); 536 | } 537 | 538 | function copyToClipboard(elementId) { 539 | const transcriptionElement = document.querySelector(`#${elementId}`); 540 | const text = transcriptionElement.innerText; 541 | navigator.clipboard.writeText(text).then( 542 | function () { 543 | showToast(); 544 | }, 545 | function (err) { 546 | console.error("Could not copy text: ", err); 547 | } 548 | ); 549 | } 550 | 551 | function showToast() { 552 | const toastElement = document.querySelector("#toast"); 553 | toastElement.classList.add("show"); 554 | 555 | setTimeout(function () { 556 | toastElement.classList.remove("show"); 557 | }, 3000); 558 | } 559 | 560 | function clearMessage(elementId) { 561 | const el = document.querySelector(`#${elementId}`); 562 | 563 | while (el.firstChild) { 564 | el.firstChild.remove(); 565 | } 566 | } 567 | 568 | function hideCreateSrt() { 569 | const srt = document.querySelector("#create-srt"); 570 | srt.classList.add("hidden"); 571 | } 572 | 573 | function hideAudioControl() { 574 | const audio = document.querySelector("#audio-control"); 575 | audio.pause(); 576 | audio.src = ""; 577 | audio.hidden = true; 578 | } 579 | 580 | function disableSettingControle() { 581 | let elements = document.querySelectorAll(".setting-control"); 582 | 583 | for (var i = 0; i < elements.length; i++) { 584 | elements[i].disabled = true; 585 | } 586 | } 587 | 588 | function enableSettingControle() { 589 | let elements = document.querySelectorAll(".setting-control"); 590 | 591 | for (var i = 0; i < elements.length; i++) { 592 | elements[i].disabled = false; 593 | } 594 | } 595 | 596 | function disableModeControle() { 597 | document.querySelector("#real-time-mode").disabled = true; 598 | document.querySelector("#audio-mode").disabled = true; 599 | } 600 | 601 | function enableModeControle() { 602 | document.querySelector("#real-time-mode").disabled = false; 603 | document.querySelector("#audio-mode").disabled = false; 604 | } 605 | 606 | function formatTime(timeInSeconds) { 607 | const hours = Math.floor(timeInSeconds / 3600); 608 | const minutes = Math.floor(timeInSeconds / 60) % 60; 609 | const seconds = Math.floor(timeInSeconds - hours * 3600 - minutes * 60); 610 | const milliseconds = Math.round((timeInSeconds % 1) * 1000); 611 | 612 | return `${pad(hours, 2)}:${pad(minutes, 2)}:${pad(seconds, 2)},${pad( 613 | milliseconds, 614 | 3 615 | )}`; 616 | } 617 | 618 | function pad(num, size) { 619 | let s = num + ""; 620 | while (s.length < size) { 621 | s = "0" + s; 622 | } 623 | return s; 624 | } 625 | 626 | function createSRTContent(segments) { 627 | return Array.from(segments.entries()) 628 | .map(([key, value], index) => { 629 | return `${index + 
1}\n${key}\n${value}\n`; 630 | }) 631 | .join("\n"); 632 | } 633 | --------------------------------------------------------------------------------
/web/index.html:
--------------------------------------------------------------------------------
[index.html could not be reproduced here: its HTML markup was stripped during extraction, leaving only bare line numbers. The surviving text shows the page title "Speech to text" and the "Copied to clipboard!" toast; the file is the Eel web UI page that appears to contain the settings menu windows, mode and start/stop buttons, transcription and console-message panes, audio control, and the inline style and script blocks referenced by web/scripts.js.]
--------------------------------------------------------------------------------