├── assets
    └── sample_terminal.png
├── requirements.txt
├── .gitignore
├── .sample_env
├── main.py
├── start_uttertype.sh
├── LICENSE
├── utils.py
├── key_listener.py
├── table_interface.py
├── README.md
└── transcriber.py


/assets/sample_terminal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dhruvyad/uttertype/HEAD/assets/sample_terminal.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | openai
2 | PyAudio
3 | PyAutoGUI
4 | pynput
5 | pyperclip
6 | python-dotenv
7 | rich
8 | webrtcvad
9 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # macOS
 2 | .DS_Store
 3 | .AppleDouble
 4 | .LSOverride
 5 | 
 6 | # Ignore audio.wav
 7 | audio.wav
 8 | 
 9 | # Ignore env variables file
10 | .env
11 | 
12 | __pycache__/
13 | dist
14 | build
15 | main.spec


--------------------------------------------------------------------------------
/.sample_env:
--------------------------------------------------------------------------------
 1 | # Defaults for OpenAI API:
 2 | # OPENAI_API_KEY="sk-<your_key_here>"
 3 | # OPENAI_BASE_URL="https://api.openai.com/v1"
 4 | # OPENAI_MODEL_NAME="whisper-1"
 5 | 
 6 | 
 7 | # Defaults for local faster_whisper_server:
 8 | OPENAI_API_KEY="sk-<your_key_here>"
 9 | OPENAI_BASE_URL="http://localhost:7000/v1"
10 | 
11 | OPENAI_MODEL_NAME="Systran/faster-distil-whisper-large-v3"
12 | # OPENAI_MODEL_NAME="deepdml/faster-whisper-large-v3-turbo-ct2"
13 | 
14 | UTTERTYPE_RECORD_HOTKEYS="<ctrl>+<alt>+v"
15 | # UTTERTYPE_RECORD_HOTKEYS="<cmd>+<ctrl>"
16 | 
17 | # Minimum duration of speech to send to API in case of silence
18 | UTTERTYPE_MIN_TRANSCRIPTION_SIZE_MS=10000 # defaults to: 1500


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from pynput import keyboard
 3 | from transcriber import WhisperAPITranscriber
 4 | from table_interface import ConsoleTable
 5 | from key_listener import create_keylistener
 6 | from dotenv import load_dotenv
 7 | from utils import manual_type
 8 | 
 9 | 
async def main():
    """
    Wire the hotkey listener to the transcriber and type out each result.

    Loads environment variables, starts a global keyboard listener whose
    press/release events drive recording, then consumes finished
    transcriptions forever: each one is typed into the focused window and
    logged in the console table together with its cost.
    """
    load_dotenv()

    transcriber = WhisperAPITranscriber.create()
    hotkey = create_keylistener(transcriber)

    # The listener runs on its own thread; start() returns immediately.
    listener = keyboard.Listener(on_press=hotkey.press, on_release=hotkey.release)
    listener.start()

    console_table = ConsoleTable()
    with console_table:
        async for text, duration_ms in transcriber.get_transcriptions():
            manual_type(text.strip())
            # 0.0001 is applied per second of audio (duration is in ms);
            # presumably the API's per-second price -- TODO confirm.
            cost = round(0.0001 * duration_ms / 1000, 6)
            console_table.insert(text, cost)
25 | 
26 | 
# Run the async entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())
29 | 


--------------------------------------------------------------------------------
/start_uttertype.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Get directory of the script
 4 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 5 | 
 6 | # Check if tmux is installed
 7 | if ! command -v tmux &> /dev/null; then
 8 |     echo "tmux is not installed. Please install it first."
 9 |     exit 1
10 | fi
11 | 
12 | # Check if pipenv is installed and create virtual environment if needed
13 | if command -v pipenv &> /dev/null; then
14 |     cd "$SCRIPT_DIR"
15 |     # Create/update virtual environment if needed
16 |     pipenv install --quiet
17 |     # Get the path to the virtual environment's Python
18 |     VENV_PYTHON=$(pipenv --py)
19 | else
20 |     echo "pipenv is not installed. Using system Python."
21 |     VENV_PYTHON=$(which python)
22 | fi
23 | 
24 | # Create new tmux session if it doesn't exist
25 | if ! tmux has-session -t uttertype 2>/dev/null; then
26 |     tmux new-session -s uttertype -d
27 |     tmux send-keys -t uttertype "cd '$SCRIPT_DIR' && '$VENV_PYTHON' main.py" C-m
28 | fi


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 Dhruv Yadav
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import time
 3 | import pyperclip
 4 | import pyautogui
 5 | from typing import List
 6 | from pynput import keyboard
 7 | 
# Shared pynput controller used by manual_type() to emit synthetic key events.
keyboard_writer = keyboard.Controller()
 9 | 
10 | 
def clipboard_type(text, restore_delay: float = 0.1):
    """
    Instead of typing each key, just copy to clipboard and paste
    Probably won't work for some fields that don't accept pasting

    The previous clipboard content is always put back, even when sending the
    paste shortcut fails.

    Args:
        text: The text to paste into the focused window.
        restore_delay: Seconds to wait after the paste shortcut before the
            original clipboard content is restored.
    """
    original_clipboard_content = pyperclip.paste()
    pyperclip.copy(text)
    print("Pasting:", text)
    try:
        pyautogui.hotkey("command" if sys.platform == "darwin" else "ctrl", "v")
        # NOTE(review): the target application presumably reads the clipboard
        # some time after the keystroke; restoring immediately risks pasting
        # the old content, hence the short grace period.
        time.sleep(restore_delay)
    finally:
        pyperclip.copy(original_clipboard_content)
21 | 
22 | 
def manual_type(text: str, delay: float = 0.0042):
    """
    Emit `text` one key at a time, pausing `delay` seconds after each key so
    the receiving application is not flooded.
    Mirrors keyboard.Controller.type(), with the pause added.

    Raises keyboard_writer.InvalidCharacterException (position, character)
    when a character cannot be typed.
    """
    for position, char in enumerate(text):
        # Control characters are looked up in pynput's control-code table;
        # everything else is sent as the character itself.
        stroke = keyboard._CONTROL_CODES.get(char, char)
        try:
            keyboard_writer.press(stroke)
            keyboard_writer.release(stroke)
            time.sleep(delay)
        except (ValueError, keyboard_writer.InvalidKeyException):
            raise keyboard_writer.InvalidCharacterException(position, char)
36 | 
37 | 
def transcription_concat(transcriptions: List[str]) -> str:
    """
    Join partial transcriptions into one string separated by single spaces.

    Each piece is stripped of surrounding whitespace. Pieces that are empty
    after stripping are skipped so they do not leave doubled or
    leading/trailing spaces in the result.
    """
    stripped = (piece.strip() for piece in transcriptions)
    return " ".join(piece for piece in stripped if piece)
40 | 


--------------------------------------------------------------------------------
/key_listener.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from pynput.keyboard import HotKey
 4 | 
 5 | 
 6 | class HoldHotKey(HotKey):
 7 |     def __init__(self, keys, on_activate, on_deactivate):
 8 |         self.active = False
 9 | 
10 |         def _mod_on_activate():
11 |             self.active = True
12 |             on_activate()
13 | 
14 |         def _mod_on_deactivate():
15 |             self.active = False
16 |             on_deactivate()
17 | 
18 |         super().__init__(keys, _mod_on_activate)
19 |         self._on_deactivate = _mod_on_deactivate
20 | 
21 |     def release(self, key):
22 |         super().release(key)
23 |         if self.active and self._state != self._keys:
24 |             self._on_deactivate()
25 | 
26 | 
class HoldGlobeKey:
    """
    For macOS only, globe key requires special handling

    Every event whose key has `vk == 63` flips the held state: the first one
    calls `on_activate`, the next one calls `on_deactivate`, and so on.
    """

    def __init__(self, on_activate, on_deactivate):
        self._on_activate = on_activate
        self._on_deactivate = on_deactivate
        self.held = False

    def press(self, key):
        # Keys without a `vk` attribute, or with another code, are ignored.
        if getattr(key, "vk", None) != 63:
            return
        callback = self._on_deactivate if self.held else self._on_activate
        callback()
        self.held = not self.held

    def release(self, key):
        """Press and release signals are mixed for globe key"""
        self.press(key)
48 | 
49 | 
def create_keylistener(transcriber, env_var="UTTERTYPE_RECORD_HOTKEYS"):
    """
    Build the hold-to-record key handler for `transcriber`.

    The hotkey is read from the environment variable named by `env_var`.
    On macOS an unset/empty value or "<globe>" selects the globe key; in all
    other cases the value (or "<ctrl>+<alt>+v" when empty) is parsed as a
    pynput hotkey combination.

    Returns an object exposing `press(key)` and `release(key)`.
    """
    configured = os.getenv(env_var, "")
    on_macos = sys.platform == "darwin"

    if on_macos and configured in ("<globe>", ""):
        return HoldGlobeKey(
            on_activate=transcriber.start_recording,
            on_deactivate=transcriber.stop_recording,
        )

    combination = configured or "<ctrl>+<alt>+v"
    parsed_keys = HoldHotKey.parse(combination)
    return HoldHotKey(
        parsed_keys,
        on_activate=transcriber.start_recording,
        on_deactivate=transcriber.stop_recording,
    )
66 | 


--------------------------------------------------------------------------------
/table_interface.py:
--------------------------------------------------------------------------------
 1 | from rich import box
 2 | from rich.align import Align
 3 | from rich.console import Console
 4 | from rich.live import Live
 5 | from rich.table import Table
 6 | from rich.text import Text
 7 | from datetime import datetime
 8 | 
 9 | 
class ConsoleTable:
    """
    Live-updating console table listing each transcription with its cost,
    plus a running total in the footer.

    Use as a context manager: entering sets up the columns and starts the
    live rendering, exiting stops it.

    Args:
        total_cost_decimals: Number of decimal places used when displaying
            costs.
    """

    def __init__(self, total_cost_decimals: int = 6):
        self.console = Console()
        self.table = Table(show_footer=False)
        self.total_cost = 0
        self.total_cost_decimals = total_cost_decimals

    @staticmethod
    def _ordinal_suffix(day: int) -> str:
        """Return the English ordinal suffix ("st", "nd", "rd", "th") for `day`."""
        if 11 <= day % 100 <= 13:  # 11th, 12th, 13th are irregular
            return "th"
        return {1: "st", 2: "nd", 3: "rd"}.get(day % 10, "th")

    def _format_cost(self, cost: float) -> str:
        """Format `cost` as fixed-point dollars (never scientific notation)."""
        return f"${cost:.{self.total_cost_decimals}f}"

    def _update_cost(self, cost: float):
        """Add `cost` to the running total and refresh the footer cell."""
        self.total_cost += cost
        self.table.columns[2].footer = self._format_cost(self.total_cost)

    def _setup_table(self):
        """Clear the console, create the three columns and apply styling."""
        self.centered_table = Align.center(self.table)
        self.console.clear()
        self.table.add_column("Date", no_wrap=True)
        self.table.add_column(
            "Transcription", Text.from_markup("[b]Total:", justify="right")
        )
        self.table.add_column(
            "Cost", Text.from_markup("[b]$0", justify="right"), no_wrap=True
        )
        self.table.show_footer = True

        self.table.columns[0].header_style = "bold green"
        self.table.columns[0].style = "green"
        self.table.columns[1].header_style = "bold blue"
        self.table.columns[1].style = "blue"
        self.table.columns[1].footer = "Total"
        self.table.columns[2].header_style = "bold cyan"
        self.table.columns[2].style = "cyan"
        self.table.row_styles = ["none", "dim"]
        self.table.box = box.SIMPLE_HEAD

    def __enter__(self):
        self._setup_table()
        self.live_rendering = Live(
            self.centered_table,
            console=self.console,
            screen=False,
            refresh_per_second=5,
            vertical_overflow="visible",
        )
        self.live_rendering.__enter__()
        # Returning self allows `with ConsoleTable() as table:`
        return self

    def __exit__(self, *args, **kwargs):
        self.live_rendering.__exit__(*args, **kwargs)

    def insert(self, transcription: str, cost: float):
        """
        Append a row stamped with the current local time and add `cost` to
        the running total.
        """
        current_datetime = datetime.now()
        suffix = self._ordinal_suffix(current_datetime.day)
        formatted_datetime = current_datetime.strftime(f"%d{suffix} %B, %I:%M%p")
        self.table.add_row(formatted_datetime, transcription, self._format_cost(cost))
        self._update_cost(cost)
65 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # uttertype ([demo](https://www.youtube.com/watch?v=eSDYIFzU_fY))
  2 | 
  3 | <img src="./assets/sample_terminal.png" alt="alt text" style="width: 500px;"/>
  4 | 
  5 | ## Setup
  6 | 
  7 | ### 1. [Install PortAudio/PyAudio](https://people.csail.mit.edu/hubert/pyaudio/)
  8 | #### macOS
  9 | Installing portaudio on macOS can be somewhat tricky, especially on M1+ chips (Apple Silicon). In general, using conda seems to be the safest way to install portaudio
 10 | ```
 11 | conda install portaudio
 12 | ```
 13 | If that doesn't work, try installing using Homebrew
 14 | ```sh
 15 | brew install portaudio
 16 | ```
 17 | 
 18 | #### Windows
 19 | ```
 20 | python -m pip install pyaudio
 21 | ```
 22 | #### Linux
 23 | ```
 24 | sudo apt-get install python3-pyaudio
 25 | ```
 26 | ### 2. Add a HotKey
 27 | For macOS, the hotkey is automatically set to the globe key by default (&#127760; bottom left key). For Windows and Linux, you can configure the hotkey by setting the `UTTERTYPE_RECORD_HOTKEYS` environment variable in `.env`:
 28 | ```env
 29 | UTTERTYPE_RECORD_HOTKEYS="<ctrl>+<alt>+v"
 30 | ```
 31 | 
 32 | For more context, view the [pynput documentation for using HotKeys](https://pynput.readthedocs.io/en/latest/keyboard.html#global-hotkeys) (HoldHotKey is extended from this class).
 33 | 
 34 | ### 3. Install Dependencies
 35 | Choose one of the following methods to install the required dependencies:
 36 | 
 37 | #### Option A: Using pip
 38 | ```shell
 39 | python -m pip install -r requirements.txt
 40 | ```
 41 | 
 42 | #### Option B: Using pipenv
 43 | First, install pipenv if you haven't already:
 44 | ```shell
 45 | pip install pipenv
 46 | ```
 47 | 
 48 | Then, install dependencies using pipenv:
 49 | ```shell
 50 | pipenv install
 51 | ```
 52 | 
 53 | This will create a virtual environment and install all dependencies from the Pipfile. To activate the environment:
 54 | ```shell
 55 | pipenv shell
 56 | ```
 57 | 
 58 | 
 59 | If during/after installation on Linux you see an error similar to:
 60 | ```
 61 | ImportError: /home/soul/anaconda3/lib/libstdc++.so.6: version `GLIBCXX_3.4.32' not found (required by /lib/x86_64-linux-gnu/libjack.so.0)
 62 | ```
 63 | Check out [StackOverflow](https://stackoverflow.com/questions/72540359/glibcxx-3-4-30-not-found-for-librosa-in-conda-virtual-environment-after-tryin) and [Berkeley](https://bcourses.berkeley.edu/courses/1478831/pages/glibcxx-missing)
 64 | 
 65 | 
 66 | ### 4. Configure OpenAI Settings
 67 | 
 68 | You can configure uttertype to work with either OpenAI's official API or a local Whisper server. There are two ways to set this up:
 69 | 
 70 | #### Option A: Using a .env file (Recommended)
 71 | Create a `.env` file in the project directory with these settings:
 72 | 
 73 | ```env
 74 | # 1. Required: Your API key
 75 | OPENAI_API_KEY="sk-your-key-here"
 76 | 
 77 | # 2. Optional: Choose your API endpoint
 78 | # For OpenAI's official API (default):
 79 | OPENAI_BASE_URL="https://api.openai.com/v1"
 80 | # OR for a local [Faster Whisper server](https://github.com/fedirz/faster-whisper-server):
 81 | OPENAI_BASE_URL="http://localhost:7000/v1"
 82 | 
 83 | # 3. Optional: Select your preferred model
 84 | # For OpenAI's official API:
 85 | OPENAI_MODEL_NAME="whisper-1"
 86 | # OR for local Whisper server, some options include:
 87 | OPENAI_MODEL_NAME="Systran/faster-whisper-small"
 88 | OPENAI_MODEL_NAME="Systran/faster-distil-whisper-large-v3"
 89 | OPENAI_MODEL_NAME="deepdml/faster-whisper-large-v3-turbo-ct2"
 90 | ```
 91 | 
 92 | #### Option B: Using Environment Variables
 93 | You can also set these values directly in your terminal:
 94 | 
 95 | For Linux/macOS:
 96 | ```shell
 97 | export OPENAI_API_KEY="sk-your-key-here"
 98 | export OPENAI_BASE_URL="https://api.openai.com/v1" # optional
 99 | export OPENAI_MODEL_NAME="whisper-1" # optional
100 | ```
101 | 
102 | For Windows:
103 | ```shell
104 | $env:OPENAI_API_KEY = "sk-your-key-here"
105 | $env:OPENAI_BASE_URL = "https://api.openai.com/v1"  # optional
106 | $env:OPENAI_MODEL_NAME = "whisper-1"  # optional
107 | ```
108 | 
109 | See [`.sample_env`](.sample_env) in the repository for example configurations.
110 | 
111 | #### Using a Local Whisper Server
112 | For faster and cheaper transcription, you can set up a local [faster-whisper-server](https://github.com/fedirz/faster-whisper-server). When using a local server:
113 | 
114 | 1. Set `OPENAI_BASE_URL` to your server's address (e.g., `http://localhost:7000/v1`)
115 | 2. Choose from supported local models like:
116 |    - `Systran/faster-whisper-small` (fastest)
117 |    - `Systran/faster-distil-whisper-large-v3` (most accurate)
118 |    - `deepdml/faster-whisper-large-v3-turbo-ct2` (almost as good, but faster)
119 | 
120 | ### 5. Final run and permissions
121 | Finally, run main.py
122 | ```shell
123 | python main.py
124 | ```
125 | OR
126 | ```shell
127 | ./start_uttertype.sh # requires tmux; uses the pipenv environment if pipenv is installed, otherwise the system Python
128 | ```
129 | 
130 | When the program first runs, you will likely need to give it sufficient permissions. On macOS, this will include adding terminal to accessibility under `Privacy and Security > Accessibility`, giving it permission to monitor the keyboard, and finally giving it permission to record using the microphone.
131 | 
132 | ## Usage
133 | To start transcription, press and hold the registered hotkey to start recording. To stop the recording, release the registered hotkey. On macOS, the registered hotkey is the globe key by default. For other operating systems, the hotkey defaults to `<ctrl>+<alt>+v` and can be changed with the `UTTERTYPE_RECORD_HOTKEYS` environment variable as described earlier.
134 | 


--------------------------------------------------------------------------------
/transcriber.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import io
  3 | from typing import List, Tuple
  4 | import pyaudio
  5 | import wave
  6 | from openai import OpenAI
  7 | import asyncio
  8 | from threading import Thread, Event
  9 | import webrtcvad
 10 | from utils import transcription_concat
 11 | import tempfile
 12 | 
# Capture settings shared by the recorder and the WAV writer.
FORMAT = pyaudio.paInt16  # Audio format
CHANNELS = 1  # Mono audio
RATE = 16000  # Sample rate
CHUNK_DURATION_MS = 30  # Frame duration in milliseconds
CHUNK = int(RATE * CHUNK_DURATION_MS / 1000)  # Samples per frame (480 with the values above)
# Evaluated once, when this module is imported: the variable must already be
# present in the process environment at import time for an override to apply.
MIN_TRANSCRIPTION_SIZE_MS = int(
    os.getenv('UTTERTYPE_MIN_TRANSCRIPTION_SIZE_MS', 1500) # Minimum duration of speech to send to API in case of silence
)
 21 | 
 22 | 
 23 | class AudioTranscriber:
 24 |     def __init__(self):
 25 |         self.audio = pyaudio.PyAudio()
 26 |         self.recording_finished = Event()  # Threading event to end recording
 27 |         self.recording_finished.set()  # Initialize as finished
 28 |         self.frames = []
 29 |         self.audio_duration = 0
 30 |         self.rolling_transcriptions: List[Tuple[int, str]] = []  # (idx, transcription)
 31 |         self.rolling_requests: List[Thread] = []  # list of pending requests
 32 |         self.event_loop = asyncio.get_event_loop()
 33 |         self.vad = webrtcvad.Vad(1)  # Voice Activity Detector, mode can be 0 to 3
 34 |         self.transcriptions = asyncio.Queue()
 35 | 
 36 |     def start_recording(self):
 37 |         """Start recording audio from the microphone."""
 38 | 
 39 |         # Start a new recording in the background, do not block
 40 |         def _record():
 41 |             self.recording_finished = Event()
 42 |             stream = self.audio.open(
 43 |                 format=FORMAT,
 44 |                 channels=CHANNELS,
 45 |                 rate=RATE,
 46 |                 input=True,
 47 |                 frames_per_buffer=CHUNK,
 48 |             )
 49 |             intermediate_trancriptions_idx = 0
 50 |             while (
 51 |                 not self.recording_finished.is_set()
 52 |             ):  # Keep recording until interrupted
 53 |                 data = stream.read(CHUNK)
 54 |                 self.audio_duration += CHUNK_DURATION_MS
 55 |                 is_speech = self.vad.is_speech(data, RATE)
 56 |                 current_audio_duration = len(self.frames) * CHUNK_DURATION_MS
 57 |                 if (
 58 |                     not is_speech
 59 |                     and current_audio_duration >= MIN_TRANSCRIPTION_SIZE_MS
 60 |                 ):  # silence
 61 |                     rolling_request = Thread(
 62 |                         target=self._intermediate_transcription,
 63 |                         args=(
 64 |                             intermediate_trancriptions_idx,
 65 |                             self._frames_to_wav(),
 66 |                         ),
 67 |                     )
 68 |                     self.frames = []
 69 |                     self.rolling_requests.append(rolling_request)
 70 |                     rolling_request.start()
 71 |                     intermediate_trancriptions_idx += 1
 72 |                 self.frames.append(data)
 73 | 
 74 |         # start recording in a new non-blocking thread
 75 |         Thread(target=_record).start()
 76 | 
 77 |     def stop_recording(self):
 78 |         """Stop the recording and reset variables"""
 79 |         self.recording_finished.set()
 80 |         self._finish_transcription()
 81 |         self.frames = []
 82 |         self.audio_duration = 0
 83 |         self.rolling_requests = []
 84 |         self.rolling_transcriptions = []
 85 | 
 86 |     def _intermediate_transcription(self, idx, audio):
 87 |         intermediate_transcription = self.transcribe_audio(audio)
 88 |         self.rolling_transcriptions.append((idx, intermediate_transcription))
 89 | 
 90 |     def _finish_transcription(self):
 91 |         transcription = self.transcribe_audio(
 92 |             self._frames_to_wav()
 93 |         )  # Last transcription
 94 |         for request in self.rolling_requests:  # Wait for rolling requests
 95 |             request.join()
 96 |         self.rolling_transcriptions.append(
 97 |             (len(self.rolling_transcriptions), transcription)
 98 |         )
 99 |         sorted(self.rolling_transcriptions, key=lambda x: x[0])  # Sort by idx
100 |         transcriptions = [
101 |             t[1] for t in self.rolling_transcriptions
102 |         ]  # Get ordered transcriptions
103 |         self.event_loop.call_soon_threadsafe(  # Put final combined result in finished queue
104 |             self.transcriptions.put_nowait,
105 |             (transcription_concat(transcriptions), self.audio_duration),
106 |         )
107 | 
108 |     def _frames_to_wav(self):
109 |         buffer = io.BytesIO()
110 |         buffer.name = "tmp.wav"
111 |         wf = wave.open(buffer, "wb")
112 |         wf.setnchannels(CHANNELS)
113 |         wf.setsampwidth(self.audio.get_sample_size(FORMAT))
114 |         wf.setframerate(RATE)
115 |         wf.writeframes(b"".join(self.frames))
116 |         wf.close()
117 |         return buffer
118 | 
119 |     def transcribe_audio(self, audio: io.BytesIO) -> str:
120 |         raise NotImplementedError("Please use a subclass of AudioTranscriber")
121 | 
122 |     async def get_transcriptions(self):
123 |         """
124 |         Asynchronously get transcriptions from the queue.
125 |         Returns (transcription string, audio duration in ms).
126 |         """
127 |         while True:
128 |             transcription = await self.transcriptions.get()
129 |             yield transcription
130 |             self.transcriptions.task_done()
131 | 
132 | 
class WhisperAPITranscriber(AudioTranscriber):
    """
    Transcriber that sends audio to a transcription endpoint through the
    OpenAI client.
    """

    def __init__(self, base_url, model_name, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.client = OpenAI(base_url=base_url)
        self.model_name = model_name

    @staticmethod
    def create(*args, **kwargs):
        """
        Build a transcriber from OPENAI_BASE_URL and OPENAI_MODEL_NAME,
        falling back to the official endpoint and "whisper-1".
        Positional and keyword arguments are accepted but not used.
        """
        return WhisperAPITranscriber(
            os.getenv('OPENAI_BASE_URL', 'https://api.openai.com/v1'),
            os.getenv('OPENAI_MODEL_NAME', 'whisper-1'),
        )

    def transcribe_audio(self, audio: io.BytesIO) -> str:
        """
        Return the text for `audio`; on any error the message is printed and
        an empty string is returned.
        """
        try:
            return self.client.audio.transcriptions.create(
                model=self.model_name,
                file=audio,
                response_format="text",
                language="en",
                prompt="The following is normal speech or technical speech from an engineer.",
            )
        except Exception as e:
            print(f"Encountered Error: {e}")
            return ""
160 | 
161 | 
class WhisperLocalMLXTranscriber(AudioTranscriber):
    """
    Transcriber that runs a LightningWhisperMLX model locally.

    Args:
        model_type: Model identifier passed to LightningWhisperMLX.
    """

    def __init__(self, model_type="distil-medium.en", *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Imported here so the dependency is only needed when this class is used.
        from lightning_whisper_mlx import LightningWhisperMLX

        self.model = LightningWhisperMLX(model_type)

    def transcribe_audio(self, audio: io.BytesIO) -> str:
        """
        Write `audio` to a temporary .wav file, transcribe it by path and
        return the text. On any error the message is printed and an empty
        string is returned. The temporary file is removed in every case.
        """
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
                tmp_path = tmpfile.name
                tmpfile.write(audio.getvalue())
            # The file is closed (and therefore flushed) before the model
            # opens it by name, so the model sees the complete audio.
            return self.model.transcribe(tmp_path)["text"]
        except Exception as e:
            print(f"Encountered Error: {e}")
            return ""
        finally:
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass  # Best-effort cleanup; nothing useful to do if it fails
179 | 


--------------------------------------------------------------------------------