├── .python-version
├── .gitignore
├── dictation_remote.service
├── run_dictation_local.sh
├── legacy_auto_off
│   ├── run_dictation_auto_off.sh
│   ├── README.md
│   ├── engine.py
│   ├── dictation_local.service
│   ├── _lagacy_dictation_auto_off_pynput.py
│   └── dictation_auto_off.py
├── pyproject.toml
├── README.md
└── dictation.py

/.python-version:
--------------------------------------------------------------------------------
3.13
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.vscode
venv
.venv
.env
__pycache__
uv.lock
uv_local.lock
tmp.wav
whisper_simple_dictation.egg-info
--------------------------------------------------------------------------------
/dictation_remote.service:
--------------------------------------------------------------------------------
[Unit]
Description=Whisper Simple Dictation Service

[Service]
WorkingDirectory=%h/apps/whisper-simple-dictation
ExecStart=%h/apps/whisper-simple-dictation/.venv/bin/python dictation.py remote en
Restart=on-failure
RestartSec=5

[Install]
WantedBy=default.target
--------------------------------------------------------------------------------
/run_dictation_local.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
script_path="$(realpath "$0")"
script_dir="$(dirname "$script_path")"
cd "$script_dir"

source .venv/bin/activate
export LD_LIBRARY_PATH="$VIRTUAL_ENV/lib/python3.13/site-packages/nvidia/cublas/lib:$VIRTUAL_ENV/lib/python3.13/site-packages/nvidia/cudnn/lib"
.venv/bin/python3 dictation.py local "$@"
--------------------------------------------------------------------------------
/legacy_auto_off/run_dictation_auto_off.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
script_path="$(realpath "$0")"
script_dir="$(dirname "$script_path")"
cd "$script_dir"
source .venv/bin/activate
export LD_LIBRARY_PATH="$(python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))')"
.venv/bin/python3 dictation_auto_off.py "$@"
--------------------------------------------------------------------------------
/legacy_auto_off/README.md:
--------------------------------------------------------------------------------
This folder contains a script that unloads the local model after it has been idle for some time, and loads it back on demand.

---------

When running locally, `dictation_auto_off.py` uses evdev, which only works on Linux. On Windows and Mac you can try `_lagacy_dictation_auto_off_pynput.py`, which uses pynput instead. (Modify `run_dictation_auto_off.sh` accordingly.)

Pynput works on all major operating systems, but not on Wayland.

Evdev works with both Wayland and X11 and supports special characters, but only works on Linux.

Ydotool works with both Wayland and X11, but does not support special characters. Maybe it would be possible to work around this by making ydotool trigger a paste instead.
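A minimal sketch of that idea (not part of this repo; `paste_via_ydotool` is a hypothetical name, and 29/42/47 are the codes of left ctrl, left shift and v from `linux/input-event-codes.h`):

```
import subprocess

import pyperclip


def paste_via_ydotool(text):
    # put the text on the clipboard, then have ydotool press ctrl+shift+v
    # (press each key, then release in reverse order)
    pyperclip.copy(text)
    subprocess.run(["ydotool", "key", "29:1", "42:1", "47:1", "47:0", "42:0", "29:0"])
```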
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[project]
name = "whisper-simple-dictation"
version = "0.1.0"
description = "Whisper dictation with OpenAI API"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "numpy",
    # keyboard and sound support
    "pynput",
    "sounddevice",
    # clipboard support
    "pyperclip",
    # for remote:
    "openai",
    "soundfile",
]

[project.optional-dependencies]
local = [
    "faster-whisper",
    # for faster-whisper, requiring cuda 12
    "nvidia-cublas-cu12",
    "nvidia-cudnn-cu12~=8.9",
    # "evdev",
    # "flask",
]

[tool.setuptools]
py-modules = ["dictation"]
--------------------------------------------------------------------------------
/legacy_auto_off/engine.py:
--------------------------------------------------------------------------------
import sys

import numpy as np
from flask import Flask, request, jsonify
from faster_whisper import WhisperModel

app = Flask(__name__)

model_name = sys.argv[1]
print(f"Using model: {model_name}")

model = WhisperModel(model_name, device="cuda", compute_type="float16")


def get_text_local(audio, context=None):
    # note: the language is hard-coded to English here; the language argument
    # of the dictation scripts is not forwarded to this server
    segments, info = model.transcribe(audio, beam_size=5, language="en", initial_prompt=context)
    segments = list(segments)
    text = " ".join([segment.text.strip() for segment in segments])
    return text


@app.route("/transcribe", methods=["POST"])
def transcribe():
    data = request.get_json()
    if "audio" not in data:
        return jsonify({"error": "No audio data provided"}), 400

    context = data.get("context", None)

    try:
        audio_array = np.array(data["audio"])
        text = get_text_local(audio_array, context)
        return jsonify({"text": text}), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500


if __name__ == "__main__":
    app.run(debug=True, host="0.0.0.0", port=5900, use_reloader=False)
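
# a quick manual sanity check of this endpoint, with the server running
# (not part of the repo's flow; one second of 16 kHz silence as the audio):
#
#   import requests
#   payload = {"audio": [0.0] * 16000, "context": None}
#   r = requests.post("http://127.0.0.1:5900/transcribe", json=payload)
#   print(r.status_code, r.json())  # -> {"text": ...} or {"error": ...}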
--------------------------------------------------------------------------------
/legacy_auto_off/dictation_local.service:
--------------------------------------------------------------------------------
[Unit]
Description=Whisper Simple Dictation Service

[Service]
Type=simple
Environment="WAYLAND_DISPLAY=wayland-0"
Environment="DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus"
; the sudo tee ... command serves to indicate that the engine is running, using an LED
; feel free to tweak it or remove it
; if you keep it, add this to the sudoers file:
; ALL ALL=NOPASSWD: /usr/bin/tee /sys/class/leds/platform\:\:mute/brightness
ExecStart=/bin/bash /home/USERNAME/apps/whisper-simple-dictation/run_dictation_auto_off.sh en --on-callback 'echo 1 | sudo tee /sys/class/leds/platform::mute/brightness' --off-callback 'echo 0 | sudo tee /sys/class/leds/platform::mute/brightness' --auto-off-time 120 --model medium
Restart=no

[Install]
WantedBy=default.target

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; alternative ExecStart, in case you don't want to add your user to the input group
; but then, remember to add this rule to your sudoers file:
; ALL ALL=(ALL) NOPASSWD: SETENV: /bin/bash /home/USERNAME/apps/whisper-simple-dictation/run_dictation_auto_off.sh *
; you will need to tweak the --recording-device name - sd.query_devices() will list all available devices
; ExecStart=/usr/bin/sudo -E /bin/bash /home/USERNAME/apps/whisper-simple-dictation/run_dictation_auto_off.sh en --on-callback 'echo 1 | sudo tee /sys/class/leds/platform::mute/brightness' --off-callback 'echo 0 | sudo tee /sys/class/leds/platform::mute/brightness' --auto-off-time 120 --model medium --recording-device "sof-hda-dsp: - (hw:1,6)"
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Whisper simple dictation

- press a key to start recording
- release it to stop recording
- Whisper transcribes it
- the text is typed with simulated keypresses

You can either run Whisper locally or through OpenAI's API.

For local execution you need a CUDA device with at least 4 GB of VRAM. It uses Whisper `large-v3`, run with faster-whisper.

With remote execution, OpenAI's API adds about a 1 second delay (as of Jan 2024), while local transcription is near instant.


## Installation

```
git clone https://github.com/filyp/whisper-simple-dictation.git
cd whisper-simple-dictation
python3 -m venv .venv --copies
.venv/bin/pip install -e .
```

If using Wayland, you also need to install ydotool and enable ydotoold. (The script tries to use ydotool, and if it's not installed, it falls back to pynput.)

### Remote
If you want to run remotely, run:
```
echo sk-... > ~/.config/openai.token
```
where `sk-...` is your OpenAI API token.

### Local
If you want to run locally, additionally run:
```
.venv/bin/pip install -e ".[local]"
sudo usermod -aG input __YOUR_USER_NAME__
```

Then log out and back in.

(If you're using Wayland and don't want to add your user to the input group for security reasons, see the instructions in `dictation_local.service`. On X11 it doesn't matter - devices are exposed anyway.)


## Running

To run remotely:
```
.venv/bin/python3 dictation.py remote en
```

To run locally:
```
bash run_dictation_local.sh en
```

Ctrl-c to stop.

By default the record key is *right* ctrl. You can change it in `dictation.py`, but it must be a modifier key (shift, alt, ...).
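For example, to record while holding right shift instead, change this line near the top of `dictation.py`:
```
rec_key = pynput.keyboard.Key.shift_r  # default: pynput.keyboard.Key.ctrl_r
```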

To set up a service that will run whisper-simple-dictation, take a look at `dictation_remote.service`. (Typically you would copy it to `~/.config/systemd/user/` and run `systemctl --user enable --now dictation_remote.service`.)

(Note that the way we send text is by copying it to the clipboard and then sending Ctrl+Shift+V. That's because typing the text normally is hard to get right, with all the special characters. If using ydotool, the text is typed directly, because pasting is not implemented.)

## Options

- **Language.** The first argument (`en` in the examples above) sets the language. You can also omit it to detect the language automatically, but that can worsen latency and accuracy.
- **Choosing a model.** The default is `large-v3`. You can also pass e.g. `--model medium` or `--model small`.

## Other approaches

At first I wanted real-time dictation, similar to [nerd-dictation](https://github.com/ideasman42/nerd-dictation). There's [whisper_streaming](https://github.com/ufal/whisper_streaming), which implements something similar: continuous transcription using Whisper. But it has a 3.3 second time lag, and because it needs to run Whisper on many overlapping time windows, it's more compute-heavy. Also, those transcribed time windows are sometimes merged incorrectly. That may be good enough for creating captions, but not really for dictation.

With some clever engineering and a lot of compute, maybe that time lag could be brought under a second. But I found that reading what you said with a few hundred milliseconds of delay is very confusing, similar to hearing your own voice delayed. So for now, I think the best and most reliable approach is the one used here. This may change with future neural nets, with architectures other than whisper, aimed at real-time transcription.

There's also [whisper-writer](https://github.com/savbell/whisper-writer), which is more mature, but doesn't (as of Jan 2024) have push-to-talk, which I find more pleasant to use.
--------------------------------------------------------------------------------
/legacy_auto_off/_lagacy_dictation_auto_off_pynput.py:
--------------------------------------------------------------------------------
# %%
# import os
# os.environ['LD_LIBRARY_PATH'] = "/home/filip/projects/whisper-rt/venv_faster/lib/python3.11/site-packages/nvidia/cublas/lib:/home/filip/projects/whisper-rt/venv_faster/lib/python3.11/site-packages/nvidia/cudnn/lib"
import argparse
import subprocess
import threading
import time

import numpy as np
import pynput
import pyperclip
import requests
import sounddevice as sd

# from dictation import get_context, type_using_clipboard

# ! you can change this rec_key value
rec_key = pynput.keyboard.Key.ctrl_r

whisper_samplerate = 16000  # sampling rate that whisper uses
recording_samplerate = 48000  # multiple of whisper_samplerate, widely supported

server_url = "http://0.0.0.0:5900"

controller = pynput.keyboard.Controller()

# %% parse args
parser = argparse.ArgumentParser()
parser.add_argument("language", nargs="?", default=None)
parser.add_argument("--no-grab-context", action="store_true")
parser.add_argument("--no-type-using-clipboard", action="store_true")
parser.add_argument("--context-limit-chars", type=int, default=300)
# a command to be run after model load
parser.add_argument("--on-callback", type=str, default=None)
# a command to be run after model unload
parser.add_argument("--off-callback", type=str, default=None)
# turn off automatically after this many seconds of inactivity
parser.add_argument("--auto-off-time", type=int, default=None)
# which whisper model to use
parser.add_argument("--model", type=str, default="large-v3")
args = parser.parse_args()


# start a server process
engine = subprocess.Popen(["python", "engine.py", args.model])
if args.on_callback is not None:
    subprocess.run(args.on_callback, shell=True)


# %%

def get_context():
    # use pynput to type ctrl+shift+home, then ctrl+c, then right arrow
    # first clear the clipboard, in case getting the context fails
    pyperclip.copy("")
    # ctrl+shift+home
    controller.press(pynput.keyboard.Key.ctrl_l)
    controller.press(pynput.keyboard.Key.shift_l)
    controller.press(pynput.keyboard.Key.home)
    controller.release(pynput.keyboard.Key.home)
    controller.release(pynput.keyboard.Key.shift_l)
    controller.release(pynput.keyboard.Key.ctrl_l)
    # ctrl+c
    controller.press(pynput.keyboard.Key.ctrl_l)
    controller.press("c")
    controller.release("c")
    controller.release(pynput.keyboard.Key.ctrl_l)
    # right arrow
    controller.press(pynput.keyboard.Key.right)
    controller.release(pynput.keyboard.Key.right)
    # get clipboard
    clipboard = pyperclip.paste()
    if clipboard == "":
        print("Warning: context is empty")
    return clipboard


def type_using_clipboard(text):
    # use pynput to type ctrl+shift+v
    pyperclip.copy(text)
    controller.press(pynput.keyboard.Key.ctrl_l)
    controller.press(pynput.keyboard.Key.shift_l)
    controller.press("v")
    controller.release("v")
    controller.release(pynput.keyboard.Key.shift_l)
    controller.release(pynput.keyboard.Key.ctrl_l)


# %%
rec_key_pressed = False
time_last_used = time.time()


def record_and_process():
    global engine

    # ! start the engine if not running
    if engine.poll() is not None:
        # the old process has exited, so start a fresh one
        print("Starting engine")
        engine = subprocess.Popen(["python", "engine.py", args.model])
        if args.on_callback is not None:
            subprocess.run(args.on_callback, shell=True)
    # ! record
    # while the key is pressed, record audio
    audio_chunks = []

    def audio_callback(indata, frames, time, status):
        if status:
            print("WARNING:", status)
        audio_chunks.append(indata.copy())

    stream = sd.InputStream(
        samplerate=recording_samplerate,
        channels=1,
        blocksize=256,
        callback=audio_callback,
    )
    stream.start()
    while rec_key_pressed:
        time.sleep(0.005)
    stream.stop()
    stream.close()
    recorded_audio = np.concatenate(audio_chunks)[:, 0]

    # ! check if not too short
    duration = len(recorded_audio) / recording_samplerate
    if duration <= 0.1:
        print("Recording too short, skipping")
        return

    # ! downsample
    # scipy resampling was much too slow (hundreds of ms)
    # so leave in only every 3rd sample, using numpy
    recorded_audio = recorded_audio[::3]

    # ! get context
    if not args.no_grab_context:
        context = get_context()
        # limit the length of the context
        context = context[-args.context_limit_chars :]
    else:
        context = None
    # print(context)

    # ! transcribe
    payload = {
        "audio": recorded_audio.tolist(),
        "context": context,
    }

    # note that the server can still be initializing, so retry the post until it works
    while True:
        try:
            response = requests.post(server_url + "/transcribe", json=payload)
            break
        except requests.exceptions.ConnectionError:
            time.sleep(0.1)
            print("Connection error, retrying...")

    response_data = response.json()

    if response.status_code == 200:
        text = response_data.get("text", "")
        print(text)
    else:
        print(f"Error transcribing audio: {response_data.get('error', 'Unknown error')}")
        return
    # ! type that text
    text = text + " "
    if not args.no_type_using_clipboard:
        type_using_clipboard(text)
    else:
        controller.type(text)
        # subprocess.run(["ydotool", "type", "--key-delay=0", "--key-hold=0", text])
        # note: ydotool on x11 correctly outputs polish chars and types in the terminal


def on_press(key):
    global rec_key_pressed
    # print("pressed", key)
    if key == rec_key:
        rec_key_pressed = True

        # start recording in a new thread
        t = threading.Thread(target=record_and_process)
        t.start()


def on_release(key):
    global rec_key_pressed, time_last_used
    # print("released", key)
    if key == rec_key:
        rec_key_pressed = False
        time_last_used = time.time()


# %%
if args.language is not None:
    print(f"Using language: {args.language}")
with pynput.keyboard.Listener(on_press=on_press, on_release=on_release) as listener:
    print(f"Press {rec_key} to start recording")
    try:
        # listener.join()
        while listener.is_alive():
            if (
                args.auto_off_time is not None
                and time.time() - time_last_used > args.auto_off_time
                and rec_key_pressed is False
                and engine.poll() is None
            ):
                print("Auto off")
                # shut down the server
                engine.terminate()

                if args.off_callback is not None:
                    subprocess.run(args.off_callback, shell=True)

            time.sleep(1)
    except KeyboardInterrupt:
        if args.off_callback is not None:
            subprocess.run(args.off_callback, shell=True)
        print("\nExiting...")
--------------------------------------------------------------------------------
/legacy_auto_off/dictation_auto_off.py:
--------------------------------------------------------------------------------
# %%
# import os
# os.environ['LD_LIBRARY_PATH'] = "/home/filip/projects/whisper-rt/venv_faster/lib/python3.11/site-packages/nvidia/cublas/lib:/home/filip/projects/whisper-rt/venv_faster/lib/python3.11/site-packages/nvidia/cudnn/lib"
import argparse
import subprocess
import threading
import time

import numpy as np
import pyperclip
import requests
import sounddevice as sd

# from dictation import get_context, type_using_clipboard

import evdev
from evdev import UInput, ecodes as e
from select import select

# ! you can change this rec_key value
rec_key = "KEY_RIGHTCTRL"

whisper_samplerate = 16000  # sampling rate that whisper uses
recording_samplerate = 48000  # multiple of whisper_samplerate, widely supported

server_url = "http://0.0.0.0:5900"

# keep only the input devices which report having the rec key
devices = [evdev.InputDevice(fn) for fn in evdev.list_devices()]
keyboards = [d for d in devices if rec_key in str(d.capabilities(verbose=True))]

writer = UInput()
time.sleep(1)

# %% parse args
parser = argparse.ArgumentParser()
parser.add_argument("language", nargs="?", default=None)
# a command to be run after model load
parser.add_argument("--on-callback", type=str, default=None)
# a command to be run after model unload
parser.add_argument("--off-callback", type=str, default=None)
# turn off automatically after this many seconds of inactivity
parser.add_argument("--auto-off-time", type=int, default=None)
# which whisper model to use
parser.add_argument("--model", type=str, default="large-v3")
# name of the recording device to use, as returned by sd.query_devices()
parser.add_argument("--recording-device", type=str, default=None)
args = parser.parse_args()

if args.recording_device is None:
    device_index = None
else:
    device_info = sd.query_devices()
    device_index = None
    for device in device_info:
        if args.recording_device in device["name"] and device["max_input_channels"] > 0:
            device_index = device["index"]
            break
    assert device_index is not None, "Couldn't find specified sound device"

# mock engine process, so that the first engine.poll() check below works
engine = subprocess.Popen(["echo", "mock engine"])

# %%

# def get_context():
#     # use pynput to type ctrl+shift+home, then ctrl+c, then right arrow
#     # first clear the clipboard, in case getting the context fails
#     pyperclip.copy("")
#     # ctrl+shift+home
#     controller.press(pynput.keyboard.Key.ctrl_l)
#     controller.press(pynput.keyboard.Key.shift_l)
#     controller.press(pynput.keyboard.Key.home)
#     controller.release(pynput.keyboard.Key.home)
#     controller.release(pynput.keyboard.Key.shift_l)
#     controller.release(pynput.keyboard.Key.ctrl_l)
#     # ctrl+c
#     controller.press(pynput.keyboard.Key.ctrl_l)
#     controller.press("c")
#     controller.release("c")
#     controller.release(pynput.keyboard.Key.ctrl_l)
#     # right arrow
#     controller.press(pynput.keyboard.Key.right)
#     controller.release(pynput.keyboard.Key.right)
#     # get clipboard
#     clipboard = pyperclip.paste()
#     if clipboard == "":
#         print("Warning: context is empty")
#     return clipboard


def type_using_clipboard(text):
    pyperclip.copy(text)
    # use evdev to type ctrl+shift+v
    time.sleep(0.01)
    writer.write(e.EV_KEY, e.KEY_LEFTCTRL, 1)
    writer.write(e.EV_KEY, e.KEY_LEFTSHIFT, 1)
    writer.write(e.EV_KEY, e.KEY_V, 1)
    writer.write(e.EV_KEY, e.KEY_V, 0)
    writer.write(e.EV_KEY, e.KEY_LEFTSHIFT, 0)
    writer.write(e.EV_KEY, e.KEY_LEFTCTRL, 0)
    writer.syn()


# %%
rec_key_pressed = False
time_last_used = time.time()


def record_and_process():
    global engine

    # ! start the engine if not running
    if engine.poll() is not None:
        # the old process has exited, so start a fresh one
        print("Starting engine")
        engine = subprocess.Popen(["python", "engine.py", args.model])
        if args.on_callback is not None:
            subprocess.run(args.on_callback, shell=True)

    # ! record
    # while the key is pressed, record audio
    audio_chunks = []

    def audio_callback(indata, frames, time, status):
        if status:
            print("WARNING:", status)
        audio_chunks.append(indata.copy())

    stream = sd.InputStream(
        samplerate=recording_samplerate,
        channels=1,
        blocksize=256,
        callback=audio_callback,
        device=device_index,
    )
    stream.start()
    while rec_key_pressed:
        time.sleep(0.005)
    stream.stop()
    stream.close()
    recorded_audio = np.concatenate(audio_chunks)[:, 0]

    # ! check if not too short
    duration = len(recorded_audio) / recording_samplerate
    if duration <= 0.1:
        print("Recording too short, skipping")
        return

    # ! downsample
    # scipy resampling was much too slow (hundreds of ms)
    # so leave in only every 3rd sample, using numpy
    recorded_audio = recorded_audio[::3]

    # ! get context
    # if not args.no_grab_context:
    #     context = get_context()
    #     # limit the length of the context
    #     context = context[-args.context_limit_chars :]
    context = None

    # ! transcribe
    payload = {"audio": recorded_audio.tolist(), "context": context}

    # note that the server can still be initializing, so retry the post until it works
    while True:
        try:
            response = requests.post(server_url + "/transcribe", json=payload)
            break
        except requests.exceptions.ConnectionError:
            time.sleep(0.1)
            print("Connection error, retrying...")

    response_data = response.json()

    if response.status_code == 200:
        text = response_data.get("text", "")
        print(text)
    else:
        print(
            f"Error transcribing audio: {response_data.get('error', 'Unknown error')}"
        )
        return
    # ! type that text
    text = text + " "
    type_using_clipboard(text)
    # print(text)


# %%

# read any keypress
try:
    while True:
        r, _, _ = select(keyboards, [], [], 1)
        for event in (event for dev in r for event in dev.read()):
            # check if it's the rec_key
            if event.code != evdev.ecodes.ecodes[rec_key]:
                continue
            if event.value == 1:
                rec_key_pressed = True
                # start recording in a new thread
                t = threading.Thread(target=record_and_process)
                t.start()

            elif event.value == 0:
                rec_key_pressed = False
                time_last_used = time.time()

        # check if we should shut down the engine
        if (
            engine.poll() is None
            and args.auto_off_time is not None
            and time.time() - time_last_used > args.auto_off_time
            and rec_key_pressed is False
        ):
            print("Auto off")
            # shut down the server
            engine.terminate()

            if args.off_callback is not None:
                subprocess.run(args.off_callback, shell=True)

except KeyboardInterrupt:
    print("\nExiting...")
    engine.terminate()
    writer.close()

    if args.off_callback is not None:
        subprocess.run(args.off_callback, shell=True)
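
# to find the exact name to pass as --recording-device, you can list the
# input-capable devices like this (a throwaway snippet, not used above):
#
#   import sounddevice as sd
#   for dev in sd.query_devices():
#       if dev["max_input_channels"] > 0:
#           print(dev["index"], dev["name"])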
--------------------------------------------------------------------------------
/dictation.py:
--------------------------------------------------------------------------------
# %%
import os

# os.environ['LD_LIBRARY_PATH'] = "/home/filip/projects/whisper-rt/venv_faster/lib/python3.11/site-packages/nvidia/cublas/lib:/home/filip/projects/whisper-rt/venv_faster/lib/python3.11/site-packages/nvidia/cudnn/lib"
import argparse
import shutil
import subprocess
import threading
import time
from pathlib import Path

import numpy as np
import pynput
import pyperclip
import sounddevice as sd

# ! you can change this rec_key value
rec_key = pynput.keyboard.Key.ctrl_r

whisper_samplerate = 16000  # sampling rate that whisper uses
recording_samplerate = 48000  # multiple of whisper_samplerate, widely supported

controller = pynput.keyboard.Controller()

# %% parse args
parser = argparse.ArgumentParser()
parser.add_argument("engine", choices=["local", "remote"])
parser.add_argument("language", nargs="?", default=None)
parser.add_argument("--no-type-using-clipboard", action="store_true")
# a command to be run after model load
parser.add_argument("--on-callback", type=str, default=None)
# turn off automatically after this many seconds of inactivity
parser.add_argument("--auto-off-time", type=int, default=None)
# which whisper model to use
parser.add_argument("--model", type=str, default="large-v3")
# ydotool socket path for Wayland
parser.add_argument("--ydotool-socket", type=str)
args = parser.parse_args()

# if the transcription ends with one of these words, enter is pressed afterwards
command_words = ["engage", "kurde", "kurda", "command"]

# check if ydotool is installed
use_ydotool = shutil.which("ydotool") is not None
# check if on wayland
on_wayland = os.environ.get("WAYLAND_DISPLAY") is not None
if on_wayland and not use_ydotool:
    raise ValueError("On Wayland, ydotool is required. Please install it using your package manager.")

# %% local or remote
if args.engine == "local":
    from faster_whisper import WhisperModel

    model = WhisperModel(args.model, device="cuda", compute_type="float16")
    # int8 is said to have worse accuracy and be slower
elif args.engine == "remote":
    import soundfile
    from openai import OpenAI

    openai_token = Path("~/.config/openai.token").expanduser().read_text().strip()
    client = OpenAI(api_key=openai_token)
else:
    raise ValueError("Specify whether to use local or remote engine")

if args.on_callback is not None:
    subprocess.run(args.on_callback, shell=True)


# %%
def get_text_local(audio, context=None):
    segments, info = model.transcribe(
        audio, beam_size=5, language=args.language, initial_prompt=context
    )
    segments = list(segments)
    text = " ".join([segment.text.strip() for segment in segments])
    return text


def get_text_remote(audio, context=None):
    tmp_audio_filename = "tmp.wav"
    soundfile.write(tmp_audio_filename, audio, whisper_samplerate, format="wav")
    # print(time.time())
    api_response = client.audio.transcriptions.create(
        model="whisper-1",
        file=open(tmp_audio_filename, "rb"),
        language=args.language,
        prompt=context,
    )
    # print(time.time())
    return api_response.text


env = os.environ.copy()
if args.ydotool_socket is not None:
    env["YDOTOOL_SOCKET"] = args.ydotool_socket


def type_using_clipboard(text):
    if use_ydotool:
        # use ydotool for typing on Wayland
        subprocess.run(["ydotool", "type", "--key-delay=0", "--key-hold=0", text], env=env)
    else:
        # use pynput to type ctrl+shift+v
        pyperclip.copy(text)
        controller.press(pynput.keyboard.Key.ctrl_l)
        controller.press(pynput.keyboard.Key.shift_l)
        controller.press("v")
        controller.release("v")
        controller.release(pynput.keyboard.Key.shift_l)
        controller.release(pynput.keyboard.Key.ctrl_l)


# %%
rec_key_pressed = False
time_last_used = time.time()


def record_and_process():
    # ! record
    # while the key is pressed, record audio
    audio_chunks = []

    def audio_callback(indata, frames, time, status):
        if status:
            print("WARNING:", status)
        audio_chunks.append(indata.copy())

    stream = sd.InputStream(
        samplerate=recording_samplerate,
        channels=1,
        blocksize=256,
        callback=audio_callback,
    )
    stream.start()
    while rec_key_pressed:
        time.sleep(0.005)
    stream.stop()
    stream.close()
    recorded_audio = np.concatenate(audio_chunks)[:, 0]

    # ! check if not too short
    duration = len(recorded_audio) / recording_samplerate
    if duration <= 0.2:
        print("Recording too short, skipping")
        return

    # ! downsample
    # scipy resampling was much too slow (hundreds of ms)
    # so leave in only every 3rd sample, using numpy
    recorded_audio = recorded_audio[::3]
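    # (taking every 3rd sample works because 48000 / 3 == 16000, which is
    # exactly whisper_samplerate; plain slicing skips the anti-aliasing filter
    # a real resampler would apply - if that ever causes artifacts, a filtered
    # alternative could be scipy's polyphase resampler, e.g.:
    #     from scipy.signal import resample_poly
    #     recorded_audio = resample_poly(recorded_audio, up=1, down=3)
    # though scipy is not in the dependencies, so this is just a sketch)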

    # ! get context
    # if not args.no_grab_context:
    #     context = get_context()
    #     # limit the length of the context
    #     context = context[-args.context_limit_chars :]
    context = None

    # ! transcribe
    if args.engine == "local":
        text = get_text_local(recorded_audio, context)
    elif args.engine == "remote":
        text = get_text_remote(recorded_audio, context)
    print(text)

    # ! check if triggered unintentionally (hack)
    # whisper tends to hallucinate phrases like these on (near) silence
    if text.strip(" .,!?").lower() in [
        "",
        "you",
        "bye",
        "thank you",
        "thank you very much",
    ]:
        print("You triggered unintentionally, skipping")
        return

    # # ! check if it ends with a command word
    # words = text.split(" ")
    # use_command = False
    # if words and any(cmd_word in words[-1].lower() for cmd_word in command_words):
    #     # last word was a command word
    #     use_command = True
    #     text = " ".join(words[:-1])

    # ! check if it ends with a command word
    use_command = False
    if any(cmd_word in text[-12:].lower() for cmd_word in command_words):
        # the text ends with a command word
        use_command = True

    # ! type that text
    text = text + " "
    if not args.no_type_using_clipboard:
        type_using_clipboard(text)
    else:
        if use_ydotool:
            subprocess.run(["ydotool", "type", "--key-delay=0", "--key-hold=0", text], env=env)
        else:
            controller.type(text)

    # ! use command
    if use_command:
        if use_ydotool:
            subprocess.run(["ydotool", "key", "28:1", "28:0"], env=env)  # 28 is the enter key
        else:
            controller.press(pynput.keyboard.Key.enter)
            controller.release(pynput.keyboard.Key.enter)


def on_press(key):
    global rec_key_pressed
    # print("pressed", key)
    if key == rec_key:
        rec_key_pressed = True

        # start recording in a new thread
        t = threading.Thread(target=record_and_process)
        t.start()


def on_release(key):
    global rec_key_pressed, time_last_used
    # print("released", key)
    if key == rec_key:
        rec_key_pressed = False
        time_last_used = time.time()


# %%
if args.language is not None:
    print(f"Using language: {args.language}")
with pynput.keyboard.Listener(on_press=on_press, on_release=on_release) as listener:
    print(f"Press {rec_key} to start recording")
    try:
        # listener.join()
        while listener.is_alive():
            if (
                args.auto_off_time is not None
                and time.time() - time_last_used > args.auto_off_time
            ):
                print("Auto off")
                break
            time.sleep(1)
    except KeyboardInterrupt:
        print("\nExiting...")

# %% play around with getting window titles
# # requires pip install python-xlib and I think xorg stuff
# # on wayland it fails for many windows (e.g. terminal, dolphin)
# # on x11 it works
# from Xlib import display


# def get_window_class():
#     d = display.Display()
#     window_id = d.get_input_focus().focus.id
#     window = d.create_resource_object("window", window_id)
#     return window.get_wm_class()[0]


# def get_context():
#     # use pynput to type ctrl+shift+home, then ctrl+c, then right arrow
#     # first clear the clipboard, in case getting the context fails
#     pyperclip.copy("")
#     # ctrl+shift+home
#     controller.press(pynput.keyboard.Key.ctrl_l)
#     controller.press(pynput.keyboard.Key.shift_l)
#     controller.press(pynput.keyboard.Key.home)
#     controller.release(pynput.keyboard.Key.home)
#     controller.release(pynput.keyboard.Key.shift_l)
#     controller.release(pynput.keyboard.Key.ctrl_l)
#     # ctrl+c
#     controller.press(pynput.keyboard.Key.ctrl_l)
#     controller.press("c")
#     controller.release("c")
#     controller.release(pynput.keyboard.Key.ctrl_l)
#     # right arrow
#     controller.press(pynput.keyboard.Key.right)
#     controller.release(pynput.keyboard.Key.right)
#     # get clipboard
#     clipboard = pyperclip.paste()
#     if clipboard == "":
#         print("Warning: context is empty")
#     return clipboard

# - [ ] test if prompting works ok locally
# - [ ] test if omitting the language actually hurts latency/accuracy that much - it would be useful to leave it blank

# - grabbing context everywhere except for some list of windows? - not very reliable, a lot of tinkering, platform specific, and not even that useful?
#   - for now only the terminal doesn't work
#   - in vscode, I just disabled C-S-Home; now I can dictate, but context won't be grabbed
# - incremental transcription? no moving window, just larger and larger windows - but that makes sense only with local, and even then it may be so slow that the lag is confusing; it also complicates a lot of things
# - on wayland, pynput doesn't detect ctrl_r (or any other keypresses) when in a terminal (tested on manjaro plasma)
--------------------------------------------------------------------------------