├── .python-version
├── .gitignore
├── dictation_remote.service
├── run_dictation_local.sh
├── legacy_auto_off
│   ├── run_dictation_auto_off.sh
│   ├── README.md
│   ├── engine.py
│   ├── dictation_local.service
│   ├── _lagacy_dictation_auto_off_pynput.py
│   └── dictation_auto_off.py
├── pyproject.toml
├── README.md
└── dictation.py

/.python-version:
--------------------------------------------------------------------------------
3.13
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.vscode
venv
.venv
.env
__pycache__
uv.lock
uv_local.lock
tmp.wav
whisper_simple_dictation.egg-info
--------------------------------------------------------------------------------
/dictation_remote.service:
--------------------------------------------------------------------------------
[Unit]
Description=Whisper Simple Dictation Service

[Service]
WorkingDirectory=%h/apps/whisper-simple-dictation
ExecStart=%h/apps/whisper-simple-dictation/.venv/bin/python dictation.py remote en
Restart=on-failure
RestartSec=5

[Install]
WantedBy=default.target
--------------------------------------------------------------------------------
/run_dictation_local.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
script_path="$(realpath "$0")"
script_dir="$(dirname "$script_path")"
cd "$script_dir"

source .venv/bin/activate
export LD_LIBRARY_PATH="$VIRTUAL_ENV/lib/python3.13/site-packages/nvidia/cublas/lib:$VIRTUAL_ENV/lib/python3.13/site-packages/nvidia/cudnn/lib"
.venv/bin/python3 dictation.py local "$@"
--------------------------------------------------------------------------------
/legacy_auto_off/run_dictation_auto_off.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
script_path="$(realpath "$0")"
script_dir="$(dirname "$script_path")"
cd "$script_dir"
source .venv/bin/activate
export LD_LIBRARY_PATH="$(python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))')"
.venv/bin/python3 dictation_auto_off.py "$@"
--------------------------------------------------------------------------------
/legacy_auto_off/README.md:
--------------------------------------------------------------------------------
This folder contains a script that unloads the local model after it has been idle for some time, and loads it back on demand.

---------

When running locally, `dictation_auto_off.py` uses evdev, which only works on Linux. On Windows and Mac you can try `_lagacy_dictation_auto_off_pynput.py`, which uses pynput instead. (Modify `run_dictation_auto_off.sh` accordingly.)

Pynput works on all major operating systems, but not on Wayland.

Evdev works with both Wayland and X11 and supports special characters, but only works on Linux.

Ydotool works with both Wayland and X11, but does not support special characters. Maybe it would be possible to work around this by making ydotool trigger a paste instead.
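A minimal sketch of that idea (not part of this repo; `paste_via_ydotool` is a hypothetical name, and 29/42/47 are the codes of left ctrl, left shift and v from `linux/input-event-codes.h`):

```
import subprocess

import pyperclip


def paste_via_ydotool(text):
    # put the text on the clipboard, then have ydotool press ctrl+shift+v
    # (press each key, then release in reverse order)
    pyperclip.copy(text)
    subprocess.run(["ydotool", "key", "29:1", "42:1", "47:1", "47:0", "42:0", "29:0"])
```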
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[project]
name = "whisper-simple-dictation"
version = "0.1.0"
description = "Whisper dictation with OpenAI API"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "numpy",
    # keyboard and sound support
    "pynput",
    "sounddevice",
    # clipboard support
    "pyperclip",
    # for remote:
    "openai",
    "soundfile",
]

[project.optional-dependencies]
local = [
    "faster-whisper",
    # for faster-whisper, requiring cuda 12
    "nvidia-cublas-cu12",
    "nvidia-cudnn-cu12~=8.9",
    # "evdev",
    # "flask",
]

[tool.setuptools]
py-modules = ["dictation"]
--------------------------------------------------------------------------------
/legacy_auto_off/engine.py:
--------------------------------------------------------------------------------
import sys

import numpy as np
from flask import Flask, request, jsonify
from faster_whisper import WhisperModel

app = Flask(__name__)

model_name = sys.argv[1]
print(f"Using model: {model_name}")

model = WhisperModel(model_name, device="cuda", compute_type="float16")


def get_text_local(audio, context=None):
    # note: the language is hard-coded to English here; the language argument
    # of the dictation scripts is not forwarded to this server
    segments, info = model.transcribe(audio, beam_size=5, language="en", initial_prompt=context)
    segments = list(segments)
    text = " ".join([segment.text.strip() for segment in segments])
    return text


@app.route("/transcribe", methods=["POST"])
def transcribe():
    data = request.get_json()
    if "audio" not in data:
        return jsonify({"error": "No audio data provided"}), 400

    context = data.get("context", None)

    try:
        audio_array = np.array(data["audio"])
        text = get_text_local(audio_array, context)
        return jsonify({"text": text}), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500


if __name__ == "__main__":
    app.run(debug=True, host="0.0.0.0", port=5900, use_reloader=False)
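
# a quick manual sanity check of this endpoint, with the server running
# (not part of the repo's flow; one second of 16 kHz silence as the audio):
#
#   import requests
#   payload = {"audio": [0.0] * 16000, "context": None}
#   r = requests.post("http://127.0.0.1:5900/transcribe", json=payload)
#   print(r.status_code, r.json())  # -> {"text": ...} or {"error": ...}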
--------------------------------------------------------------------------------
/legacy_auto_off/dictation_local.service:
--------------------------------------------------------------------------------
[Unit]
Description=Whisper Simple Dictation Service

[Service]
Type=simple
Environment="WAYLAND_DISPLAY=wayland-0"
Environment="DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus"
; the sudo tee ... command serves to indicate that the engine is running, using an LED
; feel free to tweak it or remove it
; if you keep it, add this to the sudoers file:
; ALL ALL=NOPASSWD: /usr/bin/tee /sys/class/leds/platform\:\:mute/brightness
ExecStart=/bin/bash /home/USERNAME/apps/whisper-simple-dictation/run_dictation_auto_off.sh en --on-callback 'echo 1 | sudo tee /sys/class/leds/platform::mute/brightness' --off-callback 'echo 0 | sudo tee /sys/class/leds/platform::mute/brightness' --auto-off-time 120 --model medium
Restart=no

[Install]
WantedBy=default.target

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; alternative ExecStart, in case you don't want to add your user to the input group
; but then, remember to add this rule to your sudoers file:
; ALL ALL=(ALL) NOPASSWD: SETENV: /bin/bash /home/USERNAME/apps/whisper-simple-dictation/run_dictation_auto_off.sh *
; you will need to tweak the --recording-device name - sd.query_devices() will list all available devices
; ExecStart=/usr/bin/sudo -E /bin/bash /home/USERNAME/apps/whisper-simple-dictation/run_dictation_auto_off.sh en --on-callback 'echo 1 | sudo tee /sys/class/leds/platform::mute/brightness' --off-callback 'echo 0 | sudo tee /sys/class/leds/platform::mute/brightness' --auto-off-time 120 --model medium --recording-device "sof-hda-dsp: - (hw:1,6)"
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Whisper simple dictation

- press a key to start recording
- release it to stop recording
- Whisper transcribes it
- the text is typed with simulated keypresses

You can either run Whisper locally or through OpenAI's API.

For local execution you need a CUDA device with at least 4 GB of VRAM. It uses Whisper `large-v3`, run with faster-whisper.

With remote execution, OpenAI's API adds about a 1 second delay (as of Jan 2024), while local transcription is near instant.


## Installation

```
git clone https://github.com/filyp/whisper-simple-dictation.git
cd whisper-simple-dictation
python3 -m venv .venv --copies
.venv/bin/pip install -e .
```

If using Wayland, you also need to install ydotool and enable ydotoold. (The script tries to use ydotool, and if it's not installed, it falls back to pynput.)

### Remote
If you want to run remotely, run:
```
echo sk-... > ~/.config/openai.token
```
where `sk-...` is your OpenAI API token.

### Local
If you want to run locally, additionally run:
```
.venv/bin/pip install -e ".[local]"
sudo usermod -aG input __YOUR_USER_NAME__
```

Then log out and back in.

(If you're using Wayland and don't want to add your user to the input group for security reasons, see the instructions in `dictation_local.service`. On X11 it doesn't matter - devices are exposed anyway.)


## Running

To run remotely:
```
.venv/bin/python3 dictation.py remote en
```

To run locally:
```
bash run_dictation_local.sh en
```

Ctrl-c to stop.

By default the record key is *right* ctrl. You can change it in `dictation.py`, but it must be a modifier key (shift, alt, ...).
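For example, to record while holding right shift instead, change this line near the top of `dictation.py`:
```
rec_key = pynput.keyboard.Key.shift_r  # default: pynput.keyboard.Key.ctrl_r
```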

To set up a service that will run whisper-simple-dictation, take a look at `dictation_remote.service`. (Typically you would copy it to `~/.config/systemd/user/` and run `systemctl --user enable --now dictation_remote.service`.)

(Note that the way we send text is by copying it to the clipboard and then sending Ctrl+Shift+V. That's because typing the text normally is hard to get right, with all the special characters. If using ydotool, the text is typed directly, because pasting is not implemented.)

## Options

- **Language.** The first argument (`en` in the examples above) sets the language. You can also omit it to detect the language automatically, but that can worsen latency and accuracy.
- **Choosing a model.** The default is `large-v3`. You can also pass e.g. `--model medium` or `--model small`.

## Other approaches

At first I wanted real-time dictation, similar to [nerd-dictation](https://github.com/ideasman42/nerd-dictation). There's [whisper_streaming](https://github.com/ufal/whisper_streaming), which implements something similar: continuous transcription using Whisper. But it has a 3.3 second time lag, and because it needs to run Whisper on many overlapping time windows, it's more compute-heavy. Also, those transcribed time windows are sometimes merged incorrectly. That may be good enough for creating captions, but not really for dictation.

With some clever engineering and a lot of compute, maybe that time lag could be brought under a second. But I found that reading what you said with a few hundred milliseconds of delay is very confusing, similar to hearing your own voice delayed. So for now, I think the best and most reliable approach is the one used here. This may change with future neural nets, with architectures other than whisper, aimed at real-time transcription.

There's also [whisper-writer](https://github.com/savbell/whisper-writer), which is more mature, but doesn't (as of Jan 2024) have push-to-talk, which I find more pleasant to use.
--------------------------------------------------------------------------------
/legacy_auto_off/_lagacy_dictation_auto_off_pynput.py:
--------------------------------------------------------------------------------
# %%
# import os
# os.environ['LD_LIBRARY_PATH'] = "/home/filip/projects/whisper-rt/venv_faster/lib/python3.11/site-packages/nvidia/cublas/lib:/home/filip/projects/whisper-rt/venv_faster/lib/python3.11/site-packages/nvidia/cudnn/lib"
import argparse
import subprocess
import threading
import time

import numpy as np
import pynput
import pyperclip
import requests
import sounddevice as sd

# from dictation import get_context, type_using_clipboard

# ! you can change this rec_key value
rec_key = pynput.keyboard.Key.ctrl_r

whisper_samplerate = 16000  # sampling rate that whisper uses
recording_samplerate = 48000  # multiple of whisper_samplerate, widely supported

server_url = "http://0.0.0.0:5900"

controller = pynput.keyboard.Controller()

# %% parse args
parser = argparse.ArgumentParser()
parser.add_argument("language", nargs="?", default=None)
parser.add_argument("--no-grab-context", action="store_true")
parser.add_argument("--no-type-using-clipboard", action="store_true")
parser.add_argument("--context-limit-chars", type=int, default=300)
# a command to be run after model load
parser.add_argument("--on-callback", type=str, default=None)
# a command to be run after model unload
parser.add_argument("--off-callback", type=str, default=None)
# turn off automatically after this many seconds of inactivity
parser.add_argument("--auto-off-time", type=int, default=None)
# which whisper model to use
parser.add_argument("--model", type=str, default="large-v3")
args = parser.parse_args()


# start a server process
engine = subprocess.Popen(["python", "engine.py", args.model])
if args.on_callback is not None:
    subprocess.run(args.on_callback, shell=True)


# %%

def get_context():
    # use pynput to type ctrl+shift+home, then ctrl+c, then right arrow
    # first clear the clipboard, in case getting the context fails
    pyperclip.copy("")
    # ctrl+shift+home
    controller.press(pynput.keyboard.Key.ctrl_l)
    controller.press(pynput.keyboard.Key.shift_l)
    controller.press(pynput.keyboard.Key.home)
    controller.release(pynput.keyboard.Key.home)
    controller.release(pynput.keyboard.Key.shift_l)
    controller.release(pynput.keyboard.Key.ctrl_l)
    # ctrl+c
    controller.press(pynput.keyboard.Key.ctrl_l)
    controller.press("c")
    controller.release("c")
    controller.release(pynput.keyboard.Key.ctrl_l)
    # right arrow
    controller.press(pynput.keyboard.Key.right)
    controller.release(pynput.keyboard.Key.right)
    # get clipboard
    clipboard = pyperclip.paste()
    if clipboard == "":
        print("Warning: context is empty")
    return clipboard


def type_using_clipboard(text):
    # use pynput to type ctrl+shift+v
    pyperclip.copy(text)
    controller.press(pynput.keyboard.Key.ctrl_l)
    controller.press(pynput.keyboard.Key.shift_l)
    controller.press("v")
    controller.release("v")
    controller.release(pynput.keyboard.Key.shift_l)
    controller.release(pynput.keyboard.Key.ctrl_l)


# %%
rec_key_pressed = False
time_last_used = time.time()


def record_and_process():
    global engine

    # ! start the engine if not running
    if engine.poll() is not None:
        # the old process has exited, so start a fresh one
        print("Starting engine")
        engine = subprocess.Popen(["python", "engine.py", args.model])
        if args.on_callback is not None:
            subprocess.run(args.on_callback, shell=True)
    # ! record
    # while the key is pressed, record audio
    audio_chunks = []

    def audio_callback(indata, frames, time, status):
        if status:
            print("WARNING:", status)
        audio_chunks.append(indata.copy())

    stream = sd.InputStream(
        samplerate=recording_samplerate,
        channels=1,
        blocksize=256,
        callback=audio_callback,
    )
    stream.start()
    while rec_key_pressed:
        time.sleep(0.005)
    stream.stop()
    stream.close()
    recorded_audio = np.concatenate(audio_chunks)[:, 0]

    # ! check if not too short
    duration = len(recorded_audio) / recording_samplerate
    if duration <= 0.1:
        print("Recording too short, skipping")
        return

    # ! downsample
    # scipy resampling was much too slow (hundreds of ms)
    # so leave in only every 3rd sample, using numpy
    recorded_audio = recorded_audio[::3]

    # ! get context
    if not args.no_grab_context:
        context = get_context()
        # limit the length of the context
        context = context[-args.context_limit_chars :]
    else:
        context = None
    # print(context)

    # ! transcribe
    payload = {
        "audio": recorded_audio.tolist(),
        "context": context,
    }

    # note that the server can still be initializing, so retry the post until it works
    while True:
        try:
            response = requests.post(server_url + "/transcribe", json=payload)
            break
        except requests.exceptions.ConnectionError:
            time.sleep(0.1)
            print("Connection error, retrying...")

    response_data = response.json()

    if response.status_code == 200:
        text = response_data.get("text", "")
        print(text)
    else:
        print(f"Error transcribing audio: {response_data.get('error', 'Unknown error')}")
        return
    # ! type that text
    text = text + " "
    if not args.no_type_using_clipboard:
        type_using_clipboard(text)
    else:
        controller.type(text)
        # subprocess.run(["ydotool", "type", "--key-delay=0", "--key-hold=0", text])
        # note: ydotool on x11 correctly outputs polish chars and types in the terminal


def on_press(key):
    global rec_key_pressed
    # print("pressed", key)
    if key == rec_key:
        rec_key_pressed = True

        # start recording in a new thread
        t = threading.Thread(target=record_and_process)
        t.start()


def on_release(key):
    global rec_key_pressed, time_last_used
    # print("released", key)
    if key == rec_key:
        rec_key_pressed = False
        time_last_used = time.time()


# %%
if args.language is not None:
    print(f"Using language: {args.language}")
with pynput.keyboard.Listener(on_press=on_press, on_release=on_release) as listener:
    print(f"Press {rec_key} to start recording")
    try:
        # listener.join()
        while listener.is_alive():
            if (
                args.auto_off_time is not None
                and time.time() - time_last_used > args.auto_off_time
                and rec_key_pressed is False
                and engine.poll() is None
            ):
                print("Auto off")
                # shut down the server
                engine.terminate()

                if args.off_callback is not None:
                    subprocess.run(args.off_callback, shell=True)

            time.sleep(1)
    except KeyboardInterrupt:
        if args.off_callback is not None:
            subprocess.run(args.off_callback, shell=True)
        print("\nExiting...")
--------------------------------------------------------------------------------
/legacy_auto_off/dictation_auto_off.py:
--------------------------------------------------------------------------------
# %%
# import os
# os.environ['LD_LIBRARY_PATH'] = "/home/filip/projects/whisper-rt/venv_faster/lib/python3.11/site-packages/nvidia/cublas/lib:/home/filip/projects/whisper-rt/venv_faster/lib/python3.11/site-packages/nvidia/cudnn/lib"
import argparse
import subprocess
import threading
import time

import numpy as np
import pyperclip
import requests
import sounddevice as sd

# from dictation import get_context, type_using_clipboard

import evdev
from evdev import UInput, ecodes as e
from select import select

# ! you can change this rec_key value
rec_key = "KEY_RIGHTCTRL"

whisper_samplerate = 16000  # sampling rate that whisper uses
recording_samplerate = 48000  # multiple of whisper_samplerate, widely supported

server_url = "http://0.0.0.0:5900"

# keep only the input devices which report having the rec key
devices = [evdev.InputDevice(fn) for fn in evdev.list_devices()]
keyboards = [d for d in devices if rec_key in str(d.capabilities(verbose=True))]

writer = UInput()
time.sleep(1)

# %% parse args
parser = argparse.ArgumentParser()
parser.add_argument("language", nargs="?", default=None)
# a command to be run after model load
parser.add_argument("--on-callback", type=str, default=None)
# a command to be run after model unload
parser.add_argument("--off-callback", type=str, default=None)
# turn off automatically after this many seconds of inactivity
parser.add_argument("--auto-off-time", type=int, default=None)
# which whisper model to use
parser.add_argument("--model", type=str, default="large-v3")
# name of the recording device to use, as returned by sd.query_devices()
parser.add_argument("--recording-device", type=str, default=None)
args = parser.parse_args()

if args.recording_device is None:
    device_index = None
else:
    device_info = sd.query_devices()
    device_index = None
    for device in device_info:
        if args.recording_device in device["name"] and device["max_input_channels"] > 0:
            device_index = device["index"]
            break
    assert device_index is not None, "Couldn't find specified sound device"

# mock engine process, so that the first engine.poll() check below works
engine = subprocess.Popen(["echo", "mock engine"])

# %%

# def get_context():
#     # use pynput to type ctrl+shift+home, then ctrl+c, then right arrow
#     # first clear the clipboard, in case getting the context fails
#     pyperclip.copy("")
#     # ctrl+shift+home
#     controller.press(pynput.keyboard.Key.ctrl_l)
#     controller.press(pynput.keyboard.Key.shift_l)
#     controller.press(pynput.keyboard.Key.home)
#     controller.release(pynput.keyboard.Key.home)
#     controller.release(pynput.keyboard.Key.shift_l)
#     controller.release(pynput.keyboard.Key.ctrl_l)
#     # ctrl+c
#     controller.press(pynput.keyboard.Key.ctrl_l)
#     controller.press("c")
#     controller.release("c")
#     controller.release(pynput.keyboard.Key.ctrl_l)
#     # right arrow
#     controller.press(pynput.keyboard.Key.right)
#     controller.release(pynput.keyboard.Key.right)
#     # get clipboard
#     clipboard = pyperclip.paste()
#     if clipboard == "":
#         print("Warning: context is empty")
#     return clipboard


def type_using_clipboard(text):
    pyperclip.copy(text)
    # use evdev to type ctrl+shift+v
    time.sleep(0.01)
    writer.write(e.EV_KEY, e.KEY_LEFTCTRL, 1)
    writer.write(e.EV_KEY, e.KEY_LEFTSHIFT, 1)
    writer.write(e.EV_KEY, e.KEY_V, 1)
    writer.write(e.EV_KEY, e.KEY_V, 0)
    writer.write(e.EV_KEY, e.KEY_LEFTSHIFT, 0)
    writer.write(e.EV_KEY, e.KEY_LEFTCTRL, 0)
    writer.syn()


# %%
rec_key_pressed = False
time_last_used = time.time()


def record_and_process():
    global engine

    # ! start the engine if not running
    if engine.poll() is not None:
        # the old process has exited, so start a fresh one
        print("Starting engine")
        engine = subprocess.Popen(["python", "engine.py", args.model])
        if args.on_callback is not None:
            subprocess.run(args.on_callback, shell=True)

    # ! record
    # while the key is pressed, record audio
    audio_chunks = []

    def audio_callback(indata, frames, time, status):
        if status:
            print("WARNING:", status)
        audio_chunks.append(indata.copy())

    stream = sd.InputStream(
        samplerate=recording_samplerate,
        channels=1,
        blocksize=256,
        callback=audio_callback,
        device=device_index,
    )
    stream.start()
    while rec_key_pressed:
        time.sleep(0.005)
    stream.stop()
    stream.close()
    recorded_audio = np.concatenate(audio_chunks)[:, 0]

    # ! check if not too short
    duration = len(recorded_audio) / recording_samplerate
    if duration <= 0.1:
        print("Recording too short, skipping")
        return

    # ! downsample
    # scipy resampling was much too slow (hundreds of ms)
    # so leave in only every 3rd sample, using numpy
    recorded_audio = recorded_audio[::3]

    # ! get context
    # if not args.no_grab_context:
    #     context = get_context()
    #     # limit the length of the context
    #     context = context[-args.context_limit_chars :]
    context = None

    # ! transcribe
    payload = {"audio": recorded_audio.tolist(), "context": context}

    # note that the server can still be initializing, so retry the post until it works
    while True:
        try:
            response = requests.post(server_url + "/transcribe", json=payload)
            break
        except requests.exceptions.ConnectionError:
            time.sleep(0.1)
            print("Connection error, retrying...")

    response_data = response.json()

    if response.status_code == 200:
        text = response_data.get("text", "")
        print(text)
    else:
        print(
            f"Error transcribing audio: {response_data.get('error', 'Unknown error')}"
        )
        return
    # ! type that text
    text = text + " "
    type_using_clipboard(text)
    # print(text)


# %%

# read any keypress
try:
    while True:
        r, _, _ = select(keyboards, [], [], 1)
        for event in (event for dev in r for event in dev.read()):
            # check if it's the rec_key
            if event.code != evdev.ecodes.ecodes[rec_key]:
                continue
            if event.value == 1:
                rec_key_pressed = True
                # start recording in a new thread
                t = threading.Thread(target=record_and_process)
                t.start()

            elif event.value == 0:
                rec_key_pressed = False
                time_last_used = time.time()

        # check if we should shut down the engine
        if (
            engine.poll() is None
            and args.auto_off_time is not None
            and time.time() - time_last_used > args.auto_off_time
            and rec_key_pressed is False
        ):
            print("Auto off")
            # shut down the server
            engine.terminate()

            if args.off_callback is not None:
                subprocess.run(args.off_callback, shell=True)

except KeyboardInterrupt:
    print("\nExiting...")
    engine.terminate()
    writer.close()

    if args.off_callback is not None:
        subprocess.run(args.off_callback, shell=True)
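
# to find the exact name to pass as --recording-device, you can list the
# input-capable devices like this (a throwaway snippet, not used above):
#
#   import sounddevice as sd
#   for dev in sd.query_devices():
#       if dev["max_input_channels"] > 0:
#           print(dev["index"], dev["name"])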
--------------------------------------------------------------------------------
/dictation.py:
--------------------------------------------------------------------------------
# %%
import os

# os.environ['LD_LIBRARY_PATH'] = "/home/filip/projects/whisper-rt/venv_faster/lib/python3.11/site-packages/nvidia/cublas/lib:/home/filip/projects/whisper-rt/venv_faster/lib/python3.11/site-packages/nvidia/cudnn/lib"
import argparse
import shutil
import subprocess
import threading
import time
from pathlib import Path

import numpy as np
import pynput
import pyperclip
import sounddevice as sd

# ! you can change this rec_key value
rec_key = pynput.keyboard.Key.ctrl_r

whisper_samplerate = 16000  # sampling rate that whisper uses
recording_samplerate = 48000  # multiple of whisper_samplerate, widely supported

controller = pynput.keyboard.Controller()

# %% parse args
parser = argparse.ArgumentParser()
parser.add_argument("engine", choices=["local", "remote"])
parser.add_argument("language", nargs="?", default=None)
parser.add_argument("--no-type-using-clipboard", action="store_true")
# a command to be run after model load
parser.add_argument("--on-callback", type=str, default=None)
# turn off automatically after this many seconds of inactivity
parser.add_argument("--auto-off-time", type=int, default=None)
# which whisper model to use
parser.add_argument("--model", type=str, default="large-v3")
# ydotool socket path for Wayland
parser.add_argument("--ydotool-socket", type=str)
args = parser.parse_args()

# if the transcription ends with one of these words, enter is pressed afterwards
command_words = ["engage", "kurde", "kurda", "command"]

# check if ydotool is installed
use_ydotool = shutil.which("ydotool") is not None
# check if on wayland
on_wayland = os.environ.get("WAYLAND_DISPLAY") is not None
if on_wayland and not use_ydotool:
    raise ValueError("On Wayland, ydotool is required. Please install it using your package manager.")

# %% local or remote
if args.engine == "local":
    from faster_whisper import WhisperModel

    model = WhisperModel(args.model, device="cuda", compute_type="float16")
    # int8 is said to have worse accuracy and be slower
elif args.engine == "remote":
    import soundfile
    from openai import OpenAI

    openai_token = Path("~/.config/openai.token").expanduser().read_text().strip()
    client = OpenAI(api_key=openai_token)
else:
    raise ValueError("Specify whether to use local or remote engine")

if args.on_callback is not None:
    subprocess.run(args.on_callback, shell=True)


# %%
def get_text_local(audio, context=None):
    segments, info = model.transcribe(
        audio, beam_size=5, language=args.language, initial_prompt=context
    )
    segments = list(segments)
    text = " ".join([segment.text.strip() for segment in segments])
    return text


def get_text_remote(audio, context=None):
    tmp_audio_filename = "tmp.wav"
    soundfile.write(tmp_audio_filename, audio, whisper_samplerate, format="wav")
    # print(time.time())
    api_response = client.audio.transcriptions.create(
        model="whisper-1",
        file=open(tmp_audio_filename, "rb"),
        language=args.language,
        prompt=context,
    )
    # print(time.time())
    return api_response.text


env = os.environ.copy()
if args.ydotool_socket is not None:
    env["YDOTOOL_SOCKET"] = args.ydotool_socket


def type_using_clipboard(text):
    if use_ydotool:
        # use ydotool for typing on Wayland
        subprocess.run(["ydotool", "type", "--key-delay=0", "--key-hold=0", text], env=env)
    else:
        # use pynput to type ctrl+shift+v
        pyperclip.copy(text)
        controller.press(pynput.keyboard.Key.ctrl_l)
        controller.press(pynput.keyboard.Key.shift_l)
        controller.press("v")
        controller.release("v")
        controller.release(pynput.keyboard.Key.shift_l)
        controller.release(pynput.keyboard.Key.ctrl_l)


# %%
rec_key_pressed = False
time_last_used = time.time()


def record_and_process():
    # ! record
    # while the key is pressed, record audio
    audio_chunks = []

    def audio_callback(indata, frames, time, status):
        if status:
            print("WARNING:", status)
        audio_chunks.append(indata.copy())

    stream = sd.InputStream(
        samplerate=recording_samplerate,
        channels=1,
        blocksize=256,
        callback=audio_callback,
    )
    stream.start()
    while rec_key_pressed:
        time.sleep(0.005)
    stream.stop()
    stream.close()
    recorded_audio = np.concatenate(audio_chunks)[:, 0]

    # ! check if not too short
    duration = len(recorded_audio) / recording_samplerate
    if duration <= 0.2:
        print("Recording too short, skipping")
        return

    # ! downsample
    # scipy resampling was much too slow (hundreds of ms)
    # so leave in only every 3rd sample, using numpy
    recorded_audio = recorded_audio[::3]
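    # (taking every 3rd sample works because 48000 / 3 == 16000, which is
    # exactly whisper_samplerate; plain slicing skips the anti-aliasing filter
    # a real resampler would apply - if that ever causes artifacts, a filtered
    # alternative could be scipy's polyphase resampler, e.g.:
    #     from scipy.signal import resample_poly
    #     recorded_audio = resample_poly(recorded_audio, up=1, down=3)
    # though scipy is not in the dependencies, so this is just a sketch)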

    # ! get context
    # if not args.no_grab_context:
    #     context = get_context()
    #     # limit the length of the context
    #     context = context[-args.context_limit_chars :]
    context = None

    # ! transcribe
    if args.engine == "local":
        text = get_text_local(recorded_audio, context)
    elif args.engine == "remote":
        text = get_text_remote(recorded_audio, context)
    print(text)

    # ! check if triggered unintentionally (hack)
    # whisper tends to hallucinate phrases like these on (near) silence
    if text.strip(" .,!?").lower() in [
        "",
        "you",
        "bye",
        "thank you",
        "thank you very much",
    ]:
        print("You triggered unintentionally, skipping")
        return

    # # ! check if it ends with a command word
    # words = text.split(" ")
    # use_command = False
    # if words and any(cmd_word in words[-1].lower() for cmd_word in command_words):
    #     # last word was a command word
    #     use_command = True
    #     text = " ".join(words[:-1])

    # ! check if it ends with a command word
    use_command = False
    if any(cmd_word in text[-12:].lower() for cmd_word in command_words):
        # the text ends with a command word
        use_command = True

    # ! type that text
    text = text + " "
    if not args.no_type_using_clipboard:
        type_using_clipboard(text)
    else:
        if use_ydotool:
            subprocess.run(["ydotool", "type", "--key-delay=0", "--key-hold=0", text], env=env)
        else:
            controller.type(text)

    # ! use command
    if use_command:
        if use_ydotool:
            subprocess.run(["ydotool", "key", "28:1", "28:0"], env=env)  # 28 is the enter key
        else:
            controller.press(pynput.keyboard.Key.enter)
            controller.release(pynput.keyboard.Key.enter)


def on_press(key):
    global rec_key_pressed
    # print("pressed", key)
    if key == rec_key:
        rec_key_pressed = True

        # start recording in a new thread
        t = threading.Thread(target=record_and_process)
        t.start()


def on_release(key):
    global rec_key_pressed, time_last_used
    # print("released", key)
    if key == rec_key:
        rec_key_pressed = False
        time_last_used = time.time()


# %%
if args.language is not None:
    print(f"Using language: {args.language}")
with pynput.keyboard.Listener(on_press=on_press, on_release=on_release) as listener:
    print(f"Press {rec_key} to start recording")
    try:
        # listener.join()
        while listener.is_alive():
            if (
                args.auto_off_time is not None
                and time.time() - time_last_used > args.auto_off_time
            ):
                print("Auto off")
                break
            time.sleep(1)
    except KeyboardInterrupt:
        print("\nExiting...")

# %% play around with getting window titles
# # requires pip install python-xlib and I think xorg stuff
# # on wayland it fails for many windows (e.g. terminal, dolphin)
# # on x11 it works
# from Xlib import display


# def get_window_class():
#     d = display.Display()
#     window_id = d.get_input_focus().focus.id
#     window = d.create_resource_object("window", window_id)
#     return window.get_wm_class()[0]


# def get_context():
#     # use pynput to type ctrl+shift+home, then ctrl+c, then right arrow
#     # first clear the clipboard, in case getting the context fails
#     pyperclip.copy("")
#     # ctrl+shift+home
#     controller.press(pynput.keyboard.Key.ctrl_l)
#     controller.press(pynput.keyboard.Key.shift_l)
#     controller.press(pynput.keyboard.Key.home)
#     controller.release(pynput.keyboard.Key.home)
#     controller.release(pynput.keyboard.Key.shift_l)
#     controller.release(pynput.keyboard.Key.ctrl_l)
#     # ctrl+c
#     controller.press(pynput.keyboard.Key.ctrl_l)
#     controller.press("c")
#     controller.release("c")
#     controller.release(pynput.keyboard.Key.ctrl_l)
#     # right arrow
#     controller.press(pynput.keyboard.Key.right)
#     controller.release(pynput.keyboard.Key.right)
#     # get clipboard
#     clipboard = pyperclip.paste()
#     if clipboard == "":
#         print("Warning: context is empty")
#     return clipboard

# - [ ] test if prompting works ok locally
# - [ ] test if omitting the language actually hurts latency/accuracy that much - it would be useful to leave it blank

# - grabbing context everywhere except for some list of windows? - not very reliable, a lot of tinkering, platform specific, and not even that useful?
#   - for now only the terminal doesn't work
#   - in vscode, I just disabled C-S-Home; now I can dictate, but context won't be grabbed
# - incremental transcription? no moving window, just larger and larger windows - but that makes sense only with local, and even then it may be so slow that the lag is confusing; it also complicates a lot of things
# - on wayland, pynput doesn't detect ctrl_r (or any other keypresses) when in a terminal (tested on manjaro plasma)
--------------------------------------------------------------------------------