├── README.md
├── demo.gif
├── requirements.txt
└── transcribe_demo.py

/README.md:
--------------------------------------------------------------------------------
# Real Time Whisper Transcription

![Demo gif](demo.gif)

This is a demo of real-time speech-to-text with OpenAI's Whisper model. It works by continuously recording audio in a background thread and concatenating the raw bytes across multiple recordings.

To install the dependencies, simply run
```
pip install -r requirements.txt
```
in an environment of your choosing.

Whisper also requires the command-line tool [`ffmpeg`](https://ffmpeg.org/) to be installed on your system, which is available from most package managers:

```
# on Ubuntu or Debian
sudo apt update && sudo apt install ffmpeg

# on Arch Linux
sudo pacman -S ffmpeg

# on macOS using Homebrew (https://brew.sh/)
brew install ffmpeg

# on Windows using Chocolatey (https://chocolatey.org/)
choco install ffmpeg

# on Windows using Scoop (https://scoop.sh/)
scoop install ffmpeg
```
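Once the dependencies are installed, you can start transcribing from your default microphone by running the demo script. The flags below are the ones defined in `transcribe_demo.py`, shown with their default values; smaller models such as `tiny` or `base` trade accuracy for speed on machines without a GPU:

```
python transcribe_demo.py --model medium --energy_threshold 1000 --record_timeout 2 --phrase_timeout 3

# on Linux, list the available microphones first, then pick one by name:
python transcribe_demo.py --default_microphone list
```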
For more information on Whisper, please see https://github.com/openai/whisper

The code in this repository is public domain.
--------------------------------------------------------------------------------
/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davabase/whisper_real_time/bfc75c0dc0a9357ffb3d5795baa7fc2d2cada509/demo.gif
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
setuptools
pyaudio
SpeechRecognition
--extra-index-url https://download.pytorch.org/whl/cu116
torch
numpy
git+https://github.com/openai/whisper.git
--------------------------------------------------------------------------------
/transcribe_demo.py:
--------------------------------------------------------------------------------
#! python3.7

import argparse
import os
import numpy as np
import speech_recognition as sr
import whisper
import torch

from datetime import datetime, timedelta
from queue import Queue
from time import sleep
from sys import platform


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="medium", help="Model to use.",
                        choices=["tiny", "base", "small", "medium", "large"])
    parser.add_argument("--non_english", action='store_true',
                        help="Don't use the English model.")
    parser.add_argument("--energy_threshold", default=1000,
                        help="Energy level for the mic to detect.", type=int)
    parser.add_argument("--record_timeout", default=2,
                        help="Maximum length of each recorded chunk in seconds; "
                             "smaller values make the transcription feel more real time.", type=float)
    parser.add_argument("--phrase_timeout", default=3,
                        help="How many seconds of silence between recordings before we "
                             "consider it a new line in the transcription.", type=float)
    if 'linux' in platform:
        parser.add_argument("--default_microphone", default='pulse',
                            help="Default microphone name for SpeechRecognition. "
                                 "Run this with 'list' to view available microphones.", type=str)
    args = parser.parse_args()

    # The last time a recording was retrieved from the queue.
    phrase_time = None
    # Thread-safe queue for passing data from the threaded recording callback.
    data_queue = Queue()
    # Bytes object which holds the audio data for the current phrase.
    phrase_bytes = bytes()
    # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends.
    recorder = sr.Recognizer()
    recorder.energy_threshold = args.energy_threshold
    # Definitely do this: dynamic energy compensation lowers the energy threshold dramatically,
    # to the point where the SpeechRecognizer never stops recording.
    recorder.dynamic_energy_threshold = False

    # Important for Linux users.
    # Prevents a permanent application hang or crash caused by using the wrong microphone.
    if 'linux' in platform:
        mic_name = args.default_microphone
        if not mic_name or mic_name == 'list':
            print("Available microphone devices are: ")
            for index, name in enumerate(sr.Microphone.list_microphone_names()):
                print(f"Microphone with name \"{name}\" found")
            return
        else:
            for index, name in enumerate(sr.Microphone.list_microphone_names()):
                if mic_name in name:
                    source = sr.Microphone(sample_rate=16000, device_index=index)
                    break
            else:
                # for/else: no matching device was found; exit instead of crashing on an unbound `source` later.
                print(f"No microphone named \"{mic_name}\" was found.")
                return
    else:
        source = sr.Microphone(sample_rate=16000)

    # Load / download the model.
    model = args.model
    if args.model != "large" and not args.non_english:
        model = model + ".en"
    audio_model = whisper.load_model(model)

    record_timeout = args.record_timeout
    phrase_timeout = args.phrase_timeout

    transcription = ['']

    with source:
        recorder.adjust_for_ambient_noise(source)

    def record_callback(_, audio: sr.AudioData) -> None:
        """
        Threaded callback function to receive audio data when recordings finish.
        audio: An AudioData containing the recorded bytes.
        """
        # Grab the raw bytes and push them into the thread-safe queue.
        data = audio.get_raw_data()
        data_queue.put(data)

    # Create a background thread that will pass us raw audio bytes.
    # We could do this manually but SpeechRecognizer provides a nice helper.
    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)

    # Cue the user that we're ready to go.
    print("Model loaded.\n")

    while True:
        try:
            now = datetime.utcnow()
            # Pull raw recorded audio from the queue.
            if not data_queue.empty():
                phrase_complete = False
                # If enough time has passed between recordings, consider the phrase complete.
                # Clear the current working audio buffer to start over with the new data.
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                    phrase_bytes = bytes()
                    phrase_complete = True
                # This is the last time we received new audio data from the queue.
                phrase_time = now

                # Combine the audio data from the queue.
                audio_data = b''.join(data_queue.queue)
                data_queue.queue.clear()

                # Add the new audio data to the accumulated data for this phrase.
                phrase_bytes += audio_data

                # Convert the in-RAM buffer to something the model can use directly, without a temp file.
                # Convert the data from 16-bit integers to 32-bit floats and scale by 1/32768
                # to normalize the samples into the [-1.0, 1.0] range that Whisper expects.
                audio_np = np.frombuffer(phrase_bytes, dtype=np.int16).astype(np.float32) / 32768.0
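                # For example, an int16 sample of 16384 becomes 16384 / 32768.0 = 0.5,
                # so the full int16 range [-32768, 32767] maps into [-1.0, 1.0),
                # the normalized float range Whisper accepts for raw audio arrays.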
                # Read the transcription.
                result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())
                text = result['text'].strip()

                # If we detected a pause between recordings, add a new item to our transcription.
                # Otherwise edit the existing one.
                if phrase_complete:
                    transcription.append(text)
                else:
                    transcription[-1] = text

                # Clear the console to reprint the updated transcription.
                os.system('cls' if os.name == 'nt' else 'clear')
                for line in transcription:
                    print(line)
                # Flush stdout.
                print('', end='', flush=True)
            else:
                # Infinite loops are bad for processors; we must sleep.
                sleep(0.25)
        except KeyboardInterrupt:
            break

    print("\n\nTranscription:")
    for line in transcription:
        print(line)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------