├── README.md
├── demo.gif
├── requirements.txt
└── transcribe_demo.py

/README.md:
--------------------------------------------------------------------------------
# Real Time Whisper Transcription

![Demo gif](demo.gif)

This is a demo of real-time speech-to-text with OpenAI's Whisper model. It works by continuously recording audio in a background thread and concatenating the raw bytes across multiple recordings.

To install the dependencies, simply run
```
pip install -r requirements.txt
```
in an environment of your choosing.

Whisper also requires the command-line tool [`ffmpeg`](https://ffmpeg.org/) to be installed on your system, which is available from most package managers:

```
# on Ubuntu or Debian
sudo apt update && sudo apt install ffmpeg

# on Arch Linux
sudo pacman -S ffmpeg

# on macOS using Homebrew (https://brew.sh/)
brew install ffmpeg

# on Windows using Chocolatey (https://chocolatey.org/)
choco install ffmpeg

# on Windows using Scoop (https://scoop.sh/)
scoop install ffmpeg
```
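Once the dependencies are installed, you can start transcribing from your default microphone by running the demo script. The flags below are the ones defined in `transcribe_demo.py`, shown with their default values; smaller models such as `tiny` or `base` trade accuracy for speed on machines without a GPU:

```
python transcribe_demo.py --model medium --energy_threshold 1000 --record_timeout 2 --phrase_timeout 3

# on Linux, list the available microphones first, then pick one by name:
python transcribe_demo.py --default_microphone list
```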
For more information on Whisper, please see https://github.com/openai/whisper

The code in this repository is public domain.
--------------------------------------------------------------------------------
/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davabase/whisper_real_time/bfc75c0dc0a9357ffb3d5795baa7fc2d2cada509/demo.gif
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
setuptools
pyaudio
SpeechRecognition
--extra-index-url https://download.pytorch.org/whl/cu116
torch
numpy
git+https://github.com/openai/whisper.git
--------------------------------------------------------------------------------
/transcribe_demo.py:
--------------------------------------------------------------------------------
#! python3.7

import argparse
import os
import numpy as np
import speech_recognition as sr
import whisper
import torch

from datetime import datetime, timedelta
from queue import Queue
from time import sleep
from sys import platform


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="medium", help="Model to use.",
                        choices=["tiny", "base", "small", "medium", "large"])
    parser.add_argument("--non_english", action='store_true',
                        help="Don't use the English model.")
    parser.add_argument("--energy_threshold", default=1000,
                        help="Energy level for the mic to detect.", type=int)
    parser.add_argument("--record_timeout", default=2,
                        help="Maximum length of each recorded chunk in seconds; "
                             "smaller values make the transcription feel more real time.", type=float)
    parser.add_argument("--phrase_timeout", default=3,
                        help="How many seconds of silence between recordings before we "
                             "consider it a new line in the transcription.", type=float)
    if 'linux' in platform:
        parser.add_argument("--default_microphone", default='pulse',
                            help="Default microphone name for SpeechRecognition. "
                                 "Run this with 'list' to view available microphones.", type=str)
    args = parser.parse_args()

    # The last time a recording was retrieved from the queue.
    phrase_time = None
    # Thread-safe queue for passing data from the threaded recording callback.
    data_queue = Queue()
    # Bytes object which holds the audio data for the current phrase.
    phrase_bytes = bytes()
    # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends.
    recorder = sr.Recognizer()
    recorder.energy_threshold = args.energy_threshold
    # Definitely do this: dynamic energy compensation lowers the energy threshold dramatically,
    # to the point where the SpeechRecognizer never stops recording.
    recorder.dynamic_energy_threshold = False

    # Important for Linux users.
    # Prevents a permanent application hang or crash caused by using the wrong microphone.
    if 'linux' in platform:
        mic_name = args.default_microphone
        if not mic_name or mic_name == 'list':
            print("Available microphone devices are: ")
            for index, name in enumerate(sr.Microphone.list_microphone_names()):
                print(f"Microphone with name \"{name}\" found")
            return
        else:
            for index, name in enumerate(sr.Microphone.list_microphone_names()):
                if mic_name in name:
                    source = sr.Microphone(sample_rate=16000, device_index=index)
                    break
            else:
                # for/else: no matching device was found; exit instead of crashing on an unbound `source` later.
                print(f"No microphone named \"{mic_name}\" was found.")
                return
    else:
        source = sr.Microphone(sample_rate=16000)

    # Load / download the model.
    model = args.model
    if args.model != "large" and not args.non_english:
        model = model + ".en"
    audio_model = whisper.load_model(model)

    record_timeout = args.record_timeout
    phrase_timeout = args.phrase_timeout

    transcription = ['']

    with source:
        recorder.adjust_for_ambient_noise(source)

    def record_callback(_, audio: sr.AudioData) -> None:
        """
        Threaded callback function to receive audio data when recordings finish.
        audio: An AudioData containing the recorded bytes.
        """
        # Grab the raw bytes and push them into the thread-safe queue.
        data = audio.get_raw_data()
        data_queue.put(data)

    # Create a background thread that will pass us raw audio bytes.
    # We could do this manually but SpeechRecognizer provides a nice helper.
    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)

    # Cue the user that we're ready to go.
    print("Model loaded.\n")

    while True:
        try:
            now = datetime.utcnow()
            # Pull raw recorded audio from the queue.
            if not data_queue.empty():
                phrase_complete = False
                # If enough time has passed between recordings, consider the phrase complete.
                # Clear the current working audio buffer to start over with the new data.
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                    phrase_bytes = bytes()
                    phrase_complete = True
                # This is the last time we received new audio data from the queue.
                phrase_time = now

                # Combine the audio data from the queue.
                audio_data = b''.join(data_queue.queue)
                data_queue.queue.clear()

                # Add the new audio data to the accumulated data for this phrase.
                phrase_bytes += audio_data

                # Convert the in-RAM buffer to something the model can use directly, without a temp file.
                # Convert the data from 16-bit integers to 32-bit floats and scale by 1/32768
                # to normalize the samples into the [-1.0, 1.0] range that Whisper expects.
                audio_np = np.frombuffer(phrase_bytes, dtype=np.int16).astype(np.float32) / 32768.0
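                # For example, an int16 sample of 16384 becomes 16384 / 32768.0 = 0.5,
                # so the full int16 range [-32768, 32767] maps into [-1.0, 1.0),
                # the normalized float range Whisper accepts for raw audio arrays.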
                # Read the transcription.
                result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())
                text = result['text'].strip()

                # If we detected a pause between recordings, add a new item to our transcription.
                # Otherwise edit the existing one.
                if phrase_complete:
                    transcription.append(text)
                else:
                    transcription[-1] = text

                # Clear the console to reprint the updated transcription.
                os.system('cls' if os.name == 'nt' else 'clear')
                for line in transcription:
                    print(line)
                # Flush stdout.
                print('', end='', flush=True)
            else:
                # Infinite loops are bad for processors; we must sleep.
                sleep(0.25)
        except KeyboardInterrupt:
            break

    print("\n\nTranscription:")
    for line in transcription:
        print(line)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------