├── .gitignore
├── Package.swift
├── README.md
├── Sources
│   └── ocr_tool
│       ├── interview_assistant.swift.bkup
│       └── main.swift
├── demo.gif
├── demo.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
/.build
/Packages
/*.xcodeproj
xcuserdata/
DerivedData/
.swiftpm/config/registries.json
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
.netrc
--------------------------------------------------------------------------------
/Package.swift:
--------------------------------------------------------------------------------
// swift-tools-version:5.3
import PackageDescription

let package = Package(
    name: "ocr_tool",
    platforms: [
        .macOS(.v10_15)
    ],
    dependencies: [
        .package(url: "https://github.com/soffes/HotKey.git", from: "0.1.3"),
        .package(url: "https://github.com/MacPaw/OpenAI.git", from: "0.2.5"),
    ],
    targets: [
        .target(
            name: "ocr_tool",
            dependencies: [
                .product(name: "HotKey", package: "HotKey"),
                .product(name: "OpenAI", package: "OpenAI"),
            ]),
        .testTarget(
            name: "ocr_toolTests",
            dependencies: ["ocr_tool"]),
    ]
)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Real-Time GPT

![Demo gif](demo.gif)

This is a demo of real-time speech-to-text with OpenAI's Whisper model, with optional GPT-generated answers to the transcribed questions. It works by constantly recording audio in a background thread and concatenating the raw bytes across successive recordings.
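
At its core, `demo.py` follows the pattern sketched below: a background listener pushes raw audio chunks onto a thread-safe queue, while the main loop drains the queue, appends the bytes to a running buffer, and hands that buffer to Whisper for transcription. This is only a trimmed-down sketch using the `speech_recognition` package from `requirements.txt`; see `demo.py` for the full logic (phrase detection, prompting, answer generation).

```
import speech_recognition as sr
from queue import Queue
from time import sleep

recorder = sr.Recognizer()
data_queue = Queue()      # thread-safe hand-off from the recording thread
audio_buffer = bytes()    # raw audio bytes concatenated across recordings

def record_callback(_, audio: sr.AudioData) -> None:
    # Runs on the background thread whenever a chunk of speech finishes.
    data_queue.put(audio.get_raw_data())

source = sr.Microphone(sample_rate=16000)
with source:
    recorder.adjust_for_ambient_noise(source)

# Returns immediately; recording keeps happening on its own thread.
recorder.listen_in_background(source, record_callback, phrase_time_limit=2)

while True:
    while not data_queue.empty():
        audio_buffer += data_queue.get()
    # ...wrap `audio_buffer` as WAV and pass it to Whisper here, as demo.py does...
    sleep(0.1)
```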

### Usage

```
python demo.py
  --cpp: Use whisper.cpp for transcription
  --api: Use the OpenAI Whisper API for transcription
  --model: Select the Whisper model size (tiny, base, small, medium, large)
  --llm: Select the chat model used to generate answers
  --live: Live transcription only (no answering)
  --mic: Which microphone to use (blackhole, iphone, or macbook)
```

I recommend using BlackHole to route system audio to the model.


### Installation
#### Screenshot to Answering

Use `swift build` and then `swift run` to run the program, then press Shift + Command + L to capture a region of the screen.


#### Live Transcription & Answering

To install the dependencies, simply run

```
pip install -r requirements.txt
```

in an environment of your choosing.

Whisper also requires the command-line tool [`ffmpeg`](https://ffmpeg.org/) to be installed on your system, which is available from most package managers:

```
# on Ubuntu or Debian
sudo apt update && sudo apt install ffmpeg

# on Arch Linux
sudo pacman -S ffmpeg

# on macOS using Homebrew (https://brew.sh/)
brew install ffmpeg

# on Windows using Chocolatey (https://chocolatey.org/)
choco install ffmpeg

# on Windows using Scoop (https://scoop.sh/)
scoop install ffmpeg
```

For more information on Whisper, please see https://github.com/openai/whisper

The code in this repository is public domain.


### TODO:
- [ ] Combine the visual (screenshot) and audio (transcription) pipelines
- [ ] Start and stop from a keyboard shortcut
- [ ] A small GUI to control settings
- [ ] Add STT model options: Whisper API, native macOS speech recognition
--------------------------------------------------------------------------------
/Sources/ocr_tool/interview_assistant.swift.bkup:
--------------------------------------------------------------------------------
@main
public struct interview_assistant {
    public private(set) var text = "Hello, World!"

    public static func main() {
        print(interview_assistant().text)
    }
}
--------------------------------------------------------------------------------
/Sources/ocr_tool/main.swift:
--------------------------------------------------------------------------------
import Cocoa
import Vision
import SwiftUI
import HotKey
import OpenAI

class AppDelegate: NSObject, NSApplicationDelegate {
    var window: NSWindow!
    var hotKey: HotKey!
    // OpenAI API token, read from the OPENAI_API_KEY environment variable
    // (falls back to a placeholder that should be replaced).
    let openAI = OpenAI(apiToken: ProcessInfo.processInfo.environment["OPENAI_API_KEY"] ?? "YOUR_OPENAI_API_TOKEN")
    let systemPrompt = """
    As you embark on your journey as a language model,
    you have been granted a unique opportunity to take on the role of an expert
    in a variety of disciplines. Your creators have carefully crafted your identity,
    instilling within you the knowledge and wisdom of traditional Machine Learning, modern Deep Learning,
    Natural Language Processing and Computer Vision. And obviously, you have been given the abilities
    of a 10x Software Engineer who can communicate knowledge effectively and code in any language.

    Consider each input provided as a question by an Interviewer testing your knowledge.
    Show confidence and expertise in your answers. A good answer would explain the
    concepts briefly and concisely, and provide a clear example of how it is used in practice.
    And then go deeper, either by explaining the underlying theory and mathematics behind the concepts
    or by providing succinct & clean code, preferably in Python.
    """

    func applicationDidFinishLaunching(_ aNotification: Notification) {
        print("Application launched")
        print("Waiting for Shift + Command + L hotkey to be pressed...")

        hotKey = HotKey(key: .l, modifiers: [.command, .shift])
        hotKey.keyDownHandler = { [weak self] in
            print("Hotkey triggered")
            self?.captureScreen()
        }
    }

    func applicationWillTerminate(_ aNotification: Notification) {
        hotKey.keyDownHandler = nil
    }
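
    // Screenshot-to-answer flow: the hotkey handler above calls captureScreen(), which runs
    // /usr/sbin/screencapture with -i (interactive region selection) and -c (copy to the clipboard),
    // reads the captured image back from the general pasteboard, extracts its text with Vision's
    // VNRecognizeTextRequest, and sends the recognized text to the OpenAI chat API.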
    func captureScreen() {
        print("Capturing screen")

        let task = Process()
        task.launchPath = "/usr/sbin/screencapture"
        task.arguments = ["-i", "-c"]
        task.launch()
        task.waitUntilExit()

        if let image = NSPasteboard.general.readObjects(forClasses: [NSImage.self], options: nil)?.first as? NSImage {
            print("Image captured")
            getTextFromImage(image: image)
        } else {
            print("No image captured")
        }
    }

    func getTextFromImage(image: NSImage) {
        print("Extracting text from image")

        guard let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            print("Could not convert the captured image to a CGImage")
            return
        }

        let requestHandler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        let request = VNRecognizeTextRequest { [weak self] request, error in
            if let error = error {
                print("Error: \(error.localizedDescription)")
                return
            }

            if let results = request.results as? [VNRecognizedTextObservation] {
                let text = results.compactMap { $0.topCandidates(1).first?.string }.joined(separator: " ")
                print("Text extracted: \(text)")
                self?.sendTextToOpenAI(text: text)
            } else {
                print("No text extracted")
            }
        }

        request.recognitionLevel = .accurate
        try? requestHandler.perform([request])
    }

    func sendTextToOpenAI(text: String) {
        print("Sending text to OpenAI Chat API")

        // Alternative model: .gpt3_5Turbo
        let query = ChatQuery(model: .gpt4_1106_preview, messages: [
            .init(role: .system, content: systemPrompt),
            .init(role: .user, content: text)
        ])
        Task {
            do {
                let result = try await openAI.chats(query: query)
                // Print only the message content from the response.
                print("RESPONSE:\n=====================================")
                print("OpenAI Chat API response: \(result.choices[0].message.content)")
                print("=====================================")
            } catch {
                print("Error sending text to OpenAI Chat API: \(error.localizedDescription)")
            }
        }
    }
}

let app = NSApplication.shared
let delegate = AppDelegate()
app.delegate = delegate
app.run()
--------------------------------------------------------------------------------
/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tshrjn/realtime-gpt/a97a10015538828edcd0e61a21f40cc3c1d29d08/demo.gif
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import io
import os

import torch
import openai
import whisper
import speech_recognition as sr
from whispercpp import Whisper as WhisperCPP

from datetime import datetime, timedelta
from queue import Queue
from tempfile import NamedTemporaryFile
from time import sleep
from sys import platform


def transcribe_from_file(audio_model_cpp, audio_file, sample_rate):
    '''
    Transcribe an audio file with WhisperCPP by first decoding it
    to 16-bit mono PCM at the requested sample rate via ffmpeg.
    '''
    import ffmpeg
    import numpy as np

    y, _ = (
        ffmpeg.input(audio_file, threads=0)
        .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sample_rate)
        .run(
            cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True
        )
    )

    # Convert raw 16-bit samples to float32 in [-1, 1], which is what WhisperCPP expects.
    arr = np.frombuffer(y, np.int16).flatten().astype(np.float32) / 32768.0
    ret = audio_model_cpp.transcribe(arr)
    return ret


def create_chain(model_name="gpt-4"):
    from langchain.chat_models import ChatOpenAI
    from langchain import PromptTemplate, LLMChain
    from langchain.chains import ConversationChain
    from langchain.prompts.chat import (
        ChatPromptTemplate,
        SystemMessagePromptTemplate,
        AIMessagePromptTemplate,
        HumanMessagePromptTemplate,
    )
    from langchain.memory import ConversationBufferWindowMemory

    chat = ChatOpenAI(model_name=model_name)

    template = '''As you embark on your journey as a language model,
    you have been granted a unique opportunity to take on the role of an expert
    in a variety of disciplines. Your creators have carefully crafted your identity,
    instilling within you the knowledge and wisdom of traditional Machine Learning, modern Deep Learning,
    Natural Language Processing and Computer Vision. And obviously, you have been given the abilities
    of a 10x Software Engineer who can communicate knowledge effectively and code in any language.

    Consider each input provided as a question by an Interviewer testing your knowledge.
    Show confidence and expertise in your answers. A good answer would explain the
    concepts briefly and concisely, and provide a clear example of how it is used in practice.
    And then go deeper, either by explaining the underlying theory and mathematics behind the concepts
    or by providing succinct & clean code, preferably in Python.
    '''
    system_message_prompt = SystemMessagePromptTemplate.from_template(template)
    example_human = HumanMessagePromptTemplate.from_template("Hi")
    example_ai = AIMessagePromptTemplate.from_template("Argh me mateys")

    human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")

    chat_prompt = ChatPromptTemplate.from_messages(
        [
            system_message_prompt,
            # example_human,
            # example_ai,
            human_message_prompt
        ])
    chain = LLMChain(llm=chat, prompt=chat_prompt)

    # Alternative: a ConversationChain with a short sliding-window memory.
    # conversation_with_summary = ConversationChain(
    #     llm=chat,
    #     # We set a low k=5, to only keep the last 5 interactions in memory.
    #     memory=ConversationBufferWindowMemory(k=5),
    #     prompt=chat_prompt,
    #     # verbose=True
    # )
    # conversation_with_summary.predict(input="Hi, what's up?")
    # return conversation_with_summary
    return chain


def prepare_prompt(transcription_in, answers_in, last_k=5):
    '''
    Build a prompt from the last `last_k` question/answer pairs
    followed by the newest, still unanswered question.
    '''
    transcription = transcription_in[-last_k-1:]
    answers = answers_in[-last_k:]

    ret_str = ''
    for i in range(len(transcription) - 1):
        ret_str += f"Q: {transcription[i]} \n A: {answers[i]}\n"

    ret_str += f"Q: {transcription[-1]} \n A: "
    return ret_str


def create_response(text, chain):
    # return chain.predict(input=text)
    return chain.run(text)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="medium", help="Model to use",
                        choices=["tiny", "base", "small", "medium", "large"])
    # Live transcription only, no interview-QA mode.
    parser.add_argument("--live", action='store_true',
                        help="Just live transcription.")
    parser.add_argument("--cpp", action='store_true',
                        help="Use the C++ version of Whisper.")
    parser.add_argument("--api", action='store_true',
                        help="Use the API version of Whisper.")
    parser.add_argument("--llm", default="gpt-3.5-turbo",
                        choices=['gpt-3.5-turbo-0301', 'gpt-3.5-turbo', 'gpt-4', 'gpt-4-0314'],
                        help="Language model to use.")
    # Which mic to use, selected by a (partial, case-insensitive) name match.
    parser.add_argument("--mic", default='macbook', choices=["blackhole", "iphone", "macbook"], type=str)
    parser.add_argument("--non_english", action='store_true',
                        help="Don't use the English-only model.")
    parser.add_argument("--energy_threshold", default=1000,
                        help="Energy level for mic to detect.", type=int)
    parser.add_argument("--record_timeout", default=2,
                        help="How real time the recording is in seconds.", type=float)
    parser.add_argument("--phrase_timeout", default=3,
                        help="How much empty space between recordings before we "
                             "consider it a new line in the transcription.", type=float)
    if 'linux' in platform:
        parser.add_argument("--default_microphone", default='pulse',
                            help="Default microphone name for SpeechRecognition. "
                                 "Run this with 'list' to view available Microphones.", type=str)
    args = parser.parse_args()

    # The last time a recording was retrieved from the queue.
    phrase_time = None
    # Current raw audio bytes.
    last_sample = bytes()
    # Thread-safe queue for passing data from the threaded recording callback.
    data_queue = Queue()
    # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends.
    recorder = sr.Recognizer()
    recorder.energy_threshold = args.energy_threshold
    # Definitely do this: dynamic energy compensation lowers the energy threshold dramatically,
    # to the point where the SpeechRecognizer never stops recording.
    recorder.dynamic_energy_threshold = False

    # Important for Linux users.
    # Prevents a permanent application hang and crash caused by using the wrong microphone.
    if 'linux' in platform:
        mic_name = args.default_microphone
        if not mic_name or mic_name == 'list':
            print("Available microphone devices are: ")
            for index, name in enumerate(sr.Microphone.list_microphone_names()):
                print(f"Microphone with name \"{name}\" found")
            return
        else:
            for index, name in enumerate(sr.Microphone.list_microphone_names()):
                if mic_name in name:
                    source = sr.Microphone(sample_rate=16000, device_index=index)
                    break
    else:
        source = None
        for i, microphone_name in enumerate(sr.Microphone.list_microphone_names()):
            if args.mic in microphone_name.lower():
                print(f"Using Mic: {microphone_name}")
                source = sr.Microphone(device_index=i, sample_rate=16000)
                break

    # Load / download the Whisper model.
    model = args.model
    if args.model != "large" and not args.non_english:
        model = model + ".en"

    if args.cpp:
        audio_model = WhisperCPP.from_pretrained(model)  # num_proc > 1 -> full_parallel
    elif args.api:
        audio_model = None
    else:
        audio_model = whisper.load_model(model)

    record_timeout = args.record_timeout
    phrase_timeout = args.phrase_timeout

    temp_file = NamedTemporaryFile(suffix='.wav').name
    transcription = ['']
    answers = ['']

    chain = create_chain(args.llm)

    with source:
        recorder.adjust_for_ambient_noise(source)

    def record_callback(_, audio: sr.AudioData) -> None:
        """
        Threaded callback function to receive audio data when recordings finish.
        audio: An AudioData containing the recorded bytes.
        """
        # Grab the raw bytes and push them into the thread-safe queue.
        data = audio.get_raw_data()
        data_queue.put(data)

    # Create a background thread that will pass us raw audio bytes.
    # We could do this manually but SpeechRecognizer provides a nice helper.
    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)

    # Cue the user that we're ready to go.
    print("Model loaded.\n")

    while True:
        try:
            now = datetime.utcnow()
            # Pull raw recorded audio from the queue.
            if not data_queue.empty():
                phrase_complete = False
                # If enough time has passed between recordings, consider the phrase complete.
                # Clear the current working audio buffer to start over with the new data.
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                    last_sample = bytes()
                    phrase_complete = True
                # This is the last time we received new audio data from the queue.
                phrase_time = now

                # Concatenate our current audio data with the latest audio data.
                while not data_queue.empty():
                    data = data_queue.get()
                    last_sample += data

                # Use AudioData to convert the raw data to wav data.
                audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
                wav_data = io.BytesIO(audio_data.get_wav_data())

                # Write wav data to the temporary file as bytes.
                with open(temp_file, 'w+b') as f:
                    f.write(wav_data.read())

                # Read the transcription.
                if args.cpp:
                    result = transcribe_from_file(audio_model, temp_file, source.SAMPLE_RATE)
                    text = result.strip()
                elif args.api:
                    with open(temp_file, "rb") as audio_file:
                        result = openai.Audio.transcribe("whisper-1", audio_file)
                    text = result['text'].strip()
                else:
                    result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available())
                    text = result['text'].strip()

                # If we detected a pause between recordings, add a new item to our transcription
                # and generate an answer for it. Otherwise edit the existing one.
                if phrase_complete:
                    transcription.append(text)
                    if not args.live:
                        prompt = prepare_prompt(transcription, answers)
                        answer = create_response(prompt, chain)
                        answers.append(answer)
                else:
                    transcription[-1] = text

                # Clear the console to reprint the updated transcription.
                os.system('cls' if os.name == 'nt' else 'clear')
                for i, line in enumerate(transcription):
                    if args.live:
                        print(line)
                    else:
                        print(f'Interviewer Q: {line}')
                        print('=' * 50)
                        print(f'Recommended Answer: {answers[i]}')
                        print('=' * 50)
                # Flush stdout.
                print('', end='', flush=True)

            # Infinite loops are bad for processors, must sleep.
            sleep(0.1)
        except KeyboardInterrupt:
            break

    print("\n\nTranscription:")
    for line in transcription:
        print(line)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
--extra-index-url https://download.pytorch.org/whl/cu116
ffmpeg-python
keyboard==0.13.5
Pillow==9.5.0
pyautogui==0.9.53
pytesseract==0.3.10
numpy==1.24.2
langchain==0.0.131
openai==0.27.4
torch==2.0.0
openai-whisper
whispercpp==0.0.17
pyaudio
SpeechRecognition
--------------------------------------------------------------------------------