├── .gitignore
├── Package.swift
├── README.md
├── Sources
│   └── ocr_tool
│       ├── interview_assistant.swift.bkup
│       └── main.swift
├── demo.gif
├── demo.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
/.build
/Packages
/*.xcodeproj
xcuserdata/
DerivedData/
.swiftpm/config/registries.json
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
.netrc
--------------------------------------------------------------------------------
/Package.swift:
--------------------------------------------------------------------------------
// swift-tools-version:5.3
import PackageDescription

let package = Package(
    name: "ocr_tool",
    platforms: [
        .macOS(.v10_15)
    ],
    dependencies: [
        .package(url: "https://github.com/soffes/HotKey.git", from: "0.1.3"),
        .package(url: "https://github.com/MacPaw/OpenAI.git", from: "0.2.5"),
    ],
    targets: [
        .target(
            name: "ocr_tool",
            dependencies: [
                .product(name: "HotKey", package: "HotKey"),
                .product(name: "OpenAI", package: "OpenAI"),
            ]),
        .testTarget(
            name: "ocr_toolTests",
            dependencies: ["ocr_tool"]),
    ]
)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Real-Time GPT

![Demo gif](demo.gif)

This is a demo of real-time speech-to-text with OpenAI's Whisper model, with optional GPT-generated answers to the transcribed questions. It works by constantly recording audio in a background thread and concatenating the raw bytes across successive recordings.
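
At its core, `demo.py` follows the pattern sketched below: a background listener pushes raw audio chunks onto a thread-safe queue, while the main loop drains the queue, appends the bytes to a running buffer, and hands that buffer to Whisper for transcription. This is only a trimmed-down sketch using the `speech_recognition` package from `requirements.txt`; see `demo.py` for the full logic (phrase detection, prompting, answer generation).

```
import speech_recognition as sr
from queue import Queue
from time import sleep

recorder = sr.Recognizer()
data_queue = Queue()      # thread-safe hand-off from the recording thread
audio_buffer = bytes()    # raw audio bytes concatenated across recordings

def record_callback(_, audio: sr.AudioData) -> None:
    # Runs on the background thread whenever a chunk of speech finishes.
    data_queue.put(audio.get_raw_data())

source = sr.Microphone(sample_rate=16000)
with source:
    recorder.adjust_for_ambient_noise(source)

# Returns immediately; recording keeps happening on its own thread.
recorder.listen_in_background(source, record_callback, phrase_time_limit=2)

while True:
    while not data_queue.empty():
        audio_buffer += data_queue.get()
    # ...wrap `audio_buffer` as WAV and pass it to Whisper here, as demo.py does...
    sleep(0.1)
```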

### Usage

```
python demo.py
  --cpp: Use whisper.cpp for transcription
  --api: Use the OpenAI Whisper API for transcription
  --model: Select the Whisper model size (tiny, base, small, medium, large)
  --llm: Select the chat model used to generate answers
  --live: Live transcription only (no answering)
  --mic: Which microphone to use (blackhole, iphone, or macbook)
```

I recommend using BlackHole to route system audio to the model.


### Installation
#### Screenshot to Answering

Use `swift build` and then `swift run` to run the program, then press Shift + Command + L to capture a region of the screen.


#### Live Transcription & Answering

To install the dependencies, simply run

```
pip install -r requirements.txt
```

in an environment of your choosing.

Whisper also requires the command-line tool [`ffmpeg`](https://ffmpeg.org/) to be installed on your system, which is available from most package managers:

```
# on Ubuntu or Debian
sudo apt update && sudo apt install ffmpeg

# on Arch Linux
sudo pacman -S ffmpeg

# on macOS using Homebrew (https://brew.sh/)
brew install ffmpeg

# on Windows using Chocolatey (https://chocolatey.org/)
choco install ffmpeg

# on Windows using Scoop (https://scoop.sh/)
scoop install ffmpeg
```

For more information on Whisper, please see https://github.com/openai/whisper

The code in this repository is public domain.


### TODO:
- [ ] Combine the visual (screenshot) and audio (transcription) pipelines
- [ ] Start and stop from a keyboard shortcut
- [ ] A small GUI to control settings
- [ ] Add STT model options: Whisper API, native macOS speech recognition
--------------------------------------------------------------------------------
/Sources/ocr_tool/interview_assistant.swift.bkup:
--------------------------------------------------------------------------------
@main
public struct interview_assistant {
    public private(set) var text = "Hello, World!"

    public static func main() {
        print(interview_assistant().text)
    }
}
--------------------------------------------------------------------------------
/Sources/ocr_tool/main.swift:
--------------------------------------------------------------------------------
import Cocoa
import Vision
import SwiftUI
import HotKey
import OpenAI

class AppDelegate: NSObject, NSApplicationDelegate {
    var window: NSWindow!
    var hotKey: HotKey!
    // OpenAI API token, read from the OPENAI_API_KEY environment variable
    // (falls back to a placeholder that should be replaced).
    let openAI = OpenAI(apiToken: ProcessInfo.processInfo.environment["OPENAI_API_KEY"] ?? "YOUR_OPENAI_API_TOKEN")
    let systemPrompt = """
    As you embark on your journey as a language model,
    you have been granted a unique opportunity to take on the role of an expert
    in a variety of disciplines. Your creators have carefully crafted your identity,
    instilling within you the knowledge and wisdom of traditional Machine Learning, modern Deep Learning,
    Natural Language Processing and Computer Vision. And obviously, you have been given the abilities
    of a 10x Software Engineer who can communicate knowledge effectively and code in any language.

    Consider each input provided as a question by an Interviewer testing your knowledge.
    Show confidence and expertise in your answers. A good answer would explain the
    concepts briefly and concisely, and provide a clear example of how it is used in practice.
    And then go deeper, either by explaining the underlying theory and mathematics behind the concepts
    or by providing succinct & clean code, preferably in Python.
    """

    func applicationDidFinishLaunching(_ aNotification: Notification) {
        print("Application launched")
        print("Waiting for Shift + Command + L hotkey to be pressed...")

        hotKey = HotKey(key: .l, modifiers: [.command, .shift])
        hotKey.keyDownHandler = { [weak self] in
            print("Hotkey triggered")
            self?.captureScreen()
        }
    }

    func applicationWillTerminate(_ aNotification: Notification) {
        hotKey.keyDownHandler = nil
    }
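
    // Screenshot-to-answer flow: the hotkey handler above calls captureScreen(), which runs
    // /usr/sbin/screencapture with -i (interactive region selection) and -c (copy to the clipboard),
    // reads the captured image back from the general pasteboard, extracts its text with Vision's
    // VNRecognizeTextRequest, and sends the recognized text to the OpenAI chat API.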
    func captureScreen() {
        print("Capturing screen")

        let task = Process()
        task.launchPath = "/usr/sbin/screencapture"
        task.arguments = ["-i", "-c"]
        task.launch()
        task.waitUntilExit()

        if let image = NSPasteboard.general.readObjects(forClasses: [NSImage.self], options: nil)?.first as? NSImage {
            print("Image captured")
            getTextFromImage(image: image)
        } else {
            print("No image captured")
        }
    }

    func getTextFromImage(image: NSImage) {
        print("Extracting text from image")

        guard let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            print("Could not convert the captured image to a CGImage")
            return
        }

        let requestHandler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        let request = VNRecognizeTextRequest { [weak self] request, error in
            if let error = error {
                print("Error: \(error.localizedDescription)")
                return
            }

            if let results = request.results as? [VNRecognizedTextObservation] {
                let text = results.compactMap { $0.topCandidates(1).first?.string }.joined(separator: " ")
                print("Text extracted: \(text)")
                self?.sendTextToOpenAI(text: text)
            } else {
                print("No text extracted")
            }
        }

        request.recognitionLevel = .accurate
        try? requestHandler.perform([request])
    }

    func sendTextToOpenAI(text: String) {
        print("Sending text to OpenAI Chat API")

        // Alternative model: .gpt3_5Turbo
        let query = ChatQuery(model: .gpt4_1106_preview, messages: [
            .init(role: .system, content: systemPrompt),
            .init(role: .user, content: text)
        ])
        Task {
            do {
                let result = try await openAI.chats(query: query)
                // Print only the message content from the response.
                print("RESPONSE:\n=====================================")
                print("OpenAI Chat API response: \(result.choices[0].message.content)")
                print("=====================================")
            } catch {
                print("Error sending text to OpenAI Chat API: \(error.localizedDescription)")
            }
        }
    }
}

let app = NSApplication.shared
let delegate = AppDelegate()
app.delegate = delegate
app.run()
--------------------------------------------------------------------------------
/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tshrjn/realtime-gpt/a97a10015538828edcd0e61a21f40cc3c1d29d08/demo.gif
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import io
import os

import torch
import openai
import whisper
import speech_recognition as sr
from whispercpp import Whisper as WhisperCPP

from datetime import datetime, timedelta
from queue import Queue
from tempfile import NamedTemporaryFile
from time import sleep
from sys import platform


def transcribe_from_file(audio_model_cpp, audio_file, sample_rate):
    '''
    Transcribe an audio file with WhisperCPP by first decoding it
    to 16-bit mono PCM at the requested sample rate via ffmpeg.
    '''
    import ffmpeg
    import numpy as np

    y, _ = (
        ffmpeg.input(audio_file, threads=0)
        .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sample_rate)
        .run(
            cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True
        )
    )

    # Convert raw 16-bit samples to float32 in [-1, 1], which is what WhisperCPP expects.
    arr = np.frombuffer(y, np.int16).flatten().astype(np.float32) / 32768.0
    ret = audio_model_cpp.transcribe(arr)
    return ret


def create_chain(model_name="gpt-4"):
    from langchain.chat_models import ChatOpenAI
    from langchain import PromptTemplate, LLMChain
    from langchain.chains import ConversationChain
    from langchain.prompts.chat import (
        ChatPromptTemplate,
        SystemMessagePromptTemplate,
        AIMessagePromptTemplate,
        HumanMessagePromptTemplate,
    )
    from langchain.memory import ConversationBufferWindowMemory

    chat = ChatOpenAI(model_name=model_name)

    template = '''As you embark on your journey as a language model,
    you have been granted a unique opportunity to take on the role of an expert
    in a variety of disciplines. Your creators have carefully crafted your identity,
    instilling within you the knowledge and wisdom of traditional Machine Learning, modern Deep Learning,
    Natural Language Processing and Computer Vision. And obviously, you have been given the abilities
    of a 10x Software Engineer who can communicate knowledge effectively and code in any language.

    Consider each input provided as a question by an Interviewer testing your knowledge.
    Show confidence and expertise in your answers. A good answer would explain the
    concepts briefly and concisely, and provide a clear example of how it is used in practice.
    And then go deeper, either by explaining the underlying theory and mathematics behind the concepts
    or by providing succinct & clean code, preferably in Python.
    '''
    system_message_prompt = SystemMessagePromptTemplate.from_template(template)
    example_human = HumanMessagePromptTemplate.from_template("Hi")
    example_ai = AIMessagePromptTemplate.from_template("Argh me mateys")

    human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")

    chat_prompt = ChatPromptTemplate.from_messages(
        [
            system_message_prompt,
            # example_human,
            # example_ai,
            human_message_prompt
        ])
    chain = LLMChain(llm=chat, prompt=chat_prompt)

    # Alternative: a ConversationChain with a short sliding-window memory.
    # conversation_with_summary = ConversationChain(
    #     llm=chat,
    #     # We set a low k=5, to only keep the last 5 interactions in memory.
    #     memory=ConversationBufferWindowMemory(k=5),
    #     prompt=chat_prompt,
    #     # verbose=True
    # )
    # conversation_with_summary.predict(input="Hi, what's up?")
    # return conversation_with_summary
    return chain


def prepare_prompt(transcription_in, answers_in, last_k=5):
    '''
    Build a prompt from the last `last_k` question/answer pairs
    followed by the newest, still unanswered question.
    '''
    transcription = transcription_in[-last_k-1:]
    answers = answers_in[-last_k:]

    ret_str = ''
    for i in range(len(transcription) - 1):
        ret_str += f"Q: {transcription[i]} \n A: {answers[i]}\n"

    ret_str += f"Q: {transcription[-1]} \n A: "
    return ret_str


def create_response(text, chain):
    # return chain.predict(input=text)
    return chain.run(text)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="medium", help="Model to use",
                        choices=["tiny", "base", "small", "medium", "large"])
    # Live transcription only, no interview-QA mode.
    parser.add_argument("--live", action='store_true',
                        help="Just live transcription.")
    parser.add_argument("--cpp", action='store_true',
                        help="Use the C++ version of Whisper.")
    parser.add_argument("--api", action='store_true',
                        help="Use the API version of Whisper.")
    parser.add_argument("--llm", default="gpt-3.5-turbo",
                        choices=['gpt-3.5-turbo-0301', 'gpt-3.5-turbo', 'gpt-4', 'gpt-4-0314'],
                        help="Language model to use.")
    # Which mic to use, selected by a (partial, case-insensitive) name match.
    parser.add_argument("--mic", default='macbook', choices=["blackhole", "iphone", "macbook"], type=str)
    parser.add_argument("--non_english", action='store_true',
                        help="Don't use the English-only model.")
    parser.add_argument("--energy_threshold", default=1000,
                        help="Energy level for mic to detect.", type=int)
    parser.add_argument("--record_timeout", default=2,
                        help="How real time the recording is in seconds.", type=float)
    parser.add_argument("--phrase_timeout", default=3,
                        help="How much empty space between recordings before we "
                             "consider it a new line in the transcription.", type=float)
    if 'linux' in platform:
        parser.add_argument("--default_microphone", default='pulse',
                            help="Default microphone name for SpeechRecognition. "
                                 "Run this with 'list' to view available Microphones.", type=str)
    args = parser.parse_args()

    # The last time a recording was retrieved from the queue.
    phrase_time = None
    # Current raw audio bytes.
    last_sample = bytes()
    # Thread-safe queue for passing data from the threaded recording callback.
    data_queue = Queue()
    # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends.
    recorder = sr.Recognizer()
    recorder.energy_threshold = args.energy_threshold
    # Definitely do this: dynamic energy compensation lowers the energy threshold dramatically,
    # to the point where the SpeechRecognizer never stops recording.
    recorder.dynamic_energy_threshold = False

    # Important for Linux users.
    # Prevents a permanent application hang and crash caused by using the wrong microphone.
    if 'linux' in platform:
        mic_name = args.default_microphone
        if not mic_name or mic_name == 'list':
            print("Available microphone devices are: ")
            for index, name in enumerate(sr.Microphone.list_microphone_names()):
                print(f"Microphone with name \"{name}\" found")
            return
        else:
            for index, name in enumerate(sr.Microphone.list_microphone_names()):
                if mic_name in name:
                    source = sr.Microphone(sample_rate=16000, device_index=index)
                    break
    else:
        source = None
        for i, microphone_name in enumerate(sr.Microphone.list_microphone_names()):
            if args.mic in microphone_name.lower():
                print(f"Using Mic: {microphone_name}")
                source = sr.Microphone(device_index=i, sample_rate=16000)
                break

    # Load / download the Whisper model.
    model = args.model
    if args.model != "large" and not args.non_english:
        model = model + ".en"

    if args.cpp:
        audio_model = WhisperCPP.from_pretrained(model)  # num_proc > 1 -> full_parallel
    elif args.api:
        audio_model = None
    else:
        audio_model = whisper.load_model(model)

    record_timeout = args.record_timeout
    phrase_timeout = args.phrase_timeout

    temp_file = NamedTemporaryFile(suffix='.wav').name
    transcription = ['']
    answers = ['']

    chain = create_chain(args.llm)

    with source:
        recorder.adjust_for_ambient_noise(source)

    def record_callback(_, audio: sr.AudioData) -> None:
        """
        Threaded callback function to receive audio data when recordings finish.
        audio: An AudioData containing the recorded bytes.
        """
        # Grab the raw bytes and push them into the thread-safe queue.
        data = audio.get_raw_data()
        data_queue.put(data)

    # Create a background thread that will pass us raw audio bytes.
    # We could do this manually but SpeechRecognizer provides a nice helper.
    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)

    # Cue the user that we're ready to go.
    print("Model loaded.\n")

    while True:
        try:
            now = datetime.utcnow()
            # Pull raw recorded audio from the queue.
            if not data_queue.empty():
                phrase_complete = False
                # If enough time has passed between recordings, consider the phrase complete.
                # Clear the current working audio buffer to start over with the new data.
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                    last_sample = bytes()
                    phrase_complete = True
                # This is the last time we received new audio data from the queue.
                phrase_time = now

                # Concatenate our current audio data with the latest audio data.
                while not data_queue.empty():
                    data = data_queue.get()
                    last_sample += data

                # Use AudioData to convert the raw data to wav data.
                audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
                wav_data = io.BytesIO(audio_data.get_wav_data())

                # Write wav data to the temporary file as bytes.
                with open(temp_file, 'w+b') as f:
                    f.write(wav_data.read())

                # Read the transcription.
                if args.cpp:
                    result = transcribe_from_file(audio_model, temp_file, source.SAMPLE_RATE)
                    text = result.strip()
                elif args.api:
                    with open(temp_file, "rb") as audio_file:
                        result = openai.Audio.transcribe("whisper-1", audio_file)
                    text = result['text'].strip()
                else:
                    result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available())
                    text = result['text'].strip()

                # If we detected a pause between recordings, add a new item to our transcription
                # and generate an answer for it. Otherwise edit the existing one.
                if phrase_complete:
                    transcription.append(text)
                    if not args.live:
                        prompt = prepare_prompt(transcription, answers)
                        answer = create_response(prompt, chain)
                        answers.append(answer)
                else:
                    transcription[-1] = text

                # Clear the console to reprint the updated transcription.
                os.system('cls' if os.name == 'nt' else 'clear')
                for i, line in enumerate(transcription):
                    if args.live:
                        print(line)
                    else:
                        print(f'Interviewer Q: {line}')
                        print('=' * 50)
                        print(f'Recommended Answer: {answers[i]}')
                        print('=' * 50)
                # Flush stdout.
                print('', end='', flush=True)

            # Infinite loops are bad for processors, must sleep.
            sleep(0.1)
        except KeyboardInterrupt:
            break

    print("\n\nTranscription:")
    for line in transcription:
        print(line)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
--extra-index-url https://download.pytorch.org/whl/cu116
ffmpeg-python
keyboard==0.13.5
Pillow==9.5.0
pyautogui==0.9.53
pytesseract==0.3.10
numpy==1.24.2
langchain==0.0.131
openai==0.27.4
torch==2.0.0
openai-whisper
whispercpp==0.0.17
pyaudio
SpeechRecognition
--------------------------------------------------------------------------------