├── .gitignore
├── LICENSE
├── README.md
├── examples
│   ├── manual_cli.py
│   └── streaming_cli.py
├── openai_realtime_client
│   ├── __init__.py
│   ├── client
│   │   ├── __init__.py
│   │   └── realtime_client.py
│   └── handlers
│       ├── __init__.py
│       ├── audio_handler.py
│       └── input_handler.py
├── poetry.lock
└── pyproject.toml

/.gitignore:
--------------------------------------------------------------------------------
*.pyc
__pycache__
dist
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 LlamaIndex

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# OpenAI Realtime API Client for Python

This is an experimental OpenAI Realtime API client for Python and LlamaIndex. It integrates with LlamaIndex's tools, allowing you to quickly build custom voice assistants.

It includes two examples that run directly in the terminal -- one using manual turn detection and one using server VAD mode (i.e. allowing you to interrupt the chatbot).

## Installation

Install system deps:

```bash
brew install ffmpeg portaudio
```

Install Python deps:

```bash
pip install openai-realtime-client

# Optional: clone the repo and run the examples locally
git clone https://github.com/run-llama/openai_realtime_client.git
cd openai_realtime_client
```

Set your OpenAI API key:

```bash
export OPENAI_API_KEY="sk-..."
```

## Usage

Assuming you installed the package and cloned the repo (or copy-pasted the examples), you can run the examples immediately.

Run the interactive CLI with manual turn detection (try asking for your phone number to see function calling in action):

```bash
python ./examples/manual_cli.py
```

Or use streaming mode, which allows you to interrupt the chatbot:

```bash
python ./examples/streaming_cli.py
```

**NOTE:** Streaming mode can be a little janky; it's best to use headphones in a quiet environment.

Take a look at the examples, add your own tools, and build something amazing!
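
## Programmatic usage

You can also drive the client directly from Python instead of running the bundled CLIs. Below is a minimal, text-only sketch (no microphone or speakers) built on the same `RealtimeClient` API the examples use; the five-second sleep is just a crude stand-in for real response handling:

```python
import asyncio
import os

from openai_realtime_client import RealtimeClient


async def main():
    client = RealtimeClient(
        api_key=os.environ.get("OPENAI_API_KEY"),
        # Sessions are text+audio by default, so print both the text
        # deltas and the audio transcript deltas as they stream in
        on_text_delta=lambda text: print(text, end="", flush=True),
        on_output_transcript=lambda text: print(text, end="", flush=True),
    )
    await client.connect()

    # Process incoming server events in the background
    message_handler = asyncio.create_task(client.handle_messages())

    # Send a text message; the reply streams through the callbacks above
    await client.send_text("Hello! What can you do?")
    await asyncio.sleep(5)  # crude: give the response time to arrive

    message_handler.cancel()
    await client.close()


asyncio.run(main())
```

For voice input and output, wire in `AudioHandler` (and `InputHandler` for keyboard control) as the examples do, and pass `turn_detection_mode=TurnDetectionMode.SERVER_VAD` if you want the server to decide when you've finished speaking. You can also pass `extra_event_handlers` (a dict mapping raw event type strings to callbacks) to react to any server events the built-in callbacks don't cover.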
--------------------------------------------------------------------------------
/examples/manual_cli.py:
--------------------------------------------------------------------------------
import asyncio
import os

from pynput import keyboard
from openai_realtime_client import RealtimeClient, InputHandler, AudioHandler
from llama_index.core.tools import FunctionTool

# Add your own tools here!
# NOTE: FunctionTool parses the docstring to get the description; the tool name is the function name
def get_phone_number(name: str) -> str:
    """Get my phone number."""
    if name == "Jerry":
        return "1234567890"
    elif name == "Logan":
        return "0987654321"
    else:
        return "Unknown"

tools = [FunctionTool.from_defaults(fn=get_phone_number)]

async def main():
    # Initialize handlers
    audio_handler = AudioHandler()
    input_handler = InputHandler()
    input_handler.loop = asyncio.get_running_loop()

    # Initialize the realtime client
    client = RealtimeClient(
        api_key=os.environ.get("OPENAI_API_KEY"),
        on_text_delta=lambda text: print(f"\nAssistant: {text}", end="", flush=True),
        on_audio_delta=lambda audio: audio_handler.play_audio(audio),
        on_input_transcript=lambda transcript: print(f"\nYou said: {transcript}\nAssistant: ", end="", flush=True),
        on_output_transcript=lambda transcript: print(f"{transcript}", end="", flush=True),
        tools=tools,
    )

    # Start keyboard listener in a separate thread
    listener = keyboard.Listener(on_press=input_handler.on_press)
    listener.start()

    try:
        # Connect to the API
        await client.connect()

        # Start message handling in the background
        message_handler = asyncio.create_task(client.handle_messages())

        print("Connected to OpenAI Realtime API!")
        print("Commands:")
        print("- Type your message and press Enter to send text")
        print("- Press 'r' to start recording audio")
        print("- Press 'space' to stop recording")
        print("- Press 'q' to quit")
        print("")

        while True:
            # Wait for commands from the input handler
            command, data = await input_handler.command_queue.get()

            if command == 'q':
                break
            elif command == 'r':
                # Start recording
                audio_handler.start_recording()
            elif command == 'space':
                print("[About to stop recording]")
                if audio_handler.recording:
                    # Stop recording and get audio data
                    audio_data = audio_handler.stop_recording()
                    print("[Recording stopped]")
                    if audio_data:
                        await client.send_audio(audio_data)
                        print("[Audio sent]")
            elif command == 'enter' and data:
                # Send text message
                await client.send_text(data)

            await asyncio.sleep(0.01)
    except Exception as e:
        print(f"Error: {e}")
    finally:
        # Clean up
        listener.stop()
        audio_handler.cleanup()
        await client.close()

if __name__ == "__main__":
    # Install required packages:
    # pip install pyaudio pynput pydub websockets

    print("Starting Realtime API CLI...")
    asyncio.run(main())
--------------------------------------------------------------------------------
/examples/streaming_cli.py:
--------------------------------------------------------------------------------
import asyncio
import os

from pynput import keyboard
from openai_realtime_client import RealtimeClient, AudioHandler, InputHandler, TurnDetectionMode
from llama_index.core.tools import FunctionTool

# Add your own tools here!
# NOTE: FunctionTool parses the docstring to get the description; the tool name is the function name
def get_phone_number(name: str) -> str:
    """Get my phone number."""
    if name == "Jerry":
        return "1234567890"
    elif name == "Logan":
        return "0987654321"
    else:
        return "Unknown"

tools = [FunctionTool.from_defaults(fn=get_phone_number)]

async def main():
    audio_handler = AudioHandler()
    input_handler = InputHandler()
    input_handler.loop = asyncio.get_running_loop()

    client = RealtimeClient(
        api_key=os.environ.get("OPENAI_API_KEY"),
        on_text_delta=lambda text: print(f"\nAssistant: {text}", end="", flush=True),
        on_audio_delta=lambda audio: audio_handler.play_audio(audio),
        on_interrupt=lambda: audio_handler.stop_playback_immediately(),
        turn_detection_mode=TurnDetectionMode.SERVER_VAD,
        tools=tools,
    )

    # Start keyboard listener in a separate thread
    listener = keyboard.Listener(on_press=input_handler.on_press)
    listener.start()

    try:
        await client.connect()
        message_handler = asyncio.create_task(client.handle_messages())

        print("Connected to OpenAI Realtime API!")
        print("Audio streaming will start automatically.")
        print("Press 'q' to quit")
        print("")

        # Start continuous audio streaming
        streaming_task = asyncio.create_task(audio_handler.start_streaming(client))

        # Simple input loop for quit command
        while True:
            command, _ = await input_handler.command_queue.get()

            if command == 'q':
                break

    except Exception as e:
        print(f"Error: {e}")
    finally:
        audio_handler.stop_streaming()
        audio_handler.cleanup()
        await client.close()

if __name__ == "__main__":
    print("Starting Realtime API CLI with Server VAD...")
    asyncio.run(main())
--------------------------------------------------------------------------------
/openai_realtime_client/__init__.py:
--------------------------------------------------------------------------------
from .client.realtime_client import RealtimeClient, TurnDetectionMode
from .handlers.audio_handler import AudioHandler
from .handlers.input_handler import InputHandler

__all__ = ["RealtimeClient", "TurnDetectionMode", "AudioHandler", "InputHandler"]
--------------------------------------------------------------------------------
/openai_realtime_client/client/__init__.py:
--------------------------------------------------------------------------------
from .realtime_client import RealtimeClient

__all__ = ["RealtimeClient"]
--------------------------------------------------------------------------------
/openai_realtime_client/client/realtime_client.py:
--------------------------------------------------------------------------------
import asyncio
import websockets
import json
import base64
import io

from typing import Optional, Callable, List, Dict, Any
from enum import Enum
from pydub import AudioSegment

from llama_index.core.tools import BaseTool, AsyncBaseTool, ToolSelection, adapt_to_async_tool, call_tool_with_selection


class TurnDetectionMode(Enum):
    SERVER_VAD = "server_vad"
    MANUAL = "manual"

class RealtimeClient:
    """
    A client for interacting with the OpenAI Realtime API.

    This class provides methods to connect to the Realtime API, send text and audio data,
    handle responses, and manage the WebSocket connection.

    Attributes:
        api_key (str):
            The API key for authentication.
        model (str):
            The model to use for text and audio processing.
        voice (str):
            The voice to use for audio output.
        instructions (str):
            The instructions for the chatbot.
        temperature (float):
            The sampling temperature for responses.
        turn_detection_mode (TurnDetectionMode):
            The mode for turn detection.
        tools (List[BaseTool]):
            The tools to use for function calling.
        on_text_delta (Callable[[str], None]):
            Callback for text delta events.
            Takes in a string and returns nothing.
        on_audio_delta (Callable[[bytes], None]):
            Callback for audio delta events.
            Takes in bytes and returns nothing.
        on_interrupt (Callable[[], None]):
            Callback for user interrupt events, should be used to stop audio playback.
        on_input_transcript (Callable[[str], None]):
            Callback for input transcript events.
            Takes in a string and returns nothing.
        on_output_transcript (Callable[[str], None]):
            Callback for output transcript events.
            Takes in a string and returns nothing.
        extra_event_handlers (Dict[str, Callable[[Dict[str, Any]], None]]):
            Additional event handlers.
            Is a mapping of event names to functions that process the event payload.
    """
    def __init__(
        self,
        api_key: str,
        model: str = "gpt-4o-realtime-preview-2024-10-01",
        voice: str = "alloy",
        instructions: str = "You are a helpful assistant",
        temperature: float = 0.8,
        turn_detection_mode: TurnDetectionMode = TurnDetectionMode.MANUAL,
        tools: Optional[List[BaseTool]] = None,
        on_text_delta: Optional[Callable[[str], None]] = None,
        on_audio_delta: Optional[Callable[[bytes], None]] = None,
        on_interrupt: Optional[Callable[[], None]] = None,
        on_input_transcript: Optional[Callable[[str], None]] = None,
        on_output_transcript: Optional[Callable[[str], None]] = None,
        extra_event_handlers: Optional[Dict[str, Callable[[Dict[str, Any]], None]]] = None
    ):
        self.api_key = api_key
        self.model = model
        self.voice = voice
        self.ws = None
        self.on_text_delta = on_text_delta
        self.on_audio_delta = on_audio_delta
        self.on_interrupt = on_interrupt
        self.on_input_transcript = on_input_transcript
        self.on_output_transcript = on_output_transcript
        self.instructions = instructions
        self.temperature = temperature
        self.base_url = "wss://api.openai.com/v1/realtime"
        self.extra_event_handlers = extra_event_handlers or {}
        self.turn_detection_mode = turn_detection_mode

        tools = tools or []
        for i, tool in enumerate(tools):
            tools[i] = adapt_to_async_tool(tool)
        self.tools: List[AsyncBaseTool] = tools

        # Track current response state
        self._current_response_id = None
        self._current_item_id = None
        self._is_responding = False
        # Track printing state for input and output transcripts
        self._print_input_transcript = False
        self._output_transcript_buffer = ""

    async def connect(self) -> None:
        """Establish WebSocket connection with the Realtime API."""
        url = f"{self.base_url}?model={self.model}"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "OpenAI-Beta": "realtime=v1"
        }

        self.ws = await websockets.connect(url, extra_headers=headers)

        # Set up default session configuration
        tools = [t.metadata.to_openai_tool()['function'] for t in self.tools]
        for t in tools:
            t['type'] = 'function'  # TODO: OpenAI docs didn't say this was needed, but it was

        if self.turn_detection_mode == TurnDetectionMode.MANUAL:
            await self.update_session({
                "modalities": ["text", "audio"],
                "instructions": self.instructions,
                "voice": self.voice,
                "input_audio_format": "pcm16",
                "output_audio_format": "pcm16",
                "input_audio_transcription": {
                    "model": "whisper-1"
                },
                "tools": tools,
                "tool_choice": "auto",
                "temperature": self.temperature,
            })
        elif self.turn_detection_mode == TurnDetectionMode.SERVER_VAD:
            await self.update_session({
                "modalities": ["text", "audio"],
                "instructions": self.instructions,
                "voice": self.voice,
                "input_audio_format": "pcm16",
                "output_audio_format": "pcm16",
                "input_audio_transcription": {
                    "model": "whisper-1"
                },
                "turn_detection": {
                    "type": "server_vad",
                    "threshold": 0.5,
                    "prefix_padding_ms": 500,
                    "silence_duration_ms": 200
                },
                "tools": tools,
                "tool_choice": "auto",
                "temperature": self.temperature,
            })
        else:
            raise ValueError(f"Invalid turn detection mode: {self.turn_detection_mode}")

    async def update_session(self, config: Dict[str, Any]) -> None:
        """Update session configuration."""
        event = {
            "type": "session.update",
            "session": config
        }
        await self.ws.send(json.dumps(event))

    async def send_text(self, text: str) -> None:
        """Send text message to the API."""
        event = {
            "type": "conversation.item.create",
            "item": {
                "type": "message",
                "role": "user",
                "content": [{
                    "type": "input_text",
                    "text": text
                }]
            }
        }
        await self.ws.send(json.dumps(event))
        await self.create_response()

    async def send_audio(self, audio_bytes: bytes) -> None:
        """Send audio data to the API."""
        # Convert audio to required format (24kHz, mono, PCM16)
        audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
        audio = audio.set_frame_rate(24000).set_channels(1).set_sample_width(2)
        pcm_data = base64.b64encode(audio.raw_data).decode()

        # Append audio to buffer
        append_event = {
            "type": "input_audio_buffer.append",
            "audio": pcm_data
        }
        await self.ws.send(json.dumps(append_event))

        # Commit the buffer
        commit_event = {
            "type": "input_audio_buffer.commit"
        }
        await self.ws.send(json.dumps(commit_event))

        # In manual mode, we need to explicitly request a response
        if self.turn_detection_mode == TurnDetectionMode.MANUAL:
            await self.create_response()

    async def stream_audio(self, audio_chunk: bytes) -> None:
        """Stream raw audio data to the API."""
        audio_b64 = base64.b64encode(audio_chunk).decode()

        append_event = {
            "type": "input_audio_buffer.append",
            "audio": audio_b64
        }
        await self.ws.send(json.dumps(append_event))

    async def create_response(self, functions: Optional[List[Dict[str, Any]]] = None) -> None:
        """Request a response from the API. Needed when using manual mode."""
        event = {
            "type": "response.create",
            "response": {
                "modalities": ["text", "audio"]
            }
        }
        if functions:
            event["response"]["tools"] = functions

        await self.ws.send(json.dumps(event))

    async def send_function_result(self, call_id: str, result: Any) -> None:
        """Send function call result back to the API."""
        event = {
            "type": "conversation.item.create",
            "item": {
                "type": "function_call_output",
                "call_id": call_id,
                "output": result
            }
        }
        await self.ws.send(json.dumps(event))

        # Function results need a manually requested response
        await self.create_response()

    async def cancel_response(self) -> None:
        """Cancel the current response."""
        event = {
            "type": "response.cancel"
        }
        await self.ws.send(json.dumps(event))

    async def truncate_response(self):
        """Truncate the conversation item to match what was actually played."""
        if self._current_item_id:
            event = {
                "type": "conversation.item.truncate",
                "item_id": self._current_item_id
            }
            await self.ws.send(json.dumps(event))

    async def call_tool(self, call_id: str, tool_name: str, tool_arguments: Dict[str, Any]) -> None:
        tool_selection = ToolSelection(
            tool_id="tool_id",
            tool_name=tool_name,
            tool_kwargs=tool_arguments
        )

        # Avoid blocking the event loop with sync tools
        # by using asyncio.to_thread
        tool_result = await asyncio.to_thread(
            call_tool_with_selection,
            tool_selection,
            self.tools,
            verbose=True
        )
        await self.send_function_result(call_id, str(tool_result))

    async def handle_interruption(self):
        """Handle user interruption of the current response."""
        if not self._is_responding:
            return

        print("\n[Handling interruption]")

        # 1. Cancel the current response
        if self._current_response_id:
            await self.cancel_response()

        # 2. Truncate the conversation item to what was actually played
        if self._current_item_id:
            await self.truncate_response()

        self._is_responding = False
        self._current_response_id = None
        self._current_item_id = None

    async def handle_messages(self) -> None:
        try:
            async for message in self.ws:
                event = json.loads(message)
                event_type = event.get("type")

                if event_type == "error":
                    print(f"Error: {event['error']}")
                    continue

                # Track response state
                elif event_type == "response.created":
                    self._current_response_id = event.get("response", {}).get("id")
                    self._is_responding = True

                elif event_type == "response.output_item.added":
                    self._current_item_id = event.get("item", {}).get("id")

                elif event_type == "response.done":
                    self._is_responding = False
                    self._current_response_id = None
                    self._current_item_id = None

                # Handle interruptions
                elif event_type == "input_audio_buffer.speech_started":
                    print("\n[Speech detected]")
                    if self._is_responding:
                        await self.handle_interruption()

                    if self.on_interrupt:
                        self.on_interrupt()

                elif event_type == "input_audio_buffer.speech_stopped":
                    print("\n[Speech ended]")

                # Handle normal response events
                elif event_type == "response.text.delta":
                    if self.on_text_delta:
                        self.on_text_delta(event["delta"])

                elif event_type == "response.audio.delta":
                    if self.on_audio_delta:
                        audio_bytes = base64.b64decode(event["delta"])
                        self.on_audio_delta(audio_bytes)

                elif event_type == "response.function_call_arguments.done":
                    await self.call_tool(event["call_id"], event['name'], json.loads(event['arguments']))

                # Handle input audio transcription
                elif event_type == "conversation.item.input_audio_transcription.completed":
                    transcript = event.get("transcript", "")

                    if self.on_input_transcript:
                        await asyncio.to_thread(self.on_input_transcript, transcript)
                        self._print_input_transcript = True

                # Handle output audio transcription
                elif event_type == "response.audio_transcript.delta":
                    if self.on_output_transcript:
                        delta = event.get("delta", "")
                        if not self._print_input_transcript:
                            self._output_transcript_buffer += delta
                        else:
                            if self._output_transcript_buffer:
                                await asyncio.to_thread(self.on_output_transcript, self._output_transcript_buffer)
                                self._output_transcript_buffer = ""
                            await asyncio.to_thread(self.on_output_transcript, delta)

                elif event_type == "response.audio_transcript.done":
                    self._print_input_transcript = False

                elif event_type in self.extra_event_handlers:
                    self.extra_event_handlers[event_type](event)

        except websockets.exceptions.ConnectionClosed:
            print("Connection closed")
        except Exception as e:
            print(f"Error in message handling: {str(e)}")

    async def close(self) -> None:
        """Close the WebSocket connection."""
        if self.ws:
            await self.ws.close()
--------------------------------------------------------------------------------
/openai_realtime_client/handlers/__init__.py:
--------------------------------------------------------------------------------
from .input_handler import InputHandler
from .audio_handler import AudioHandler

__all__ = ["InputHandler", "AudioHandler"]
"AudioHandler"] -------------------------------------------------------------------------------- /openai_realtime_client/handlers/audio_handler.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import pyaudio 3 | import wave 4 | import queue 5 | import io 6 | from typing import Optional 7 | 8 | from pydub import AudioSegment 9 | import threading 10 | 11 | from ..client.realtime_client import RealtimeClient 12 | 13 | 14 | class AudioHandler: 15 | """ 16 | Handles audio input and output for the chatbot. 17 | 18 | Uses PyAudio for audio input and output, and runs a separate thread for recording and playing audio. 19 | 20 | When playing audio, it uses a buffer to store audio data and plays it continuously to ensure smooth playback. 21 | 22 | Attributes: 23 | format (int): The audio format (paInt16). 24 | channels (int): The number of audio channels (1). 25 | rate (int): The sample rate (24000). 26 | chunk (int): The size of the audio buffer (1024). 27 | audio (pyaudio.PyAudio): The PyAudio object. 28 | recording_stream (pyaudio.Stream): The stream for recording audio. 29 | recording_thread (threading.Thread): The thread for recording audio. 30 | recording (bool): Whether the audio is currently being recorded. 31 | streaming (bool): Whether the audio is currently being streamed. 32 | stream (pyaudio.Stream): The stream for streaming audio. 33 | playback_stream (pyaudio.Stream): The stream for playing audio. 34 | playback_buffer (queue.Queue): The buffer for playing audio. 35 | stop_playback (bool): Whether the audio playback should be stopped. 36 | """ 37 | def __init__(self): 38 | # Audio parameters 39 | self.format = pyaudio.paInt16 40 | self.channels = 1 41 | self.rate = 24000 42 | self.chunk = 1024 43 | 44 | self.audio = pyaudio.PyAudio() 45 | 46 | # Recording params 47 | self.recording_stream: Optional[pyaudio.Stream] = None 48 | self.recording_thread = None 49 | self.recording = False 50 | 51 | # streaming params 52 | self.streaming = False 53 | self.stream = None 54 | 55 | # Playback params 56 | self.playback_stream = None 57 | self.playback_buffer = queue.Queue(maxsize=20) 58 | self.playback_event = threading.Event() 59 | self.playback_thread = None 60 | self.stop_playback = False 61 | 62 | def start_recording(self) -> bytes: 63 | """Start recording audio from microphone and return bytes""" 64 | if self.recording: 65 | return b'' 66 | 67 | self.recording = True 68 | self.recording_stream = self.audio.open( 69 | format=self.format, 70 | channels=self.channels, 71 | rate=self.rate, 72 | input=True, 73 | frames_per_buffer=self.chunk 74 | ) 75 | 76 | print("\nRecording... 

        self.frames = []
        self.recording_thread = threading.Thread(target=self._record)
        self.recording_thread.start()

        return b''  # Return empty bytes, we'll send audio later

    def _record(self):
        while self.recording:
            try:
                data = self.recording_stream.read(self.chunk)
                self.frames.append(data)
            except Exception as e:
                print(f"Error recording: {e}")
                break

    def stop_recording(self) -> bytes:
        """Stop recording and return the recorded audio as bytes"""
        if not self.recording:
            return b''

        self.recording = False
        if self.recording_thread:
            self.recording_thread.join()

        # Clean up recording stream
        if self.recording_stream:
            self.recording_stream.stop_stream()
            self.recording_stream.close()
            self.recording_stream = None

        # Convert frames to WAV format in memory
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wf:
            wf.setnchannels(self.channels)
            wf.setsampwidth(self.audio.get_sample_size(self.format))
            wf.setframerate(self.rate)
            wf.writeframes(b''.join(self.frames))

        # Get the WAV data
        wav_buffer.seek(0)
        return wav_buffer.read()

    async def start_streaming(self, client: RealtimeClient):
        """Start continuous audio streaming."""
        if self.streaming:
            return

        self.streaming = True
        self.stream = self.audio.open(
            format=self.format,
            channels=self.channels,
            rate=self.rate,
            input=True,
            frames_per_buffer=self.chunk
        )

        print("\nStreaming audio... Press 'q' to stop.")

        while self.streaming:
            try:
                # Read raw PCM data
                data = self.stream.read(self.chunk, exception_on_overflow=False)
                # Stream directly without trying to decode
                await client.stream_audio(data)
            except Exception as e:
                print(f"Error streaming: {e}")
                break
            await asyncio.sleep(0.01)

    def stop_streaming(self):
        """Stop audio streaming."""
        self.streaming = False
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None

    def play_audio(self, audio_data: bytes):
        """Add audio data to the buffer"""
        try:
            self.playback_buffer.put_nowait(audio_data)
        except queue.Full:
            # If the buffer is full, remove the oldest chunk and add the new one
            self.playback_buffer.get_nowait()
            self.playback_buffer.put_nowait(audio_data)

        if not self.playback_thread or not self.playback_thread.is_alive():
            self.stop_playback = False
            self.playback_event.clear()
            self.playback_thread = threading.Thread(target=self._continuous_playback)
            self.playback_thread.start()

    def _continuous_playback(self):
        """Continuously play audio from the buffer"""
        self.playback_stream = self.audio.open(
            format=self.format,
            channels=self.channels,
            rate=self.rate,
            output=True,
            frames_per_buffer=self.chunk
        )

        while not self.stop_playback:
            try:
                audio_chunk = self.playback_buffer.get(timeout=0.1)
                self._play_audio_chunk(audio_chunk)
            except queue.Empty:
                continue

            if self.playback_event.is_set():
                break

        if self.playback_stream:
            self.playback_stream.stop_stream()
            self.playback_stream.close()
            self.playback_stream = None

    def _play_audio_chunk(self, audio_chunk):
        try:
            # Convert the audio chunk to the correct format
            audio_segment = AudioSegment(
                audio_chunk,
                sample_width=2,
                frame_rate=24000,
                channels=1
            )

            # Ensure the audio is in the correct format for playback
            audio_data = audio_segment.raw_data

            # Play the audio chunk in smaller portions to allow for quicker interruption
            chunk_size = 1024  # Adjust this value as needed
            for i in range(0, len(audio_data), chunk_size):
                if self.playback_event.is_set():
                    break
                chunk = audio_data[i:i+chunk_size]
                self.playback_stream.write(chunk)
        except Exception as e:
            print(f"Error playing audio chunk: {e}")

    def stop_playback_immediately(self):
        """Stop audio playback immediately."""
        self.stop_playback = True
        self.playback_buffer.queue.clear()  # Clear any pending audio
        self.currently_playing = False
        self.playback_event.set()

    def cleanup(self):
        """Clean up audio resources"""
        self.stop_playback_immediately()

        self.stop_playback = True
        if self.playback_thread:
            self.playback_thread.join()

        self.recording = False
        if self.recording_stream:
            self.recording_stream.stop_stream()
            self.recording_stream.close()

        if self.stream:
            self.stream.stop_stream()
            self.stream.close()

        self.audio.terminate()
--------------------------------------------------------------------------------
/openai_realtime_client/handlers/input_handler.py:
--------------------------------------------------------------------------------
import asyncio
from pynput import keyboard


class InputHandler:
    """
    Handles keyboard input for the chatbot.

    This class is responsible for capturing keyboard input and translating it into commands for the chatbot.

    Attributes:
        text_input (str): The current text input from the user.
        text_ready (asyncio.Event): An event that is set when the user has finished typing.
        command_queue (asyncio.Queue): A queue that stores commands for the chatbot.
        loop (asyncio.AbstractEventLoop): The event loop for the input handler.
    """
    def __init__(self):
        self.text_input = ""
        self.text_ready = asyncio.Event()
        self.command_queue = asyncio.Queue()
        self.loop = None

    def on_press(self, key):
        try:
            if key == keyboard.Key.space:
                self.loop.call_soon_threadsafe(
                    self.command_queue.put_nowait, ('space', None)
                )
            elif key == keyboard.Key.enter:
                self.loop.call_soon_threadsafe(
                    self.command_queue.put_nowait, ('enter', self.text_input)
                )
                self.text_input = ""
            elif key == keyboard.Key.backspace:
                # Backspace is a special key with no `char` attribute,
                # so it must be handled before the `hasattr` check below
                self.text_input = self.text_input[:-1]
            elif key == keyboard.KeyCode.from_char('r'):
                self.loop.call_soon_threadsafe(
                    self.command_queue.put_nowait, ('r', None)
                )
            elif key == keyboard.KeyCode.from_char('q'):
                self.loop.call_soon_threadsafe(
                    self.command_queue.put_nowait, ('q', None)
                )
            elif hasattr(key, 'char'):
                self.text_input += key.char
        except AttributeError:
            pass
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[tool.poetry]
name = "openai-realtime-client"
version = "0.1.0"
description = "A Python-based client for OpenAI's Realtime API"
authors = ["Logan Markewich "]
license = "MIT"
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.10"
llama-index-core = "^0.11.17"
pyaudio = "^0.2.14"
pynput = "^1.7.7"
pydub = "^0.25.1"
websockets = "^13.1"
wave = "^0.0.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
--------------------------------------------------------------------------------