├── .gitignore
├── LICENSE
├── README.md
├── examples
│   ├── manual_cli.py
│   └── streaming_cli.py
├── openai_realtime_client
│   ├── __init__.py
│   ├── client
│   │   ├── __init__.py
│   │   └── realtime_client.py
│   └── handlers
│       ├── __init__.py
│       ├── audio_handler.py
│       └── input_handler.py
├── poetry.lock
└── pyproject.toml

/.gitignore:
--------------------------------------------------------------------------------
*.pyc
__pycache__
dist
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 LlamaIndex

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# OpenAI Realtime API Client for Python

This is an experimental OpenAI Realtime API client for Python and LlamaIndex. It integrates with LlamaIndex's tools, allowing you to quickly build custom voice assistants.

It includes two examples that run directly in the terminal -- one using manual turn detection and one using server VAD mode (i.e. allowing you to interrupt the chatbot).

## Installation

Install system deps:

```bash
brew install ffmpeg portaudio
```

Install Python deps:

```bash
pip install openai-realtime-client

# Optional: clone the repo and run the examples locally
git clone https://github.com/run-llama/openai_realtime_client.git
cd openai_realtime_client
```

Set your OpenAI API key:

```bash
export OPENAI_API_KEY="sk-..."
```

## Usage

Assuming you installed the package and cloned the repo (or copy-pasted the examples), you can run the examples immediately.

Run the interactive CLI with manual turn detection (try asking for your phone number to see function calling in action):

```bash
python ./examples/manual_cli.py
```

Or use streaming mode, which allows you to interrupt the chatbot:

```bash
python ./examples/streaming_cli.py
```

**NOTE:** Streaming mode can be a little janky; it's best to use headphones in a quiet environment.

Take a look at the examples, add your own tools, and build something amazing!
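
## Programmatic usage

You can also drive the client directly from Python instead of running the bundled CLIs. Below is a minimal, text-only sketch (no microphone or speakers) built on the same `RealtimeClient` API the examples use; the five-second sleep is just a crude stand-in for real response handling:

```python
import asyncio
import os

from openai_realtime_client import RealtimeClient


async def main():
    client = RealtimeClient(
        api_key=os.environ.get("OPENAI_API_KEY"),
        # Sessions are text+audio by default, so print both the text
        # deltas and the audio transcript deltas as they stream in
        on_text_delta=lambda text: print(text, end="", flush=True),
        on_output_transcript=lambda text: print(text, end="", flush=True),
    )
    await client.connect()

    # Process incoming server events in the background
    message_handler = asyncio.create_task(client.handle_messages())

    # Send a text message; the reply streams through the callbacks above
    await client.send_text("Hello! What can you do?")
    await asyncio.sleep(5)  # crude: give the response time to arrive

    message_handler.cancel()
    await client.close()


asyncio.run(main())
```

For voice input and output, wire in `AudioHandler` (and `InputHandler` for keyboard control) as the examples do, and pass `turn_detection_mode=TurnDetectionMode.SERVER_VAD` if you want the server to decide when you've finished speaking. You can also pass `extra_event_handlers` (a dict mapping raw event type strings to callbacks) to react to any server events the built-in callbacks don't cover.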
--------------------------------------------------------------------------------
/examples/manual_cli.py:
--------------------------------------------------------------------------------
import asyncio
import os

from pynput import keyboard
from openai_realtime_client import RealtimeClient, InputHandler, AudioHandler
from llama_index.core.tools import FunctionTool

# Add your own tools here!
# NOTE: FunctionTool parses the docstring to get the description; the tool name is the function name
def get_phone_number(name: str) -> str:
    """Get my phone number."""
    if name == "Jerry":
        return "1234567890"
    elif name == "Logan":
        return "0987654321"
    else:
        return "Unknown"

tools = [FunctionTool.from_defaults(fn=get_phone_number)]

async def main():
    # Initialize handlers
    audio_handler = AudioHandler()
    input_handler = InputHandler()
    input_handler.loop = asyncio.get_running_loop()

    # Initialize the realtime client
    client = RealtimeClient(
        api_key=os.environ.get("OPENAI_API_KEY"),
        on_text_delta=lambda text: print(f"\nAssistant: {text}", end="", flush=True),
        on_audio_delta=lambda audio: audio_handler.play_audio(audio),
        on_input_transcript=lambda transcript: print(f"\nYou said: {transcript}\nAssistant: ", end="", flush=True),
        on_output_transcript=lambda transcript: print(f"{transcript}", end="", flush=True),
        tools=tools,
    )

    # Start keyboard listener in a separate thread
    listener = keyboard.Listener(on_press=input_handler.on_press)
    listener.start()

    try:
        # Connect to the API
        await client.connect()

        # Start message handling in the background
        message_handler = asyncio.create_task(client.handle_messages())

        print("Connected to OpenAI Realtime API!")
        print("Commands:")
        print("- Type your message and press Enter to send text")
        print("- Press 'r' to start recording audio")
        print("- Press 'space' to stop recording")
        print("- Press 'q' to quit")
        print("")

        while True:
            # Wait for commands from the input handler
            command, data = await input_handler.command_queue.get()

            if command == 'q':
                break
            elif command == 'r':
                # Start recording
                audio_handler.start_recording()
            elif command == 'space':
                print("[About to stop recording]")
                if audio_handler.recording:
                    # Stop recording and get audio data
                    audio_data = audio_handler.stop_recording()
                    print("[Recording stopped]")
                    if audio_data:
                        await client.send_audio(audio_data)
                        print("[Audio sent]")
            elif command == 'enter' and data:
                # Send text message
                await client.send_text(data)

            await asyncio.sleep(0.01)
    except Exception as e:
        print(f"Error: {e}")
    finally:
        # Clean up
        listener.stop()
        audio_handler.cleanup()
        await client.close()

if __name__ == "__main__":
    # Install required packages:
    # pip install pyaudio pynput pydub websockets

    print("Starting Realtime API CLI...")
    asyncio.run(main())
--------------------------------------------------------------------------------
/examples/streaming_cli.py:
--------------------------------------------------------------------------------
import asyncio
import os

from pynput import keyboard
from openai_realtime_client import RealtimeClient, AudioHandler, InputHandler, TurnDetectionMode
from llama_index.core.tools import FunctionTool

# Add your own tools here!
# NOTE: FunctionTool parses the docstring to get the description; the tool name is the function name
def get_phone_number(name: str) -> str:
    """Get my phone number."""
    if name == "Jerry":
        return "1234567890"
    elif name == "Logan":
        return "0987654321"
    else:
        return "Unknown"

tools = [FunctionTool.from_defaults(fn=get_phone_number)]

async def main():
    audio_handler = AudioHandler()
    input_handler = InputHandler()
    input_handler.loop = asyncio.get_running_loop()

    client = RealtimeClient(
        api_key=os.environ.get("OPENAI_API_KEY"),
        on_text_delta=lambda text: print(f"\nAssistant: {text}", end="", flush=True),
        on_audio_delta=lambda audio: audio_handler.play_audio(audio),
        on_interrupt=lambda: audio_handler.stop_playback_immediately(),
        turn_detection_mode=TurnDetectionMode.SERVER_VAD,
        tools=tools,
    )

    # Start keyboard listener in a separate thread
    listener = keyboard.Listener(on_press=input_handler.on_press)
    listener.start()

    try:
        await client.connect()
        message_handler = asyncio.create_task(client.handle_messages())

        print("Connected to OpenAI Realtime API!")
        print("Audio streaming will start automatically.")
        print("Press 'q' to quit")
        print("")

        # Start continuous audio streaming
        streaming_task = asyncio.create_task(audio_handler.start_streaming(client))

        # Simple input loop for quit command
        while True:
            command, _ = await input_handler.command_queue.get()

            if command == 'q':
                break

    except Exception as e:
        print(f"Error: {e}")
    finally:
        audio_handler.stop_streaming()
        audio_handler.cleanup()
        await client.close()

if __name__ == "__main__":
    print("Starting Realtime API CLI with Server VAD...")
    asyncio.run(main())
--------------------------------------------------------------------------------
/openai_realtime_client/__init__.py:
--------------------------------------------------------------------------------
from .client.realtime_client import RealtimeClient, TurnDetectionMode
from .handlers.audio_handler import AudioHandler
from .handlers.input_handler import InputHandler

__all__ = ["RealtimeClient", "TurnDetectionMode", "AudioHandler", "InputHandler"]
--------------------------------------------------------------------------------
/openai_realtime_client/client/__init__.py:
--------------------------------------------------------------------------------
from .realtime_client import RealtimeClient

__all__ = ["RealtimeClient"]
--------------------------------------------------------------------------------
/openai_realtime_client/client/realtime_client.py:
--------------------------------------------------------------------------------
import asyncio
import websockets
import json
import base64
import io

from typing import Optional, Callable, List, Dict, Any
from enum import Enum
from pydub import AudioSegment

from llama_index.core.tools import BaseTool, AsyncBaseTool, ToolSelection, adapt_to_async_tool, call_tool_with_selection


class TurnDetectionMode(Enum):
    SERVER_VAD = "server_vad"
    MANUAL = "manual"

class RealtimeClient:
    """
    A client for interacting with the OpenAI Realtime API.

    This class provides methods to connect to the Realtime API, send text and audio data,
    handle responses, and manage the WebSocket connection.

    Attributes:
        api_key (str):
            The API key for authentication.
        model (str):
            The model to use for text and audio processing.
        voice (str):
            The voice to use for audio output.
        instructions (str):
            The instructions for the chatbot.
        temperature (float):
            The sampling temperature for responses.
        turn_detection_mode (TurnDetectionMode):
            The mode for turn detection.
        tools (List[BaseTool]):
            The tools to use for function calling.
        on_text_delta (Callable[[str], None]):
            Callback for text delta events.
            Takes in a string and returns nothing.
        on_audio_delta (Callable[[bytes], None]):
            Callback for audio delta events.
            Takes in bytes and returns nothing.
        on_interrupt (Callable[[], None]):
            Callback for user interrupt events, should be used to stop audio playback.
        on_input_transcript (Callable[[str], None]):
            Callback for input transcript events.
            Takes in a string and returns nothing.
        on_output_transcript (Callable[[str], None]):
            Callback for output transcript events.
            Takes in a string and returns nothing.
        extra_event_handlers (Dict[str, Callable[[Dict[str, Any]], None]]):
            Additional event handlers.
            Is a mapping of event names to functions that process the event payload.
    """
    def __init__(
        self,
        api_key: str,
        model: str = "gpt-4o-realtime-preview-2024-10-01",
        voice: str = "alloy",
        instructions: str = "You are a helpful assistant",
        temperature: float = 0.8,
        turn_detection_mode: TurnDetectionMode = TurnDetectionMode.MANUAL,
        tools: Optional[List[BaseTool]] = None,
        on_text_delta: Optional[Callable[[str], None]] = None,
        on_audio_delta: Optional[Callable[[bytes], None]] = None,
        on_interrupt: Optional[Callable[[], None]] = None,
        on_input_transcript: Optional[Callable[[str], None]] = None,
        on_output_transcript: Optional[Callable[[str], None]] = None,
        extra_event_handlers: Optional[Dict[str, Callable[[Dict[str, Any]], None]]] = None
    ):
        self.api_key = api_key
        self.model = model
        self.voice = voice
        self.ws = None
        self.on_text_delta = on_text_delta
        self.on_audio_delta = on_audio_delta
        self.on_interrupt = on_interrupt
        self.on_input_transcript = on_input_transcript
        self.on_output_transcript = on_output_transcript
        self.instructions = instructions
        self.temperature = temperature
        self.base_url = "wss://api.openai.com/v1/realtime"
        self.extra_event_handlers = extra_event_handlers or {}
        self.turn_detection_mode = turn_detection_mode

        tools = tools or []
        for i, tool in enumerate(tools):
            tools[i] = adapt_to_async_tool(tool)
        self.tools: List[AsyncBaseTool] = tools

        # Track current response state
        self._current_response_id = None
        self._current_item_id = None
        self._is_responding = False
        # Track printing state for input and output transcripts
        self._print_input_transcript = False
        self._output_transcript_buffer = ""

    async def connect(self) -> None:
        """Establish WebSocket connection with the Realtime API."""
        url = f"{self.base_url}?model={self.model}"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "OpenAI-Beta": "realtime=v1"
        }

        self.ws = await websockets.connect(url, extra_headers=headers)

        # Set up default session configuration
        tools = [t.metadata.to_openai_tool()['function'] for t in self.tools]
        for t in tools:
            t['type'] = 'function'  # TODO: OpenAI docs didn't say this was needed, but it was

        if self.turn_detection_mode == TurnDetectionMode.MANUAL:
            await self.update_session({
                "modalities": ["text", "audio"],
                "instructions": self.instructions,
                "voice": self.voice,
                "input_audio_format": "pcm16",
                "output_audio_format": "pcm16",
                "input_audio_transcription": {
                    "model": "whisper-1"
                },
                "tools": tools,
                "tool_choice": "auto",
                "temperature": self.temperature,
            })
        elif self.turn_detection_mode == TurnDetectionMode.SERVER_VAD:
            await self.update_session({
                "modalities": ["text", "audio"],
                "instructions": self.instructions,
                "voice": self.voice,
                "input_audio_format": "pcm16",
                "output_audio_format": "pcm16",
                "input_audio_transcription": {
                    "model": "whisper-1"
                },
                "turn_detection": {
                    "type": "server_vad",
                    "threshold": 0.5,
                    "prefix_padding_ms": 500,
                    "silence_duration_ms": 200
                },
                "tools": tools,
                "tool_choice": "auto",
                "temperature": self.temperature,
            })
        else:
            raise ValueError(f"Invalid turn detection mode: {self.turn_detection_mode}")

    async def update_session(self, config: Dict[str, Any]) -> None:
        """Update session configuration."""
        event = {
            "type": "session.update",
            "session": config
        }
        await self.ws.send(json.dumps(event))

    async def send_text(self, text: str) -> None:
        """Send text message to the API."""
        event = {
            "type": "conversation.item.create",
            "item": {
                "type": "message",
                "role": "user",
                "content": [{
                    "type": "input_text",
                    "text": text
                }]
            }
        }
        await self.ws.send(json.dumps(event))
        await self.create_response()

    async def send_audio(self, audio_bytes: bytes) -> None:
        """Send audio data to the API."""
        # Convert audio to required format (24kHz, mono, PCM16)
        audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
        audio = audio.set_frame_rate(24000).set_channels(1).set_sample_width(2)
        pcm_data = base64.b64encode(audio.raw_data).decode()

        # Append audio to buffer
        append_event = {
            "type": "input_audio_buffer.append",
            "audio": pcm_data
        }
        await self.ws.send(json.dumps(append_event))

        # Commit the buffer
        commit_event = {
            "type": "input_audio_buffer.commit"
        }
        await self.ws.send(json.dumps(commit_event))

        # In manual mode, we need to explicitly request a response
        if self.turn_detection_mode == TurnDetectionMode.MANUAL:
            await self.create_response()

    async def stream_audio(self, audio_chunk: bytes) -> None:
        """Stream raw audio data to the API."""
        audio_b64 = base64.b64encode(audio_chunk).decode()

        append_event = {
            "type": "input_audio_buffer.append",
            "audio": audio_b64
        }
        await self.ws.send(json.dumps(append_event))

    async def create_response(self, functions: Optional[List[Dict[str, Any]]] = None) -> None:
        """Request a response from the API. Needed when using manual mode."""
        event = {
            "type": "response.create",
            "response": {
                "modalities": ["text", "audio"]
            }
        }
        if functions:
            event["response"]["tools"] = functions

        await self.ws.send(json.dumps(event))

    async def send_function_result(self, call_id: str, result: Any) -> None:
        """Send function call result back to the API."""
        event = {
            "type": "conversation.item.create",
            "item": {
                "type": "function_call_output",
                "call_id": call_id,
                "output": result
            }
        }
        await self.ws.send(json.dumps(event))

        # Function results need a manually requested response
        await self.create_response()

    async def cancel_response(self) -> None:
        """Cancel the current response."""
        event = {
            "type": "response.cancel"
        }
        await self.ws.send(json.dumps(event))

    async def truncate_response(self):
        """Truncate the conversation item to match what was actually played."""
        if self._current_item_id:
            event = {
                "type": "conversation.item.truncate",
                "item_id": self._current_item_id
            }
            await self.ws.send(json.dumps(event))

    async def call_tool(self, call_id: str, tool_name: str, tool_arguments: Dict[str, Any]) -> None:
        tool_selection = ToolSelection(
            tool_id="tool_id",
            tool_name=tool_name,
            tool_kwargs=tool_arguments
        )

        # Avoid blocking the event loop with sync tools
        # by using asyncio.to_thread
        tool_result = await asyncio.to_thread(
            call_tool_with_selection,
            tool_selection,
            self.tools,
            verbose=True
        )
        await self.send_function_result(call_id, str(tool_result))

    async def handle_interruption(self):
        """Handle user interruption of the current response."""
        if not self._is_responding:
            return

        print("\n[Handling interruption]")

        # 1. Cancel the current response
        if self._current_response_id:
            await self.cancel_response()

        # 2. Truncate the conversation item to what was actually played
        if self._current_item_id:
            await self.truncate_response()

        self._is_responding = False
        self._current_response_id = None
        self._current_item_id = None

    async def handle_messages(self) -> None:
        try:
            async for message in self.ws:
                event = json.loads(message)
                event_type = event.get("type")

                if event_type == "error":
                    print(f"Error: {event['error']}")
                    continue

                # Track response state
                elif event_type == "response.created":
                    self._current_response_id = event.get("response", {}).get("id")
                    self._is_responding = True

                elif event_type == "response.output_item.added":
                    self._current_item_id = event.get("item", {}).get("id")

                elif event_type == "response.done":
                    self._is_responding = False
                    self._current_response_id = None
                    self._current_item_id = None

                # Handle interruptions
                elif event_type == "input_audio_buffer.speech_started":
                    print("\n[Speech detected]")
                    if self._is_responding:
                        await self.handle_interruption()

                    if self.on_interrupt:
                        self.on_interrupt()

                elif event_type == "input_audio_buffer.speech_stopped":
                    print("\n[Speech ended]")

                # Handle normal response events
                elif event_type == "response.text.delta":
                    if self.on_text_delta:
                        self.on_text_delta(event["delta"])

                elif event_type == "response.audio.delta":
                    if self.on_audio_delta:
                        audio_bytes = base64.b64decode(event["delta"])
                        self.on_audio_delta(audio_bytes)

                elif event_type == "response.function_call_arguments.done":
                    await self.call_tool(event["call_id"], event['name'], json.loads(event['arguments']))

                # Handle input audio transcription
                elif event_type == "conversation.item.input_audio_transcription.completed":
                    transcript = event.get("transcript", "")

                    if self.on_input_transcript:
                        await asyncio.to_thread(self.on_input_transcript, transcript)
                        self._print_input_transcript = True

                # Handle output audio transcription
                elif event_type == "response.audio_transcript.delta":
                    if self.on_output_transcript:
                        delta = event.get("delta", "")
                        if not self._print_input_transcript:
                            self._output_transcript_buffer += delta
                        else:
                            if self._output_transcript_buffer:
                                await asyncio.to_thread(self.on_output_transcript, self._output_transcript_buffer)
                                self._output_transcript_buffer = ""
                            await asyncio.to_thread(self.on_output_transcript, delta)

                elif event_type == "response.audio_transcript.done":
                    self._print_input_transcript = False

                elif event_type in self.extra_event_handlers:
                    self.extra_event_handlers[event_type](event)

        except websockets.exceptions.ConnectionClosed:
            print("Connection closed")
        except Exception as e:
            print(f"Error in message handling: {str(e)}")

    async def close(self) -> None:
        """Close the WebSocket connection."""
        if self.ws:
            await self.ws.close()
--------------------------------------------------------------------------------
/openai_realtime_client/handlers/__init__.py:
--------------------------------------------------------------------------------
from .input_handler import InputHandler
from .audio_handler import AudioHandler

__all__ = ["InputHandler", "AudioHandler"]
"AudioHandler"] -------------------------------------------------------------------------------- /openai_realtime_client/handlers/audio_handler.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import pyaudio 3 | import wave 4 | import queue 5 | import io 6 | from typing import Optional 7 | 8 | from pydub import AudioSegment 9 | import threading 10 | 11 | from ..client.realtime_client import RealtimeClient 12 | 13 | 14 | class AudioHandler: 15 | """ 16 | Handles audio input and output for the chatbot. 17 | 18 | Uses PyAudio for audio input and output, and runs a separate thread for recording and playing audio. 19 | 20 | When playing audio, it uses a buffer to store audio data and plays it continuously to ensure smooth playback. 21 | 22 | Attributes: 23 | format (int): The audio format (paInt16). 24 | channels (int): The number of audio channels (1). 25 | rate (int): The sample rate (24000). 26 | chunk (int): The size of the audio buffer (1024). 27 | audio (pyaudio.PyAudio): The PyAudio object. 28 | recording_stream (pyaudio.Stream): The stream for recording audio. 29 | recording_thread (threading.Thread): The thread for recording audio. 30 | recording (bool): Whether the audio is currently being recorded. 31 | streaming (bool): Whether the audio is currently being streamed. 32 | stream (pyaudio.Stream): The stream for streaming audio. 33 | playback_stream (pyaudio.Stream): The stream for playing audio. 34 | playback_buffer (queue.Queue): The buffer for playing audio. 35 | stop_playback (bool): Whether the audio playback should be stopped. 36 | """ 37 | def __init__(self): 38 | # Audio parameters 39 | self.format = pyaudio.paInt16 40 | self.channels = 1 41 | self.rate = 24000 42 | self.chunk = 1024 43 | 44 | self.audio = pyaudio.PyAudio() 45 | 46 | # Recording params 47 | self.recording_stream: Optional[pyaudio.Stream] = None 48 | self.recording_thread = None 49 | self.recording = False 50 | 51 | # streaming params 52 | self.streaming = False 53 | self.stream = None 54 | 55 | # Playback params 56 | self.playback_stream = None 57 | self.playback_buffer = queue.Queue(maxsize=20) 58 | self.playback_event = threading.Event() 59 | self.playback_thread = None 60 | self.stop_playback = False 61 | 62 | def start_recording(self) -> bytes: 63 | """Start recording audio from microphone and return bytes""" 64 | if self.recording: 65 | return b'' 66 | 67 | self.recording = True 68 | self.recording_stream = self.audio.open( 69 | format=self.format, 70 | channels=self.channels, 71 | rate=self.rate, 72 | input=True, 73 | frames_per_buffer=self.chunk 74 | ) 75 | 76 | print("\nRecording... 

        self.frames = []
        self.recording_thread = threading.Thread(target=self._record)
        self.recording_thread.start()

        return b''  # Return empty bytes, we'll send audio later

    def _record(self):
        while self.recording:
            try:
                data = self.recording_stream.read(self.chunk)
                self.frames.append(data)
            except Exception as e:
                print(f"Error recording: {e}")
                break

    def stop_recording(self) -> bytes:
        """Stop recording and return the recorded audio as bytes"""
        if not self.recording:
            return b''

        self.recording = False
        if self.recording_thread:
            self.recording_thread.join()

        # Clean up recording stream
        if self.recording_stream:
            self.recording_stream.stop_stream()
            self.recording_stream.close()
            self.recording_stream = None

        # Convert frames to WAV format in memory
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wf:
            wf.setnchannels(self.channels)
            wf.setsampwidth(self.audio.get_sample_size(self.format))
            wf.setframerate(self.rate)
            wf.writeframes(b''.join(self.frames))

        # Get the WAV data
        wav_buffer.seek(0)
        return wav_buffer.read()

    async def start_streaming(self, client: RealtimeClient):
        """Start continuous audio streaming."""
        if self.streaming:
            return

        self.streaming = True
        self.stream = self.audio.open(
            format=self.format,
            channels=self.channels,
            rate=self.rate,
            input=True,
            frames_per_buffer=self.chunk
        )

        print("\nStreaming audio... Press 'q' to stop.")

        while self.streaming:
            try:
                # Read raw PCM data
                data = self.stream.read(self.chunk, exception_on_overflow=False)
                # Stream directly without trying to decode
                await client.stream_audio(data)
            except Exception as e:
                print(f"Error streaming: {e}")
                break
            await asyncio.sleep(0.01)

    def stop_streaming(self):
        """Stop audio streaming."""
        self.streaming = False
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None

    def play_audio(self, audio_data: bytes):
        """Add audio data to the buffer"""
        try:
            self.playback_buffer.put_nowait(audio_data)
        except queue.Full:
            # If the buffer is full, remove the oldest chunk and add the new one
            self.playback_buffer.get_nowait()
            self.playback_buffer.put_nowait(audio_data)

        if not self.playback_thread or not self.playback_thread.is_alive():
            self.stop_playback = False
            self.playback_event.clear()
            self.playback_thread = threading.Thread(target=self._continuous_playback)
            self.playback_thread.start()

    def _continuous_playback(self):
        """Continuously play audio from the buffer"""
        self.playback_stream = self.audio.open(
            format=self.format,
            channels=self.channels,
            rate=self.rate,
            output=True,
            frames_per_buffer=self.chunk
        )

        while not self.stop_playback:
            try:
                audio_chunk = self.playback_buffer.get(timeout=0.1)
                self._play_audio_chunk(audio_chunk)
            except queue.Empty:
                continue

            if self.playback_event.is_set():
                break

        if self.playback_stream:
            self.playback_stream.stop_stream()
            self.playback_stream.close()
            self.playback_stream = None

    def _play_audio_chunk(self, audio_chunk):
        try:
            # Convert the audio chunk to the correct format
            audio_segment = AudioSegment(
                audio_chunk,
                sample_width=2,
                frame_rate=24000,
                channels=1
            )

            # Ensure the audio is in the correct format for playback
            audio_data = audio_segment.raw_data

            # Play the audio chunk in smaller portions to allow for quicker interruption
            chunk_size = 1024  # Adjust this value as needed
            for i in range(0, len(audio_data), chunk_size):
                if self.playback_event.is_set():
                    break
                chunk = audio_data[i:i+chunk_size]
                self.playback_stream.write(chunk)
        except Exception as e:
            print(f"Error playing audio chunk: {e}")

    def stop_playback_immediately(self):
        """Stop audio playback immediately."""
        self.stop_playback = True
        self.playback_buffer.queue.clear()  # Clear any pending audio
        self.currently_playing = False
        self.playback_event.set()

    def cleanup(self):
        """Clean up audio resources"""
        self.stop_playback_immediately()

        self.stop_playback = True
        if self.playback_thread:
            self.playback_thread.join()

        self.recording = False
        if self.recording_stream:
            self.recording_stream.stop_stream()
            self.recording_stream.close()

        if self.stream:
            self.stream.stop_stream()
            self.stream.close()

        self.audio.terminate()
--------------------------------------------------------------------------------
/openai_realtime_client/handlers/input_handler.py:
--------------------------------------------------------------------------------
import asyncio
from pynput import keyboard


class InputHandler:
    """
    Handles keyboard input for the chatbot.

    This class is responsible for capturing keyboard input and translating it into commands for the chatbot.

    Attributes:
        text_input (str): The current text input from the user.
        text_ready (asyncio.Event): An event that is set when the user has finished typing.
        command_queue (asyncio.Queue): A queue that stores commands for the chatbot.
        loop (asyncio.AbstractEventLoop): The event loop for the input handler.
    """
    def __init__(self):
        self.text_input = ""
        self.text_ready = asyncio.Event()
        self.command_queue = asyncio.Queue()
        self.loop = None

    def on_press(self, key):
        try:
            if key == keyboard.Key.space:
                self.loop.call_soon_threadsafe(
                    self.command_queue.put_nowait, ('space', None)
                )
            elif key == keyboard.Key.enter:
                self.loop.call_soon_threadsafe(
                    self.command_queue.put_nowait, ('enter', self.text_input)
                )
                self.text_input = ""
            elif key == keyboard.Key.backspace:
                # Backspace is a special key with no `char` attribute,
                # so it must be handled before the `hasattr` check below
                self.text_input = self.text_input[:-1]
            elif key == keyboard.KeyCode.from_char('r'):
                self.loop.call_soon_threadsafe(
                    self.command_queue.put_nowait, ('r', None)
                )
            elif key == keyboard.KeyCode.from_char('q'):
                self.loop.call_soon_threadsafe(
                    self.command_queue.put_nowait, ('q', None)
                )
            elif hasattr(key, 'char'):
                self.text_input += key.char
        except AttributeError:
            pass
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[tool.poetry]
name = "openai-realtime-client"
version = "0.1.0"
description = "A Python-based client for OpenAI's Realtime API"
authors = ["Logan Markewich "]
license = "MIT"
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.10"
llama-index-core = "^0.11.17"
pyaudio = "^0.2.14"
pynput = "^1.7.7"
pydub = "^0.25.1"
websockets = "^13.1"
wave = "^0.0.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
--------------------------------------------------------------------------------