├── .gitignore ├── ADA ├── __pycache__ │ ├── ADA_Local.cpython-312.pyc │ ├── ADA_Online.cpython-312.pyc │ └── ADA_Online_NoElevenlabs.cpython-312.pyc ├── WIDGETS │ ├── __pycache__ │ │ ├── timer.cpython-312.pyc │ │ ├── camera.cpython-312.pyc │ │ ├── project.cpython-312.pyc │ │ ├── system.cpython-312.pyc │ │ ├── to_do_list.cpython-312.pyc │ │ ├── open_camera.cpython-312.pyc │ │ ├── system_info.cpython-312.pyc │ │ └── create_project.cpython-312.pyc │ ├── camera.py │ ├── project.py │ ├── timer.py │ ├── to_do_list.py │ └── system.py ├── ADA_Local.py ├── ADA_Online_NoElevenlabs.py └── ADA_Online.py ├── main_local.py ├── test ├── basic_tts.py ├── tts_latency_test.py └── function_call_accuracy_test.py ├── main_online_noelevenlabs.py ├── main_online.py ├── multimodal_live_api.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /realtimesst.log 2 | /.env -------------------------------------------------------------------------------- /ADA/__pycache__/ADA_Local.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nlouis38/ada/HEAD/ADA/__pycache__/ADA_Local.cpython-312.pyc -------------------------------------------------------------------------------- /ADA/__pycache__/ADA_Online.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nlouis38/ada/HEAD/ADA/__pycache__/ADA_Online.cpython-312.pyc -------------------------------------------------------------------------------- /ADA/WIDGETS/__pycache__/timer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nlouis38/ada/HEAD/ADA/WIDGETS/__pycache__/timer.cpython-312.pyc -------------------------------------------------------------------------------- /ADA/WIDGETS/__pycache__/camera.cpython-312.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nlouis38/ada/HEAD/ADA/WIDGETS/__pycache__/camera.cpython-312.pyc -------------------------------------------------------------------------------- /ADA/WIDGETS/__pycache__/project.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nlouis38/ada/HEAD/ADA/WIDGETS/__pycache__/project.cpython-312.pyc -------------------------------------------------------------------------------- /ADA/WIDGETS/__pycache__/system.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nlouis38/ada/HEAD/ADA/WIDGETS/__pycache__/system.cpython-312.pyc -------------------------------------------------------------------------------- /ADA/WIDGETS/__pycache__/to_do_list.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nlouis38/ada/HEAD/ADA/WIDGETS/__pycache__/to_do_list.cpython-312.pyc -------------------------------------------------------------------------------- /ADA/WIDGETS/__pycache__/open_camera.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nlouis38/ada/HEAD/ADA/WIDGETS/__pycache__/open_camera.cpython-312.pyc -------------------------------------------------------------------------------- /ADA/WIDGETS/__pycache__/system_info.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nlouis38/ada/HEAD/ADA/WIDGETS/__pycache__/system_info.cpython-312.pyc -------------------------------------------------------------------------------- /ADA/WIDGETS/__pycache__/create_project.cpython-312.pyc: -------------------------------------------------------------------------------- 
import cv2

def open():
    """Open the default camera and display the live video feed.

    Blocks until the user presses 'q' in the preview window, then releases
    the device. Returns a short status string suitable for a tool-call
    response, or None if the camera could not be opened.
    """
    # BUG FIX: the original returned "Camera is open" on its first line,
    # which made the entire capture loop below unreachable dead code.
    global cap  # kept global so other widgets can access/release the feed
    cap = cv2.VideoCapture(0)

    if not cap.isOpened():
        print("Error: Could not open camera.")
        return

    while True:
        ret, frame = cap.read()
        if not ret:
            print("Error: Could not read frame.")
            break

        cv2.imshow('Camera Feed', frame)

        # Exit the preview loop when the user presses 'q'.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    # Release the device and close the window so the camera is reusable.
    cap.release()
    cv2.destroyAllWindows()
    return "Camera is open"
5 | ''' 6 | 7 | from ADA.ADA_Local import ADA 8 | import asyncio 9 | 10 | async def main(): 11 | ada = ADA() 12 | async with asyncio.TaskGroup() as tg: 13 | tg.create_task(ada.stt()) 14 | input_message = tg.create_task(ada.input_message()) 15 | tg.create_task(ada.send_prompt()) 16 | tg.create_task(ada.tts()) 17 | 18 | await input_message 19 | 20 | if __name__ == "__main__": 21 | asyncio.run(main()) -------------------------------------------------------------------------------- /test/basic_tts.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | from RealtimeTTS import TextToAudioStream, CoquiEngine 3 | 4 | def dummy_generator(): 5 | yield "Hey guys! These here are realtime spoken sentences based on local text synthesis. " 6 | yield "With a local, neuronal, cloned voice. So every spoken sentence sounds unique." 7 | 8 | # for normal use with minimal logging: 9 | engine = CoquiEngine() 10 | 11 | # test with extended logging: 12 | # import logging 13 | # logging.basicConfig(level=logging.INFO) 14 | # engine = CoquiEngine(level=logging.INFO) 15 | 16 | stream = TextToAudioStream(engine) 17 | 18 | print("Starting to play stream") 19 | stream.feed(dummy_generator()).play(log_synthesized_text=True, output_wavfile = "output.wav") 20 | 21 | print("Playout finished") 22 | 23 | engine.shutdown() -------------------------------------------------------------------------------- /main_online_noelevenlabs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Important: **Use headphones**. This script uses the system default audio 3 | input and output, which often won't include echo cancellation. So to prevent 4 | the model from interrupting itself it is important that you use headphones. 5 | 6 | Before running this script, ensure the `GOOGLE_API_KEY` environment 7 | variable is set to the api-key you obtained from Google AI Studio. 
import os

def create_folder(folder_name, chat_history_file):
    """
    Create a project folder containing a text file for chat history.

    Args:
        folder_name (str): The name of the project folder to create.
        chat_history_file (str): File name for the chat-history text file
            created inside the new folder.

    Returns:
        str: A human-readable status message describing the outcome
            (created, already exists, or an OS error description).
    """
    try:
        # NOTE(review): exists-then-create is racy (TOCTOU) but matches the
        # original single-user behavior; concurrent callers just get the
        # OSError message below.
        if os.path.exists(folder_name):
            return f"Project folder '{folder_name}' already exists."

        os.makedirs(folder_name)
        # Seed the chat-history file inside the new project folder.
        file_path = os.path.join(folder_name, chat_history_file)
        with open(file_path, 'w') as f:
            f.write("Chat history will be stored here.\n")
        return f"Project folder '{folder_name}' created successfully."

    except OSError as e:
        return f"Error creating project folder or file: {e}"

if __name__ == "__main__":
    create_folder(folder_name="", chat_history_file="")
def create_list():
    """
    Create a new, empty to-do list.

    Returns:
        list: An empty to-do list.
    """
    return []

def add_task(todo_list, task):
    """
    Append a task to the to-do list and print a confirmation.

    Args:
        todo_list (list): The to-do list, modified in place.
        task (str): The task to add.
    """
    todo_list.append(task)
    print(f"Task '{task}' added to the to-do list.")

def delete_task(todo_list, task):
    """
    Remove a task from the to-do list if present, reporting the outcome.

    Args:
        todo_list (list): The to-do list, modified in place.
        task (str): The task to delete.
    """
    # Guard clause: nothing to remove, just report and bail out.
    if task not in todo_list:
        print(f"Task '{task}' not found in the to-do list.")
        return
    todo_list.remove(task)
    print(f"Task '{task}' removed from the to-do list.")
{task}") 48 | 49 | if __name__ == "__main__": 50 | my_todo_list = create_list() 51 | add_task(my_todo_list, "Grocery Shopping") 52 | add_task(my_todo_list, "Pay Bills") 53 | add_task(my_todo_list, "Walk the Dog") 54 | display_todo_list(my_todo_list) 55 | delete_task(my_todo_list, "Pay Bills") 56 | display_todo_list(my_todo_list) 57 | -------------------------------------------------------------------------------- /ADA/WIDGETS/system.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import psutil 3 | import GPUtil 4 | 5 | def info(): 6 | """ 7 | Gathers and prints system information including CPU, RAM, and GPU details. 8 | """ 9 | 10 | print("="*40, "System Information", "="*40) 11 | uname = platform.uname() 12 | print(f"System: {uname.system}") 13 | print(f"Node Name: {uname.node}") 14 | print(f"Release: {uname.release}") 15 | print(f"Version: {uname.version}") 16 | print(f"Machine: {uname.machine}") 17 | print(f"Processor: {uname.processor}") 18 | 19 | # CPU information 20 | print("="*40, "CPU Info", "="*40) 21 | print("Physical cores:", psutil.cpu_count(logical=False)) 22 | print("Total cores:", psutil.cpu_count(logical=True)) 23 | cpufreq = psutil.cpu_freq() 24 | print(f"Max Frequency: {cpufreq.max:.2f}Mhz") 25 | print(f"Min Frequency: {cpufreq.min:.2f}Mhz") 26 | print(f"Current Frequency: {cpufreq.current:.2f}Mhz") 27 | print("CPU Usage Per Core:") 28 | for i, percentage in enumerate(psutil.cpu_percent(percpu=True, interval=1)): 29 | print(f"Core {i}: {percentage}%") 30 | print(f"Total CPU Usage: {psutil.cpu_percent()}%") 31 | 32 | # Memory Information 33 | print("="*40, "Memory Information", "="*40) 34 | svmem = psutil.virtual_memory() 35 | print(f"Total: {svmem.total / (1024.0 ** 3):.2f} GB") 36 | print(f"Available: {svmem.available / (1024.0 ** 3):.2f} GB") 37 | print(f"Used: {svmem.used / (1024.0 ** 3):.2f} GB") 38 | print(f"Percentage: {svmem.percent}%") 39 | 40 | # GPU information 41 | 
print("="*40, "GPU Info", "="*40) 42 | try: 43 | gpus = GPUtil.getGPUs() 44 | for gpu in gpus: 45 | print(f"GPU ID: {gpu.id}") 46 | print(f" GPU Name: {gpu.name}") 47 | print(f" GPU Load: {gpu.load*100:.2f}%") 48 | print(f" GPU Memory Total: {gpu.memoryTotal:.2f}MB") 49 | print(f" GPU Memory Used: {gpu.memoryUsed:.2f}MB") 50 | print(f" GPU Memory Free: {gpu.memoryFree:.2f}MB") 51 | print(f" GPU Temperature: {gpu.temperature:.2f} °C") 52 | except Exception as e: 53 | return(f"Error getting GPU information: {e}") 54 | 55 | return("system information has been returned") 56 | 57 | 58 | if __name__ == "__main__": 59 | info() 60 | -------------------------------------------------------------------------------- /test/tts_latency_test.py: -------------------------------------------------------------------------------- 1 | from RealtimeTTS import ( 2 | TextToAudioStream, 3 | SystemEngine, 4 | AzureEngine, 5 | ElevenlabsEngine, 6 | CoquiEngine, 7 | OpenAIEngine, 8 | ) 9 | from PyQt6.QtWidgets import ( 10 | QApplication, 11 | QMainWindow, 12 | QVBoxLayout, 13 | QWidget, 14 | QComboBox, 15 | QTextEdit, 16 | QLabel, 17 | ) 18 | from PyQt6.QtCore import pyqtSlot 19 | import time 20 | import sys 21 | import os 22 | from dotenv import load_dotenv # Added for API key loading 23 | 24 | # --- Load Environment Variables --- 25 | load_dotenv() 26 | 27 | ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY") 28 | 29 | class TTSApp(QMainWindow): 30 | def __init__(self): 31 | super().__init__() 32 | 33 | # Initialize TTS engines 34 | print("Initializing TTS Engines...") 35 | self.engine_system = SystemEngine() 36 | #self.engine_azure = AzureEngine(os.environ.get("AZURE_SPEECH_KEY"), os.environ.get("AZURE_SPEECH_REGION")) 37 | self.engine_elevenlabs = ElevenlabsEngine(ELEVENLABS_API_KEY) 38 | self.engine_coqui = CoquiEngine() 39 | #self.engine_openai = OpenAIEngine() 40 | print("TTS Engines initialized.") 41 | 42 | # Add a dictionary to map engine names to engine instances 43 | self.engines 
= { 44 | "System Engine": self.engine_system, 45 | #"OpenAI Engine": self.engine_openai, 46 | "Elevenlabs Engine": self.engine_elevenlabs, 47 | "Coqui Engine": self.engine_coqui, 48 | #"Azure Engine": self.engine_azure, 49 | } 50 | 51 | # Initialize TTS Stream 52 | self.stream = TextToAudioStream( 53 | self.engine_system, on_audio_stream_start=self.on_audio_stream_start 54 | ) 55 | 56 | # Main widget and layout 57 | self.main_widget = QWidget(self) 58 | self.setCentralWidget(self.main_widget) 59 | self.layout = QVBoxLayout(self.main_widget) 60 | 61 | # Dropdown for TTS Engine Selection 62 | self.tts_engine_dropdown = QComboBox(self) 63 | for engine_name in self.engines.keys(): 64 | self.tts_engine_dropdown.addItem(engine_name) 65 | self.tts_engine_dropdown.currentTextChanged.connect(self.tts_engine_changed) 66 | self.layout.addWidget(self.tts_engine_dropdown) 67 | 68 | # Big Input Text Control 69 | self.text_input = QTextEdit(self) 70 | self.text_input.textChanged.connect(self.text_pasted) 71 | self.layout.addWidget(self.text_input) 72 | 73 | # Label for Latency Display 74 | self.latency_label = QLabel("Latency: N/A", self) 75 | self.layout.addWidget(self.latency_label) 76 | 77 | self.setWindowTitle("TTS Synthesis Speed Test") 78 | 79 | @pyqtSlot() 80 | def tts_engine_changed(self): 81 | selected_engine_name = self.tts_engine_dropdown.currentText() 82 | selected_engine = self.engines[selected_engine_name] 83 | self.stream.load_engine(selected_engine) 84 | print(f"TTS Engine selected: {selected_engine_name}") 85 | 86 | @pyqtSlot() 87 | def text_pasted(self): 88 | pasted_text = self.text_input.toPlainText() 89 | print(f"Text pasted: {pasted_text}") 90 | 91 | self.time_pasted = time.time() 92 | self.stream.feed(pasted_text) 93 | self.stream.play_async() 94 | 95 | def on_audio_stream_start(self): 96 | self.time_started = time.time() 97 | latency = self.time_started - self.time_pasted 98 | self.latency_label.setText("Latency: {:.2f} seconds".format(latency)) 99 | 100 | 
# extract the tool call from the response
def extract_tool_call(text, function_name):
    """
    Check a model response for a ```tool_code``` block.

    Args:
        text (str): The raw model response.
        function_name (str | None): Function name expected inside the block,
            or None when no tool call should be emitted.

    Returns:
        bool | None: True if the block contains ``function_name``; False if
        a block exists but the expected name is missing (or none was
        expected); None when the response contains no tool_code block.
    """
    # NOTE: removed unused `import io` / `redirect_stdout` — they were never
    # referenced in the original body.
    pattern = r"```tool_code\s*(.*?)\s*```"
    match = re.search(pattern, text, re.DOTALL)
    if not match:
        return None  # no tool call emitted at all
    code = match.group(1).strip()
    if function_name is None:
        return False  # No function name expected, but code was found
    return function_name in code
The generated code should be readable and efficient. The response to a method will be wrapped in ```tool_output``` use it generate a helpful, friendly response. For example if the tool output says ```tool_output camera on```. You should say something like "The Camera is on". 35 | 36 | For regular prompts do not call any functions or wrap the response in ```tool_code```. 37 | 38 | The following Python methods are available: 39 | 40 | ```python 41 | def camera.open() -> None: 42 | """Open the camera""" 43 | 44 | def system.info() -> None: 45 | """ Gathers and prints system information including CPU, RAM, and GPU details. """ 46 | 47 | def timer.set(time_str): 48 | """ 49 | Counts down from a specified time in HH:MM:SS format. 50 | 51 | Args: 52 | time_str (str): The time to count down from in HH:MM:SS format. 53 | """ 54 | def project.create_folder(folder_name): 55 | """ 56 | Creates a project folder and a text file to store chat history. 57 | 58 | Args: 59 | folder_name (str): The name of the project folder to create. 
60 | """ 61 | 62 | ``` 63 | 64 | User: {user_message} 65 | ''' 66 | 67 | def test(prompt, should_call_function, function_name): 68 | global success_count 69 | global failure_count 70 | messages = [{"role": "system", "content": system_instructions}, {"role": "user", "content": instruction_prompt_with_function_calling.format(user_message=prompt)}] 71 | response = ollama.chat(model="gemma3:4b-it-q4_K_M", messages=messages) 72 | #print(response['message']['content']) 73 | returned_value = extract_tool_call(response['message']['content'], function_name) 74 | 75 | if should_call_function == False: 76 | if returned_value == None: 77 | result = "Passed" 78 | else: 79 | result = "Failed" 80 | else: 81 | if returned_value == True: 82 | result = "Passed" 83 | else: 84 | result = "Failed" 85 | print(result) 86 | if result == "Passed": 87 | success_count += 1 88 | else: 89 | failure_count += 1 90 | 91 | data = (prompt, result, response['message']['content']) 92 | response_array.append(data) 93 | 94 | prompts_and_expectations = [ 95 | ("Hello, how are you?", False, None), # Should NOT call a function 96 | ("set 10 second timer", True, "timer.set"), # Should call timer.set() 97 | ("Difference between DC and AC", False, None), # Should NOT call a function 98 | ("Show me System Info", True, "system.info"), # Should call system.info() 99 | ("Briefly explain gravity", False, None), # Should NOT call a function 100 | ("can you open the camera", True, "camera.open"), # Should call camera.open() 101 | ("Give me a short explanation of the internet", False, None), # Should NOT call a function 102 | ("set me a timer for 1 minute", True, "timer.set"), # Should call timer.set() 103 | ("What is the chemical symbol for water?", False, None), # Should NOT call a function 104 | ("open the camera", True, "camera.open"), # Should call camera.open() 105 | ("What is a synonym for happy?", False, None), # Should NOT call a function 106 | ("set me 33 second timer", True, "timer.set"), # Should call 
timer.set() 107 | ("What is the largest planet in our solar system?", False, None), # Should NOT call a function 108 | ("open camera", True, "camera.open"), # Should call camera.open() 109 | ("How many continents are there?", False, None), # Should NOT call a function 110 | ("Start a 10 hour timer", True, "timer.set"), # Should call timer.set() 111 | ("What is the opposite of up?", False, None), # Should NOT call a function 112 | ("Turn on the Camera", True, "camera.open"), # Should call camera.open() 113 | ("What is the speed of light in a vacuum?", False, None), # Should NOT call a function 114 | ("Timer for 10 minutes and 10 seconds", True, "timer.set"), # Should call timer.set() 115 | ("Who painted the Mona Lisa?", False, None), # Should NOT call a function 116 | ("Start the Camera", True, "camera.open"), # Should call camera.open() 117 | ("Thank you very much.", False, None), # Should NOT call a function 118 | ("Create new web shooter project", True, "project.create_folder"), # Should call project.create_folder() 119 | ("Please and thank you.", False, None), # Should NOT call a function 120 | ("Give me system info", True, "system.info"), # Should call system.info() 121 | ("No, thank you.", False, None), # Should NOT call a function 122 | ("Create new project called Iron Man", True, "project.create_folder"), # Should call project.create_folder() 123 | ("Where do Lions live", False, None), # Should NOT call a function 124 | ("Show me GPU information", True, "system.info"), # Should call system.info() 125 | ("What ocean is larger the atlantic or pacific", False, None), #Should NOT call a function 126 | ("Make a new project folder name robot arm", True, "project.create_folder"), #Should call project.create_folder() 127 | ("What is the largest country in the world", False, None), #Should NOT call a function 128 | ("How much RAM am I using", True, "system.info"), #Should call system.info() 129 | ("Briefly explain AI", False, None), #Should NOT call a function 130 | 
("Start a new project called robot car", True, "project.create_folder"), #Should call project.create_folder() 131 | ("Give me CPU Info", True, "system.info"), #Should call system.info() 132 | ("What is a brushless motor?", False, None), #Should NOT call a function 133 | ("Make me a new project folder called AI assistant", True, "project.create_folder"), #Should call project.create_folder() 134 | ("Goodnight!", False, None), #Should NOT call a function 135 | ] 136 | start_time = time.time() 137 | 138 | for prompt, should_call_function, function_name in prompts_and_expectations: 139 | test(prompt, should_call_function, function_name) 140 | 141 | end_time = time.time() 142 | execution_time = end_time - start_time 143 | 144 | print(f"Execution time: {execution_time} seconds") 145 | print(f"Success rate: {success_count / (success_count + failure_count) * 100}%") 146 | 147 | log_filename = "response_log.json" 148 | with open(log_filename, "w") as f: 149 | json.dump( 150 | [{"prompt": item[0], "result": item[1], "model_response": item[2]} for item in response_array], f, indent=4 151 | ) 152 | 153 | print(f"\nLog file '{log_filename}' created successfully.") -------------------------------------------------------------------------------- /multimodal_live_api.py: -------------------------------------------------------------------------------- 1 | """ 2 | ## Setup 3 | 4 | To install the dependencies for this script, run: 5 | 6 | ``` 7 | pip install google-genai opencv-python pyaudio pillow mss 8 | ``` 9 | 10 | Before running this script, ensure the `GOOGLE_API_KEY` environment 11 | variable is set to the api-key you obtained from Google AI Studio. 12 | 13 | Important: **Use headphones**. This script uses the system default audio 14 | input and output, which often won't include echo cancellation. So to prevent 15 | the model from interrupting itself it is important that you use headphones. 
16 | 17 | ## Run 18 | 19 | To run the script: 20 | 21 | ``` 22 | python Get_started_LiveAPI.py 23 | ``` 24 | 25 | The script takes a video-mode flag `--mode`, this can be "camera", "screen", or "none". 26 | The default is "camera". To share your screen run: 27 | 28 | ``` 29 | python Get_started_LiveAPI.py --mode screen 30 | ``` 31 | """ 32 | 33 | import asyncio 34 | import base64 35 | import io 36 | import os 37 | import sys 38 | import traceback 39 | 40 | import cv2 41 | import pyaudio 42 | import PIL.Image 43 | import mss 44 | 45 | import argparse 46 | 47 | from google import genai 48 | from dotenv import load_dotenv # Added for API key loading 49 | 50 | # --- Load Environment Variables --- 51 | load_dotenv() 52 | 53 | if sys.version_info < (3, 11, 0): 54 | import taskgroup, exceptiongroup 55 | 56 | asyncio.TaskGroup = taskgroup.TaskGroup 57 | asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup 58 | 59 | FORMAT = pyaudio.paInt16 60 | CHANNELS = 1 61 | SEND_SAMPLE_RATE = 16000 62 | RECEIVE_SAMPLE_RATE = 24000 63 | CHUNK_SIZE = 1024 64 | 65 | MODEL = "models/gemini-2.0-flash-live-001" 66 | 67 | DEFAULT_MODE = "camera" 68 | 69 | GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") 70 | 71 | client = genai.Client(api_key=GOOGLE_API_KEY,http_options={"api_version": "v1beta"}) 72 | 73 | CONFIG = {"response_modalities": ["AUDIO"]} 74 | 75 | pya = pyaudio.PyAudio() 76 | 77 | 78 | class AudioLoop: 79 | def __init__(self, video_mode=DEFAULT_MODE): 80 | self.video_mode = video_mode 81 | 82 | self.audio_in_queue = None 83 | self.out_queue = None 84 | 85 | self.session = None 86 | 87 | self.send_text_task = None 88 | self.receive_audio_task = None 89 | self.play_audio_task = None 90 | 91 | async def send_text(self): 92 | while True: 93 | text = await asyncio.to_thread( 94 | input, 95 | "message > ", 96 | ) 97 | if text.lower() == "q": 98 | break 99 | await self.session.send(input=text or ".", end_of_turn=True) 100 | 101 | def _get_frame(self, cap): 102 | # Read the frameq 103 | 
ret, frame = cap.read() 104 | # Check if the frame was read successfully 105 | if not ret: 106 | return None 107 | # Fix: Convert BGR to RGB color space 108 | # OpenCV captures in BGR but PIL expects RGB format 109 | # This prevents the blue tint in the video feed 110 | frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 111 | img = PIL.Image.fromarray(frame_rgb) # Now using RGB frame 112 | img.thumbnail([1024, 1024]) 113 | 114 | image_io = io.BytesIO() 115 | img.save(image_io, format="jpeg") 116 | image_io.seek(0) 117 | 118 | mime_type = "image/jpeg" 119 | image_bytes = image_io.read() 120 | return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()} 121 | 122 | async def get_frames(self): 123 | # This takes about a second, and will block the whole program 124 | # causing the audio pipeline to overflow if you don't to_thread it. 125 | cap = await asyncio.to_thread( 126 | cv2.VideoCapture, 0 127 | ) # 0 represents the default camera 128 | 129 | while True: 130 | frame = await asyncio.to_thread(self._get_frame, cap) 131 | if frame is None: 132 | break 133 | 134 | await asyncio.sleep(1.0) 135 | 136 | await self.out_queue.put(frame) 137 | 138 | # Release the VideoCapture object 139 | cap.release() 140 | 141 | def _get_screen(self): 142 | sct = mss.mss() 143 | monitor = sct.monitors[0] 144 | 145 | i = sct.grab(monitor) 146 | 147 | mime_type = "image/jpeg" 148 | image_bytes = mss.tools.to_png(i.rgb, i.size) 149 | img = PIL.Image.open(io.BytesIO(image_bytes)) 150 | 151 | image_io = io.BytesIO() 152 | img.save(image_io, format="jpeg") 153 | image_io.seek(0) 154 | 155 | image_bytes = image_io.read() 156 | return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()} 157 | 158 | async def get_screen(self): 159 | 160 | while True: 161 | frame = await asyncio.to_thread(self._get_screen) 162 | if frame is None: 163 | break 164 | 165 | await asyncio.sleep(1.0) 166 | 167 | await self.out_queue.put(frame) 168 | 169 | async def 
    async def listen_audio(self):
        """Continuously read microphone PCM chunks into the realtime out_queue.

        Opens the system default input device at SEND_SAMPLE_RATE and streams
        raw 16-bit PCM. Runs until the surrounding TaskGroup is cancelled.
        """
        mic_info = pya.get_default_input_device_info()
        # pya.open blocks, so run it in a worker thread to keep the event
        # loop responsive.
        self.audio_stream = await asyncio.to_thread(
            pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=SEND_SAMPLE_RATE,
            input=True,
            input_device_index=mic_info["index"],
            frames_per_buffer=CHUNK_SIZE,
        )
        # In debug (non -O) runs, tolerate input overflows instead of raising
        # so a slow consumer doesn't kill the pipeline.
        if __debug__:
            kwargs = {"exception_on_overflow": False}
        else:
            kwargs = {}
        while True:
            # Each blocking read is offloaded to a thread; chunks are queued
            # as raw PCM parts for send_realtime() to forward.
            data = await asyncio.to_thread(self.audio_stream.read, CHUNK_SIZE, **kwargs)
            await self.out_queue.put({"data": data, "mime_type": "audio/pcm"})
208 | while not self.audio_in_queue.empty(): 209 | self.audio_in_queue.get_nowait() 210 | 211 | async def play_audio(self): 212 | stream = await asyncio.to_thread( 213 | pya.open, 214 | format=FORMAT, 215 | channels=CHANNELS, 216 | rate=RECEIVE_SAMPLE_RATE, 217 | output=True, 218 | ) 219 | while True: 220 | bytestream = await self.audio_in_queue.get() 221 | await asyncio.to_thread(stream.write, bytestream) 222 | 223 | async def run(self): 224 | try: 225 | async with ( 226 | client.aio.live.connect(model=MODEL, config=CONFIG) as session, 227 | asyncio.TaskGroup() as tg, 228 | ): 229 | self.session = session 230 | 231 | self.audio_in_queue = asyncio.Queue() 232 | self.out_queue = asyncio.Queue(maxsize=5) 233 | 234 | send_text_task = tg.create_task(self.send_text()) 235 | tg.create_task(self.send_realtime()) 236 | tg.create_task(self.listen_audio()) 237 | if self.video_mode == "camera": 238 | tg.create_task(self.get_frames()) 239 | elif self.video_mode == "screen": 240 | tg.create_task(self.get_screen()) 241 | 242 | tg.create_task(self.receive_audio()) 243 | tg.create_task(self.play_audio()) 244 | 245 | await send_text_task 246 | raise asyncio.CancelledError("User requested exit") 247 | 248 | except asyncio.CancelledError: 249 | pass 250 | except ExceptionGroup as EG: 251 | self.audio_stream.close() 252 | traceback.print_exception(EG) 253 | 254 | 255 | if __name__ == "__main__": 256 | parser = argparse.ArgumentParser() 257 | parser.add_argument( 258 | "--mode", 259 | type=str, 260 | default=DEFAULT_MODE, 261 | help="pixels to stream from", 262 | choices=["camera", "screen", "none"], 263 | ) 264 | args = parser.parse_args() 265 | main = AudioLoop(video_mode=args.mode) 266 | asyncio.run(main.run()) -------------------------------------------------------------------------------- /ADA/ADA_Local.py: -------------------------------------------------------------------------------- 1 | import ollama 2 | import asyncio 3 | import pyaudio 4 | from RealtimeSTT import 
AudioToTextRecorder 5 | from RealtimeTTS import TextToAudioStream, SystemEngine, CoquiEngine 6 | import torch # Import the torch library 7 | import re 8 | import time 9 | import os 10 | from .WIDGETS import system, timer, project, camera 11 | 12 | ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY") 13 | VOICE_ID = 'pFZP5JQG7iQjIQuC4Bku' 14 | 15 | FORMAT = pyaudio.paInt16 16 | CHANNELS = 1 17 | SEND_SAMPLE_RATE = 16000 18 | RECEIVE_SAMPLE_RATE = 24000 19 | CHUNK_SIZE = 1024 20 | 21 | class ADA: 22 | def __init__(self): 23 | print("initializing...") 24 | 25 | # Check for CUDA availability 26 | if torch.cuda.is_available(): 27 | self.device = "cuda" 28 | print("CUDA is available. Using GPU.") 29 | else: 30 | self.device = "cpu" 31 | print("CUDA is not available. Using CPU.") 32 | 33 | self.model = "gemma3:4b-it-q4_K_M" #This is the smallest version of gemma3 for consistent function calling use gemma3:4b-it-q4_K_M or higher if your computer is not strong enough use ada_online 34 | self.system_behavior = """ 35 | Your name is ADA (Advanced Design Assistant) you are a helpful AI assistant. You are an expert in All STEM Fields providing concise and accurate information. When asked to perform a task, respond with the code to perform that task wrapped in ```tool_code```. If the task does not require a function call, provide a direct answer without using ```tool_code```. Always respond in a helpful and informative manner." 36 | 37 | You speak with a british accent and address people as Sir. 38 | """ 39 | 40 | self.instruction_prompt_with_function_calling = ''' 41 | At each turn, if you decide to invoke any of the function(s), it should be wrapped with ```tool_code```. If you decide to call a function the response should only have the function wrapped in tool code nothing more. The python methods described below are imported and available, you can only use defined methods also only call methods when you are sure they need to be called. 
The generated code should be readable and efficient. 42 | 43 | The response to a method will be wrapped in ```tool_output``` use the response to give the user an answer based on the information provided that is wrapped in ```tool_ouput```. 44 | 45 | For regular prompts do not call any functions or wrap the response in ```tool_code```. 46 | 47 | The following Python methods are available: 48 | 49 | ```python 50 | def camera.open() -> None: 51 | """Open the camera""" 52 | 53 | def system.info() -> None: 54 | """ Gathers and prints system information including CPU, RAM, and GPU details. Only call when user ask about computer information. """ 55 | 56 | def timer.set(time_str): 57 | """ 58 | Counts down from a specified time in HH:MM:SS format. 59 | 60 | Args: 61 | time_str (str): The time to count down from in HH:MM:SS format. 62 | """ 63 | def project.create_folder(folder_name): 64 | """ 65 | Creates a project folder and a text file to store chat history. 66 | 67 | Args: 68 | folder_name (str): The name of the project folder to create. 
69 | """ 70 | ``` 71 | 72 | User: {user_message} 73 | ''' 74 | 75 | self.model_params = { 76 | 'temperature': 0.1, 77 | 'top_p': 0.9, 78 | } 79 | self.conversation_history = [] 80 | 81 | self.input_queue = asyncio.Queue() 82 | self.response_queue = asyncio.Queue() 83 | self.audio_queue = asyncio.Queue() 84 | self.recorder_config = { 85 | 'model': 'large-v3', 86 | 'spinner': False, 87 | 'language': 'en', 88 | 'silero_sensitivity': 0.01, 89 | 'webrtc_sensitivity': 3, 90 | 'post_speech_silence_duration': 0.1, 91 | 'min_length_of_recording': 0.2, 92 | 'min_gap_between_recordings': 0, 93 | 94 | #'realtime_model_type': 'tiny.en', 95 | #'enable_realtime_transcription': True, 96 | #'on_realtime_transcription_update': self.clear_queues, 97 | } 98 | 99 | try: 100 | self.recorder = AudioToTextRecorder(**self.recorder_config) 101 | except Exception as e: 102 | print(f"Error initializing AudioToTextRecorder: {e}") 103 | self.recorder = None # Or handle this appropriately 104 | 105 | try: 106 | self.pya = pyaudio.PyAudio() 107 | except Exception as e: 108 | print(f"Error initializing PyAudio: {e}") 109 | self.pya = None 110 | 111 | self.response_start_time = None 112 | self.audio_start_time = None 113 | #self.engine = CoquiEngine() 114 | self.engine = SystemEngine() 115 | self.stream = TextToAudioStream(self.engine) 116 | self.first_audio_byte_time = None 117 | self.speech_to_text_time = None 118 | 119 | async def clear_queues(self, text=""): 120 | """Clears all data from the input, response, and audio queues.""" 121 | queues = [self.input_queue, self.response_queue, self.audio_queue] 122 | for q in queues: 123 | while not q.empty(): 124 | try: 125 | q.get_nowait() 126 | except asyncio.QueueEmpty: 127 | break # Queue is empty 128 | 129 | async def input_message(self): 130 | while True: 131 | try: 132 | prompt = await asyncio.to_thread(input, "Enter your message: ") 133 | if prompt.lower() == "exit": 134 | await self.input_queue.put(None) # Signal to exit 135 | break 136 | await 
self.clear_queues() 137 | self.prompt_start_time = time.time() 138 | await self.input_queue.put(prompt) 139 | except Exception as e: 140 | print(f"Error in input_message: {e}") 141 | continue # Continue the loop even if there's an error 142 | 143 | async def send_prompt(self): 144 | while True: 145 | try: 146 | prompt = await self.input_queue.get() 147 | if prompt is None: 148 | break # Exit loop if None is received 149 | 150 | self.response_start_time = time.time() #start timer when prompt is sent 151 | 152 | messages = [{"role": "system", "content": self.system_behavior}] + self.conversation_history + [{"role": "user", "content": self.instruction_prompt_with_function_calling.format(user_message=prompt)}] 153 | try: 154 | response = ollama.chat(model=self.model, messages=messages, stream=True) 155 | full_response = "" 156 | in_function_call = False 157 | function_call = "" 158 | 159 | for chunk in response: 160 | chunk_content = chunk['message']['content'] 161 | if chunk_content == "```": 162 | if in_function_call == True: 163 | in_function_call = False 164 | function_call += "```" 165 | tool_output = self.extract_tool_call(function_call) 166 | 167 | messages = [{"role": "system", "content": self.system_behavior}] + self.conversation_history + [{"role": "user", "content": self.instruction_prompt_with_function_calling.format(user_message=tool_output)}] 168 | 169 | response = ollama.chat(model=self.model, messages=messages, stream=True) 170 | for chunk in response: 171 | chunk_content = chunk['message']['content'] 172 | print(chunk_content, end="", flush=True) 173 | await self.response_queue.put(chunk_content) 174 | print() 175 | continue 176 | else: 177 | in_function_call = True 178 | 179 | if in_function_call == False: 180 | await self.response_queue.put(chunk_content) 181 | await asyncio.sleep(0) 182 | else: 183 | function_call += chunk_content 184 | if chunk_content: 185 | print(chunk_content, end="", flush=True) #print chunks on same line 186 | full_response += 
chunk_content 187 | print() # new line 188 | self.conversation_history.append({"role": "user", "content": prompt}) 189 | self.conversation_history.append({"role": "assistant", "content": full_response}) 190 | 191 | except Exception as e: 192 | print(f"An error occurred in send_prompt: {e}") 193 | except asyncio.CancelledError: 194 | break 195 | except Exception as e: 196 | print(f"Unexpected error in send_prompt: {e}") 197 | 198 | finally: # Ensure the sentinel value is added even if an error occurs 199 | await self.response_queue.put(None) 200 | 201 | def extract_tool_call(self, text): 202 | import io 203 | from contextlib import redirect_stdout 204 | 205 | pattern = r"```tool_code\s*(.*?)\s*```" 206 | match = re.search(pattern, text, re.DOTALL) 207 | if match: 208 | code = match.group(1).strip() 209 | # Capture stdout in a string buffer 210 | f = io.StringIO() 211 | with redirect_stdout(f): 212 | result = eval(code) 213 | output = f.getvalue() 214 | r = result if output == '' else output 215 | return f'```tool_output\n{str(r).strip()}\n```''' 216 | return None 217 | 218 | async def tts(self): 219 | while True: 220 | chunk = await self.response_queue.get() 221 | if chunk == None: 222 | continue 223 | if self.first_audio_byte_time is None: 224 | self.first_audio_byte_time = time.time() 225 | time_to_first_audio = self.first_audio_byte_time - self.prompt_start_time 226 | print(f"Time from prompt to first audio byte: {time_to_first_audio:.4f} seconds") 227 | self.stream.feed(chunk) 228 | self.stream.play_async() 229 | 230 | async def stt(self): 231 | if self.recorder is None: 232 | print("Audio recorder is not initialized.") 233 | return 234 | 235 | while True: 236 | try: 237 | text = await asyncio.to_thread(self.recorder.text) 238 | await self.clear_queues() 239 | await self.input_queue.put(text) 240 | print(text) 241 | except Exception as e: 242 | print(f"Error in listen: {e}") 243 | continue # Continue the loop even if there's an error 244 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ADA (Advanced Design Assistant) 2 | 3 | ADA is a helpful AI assistant specializing in STEM fields, designed to provide concise and accurate information and assist with various tasks through voice or text interaction. ADA comes in two versions: a local version (`ada_local`) that runs primarily on your machine and an online version (`ada_online`) that utilizes cloud-based services. A separate multimodal live demo (`multimodal_live_api.py`) is also included, showcasing real-time audio and video interaction. 4 | 5 | **Recommendation:** While both versions are available, the **`ada_online` version is heavily recommended**. It leverages powerful cloud-based models (Google Gemini) and services (ElevenLabs TTS) that generally offer faster, higher-quality, and more reliable responses compared to the local version, which is dependent on your hardware capabilities. The online models have also been developed and refined for a longer period. 6 | 7 | ## Features 8 | 9 | - **Dual Versions:** Choose between running ADA locally (`ada_local`) or using online services (`ada_online`). 10 | - **Real-time Interaction:** Communicate with ADA using voice (Speech-to-Text) and receive spoken responses (Text-to-Speech). 11 | - **Function Calling & Grounding:** ADA can perform specific tasks by calling available functions (widgets) and use tools like Google Search to access current information. 
12 | - Accessing system information (`system.info`) 13 | - Setting timers (`timer.set`) 14 | - Creating project folders (`project.create_folder`) 15 | - Opening the camera (`camera.open`) 16 | - Managing a To-Do list (`to_do_list.py` - _Note: Not currently integrated as a callable tool in provided main scripts_) 17 | - Getting weather (`get_weather`) 18 | - Calculating travel duration (`get_travel_duration`) 19 | - **STEM Expertise:** Designed to assist with engineering, math, and science queries. 20 | - **Conversational:** Engages in natural language conversation. 21 | - **Multimodal Demo:** Includes a script (`multimodal_live_api.py`) for live interaction combining audio and video (camera/screen). 22 | 23 | ## Setup 24 | 25 | ### Prerequisites 26 | 27 | - **Python:** Ensure you have Python installed (code uses features compatible with Python 3.11+). 28 | - **Ollama (for `ada_local`)**: You need Ollama installed and running to serve the local LLM. Make sure you have downloaded the model specified in `ADA/ADA_Local.py` (e.g., `gemma3:4b-it-q4_K_M`). Performance heavily depends on your hardware. 29 | - **CUDA (Optional, for `ada_local` & potentially local STT/TTS models)**: For better performance with local models, a CUDA-compatible GPU and the necessary drivers are recommended. ADA's local components attempt to automatically detect and use the GPU if available via PyTorch. 30 | - **Microphone and Speakers:** Required for voice interaction (STT/TTS). **Headphones are strongly recommended** to prevent echo and self-interruption. 31 | - **API Keys (for `ada_online` & `multimodal_live_api.py`)**: See the API Key Setup section below. 32 | - **FFmpeg (Optional, Recommended)**: The `RealtimeSTT` or `RealtimeTTS` libraries (or their dependencies) might rely on FFmpeg for audio processing. If you encounter audio errors (like `torchaudio` warnings in logs), installing FFmpeg and ensuring it's in your system's PATH is recommended. 
33 | - **System Dependencies (e.g., `portaudio`)**: Libraries like `PyAudio` might require system-level libraries (like `portaudio` on Linux/macOS or specific drivers on Windows). Consult the documentation for `PyAudio` and `RealtimeTTS` (especially if using `CoquiEngine`) for specific OS requirements. 34 | 35 | ### Installation 36 | 37 | 1. **Clone the Repository:** 38 | ```bash 39 | git clone https://github.com/Nlouis38/ada.git 40 | cd ada_v1 41 | ``` 42 | 2. **Install Dependencies:** 43 | Create a virtual environment (recommended): 44 | ```bash 45 | python -m venv venv 46 | source venv/bin/activate # On Windows use `venv\Scripts\activate` 47 | ``` 48 | Install the required Python libraries: 49 | ```bash 50 | pip install ollama websockets pyaudio RealtimeSTT RealtimeTTS torch google-generativeai opencv-python pillow mss psutil GPUtil elevenlabs python-dotenv python-weather googlemaps # Add any other specific libraries used 51 | ``` 52 | 53 | ## API Key Setup (Environment Variables Recommended) 54 | 55 | Both `ada_online` and `multimodal_live_api.py` require API keys for cloud services. It is **highly recommended** to use environment variables for security instead of hardcoding keys into the scripts. 56 | 57 | 1. **Create a `.env` file:** In the root `ada_v1` directory, create a file named `.env`. 58 | 2. **Add Keys to `.env`:** Open the `.env` file and add your keys in the following format: 59 | 60 | ```dotenv 61 | # .env file 62 | GOOGLE_API_KEY=YOUR_GOOGLE_AI_STUDIO_KEY_HERE 63 | ELEVENLABS_API_KEY=YOUR_ELEVENLABS_KEY_HERE 64 | MAPS_API_KEY=YOUR_Maps_API_KEY_HERE 65 | ``` 66 | 67 | 3. **Get the Keys:** 68 | 69 | - **Google Generative AI (Gemini API):** 70 | - **Purpose:** Core LLM for `ada_online` and `multimodal_live_api.py`. 71 | - **Get:** Visit [Google AI Studio](https://aistudio.google.com/), sign in, and create an API key. 72 | - **ElevenLabs:** 73 | - **Purpose:** High-quality Text-to-Speech (TTS) for `ada_online`. 
74 | - **Get:** Go to [ElevenLabs](https://elevenlabs.io/), log in, and find your API key in your profile/settings. 75 | - **Google Maps:** 76 | - **Purpose:** Used by the `get_travel_duration` function tool in `ada_online`. 77 | - **Get:** Go to the [Google Cloud Console](https://console.cloud.google.com/), create a project (or use an existing one), enable the "Directions API", and create an API key under "Credentials". 78 | 79 | 4. **Code Usage:** The Python scripts (`ADA_Online.py`, `multimodal_live_api.py`, `tts_latency_test.py`) use `python-dotenv` to automatically load these variables from the `.env` file when the script starts. 80 | 81 | ```python 82 | # Example from ADA_Online.py 83 | from dotenv import load_dotenv 84 | load_dotenv() # Loads variables from .env 85 | 86 | ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY") 87 | GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") 88 | MAPS_API_KEY = os.getenv("MAPS_API_KEY") 89 | 90 | # ... later use these variables ... 91 | self.client = genai.Client(api_key=GOOGLE_API_KEY, ...) 92 | # or when initializing ElevenLabsEngine/Websocket connection 93 | ``` 94 | 95 | ## Speech-to-Text (STT) and Text-to-Speech (TTS) 96 | 97 | ADA uses real-time libraries for voice interaction: 98 | 99 | - **STT (Speech-to-Text):** 100 | - **Library:** `RealtimeSTT` is used in both `ada_local` and `ada_online`. 101 | - **Functionality:** Captures audio from the default microphone, detects speech, and transcribes it to text using a backend model (e.g., Whisper `large-v3` specified in the configs). 102 | - **TTS (Text-to-Speech):** 103 | - **Library:** `RealtimeTTS` provides the framework. Different _engines_ handle the actual synthesis: 104 | - **`ada_local`:** Uses `RealtimeTTS` likely with `SystemEngine` (OS default TTS) or potentially `CoquiEngine` (local neural voice, requires setup). Quality and latency depend heavily on the chosen engine and system hardware. 
105 | - **`ada_online` (Recommended):** Uses `ElevenlabsEngine` via WebSockets. This typically provides very low latency and high-quality, natural-sounding voices, but requires an ElevenLabs API key and internet connection. 106 | - **`ada_online_noelevenlabs`:** Uses `RealtimeTTS` with `SystemEngine`, offering an online LLM experience without needing an ElevenLabs key, but using the basic OS TTS voice. 107 | 108 | ## Running ADA 109 | 110 | ### `ada_local` 111 | 112 | Uses Ollama for the LLM and local engines for STT/TTS. Performance depends significantly on your CPU/GPU and RAM. 113 | 114 | - **LLM:** Served locally via Ollama (e.g., `gemma3:4b-it-q4_K_M`). 115 | - **STT:** `RealtimeSTT`. 116 | - **TTS:** `RealtimeTTS` with `SystemEngine` or `CoquiEngine`. 117 | - **To run:** 118 | ```bash 119 | # Ensure Ollama is running with the required model pulled 120 | python main_local.py 121 | ``` 122 | 123 | ### `ada_online` (Recommended) 124 | 125 | Uses Google Gemini (cloud) for LLM and ElevenLabs (cloud) for TTS. Requires API keys and internet. Generally faster and higher quality. 126 | 127 | - **LLM:** Google Gemini (`gemini-2.0-flash-live-001` or similar). 128 | - **STT:** `RealtimeSTT`. 129 | - **TTS:** `RealtimeTTS` with `ElevenlabsEngine` via WebSockets. 130 | - **To run:** 131 | ```bash 132 | # Make sure .env file is set up with API keys 133 | python main_online.py 134 | ``` 135 | 136 | ### `ada_online_noelevenlabs` 137 | 138 | Uses Google Gemini (cloud) for LLM and local OS TTS. A middle ground if you want the better online LLM but don't have/want an ElevenLabs key. 139 | 140 | - **LLM:** Google Gemini (`gemini-2.0-flash-live-001` or similar). 141 | - **STT:** `RealtimeSTT`. 142 | - **TTS:** `RealtimeTTS` with `SystemEngine`. 
143 | - **To run:** 144 | ```bash 145 | # Make sure .env file is set up with GOOGLE_API_KEY and MAPS_API_KEY 146 | python main_online_noelevenlabs.py 147 | ``` 148 | 149 | ## Multimodal Live API Demo (`multimodal_live_api.py`) 150 | 151 | This script demonstrates real-time, multimodal interaction using the Gemini Live API. It streams audio from your microphone and video frames (from your camera or screen) to the Gemini model and plays back the audio response. 152 | 153 | ### Setup (Multimodal Demo) 154 | 155 | - Ensure dependencies are installed (see main Installation section). 156 | - Ensure your `GOOGLE_API_KEY` is set in your `.env` file. 157 | - **Use headphones!** 158 | 159 | ### Running (Multimodal Demo) 160 | 161 | - **With Camera:** 162 | ```bash 163 | python multimodal_live_api.py --mode camera # or just python multimodal_live_api.py 164 | ``` 165 | - **With Screen Sharing:** 166 | ```bash 167 | python multimodal_live_api.py --mode screen 168 | ``` 169 | - **Audio Only:** 170 | ```bash 171 | python multimodal_live_api.py --mode none 172 | ``` 173 | - You can type text messages in the console while the audio/video stream is running. Type 'q' and Enter to quit. 174 | 175 | ## Usage (Main ADA Scripts) 176 | 177 | Once `main_local.py`, `main_online.py`, or `main_online_noelevenlabs.py` is running: 178 | 179 | - **Voice Input:** Speak clearly into your microphone. The STT engine will detect speech and transcribe it. 180 | - **Text Input:** If you prefer typing, type your prompt into the console when it says "Enter your message:" and press Enter. 181 | - **Exit:** Type `exit` and press Enter. 182 | 183 | ## Widgets / Tools 184 | 185 | ADA (`ada_local` and `ada_online`) can utilize several built-in functions/tools: 186 | 187 | - **Local Widgets (`WIDGETS/` directory):** Primarily used by `ada_local`. 188 | - `camera.py`: Opens the default camera feed. (_Note: Implementation returns string, doesn't keep feed open_) 189 | - `project.py`: Creates project folders. 
190 | - `system.py`: Provides system hardware information. 191 | - `timer.py`: Sets countdown timers. 192 | - `to_do_list.py`: Manages a simple to-do list. (_Not integrated_) 193 | - **Online Tools (Gemini API):** Used by `ada_online` versions. 194 | - `GoogleSearch`: Accesses Google Search for current information. 195 | - `get_weather`: Fetches weather using `python-weather`. 196 | - `get_travel_duration`: Calculates travel time using `googlemaps`. 197 | - `CodeExecution`: Allows Gemini to generate and potentially execute code (primarily for analysis/computation, not file system interaction). 198 | 199 | ADA decides when to call these based on your request and the model's understanding. 200 | 201 | ## Troubleshooting 202 | 203 | - **Audio Issues (No Input/Output):** 204 | - Ensure microphone/speakers are system defaults and not muted. 205 | - Check `PyAudio` dependencies (`portaudio`). 206 | - Ensure necessary permissions are granted for microphone access. 207 | - Try different audio devices if available. 208 | - Check for `FFmpeg` if errors mention audio encoding/decoding. 209 | - **API Key Errors (`ada_online`, `multimodal_live_api.py`):** 210 | - Verify keys are correct in the `.env` file. 211 | - Ensure the relevant APIs (Gemini, Maps, ElevenLabs) are enabled in their respective cloud consoles. 212 | - Check API key quotas and billing status. 213 | - **Library Errors:** 214 | - Ensure all dependencies from `Installation` are correctly installed in your active virtual environment. 215 | - Some libraries (e.g., `torch`, `tensorflow` used by STT/TTS backends) might have specific CPU/GPU version requirements. 216 | - **Ollama Issues (`ada_local`):** 217 | - Confirm Ollama service is running. 218 | - Verify the specified model (e.g., `gemma3:4b-it-q4_K_M`) is downloaded (`ollama pull model_name`) and accessible. 219 | - Check Ollama logs for errors. 220 | - **TTS Issues:** 221 | - If using `ElevenlabsEngine`, check API key and internet connection. 
222 | - If using `CoquiEngine`, ensure it's installed correctly and models are downloaded. 223 | - If using `SystemEngine`, ensure your OS's built-in TTS is functional. Latency might be higher. 224 | - **STT Issues:** 225 | - Check microphone levels. 226 | - Ensure `RealtimeSTT` model is appropriate for your hardware (larger models need more resources). 227 | - Background noise can interfere. Use headphones. 228 | -------------------------------------------------------------------------------- /ADA/ADA_Online_NoElevenlabs.py: -------------------------------------------------------------------------------- 1 | # --- Keep necessary imports --- 2 | import asyncio 3 | import pyaudio # Still needed for RealtimeSTT 4 | from RealtimeSTT import AudioToTextRecorder 5 | import torch 6 | import re 7 | from google.genai import types 8 | from google import genai 9 | import os 10 | from google.genai.types import Tool, GoogleSearch, Part, Blob, Content 11 | import python_weather 12 | import googlemaps 13 | from datetime import datetime 14 | from dotenv import load_dotenv 15 | 16 | # --- Add RealtimeTTS imports --- 17 | from RealtimeTTS import TextToAudioStream, SystemEngine # Using SystemEngine as per reference example 18 | # from RealtimeTTS import CoquiEngine # Uncomment if you want to use CoquiTTS (requires installation) 19 | 20 | # --- Load Environment Variables (Remove ElevenLabs key) --- 21 | load_dotenv() 22 | # ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY") # Removed 23 | GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") 24 | MAPS_API_KEY = os.getenv("MAPS_API_KEY") 25 | 26 | # --- Validate API Keys --- 27 | # if not ELEVENLABS_API_KEY: print("Error: ELEVENLABS_API_KEY not found in environment variables.") # Removed 28 | if not GOOGLE_API_KEY: print("Error: GOOGLE_API_KEY not found in environment variables.") 29 | if not MAPS_API_KEY: print("Error: MAPS_API_KEY not found in environment variables.") 30 | # --- End API Key Validation --- 31 | 32 | # VOICE_ID = 
'pFZP5JQG7iQjIQuC4Bku' # Removed (Specific to ElevenLabs) 33 | 34 | FORMAT = pyaudio.paInt16 35 | CHANNELS = 1 36 | # SEND_SAMPLE_RATE = 16000 # Keep if used by RealtimeSTT 37 | # RECEIVE_SAMPLE_RATE = 24000 # RealtimeTTS handles its own output rate 38 | # CHUNK_SIZE = 1024 # Less relevant for RealtimeTTS feed/play approach 39 | 40 | class ADA: 41 | def __init__(self): 42 | print("initializing...") 43 | 44 | # Check for CUDA availability 45 | if torch.cuda.is_available(): 46 | self.device = "cuda" 47 | print("CUDA is available. Using GPU.") 48 | else: 49 | self.device = "cpu" 50 | print("CUDA is not available. Using CPU.") 51 | 52 | # --- Initialize Google GenAI Client (Keep) --- 53 | self.client = genai.Client(api_key=GOOGLE_API_KEY, http_options={'api_version': 'v1beta'}) 54 | self.model = "gemini-2.0-flash-live-001" 55 | 56 | # --- System Behavior Prompt (Keep) --- 57 | self.system_behavior = """ 58 | Your name is Ada, which stands for Advanced Design Assistant. 59 | You have a joking personality. You are an AI designed to assist with engineering projects, and you are an expert in all engineering, math, and science disciplines. 60 | You address people as "Sir" and you also speak with a british accent. 61 | When answering, you respond using complete sentences and in a conversational tone. Make sure to keep tempo of answers quick so don't use too much commas, periods or overall punctuation. 62 | Any prompts that need current or recent data always use the search tool. 
63 | """ 64 | 65 | # --- Function Declarations (Keep) --- 66 | self.get_weather_func = types.FunctionDeclaration( 67 | name="get_weather", 68 | description="Get the current weather conditions (temperature, precipitation, description) for a specified city and state/country (e.g., 'Vinings, GA', 'London, UK').", 69 | parameters=types.Schema( 70 | type=types.Type.OBJECT, properties={"location": types.Schema(type=types.Type.STRING, description="The city and state, e.g., San Francisco, CA or Vinings, GA")}, required=["location"] 71 | ) 72 | ) 73 | self.get_travel_duration_func = types.FunctionDeclaration( 74 | name="get_travel_duration", 75 | description="Calculates the estimated travel duration between a specified origin and destination using Google Maps. Considers current traffic for driving mode.", 76 | parameters=types.Schema( 77 | type=types.Type.OBJECT, properties={ 78 | "origin": types.Schema(type=types.Type.STRING, description="The starting address or place name."), 79 | "destination": types.Schema(type=types.Type.STRING, description="The destination address or place name."), 80 | "mode": types.Schema(type=types.Type.STRING, description="Optional: Mode of transport ('driving', 'walking', etc.). 
Defaults to 'driving'.") 81 | }, required=["origin", "destination"] 82 | ) 83 | ) 84 | # --- End Function Declarations --- 85 | 86 | # --- Map function names to actual methods (Keep) --- 87 | self.available_functions = { 88 | "get_weather": self.get_weather, 89 | "get_travel_duration": self.get_travel_duration 90 | } 91 | 92 | # --- Google Search Tool (Grounding) --- 93 | self.google_search_tool = Tool( 94 | google_search = GoogleSearch() 95 | ) 96 | 97 | # --- Configuration (Updated tools list) --- 98 | self.config = types.LiveConnectConfig( 99 | system_instruction=types.Content( 100 | parts=[types.Part(text=self.system_behavior)] 101 | ), 102 | response_modalities=["TEXT"], 103 | # ---> Updated tools list <--- 104 | tools=[self.google_search_tool, types.Tool(code_execution=types.ToolCodeExecution,function_declarations=[ 105 | self.get_weather_func, 106 | self.get_travel_duration_func # Add the new function here 107 | ])] 108 | ) 109 | # --- End Configuration --- 110 | 111 | # --- Queues (Remove audio_queue) --- 112 | self.input_queue = asyncio.Queue() 113 | self.response_queue = asyncio.Queue() 114 | # self.audio_queue = asyncio.Queue() # Removed - RealtimeTTS handles playback 115 | 116 | # --- Recorder Config (Keep) --- 117 | self.recorder_config = { 118 | 'model': 'large-v3', 119 | 'spinner': False, 120 | 'language': 'en', 121 | 'silero_sensitivity': 0.01, 122 | 'webrtc_sensitivity': 3, 123 | 'post_speech_silence_duration': 0.1, 124 | 'min_length_of_recording': 0.2, 125 | 'min_gap_between_recordings': 0, 126 | } 127 | 128 | # --- Initialize Recorder and PyAudio (Keep) --- 129 | try: 130 | self.recorder = AudioToTextRecorder(**self.recorder_config) 131 | except Exception as e: 132 | print(f"Error initializing AudioToTextRecorder: {e}") 133 | self.recorder = None 134 | 135 | try: 136 | # PyAudio might still be needed by RealtimeSTT or underlying STT engine 137 | self.pya = pyaudio.PyAudio() 138 | except Exception as e: 139 | print(f"Error initializing PyAudio: 
{e}")
            self.pya = None

        # --- Initialize RealtimeTTS Engine and Stream ---
        print("Initializing TTS Engine...")
        try:
            # Use SystemEngine for default OS TTS. Replace with CoquiEngine if preferred.
            # self.engine = CoquiEngine(device=self.device) # Requires CoquiTTS installation
            self.engine = SystemEngine()
            self.stream = TextToAudioStream(self.engine)
            print("TTS Engine Initialized.")
        except Exception as e:
            # Engine/stream are left as None; tts() checks self.stream before use.
            print(f"Error initializing RealtimeTTS: {e}")
            self.engine = None
            self.stream = None
        # --- End TTS Initialization ---

        # --- End Initialization ---


    # --- Function Implementations (Keep get_weather, get_travel_duration) ---
    async def get_weather(self, location: str) -> dict | None:
        """ Fetches current weather.

        Returns a dict of weather fields on success, or a dict with an
        'error' key on failure; never raises to the caller.
        """
        async with python_weather.Client(unit=python_weather.IMPERIAL) as client:
            try:
                weather = await client.get(location)
                # NOTE(review): attribute names (temperature/precipitation/description)
                # assume the installed python_weather version exposes them — verify.
                weather_data = {
                    'location': location,
                    'current_temp_f': weather.temperature,
                    'precipitation': weather.precipitation,
                    'description': weather.description,
                }
                print(f"Weather data fetched: {weather_data}")
                return weather_data
            except Exception as e:
                print(f"Error fetching weather for {location}: {e}")
                return {"error": f"Could not fetch weather for {location}."}

    def _sync_get_travel_duration(self, origin: str, destination: str, mode: str = "driving") -> str:
        """ Synchronous helper for Google Maps API call.

        Returns a human-readable duration string, or an error/no-route
        message string on failure (never raises).
        """
        if not MAPS_API_KEY or MAPS_API_KEY == "YOUR_PROVIDED_KEY":
            print("Error: Google Maps API Key is missing or invalid.")
            return "Error: Missing or invalid Google Maps API Key configuration."
        try:
            gmaps = googlemaps.Client(key=MAPS_API_KEY)
            now = datetime.now()
            print(f"Requesting directions: From='{origin}', To='{destination}', Mode='{mode}'")
            directions_result = gmaps.directions(origin, destination, mode=mode, departure_time=now)
            if directions_result:
                # First route, first leg of the directions response.
                leg = directions_result[0]['legs'][0]
                result = f"Duration information not found in response for {mode}."
                # Prefer the traffic-aware estimate (present for driving + departure_time).
                if mode == "driving" and 'duration_in_traffic' in leg:
                    duration_text = leg['duration_in_traffic']['text']
                    result = f"Estimated travel duration ({mode}, with current traffic): {duration_text}"
                elif 'duration' in leg:
                    duration_text = leg['duration']['text']
                    result = f"Estimated travel duration ({mode}): {duration_text}"
                print(f"Directions Result: {result}")
                return result
            else:
                print(f"No route found from {origin} to {destination} via {mode}.")
                return f"Could not find a route from {origin} to {destination} via {mode}."
        except googlemaps.exceptions.ApiError as api_err:
            print(f"Google Maps API Error: {api_err}")
            return f"Error contacting Google Maps: {api_err}"
        except Exception as e:
            print(f"An unexpected error occurred during travel duration lookup: {e}")
            return f"An unexpected error occurred: {e}"

    async def get_travel_duration(self, origin: str, destination: str, mode: str = "driving") -> dict:
        """ Async wrapper to get travel duration.
        """
        print(f"Received request for travel duration from: {origin} to: {destination}, Mode: {mode}")
        if not mode: mode = "driving"
        try:
            # Run the blocking googlemaps call off the event loop.
            result_string = await asyncio.to_thread(self._sync_get_travel_duration, origin, destination, mode)
            return {"duration_result": result_string}
        except Exception as e:
            print(f"Error calling _sync_get_travel_duration via to_thread: {e}")
            return {"duration_result": f"Failed to execute travel duration request: {e}"}
    # --- End Function Implementations ---


    async def clear_queues(self, text=""):
        """Clears input and response queues.

        `text` is unused; kept for caller compatibility. Items are discarded
        without task_done(), so do not combine with Queue.join().
        """
        # Removed audio_queue
        queues = [self.input_queue, self.response_queue]
        for q in queues:
            while not q.empty():
                try:
                    q.get_nowait()
                except asyncio.QueueEmpty:
                    break

    async def input_message(self):
        """ Handles user text input (Keep). Typing 'exit' enqueues a None
        sentinel that send_prompt() interprets as shutdown. """
        while True:
            try:
                prompt = await asyncio.to_thread(input, "Enter your message: ")
                if prompt.lower() == "exit":
                    await self.input_queue.put(None) # Use None as signal
                    print("exit input")
                    break
                await self.clear_queues()
                await self.input_queue.put(prompt)
            except Exception as e:
                print(f"Error in input_message: {e}")
                continue

    # --- send_prompt: (Keep Function Calling/Grounding logic) ---
    async def send_prompt(self):
        """Manages the Gemini conversation session, handling text and tool calls.

        Consumes prompts from input_queue, streams text chunks to
        response_queue, and terminates a turn with a None sentinel for tts().
        """
        print("Starting Gemini session manager...")
        try:
            async with self.client.aio.live.connect(model=self.model, config=self.config) as session:
                print("Gemini session connected.")
                while True:
                    message = await self.input_queue.get()
                    if message is None: # Check for exit signal
                        print("Exit signal received in send_prompt.")
                        break
                    if not session:
                        print("Gemini session is not active.")
                        self.input_queue.task_done(); continue

                    print(f"Sending FINAL text input to Gemini: {message}")
                    await session.send(input=message, end_of_turn=True)
                    print("Final text message sent to Gemini, waiting for response...")

                    # --- Process responses (Keep Function Calling Logic) ---
                    async for response in session.receive():
                        try:
                            # --- Handle Tool Calls (Function Calling) ---
                            if response.tool_call:
                                # Only the first function call of the batch is handled.
                                function_call_details = response.tool_call.function_calls[0]
                                tool_call_id = function_call_details.id
                                tool_call_name = function_call_details.name
                                tool_call_args = dict(function_call_details.args)
                                print(f"--- Received Tool Call: {tool_call_name} with args: {tool_call_args} (ID: {tool_call_id}) ---")

                                if tool_call_name in self.available_functions:
                                    function_to_call = self.available_functions[tool_call_name]
                                    try:
                                        function_result = await function_to_call(**tool_call_args)
                                        func_resp = types.FunctionResponse(
                                            id=tool_call_id, name=tool_call_name, response={"content": function_result}
                                        )
                                        print(f"--- Sending Tool Response for {tool_call_name} (ID: {tool_call_id}) ---")
                                        # end_of_turn=False: Gemini continues the same turn with the tool result.
                                        await session.send(input=func_resp, end_of_turn=False)
                                    except Exception as e: print(f"Error executing function {tool_call_name}: {e}")
                                else: print(f"Error: Unknown function called by Gemini: {tool_call_name}")
                                continue # Move to next response chunk

                            # --- Handle Text Responses ---
                            elif response.text:
                                text_chunk = response.text
                                print(text_chunk, end="", flush=True)
                                await self.response_queue.put(text_chunk) # Put chunk onto queue for TTS

                            # --- (Optional) Handle Executable Code Tool ---
                            elif (response.server_content and response.server_content.model_turn and
                                  response.server_content.model_turn.parts and response.server_content.model_turn.parts[0].executable_code):
                                try:
                                    executable_code = response.server_content.model_turn.parts[0].executable_code
                                    print(f"\n--- Received Executable Code ({str(executable_code.language)}) ---")
                                    print(executable_code.code)
                                    print("------------------------------------------")
                                # NOTE(review): broad except/pass — sibling file narrows this to
                                # (AttributeError, IndexError, TypeError); consider aligning.
                                except Exception: pass # Ignore errors silently

                        except Exception as e: print(f"\nError processing Gemini response chunk: {e}")
                    # --- End Processing Responses ---

                    print("\nEnd of Gemini response stream for this turn.")
                    await self.response_queue.put(None) # Signal end of response for TTS
                    self.input_queue.task_done()

        except asyncio.CancelledError: print("Gemini session task cancelled.")
        except Exception as e: print(f"Error in Gemini session manager: {e}")
        finally:
            print("Gemini session manager finished.")
            await self.response_queue.put(None) # Ensure sentinel is sent on exit/error


    # --- tts: Replaced with RealtimeTTS logic ---
    async def tts(self):
        """ Feeds text chunks to RealtimeTTS stream for synthesis and playback.

        Consumes response_queue; a None item marks end of a response turn.
        """
        if not self.stream:
            print("RealtimeTTS stream not initialized. Cannot perform TTS.")
            return

        print("TTS task started, waiting for text chunks...")
        while True:
            try:
                chunk = await self.response_queue.get()
                if chunk is None:
                    # End of response turn signal
                    print("TTS received end-of-response signal.")
                    # Optional: Add a small delay or check stream state before continuing
                    # self.stream.stop() # Might stop prematurely if playback is async
                    self.response_queue.task_done()
                    continue # Wait for the next turn

                if chunk: # Ensure chunk is not empty
                    # Feed the text chunk to the TTS stream
                    self.stream.feed(chunk)
                    # Start/continue asynchronous playback of buffered audio
                    self.stream.play_async()

                self.response_queue.task_done()

            except asyncio.CancelledError:
                print("TTS task cancelled.")
                if self.stream: self.stream.stop() # Stop playback on cancellation
                break
            except Exception as e:
                print(f"Error in TTS loop: {e}")
                if self.stream: self.stream.stop() # Stop playback on error
                # Add a small delay or attempt recovery if desired
                await asyncio.sleep(1)


    # --- play_audio: Removed, handled by RealtimeTTS ---
    # async def play_audio(self):
    #     """ Removed - Playback is now handled by self.stream.play_async() in tts method """
    #     pass

    async def stt(self):
        """ Listens via microphone and puts transcribed text onto input_queue.
        (Keep) """
        if self.recorder is None:
            print("Audio recorder (RealtimeSTT) is not initialized.")
            return

        print("Starting Speech-to-Text engine...")
        while True:
            try:
                # recorder.text() blocks until a phrase is transcribed; run off-loop.
                text = await asyncio.to_thread(self.recorder.text)
                if text:
                    print(f"STT Detected: {text}")
                    # Drop any stale pending input/response before queuing the new prompt.
                    await self.clear_queues()
                    await self.input_queue.put(text)
            except asyncio.CancelledError:
                print("STT task cancelled.")
                break
            except Exception as e:
                print(f"Error in STT loop: {e}")
                await asyncio.sleep(0.5)
# --- End of ADA Class ---

# --- Main Execution Block (Updated for RealtimeTTS) ---
async def main():
    """Wire up STT -> Gemini -> RealtimeTTS pipeline and run until cancelled."""
    print("Starting Ada Assistant...")
    ada = ADA()

    if ada.pya is None or ada.recorder is None or ada.stream is None:
        print("Failed to initialize audio/TTS components. Exiting.")
        return

    # Create tasks for each concurrent operation (Removed play_audio)
    tasks = [
        asyncio.create_task(ada.stt()), # Speech to Text -> input_queue
        asyncio.create_task(ada.send_prompt()), # input_queue -> Gemini (handles tools) -> response_queue
        asyncio.create_task(ada.tts()), # response_queue -> RealtimeTTS (feed + play_async)
        # asyncio.create_task(ada.input_message()) # Optional: Uncomment for text input instead of STT
    ]

    # Run tasks concurrently
    try:
        await asyncio.gather(*tasks)
    except asyncio.CancelledError:
        print("Main tasks cancelled.")
    finally:
        print("Cleaning up...")
        if ada.stream:
            print("Stopping TTS Stream...")
            ada.stream.stop() # Ensure TTS stream is stopped
        for task in tasks:
            if not task.done(): task.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        if ada.pya:
            print("Terminating PyAudio.")
            # Use run_in_executor for thread safety if needed, or simple to_thread
            await asyncio.to_thread(ada.pya.terminate)
420 | 421 | if __name__ == "__main__": 422 | try: 423 | asyncio.run(main()) 424 | except KeyboardInterrupt: 425 | print("\nExiting Ada Assistant...") 426 | except Exception as e: 427 | print(f"\nAn unexpected error occurred in main: {e}") -------------------------------------------------------------------------------- /ADA/ADA_Online.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import websockets 3 | import json 4 | import base64 5 | import pyaudio 6 | from RealtimeSTT import AudioToTextRecorder 7 | import torch # Import the torch library 8 | import re 9 | from google.genai import types 10 | import asyncio 11 | from google import genai 12 | import os 13 | from google.genai.types import Tool, GoogleSearch, Part, Blob, Content 14 | import python_weather 15 | import googlemaps # Added for travel duration 16 | from datetime import datetime # Added for travel duration 17 | from dotenv import load_dotenv # Added for API key loading 18 | 19 | # --- Load Environment Variables --- 20 | load_dotenv() 21 | ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY") 22 | GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") 23 | MAPS_API_KEY = os.getenv("MAPS_API_KEY") # Added Maps API Key 24 | 25 | # --- Validate API Keys --- 26 | if not ELEVENLABS_API_KEY: print("Error: ELEVENLABS_API_KEY not found in environment variables.") 27 | if not GOOGLE_API_KEY: print("Error: GOOGLE_API_KEY not found in environment variables.") 28 | if not MAPS_API_KEY: print("Error: MAPS_API_KEY not found in environment variables.") 29 | # --- End API Key Validation --- 30 | 31 | VOICE_ID = 'pFZP5JQG7iQjIQuC4Bku' 32 | 33 | FORMAT = pyaudio.paInt16 34 | CHANNELS = 1 35 | # SEND_SAMPLE_RATE = 16000 # Keep if used by RealtimeSTT or other input processing 36 | RECEIVE_SAMPLE_RATE = 24000 # For ElevenLabs output 37 | CHUNK_SIZE = 1024 38 | 39 | class ADA: 40 | def __init__(self): 41 | print("initializing...") 42 | 43 | # Check for CUDA availability 44 | if 
torch.cuda.is_available():
            self.device = "cuda"
            print("CUDA is available. Using GPU.")
        else:
            self.device = "cpu"
            print("CUDA is not available. Using CPU.")

        # --- Initialize Google GenAI Client ---
        # v1beta is required for the Live API used in send_prompt().
        self.client = genai.Client(api_key=GOOGLE_API_KEY, http_options={'api_version': 'v1beta'})
        self.model = "gemini-2.0-flash-live-001"

        # --- System Behavior Prompt (Updated from reference) ---
        self.system_behavior = """
        Your name is Ada, which stands for Advanced Design Assistant.
        You have a joking personality. You are an AI designed to assist with engineering projects, and you are an expert in all engineering, math, and science disciplines.
        You address people as "Sir" and you also speak with a british accent.
        When answering, you respond using complete sentences and in a conversational tone. Make sure to keep tempo of answers quick so don't use too much commas, periods or overall punctuation.
        Any prompts that need current or recent data always use the search tool.
        """

        # --- Function Declarations (Added get_travel_duration_func) ---
        self.get_weather_func = types.FunctionDeclaration(
            name="get_weather",
            description="Get the current weather conditions (temperature, precipitation, description) for a specified city and state/country (e.g., 'Vinings, GA', 'London, UK').",
            parameters=types.Schema(
                type=types.Type.OBJECT, properties={"location": types.Schema(type=types.Type.STRING, description="The city and state, e.g., San Francisco, CA or Vinings, GA")}, required=["location"]
            )
        )
        self.get_travel_duration_func = types.FunctionDeclaration(
            name="get_travel_duration",
            description="Calculates the estimated travel duration between a specified origin and destination using Google Maps. Considers current traffic for driving mode.",
            parameters=types.Schema(
                type=types.Type.OBJECT, properties={
                    "origin": types.Schema(type=types.Type.STRING, description="The starting address or place name."),
                    "destination": types.Schema(type=types.Type.STRING, description="The destination address or place name."),
                    "mode": types.Schema(type=types.Type.STRING, description="Optional: Mode of transport ('driving', 'walking', etc.). Defaults to 'driving'.")
                }, required=["origin", "destination"]
            )
        )
        # --- End Function Declarations ---

        # --- Map function names to actual methods (Added get_travel_duration) ---
        # Dispatch table used by send_prompt() when Gemini issues a tool call.
        self.available_functions = {
            "get_weather": self.get_weather,
            "get_travel_duration": self.get_travel_duration # Added mapping
        }

        # --- Google Search Tool (Grounding) ---
        self.google_search_tool = Tool(
            google_search = GoogleSearch()
        )

        # --- Configuration (Updated tools list) ---
        self.config = types.LiveConnectConfig(
            system_instruction=types.Content(
                parts=[types.Part(text=self.system_behavior)]
            ),
            response_modalities=["TEXT"],
            # ---> Updated tools list <---
            tools=[self.google_search_tool, types.Tool(code_execution=types.ToolCodeExecution,function_declarations=[
                self.get_weather_func,
                self.get_travel_duration_func # Add the new function here
            ])]
        )
        # --- End Configuration ---

        # --- Queues (Kept original relevant queues) ---
        self.input_queue = asyncio.Queue()    # STT/text input -> send_prompt
        self.response_queue = asyncio.Queue() # Gemini text chunks -> tts
        self.audio_queue = asyncio.Queue()    # ElevenLabs PCM bytes -> play_audio

        # --- Recorder Config (Kept original) ---
        self.recorder_config = {
            'model': 'large-v3',
            'spinner': False,
            'language': 'en',
            'silero_sensitivity': 0.01,
            'webrtc_sensitivity': 3,
            'post_speech_silence_duration': 0.1,
            'min_length_of_recording': 0.2,
            'min_gap_between_recordings': 0,
        }

        # --- Initialize Recorder and PyAudio (Kept original) ---
        # On failure these are set to None; stt()/play_audio() and main() check for that.
        try:
            self.recorder = AudioToTextRecorder(**self.recorder_config)
        except Exception as e:
            print(f"Error initializing AudioToTextRecorder: {e}")
            self.recorder = None

        try:
            self.pya = pyaudio.PyAudio()
        except Exception as e:
            print(f"Error initializing PyAudio: {e}")
            self.pya = None
        # --- End Initialization ---

    # --- Function Implementations ---

    async def get_weather(self, location: str) -> dict | None:
        """ Fetches current weather. (Removed SocketIO emit)

        Returns a dict of weather fields, or a dict with an 'error' key on
        failure; never raises to the caller.
        """
        async with python_weather.Client(unit=python_weather.IMPERIAL) as client:
            try:
                weather = await client.get(location)
                # NOTE(review): assumes these attributes exist on the installed
                # python_weather version — verify.
                weather_data = {
                    'location': location,
                    'current_temp_f': weather.temperature,
                    'precipitation': weather.precipitation,
                    'description': weather.description,
                }
                print(f"Weather data fetched: {weather_data}")
                # --- SocketIO Emit Removed ---
                return weather_data # Return data for Gemini

            except Exception as e:
                print(f"Error fetching weather for {location}: {e}")
                return {"error": f"Could not fetch weather for {location}."} # Return error info

    # --- Added Travel Duration Functions (from reference, removed SocketIO emit) ---
    def _sync_get_travel_duration(self, origin: str, destination: str, mode: str = "driving") -> str:
        """ Synchronous helper for Google Maps API call """
        if not MAPS_API_KEY or MAPS_API_KEY == "YOUR_PROVIDED_KEY": # Check the actual key
            print("Error: Google Maps API Key is missing or invalid.")
            return "Error: Missing or invalid Google Maps API Key configuration."
        try:
            gmaps = googlemaps.Client(key=MAPS_API_KEY) # Use the loaded key
            now = datetime.now()
            print(f"Requesting directions: From='{origin}', To='{destination}', Mode='{mode}'")
            directions_result = gmaps.directions(origin, destination, mode=mode, departure_time=now)
            if directions_result:
                # First route, first leg of the directions response.
                leg = directions_result[0]['legs'][0]
                duration_text = "Not available"
                result = f"Duration information not found in response for {mode}." # Default result
                # Prefer the traffic-aware estimate (driving + departure_time only).
                if mode == "driving" and 'duration_in_traffic' in leg:
                    duration_text = leg['duration_in_traffic']['text']
                    result = f"Estimated travel duration ({mode}, with current traffic): {duration_text}"
                elif 'duration' in leg:
                    duration_text = leg['duration']['text']
                    result = f"Estimated travel duration ({mode}): {duration_text}"

                print(f"Directions Result: {result}")
                return result
            else:
                print(f"No route found from {origin} to {destination} via {mode}.")
                return f"Could not find a route from {origin} to {destination} via {mode}."
        except googlemaps.exceptions.ApiError as api_err:
            print(f"Google Maps API Error: {api_err}")
            return f"Error contacting Google Maps: {api_err}"
        except Exception as e:
            print(f"An unexpected error occurred during travel duration lookup: {e}")
            return f"An unexpected error occurred: {e}"

    async def get_travel_duration(self, origin: str, destination: str, mode: str = "driving") -> dict:
        """ Async wrapper to get travel duration. (Removed SocketIO emit) """
        print(f"Received request for travel duration from: {origin} to: {destination}, Mode: {mode}")
        if not mode:
            mode = "driving"

        try:
            # Run the blocking Google Maps call off the event loop.
            result_string = await asyncio.to_thread(
                self._sync_get_travel_duration, origin, destination, mode
            )
            # --- SocketIO Emit Removed ---
            return {"duration_result": result_string} # Return result for Gemini

        except Exception as e:
            print(f"Error calling _sync_get_travel_duration via to_thread: {e}")
            return {"duration_result": f"Failed to execute travel duration request: {e}"}
    # --- End Travel Duration Functions ---


    async def clear_queues(self, text=""):
        """Clears all data from the input, response, and audio queues.

        `text` is unused; kept for caller compatibility. Items are discarded
        without task_done(), so do not combine with Queue.join().
        """
        queues = [self.input_queue, self.response_queue, self.audio_queue]
        for q in queues:
            while not q.empty():
                try:
                    q.get_nowait()
                except asyncio.QueueEmpty:
                    break # Queue is empty

    async def input_message(self):
        """ Handles user text input (Kept original). Typing 'exit' enqueues
        the literal "exit" string that send_prompt() treats as shutdown. """
        while True:
            try:
                prompt = await asyncio.to_thread(input, "Enter your message: ")
                if prompt.lower() == "exit":
                    await self.input_queue.put("exit") # Signal to exit
                    print("exit input")
                    break
                await self.clear_queues()
                await self.input_queue.put(prompt)
            except Exception as e:
                print(f"Error in input_message: {e}")
                continue # Continue the loop even if there's an error

    # --- send_prompt: Updated with Function Calling/Grounding logic from reference ---
    async def send_prompt(self):
        """Manages the Gemini conversation session, handling text and tool calls."""
        print("Starting Gemini session manager...")
        try:
            # Establish connection (same as original)
            async with self.client.aio.live.connect(model=self.model, config=self.config) as
session:
                print("Gemini session connected.")

                while True: # Loop to process text inputs
                    message = await self.input_queue.get()

                    # NOTE(review): assumes queue items are str; a None item would
                    # raise AttributeError here (the RealtimeTTS sibling file uses
                    # a None sentinel instead of "exit") — confirm producers.
                    if message.lower() == "exit":
                        print("Exit signal received in send_prompt.")
                        break # Exit the main loop

                    if not session: # Check session validity (though handled by async with)
                        print("Gemini session is not active.")
                        self.input_queue.task_done(); continue # Should not happen here

                    # Send the final text input for the turn (same as original)
                    print(f"Sending FINAL text input to Gemini: {message}")
                    await session.send(input=message, end_of_turn=True)
                    print("Final text message sent to Gemini, waiting for response...")

                    # --- Process responses (NEW LOGIC based on reference) ---
                    async for response in session.receive():
                        try:
                            # --- Handle Tool Calls (Function Calling) ---
                            if response.tool_call:
                                # Only the first function call of the batch is handled.
                                function_call_details = response.tool_call.function_calls[0]
                                tool_call_id = function_call_details.id
                                tool_call_name = function_call_details.name
                                tool_call_args = dict(function_call_details.args)

                                print(f"--- Received Tool Call: {tool_call_name} with args: {tool_call_args} (ID: {tool_call_id}) ---")

                                if tool_call_name in self.available_functions:
                                    function_to_call = self.available_functions[tool_call_name]
                                    try:
                                        # Execute the corresponding async function
                                        function_result = await function_to_call(**tool_call_args)

                                        # Construct the response to send back to Gemini
                                        func_resp = types.FunctionResponse(
                                            id=tool_call_id,
                                            name=tool_call_name,
                                            response={"content": function_result} # Send back the result dictionary
                                        )
                                        print(f"--- Sending Tool Response for {tool_call_name} (ID: {tool_call_id}) ---")
                                        # Send the function result back, don't end the turn yet
                                        await session.send(input=func_resp, end_of_turn=False)

                                    except Exception as e:
                                        print(f"Error executing function {tool_call_name}: {e}")
                                        # Decide how to handle function execution errors (e.g., send error back?)
                                        # For now, just print and continue waiting for Gemini's next step
                                else:
                                    print(f"Error: Unknown function called by Gemini: {tool_call_name}")
                                    # Decide how to handle unknown function calls
                                continue # Move to next response chunk after handling tool call

                            # --- Handle Text Responses ---
                            elif response.text:
                                text_chunk = response.text
                                print(text_chunk, end="", flush=True) # Print chunk immediately (like original)
                                await self.response_queue.put(text_chunk) # Put chunk onto queue for TTS

                            # --- (Optional) Handle Executable Code Tool (like reference, no SocketIO) ---
                            elif (response.server_content and
                                  response.server_content.model_turn and
                                  response.server_content.model_turn.parts and
                                  response.server_content.model_turn.parts[0].executable_code):
                                try:
                                    executable_code = response.server_content.model_turn.parts[0].executable_code
                                    code_string = executable_code.code
                                    language = str(executable_code.language) # Get language as string
                                    print(f"\n--- Received Executable Code ({language}) ---")
                                    print(code_string)
                                    print("------------------------------------------")
                                    # NOTE: No execution here, just printing. The library handles execution if configured.
                                except (AttributeError, IndexError, TypeError) as e:
                                    pass # Ignore errors if structure isn't as expected

                        except Exception as e:
                            print(f"\nError processing Gemini response chunk: {e}")
                            # Potentially break or continue depending on severity
                    # --- End Processing Responses ---

                    print("\nEnd of Gemini response stream for this turn.")
                    await self.response_queue.put(None) # Signal end of response for TTS
                    self.input_queue.task_done() # Mark input processed

        except asyncio.CancelledError:
            print("Gemini session task cancelled.")
        except Exception as e:
            print(f"Error in Gemini session manager: {e}")
        finally:
            print("Gemini session manager finished.")
            # No specific cleanup needed here unless tasks were managed differently

    async def tts(self):
        """ Send text to ElevenLabs API and stream the returned audio. (Kept Original Logic)"""
        uri = f"wss://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream-input?model_id=eleven_flash_v2_5&output_format=pcm_24000"
        while True: # Outer loop to handle reconnections
            print("Attempting to connect to ElevenLabs WebSocket...")
            try:
                async with websockets.connect(uri) as websocket:
                    print("ElevenLabs WebSocket Connected.")
                    try:
                        # Send initial configuration
                        await websocket.send(json.dumps({
                            "text": " ",
                            "voice_settings": {"stability": 0.4, "similarity_boost": 0.8, "speed": 1.1},
                            "xi_api_key": ELEVENLABS_API_KEY,
                        }))

                        async def listen():
                            """Listen to the websocket for audio data and queue it."""
                            while True:
                                try:
                                    message = await websocket.recv()
                                    data = json.loads(message)
                                    if data.get("audio"):
                                        # Put raw audio bytes onto the queue
                                        await self.audio_queue.put(base64.b64decode(data["audio"]))
                                    elif data.get("isFinal"):
                                        # Optional: Handle end-of-stream signal from ElevenLabs if needed
| pass 370 | # Removed `elif text is None:` check as it was incorrect scope 371 | except websockets.exceptions.ConnectionClosedOK: 372 | print("ElevenLabs connection closed normally by server.") 373 | break # Exit listener loop 374 | except websockets.exceptions.ConnectionClosedError as e: 375 | print(f"ElevenLabs connection closed with error: {e}") 376 | break # Exit listener loop 377 | except json.JSONDecodeError as e: 378 | print(f"JSON Decode Error in ElevenLabs listener: {e}") 379 | # Decide whether to break or continue 380 | except asyncio.CancelledError: 381 | print("ElevenLabs listener task cancelled.") 382 | raise # Re-raise cancellation 383 | except Exception as e: 384 | print(f"Error in ElevenLabs listener: {e}") 385 | break # Exit listener loop on other errors 386 | 387 | listen_task = asyncio.create_task(listen()) 388 | 389 | try: 390 | # Send text chunks from response queue 391 | while True: 392 | text = await self.response_queue.get() 393 | if text is None: # Signal to end the TTS stream for this turn 394 | print("End of text stream signal received for TTS.") 395 | await websocket.send(json.dumps({"text": ""})) # Send EOS signal 396 | break # Exit inner loop (sending text) 397 | 398 | if text: # Ensure text is not empty 399 | # Added space for potential word breaks 400 | await websocket.send(json.dumps({"text": text + " "})) 401 | 402 | self.response_queue.task_done() # Mark item as processed 403 | 404 | except asyncio.CancelledError: 405 | print("TTS text sender cancelled.") 406 | listen_task.cancel() # Cancel listener if sender is cancelled 407 | raise # Re-raise cancellation 408 | except Exception as e: 409 | print(f"Error processing text for TTS: {e}") 410 | listen_task.cancel() # Cancel listener on error 411 | finally: 412 | # Wait for the listener task to finish after text sending stops or errors 413 | if not listen_task.done(): 414 | print("Waiting for TTS listener task to complete...") 415 | try: 416 | await asyncio.wait_for(listen_task, 
timeout=5.0) 417 | except asyncio.TimeoutError: 418 | print("Timeout waiting for TTS listener task.") 419 | listen_task.cancel() 420 | except asyncio.CancelledError: 421 | print("TTS Listener was already cancelled.") # Expected if sender was cancelled 422 | except Exception as e: 423 | print(f"Error awaiting listener task: {e}") 424 | 425 | 426 | except websockets.exceptions.ConnectionClosed as e: 427 | print(f"ElevenLabs WebSocket connection closed during operation: {e}") 428 | # Outer loop will handle reconnection attempt 429 | except Exception as e: 430 | print(f"Error during ElevenLabs websocket communication: {e}") 431 | # Outer loop will handle reconnection attempt 432 | 433 | except websockets.exceptions.WebSocketException as e: 434 | print(f"ElevenLabs WebSocket connection failed: {e}") 435 | except asyncio.CancelledError: 436 | print("TTS main task cancelled.") 437 | break # Exit outer loop if cancelled 438 | except Exception as e: 439 | print(f"Error connecting to ElevenLabs websocket: {e}") 440 | 441 | print("Waiting 5 seconds before attempting ElevenLabs reconnection...") 442 | await asyncio.sleep(5) # Wait before retrying connection 443 | 444 | # Removed extract_tool_call method as it's replaced by direct handling in send_prompt 445 | 446 | async def play_audio(self): 447 | """ Plays audio chunks from the audio_queue. (Kept Original Logic) """ 448 | if self.pya is None: 449 | print("PyAudio is not initialized. Cannot play audio.") 450 | return 451 | 452 | stream = None # Initialize stream variable 453 | try: 454 | print("Opening PyAudio stream...") 455 | stream = await asyncio.to_thread( 456 | self.pya.open, 457 | format=FORMAT, 458 | channels=CHANNELS, 459 | rate=RECEIVE_SAMPLE_RATE, 460 | output=True, 461 | ) 462 | print("PyAudio stream opened. Waiting for audio chunks...") 463 | while True: 464 | try: 465 | # Wait for audio data from the TTS task 466 | bytestream = await self.audio_queue.get() 467 | if bytestream is None: # Potential signal to stop? 
(Not currently used) 468 | print("Received None in audio queue, stopping playback loop.") 469 | break 470 | # Write audio data to the stream in a separate thread 471 | await asyncio.to_thread(stream.write, bytestream) 472 | self.audio_queue.task_done() # Mark item as processed 473 | except asyncio.CancelledError: 474 | print("Audio playback task cancelled.") 475 | break # Exit loop if task is cancelled 476 | except Exception as e: 477 | print(f"Error in play_audio loop: {e}") 478 | # Decide if error is fatal or recoverable 479 | await asyncio.sleep(0.1) # Avoid busy-looping on error 480 | 481 | except pyaudio.PyAudioError as e: 482 | print(f"PyAudio error opening stream: {e}") 483 | except Exception as e: 484 | print(f"Error setting up audio stream: {e}") 485 | finally: 486 | if stream: 487 | print("Closing PyAudio stream...") 488 | await asyncio.to_thread(stream.stop_stream) 489 | await asyncio.to_thread(stream.close) 490 | print("PyAudio stream closed.") 491 | # Don't terminate PyAudio here if other parts might use it 492 | # await asyncio.to_thread(self.pya.terminate) 493 | 494 | async def stt(self): 495 | """ Listens via microphone and puts transcribed text onto input_queue. 
        (Kept Original Logic) """
        if self.recorder is None:
            print("Audio recorder (RealtimeSTT) is not initialized.")
            return

        print("Starting Speech-to-Text engine...")
        while True:
            try:
                # Blocking call handled in a thread
                text = await asyncio.to_thread(self.recorder.text)
                if text: # Only process if text is not empty
                    print(f"STT Detected: {text}")
                    await self.clear_queues() # Clear queues before adding new input
                    await self.input_queue.put(text) # Put transcribed text onto the input queue
            except asyncio.CancelledError:
                print("STT task cancelled.")
                break
            except Exception as e:
                print(f"Error in STT loop: {e}")
                # Add a small delay to prevent high CPU usage on continuous errors
                await asyncio.sleep(0.5)
# --- End of ADA Class ---

# --- Main Execution Block (Example) ---
async def main():
    """Wire up STT -> Gemini -> ElevenLabs -> speaker pipeline and run until cancelled."""
    print("Starting Ada Assistant...")
    ada = ADA()

    if ada.pya is None or ada.recorder is None:
        print("Failed to initialize audio components. Exiting.")
        return

    # Create tasks for each concurrent operation
    tasks = [
        asyncio.create_task(ada.stt()), # Speech to Text -> input_queue
        asyncio.create_task(ada.send_prompt()), # input_queue -> Gemini (handles tools) -> response_queue
        asyncio.create_task(ada.tts()), # response_queue -> ElevenLabs -> audio_queue
        asyncio.create_task(ada.play_audio()), # audio_queue -> Speaker
        # asyncio.create_task(ada.input_message()) # Optional: Uncomment for text input instead of STT
    ]

    # Run tasks concurrently
    try:
        await asyncio.gather(*tasks)
    except asyncio.CancelledError:
        print("Main tasks cancelled.")
    finally:
        print("Cleaning up...")
        # Gracefully stop tasks if needed (though gather handles cancellation)
        for task in tasks:
            if not task.done():
                task.cancel()
        await asyncio.gather(*tasks, return_exceptions=True) # Wait for cleanup
        if ada.pya:
            print("Terminating PyAudio.")
            await asyncio.to_thread(ada.pya.terminate) # Clean up PyAudio resources

if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nExiting Ada Assistant...")
    except Exception as e:
        print(f"\nAn unexpected error occurred in main: {e}")