├── Crow.py ├── CrowAssistant.py ├── CrowAssistant.spec ├── CrowBrain.py ├── CrowConfig.py ├── CrowSTT.py ├── LICENSE ├── README.md ├── Volume.py ├── config.json ├── crow.ico ├── images ├── crow-idle1.png ├── crow-idle2.png ├── crow-wingleft.png ├── crow-wingright.png ├── crowfly.png ├── crowhead-blink.png ├── crowhead-lookback.png ├── crowhead-tilt.png ├── crowhead-tiltold.png └── crowhead.png └── templates ├── index.html └── settings.html /Crow.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | import sys 3 | import os 4 | import random 5 | import psutil 6 | import CrowConfig 7 | import webbrowser 8 | 9 | 10 | # For Windows, we need to set these environment variables 11 | # to make the window clickthrough 12 | if os.name == 'nt': 13 | import ctypes 14 | try: 15 | ctypes.windll.user32.SetProcessDPIAware() 16 | except AttributeError: 17 | pass 18 | import win32gui 19 | import win32con 20 | import win32api # Add this import 21 | import win32process 22 | # Get the directory of the script 23 | script_dir = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | # Set the working directory to the script's directory 27 | os.chdir(script_dir) 28 | pygame_icon = pygame.image.load('crow.ico') 29 | pygame.display.set_icon(pygame_icon) 30 | 31 | def load_sprite_animation(name): 32 | sprites = [] 33 | index = 1 34 | while True: 35 | filename = f"{name}{index}.png" 36 | try: 37 | sprite = pygame.image.load(filename).convert_alpha() 38 | sprites.append(sprite) 39 | index += 1 40 | except FileNotFoundError: 41 | break 42 | return sprites 43 | 44 | def load_sprite_sheet(filename, sprite_size): 45 | sprite_sheet = pygame.image.load(filename).convert_alpha() 46 | sprite_width, sprite_height = sprite_size 47 | sheet_width, sheet_height = sprite_sheet.get_size() 48 | sprites = [] 49 | 50 | for y in range(0, sheet_height, sprite_height): 51 | for x in range(0, sheet_width, sprite_width): 52 | sprite = sprite_sheet.subsurface((x, y, sprite_width, sprite_height)) 53 | if not is_blank(sprite): 54 | sprites.append(sprite) 55 | 56 | return sprites 57 | 58 | def is_blank(sprite): 59 | for x in range(sprite.get_width()): 60 | for y in range(sprite.get_height()): 61 | if sprite.get_at((x, y)).a != 0: 62 | return False 63 | return True 64 | 65 | 66 | 67 | class CrowAnimationController: 68 | def __init__(self, crow): 69 | self.crow = crow 70 | self.head_index = 0 71 | self.idle_index = 0 72 | self.fly_index = 0 73 | self.blink_timer = 0 74 | self.lookback_timer = 0 75 | 76 | 77 | # Animation speeds 78 | self.head_animation_speed = 10 # frames per second 79 | self.idle_animation_speed = 1 # frames per second 80 | self.fly_animation_speed = 10 # frames per second 81 | self.blink_interval = 10 # frames between blinks 82 | self.lookback_interval = 120 # frames between lookbacks 83 | 84 | # Timers 85 | self.idle_timer = 0 86 | self.fly_timer = 0 87 | self.blink_cooldown = 0 88 | self.lookback_cooldown = 0 89 | 90 | def update(self): 91 | # Update head animation based on volume 92 | if self.crow.volume > 0: 93 | target_index = int(len(self.crow.head) * self.crow.volume) 94 | self.head_index = min(target_index, len(self.crow.head) - 1) 95 | else: 96 | self.head_index = 0 97 | 98 | # Update idle animation 99 | if not self.crow.flying: 100 | self.idle_timer += 1 101 | if self.idle_timer >= (60 / self.idle_animation_speed): 102 | self.idle_index = (self.idle_index + 1) % len(self.crow.idle) 103 | self.idle_timer = 0 104 | 105 | # Update fly animation 106 | if 
self.crow.flying: 107 | self.fly_timer += 1 108 | if self.fly_timer >= (60 / self.fly_animation_speed): 109 | self.fly_index = (self.fly_index + 1) % len(self.crow.fly) 110 | self.fly_timer = 0 111 | 112 | # Update blink timer 113 | if self.crow.volume == 0: 114 | self.blink_cooldown -= 1 115 | if self.blink_cooldown <= 0: 116 | self.blink_timer = self.blink_interval 117 | self.blink_cooldown = random.randint(self.blink_interval, self.blink_interval * 20) 118 | if random.random() < 0.1: # 10% chance to look back instead of blink 119 | self.lookback_timer = 120 120 | else: 121 | self.lookback_timer = 0 122 | 123 | # Decrement blink timer 124 | if self.blink_timer > 0: 125 | self.blink_timer -= 1 126 | 127 | # Update lookback timer 128 | if self.lookback_timer > 0: 129 | self.lookback_timer -= 1 130 | self.lookback_cooldown = self.lookback_interval 131 | 132 | # Update lookback cooldown 133 | if self.lookback_cooldown > 0: 134 | self.lookback_cooldown -= 1 135 | 136 | def render(self, screen): 137 | # Render body 138 | if self.crow.flying: 139 | screen.blit(self.crow.fly[self.fly_index], (0, 0)) 140 | else: 141 | screen.blit(self.crow.idle[self.idle_index], (0, 0)) 142 | 143 | # Render head 144 | if self.crow.listen and not self.crow.Sleeping: 145 | screen.blit(self.crow.headtilt, (0, 0)) 146 | elif not self.crow.flying and self.lookback_timer > 0 and self.crow.volume == 0: 147 | screen.blit(self.crow.headlookback, (0, 0)) 148 | elif self.blink_timer > 0 and self.crow.volume == 0: 149 | screen.blit(self.crow.headblink, (0, 0)) 150 | else: 151 | screen.blit(self.crow.head[self.head_index], (0, 0)) 152 | 153 | return screen 154 | 155 | def Init(): 156 | return DesktopPet.get_instance() 157 | 158 | class DesktopPet: 159 | _instance = None 160 | 161 | @classmethod 162 | def get_instance(cls): 163 | if cls._instance is None: 164 | cls._instance = cls() 165 | return cls._instance 166 | 167 | def __init__(self): 168 | if DesktopPet._instance is not None: 169 | raise Exception("This class is a singleton!") 170 | else: 171 | DesktopPet._instance = self 172 | #pygame.init() 173 | self.last_click_time = 0 174 | self.double_click_threshold = 500 # milliseconds 175 | self.config = CrowConfig.config() 176 | self.Sleeping = False 177 | self.SleepTimer = 0 178 | self.scale = self.config.config['scale'] 179 | # Set up the window 180 | 181 | self.screen = pygame.display.set_mode((64*self.scale, 64*self.scale), pygame.NOFRAME) 182 | pygame.display.set_caption("Crow") 183 | print("title set") 184 | 185 | self.right = False 186 | 187 | 188 | self.volume = 0 #a float that is a voice volume from 0 to 1 189 | self.listen = False 190 | self.flying = False 191 | 192 | # Load sprites 193 | self.head = load_sprite_sheet("images/crowhead.png",(64,64)) # a set of mouth animations the last is wide open, the first is closed. 
we should lerp to these from the volume 194 | self.headblink = pygame.image.load("images/crowhead-blink.png").convert_alpha() #blink randomly every once in a while as long as talking is 0 195 | self.headlookback = pygame.image.load("images/crowhead-lookback.png").convert_alpha() #rarely instead of blink 196 | self.headtilt = pygame.image.load("images/crowhead-tilt.png").convert_alpha() #if listen is true, this should render 197 | 198 | self.idle = load_sprite_animation("images/crow-idle") #when not moving 199 | self.fly = load_sprite_sheet("images/crowfly.png",(64,64)) #when moving to a new spot 200 | print("Sprites Loaded") 201 | 202 | 203 | 204 | # Set the window to be transparent 205 | self.screen.set_colorkey((0,0,0)) # Black will be transparent, any sprites can not be black 206 | self.screen.fill((0,0,0)) 207 | 208 | # For Windows, set the window to be clickthrough 209 | if os.name == 'nt': 210 | hwnd = pygame.display.get_wm_info()["window"] 211 | win32gui.SetWindowLong(hwnd, win32con.GWL_EXSTYLE, 212 | win32gui.GetWindowLong(hwnd, win32con.GWL_EXSTYLE) | win32con.WS_EX_LAYERED) 213 | win32gui.SetLayeredWindowAttributes(hwnd, win32api.RGB(0,0,0), 0, win32con.LWA_COLORKEY) 214 | 215 | 216 | self.clock = pygame.time.Clock() 217 | self.dragging = False 218 | self.hwnd = pygame.display.get_wm_info()["window"] 219 | self.set_always_on_top() 220 | self.current_window = None 221 | self.target_x = 0 222 | self.target_y = 0 223 | self.current_x = 0 224 | self.current_y = 0 225 | self.move_speed = 4 # Adjust this to change animation speed 226 | self.wincheck = 0 227 | 228 | self.running = True 229 | self.animation_controller = CrowAnimationController(self) 230 | self.wincheck = 0 231 | 232 | 233 | 234 | def set_always_on_top(self): 235 | win32gui.SetWindowPos( 236 | self.hwnd, 237 | win32con.HWND_TOPMOST, 238 | 0, 0, 0, 0, 239 | win32con.SWP_NOMOVE | win32con.SWP_NOSIZE 240 | ) 241 | 242 | 243 | def get_focused_window(self): 244 | focused = win32gui.GetForegroundWindow() 245 | if focused == self.hwnd: 246 | return None # Return None if our window is focused 247 | title = win32gui.GetWindowText(focused) 248 | if title: 249 | return focused 250 | return None 251 | 252 | def get_window_info(self, hwnd): 253 | if hwnd: 254 | try: 255 | rect = win32gui.GetWindowRect(hwnd) 256 | return rect 257 | except win32gui.error: 258 | return None 259 | return None 260 | 261 | def move_to_window(self, hwnd): 262 | if hwnd and hwnd != self.hwnd: # Only move if it's not our own window 263 | rect = self.get_window_info(hwnd) 264 | if rect: 265 | window_width = rect[2] - rect[0] 266 | window_bottom = rect[3] 267 | 268 | # Set new target x and y positions 269 | self.target_x = random.randint(rect[0], rect[2] - self.screen.get_width()) 270 | self.target_y = window_bottom - self.screen.get_height() 271 | else: 272 | # If we can't get window info, move to a default position 273 | self.target_x = 0 274 | self.target_y = win32api.GetSystemMetrics(win32con.SM_CYSCREEN) - self.screen.get_height() 275 | 276 | def move_to_taskbar_clock(self): 277 | # Get the screen size 278 | screen_width = win32api.GetSystemMetrics(win32con.SM_CXSCREEN) 279 | screen_height = win32api.GetSystemMetrics(win32con.SM_CYSCREEN) 280 | 281 | # Get the taskbar height 282 | taskbar_hwnd = win32gui.FindWindow("Shell_TrayWnd", None) 283 | if taskbar_hwnd: 284 | taskbar_rect = win32gui.GetWindowRect(taskbar_hwnd) 285 | taskbar_height = taskbar_rect[3] - taskbar_rect[1] 286 | else: 287 | # If we can't find the taskbar, assume a default height 288 | 
taskbar_height = 40 289 | 290 | # Calculate the position 291 | # We'll position it slightly to the left of the very corner to avoid overlapping with any system tray icons 292 | offset_from_right = 100 # Adjust this value as needed 293 | self.target_x = screen_width - self.screen.get_width() - offset_from_right 294 | self.target_y = screen_height - self.screen.get_height() - taskbar_height 295 | 296 | def update_position(self): 297 | # Calculate the distance between the current position and the target position 298 | dx = self.target_x - self.current_x 299 | dy = self.target_y - self.current_y 300 | 301 | # Calculate the length of the distance vector 302 | distance = (dx ** 2 + dy ** 2) ** 0.5 303 | 304 | # If the distance is very small, just snap to the target position 305 | if distance < self.move_speed: 306 | self.current_x = self.target_x 307 | self.current_y = self.target_y 308 | self.flying = False 309 | else: 310 | # Move the pet by a fraction of the distance each frame 311 | fraction = self.move_speed / distance 312 | self.current_x += dx * fraction 313 | self.current_y += dy * fraction 314 | self.flying = True 315 | if self.target_x > self.current_x: 316 | self.right = True 317 | else: 318 | self.right = False 319 | 320 | # Set new window position 321 | win32gui.SetWindowPos(self.hwnd, 0, 322 | int(self.current_x), 323 | int(self.current_y), 324 | 0, 0, win32con.SWP_NOSIZE | win32con.SWP_NOZORDER) 325 | 326 | 327 | 328 | 329 | def launch_webpage(self): 330 | url = "http://127.0.0.1:" + str(self.config.config['port']) 331 | webbrowser.open(url) 332 | 333 | 334 | def Update(self): 335 | #while running: 336 | for event in pygame.event.get(): 337 | if event.type == pygame.QUIT: 338 | self.running = False 339 | elif event.type == pygame.MOUSEBUTTONDOWN: 340 | if event.button == 1: # Left mouse button 341 | current_time = pygame.time.get_ticks() 342 | if current_time - self.last_click_time < self.double_click_threshold: 343 | self.launch_webpage() 344 | self.last_click_time = current_time 345 | 346 | 347 | 348 | self.wincheck+=1 349 | if self.wincheck > 300: 350 | self.wincheck = 0 351 | if(self.Sleeping): 352 | self.move_to_taskbar_clock() 353 | else: 354 | # Check for new focused window 355 | focused_window = self.get_focused_window() 356 | if focused_window and focused_window != self.current_window: 357 | self.current_window = focused_window 358 | self.move_to_window(self.current_window) 359 | self.last_window_info = self.get_window_info(self.current_window) 360 | 361 | # Check if current window has moved or resized 362 | if not self.Sleeping and self.current_window: 363 | current_window_info = self.get_window_info(self.current_window) 364 | if not self.flying and current_window_info != self.last_window_info: 365 | self.move_to_window(self.current_window) 366 | self.last_window_info = current_window_info 367 | 368 | # Update position for animation 369 | self.update_position() 370 | 371 | 372 | # Clear the screen 373 | self.screen.fill((0,0,0)) # Fill with the transparent color 374 | 375 | # Draw the current sprite 376 | temp_screen = pygame.Surface((64, 64), pygame.SRCALPHA) 377 | self.animation_controller.update() 378 | self.animation_controller.render(temp_screen) 379 | 380 | if self.right: 381 | temp_screen = pygame.transform.flip(temp_screen, True, False) 382 | 383 | scaled_screen = pygame.transform.scale(temp_screen, (64 * self.scale, 64 * self.scale)) 384 | 385 | self.screen.blit(scaled_screen, (0, 0)) 386 | # Update the display 387 | pygame.display.flip() 388 | 389 | self.SleepTimer
+= self.clock.get_time() 390 | self.clock.tick(60) 391 | 392 | def End(self): 393 | self.running = False 394 | print("crow end") 395 | pygame.quit() 396 | -------------------------------------------------------------------------------- /CrowAssistant.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from queue import Queue 4 | import pyaudio 5 | from CrowSTT import AudioToTextRecorder 6 | import random 7 | import string 8 | import time 9 | import keyboard 10 | import threading 11 | import Crow 12 | import CrowBrain 13 | import wave 14 | import numpy as np 15 | import re 16 | import sys 17 | import shlex 18 | import requests 19 | import ctypes 20 | import Volume 21 | import CrowConfig 22 | # import zipfile 23 | # from urllib.parse import urlparse 24 | # import shutil 25 | 26 | 27 | ##At some point I want to make it so it downloads the things it needs for TTS etc... but for now we'll do it manually 28 | # def download_and_extract(target_file, url): 29 | # # Determine the filename from the URL 30 | # parsed_url = urlparse(url) 31 | # download_filename = os.path.basename(parsed_url.path) or 'downloaded_file' 32 | 33 | # # If the target file exists, we'll still proceed with the download 34 | # # as we want to update all files in the zip 35 | # if os.path.exists(target_file): 36 | # print(f"Note: {target_file} already exists, but we'll proceed with the download and update.") 37 | 38 | # print(f"Downloading from {url}") 39 | # try: 40 | # response = requests.get(url, stream=True) 41 | # response.raise_for_status() # Raises an HTTPError for bad requests 42 | 43 | # # Save the downloaded content 44 | # with open(download_filename, 'wb') as file: 45 | # for chunk in response.iter_content(chunk_size=8192): 46 | # file.write(chunk) 47 | # print(f"File {download_filename} has been downloaded successfully.") 48 | 49 | # # Process the downloaded file 50 | # if download_filename.lower().endswith('.zip'): 51 | # print(f"Extracting all files from {download_filename}") 52 | # try: 53 | # with zipfile.ZipFile(download_filename, 'r') as zip_ref: 54 | # # Extract all contents, overwriting existing files 55 | # zip_ref.extractall(path=".", members=None) 56 | # print("All files have been extracted successfully.") 57 | 58 | # # Verify if the target file was part of the extracted files 59 | # if os.path.exists(target_file): 60 | # print(f"Successfully obtained {target_file}") 61 | # else: 62 | # print(f"Warning: {target_file} was not found in the extracted files.") 63 | # except zipfile.BadZipFile: 64 | # print("Error: The downloaded file is not a valid zip file.") 65 | # return 66 | # else: 67 | # # If it's not a zip, just rename it to the target file 68 | # os.replace(download_filename, target_file) 69 | # print(f"Downloaded file renamed to {target_file}") 70 | 71 | # except requests.RequestException as e: 72 | # print(f"Error downloading file: {e}") 73 | # finally: 74 | # # Clean up the downloaded zip file if it exists 75 | # if os.path.exists(download_filename) and download_filename != target_file: 76 | # os.remove(download_filename) 77 | # print(f"Cleaned up {download_filename}") 78 | 79 | 80 | 81 | 82 | 83 | def delete_wav_files(): 84 | # Get the directory of the current script 85 | script_dir = os.path.dirname(os.path.abspath(__file__)) 86 | 87 | # Construct the path to the 'wav' folder 88 | wav_folder = os.path.join(script_dir, 'wav') 89 | 90 | # Check if the 'wav' folder exists 91 | if not os.path.exists(wav_folder): 92 | print(f"The folder 
{wav_folder} does not exist.") 93 | return 94 | 95 | try: 96 | # Iterate over all files in the 'wav' folder 97 | for filename in os.listdir(wav_folder): 98 | file_path = os.path.join(wav_folder, filename) 99 | 100 | # Check if it's a file (not a subdirectory) 101 | if os.path.isfile(file_path): 102 | # Delete the file 103 | os.remove(file_path) 104 | print(f"Deleted: {filename}") 105 | 106 | print("All files in the 'wav' folder have been deleted.") 107 | 108 | except Exception as e: 109 | print(f"An error occurred: {e}") 110 | 111 | os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE' 112 | # Fetch the absolute path of the script 113 | script_path = os.path.abspath(__file__) 114 | 115 | # Extract the directory from the absolute path 116 | script_dir = os.path.dirname(script_path) 117 | os.chdir(script_dir) 118 | 119 | 120 | #download_and_extract('piper.exe', 'https://github.com/rhasspy/piper/releases/download/2023.11.14-2/piper_windows_amd64.zip') 121 | 122 | 123 | # Global queue for TTS files 124 | tts_queue = Queue() 125 | inputque = Queue() 126 | 127 | is_playing = False 128 | is_talking = False 129 | outputdev = 12 130 | 131 | # Initialize PyAudio 132 | audio = pyaudio.PyAudio() 133 | 134 | voicenum = 113# config.config['voice'] 135 | 136 | 137 | 138 | # Initialize CrowBrain 139 | brain = None# CrowBrain.Init() 140 | current_conversation_id = 1 141 | 142 | saidname = False 143 | 144 | def handle_completed_sentence(sentence): 145 | global current_conversation_id 146 | global brain 147 | global voicenum 148 | if not is_playing: 149 | #wintts(sentence, "en_US-libritts_r-medium.onnx -s " + str(voicenum)) 150 | #return 151 | # Generate response using CrowBrain 152 | response = brain.generate(sentence, current_conversation_id) 153 | if 'error' not in response: 154 | ai_response = response['content'] 155 | wintts(ai_response, "en_US-libritts_r-medium.onnx -s " + str(voicenum)) 156 | else: 157 | print(f"Error in AI response: {response['error']}") 158 | wintts(response['error'], "en_US-libritts_r-medium.onnx -s " + str(voicenum)) 159 | 160 | def test_voice(vid): 161 | global voicenum 162 | voicenum = vid 163 | wintts("This is my voice for number " + str(vid), "en_US-libritts_r-medium.onnx -s " + str(vid)) 164 | 165 | def contains_word(text, word): 166 | pattern = r'\b' + re.escape(word) + r'\b' 167 | return bool(re.search(pattern, text, re.IGNORECASE)) 168 | 169 | 170 | def wintts(text, model): 171 | global config 172 | global saidname 173 | global stopplayback 174 | stopplayback=False 175 | saidname = contains_word(text, config.config['name']) 176 | 177 | # Clean up the text 178 | text = re.sub(r"\'", "", text) 179 | text = re.sub(r"\*", "", text) 180 | text = text.strip() 181 | remove_chars = "&<>[]|^%:\"" 182 | text = "".join(char for char in text if char not in remove_chars) 183 | 184 | # Split the text into sentences or lines 185 | sentences = re.split(r'(?<=[.!?])\s+|\n', text) 186 | 187 | # Process each non-empty sentence 188 | for sentence in sentences: 189 | sentence = sentence.strip() 190 | if not sentence: 191 | continue 192 | 193 | print(sentence) 194 | 195 | # Generate a random filename 196 | random_filename = ''.join(random.choices(string.ascii_letters + string.digits, k=10)) + ".wav" 197 | random_filename = os.path.join("wav", random_filename) 198 | 199 | # Use shell escaping for the sentence to handle special characters 200 | safe_sentence = shlex.quote(sentence) 201 | 202 | command = f"echo {safe_sentence} | piper -m {model} -f {random_filename}" 203 | os.system(command) 204 | 205 | # 
Add the file to the queue 206 | tts_queue.put(random_filename) 207 | 208 | def play_and_delete_wav(): 209 | global stopplayback 210 | global Running 211 | global is_playing 212 | global is_talking 213 | global recorder 214 | global saidname 215 | while Running: 216 | if not tts_queue.empty(): 217 | filename = tts_queue.get() 218 | print(filename) 219 | if(not stopplayback): 220 | is_talking=True 221 | play_wav(filename) # Assuming play_wav is a function to play WAV files 222 | os.remove(filename) # Delete the WAV file after playing 223 | tts_queue.task_done() 224 | else: 225 | if is_talking: 226 | 227 | #print("Playing Done") 228 | recorder.interrupt_stop_event.set() 229 | recorder.stop() 230 | time.sleep(0.1) 231 | is_talking=False 232 | saidname=False 233 | time.sleep(0.1) # Sleep for a short duration to avoid busy waiting 234 | print("WAVE THREAD END") 235 | 236 | wf = None 237 | vol = 0 238 | 239 | stopplayback = False 240 | 241 | 242 | def callback(in_data, frame_count, time_info, status): 243 | global vol 244 | global wf 245 | global stopplayback 246 | if stopplayback: 247 | return (None, pyaudio.paComplete) 248 | # Read data from file 249 | data = wf.readframes(frame_count) 250 | d = np.frombuffer(data, dtype=np.int16) 251 | v = np.average(np.abs(d)) 252 | if (not np.isnan(v)): 253 | vol = v * .0001 254 | else: 255 | vol = 0 256 | 257 | return (data, pyaudio.paContinue) 258 | 259 | 260 | 261 | def play_wav(wavefile): 262 | #print("wavstart") 263 | athread = threading.Thread(target=wavethread, args=(wavefile,)) 264 | athread.start() 265 | while is_playing: 266 | time.sleep(0.01) 267 | #print("end of fun") 268 | 269 | def wavethread(wavefile): 270 | global wf # Make wf global so it can be accessed by callback 271 | global is_playing 272 | global outputdev 273 | is_playing = True 274 | # Open the wav file 275 | wf = wave.open(wavefile, 'rb') 276 | 277 | p = pyaudio.PyAudio() 278 | RATE = wf.getframerate() 279 | CHUNK = int(RATE / 10) 280 | 281 | stream = p.open(format=p.get_format_from_width(wf.getsampwidth()), 282 | channels=wf.getnchannels(), 283 | rate=RATE, 284 | output=True, 285 | output_device_index=outputdev, 286 | frames_per_buffer=CHUNK, 287 | stream_callback=callback) 288 | 289 | # Start the stream 290 | stream.start_stream() 291 | 292 | # Keep the script running while the audio is playing 293 | while stream.is_active(): 294 | time.sleep(0.1) 295 | 296 | # Stop stream 297 | stream.stop_stream() 298 | stream.close() 299 | 300 | # Close PyAudio and wave file 301 | wf.close() 302 | p.terminate() 303 | is_playing=False 304 | 305 | 306 | def process_text(text): 307 | global mute 308 | global Running 309 | global Sleeping 310 | print(text) 311 | if not mute and not Sleeping and Running: 312 | handle_completed_sentence(text) 313 | 314 | mute = False 315 | recorder = None 316 | listening = False 317 | 318 | def my_start_callback(): 319 | global is_talking 320 | global listening 321 | global vc 322 | global lastvolume 323 | print("Recording started!") 324 | if(not is_talking): 325 | listening=True 326 | lastvolume = vc.get_volume() 327 | print(lastvolume) 328 | vc.set_volume(25) 329 | 330 | def my_stop_callback(): 331 | global listening 332 | listening=False 333 | vc.set_volume(lastvolume) 334 | print("Recording stopped!") 335 | 336 | def tupdate(text): 337 | global stopplayback 338 | global brain 339 | global saidname 340 | global current_conversation_id 341 | global Sleeping 342 | global crow 343 | global is_talking 344 | global listening 345 | print(text) 346 | if Sleeping: 347 | 
if(contains_word(text.lower(),config.config['name'].lower())): 348 | print("wake") 349 | Sleeping = False 350 | crow.SleepTimer = 0 351 | else: 352 | crow.SleepTimer = 0 353 | if is_talking: 354 | if(contains_word(text.lower(),config.config['name'].lower()) and not saidname): 355 | #stop current playback 356 | print("INTERUPT") 357 | stopplayback=True 358 | brain.addSystemMessage("[Interupted]",current_conversation_id) 359 | return 360 | else: 361 | listening=True 362 | 363 | def transcriptstart(): 364 | global crow 365 | crow.SleepTimer = 0 366 | 367 | 368 | 369 | whisperprompt = "" 370 | 371 | 372 | 373 | def aibrains(): 374 | global Running 375 | global mute 376 | global recorder 377 | print("BRAINS") 378 | try: 379 | print('Listening ... (press Shift + ESC to exit)') 380 | 381 | while Running: 382 | recorder.text(process_text) 383 | 384 | except Exception as e: 385 | print(f'An error occurred in aibrains: {e}') 386 | 387 | finally: 388 | print("end brain") 389 | Running = False 390 | 391 | 392 | recorder = None 393 | Sleeping = True 394 | Running = True 395 | crow = None 396 | config = None 397 | 398 | vc = Volume.VolumeControl(outputdev) 399 | lastvolume = vc.get_volume() 400 | 401 | def main(): 402 | global crow 403 | global Running 404 | global mute 405 | global voicenum 406 | global vol 407 | global is_playing 408 | global listening 409 | global Sleeping 410 | global recorder 411 | global outputdev 412 | #list_output_devices() 413 | 414 | print("MAIN") 415 | 416 | print(lastvolume) 417 | try: 418 | 419 | while Running: 420 | #Sleeping=False 421 | 422 | #update crow visuals 423 | crow.listen = listening 424 | crow.Sleeping = Sleeping 425 | if(is_playing): 426 | crow.volume = vol 427 | crow.SleepTimer= 0 428 | crow.Update() 429 | 430 | 431 | if(listening): 432 | crow.SleepTimer= 0 433 | 434 | if(not Sleeping and crow.SleepTimer>15000): 435 | print("Sleep Mode") 436 | Sleeping=True 437 | crow.SleepTimer = 0 438 | 439 | if keyboard.is_pressed('Esc') and keyboard.is_pressed('Shift'): 440 | print("Escape key pressed. Exiting loop.") 441 | break 442 | if(not crow.running): 443 | break 444 | 445 | except Exception as e: 446 | print(f'An error occurred in the main loop: {e}') 447 | finally: 448 | Running = False 449 | shutdown() 450 | 451 | def shutdown(): 452 | global Running, recorder, brain, crow 453 | print("Initiating shutdown...") 454 | 455 | # Stop the main loop 456 | Running = False 457 | 458 | # Stop the recorder 459 | if recorder: 460 | print("Shutting down recorder...") 461 | recorder.abort() 462 | recorder.shutdown() 463 | 464 | 465 | # Stop the brain server 466 | if brain and brain.server_thread: 467 | print("Shutting down brain server...") 468 | brain.app.config['TESTING'] = True # This should make the server more responsive to shutdown 469 | requests.get('http://localhost:5000/shutdown') # Assuming you add a /shutdown route 470 | brain.server_thread.join(timeout=5) 471 | 472 | # Stop Crow 473 | if crow: 474 | print("Shutting down Crow...") 475 | crow.End() 476 | 477 | # Close PyAudio 478 | #if 'audio' in globals(): 479 | print("Closing PyAudio...") 480 | audio.terminate() 481 | 482 | print("Forcing termination of remaining threads...") 483 | for thread in [brainthread, playback_thread]: 484 | if thread and thread.is_alive(): 485 | force_thread_termination(thread) 486 | 487 | print("Shutdown complete. 
Exiting...") 488 | os._exit(0) # Force exit the Python process 489 | 490 | print("Shutdown complete.") 491 | 492 | def force_thread_termination(thread): 493 | if thread.is_alive(): 494 | print(f"Force terminating thread: {thread.name}") 495 | tid = thread.ident 496 | if tid is not None: 497 | res = ctypes.pythonapi.PyThreadState_SetAsyncExc(ctypes.c_long(tid), ctypes.py_object(SystemExit)) 498 | if res > 1: 499 | ctypes.pythonapi.PyThreadState_SetAsyncExc(ctypes.c_long(tid), None) 500 | print("Exception raise failed") 501 | 502 | 503 | def name_to_index(device_name, is_input): 504 | p = pyaudio.PyAudio() 505 | for i in range(p.get_device_count()): 506 | device_info = p.get_device_info_by_index(i) 507 | if device_info['name'] == device_name: 508 | if (is_input and device_info['maxInputChannels'] > 0) or \ 509 | (not is_input and device_info['maxOutputChannels'] > 0): 510 | p.terminate() 511 | return i 512 | p.terminate() 513 | return None # Device not found 514 | 515 | def get_audio_output_devices(): 516 | """ 517 | Prints a list of audio output devices and their device indices on Windows. 518 | """ 519 | p = pyaudio.PyAudio() 520 | 521 | print("Audio Output Devices:") 522 | for i in range(p.get_device_count()): 523 | device_info = p.get_device_info_by_index(i) 524 | if device_info["maxOutputChannels"] > 0: 525 | print(f"{i}: {device_info['name']}") 526 | 527 | p.terminate() 528 | 529 | if __name__ == '__main__': 530 | 531 | get_audio_output_devices() 532 | delete_wav_files() 533 | config = CrowConfig.config() 534 | ainame = config.config['name'] 535 | whisperprompt = "Talking to " + ainame 536 | voicenum = config.config['voice'] 537 | micnum = name_to_index( config.config['mic'],True) 538 | outputdev = name_to_index( config.config['speaker'],False) 539 | print("Output Device: " + str(outputdev)) 540 | if(micnum is None): 541 | micnum=0 542 | print("Mic Not Set") 543 | 544 | recorder_config = { 545 | 'input_device_index': micnum, 546 | 'spinner': False, 547 | 'model': 'base.en', 548 | 'language': 'en', 549 | 'silero_sensitivity': 0.4, 550 | 'silero_use_onnx': True, 551 | 'webrtc_sensitivity': 2, 552 | 'device':'cuda', 553 | 'post_speech_silence_duration': 1.0, 554 | 'min_length_of_recording': 0, 555 | 'min_gap_between_recordings': 0, 556 | 'enable_realtime_transcription': True, 557 | 'realtime_processing_pause': 0.2, 558 | 'realtime_model_type': 'tiny.en', 559 | 'on_recording_start': my_start_callback, 560 | 'on_recording_stop': my_stop_callback, 561 | 'on_transcription_start': transcriptstart, 562 | 'on_realtime_transcription_update': tupdate, 563 | #'on_vad_detect_start': vadstart, 564 | #'on_vad_detect_stop': vadstop, 565 | #'on_realtime_transcription_stabilized': process_text, 566 | 'initial_prompt':whisperprompt, 567 | } 568 | recorder = AudioToTextRecorder(**recorder_config) 569 | brainthread = threading.Thread(target=aibrains) 570 | brainthread.start() 571 | time.sleep(1) 572 | playback_thread = threading.Thread(target=play_and_delete_wav) 573 | playback_thread.daemon = True # Daemonize thread 574 | playback_thread.start() 575 | brain = CrowBrain.Init() 576 | brain.config = config 577 | brain.set_test_voice_callback(test_voice) 578 | crow = Crow.Init() 579 | wintts("Crow is Online", "en_US-libritts_r-medium.onnx -s " + str(voicenum)) 580 | main() 581 | print("END") -------------------------------------------------------------------------------- /CrowAssistant.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python ; coding: utf-8
-*- 2 | 3 | block_cipher = None 4 | 5 | a = Analysis( 6 | ['CrowAssistant.py'], 7 | pathex=[], 8 | binaries=[], 9 | datas=[], 10 | hiddenimports=['tiktoken_ext.openai_public', 'tiktoken_ext'], 11 | hookspath=[], 12 | hooksconfig={}, 13 | runtime_hooks=[], 14 | excludes=[], 15 | noarchive=False, 16 | optimize=0, 17 | win_no_prefer_redirects=False, 18 | win_private_assemblies=False, 19 | cipher=block_cipher 20 | ) 21 | pyz = PYZ(a.pure, a.zipped_data, 22 | cipher=block_cipher) 23 | 24 | exe = EXE( 25 | pyz, 26 | a.scripts, 27 | [], 28 | exclude_binaries=True, 29 | name='CrowAssistant', 30 | debug=False, 31 | bootloader_ignore_signals=False, 32 | strip=False, 33 | upx=True, 34 | console=True, 35 | disable_windowed_traceback=False, 36 | argv_emulation=False, 37 | target_arch=None, 38 | codesign_identity=None, 39 | entitlements_file=None, 40 | icon=['crow.ico'], 41 | ) 42 | coll = COLLECT( 43 | exe, 44 | a.binaries, 45 | a.datas, 46 | strip=False, 47 | upx=True, 48 | upx_exclude=[], 49 | name='CrowAssistant', 50 | ) 51 | -------------------------------------------------------------------------------- /CrowBrain.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from datetime import datetime 4 | from flask import Flask, request, jsonify, render_template 5 | from flask_sqlalchemy import SQLAlchemy 6 | from sqlalchemy import desc 7 | from openai import OpenAI 8 | import threading 9 | import tiktoken 10 | import CrowConfig 11 | import pyaudio 12 | 13 | class CrowBrain: 14 | _instance = None 15 | 16 | @classmethod 17 | def get_instance(cls): 18 | if cls._instance is None: 19 | cls._instance = cls() 20 | return cls._instance 21 | 22 | 23 | def set_test_voice_callback(self, callback): 24 | """Set the callback function for testing voice.""" 25 | self.test_voice_callback = callback 26 | 27 | def test_voice(self, voice_id): 28 | if self.test_voice_callback is None: 29 | print("Test voice callback not set") 30 | return 31 | 32 | # Call the callback function 33 | self.test_voice_callback(voice_id) 34 | 35 | def count_tokens(self, messages): 36 | """Count the number of tokens in a list of messages.""" 37 | num_tokens = 0 38 | for message in messages: 39 | num_tokens += 4 # Every message follows {role/name}\n{content}\n 40 | for key, value in message.items(): 41 | num_tokens += len(self.encoding.encode(value)) 42 | if key == "name": # If there's a name, the role is omitted 43 | num_tokens -= 1 # Role is always required and always 1 token 44 | num_tokens += 2 # Every reply is primed with assistant 45 | return num_tokens 46 | 47 | def trim_messages(self, messages, max_tokens): 48 | """Trim the messages to fit within max_tokens.""" 49 | while self.count_tokens(messages) > max_tokens: 50 | # Remove the second message (keeping the first system message) 51 | if len(messages) > 1: 52 | messages.pop(1) 53 | else: 54 | # If we're down to one message and still over the limit, truncate it 55 | content = messages[0]['content'] 56 | messages[0]['content'] = self.encoding.decode(self.encoding.encode(content)[:max_tokens]) 57 | break 58 | return messages 59 | 60 | def __init__(self): 61 | if CrowBrain._instance is not None: 62 | raise Exception("This class is a singleton!") 63 | else: 64 | CrowBrain._instance = self 65 | 66 | self.config = CrowConfig.config() 67 | #self.name = "Crow" 68 | self.thecrow = None 69 | #self.max_tokens = 32000 # Maximum context length for mixtral-8x7b-32768 70 | self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") # This 
works for most models 71 | #self.max_messages = 100 # Maximum number of messages to retrieve from the database 72 | 73 | # Get the directory of the current script 74 | base_dir = os.path.abspath(os.path.dirname(__file__)) 75 | 76 | # Create the path for your database file 77 | db_path = os.path.join(base_dir, 'conversations.db') 78 | 79 | self.app = Flask(__name__) 80 | self.app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{db_path}' 81 | self.app.config['TEMPLATES_AUTO_RELOAD'] = True # Enable template auto-reloading 82 | self.db = SQLAlchemy(self.app) 83 | 84 | self.client = OpenAI(api_key=self.config.config['api_key'], base_url=self.config.config['url']) 85 | 86 | class Conversation(self.db.Model): 87 | id = self.db.Column(self.db.Integer, primary_key=True) 88 | name = self.db.Column(self.db.String(100)) 89 | created_at = self.db.Column(self.db.DateTime, default=datetime.utcnow) 90 | 91 | class Message(self.db.Model): 92 | id = self.db.Column(self.db.Integer, primary_key=True) 93 | conversation_id = self.db.Column(self.db.Integer, self.db.ForeignKey('conversation.id')) 94 | role = self.db.Column(self.db.String(50)) 95 | content = self.db.Column(self.db.Text) 96 | timestamp = self.db.Column(self.db.DateTime, default=datetime.utcnow) 97 | conversation = self.db.relationship('Conversation', backref=self.db.backref('messages', lazy=True)) 98 | 99 | self.Conversation = Conversation 100 | self.Message = Message 101 | 102 | with self.app.app_context(): 103 | self.db.create_all() 104 | 105 | self.setup_routes() 106 | self.server_thread = None 107 | 108 | def run_server(self, host='0.0.0.0', port=5000): 109 | self.app.run(host=host, port=port, debug=False, use_reloader=False) 110 | 111 | def start_server_thread(self): 112 | if self.server_thread is None or not self.server_thread.is_alive(): 113 | self.server_thread = threading.Thread(target=self.run_server) 114 | self.server_thread.start() 115 | 116 | 117 | 118 | def load_config(self): 119 | self.config.load_config() 120 | 121 | def save_config(self): 122 | self.config.save_config() 123 | 124 | 125 | def new_conversation(self): 126 | with self.app.app_context(): 127 | conversation = self.Conversation(name=f"Conversation {self.Conversation.query.count() + 1}") 128 | self.db.session.add(conversation) 129 | self.db.session.commit() 130 | return {"conversation_id": conversation.id, "conversation_name": conversation.name} 131 | 132 | def get_conversations(self): 133 | with self.app.app_context(): 134 | conversations = self.Conversation.query.all() 135 | return {"conversations": [{"id": c.id, "name": c.name} for c in conversations]} 136 | 137 | def select_conversation(self, conversation_id): 138 | with self.app.app_context(): 139 | messages = self.Message.query.filter_by(conversation_id=conversation_id).order_by(self.Message.timestamp).all() 140 | return {"conversation_log": [{"role": m.role, "content": m.content} for m in messages]} 141 | 142 | def addSystemMessage(self, input_text, conversation_id): 143 | def add_message(): 144 | user_message = self.Message(conversation_id=conversation_id, role="system", content=input_text) 145 | self.db.session.add(user_message) 146 | self.db.session.commit() 147 | 148 | with self.app.app_context(): 149 | add_message() 150 | 151 | def generate(self, input_text, conversation_id): 152 | with self.app.app_context(): 153 | ainame = self.config.config['name'] 154 | personality = self.config.config['personality'] 155 | self.system_message = {"role": "system", "content": "You are a semi-sentient AI named "+ ainame +". 
"+personality+" You hear and talk with speech to text and text to speech, don't use descriptions of what you are doing. Be concise and direct in your responses."} 156 | 157 | conversation = self.Conversation.query.get(conversation_id) 158 | if not conversation: 159 | # Create a new conversation 160 | print("create new conversation") 161 | conversation = self.Conversation(name=f"Conversation {self.Conversation.query.count() + 1}") 162 | self.db.session.add(conversation) 163 | self.db.session.commit() 164 | conversation_id = conversation.id 165 | 166 | user_message = self.Message(conversation_id=conversation_id, role="user", content=input_text) 167 | self.db.session.add(user_message) 168 | self.db.session.commit() 169 | 170 | 171 | # Retrieve the most recent messages, including the new one 172 | recent_messages = self.Message.query.filter_by(conversation_id=conversation_id) \ 173 | .order_by(desc(self.Message.timestamp)) \ 174 | .limit(self.config.config['maxmsg']) \ 175 | .all() 176 | 177 | # Reverse the order to get chronological order 178 | recent_messages = recent_messages[::-1] 179 | 180 | messages_for_api = [{"role": m.role, "content": m.content} for m in recent_messages] 181 | 182 | # Calculate tokens for system message 183 | system_message_tokens = self.count_tokens([self.system_message]) 184 | 185 | # Trim messages to fit within token limit, leaving room for system message 186 | trimmed_messages = self.trim_messages(messages_for_api, self.config.config['maxtoken'] - system_message_tokens) 187 | 188 | # Add the system message at the beginning after trimming 189 | final_messages = [self.system_message] + trimmed_messages 190 | 191 | try: 192 | response = self.client.chat.completions.create( 193 | model=self.config.config['model'], 194 | messages=final_messages 195 | ) 196 | ai_message_content = response.choices[0].message.content 197 | except Exception as e: 198 | print(e) 199 | return {"error": str(e)} 200 | 201 | ai_message = self.Message(conversation_id=conversation_id, role="assistant", content=ai_message_content) 202 | self.db.session.add(ai_message) 203 | self.db.session.commit() 204 | return {"role": "assistant", "content": ai_message_content} 205 | 206 | def delete_conversation(self, conversation_id): 207 | with self.app.app_context(): 208 | conversation = self.Conversation.query.get(conversation_id) 209 | if conversation: 210 | self.Message.query.filter_by(conversation_id=conversation_id).delete() 211 | self.db.session.delete(conversation) 212 | self.db.session.commit() 213 | return {"status": "success"} 214 | else: 215 | return {"status": "error", "message": "Conversation not found"} 216 | 217 | def list_audio_input_names(self): 218 | p = pyaudio.PyAudio() 219 | input_devices = [] 220 | for i in range(p.get_device_count()): 221 | device_info = p.get_device_info_by_index(i) 222 | if device_info['maxInputChannels'] > 0: 223 | input_devices.append(device_info['name']) 224 | p.terminate() 225 | return input_devices 226 | 227 | def list_audio_output_names(self): 228 | p = pyaudio.PyAudio() 229 | output_devices = [] 230 | for i in range(p.get_device_count()): 231 | device_info = p.get_device_info_by_index(i) 232 | if device_info['maxOutputChannels'] > 0: 233 | output_devices.append(device_info['name']) 234 | p.terminate() 235 | return output_devices 236 | 237 | def setup_routes(self): 238 | @self.app.route('/') 239 | def index(): 240 | return render_template('index.html') 241 | 242 | @self.app.route('/new_conversation', methods=['POST']) 243 | def new_conversation_route(): 244 | 
return jsonify(self.new_conversation()) 245 | 246 | @self.app.route('/get_conversations', methods=['GET']) 247 | def get_conversations_route(): 248 | return jsonify(self.get_conversations()) 249 | 250 | @self.app.route('/select_conversation', methods=['GET']) 251 | def select_conversation_route(): 252 | conversation_id = request.args.get('conversation_id') 253 | return jsonify(self.select_conversation(conversation_id)) 254 | 255 | @self.app.route('/generate', methods=['POST']) 256 | def generate_route(): 257 | input_text = request.form['input_text'] 258 | conversation_id = request.form['conversation_id'] 259 | return jsonify(self.generate(input_text, conversation_id)) 260 | 261 | @self.app.route('/delete_conversation', methods=['POST']) 262 | def delete_conversation_route(): 263 | conversation_id = request.form['conversation_id'] 264 | return jsonify(self.delete_conversation(conversation_id)) 265 | 266 | @self.app.route('/shutdown', methods=['GET']) 267 | def shutdown(): 268 | func = request.environ.get('werkzeug.server.shutdown') 269 | if func is None: 270 | raise RuntimeError('Not running with the Werkzeug Server') 271 | func() 272 | return 'Server shutting down...' 273 | 274 | @self.app.route('/settings', methods=['GET', 'POST']) 275 | def settings_route(): 276 | if request.method == 'POST': 277 | # Update config with form data 278 | self.config.config['name'] = request.form['name'] 279 | self.config.config['personality'] = request.form['personality'] 280 | self.config.config['voice'] = int(request.form['voice']) 281 | self.config.config['url'] = request.form['url'] 282 | self.config.config['api_key'] = request.form['api_key'] 283 | self.config.config['model'] = request.form['model'] 284 | self.config.config['scale'] = int(request.form['scale']) 285 | self.config.config['mic'] = request.form['mic'] 286 | self.config.config['speaker'] = request.form['speaker'] 287 | self.config.config['maxtoken'] = int(request.form['maxtoken']) 288 | self.config.config['maxmsg'] = int(request.form['maxmsg']) 289 | self.save_config() 290 | return jsonify({"status": "success"}) 291 | else: 292 | input_devices = self.list_audio_input_names() 293 | output_devices = self.list_audio_output_names() 294 | return render_template('settings.html', config=self.config.config, input_devices=input_devices, output_devices=output_devices) 295 | 296 | @self.app.route('/test_voice', methods=['POST']) 297 | def test_voice_route(): 298 | voice_id = int(request.form['voice']) 299 | self.test_voice(voice_id) 300 | return jsonify({"status": "success"}) 301 | 302 | 303 | 304 | 305 | def Init(): 306 | brain = CrowBrain.get_instance() 307 | brain.start_server_thread() 308 | return CrowBrain.get_instance() 309 | -------------------------------------------------------------------------------- /CrowConfig.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | class config: 5 | 6 | def __init__(self): 7 | self.config = { 8 | "name": "Crow", 9 | "personality": "", 10 | "voice": 1, 11 | "url": "https://api.groq.com/openai/v1", 12 | "api_key": os.environ.get("API_KEY", ""), 13 | "model": "mixtral-8x7b-32768", 14 | "mic":"default", 15 | "speaker":"default", 16 | "scale":3, 17 | "port":5000, 18 | "maxtoken":32000, 19 | "maxmsg":100, 20 | } 21 | self.CONFIG_FILE = 'config.json' 22 | self.load_config() 23 | 24 | def load_config(self): 25 | print("Loading Config") 26 | if os.path.exists(self.CONFIG_FILE): 27 | with open(self.CONFIG_FILE, 'r') as f: 28 | self.config = 
json.load(f) 29 | else: 30 | print("no config") 31 | #we need to launch the settings window 32 | 33 | def save_config(self): 34 | with open(self.CONFIG_FILE, 'w') as f: 35 | json.dump(self.config, f, indent=2) 36 | 37 | 38 | -------------------------------------------------------------------------------- /CrowSTT.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, List, Optional, Union 2 | import torch.multiprocessing as mp 3 | import torch 4 | from typing import List, Union 5 | from ctypes import c_bool 6 | from scipy.signal import resample 7 | from scipy import signal 8 | import faster_whisper 9 | import collections 10 | import numpy as np 11 | import traceback 12 | import threading 13 | import webrtcvad 14 | # import itertools 15 | import platform 16 | import pyaudio 17 | import logging 18 | import struct 19 | import halo 20 | import time 21 | import copy 22 | import os 23 | import re 24 | import gc 25 | 26 | # Set OpenMP runtime duplicate library handling to OK (Use only for development!) 27 | #os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE' 28 | 29 | INIT_MODEL_TRANSCRIPTION = "tiny" 30 | INIT_MODEL_TRANSCRIPTION_REALTIME = "tiny" 31 | INIT_REALTIME_PROCESSING_PAUSE = 0.2 32 | INIT_SILERO_SENSITIVITY = 0.4 33 | INIT_WEBRTC_SENSITIVITY = 3 34 | INIT_POST_SPEECH_SILENCE_DURATION = 0.6 35 | INIT_MIN_LENGTH_OF_RECORDING = 0.5 36 | INIT_MIN_GAP_BETWEEN_RECORDINGS = 0 37 | INIT_WAKE_WORDS_SENSITIVITY = 0.6 38 | INIT_PRE_RECORDING_BUFFER_DURATION = 1.0 39 | INIT_WAKE_WORD_ACTIVATION_DELAY = 0.0 40 | INIT_WAKE_WORD_TIMEOUT = 5.0 41 | INIT_WAKE_WORD_BUFFER_DURATION = 0.1 42 | ALLOWED_LATENCY_LIMIT = 10 43 | 44 | TIME_SLEEP = 0.02 45 | SAMPLE_RATE = 16000 46 | BUFFER_SIZE = 512 47 | INT16_MAX_ABS_VALUE = 32768.0 48 | 49 | INIT_HANDLE_BUFFER_OVERFLOW = False 50 | if platform.system() != 'Darwin': 51 | INIT_HANDLE_BUFFER_OVERFLOW = True 52 | 53 | 54 | class AudioToTextRecorder: 55 | _instance = None 56 | 57 | @classmethod 58 | def get_instance(cls): 59 | if cls._instance is None: 60 | cls._instance = cls() 61 | return cls._instance 62 | 63 | 64 | def __init__(self, 65 | model: str = INIT_MODEL_TRANSCRIPTION, 66 | language: str = "", 67 | compute_type: str = "default", 68 | input_device_index: int = None, 69 | gpu_device_index: Union[int, List[int]] = 0, 70 | device: str = "cuda", 71 | on_recording_start=None, 72 | on_recording_stop=None, 73 | on_transcription_start=None, 74 | ensure_sentence_starting_uppercase=True, 75 | ensure_sentence_ends_with_period=True, 76 | use_microphone=True, 77 | spinner=True, 78 | level=logging.WARNING, 79 | 80 | # Realtime transcription parameters 81 | enable_realtime_transcription=False, 82 | realtime_model_type=INIT_MODEL_TRANSCRIPTION_REALTIME, 83 | realtime_processing_pause=INIT_REALTIME_PROCESSING_PAUSE, 84 | on_realtime_transcription_update=None, 85 | on_realtime_transcription_stabilized=None, 86 | 87 | # Voice activation parameters 88 | silero_sensitivity: float = INIT_SILERO_SENSITIVITY, 89 | silero_use_onnx: bool = False, 90 | webrtc_sensitivity: int = INIT_WEBRTC_SENSITIVITY, 91 | post_speech_silence_duration: float = ( 92 | INIT_POST_SPEECH_SILENCE_DURATION 93 | ), 94 | min_length_of_recording: float = ( 95 | INIT_MIN_LENGTH_OF_RECORDING 96 | ), 97 | min_gap_between_recordings: float = ( 98 | INIT_MIN_GAP_BETWEEN_RECORDINGS 99 | ), 100 | pre_recording_buffer_duration: float = ( 101 | INIT_PRE_RECORDING_BUFFER_DURATION 102 | ), 103 | on_vad_detect_start=None, 104 | on_vad_detect_stop=None, 105 | 
106 | on_recorded_chunk=None, 107 | debug_mode=False, 108 | handle_buffer_overflow: bool = INIT_HANDLE_BUFFER_OVERFLOW, 109 | beam_size: int = 5, 110 | beam_size_realtime: int = 3, 111 | buffer_size: int = BUFFER_SIZE, 112 | sample_rate: int = SAMPLE_RATE, 113 | initial_prompt: Optional[Union[str, Iterable[int]]] = None, 114 | suppress_tokens: Optional[List[int]] = [-1], 115 | ): 116 | print("STARTING STT") 117 | mp.freeze_support() 118 | if AudioToTextRecorder._instance is not None: 119 | raise Exception("This class is a singleton!") 120 | else: 121 | AudioToTextRecorder._instance = self 122 | 123 | self.language = language 124 | self.compute_type = compute_type 125 | self.input_device_index = input_device_index 126 | self.gpu_device_index = gpu_device_index 127 | self.device = device 128 | # self.wake_words = wake_words 129 | # self.wake_word_activation_delay = wake_word_activation_delay 130 | # self.wake_word_timeout = wake_word_timeout 131 | # self.wake_word_buffer_duration = wake_word_buffer_duration 132 | self.ensure_sentence_starting_uppercase = ( 133 | ensure_sentence_starting_uppercase 134 | ) 135 | self.ensure_sentence_ends_with_period = ( 136 | ensure_sentence_ends_with_period 137 | ) 138 | self.use_microphone = mp.Value(c_bool, use_microphone) 139 | self.min_gap_between_recordings = min_gap_between_recordings 140 | self.min_length_of_recording = min_length_of_recording 141 | self.pre_recording_buffer_duration = pre_recording_buffer_duration 142 | self.post_speech_silence_duration = post_speech_silence_duration 143 | self.on_recording_start = on_recording_start 144 | self.on_recording_stop = on_recording_stop 145 | # self.on_wakeword_detected = on_wakeword_detected 146 | # self.on_wakeword_timeout = on_wakeword_timeout 147 | self.on_vad_detect_start = on_vad_detect_start 148 | self.on_vad_detect_stop = on_vad_detect_stop 149 | # self.on_wakeword_detection_start = on_wakeword_detection_start 150 | # self.on_wakeword_detection_end = on_wakeword_detection_end 151 | self.on_recorded_chunk = on_recorded_chunk 152 | self.on_transcription_start = on_transcription_start 153 | self.enable_realtime_transcription = enable_realtime_transcription 154 | self.realtime_model_type = realtime_model_type 155 | self.realtime_processing_pause = realtime_processing_pause 156 | self.on_realtime_transcription_update = ( 157 | on_realtime_transcription_update 158 | ) 159 | self.on_realtime_transcription_stabilized = ( 160 | on_realtime_transcription_stabilized 161 | ) 162 | self.debug_mode = debug_mode 163 | self.handle_buffer_overflow = handle_buffer_overflow 164 | self.beam_size = beam_size 165 | self.beam_size_realtime = beam_size_realtime 166 | self.allowed_latency_limit = ALLOWED_LATENCY_LIMIT 167 | 168 | self.level = level 169 | self.audio_queue = mp.Queue() 170 | self.buffer_size = buffer_size 171 | self.sample_rate = sample_rate 172 | self.recording_start_time = 0 173 | self.recording_stop_time = 0 174 | self.wake_word_detect_time = 0 175 | self.silero_check_time = 0 176 | self.silero_working = False 177 | self.speech_end_silence_start = 0 178 | self.silero_sensitivity = silero_sensitivity 179 | self.listen_start = 0 180 | self.spinner = spinner 181 | self.halo = None 182 | self.state = "inactive" 183 | self.wakeword_detected = False 184 | self.text_storage = [] 185 | self.realtime_stabilized_text = "" 186 | self.realtime_stabilized_safetext = "" 187 | self.is_webrtc_speech_active = False 188 | self.is_silero_speech_active = False 189 | self.recording_thread = None 190 | self.realtime_thread = 
None 191 | self.audio_interface = None 192 | self.audio = None 193 | self.stream = None 194 | self.start_recording_event = threading.Event() 195 | self.stop_recording_event = threading.Event() 196 | self.last_transcription_bytes = None 197 | self.initial_prompt = initial_prompt 198 | self.suppress_tokens = suppress_tokens 199 | # self.use_wake_words = wake_words or wakeword_backend in {'oww', 'openwakeword', 'openwakewords'} 200 | 201 | # Initialize the logging configuration with the specified level 202 | log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s' 203 | 204 | # Create a logger 205 | logger = logging.getLogger() 206 | logger.setLevel(level) # Set the root logger's level 207 | 208 | # Create a file handler and set its level 209 | file_handler = logging.FileHandler('realtimesst.log') 210 | file_handler.setLevel(logging.DEBUG) 211 | file_handler.setFormatter(logging.Formatter(log_format)) 212 | 213 | # Create a console handler and set its level 214 | console_handler = logging.StreamHandler() 215 | console_handler.setLevel(level) 216 | console_handler.setFormatter(logging.Formatter(log_format)) 217 | 218 | # Add the handlers to the logger 219 | logger.addHandler(file_handler) 220 | logger.addHandler(console_handler) 221 | 222 | self.is_shut_down = False 223 | self.shutdown_event = mp.Event() 224 | 225 | try: 226 | logging.debug("Explicitly setting the multiprocessing start method to 'spawn'") 227 | mp.set_start_method('spawn') 228 | except RuntimeError as e: 229 | logging.debug(f"Start method has already been set. Details: {e}") 230 | 231 | logging.info("Starting RealTimeSTT") 232 | 233 | self.interrupt_stop_event = mp.Event() 234 | self.was_interrupted = mp.Event() 235 | self.main_transcription_ready_event = mp.Event() 236 | self.parent_transcription_pipe, child_transcription_pipe = mp.Pipe() 237 | 238 | # Set device for model 239 | self.device = "cuda" if self.device == "cuda" and torch.cuda.is_available() else "cpu" 240 | 241 | self.transcript_process = self._start_thread( 242 | target=AudioToTextRecorder._transcription_worker, 243 | args=( 244 | child_transcription_pipe, 245 | model, 246 | self.compute_type, 247 | self.gpu_device_index, 248 | self.device, 249 | self.main_transcription_ready_event, 250 | self.shutdown_event, 251 | self.interrupt_stop_event, 252 | self.beam_size, 253 | self.initial_prompt, 254 | self.suppress_tokens 255 | ) 256 | ) 257 | 258 | # Start audio data reading process 259 | if self.use_microphone.value: 260 | logging.info("Initializing audio recording" 261 | " (creating pyAudio input stream," 262 | f" sample rate: {self.sample_rate}" 263 | f" buffer size: {self.buffer_size}" 264 | ) 265 | self.reader_process = self._start_thread( 266 | target=AudioToTextRecorder._audio_data_worker, 267 | args=( 268 | self.audio_queue, 269 | self.sample_rate, 270 | self.buffer_size, 271 | self.input_device_index, 272 | self.shutdown_event, 273 | self.interrupt_stop_event, 274 | self.use_microphone 275 | ) 276 | ) 277 | 278 | # Initialize the realtime transcription model 279 | if self.enable_realtime_transcription: 280 | try: 281 | logging.info("Initializing faster_whisper realtime " 282 | f"transcription model {self.realtime_model_type}" 283 | ) 284 | self.realtime_model_type = faster_whisper.WhisperModel( 285 | model_size_or_path=self.realtime_model_type, 286 | device=self.device, 287 | compute_type=self.compute_type, 288 | device_index=self.gpu_device_index 289 | ) 290 | 291 | except Exception as e: 292 | logging.exception("Error initializing faster_whisper 
" 293 | f"realtime transcription model: {e}" 294 | ) 295 | raise 296 | 297 | logging.debug("Faster_whisper realtime speech to text " 298 | "transcription model initialized successfully") 299 | 300 | 301 | # Setup voice activity detection model WebRTC 302 | try: 303 | logging.info("Initializing WebRTC voice with " 304 | f"Sensitivity {webrtc_sensitivity}" 305 | ) 306 | self.webrtc_vad_model = webrtcvad.Vad() 307 | self.webrtc_vad_model.set_mode(webrtc_sensitivity) 308 | 309 | except Exception as e: 310 | logging.exception("Error initializing WebRTC voice " 311 | f"activity detection engine: {e}" 312 | ) 313 | raise 314 | 315 | logging.debug("WebRTC VAD voice activity detection " 316 | "engine initialized successfully" 317 | ) 318 | 319 | # Setup voice activity detection model Silero VAD 320 | try: 321 | self.silero_vad_model, _ = torch.hub.load( 322 | repo_or_dir="snakers4/silero-vad", 323 | model="silero_vad", 324 | verbose=False, 325 | onnx=silero_use_onnx 326 | ) 327 | 328 | except Exception as e: 329 | logging.exception(f"Error initializing Silero VAD " 330 | f"voice activity detection engine: {e}" 331 | ) 332 | raise 333 | 334 | logging.debug("Silero VAD voice activity detection " 335 | "engine initialized successfully" 336 | ) 337 | 338 | self.audio_buffer = collections.deque( 339 | maxlen=int((self.sample_rate // self.buffer_size) * 340 | self.pre_recording_buffer_duration) 341 | ) 342 | self.frames = [] 343 | 344 | # Recording control flags 345 | self.is_recording = False 346 | self.is_running = True 347 | self.start_recording_on_voice_activity = False 348 | self.stop_recording_on_voice_deactivity = False 349 | 350 | # Start the recording worker thread 351 | self.recording_thread = threading.Thread(target=self._recording_worker) 352 | self.recording_thread.daemon = True 353 | self.recording_thread.start() 354 | 355 | # Start the realtime transcription worker thread 356 | self.realtime_thread = threading.Thread(target=self._realtime_worker) 357 | self.realtime_thread.daemon = True 358 | self.realtime_thread.start() 359 | 360 | # Wait for transcription models to start 361 | logging.debug('Waiting for main transcription model to start') 362 | self.main_transcription_ready_event.wait() 363 | logging.debug('Main transcription model ready') 364 | 365 | logging.debug('RealtimeSTT initialization completed successfully') 366 | 367 | def _start_thread(self, target=None, args=()): 368 | 369 | if (platform.system() == 'Linux'): 370 | thread = threading.Thread(target=target, args=args) 371 | thread.deamon = True 372 | thread.start() 373 | return thread 374 | else: 375 | thread = mp.Process(target=target, args=args) 376 | thread.start() 377 | return thread 378 | 379 | @staticmethod 380 | def _transcription_worker(conn, 381 | model_path, 382 | compute_type, 383 | gpu_device_index, 384 | device, 385 | ready_event, 386 | shutdown_event, 387 | interrupt_stop_event, 388 | beam_size, 389 | initial_prompt, 390 | suppress_tokens 391 | ): 392 | 393 | 394 | logging.info("Initializing faster_whisper " 395 | f"main transcription model {model_path}" 396 | ) 397 | 398 | try: 399 | model = faster_whisper.WhisperModel( 400 | model_size_or_path=model_path, 401 | device=device, 402 | compute_type=compute_type, 403 | device_index=gpu_device_index, 404 | ) 405 | 406 | except Exception as e: 407 | logging.exception("Error initializing main " 408 | f"faster_whisper transcription model: {e}" 409 | ) 410 | raise 411 | 412 | ready_event.set() 413 | 414 | logging.debug("Faster_whisper main speech to text " 415 | 
"transcription model initialized successfully" 416 | ) 417 | 418 | while not shutdown_event.is_set(): 419 | try: 420 | if conn.poll(0.5): 421 | audio, language = conn.recv() 422 | try: 423 | segments = model.transcribe( 424 | audio, 425 | language=language if language else None, 426 | beam_size=beam_size, 427 | initial_prompt=initial_prompt, 428 | suppress_tokens=suppress_tokens 429 | ) 430 | segments = segments[0] 431 | transcription = " ".join(seg.text for seg in segments) 432 | transcription = transcription.strip() 433 | conn.send(('success', transcription)) 434 | except Exception as e: 435 | logging.error(f"General transcription error: {e}") 436 | conn.send(('error', str(e))) 437 | else: 438 | # If there's no data, sleep / prevent busy waiting 439 | time.sleep(0.02) 440 | except KeyboardInterrupt: 441 | interrupt_stop_event.set() 442 | logging.debug("Transcription worker process " 443 | "finished due to KeyboardInterrupt" 444 | ) 445 | break 446 | 447 | @staticmethod 448 | def _audio_data_worker(audio_queue, 449 | sample_rate, 450 | buffer_size, 451 | input_device_index, 452 | shutdown_event, 453 | interrupt_stop_event, 454 | use_microphone): 455 | 456 | try: 457 | audio_interface = pyaudio.PyAudio() 458 | if input_device_index is None: 459 | default_device = audio_interface.get_default_input_device_info() 460 | input_device_index = default_device['index'] 461 | stream = audio_interface.open( 462 | rate=sample_rate, 463 | format=pyaudio.paInt16, 464 | channels=1, 465 | input=True, 466 | frames_per_buffer=buffer_size, 467 | input_device_index=input_device_index, 468 | ) 469 | 470 | except Exception as e: 471 | logging.exception("Error initializing pyaudio " 472 | f"audio recording: {e}" 473 | ) 474 | raise 475 | 476 | logging.debug("Audio recording (pyAudio input " 477 | "stream) initialized successfully" 478 | ) 479 | 480 | try: 481 | while not shutdown_event.is_set(): 482 | try: 483 | data = stream.read(buffer_size) 484 | 485 | except OSError as e: 486 | if e.errno == pyaudio.paInputOverflowed: 487 | logging.warning("Input overflowed. Frame dropped.") 488 | else: 489 | logging.error(f"Error during recording: {e}") 490 | tb_str = traceback.format_exc() 491 | print(f"Traceback: {tb_str}") 492 | print(f"Error: {e}") 493 | continue 494 | 495 | except Exception as e: 496 | logging.error(f"Error during recording: {e}") 497 | tb_str = traceback.format_exc() 498 | print(f"Traceback: {tb_str}") 499 | print(f"Error: {e}") 500 | continue 501 | 502 | if use_microphone.value: 503 | audio_queue.put(data) 504 | 505 | except KeyboardInterrupt: 506 | interrupt_stop_event.set() 507 | logging.debug("Audio data worker process " 508 | "finished due to KeyboardInterrupt" 509 | ) 510 | finally: 511 | stream.stop_stream() 512 | stream.close() 513 | audio_interface.terminate() 514 | 515 | def wakeup(self): 516 | 517 | self.listen_start = time.time() 518 | 519 | def abort(self): 520 | self.start_recording_on_voice_activity = False 521 | self.stop_recording_on_voice_deactivity = False 522 | self._set_state("inactive") 523 | self.interrupt_stop_event.set() 524 | self.was_interrupted.wait() 525 | self.was_interrupted.clear() 526 | 527 | def wait_audio(self): 528 | 529 | 530 | self.listen_start = time.time() 531 | 532 | # If not yet started recording, wait for voice activity to initiate. 
533 | if not self.is_recording and not self.frames: 534 | self._set_state("listening") 535 | self.start_recording_on_voice_activity = True 536 | 537 | # Wait until recording starts 538 | while not self.interrupt_stop_event.is_set(): 539 | if self.start_recording_event.wait(timeout=0.02): 540 | break 541 | 542 | # If recording is ongoing, wait for voice inactivity 543 | # to finish recording. 544 | if self.is_recording: 545 | self.stop_recording_on_voice_deactivity = True 546 | 547 | # Wait until recording stops 548 | while not self.interrupt_stop_event.is_set(): 549 | if (self.stop_recording_event.wait(timeout=0.02)): 550 | break 551 | 552 | # Convert recorded frames to the appropriate audio format. 553 | audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16) 554 | self.audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE 555 | self.frames.clear() 556 | 557 | # Reset recording-related timestamps 558 | self.recording_stop_time = 0 559 | self.listen_start = 0 560 | 561 | self._set_state("inactive") 562 | 563 | def transcribe(self): 564 | 565 | self._set_state("transcribing") 566 | audio_copy = copy.deepcopy(self.audio) 567 | self.parent_transcription_pipe.send((self.audio, self.language)) 568 | status, result = self.parent_transcription_pipe.recv() 569 | 570 | self._set_state("inactive") 571 | if status == 'success': 572 | self.last_transcription_bytes = audio_copy 573 | return self._preprocess_output(result) 574 | else: 575 | logging.error(result) 576 | raise Exception(result) 577 | 578 | def _process_wakeword(self, data): 579 | """ 580 | Processes audio data to detect wake words. 581 | """ 582 | if self.wakeword_backend in {'pvp', 'pvporcupine'}: 583 | pcm = struct.unpack_from( 584 | "h" * self.buffer_size, 585 | data 586 | ) 587 | porcupine_index = self.porcupine.process(pcm) 588 | if self.debug_mode: 589 | print (f"wake words porcupine_index: {porcupine_index}") 590 | return self.porcupine.process(pcm) 591 | 592 | elif self.wakeword_backend in {'oww', 'openwakeword', 'openwakewords'}: 593 | pcm = np.frombuffer(data, dtype=np.int16) 594 | prediction = self.owwModel.predict(pcm) 595 | max_score = -1 596 | max_index = -1 597 | wake_words_in_prediction = len(self.owwModel.prediction_buffer.keys()) 598 | self.wake_words_sensitivities 599 | if wake_words_in_prediction: 600 | for idx, mdl in enumerate(self.owwModel.prediction_buffer.keys()): 601 | scores = list(self.owwModel.prediction_buffer[mdl]) 602 | if scores[-1] >= self.wake_words_sensitivity and scores[-1] > max_score: 603 | max_score = scores[-1] 604 | max_index = idx 605 | if self.debug_mode: 606 | print (f"wake words oww max_index, max_score: {max_index} {max_score}") 607 | return max_index 608 | else: 609 | if self.debug_mode: 610 | print (f"wake words oww_index: -1") 611 | return -1 612 | 613 | if self.debug_mode: 614 | print("wake words no match") 615 | return -1 616 | 617 | def text(self, 618 | on_transcription_finished=None, 619 | ): 620 | 621 | 622 | self.interrupt_stop_event.clear() 623 | self.was_interrupted.clear() 624 | 625 | self.wait_audio() 626 | 627 | if self.is_shut_down or self.interrupt_stop_event.is_set(): 628 | if self.interrupt_stop_event.is_set(): 629 | self.was_interrupted.set() 630 | return "" 631 | 632 | if on_transcription_finished: 633 | threading.Thread(target=on_transcription_finished, 634 | args=(self.transcribe(),)).start() 635 | else: 636 | return self.transcribe() 637 | 638 | def start(self): 639 | 640 | 641 | # Ensure there's a minimum interval 642 | # between stopping and starting 
recording 643 | if (time.time() - self.recording_stop_time 644 | < self.min_gap_between_recordings): 645 | logging.info("Attempted to start recording " 646 | "too soon after stopping." 647 | ) 648 | return self 649 | 650 | logging.info("recording started") 651 | self._set_state("recording") 652 | self.text_storage = [] 653 | self.realtime_stabilized_text = "" 654 | self.realtime_stabilized_safetext = "" 655 | self.wakeword_detected = False 656 | self.wake_word_detect_time = 0 657 | self.frames = [] 658 | self.is_recording = True 659 | self.recording_start_time = time.time() 660 | self.is_silero_speech_active = False 661 | self.is_webrtc_speech_active = False 662 | self.stop_recording_event.clear() 663 | self.start_recording_event.set() 664 | 665 | if self.on_recording_start: 666 | self.on_recording_start() 667 | 668 | return self 669 | 670 | def stop(self): 671 | 672 | 673 | # Ensure there's a minimum interval 674 | # between starting and stopping recording 675 | if (time.time() - self.recording_start_time 676 | < self.min_length_of_recording): 677 | logging.info("Attempted to stop recording " 678 | "too soon after starting." 679 | ) 680 | return self 681 | 682 | logging.info("recording stopped") 683 | self.is_recording = False 684 | self.recording_stop_time = time.time() 685 | self.is_silero_speech_active = False 686 | self.is_webrtc_speech_active = False 687 | self.silero_check_time = 0 688 | self.start_recording_event.clear() 689 | self.stop_recording_event.set() 690 | 691 | if self.on_recording_stop: 692 | self.on_recording_stop() 693 | 694 | return self 695 | 696 | def feed_audio(self, chunk, original_sample_rate=16000): 697 | 698 | # Check if the buffer attribute exists, if not, initialize it 699 | if not hasattr(self, 'buffer'): 700 | self.buffer = bytearray() 701 | 702 | # Check if input is a NumPy array 703 | if isinstance(chunk, np.ndarray): 704 | # Handle stereo to mono conversion if necessary 705 | if chunk.ndim == 2: 706 | chunk = np.mean(chunk, axis=1) 707 | 708 | # Resample to 16000 Hz if necessary 709 | if original_sample_rate != 16000: 710 | num_samples = int(len(chunk) * 16000 / original_sample_rate) 711 | chunk = resample(chunk, num_samples) 712 | 713 | # Ensure data type is int16 714 | chunk = chunk.astype(np.int16) 715 | 716 | # Convert the NumPy array to bytes 717 | chunk = chunk.tobytes() 718 | 719 | # Append the chunk to the buffer 720 | self.buffer += chunk 721 | buf_size = 2 * self.buffer_size # silero complains if too short 722 | 723 | # Check if the buffer has reached or exceeded the buffer_size 724 | while len(self.buffer) >= buf_size: 725 | # Extract self.buffer_size amount of data from the buffer 726 | to_process = self.buffer[:buf_size] 727 | self.buffer = self.buffer[buf_size:] 728 | 729 | # Feed the extracted data to the audio_queue 730 | self.audio_queue.put(to_process) 731 | 732 | def set_microphone(self, microphone_on=True): 733 | """ 734 | Set the microphone on or off. 
735 | """ 736 | logging.info("Setting microphone to: " + str(microphone_on)) 737 | self.use_microphone.value = microphone_on 738 | 739 | def shutdown(self): 740 | 741 | 742 | # Force wait_audio() and text() to exit 743 | self.is_shut_down = True 744 | self.start_recording_event.set() 745 | self.stop_recording_event.set() 746 | 747 | self.shutdown_event.set() 748 | self.is_recording = False 749 | self.is_running = False 750 | 751 | logging.debug('Finishing recording thread') 752 | if self.recording_thread: 753 | self.recording_thread.join() 754 | 755 | logging.debug('Terminating reader process') 756 | 757 | # Give it some time to finish the loop and cleanup. 758 | if self.use_microphone: 759 | self.reader_process.join(timeout=10) 760 | 761 | if self.reader_process.is_alive(): 762 | logging.warning("Reader process did not terminate " 763 | "in time. Terminating forcefully." 764 | ) 765 | self.reader_process.terminate() 766 | 767 | logging.debug('Terminating transcription process') 768 | self.transcript_process.join(timeout=10) 769 | 770 | if self.transcript_process.is_alive(): 771 | logging.warning("Transcript process did not terminate " 772 | "in time. Terminating forcefully." 773 | ) 774 | self.transcript_process.terminate() 775 | 776 | self.parent_transcription_pipe.close() 777 | 778 | logging.debug('Finishing realtime thread') 779 | if self.realtime_thread: 780 | self.realtime_thread.join() 781 | 782 | if self.enable_realtime_transcription: 783 | if self.realtime_model_type: 784 | del self.realtime_model_type 785 | self.realtime_model_type = None 786 | gc.collect() 787 | 788 | def _recording_worker(self): 789 | 790 | 791 | logging.debug('Starting recording worker') 792 | 793 | try: 794 | was_recording = False 795 | delay_was_passed = False 796 | 797 | # Continuously monitor audio for voice activity 798 | while self.is_running: 799 | 800 | try: 801 | 802 | data = self.audio_queue.get() 803 | if self.on_recorded_chunk: 804 | self.on_recorded_chunk(data) 805 | 806 | if self.handle_buffer_overflow: 807 | # Handle queue overflow 808 | if (self.audio_queue.qsize() > 809 | self.allowed_latency_limit): 810 | logging.warning("Audio queue size exceeds " 811 | "latency limit. Current size: " 812 | f"{self.audio_queue.qsize()}. " 813 | "Discarding old audio chunks." 
814 | ) 815 | 816 | while (self.audio_queue.qsize() > 817 | self.allowed_latency_limit): 818 | 819 | data = self.audio_queue.get() 820 | 821 | except BrokenPipeError: 822 | print("BrokenPipeError _recording_worker") 823 | self.is_running = False 824 | break 825 | 826 | if not self.is_recording: 827 | # Handle not recording state 828 | time_since_listen_start = (time.time() - self.listen_start 829 | if self.listen_start else 0) 830 | 831 | 832 | 833 | # Set state and spinner text 834 | if not self.recording_stop_time: 835 | # if self.use_wake_words \ 836 | # and wake_word_activation_delay_passed \ 837 | # and not self.wakeword_detected: 838 | # self._set_state("wakeword") 839 | # else: 840 | if self.listen_start: 841 | self._set_state("listening") 842 | else: 843 | self._set_state("inactive") 844 | 845 | 846 | # Check for voice activity to 847 | # trigger the start of recording 848 | if (self.start_recording_on_voice_activity): 849 | 850 | if self._is_voice_active(): 851 | logging.info("voice activity detected") 852 | 853 | self.start() 854 | 855 | if self.is_recording: 856 | self.start_recording_on_voice_activity = False 857 | 858 | # Add the buffered audio 859 | # to the recording frames 860 | self.frames.extend(list(self.audio_buffer)) 861 | self.audio_buffer.clear() 862 | 863 | self.silero_vad_model.reset_states() 864 | else: 865 | data_copy = data[:] 866 | self._check_voice_activity(data_copy) 867 | 868 | self.speech_end_silence_start = 0 869 | 870 | else: 871 | # If we are currently recording 872 | 873 | # Stop the recording if silence is detected after speech 874 | if self.stop_recording_on_voice_deactivity: 875 | 876 | if not self._is_webrtc_speech(data, True): 877 | 878 | # Voice deactivity was detected, so we start 879 | # measuring silence time before stopping recording 880 | if self.speech_end_silence_start == 0: 881 | self.speech_end_silence_start = time.time() 882 | 883 | else: 884 | self.speech_end_silence_start = 0 885 | 886 | # Wait for silence to stop recording after speech 887 | if self.speech_end_silence_start and time.time() - \ 888 | self.speech_end_silence_start > \ 889 | self.post_speech_silence_duration: 890 | logging.info("voice deactivity detected") 891 | self.stop() 892 | 893 | if not self.is_recording and was_recording: 894 | # Reset after stopping recording to ensure clean state 895 | self.stop_recording_on_voice_deactivity = False 896 | 897 | if time.time() - self.silero_check_time > 0.1: 898 | self.silero_check_time = 0 899 | 900 | 901 | was_recording = self.is_recording 902 | 903 | if self.is_recording: 904 | self.frames.append(data) 905 | 906 | if not self.is_recording or self.speech_end_silence_start: 907 | self.audio_buffer.append(data) 908 | 909 | except Exception as e: 910 | if not self.interrupt_stop_event.is_set(): 911 | logging.error(f"Unhandled exeption in _recording_worker: {e}") 912 | raise 913 | 914 | def _realtime_worker(self): 915 | 916 | try: 917 | 918 | logging.debug('Starting realtime worker') 919 | 920 | # Return immediately if real-time transcription is not enabled 921 | if not self.enable_realtime_transcription: 922 | return 923 | 924 | # Continue running as long as the main process is active 925 | while self.is_running: 926 | 927 | # Check if the recording is active 928 | if self.is_recording: 929 | 930 | # Sleep for the duration of the transcription resolution 931 | time.sleep(self.realtime_processing_pause) 932 | 933 | # Convert the buffer frames to a NumPy array 934 | audio_array = np.frombuffer( 935 | b''.join(self.frames), 936 | 
dtype=np.int16 937 | ) 938 | 939 | # Normalize the array to a [-1, 1] range 940 | audio_array = audio_array.astype(np.float32) / \ 941 | INT16_MAX_ABS_VALUE 942 | 943 | # Perform transcription and assemble the text 944 | segments = self.realtime_model_type.transcribe( 945 | audio_array, 946 | language=self.language if self.language else None, 947 | beam_size=self.beam_size_realtime, 948 | initial_prompt=self.initial_prompt, 949 | suppress_tokens=self.suppress_tokens, 950 | ) 951 | 952 | # double check recording state 953 | # because it could have changed mid-transcription 954 | if self.is_recording and time.time() - \ 955 | self.recording_start_time > 0.5: 956 | 957 | logging.debug('Starting realtime transcription') 958 | self.realtime_transcription_text = " ".join( 959 | seg.text for seg in segments[0] 960 | ) 961 | self.realtime_transcription_text = \ 962 | self.realtime_transcription_text.strip() 963 | 964 | self.text_storage.append( 965 | self.realtime_transcription_text 966 | ) 967 | 968 | # Take the last two texts in storage, if they exist 969 | if len(self.text_storage) >= 2: 970 | last_two_texts = self.text_storage[-2:] 971 | 972 | # Find the longest common prefix 973 | # between the two texts 974 | prefix = os.path.commonprefix( 975 | [last_two_texts[0], last_two_texts[1]] 976 | ) 977 | 978 | # This prefix is the text that was transcripted 979 | # two times in the same way 980 | # Store as "safely detected text" 981 | if len(prefix) >= \ 982 | len(self.realtime_stabilized_safetext): 983 | 984 | # Only store when longer than the previous 985 | # as additional security 986 | self.realtime_stabilized_safetext = prefix 987 | 988 | # Find parts of the stabilized text 989 | # in the freshly transcripted text 990 | matching_pos = self._find_tail_match_in_text( 991 | self.realtime_stabilized_safetext, 992 | self.realtime_transcription_text 993 | ) 994 | 995 | if matching_pos < 0: 996 | if self.realtime_stabilized_safetext: 997 | self._on_realtime_transcription_stabilized( 998 | self._preprocess_output( 999 | self.realtime_stabilized_safetext, 1000 | True 1001 | ) 1002 | ) 1003 | else: 1004 | self._on_realtime_transcription_stabilized( 1005 | self._preprocess_output( 1006 | self.realtime_transcription_text, 1007 | True 1008 | ) 1009 | ) 1010 | else: 1011 | # We found parts of the stabilized text 1012 | # in the transcripted text 1013 | # We now take the stabilized text 1014 | # and add only the freshly transcripted part to it 1015 | output_text = self.realtime_stabilized_safetext + \ 1016 | self.realtime_transcription_text[matching_pos:] 1017 | 1018 | # This yields us the "left" text part as stabilized 1019 | # AND at the same time delivers fresh detected 1020 | # parts on the first run without the need for 1021 | # two transcriptions 1022 | self._on_realtime_transcription_stabilized( 1023 | self._preprocess_output(output_text, True) 1024 | ) 1025 | 1026 | # Invoke the callback with the transcribed text 1027 | self._on_realtime_transcription_update( 1028 | self._preprocess_output( 1029 | self.realtime_transcription_text, 1030 | True 1031 | ) 1032 | ) 1033 | 1034 | # If not recording, sleep briefly before checking again 1035 | else: 1036 | time.sleep(TIME_SLEEP) 1037 | 1038 | except Exception as e: 1039 | logging.error(f"Unhandled exeption in _realtime_worker: {e}") 1040 | raise 1041 | 1042 | def _is_silero_speech(self, chunk): 1043 | 1044 | if self.sample_rate != 16000: 1045 | pcm_data = np.frombuffer(chunk, dtype=np.int16) 1046 | data_16000 = signal.resample_poly( 1047 | pcm_data, 16000, 
self.sample_rate) 1048 | chunk = data_16000.astype(np.int16).tobytes() 1049 | 1050 | self.silero_working = True 1051 | audio_chunk = np.frombuffer(chunk, dtype=np.int16) 1052 | audio_chunk = audio_chunk.astype(np.float32) / INT16_MAX_ABS_VALUE 1053 | vad_prob = self.silero_vad_model( 1054 | torch.from_numpy(audio_chunk), 1055 | SAMPLE_RATE).item() 1056 | is_silero_speech_active = vad_prob > (1 - self.silero_sensitivity) 1057 | if is_silero_speech_active: 1058 | self.is_silero_speech_active = True 1059 | self.silero_working = False 1060 | return is_silero_speech_active 1061 | 1062 | def _is_webrtc_speech(self, chunk, all_frames_must_be_true=False): 1063 | 1064 | if self.sample_rate != 16000: 1065 | pcm_data = np.frombuffer(chunk, dtype=np.int16) 1066 | data_16000 = signal.resample_poly( 1067 | pcm_data, 16000, self.sample_rate) 1068 | chunk = data_16000.astype(np.int16).tobytes() 1069 | 1070 | # Number of audio frames per millisecond 1071 | frame_length = int(16000 * 0.01) # for 10ms frame 1072 | num_frames = int(len(chunk) / (2 * frame_length)) 1073 | speech_frames = 0 1074 | 1075 | for i in range(num_frames): 1076 | start_byte = i * frame_length * 2 1077 | end_byte = start_byte + frame_length * 2 1078 | frame = chunk[start_byte:end_byte] 1079 | if self.webrtc_vad_model.is_speech(frame, 16000): 1080 | speech_frames += 1 1081 | if not all_frames_must_be_true: 1082 | if self.debug_mode: 1083 | print(f"Speech detected in frame {i + 1}" 1084 | f" of {num_frames}") 1085 | return True 1086 | if all_frames_must_be_true: 1087 | if self.debug_mode and speech_frames == num_frames: 1088 | print(f"Speech detected in {speech_frames} of " 1089 | f"{num_frames} frames") 1090 | elif self.debug_mode: 1091 | print(f"Speech not detected in all {num_frames} frames") 1092 | return speech_frames == num_frames 1093 | else: 1094 | if self.debug_mode: 1095 | print(f"Speech not detected in any of {num_frames} frames") 1096 | return False 1097 | 1098 | def _check_voice_activity(self, data): 1099 | 1100 | self.is_webrtc_speech_active = self._is_webrtc_speech(data) 1101 | 1102 | # First quick performing check for voice activity using WebRTC 1103 | if self.is_webrtc_speech_active: 1104 | 1105 | if not self.silero_working: 1106 | self.silero_working = True 1107 | 1108 | # Run the intensive check in a separate thread 1109 | threading.Thread( 1110 | target=self._is_silero_speech, 1111 | args=(data,)).start() 1112 | 1113 | def _is_voice_active(self): 1114 | 1115 | return self.is_webrtc_speech_active and self.is_silero_speech_active 1116 | 1117 | def _set_state(self, new_state): 1118 | 1119 | # Check if the state has actually changed 1120 | if new_state == self.state: 1121 | return 1122 | 1123 | # Store the current state for later comparison 1124 | old_state = self.state 1125 | 1126 | # Update to the new state 1127 | self.state = new_state 1128 | 1129 | # Execute callbacks based on transitioning FROM a particular state 1130 | if old_state == "listening": 1131 | if self.on_vad_detect_stop: 1132 | self.on_vad_detect_stop() 1133 | elif old_state == "wakeword": 1134 | if self.on_wakeword_detection_end: 1135 | self.on_wakeword_detection_end() 1136 | 1137 | # Execute callbacks based on transitioning TO a particular state 1138 | if new_state == "listening": 1139 | if self.on_vad_detect_start: 1140 | self.on_vad_detect_start() 1141 | self._set_spinner("speak now") 1142 | if self.spinner and self.halo: 1143 | self.halo._interval = 250 1144 | elif new_state == "wakeword": 1145 | if self.on_wakeword_detection_start: 1146 | 
self.on_wakeword_detection_start() 1147 | self._set_spinner(f"say {self.wake_words}") 1148 | if self.spinner and self.halo: 1149 | self.halo._interval = 500 1150 | elif new_state == "transcribing": 1151 | if self.on_transcription_start: 1152 | self.on_transcription_start() 1153 | self._set_spinner("transcribing") 1154 | if self.spinner and self.halo: 1155 | self.halo._interval = 50 1156 | elif new_state == "recording": 1157 | self._set_spinner("recording") 1158 | if self.spinner and self.halo: 1159 | self.halo._interval = 100 1160 | elif new_state == "inactive": 1161 | if self.spinner and self.halo: 1162 | self.halo.stop() 1163 | self.halo = None 1164 | 1165 | def _set_spinner(self, text): 1166 | 1167 | if self.spinner: 1168 | # If the Halo spinner doesn't exist, create and start it 1169 | if self.halo is None: 1170 | self.halo = halo.Halo(text=text) 1171 | self.halo.start() 1172 | # If the Halo spinner already exists, just update the text 1173 | else: 1174 | self.halo.text = text 1175 | 1176 | def _preprocess_output(self, text, preview=False): 1177 | 1178 | text = re.sub(r'\s+', ' ', text.strip()) 1179 | 1180 | if self.ensure_sentence_starting_uppercase: 1181 | if text: 1182 | text = text[0].upper() + text[1:] 1183 | 1184 | # Ensure the text ends with a proper punctuation 1185 | # if it ends with an alphanumeric character 1186 | if not preview: 1187 | if self.ensure_sentence_ends_with_period: 1188 | if text and text[-1].isalnum(): 1189 | text += '.' 1190 | 1191 | return text 1192 | 1193 | def _find_tail_match_in_text(self, text1, text2, length_of_match=10): 1194 | 1195 | 1196 | # Check if either of the texts is too short 1197 | if len(text1) < length_of_match or len(text2) < length_of_match: 1198 | return -1 1199 | 1200 | # The end portion of the first text that we want to compare 1201 | target_substring = text1[-length_of_match:] 1202 | 1203 | # Loop through text2 from right to left 1204 | for i in range(len(text2) - length_of_match + 1): 1205 | # Extract the substring from text2 1206 | # to compare with the target_substring 1207 | current_substring = text2[len(text2) - i - length_of_match: 1208 | len(text2) - i] 1209 | 1210 | # Compare the current_substring with the target_substring 1211 | if current_substring == target_substring: 1212 | # Position in text2 where the match starts 1213 | return len(text2) - i 1214 | 1215 | return -1 1216 | 1217 | def _on_realtime_transcription_stabilized(self, text): 1218 | 1219 | if self.on_realtime_transcription_stabilized: 1220 | if self.is_recording: 1221 | self.on_realtime_transcription_stabilized(text) 1222 | 1223 | def _on_realtime_transcription_update(self, text): 1224 | 1225 | if self.on_realtime_transcription_update: 1226 | if self.is_recording: 1227 | self.on_realtime_transcription_update(text) 1228 | 1229 | def __enter__(self): 1230 | 1231 | return self 1232 | 1233 | def __exit__(self, exc_type, exc_value, traceback): 1234 | 1235 | self.shutdown() 1236 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 RobotTelevision 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit 
persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CrowAssistant 2 | Crow is a Desktop AI Assistant 3 | 4 | [![Crow Demo](https://img.youtube.com/vi/XdR7Uo3DPys/0.jpg)](https://www.youtube.com/watch?v=XdR7Uo3DPys) 5 | 6 | ## Features 7 | - Pixel Art Crow desktop friend flies and lands on the bottom of whatever window you're focused on. 8 | - Faster-Whisper Speech to Text and VAD pulled from: https://github.com/KoljaB/RealtimeSTT 9 | - Piper Text to Speech with over 900 voices to choose from. https://github.com/rhasspy/piper 10 | - Interruptible: by saying his name he can stop and listen. 11 | - Audio Ducking: lowers the volume while recording so you can play music while talking to Crow. 12 | - Automatically pauses the conversation after a long silence and waits to hear his name to start the conversation again. 13 | - Website Interface for conversation logs and settings 14 | 15 | ## How to Use 16 | 17 | Double-click on the Crow to open the web interface. Open the settings and get a free-plan API key from Groq. 18 | Set up your mic and speakers, save the settings, and then restart Crow. 19 | To start talking to Crow, just say his name and he should start listening. 20 | 21 | When Crow is not in conversation, he rests above the system tray. 22 | 23 | ## Running the Code 24 | 25 | You'll need to download a Windows release of Piper: https://github.com/rhasspy/piper/releases 26 | Put the exe and other files right in the base directory... I know it's a bit of a messy way to do things, but I'll try to clean it up in future releases. 27 | 28 | You'll also need to grab the libritts_r onnx and json files for the voice to work: https://huggingface.co/rhasspy/piper-voices/tree/main/en/en_US/libritts_r/medium 29 | 30 | And that should do it.
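If you want to sanity-check the bundled speech-to-text layer on its own, `CrowSTT.py` exposes the `AudioToTextRecorder` class (a trimmed-down fork of RealtimeSTT). The sketch below is only an illustration, not part of the project: it assumes the constructor's defaults (microphone input and a small faster-whisper model) are usable standalone on your machine, and it needs the same dependencies Crow itself uses (`faster-whisper`, `torch`, `pyaudio`, `webrtcvad`).

```python
# Hypothetical standalone test of CrowSTT's recorder -- Crow normally drives this for you.
from CrowSTT import AudioToTextRecorder

if __name__ == "__main__":  # needed: the recorder spawns worker processes ('spawn' start method)
    # Assumption: the default constructor arguments work as-is; pass your own
    # model/device settings here if they do not.
    with AudioToTextRecorder() as recorder:  # the context manager calls shutdown() on exit
        print("Speak into the default microphone (Ctrl+C to quit)...")
        while True:
            # text() blocks until voice activity is detected and then ends,
            # and returns the finished transcription as a string.
            print("You said:", recorder.text())
```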
31 | -------------------------------------------------------------------------------- /Volume.py: -------------------------------------------------------------------------------- 1 | import platform 2 | 3 | class VolumeControl: 4 | def __init__(self, device_index=None): 5 | self.system = platform.system() 6 | self.device_index = device_index 7 | 8 | if self.system == "Windows": 9 | from comtypes import CLSCTX_ALL 10 | from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume 11 | device = AudioUtilities.GetSpeakers() 12 | interface = device.Activate( 13 | IAudioEndpointVolume._iid_, CLSCTX_ALL, None) 14 | self.volume = interface.QueryInterface(IAudioEndpointVolume) 15 | elif self.system == "Linux": 16 | import pulsectl 17 | self.pulse = pulsectl.Pulse('volume-control') 18 | elif self.system == "Darwin": # macOS 19 | import subprocess 20 | 21 | def get_volume(self): 22 | if self.system == "Windows": 23 | return round(self.volume.GetMasterVolumeLevelScalar() * 100) 24 | elif self.system == "Linux": 25 | sinks = self.pulse.sink_list() 26 | if self.device_index is not None and 0 <= self.device_index < len(sinks): 27 | return round(sinks[self.device_index].volume.value_flat * 100) 28 | elif sinks: 29 | return round(sinks[0].volume.value_flat * 100) 30 | elif self.system == "Darwin": 31 | cmd = f"osascript -e 'output volume of (get volume settings)'" 32 | if self.device_index is not None: 33 | cmd = f"osascript -e 'tell application \"System Events\" to get volume settings of audio device \"{self.device_index}\"'" 34 | return int(subprocess.check_output(cmd, shell=True).strip().split(", ")[0].split(":")[1]) 35 | 36 | def set_volume(self, volume): 37 | volume = max(0, min(100, volume)) # Ensure volume is between 0 and 100 38 | if self.system == "Windows": 39 | self.volume.SetMasterVolumeLevelScalar(volume / 100, None) 40 | elif self.system == "Linux": 41 | sinks = self.pulse.sink_list() 42 | if self.device_index is not None and 0 <= self.device_index < len(sinks): 43 | self.pulse.volume_set_all_chans(sinks[self.device_index], volume / 100) 44 | elif sinks: 45 | self.pulse.volume_set_all_chans(sinks[0], volume / 100) 46 | elif self.system == "Darwin": 47 | cmd = f"osascript -e 'set volume output volume {volume}'" 48 | if self.device_index is not None: 49 | cmd = f"osascript -e 'tell application \"System Events\" to set volume of audio device \"{self.device_index}\" to {volume}'" 50 | subprocess.call(cmd, shell=True) 51 | 52 | @staticmethod 53 | def list_devices(): 54 | system = platform.system() 55 | if system == "Windows": 56 | from pycaw.pycaw import AudioUtilities 57 | return [device.FriendlyName for device in AudioUtilities.GetAllDevices()] 58 | elif system == "Linux": 59 | import pulsectl 60 | with pulsectl.Pulse('device-list') as pulse: 61 | return [sink.name for sink in pulse.sink_list()] 62 | elif system == "Darwin": 63 | import subprocess 64 | cmd = "system_profiler SPAudioDataType | grep -A 1 'Output:' | grep -v 'Output:' | awk -F: '{print $1}' | sed 's/^[ \t]*//'" 65 | return subprocess.check_output(cmd, shell=True).decode().strip().split('\n') 66 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "llama3-8b-8192", 3 | "name": "Crow", 4 | "voice": 113, 5 | "personality": "You are sarcastic, mischievous and slightly annoying, while still being helpful.", 6 | "url": "https://api.groq.com/openai/v1", 7 | "api_key": "Enter API Key Here", 8 | 
"mic": "Default", 9 | "speaker": "Default", 10 | "scale": 2, 11 | "port": 5000, 12 | "maxtoken": 32000, 13 | "maxmsg": 100 14 | } -------------------------------------------------------------------------------- /crow.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobotTelevision/CrowAssistant/6cc0e6947eb770083373b9c3d98966cb76619e47/crow.ico -------------------------------------------------------------------------------- /images/crow-idle1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobotTelevision/CrowAssistant/6cc0e6947eb770083373b9c3d98966cb76619e47/images/crow-idle1.png -------------------------------------------------------------------------------- /images/crow-idle2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobotTelevision/CrowAssistant/6cc0e6947eb770083373b9c3d98966cb76619e47/images/crow-idle2.png -------------------------------------------------------------------------------- /images/crow-wingleft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobotTelevision/CrowAssistant/6cc0e6947eb770083373b9c3d98966cb76619e47/images/crow-wingleft.png -------------------------------------------------------------------------------- /images/crow-wingright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobotTelevision/CrowAssistant/6cc0e6947eb770083373b9c3d98966cb76619e47/images/crow-wingright.png -------------------------------------------------------------------------------- /images/crowfly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobotTelevision/CrowAssistant/6cc0e6947eb770083373b9c3d98966cb76619e47/images/crowfly.png -------------------------------------------------------------------------------- /images/crowhead-blink.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobotTelevision/CrowAssistant/6cc0e6947eb770083373b9c3d98966cb76619e47/images/crowhead-blink.png -------------------------------------------------------------------------------- /images/crowhead-lookback.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobotTelevision/CrowAssistant/6cc0e6947eb770083373b9c3d98966cb76619e47/images/crowhead-lookback.png -------------------------------------------------------------------------------- /images/crowhead-tilt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobotTelevision/CrowAssistant/6cc0e6947eb770083373b9c3d98966cb76619e47/images/crowhead-tilt.png -------------------------------------------------------------------------------- /images/crowhead-tiltold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobotTelevision/CrowAssistant/6cc0e6947eb770083373b9c3d98966cb76619e47/images/crowhead-tiltold.png -------------------------------------------------------------------------------- /images/crowhead.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RobotTelevision/CrowAssistant/6cc0e6947eb770083373b9c3d98966cb76619e47/images/crowhead.png -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Crow Web Interface 7 | 8 | 9 | 10 | 121 | 122 | 123 | 124 | 138 |
139 |
140 | 141 |
142 | 143 |
144 | 145 | 307 | 308 | 309 | 310 | -------------------------------------------------------------------------------- /templates/settings.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | CrowSettings 7 | 8 | 86 | 87 | 88 | 140 |
141 |
142 | 143 |

Crow Settings

144 | Return to Conversations 145 |

146 | For Crow to work, you'll need to get and enter the API key and URL.
Click the link to get a Groq key; there is a free tier you can use. 147 | 148 | 149 | 150 | 151 |

Enter the name you'd like to call your AI assistant. Make sure it's something the Text To Speech system can understand easily.

152 | 153 | 154 | 155 |

Describe the personality traits you want your AI to exhibit. (e.g., You are sarcastic and tell puns constantly.)

156 | 157 | 158 | 159 |

Select a voice ID for text-to-speech. Changing this value lets you preview the voice.

160 | 161 | 162 | 163 |

Enter the API endpoint URL for the AI service. For Groq this is: https://api.groq.com/openai/v1

164 | 165 | 166 | 167 | Get your Groq API key 168 |

Enter your API key for authentication. Keep this private!

169 | 170 | 171 | 172 |

Specify the AI model to use. This needs to be the exact model name used by the API.
For Groq, try: llama-3.1-8b-instant

173 | 174 | 175 | 176 |

Adjust the size of the crow on screen.

177 | 178 | 179 | 184 |

Select the microphone device for voice input.

185 | 186 | 187 | 192 |

Choose the speaker device for audio output.

193 | 194 |
195 | If you don't know what these are, don't change them :D 196 | 197 | 198 |

Specify the port number for the application to run on.

199 | 200 | 201 | 202 |

Set the maximum number of tokens allowed in a conversation.

203 | 204 | 205 | 206 |

Set the maximum number of messages kept in the conversation history.

207 | 208 |
209 | 210 |
211 |
212 |
213 | 245 | 246 | 247 | --------------------------------------------------------------------------------