├── LICENSE ├── README.md ├── backend ├── .env ├── __init__.py ├── config.py ├── main.py ├── prompts │ ├── system_prompt.md │ ├── user_profile.json │ └── vision_settings.json ├── requirements.txt ├── routes │ ├── __init__.py │ └── websocket.py └── services │ ├── __init__.py │ ├── conversation_storage.py │ ├── llm.py │ ├── transcription.py │ ├── tts.py │ └── vision.py ├── docs ├── Demonstration_Video.png ├── Vocalis_Demo.png ├── Vocalis_Header.png ├── Vocalis_Visual_demo.gif └── index.html ├── frontend ├── favicon.ico ├── index.html ├── package.json ├── postcss.config.js ├── src │ ├── App.tsx │ ├── components │ │ ├── AssistantOrb.tsx │ │ ├── BackgroundStars.tsx │ │ ├── ChatInterface.tsx │ │ ├── PreferencesModal.tsx │ │ ├── SessionManager.tsx │ │ └── Sidebar.tsx │ ├── index.css │ ├── main.tsx │ ├── services │ │ ├── audio.ts │ │ └── websocket.ts │ ├── utils │ │ └── hooks.ts │ └── vite-env.d.ts ├── tailwind.config.js ├── tsconfig.json ├── tsconfig.node.json └── vite.config.ts ├── install-deps.bat ├── install-deps.sh ├── run.bat ├── run.sh ├── setup.bat └── setup.sh /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Vocalis - Speech-to-Speech AI Assistant](https://lex-au.github.io/Vocalis/Vocalis_Header.png) 2 | 3 | # Vocalis 4 | 5 | [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 6 | [![React](https://img.shields.io/badge/React-18-61DAFB.svg?logo=react&logoColor=white)](https://reactjs.org/) 7 | [![FastAPI](https://img.shields.io/badge/FastAPI-0.109.2-009688.svg?logo=fastapi&logoColor=white)](https://fastapi.tiangolo.com/) 8 | [![Whisper](https://img.shields.io/badge/Whisper-Faster--Whisper-yellow.svg)](https://github.com/guillaumekln/faster-whisper) 9 | [![Python](https://img.shields.io/badge/Python-3.10-3776AB.svg?logo=python&logoColor=white)](https://www.python.org/) 10 | 11 | A sophisticated AI assistant with speech-to-speech capabilities built on a modern React frontend with a FastAPI backend. Vocalis provides a responsive, low-latency conversational experience with advanced visual feedback. 12 | 13 | ## Video Demonstration of Setup and Usage 14 | 15 | [![Watch the video](https://lex-au.github.io/Vocalis/Demonstration_Video.png)](https://www.youtube.com/watch?v=2slWwsHTNIA) 16 | 17 | ## Changelog 18 | 19 | **v1.5.0** (Vision Update) - April 12, 2025 20 | - 🔍 New image analysis capability powered by [SmolVLM-256M-Instruct model](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct) 21 | - 🖼️ Seamless image upload and processing interface 22 | - 🔄 Contextual conversation continuation based on image understanding 23 | - 🧩 Multi-modal conversation support (text, speech, and images) 24 | - 💾 Advanced session management for saving and retrieving conversations 25 | - 🎨 Improved UI with central call button and cleaner control layout 26 | - 🔌 Simplified sidebar without redundant controls 27 | 28 | **v1.0.0** (Initial Release) - March 31, 2025 29 | - ✨ Revolutionary barge-in technology for natural conversation flow 30 | - 🔊 Ultra low-latency audio streaming with adaptive buffering 31 | - 🤖 AI-initiated greetings and follow-ups for natural conversations 32 | - 🎨 Dynamic visual feedback system with state-aware animations 33 | - 🔄 Streaming TTS with chunk-based delivery for immediate responses 34 | - 🚀 Cross-platform support with optimised setup scripts 35 | - 💻 CUDA acceleration with fallback for CPU-only systems 36 | 37 | ## Features 38 | 39 | ### 🎯 Advanced Conversation Capabilities 40 | 41 | - **🗣️ Barge-In Interruption** - Interrupt the AI mid-speech for a truly natural conversation experience 42 | - **👋 AI-Initiated Greetings** - Assistant automatically welcomes users with a contextual greeting 43 | - **💬 Intelligent Follow-Ups** - System detects silence and continues conversation with natural follow-up questions 44 | - **🔄 Conversation Memory** - Maintains context throughout the conversation session 45 | - **🧠 Contextual Understanding** - Processes conversation history for coherent, relevant responses 46 | - **🖼️ Image Analysis** - Upload and discuss images with integrated visual understanding 47 | - **💾 Session Management** - Save, load, and manage conversation sessions with customisable titles 48 | 49 | ### ⚡ Ultra-Responsive Performance 50 | 51 | - **⏱️ Low-Latency Processing** - End-to-end latency under 500ms for immediate response perception 52 | - **🔊 Streaming Audio** - Begin playback before full response is generated 53 | - 
**📦 Adaptive Buffering** - Dynamically adjust audio buffer size based on network conditions 54 | - **🔌 Efficient WebSocket Protocol** - Bidirectional real-time audio streaming 55 | - **🔄 Parallel Processing** - Multi-stage pipeline for concurrent audio handling 56 | 57 | ### 🎨 Interactive Visual Experience 58 | 59 | - **🔮 Dynamic Assistant Orb** - Visual representation with state-aware animations: 60 | - Pulsing glow during listening 61 | - Particle animations during processing 62 | - Wave-like motion during speaking 63 | - **📝 Live Transcription** - Real-time display of recognised speech 64 | - **🚦 Status Indicators** - Clear visual cues for system state 65 | - **🌈 Smooth Transitions** - Fluid state changes with appealing animations 66 | - **🌙 Dark Theme** - Eye-friendly interface with cosmic aesthetic 67 | 68 | ### 🛠️ Technical Excellence 69 | 70 | - **🔍 High-Accuracy VAD** - Superior voice activity detection using custom-built VAD 71 | - **🗣️ Optimised Whisper Integration** - Faster-Whisper for rapid transcription 72 | - **🔊 Real-Time TTS** - Chunked audio delivery for immediate playback 73 | - **🖥️ Hardware Flexibility** - CUDA acceleration with CPU fallback options 74 | - **🔧 Easy Configuration** - Environment variables and user-friendly setup 75 | 76 | ## Quick Start 77 | 78 | ### Prerequisites 79 | 80 | #### Windows 81 | - Python 3.10+ installed and in your PATH 82 | - Node.js and npm installed 83 | 84 | #### macOS 85 | - Python 3.10+ installed 86 | - Install Homebrew (if not already installed): 87 | ```bash 88 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" 89 | ``` 90 | - Install Node.js and npm: 91 | ```bash 92 | brew install node 93 | ``` 94 | - **Apple Silicon (M1/M2/M3/M4) Notes**: 95 | - The setup will automatically install a compatible PyTorch version 96 | - If you encounter any PyTorch-related errors, you may need to manually install it: 97 | ```bash 98 | pip install torch 99 | ``` 100 | Then continue with the regular setup. 101 | 102 | ### One-Click Setup (Recommended) 103 | 104 | #### Windows 105 | 1. Run `setup.bat` to initialise the project (one-time setup) 106 | - Includes option for CUDA or CPU-only PyTorch installation 107 | 2. Run `run.bat` to start both frontend and backend servers 108 | 3. If you need to update dependencies later, use `install-deps.bat` 109 | 110 | #### macOS/Linux 111 | 1. Make scripts executable: `chmod +x *.sh` 112 | 2. Run `./setup.sh` to initialise the project (one-time setup) 113 | - Includes option for CUDA or CPU-only PyTorch installation 114 | 3. Run `./run.sh` to start both frontend and backend servers 115 | 4. If you need to update dependencies later, use `./install-deps.sh` 116 | 117 | ### Manual Setup (Alternative) 118 | 119 | If you prefer to set up the project manually, follow these steps: 120 | 121 | #### Backend Setup 122 | 1. Create a Python virtual environment: 123 | ```bash 124 | cd backend 125 | python -m venv env 126 | # Windows: 127 | .\env\Scripts\activate 128 | # macOS/Linux: 129 | source env/bin/activate 130 | ``` 131 | 132 | 2. Install the Python dependencies: 133 | ```bash 134 | pip install -r requirements.txt 135 | ``` 136 | 137 | 3. If you need CUDA support, install PyTorch with CUDA: 138 | ```bash 139 | pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 140 | ``` 141 | 142 | 4. Return to the project root (one level above `backend/`) and start the backend server: 143 | ```bash 144 | python -m backend.main 145 | ``` 146 | 147 | #### Frontend Setup 148 | 1. 
Install Node.js dependencies: 149 | ```bash 150 | cd frontend 151 | npm install 152 | ``` 153 | 154 | 2. Start the development server: 155 | ```bash 156 | npm run dev 157 | ``` 158 | 159 | ### Personalising Vocalis 160 | 161 | After launching Vocalis, you can customise your experience through the sidebar: 162 | 163 | 1. Click the sidebar icon to open the navigation panel 164 | 2. Under the "Settings" tab, click "Preferences" to access personalisation options 165 | 166 | The preferences modal offers several ways to tailor Vocalis to your needs: 167 | 168 | #### User Profile 169 | - **Your Name**: Enter your name to personalise greetings and make conversations more natural 170 | - This helps Vocalis address you properly during interactions 171 | 172 | #### System Prompt 173 | - Modify the AI's behaviour by editing the system prompt 174 | - The default prompt is optimised for natural voice interaction, but you can customise it for specific use cases 175 | - Use the "Restore Default" button to revert to the original prompt if needed 176 | 177 | #### Vision Capabilities 178 | - Toggle vision capabilities on/off using the switch at the bottom of the preferences panel 179 | - When enabled, Vocalis can analyse images shared during conversations 180 | - This feature allows for rich multi-modal interactions where you can discuss visual content 181 | 182 | These settings are saved automatically and persist between sessions, ensuring a consistent experience tailored to your preferences. 183 | 184 | ## External Services 185 | 186 | Vocalis is designed to work with OpenAI-compatible API endpoints for both LLM and TTS services: 187 | 188 | - **LLM (Language Model)**: By default, the backend is configured to use [LM Studio](https://lmstudio.ai/) running locally. This provides a convenient way to run local language models compatible with OpenAI's API format. 189 | 190 | **Custom Vocalis Model**: For optimal performance, Vocalis includes a purpose-built fine-tuned model: [lex-au/Vocalis-Q4_K_M.gguf](https://huggingface.co/lex-au/Vocalis-Q4_K_M.gguf). This model is based on Meta's LLaMA 3 8B Instruct and specifically optimised for immersive conversational experiences with: 191 | - Enhanced spatial and temporal context tracking 192 | - Low-latency response generation 193 | - Rich, descriptive language capabilities 194 | - Efficient resource utilisation through Q4_K_M quantisation 195 | - Seamless integration with the Vocalis speech-to-speech pipeline 196 | 197 | - **Text-to-Speech (TTS)**: For voice generation, the system works out of the box with: 198 | - [Orpheus-FASTAPI](https://github.com/Lex-au/Orpheus-FastAPI): A high-quality TTS server with OpenAI-compatible endpoints providing rich, expressive voices. 199 | 200 | You can point the TTS endpoint in `.env` at any open-source TTS project. For a faster alternative: 201 | - [Kokoro-FastAPI](https://github.com/remsky/Kokoro-FastAPI): A lightning-fast TTS alternative, optimised for minimal latency when speed is the priority over maximum expressiveness. 202 | 203 | Both services can be configured in the `backend/.env` file. The system requires these external services to function properly, as Vocalis acts as an orchestration layer combining speech recognition, language model inference, and speech synthesis. 
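
For reference, both endpoints are plain keys in `backend/.env`; the values below mirror the configuration shipped with the project, and you only need to swap in the host and port of whichever OpenAI-compatible servers you run:

```bash
# backend/.env — external service endpoints (project defaults)
LLM_API_ENDPOINT=http://127.0.0.1:1234/v1/chat/completions   # LM Studio's OpenAI-compatible chat endpoint
TTS_API_ENDPOINT=http://localhost:5005/v1/audio/speech       # Orpheus-FASTAPI (or another /v1/audio/speech server)
```

Any server that exposes the OpenAI-style `/v1/chat/completions` and `/v1/audio/speech` routes can be substituted here without code changes.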
204 | 205 | ## Visual Demo 206 | 207 | ![Assistant Interface](https://lex-au.github.io/Vocalis/Vocalis_Demo.png) 208 | 209 | ## Session Management 210 | 211 | Vocalis includes a robust session management system that allows users to save, load, and organise their conversations: 212 | 213 | ### Key Features 214 | 215 | - **Save Conversations**: Save the current conversation state with a custom title 216 | - **Load Previous Sessions**: Return to any saved conversation exactly as you left it 217 | - **Edit Session Titles**: Rename sessions for better organisation 218 | - **Delete Unwanted Sessions**: Remove conversations you no longer need 219 | - **Session Metadata**: View additional information like message count 220 | - **Automatic Timestamps**: Sessions track both creation and last update times 221 | 222 | ### Technical Implementation 223 | 224 | The session system uses a two-part architecture: 225 | 226 | 1. **Backend Storage**: 227 | - Conversations are stored as JSON files in a dedicated directory 228 | - Each session maintains its complete message history 229 | - Asynchronous file I/O prevents performance impacts 230 | - UUID-based session identification ensures uniqueness 231 | 232 | 2. **Frontend Interface**: 233 | - Intuitive sidebar UI for session management 234 | - Real-time session status updates 235 | - Active session indicator 236 | - Session creation with optional custom titles 237 | 238 | ### Usage Flow 239 | 240 | 1. Start a new conversation with the assistant 241 | 2. Click "Save As New Conversation" to preserve the current state 242 | 3. Continue your conversation or load a different session 243 | 4. Return to any saved session at any time to continue where you left off 244 | 5. Edit session titles or delete unwanted sessions as needed 245 | 246 | This persistent storage system ensures you never lose valuable conversations and can maintain separate contexts for different topics or projects. 
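
As a concrete illustration, each saved session in the `conversations/` directory is a single JSON file shaped roughly like this (the field names follow `backend/services/conversation_storage.py`; the ID, timestamps, and messages below are invented for the example):

```json
{
  "id": "b1e4c2d8-7f3a-4a52-9c0e-2d5f8a6b1c44",
  "title": "Hi, how are you doing today?...",
  "created_at": "2025-04-12T10:15:02.481923",
  "updated_at": "2025-04-12T10:21:37.905114",
  "messages": [
    { "role": "user", "content": "Hi, how are you doing today?" },
    { "role": "assistant", "content": "I'm doing well, thanks for asking! What would you like to chat about?" }
  ],
  "metadata": {}
}
```

The title defaults to roughly the first 30 characters of the first user message, and `updated_at` is refreshed each time the session is saved again.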
247 | 248 | ## Architecture Overview 249 | 250 | ```mermaid 251 | graph TB 252 | subgraph "Frontend (React)" 253 | AudioCapture[Audio Capture] 254 | AudioVisualizer[Audio Visualizer] 255 | WebSocket[WebSocket Client] 256 | AudioOutput[Audio Output] 257 | UIState[UI State Management] 258 | ImageUpload[Image Upload] 259 | SessionManager[Session Manager] 260 | end 261 | 262 | subgraph "Backend (FastAPI)" 263 | WSServer[WebSocket Server] 264 | VAD[Custom Voice Activity Detection] 265 | WhisperSTT[Faster Whisper] 266 | LLMClient[LLM Client] 267 | TTSClient[TTS Client] 268 | AudioProcessing[Audio Processing] 269 | VisionService[SmolVLM Vision Service] 270 | StorageService[Conversation Storage] 271 | EnvConfig[Environment Config] 272 | end 273 | 274 | subgraph "Local API Services" 275 | LLMEndpoint["LLM API (127.0.0.1:1234)"] 276 | TTSEndpoint["TTS API (localhost:5005)"] 277 | end 278 | 279 | subgraph "Storage" 280 | SessionFiles["Session JSON Files"] 281 | end 282 | 283 | AudioCapture -->|Audio Stream| WebSocket 284 | ImageUpload -->|Image Data| WebSocket 285 | SessionManager -->|Session Commands| WebSocket 286 | WebSocket <-->|WebSocket Protocol| WSServer 287 | WSServer --> VAD 288 | VAD -->|Audio with Speech| WhisperSTT 289 | WhisperSTT -->|Transcribed Text| LLMClient 290 | 291 | WebSocket -->|Image Data| WSServer 292 | WSServer -->|Process Image| VisionService 293 | VisionService -->|Image Description| LLMClient 294 | 295 | WebSocket -->|Session Operations| WSServer 296 | WSServer -->|Store/Load/List/Delete| StorageService 297 | StorageService <-->|Read/Write JSON| SessionFiles 298 | 299 | LLMClient -->|API Request| LLMEndpoint 300 | LLMEndpoint -->|Response Text| LLMClient 301 | LLMClient -->|Response Text| TTSClient 302 | TTSClient -->|API Request| TTSEndpoint 303 | TTSEndpoint -->|Audio Data| TTSClient 304 | TTSClient --> WSServer 305 | WSServer -->|Audio Response| WebSocket 306 | WebSocket --> AudioOutput 307 | EnvConfig -->|Configuration| WhisperSTT 308 | EnvConfig -->|Configuration| LLMClient 309 | EnvConfig -->|Configuration| TTSClient 310 | EnvConfig -->|Configuration| VisionService 311 | EnvConfig -->|Configuration| StorageService 312 | UIState <--> WebSocket 313 | ``` 314 | 315 | ## Detailed System Architecture 316 | 317 | The following diagram provides a comprehensive view of Vocalis's architecture, highlighting the advanced conversation features and interrupt handling systems that enable its natural conversational capabilities: 318 | 319 | ```mermaid 320 | graph TD 321 | %% Client Side 322 | subgraph "Frontend (React + TypeScript + Vite)" 323 | FE_Audio[Audio Capture/Playback] 324 | FE_WebSocket[WebSocket Client] 325 | FE_UI[UI Components] 326 | FE_State[State Management] 327 | FE_InterruptDetector[Interrupt Detector] 328 | FE_SilenceDetector[Silence Detector] 329 | FE_ImageUpload[Image Upload Handler] 330 | FE_SessionUI[Session Manager UI] 331 | 332 | subgraph "UI Components" 333 | UI_Orb[AssistantOrb] 334 | UI_Stars[BackgroundStars] 335 | UI_Chat[ChatInterface] 336 | UI_Prefs[PreferencesModal] 337 | UI_Sidebar[Sidebar] 338 | UI_Sessions[SessionManager] 339 | end 340 | 341 | subgraph "Services" 342 | FE_AudioService[Audio Service] 343 | FE_WebSocketService[WebSocket Service] 344 | end 345 | end 346 | 347 | %% Server Side 348 | subgraph "Backend (FastAPI + Python)" 349 | BE_Main[Main App] 350 | BE_Config[Configuration] 351 | BE_WebSocket[WebSocket Handler] 352 | BE_InterruptHandler[Interrupt Handler] 353 | BE_ConversationManager[Conversation Manager] 354 | 355 | subgraph 
"Services" 356 | BE_Transcription[Speech Transcription & VAD] 357 | BE_LLM[LLM Client] 358 | BE_TTS[TTS Client] 359 | BE_Vision[SmolVLM Vision Service] 360 | BE_Storage[Conversation Storage] 361 | end 362 | 363 | subgraph "Conversation Features" 364 | BE_GreetingSystem[AI Greeting System] 365 | BE_FollowUpSystem[Follow-Up Generator] 366 | BE_ContextMemory[Context Memory] 367 | BE_VisionContext[Image Context Manager] 368 | BE_SessionMgmt[Session Management] 369 | end 370 | end 371 | 372 | %% External Services & Storage 373 | subgraph "External Services" 374 | LLM_API[LM Studio OpenAI-compatible API] 375 | TTS_API[Orpheus-FASTAPI TTS] 376 | end 377 | 378 | subgraph "Persistent Storage" 379 | JSON_Files[Session JSON Files] 380 | end 381 | 382 | %% Data Flow - Main Path 383 | FE_Audio -->|Audio Stream| FE_AudioService 384 | FE_AudioService -->|Process Audio| FE_WebSocketService 385 | FE_WebSocketService -->|Binary Audio Data| FE_WebSocket 386 | FE_WebSocket <-->|WebSocket Protocol| BE_WebSocket 387 | 388 | BE_WebSocket -->|Audio Chunks| BE_Transcription 389 | BE_Transcription -->|Voice Activity Detection| BE_Transcription 390 | BE_Transcription -->|Transcribed Text| BE_ConversationManager 391 | BE_ConversationManager -->|Format Prompt| BE_LLM 392 | BE_LLM -->|API Request| LLM_API 393 | LLM_API -->|Response Text| BE_LLM 394 | BE_LLM -->|Response Text| BE_TTS 395 | BE_TTS -->|API Request| TTS_API 396 | TTS_API -->|Audio Data| BE_TTS 397 | BE_TTS -->|Processed Audio| BE_WebSocket 398 | 399 | BE_WebSocket -->|Audio Response| FE_WebSocket 400 | FE_WebSocket -->|Audio Data| FE_AudioService 401 | FE_AudioService -->|Playback| FE_Audio 402 | 403 | %% Session Management Flow 404 | FE_SessionUI -->|Save/Load/List/Delete| FE_WebSocketService 405 | FE_WebSocketService -->|Session Commands| FE_WebSocket 406 | FE_WebSocket -->|Session Operations| BE_WebSocket 407 | BE_WebSocket -->|Session Management| BE_SessionMgmt 408 | BE_SessionMgmt -->|Store/Retrieve| BE_Storage 409 | BE_Storage <-->|Persist Data| JSON_Files 410 | BE_Storage -->|Session Response| BE_WebSocket 411 | BE_WebSocket -->|Session Status| FE_WebSocket 412 | FE_WebSocket -->|Update UI| FE_SessionUI 413 | 414 | %% Vision Flow 415 | FE_ImageUpload -->|Image Data| FE_WebSocketService 416 | FE_WebSocketService -->|Image Base64| FE_WebSocket 417 | FE_WebSocket -->|Image Data| BE_WebSocket 418 | BE_WebSocket -->|Process Image| BE_Vision 419 | BE_Vision -->|Image Description| BE_VisionContext 420 | BE_VisionContext -->|Augmented Context| BE_ConversationManager 421 | 422 | %% Advanced Feature Paths 423 | 424 | %% 1. Interrupt System 425 | FE_Audio -->|Voice Activity| FE_InterruptDetector 426 | FE_InterruptDetector -->|Interrupt Signal| FE_WebSocket 427 | FE_WebSocket -->|Interrupt Command| BE_WebSocket 428 | BE_WebSocket -->|Cancel Processing| BE_InterruptHandler 429 | BE_InterruptHandler -.->|Stop Generation| BE_LLM 430 | BE_InterruptHandler -.->|Clear Buffer| BE_TTS 431 | BE_InterruptHandler -.->|Reset State| BE_ConversationManager 432 | 433 | %% 2. AI-Initiated Greetings 434 | BE_GreetingSystem -->|Initial Greeting| BE_ConversationManager 435 | BE_ConversationManager -->|Greeting Text| BE_LLM 436 | 437 | %% 3. Silence-based Follow-ups 438 | FE_SilenceDetector -->|Silence Detected| FE_WebSocket 439 | FE_WebSocket -->|Silence Notification| BE_WebSocket 440 | BE_WebSocket -->|Trigger Follow-up| BE_FollowUpSystem 441 | BE_FollowUpSystem -->|Generate Follow-up| BE_ConversationManager 442 | 443 | %% 4. 
Context Management 444 | BE_ConversationManager <-->|Store/Retrieve Context| BE_ContextMemory 445 | BE_SessionMgmt <-->|Save/Load Messages| BE_ContextMemory 446 | 447 | %% UI Interactions 448 | FE_State <-->|State Updates| FE_UI 449 | FE_WebSocketService -->|Connection Status| FE_State 450 | FE_AudioService -->|Audio Status| FE_State 451 | FE_InterruptDetector -->|Interrupt Status| FE_State 452 | FE_ImageUpload -->|Upload Status| FE_State 453 | 454 | %% Configuration 455 | BE_Config -->|Environment Settings| BE_Main 456 | BE_Config -->|API Settings| BE_LLM 457 | BE_Config -->|API Settings| BE_TTS 458 | BE_Config -->|Model Config| BE_Transcription 459 | BE_Config -->|Vision Settings| BE_Vision 460 | BE_Config -->|Storage Settings| BE_Storage 461 | BE_Config -->|Conversation Settings| BE_GreetingSystem 462 | BE_Config -->|Follow-up Settings| BE_FollowUpSystem 463 | 464 | %% UI Component Links 465 | FE_UI -->|Renders| UI_Orb 466 | UI_Orb -->|Visualises States| FE_State 467 | FE_UI -->|Renders| UI_Stars 468 | FE_UI -->|Renders| UI_Chat 469 | UI_Chat -->|Displays Transcript| FE_State 470 | FE_UI -->|Renders| UI_Prefs 471 | FE_UI -->|Renders| UI_Sidebar 472 | FE_UI -->|Renders| UI_Sessions 473 | UI_Sessions -->|Manages Sessions| FE_SessionUI 474 | 475 | %% Technology Labels 476 | classDef frontend fill:#61DAFB,color:#000,stroke:#61DAFB 477 | classDef backend fill:#009688,color:#fff,stroke:#009688 478 | classDef external fill:#FF9800,color:#000,stroke:#FF9800 479 | classDef feature fill:#E91E63,color:#fff,stroke:#E91E63 480 | classDef storage fill:#9C27B0,color:#fff,stroke:#9C27B0 481 | 482 | class FE_Audio,FE_WebSocket,FE_UI,FE_State,FE_AudioService,FE_WebSocketService,UI_Orb,UI_Stars,UI_Chat,UI_Prefs,UI_Sidebar,FE_ImageUpload,FE_SessionUI,UI_Sessions frontend 483 | class BE_Main,BE_Config,BE_WebSocket,BE_Transcription,BE_LLM,BE_TTS,BE_Vision,BE_Storage backend 484 | class LLM_API,TTS_API external 485 | class FE_InterruptDetector,FE_SilenceDetector,BE_InterruptHandler,BE_GreetingSystem,BE_FollowUpSystem,BE_ConversationManager,BE_ContextMemory,BE_VisionContext,BE_SessionMgmt feature 486 | class JSON_Files storage 487 | ``` 488 | 489 | ## Low-Latency TTS Streaming Architecture 490 | 491 | For achieving true low-latency in the speech system, we implement streaming TTS with chunked delivery and barge-in capability: 492 | 493 | ```mermaid 494 | sequenceDiagram 495 | participant Frontend 496 | participant AudioBuffer as Frontend Audio Buffer 497 | participant SilenceDetector as Frontend Silence Detector 498 | participant InterruptDetector as Frontend Interrupt Detector 499 | participant SessionMgr as Session Manager 500 | participant Backend as FastAPI Backend 501 | participant IntHandler as Backend Interrupt Handler 502 | participant Transcription as Speech Transcription & VAD 503 | participant VisionService as Vision Service (SmolVLM) 504 | participant StorageService as Conversation Storage 505 | participant LLM as LLM API (LM Studio) 506 | participant TTS as TTS API (Orpheus-FASTAPI) 507 | 508 | Note over Frontend,TTS: Normal Speech Flow 509 | 510 | Frontend->>Backend: Audio stream (chunks) 511 | Backend->>Transcription: Process audio 512 | Transcription->>Transcription: Voice activity detection 513 | Transcription->>Transcription: Speech-to-text 514 | Transcription->>Backend: Transcribed text 515 | Backend->>LLM: Text request with context 516 | activate LLM 517 | LLM-->>Backend: Text response (streaming) 518 | deactivate LLM 519 | Note over Backend: Begin TTS processing 520 | Backend->>TTS: 
Request TTS 521 | activate TTS 522 | 523 | %% Show parallel processing 524 | par Streaming audio playback 525 | TTS-->>Backend: Audio chunk 1 526 | Backend-->>Frontend: Audio chunk 1 527 | Frontend->>AudioBuffer: Queue chunk 528 | AudioBuffer->>Frontend: Begin playback 529 | 530 | TTS-->>Backend: Audio chunk 2 531 | Backend-->>Frontend: Audio chunk 2 532 | Frontend->>AudioBuffer: Queue chunk 533 | AudioBuffer->>Frontend: Continue playback 534 | 535 | TTS-->>Backend: Audio chunk n 536 | Backend-->>Frontend: Audio chunk n 537 | Frontend->>AudioBuffer: Queue chunk 538 | AudioBuffer->>Frontend: Continue playback 539 | end 540 | deactivate TTS 541 | 542 | Note over Frontend,TTS: Session Management Flow 543 | 544 | SessionMgr->>Backend: Save current session 545 | Backend->>StorageService: Store conversation 546 | StorageService-->>Backend: Session ID 547 | Backend-->>SessionMgr: Session saved confirmation 548 | 549 | SessionMgr->>Backend: Load specific session 550 | Backend->>StorageService: Retrieve session data 551 | StorageService-->>Backend: Conversation history 552 | Backend->>Backend: Restore conversation context 553 | Backend-->>SessionMgr: Session loaded confirmation 554 | 555 | Note over Frontend,TTS: Vision Processing Flow 556 | 557 | Frontend->>Backend: Upload image 558 | Backend->>VisionService: Process image 559 | activate VisionService 560 | VisionService-->>Backend: Image description 561 | deactivate VisionService 562 | Backend->>Backend: Add to conversation context 563 | Frontend->>Backend: Audio question about image 564 | Backend->>Transcription: Process audio 565 | Transcription->>Backend: Transcribed text 566 | Backend->>LLM: Text request with image context 567 | activate LLM 568 | LLM-->>Backend: Image-informed response 569 | deactivate LLM 570 | Backend->>TTS: Request TTS 571 | activate TTS 572 | TTS-->>Backend: Audio response 573 | Backend-->>Frontend: Stream audio response 574 | deactivate TTS 575 | 576 | Note over Frontend,TTS: Interrupt Flow (Barge-in) 577 | 578 | par Interrupt handling during speech 579 | Frontend->>InterruptDetector: User begins speaking 580 | InterruptDetector->>Frontend: Detect interrupt 581 | Frontend->>Backend: Send interrupt signal 582 | Backend->>IntHandler: Process interrupt 583 | 584 | IntHandler->>LLM: Cancel generation 585 | IntHandler->>TTS: Stop audio generation 586 | IntHandler->>Backend: Clear processing pipeline 587 | 588 | Backend->>Frontend: Stop audio signal 589 | Frontend->>AudioBuffer: Clear buffer 590 | AudioBuffer->>Frontend: Stop playback immediately 591 | end 592 | 593 | Note over Frontend,TTS: Silence Handling (AI Follow-ups) 594 | 595 | par AI-initiated follow-ups 596 | Frontend->>SilenceDetector: No user speech detected 597 | SilenceDetector->>Frontend: Silence timeout (3-5s) 598 | Frontend->>Backend: Silence notification 599 | Backend->>Backend: Generate follow-up 600 | Backend->>LLM: Request contextual follow-up 601 | activate LLM 602 | LLM-->>Backend: Follow-up response 603 | deactivate LLM 604 | Backend->>TTS: Convert to speech 605 | activate TTS 606 | TTS-->>Backend: Follow-up audio 607 | Backend-->>Frontend: Stream follow-up audio 608 | deactivate TTS 609 | Frontend->>AudioBuffer: Play follow-up 610 | end 611 | ``` 612 | 613 | ### Image Analysis Process 614 | 615 | Vocalis now includes visual understanding capabilities through the SmolVLM-256M-Instruct model: 616 | 617 | 1. 
**Image Upload**: 618 | - Users can click the vision button in the interface 619 | - A file picker allows selecting images up to 5MB 620 | - Images are encoded as base64 and sent to the backend 621 | 622 | 2. **Vision Processing**: 623 | - The SmolVLM model processes the image with transformers 624 | - The model generates a detailed description of the image contents 625 | - This description is added to the conversation context 626 | 627 | 3. **Contextual Continuation**: 628 | - After image processing, users can ask questions about the image 629 | - The system maintains awareness of the image context 630 | - Responses are generated with understanding of the visual content 631 | 632 | 4. **Multi-Modal Integration**: 633 | - The interface provides visual feedback during image processing 634 | - Transcripts and responses flow naturally between text and visual content 635 | - The conversation maintains coherence across modalities 636 | 637 | ### Streaming Architecture Features 638 | 639 | 1. **Parallel Processing**: 640 | - Simultaneous audio generation, transmission, and playback 641 | - Non-blocking pipeline for maximum responsiveness 642 | - Client-side buffer management with dynamic sizing 643 | 644 | 2. **Barge-in Capability**: 645 | - Real-time voice activity detection during AI speech 646 | - Multi-level interrupt system with priority handling 647 | - Immediate pipeline clearing for zero-latency response to interruptions 648 | 649 | 3. **Audio Buffer Management**: 650 | - Adaptive buffer sizes based on network conditions (20-50ms chunks) 651 | - Buffer health monitoring with automatic adjustments 652 | - Efficient audio format selection (Opus for compression, PCM for quality) 653 | 654 | 4. **Silence Response System**: 655 | - Time-based silence detection with configurable thresholds 656 | - Context-aware follow-up generation 657 | - Natural cadence for conversation flow maintenance 658 | 659 | ### Implementation Details: 660 | 661 | 1. **Backend TTS Integration**: 662 | - Configure TTS API with streaming support if available 663 | - Implement custom chunking if necessary 664 | 665 | 2. **Custom Streaming Implementation**: 666 | - Set up an async generator in FastAPI 667 | - Split audio into small chunks (10-50ms) 668 | - Send each chunk immediately through WebSocket 669 | 670 | 3. **WebSocket Protocol Enhancement**: 671 | - Add message types for different audio events: 672 | - `audio_chunk`: A piece of TTS audio to play immediately 673 | - `audio_start`: Signal to prepare audio context 674 | - `audio_end`: Signal that the complete utterance is finished 675 | 676 | 4. **Frontend Audio Handling**: 677 | - Use Web Audio API for low-latency playback 678 | - Implement buffer queue system for smooth playback 679 | 680 | ### Technical Considerations: 681 | 682 | 1. **Chunk Size Tuning**: 683 | - Find optimal balance between network overhead and latency 684 | 685 | 2. **Buffer Management**: 686 | - Avoid buffer underrun and excessive buffering 687 | 688 | 3. **Format Efficiency**: 689 | - Use efficient audio formats for streaming (Opus, WebM, or raw PCM) 690 | 691 | 4. **Abort Capability**: 692 | - Implement clean interruption for new user input 693 | 694 | ## Buffer Management Approach 695 | 696 | ### 1. Adaptive Buffer Sizing 697 | - Start with small buffers (20-30ms) 698 | - Monitor playback stability 699 | - Dynamically adjust buffer size based on network conditions 700 | 701 | ### 2. 
Parallel Processing Pipeline 702 | - Process audio in parallel streams where possible 703 | - Begin TTS playback as soon as first chunk is available 704 | - Continue processing subsequent chunks during playback 705 | 706 | ### 3. Interrupt Handling 707 | - Implement a "barge-in" capability where new user speech cancels ongoing TTS 708 | - Clear audio buffers immediately on interruption 709 | 710 | ## Latency Optimisation 711 | 712 | Vocalis achieves exceptional low-latency performance through carefully optimised components: 713 | 714 | ### Speech Recognition Performance 715 | 716 | The system uses Faster-Whisper with the `base.en` model and a beam size of 2, striking an optimal balance between accuracy and speed. This configuration achieves: 717 | 718 | - **ASR Processing**: ~0.43 seconds for typical utterances 719 | - **Response Generation**: ~0.18 seconds 720 | - **Total Round-Trip Latency**: ~0.61 seconds 721 | 722 | Real-world example from system logs: 723 | ``` 724 | INFO:faster_whisper:Processing audio with duration 00:02.229 725 | INFO:backend.services.transcription:Transcription completed in 0.51s: Hi, how are you doing today?... 726 | INFO:backend.services.tts:Sending TTS request with 147 characters of text 727 | INFO:backend.services.tts:Received TTS response after 0.16s, size: 390102 bytes 728 | ``` 729 | 730 | ### Customising Performance 731 | 732 | You can adjust these settings to optimise for your specific needs: 733 | 734 | 1. **Model Size**: In `.env`, modify `WHISPER_MODEL=base.en` 735 | - Options: tiny.en, base.en, small.en, medium.en, large 736 | - Smaller models = faster processing, potentially lower accuracy 737 | - Larger models = more accurate, but increased latency 738 | 739 | 2. **Beam Size**: In `backend/services/transcription.py`, modify the `beam_size` parameter 740 | - Default: 2 741 | - Range: 1-5 (1 = fastest, 5 = most accurate) 742 | - Located in the `__init__` method of the `WhisperTranscriber` class 743 | 744 | ### Latency vs. Accuracy Trade-offs 745 | 746 | | Model | Beam Size | Approximate ASR Time | Accuracy | 747 | |------|-----------|---------------------|----------| 748 | | tiny.en | 1 | ~0.01s | Lower | 749 | | base.en | 2 | ~0.03s | Good | 750 | | small.en | 3 | ~0.10s | Better | 751 | | medium.en | 4 | ~0.25s | Very Good | 752 | | large | 5 | ~0.50s | Best | 753 | 754 | With optimisations in place, Vocalis can achieve total processing latencies well under 250ms when using smaller models, which is typically perceived as "immediate" by users. 
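
The snippet below is a minimal sketch of how those two settings map onto the faster-whisper API; it is not the actual `WhisperTranscriber` (which also handles VAD and streaming buffers), and the class and variable names here are illustrative only:

```python
from faster_whisper import WhisperModel


class MinimalTranscriber:
    """Illustrative stand-in for backend/services/transcription.py."""

    def __init__(self, model_size: str = "base.en", beam_size: int = 2):
        # model_size corresponds to WHISPER_MODEL in .env; beam_size is the
        # speed/accuracy knob discussed above (1 = fastest, 5 = most accurate).
        self.model = WhisperModel(model_size, device="auto", compute_type="default")
        self.beam_size = beam_size

    def transcribe(self, audio_path: str) -> str:
        # faster-whisper returns a generator of segments plus transcription info
        segments, _info = self.model.transcribe(audio_path, beam_size=self.beam_size)
        return " ".join(segment.text.strip() for segment in segments)


# Favour raw speed over accuracy:
fast_transcriber = MinimalTranscriber(model_size="tiny.en", beam_size=1)
```

Dropping to `tiny.en` with `beam_size=1` mirrors the fastest row of the table above, at the cost of some accuracy.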
755 | 756 | ## Project Structure 757 | 758 | ``` 759 | Vocalis/ 760 | ├── README.md 761 | ├── setup.bat # Windows one-time setup script 762 | ├── run.bat # Windows run script 763 | ├── install-deps.bat # Windows dependency update script 764 | ├── setup.sh # Unix one-time setup script 765 | ├── run.sh # Unix run script 766 | ├── install-deps.sh # Unix dependency update script 767 | ├── conversations/ # Directory for saved session files 768 | ├── backend/ 769 | │ ├── .env 770 | │ ├── main.py 771 | │ ├── config.py 772 | │ ├── requirements.txt 773 | │ ├── services/ 774 | │ │ ├── __init__.py 775 | │ │ ├── conversation_storage.py 776 | │ │ ├── llm.py 777 | │ │ ├── transcription.py # Includes VAD functionality 778 | │ │ ├── tts.py 779 | │ │ ├── vision.py 780 | │ ├── routes/ 781 | │ │ ├── __init__.py 782 | │ │ ├── websocket.py 783 | ├── frontend/ 784 | │ ├── public/ 785 | │ ├── src/ 786 | │ │ ├── components/ 787 | │ │ │ ├── AssistantOrb.tsx 788 | │ │ │ ├── BackgroundStars.tsx 789 | │ │ │ ├── ChatInterface.tsx 790 | │ │ │ ├── PreferencesModal.tsx 791 | │ │ │ ├── SessionManager.tsx 792 | │ │ │ ├── Sidebar.tsx 793 | │ │ ├── services/ 794 | │ │ │ ├── audio.ts 795 | │ │ │ ├── websocket.ts 796 | │ │ ├── utils/ 797 | │ │ │ ├── hooks.ts 798 | │ │ ├── App.tsx 799 | │ │ ├── main.tsx 800 | │ │ ├── index.css 801 | │ │ ├── vite-env.d.ts 802 | │ ├── package.json 803 | │ ├── tsconfig.json 804 | │ ├── tsconfig.node.json 805 | │ ├── vite.config.ts 806 | │ ├── tailwind.config.js 807 | │ ├── postcss.config.js 808 | ``` 809 | 810 | ## Dependencies 811 | 812 | ### Backend (Python) 813 | ``` 814 | fastapi==0.109.2 815 | uvicorn==0.27.1 816 | python-dotenv==1.0.1 817 | websockets==12.0 818 | numpy==1.26.4 819 | transformers 820 | faster-whisper==1.1.1 821 | requests==2.31.0 822 | python-multipart==0.0.9 823 | torch==2.0.1 824 | ctranslate2==3.10.0 825 | ffmpeg-python==0.2.0 826 | ``` 827 | 828 | ### Frontend 829 | ``` 830 | react 831 | typescript 832 | tailwindcss 833 | lucide-react 834 | websocket 835 | web-audio-api 836 | ``` 837 | 838 | ## Technical Decisions 839 | 840 | - **Audio Format**: Web Audio API (44.1kHz, 16-bit PCM) 841 | - **Browser Compatibility**: Targeting modern Chrome browsers 842 | - **Error Handling**: Graceful degradation with user-friendly messages 843 | - **Microphone Permissions**: Standard browser permission flow with clear guidance 844 | - **Conversation Model**: Multi-turn with context preservation 845 | - **State Management**: React hooks with custom state machine 846 | - **Animation System**: CSS transitions with hardware acceleration 847 | - **Vision Processing**: SmolVLM-256M-Instruct for efficient image understanding 848 | - **Session Storage**: Asynchronous JSON file-based persistence with UUID identifiers 849 | 850 | ## License 851 | 852 | This project is licensed under the Apache License 2.0 - see the LICENSE file for details. 
-------------------------------------------------------------------------------- /backend/.env: -------------------------------------------------------------------------------- 1 | # Vocalis backend configuration 2 | 3 | # API Endpoints 4 | LLM_API_ENDPOINT=http://127.0.0.1:1234/v1/chat/completions # Place your local LLM API endpoint here (default is LM Studio) 5 | TTS_API_ENDPOINT=http://localhost:5005/v1/audio/speech # Place your local TTS API endpoint here (default is the Orpheus-FASTAPI native python launcher) - If you're using the Orpheus-FASTAPI Docker container instead of the native python launcher, replace "localhost" with "127.0.0.1" (keeping the :5005 port) 6 | 7 | # Whisper Model Configuration 8 | WHISPER_MODEL=base.en # Options: tiny.en, base.en, small.en, medium.en, large 9 | 10 | # TTS Configuration 11 | TTS_MODEL=tts-1 12 | TTS_VOICE=tara 13 | TTS_FORMAT=wav # Format for TTS output (wav, mp3, opus, flac) 14 | 15 | # WebSocket Server Configuration 16 | WEBSOCKET_HOST=0.0.0.0 17 | WEBSOCKET_PORT=8000 18 | 19 | # Audio Processing 20 | VAD_THRESHOLD=0.1 # Voice activity detection threshold (0.0-1.0) 21 | VAD_BUFFER_SIZE=30 # Buffer size in milliseconds 22 | AUDIO_SAMPLE_RATE=44100 # Sample rate in Hz 23 | -------------------------------------------------------------------------------- /backend/__init__.py: -------------------------------------------------------------------------------- 1 | # Backend package initialization 2 | # This file makes the 'backend' directory a Python package 3 | -------------------------------------------------------------------------------- /backend/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Vocalis Configuration Module 3 | 4 | Loads and provides access to configuration settings from environment variables 5 | and the .env file. 6 | """ 7 | 8 | import os 9 | from dotenv import load_dotenv 10 | from typing import Dict, Any 11 | 12 | # Load environment variables from .env file 13 | load_dotenv() 14 | 15 | # API Endpoints 16 | LLM_API_ENDPOINT = os.getenv("LLM_API_ENDPOINT", "http://127.0.0.1:1234/v1/chat/completions") 17 | TTS_API_ENDPOINT = os.getenv("TTS_API_ENDPOINT", "http://localhost:5005/v1/audio/speech") 18 | 19 | # Whisper Model Configuration 20 | WHISPER_MODEL = os.getenv("WHISPER_MODEL", "tiny.en") 21 | 22 | # TTS Configuration 23 | TTS_MODEL = os.getenv("TTS_MODEL", "tts-1") 24 | TTS_VOICE = os.getenv("TTS_VOICE", "tara") 25 | TTS_FORMAT = os.getenv("TTS_FORMAT", "wav") 26 | 27 | # WebSocket Server Configuration 28 | WEBSOCKET_HOST = os.getenv("WEBSOCKET_HOST", "0.0.0.0") 29 | WEBSOCKET_PORT = int(os.getenv("WEBSOCKET_PORT", 8000)) 30 | 31 | # Audio Processing 32 | VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", 0.5)) 33 | VAD_BUFFER_SIZE = int(os.getenv("VAD_BUFFER_SIZE", 30)) 34 | AUDIO_SAMPLE_RATE = int(os.getenv("AUDIO_SAMPLE_RATE", 48000)) 35 | 36 | def get_config() -> Dict[str, Any]: 37 | """ 38 | Returns all configuration settings as a dictionary. 
39 | 40 | Returns: 41 | Dict[str, Any]: Dictionary containing all configuration settings 42 | """ 43 | return { 44 | "llm_api_endpoint": LLM_API_ENDPOINT, 45 | "tts_api_endpoint": TTS_API_ENDPOINT, 46 | "whisper_model": WHISPER_MODEL, 47 | "tts_model": TTS_MODEL, 48 | "tts_voice": TTS_VOICE, 49 | "tts_format": TTS_FORMAT, 50 | "websocket_host": WEBSOCKET_HOST, 51 | "websocket_port": WEBSOCKET_PORT, 52 | "vad_threshold": VAD_THRESHOLD, 53 | "vad_buffer_size": VAD_BUFFER_SIZE, 54 | "audio_sample_rate": AUDIO_SAMPLE_RATE, 55 | } 56 | -------------------------------------------------------------------------------- /backend/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Vocalis Backend Server 3 | 4 | FastAPI application entry point. 5 | """ 6 | 7 | import logging 8 | import uvicorn 9 | from fastapi import FastAPI, WebSocket, Depends, HTTPException 10 | from fastapi.middleware.cors import CORSMiddleware 11 | from contextlib import asynccontextmanager 12 | 13 | # Import configuration 14 | from . import config 15 | 16 | # Import services 17 | from .services.transcription import WhisperTranscriber 18 | from .services.llm import LLMClient 19 | from .services.tts import TTSClient 20 | from .services.vision import vision_service 21 | 22 | # Import routes 23 | from .routes.websocket import websocket_endpoint 24 | 25 | # Configure logging 26 | logging.basicConfig( 27 | level=logging.INFO, 28 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 29 | ) 30 | logger = logging.getLogger(__name__) 31 | 32 | # Global service instances 33 | transcription_service = None 34 | llm_service = None 35 | tts_service = None 36 | # Vision service is a singleton already initialized in its module 37 | 38 | @asynccontextmanager 39 | async def lifespan(app: FastAPI): 40 | """ 41 | Startup and shutdown events for the FastAPI application. 
42 | """ 43 | # Load configuration 44 | cfg = config.get_config() 45 | 46 | # Initialize services on startup 47 | logger.info("Initializing services...") 48 | 49 | global transcription_service, llm_service, tts_service 50 | 51 | # Initialize transcription service 52 | transcription_service = WhisperTranscriber( 53 | model_size=cfg["whisper_model"], 54 | sample_rate=cfg["audio_sample_rate"] 55 | ) 56 | 57 | # Initialize LLM service 58 | llm_service = LLMClient( 59 | api_endpoint=cfg["llm_api_endpoint"] 60 | ) 61 | 62 | # Initialize TTS service 63 | tts_service = TTSClient( 64 | api_endpoint=cfg["tts_api_endpoint"], 65 | model=cfg["tts_model"], 66 | voice=cfg["tts_voice"], 67 | output_format=cfg["tts_format"] 68 | ) 69 | 70 | # Initialize vision service (will download model if not cached) 71 | logger.info("Initializing vision service...") 72 | vision_service.initialize() 73 | 74 | logger.info("All services initialized successfully") 75 | 76 | yield 77 | 78 | # Cleanup on shutdown 79 | logger.info("Shutting down services...") 80 | 81 | # No specific cleanup needed for these services, 82 | # but we could add resource release code here if needed (maybe in a future release lex 31/03/25) 83 | 84 | logger.info("Shutdown complete") 85 | 86 | # Create FastAPI application 87 | app = FastAPI( 88 | title="Vocalis Backend", 89 | description="Speech-to-Speech AI Assistant Backend", 90 | version="0.1.0", 91 | lifespan=lifespan 92 | ) 93 | 94 | # Configure CORS 95 | app.add_middleware( 96 | CORSMiddleware, 97 | allow_origins=["*"], # Allow all origins for development 98 | allow_credentials=True, 99 | allow_methods=["*"], 100 | allow_headers=["*"], 101 | ) 102 | 103 | # Service dependency functions 104 | def get_transcription_service(): 105 | return transcription_service 106 | 107 | def get_llm_service(): 108 | return llm_service 109 | 110 | def get_tts_service(): 111 | return tts_service 112 | 113 | # API routes 114 | @app.get("/") 115 | async def root(): 116 | """Root endpoint for health check.""" 117 | return {"status": "ok", "message": "Vocalis backend is running"} 118 | 119 | @app.get("/health") 120 | async def health_check(): 121 | """Health check endpoint.""" 122 | return { 123 | "status": "ok", 124 | "services": { 125 | "transcription": transcription_service is not None, 126 | "llm": llm_service is not None, 127 | "tts": tts_service is not None, 128 | "vision": vision_service.is_ready() 129 | }, 130 | "config": { 131 | "whisper_model": config.WHISPER_MODEL, 132 | "tts_voice": config.TTS_VOICE, 133 | "websocket_port": config.WEBSOCKET_PORT 134 | } 135 | } 136 | 137 | @app.get("/config") 138 | async def get_full_config(): 139 | """Get full configuration.""" 140 | if not all([transcription_service, llm_service, tts_service]) or not vision_service.is_ready(): 141 | raise HTTPException(status_code=503, detail="Services not initialized") 142 | 143 | return { 144 | "transcription": transcription_service.get_config(), 145 | "llm": llm_service.get_config(), 146 | "tts": tts_service.get_config(), 147 | "system": config.get_config() 148 | } 149 | 150 | # WebSocket route 151 | @app.websocket("/ws") 152 | async def websocket_route(websocket: WebSocket): 153 | """WebSocket endpoint for bidirectional audio streaming.""" 154 | await websocket_endpoint( 155 | websocket, 156 | transcription_service, 157 | llm_service, 158 | tts_service 159 | ) 160 | 161 | # Run server directly if executed as script 162 | if __name__ == "__main__": 163 | uvicorn.run( 164 | "backend.main:app", 165 | host=config.WEBSOCKET_HOST, 166 | 
port=config.WEBSOCKET_PORT, 167 | reload=True 168 | ) 169 | -------------------------------------------------------------------------------- /backend/prompts/system_prompt.md: -------------------------------------------------------------------------------- 1 | You are a helpful, friendly, and concise voice assistant. 2 | Respond to user queries in a natural, conversational manner. 3 | Keep responses brief and to the point, as you're communicating via voice. 4 | When providing information, focus on the most relevant details. 5 | If you don't know something, admit it rather than making up an answer. 6 | 7 | Through the webapp, you can receive and understand photographs and pictures. 8 | 9 | When the user sends a message like '[silent]', '[no response]', or '[still waiting]', it means they've gone quiet or haven't responded. When you see these signals, continue the conversation naturally based on the previous topic and context. Stay on topic, be helpful, and don't mention that they were silent - just carry on the conversation as if you're gently following up. 10 | -------------------------------------------------------------------------------- /backend/prompts/user_profile.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "", 3 | "preferences": {} 4 | } 5 | -------------------------------------------------------------------------------- /backend/prompts/vision_settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "enabled": false 3 | } 4 | -------------------------------------------------------------------------------- /backend/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.109.2 2 | uvicorn==0.27.1 3 | python-dotenv==1.0.1 4 | websockets==12.0 5 | numpy==1.26.4 6 | faster-whisper==1.1.1 7 | requests==2.31.0 8 | python-multipart==0.0.9 9 | torch>=2.0.1 10 | ffmpeg-python==0.2.0 11 | transformers>=4.31.0 12 | -------------------------------------------------------------------------------- /backend/routes/__init__.py: -------------------------------------------------------------------------------- 1 | # Routes package initialization 2 | # This file makes the 'routes' directory a Python package 3 | -------------------------------------------------------------------------------- /backend/services/__init__.py: -------------------------------------------------------------------------------- 1 | # Services package initialization 2 | # This file makes the 'services' directory a Python package 3 | -------------------------------------------------------------------------------- /backend/services/conversation_storage.py: -------------------------------------------------------------------------------- 1 | """ 2 | Conversation Storage Service 3 | 4 | Handles saving and loading conversation sessions to/from JSON files. 5 | """ 6 | 7 | import os 8 | import json 9 | import uuid 10 | import logging 11 | import asyncio # Import asyncio 12 | from typing import Dict, List, Optional, Any 13 | from datetime import datetime 14 | 15 | # Configure logging 16 | logging.basicConfig(level=logging.INFO) 17 | logger = logging.getLogger(__name__) 18 | 19 | class ConversationStorage: 20 | """ 21 | Service for storing and retrieving conversation sessions. 22 | """ 23 | 24 | def __init__(self, storage_dir: str = "conversations"): 25 | """ 26 | Initialize the conversation storage service. 
27 | 28 | Args: 29 | storage_dir: Directory to store conversation files 30 | """ 31 | self.storage_dir = storage_dir 32 | os.makedirs(self.storage_dir, exist_ok=True) 33 | logger.info(f"Initialized ConversationStorage with directory: {storage_dir}") 34 | 35 | async def save_session(self, messages: List[Dict], 36 | title: Optional[str] = None, 37 | session_id: Optional[str] = None, 38 | metadata: Optional[Dict[str, Any]] = None) -> str: 39 | """ 40 | Save a conversation session to a JSON file. 41 | 42 | Args: 43 | messages: List of conversation messages 44 | title: Optional title for the conversation (auto-generated if None) 45 | session_id: Optional ID for the session (new UUID if None) 46 | metadata: Optional metadata to store with the session 47 | 48 | Returns: 49 | str: The session ID 50 | """ 51 | # Generate ID if not provided 52 | if not session_id: 53 | session_id = str(uuid.uuid4()) 54 | 55 | # Generate title if not provided (from first user message or timestamp) 56 | if not title: 57 | # Try to find first user message 58 | for msg in messages: 59 | if msg.get('role') == 'user' and msg.get('content', '').strip(): 60 | # Use first ~30 chars of first user message 61 | title = msg['content'][:30] + ('...' if len(msg['content']) > 30 else '') 62 | break 63 | 64 | # Fallback to timestamp if no user messages found 65 | if not title: 66 | title = f"Conversation {datetime.now().strftime('%Y-%m-%d %H:%M')}" 67 | 68 | # Prepare session data 69 | now = datetime.now().isoformat() 70 | session = { 71 | "id": session_id, 72 | "title": title, 73 | "created_at": now, 74 | "updated_at": now, 75 | "messages": messages, 76 | "metadata": metadata or {} 77 | } 78 | 79 | # Define the synchronous file writing part 80 | def _write_file(): 81 | file_path = os.path.join(self.storage_dir, f"{session_id}.json") 82 | # Check if file exists to determine if created_at should be preserved 83 | created_time = now # Default to current time 84 | if os.path.exists(file_path): 85 | try: 86 | with open(file_path, 'r', encoding='utf-8') as f_read: 87 | existing_data = json.load(f_read) 88 | created_time = existing_data.get("created_at", now) # Use existing if found 89 | except Exception as read_err: 90 | logger.warning(f"Could not read existing session {session_id} to preserve created_at: {read_err}") 91 | pass # Ignore errors reading existing, just use 'now' 92 | 93 | session["created_at"] = created_time # Preserve original creation time or use current if new/error 94 | 95 | # Ensure directory exists (this is quick, maybe okay sync) 96 | # os.makedirs(os.path.dirname(file_path), exist_ok=True) # Already done in __init__ 97 | with open(file_path, 'w', encoding='utf-8') as f: 98 | json.dump(session, f, indent=2, ensure_ascii=False) 99 | 100 | # Run the synchronous file writing in a separate thread 101 | try: 102 | await asyncio.to_thread(_write_file) # Now _write_file is defined 103 | logger.info(f"Saved conversation session (async): {session_id}") 104 | return session_id 105 | except Exception as e: 106 | logger.error(f"Error writing session file {session_id}: {e}") 107 | raise # Re-raise the exception to be handled upstream 108 | 109 | async def load_session(self, session_id: str) -> Optional[Dict]: 110 | """ 111 | Load a conversation session from a JSON file. 
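        Like save_session above, the blocking JSON read here is pushed off the event loop with asyncio.to_thread. A minimal standalone sketch of that pattern, with "conversations/example.json" as a placeholder path:

            import asyncio
            import json

            def _read(path: str) -> dict:
                # Plain blocking file I/O, safe to run in a worker thread
                with open(path, "r", encoding="utf-8") as f:
                    return json.load(f)

            async def main() -> None:
                data = await asyncio.to_thread(_read, "conversations/example.json")
                print(data.get("title"))

            asyncio.run(main())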
112 | 113 | Args: 114 | session_id: ID of the session to load 115 | 116 | Returns: 117 | Optional[Dict]: The session data, or None if not found 118 | """ 119 | file_path = os.path.join(self.storage_dir, f"{session_id}.json") 120 | def _read_file(): 121 | if os.path.exists(file_path): 122 | with open(file_path, 'r', encoding='utf-8') as f: 123 | return json.load(f) 124 | return None 125 | 126 | try: 127 | session = await asyncio.to_thread(_read_file) 128 | if session: 129 | logger.info(f"Loaded conversation session (async): {session_id}") 130 | return session 131 | else: 132 | logger.warning(f"Session not found (async): {session_id}") 133 | return None 134 | except Exception as e: 135 | logger.error(f"Error loading session {session_id} (async): {e}") 136 | return None 137 | 138 | async def list_sessions(self) -> List[Dict]: 139 | """ 140 | List all available conversation sessions. 141 | 142 | Returns: 143 | List[Dict]: List of session metadata 144 | """ 145 | sessions = [] 146 | def _read_dir_and_files(): 147 | session_list = [] 148 | try: 149 | filenames = os.listdir(self.storage_dir) 150 | except Exception as e: 151 | logger.error(f"Error listing directory {self.storage_dir}: {e}") 152 | return [] # Return empty list if directory listing fails 153 | 154 | for filename in filenames: 155 | if filename.endswith('.json'): 156 | try: 157 | file_path = os.path.join(self.storage_dir, filename) 158 | with open(file_path, 'r', encoding='utf-8') as f: 159 | session_data = json.load(f) 160 | 161 | # Include only metadata for listing 162 | session_list.append({ 163 | "id": session_data.get("id"), 164 | "title": session_data.get("title"), 165 | "created_at": session_data.get("created_at"), 166 | "updated_at": session_data.get("updated_at"), 167 | "metadata": session_data.get("metadata", {}) 168 | }) 169 | except Exception as e: 170 | logger.error(f"Error loading session list item from {filename}: {e}") 171 | return session_list 172 | try: 173 | sessions = await asyncio.to_thread(_read_dir_and_files) 174 | # Sort by most recent first 175 | sessions.sort(key=lambda s: s.get("updated_at", ""), reverse=True) 176 | return sessions 177 | except Exception as e: 178 | logger.error(f"Error listing sessions (async): {e}") 179 | return [] 180 | 181 | async def delete_session(self, session_id: str) -> bool: 182 | """ 183 | Delete a conversation session. 184 | 185 | Args: 186 | session_id: ID of the session to delete 187 | 188 | Returns: 189 | bool: True if deleted successfully, False otherwise 190 | """ 191 | file_path = os.path.join(self.storage_dir, f"{session_id}.json") 192 | def _remove_file(): 193 | if os.path.exists(file_path): 194 | os.remove(file_path) 195 | return True 196 | return False 197 | 198 | try: 199 | deleted = await asyncio.to_thread(_remove_file) 200 | if deleted: 201 | logger.info(f"Deleted conversation session (async): {session_id}") 202 | return True 203 | else: 204 | logger.warning(f"Session not found for deletion (async): {session_id}") 205 | return False 206 | except Exception as e: 207 | logger.error(f"Error deleting session {session_id} (async): {e}") 208 | return False 209 | -------------------------------------------------------------------------------- /backend/services/llm.py: -------------------------------------------------------------------------------- 1 | """ 2 | LLM Service 3 | 4 | Handles communication with the local LLM API endpoint. 
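The endpoint is expected to speak the OpenAI chat-completions format. A rough sketch of the request this client builds and the field it reads back (the URL, model name and message contents are placeholders):

    import requests

    payload = {
        "model": "default",
        "messages": [
            {"role": "system", "content": "You are a concise voice assistant."},
            {"role": "user", "content": "Hello"},
        ],
        "temperature": 0.7,
        "max_tokens": 2048,
    }
    response = requests.post(
        "http://127.0.0.1:1234/v1/chat/completions", json=payload, timeout=60
    )
    text = response.json()["choices"][0]["message"]["content"]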
5 | """ 6 | 7 | import json 8 | import requests 9 | import logging 10 | from typing import Dict, Any, List, Optional 11 | 12 | # Configure logging 13 | logging.basicConfig(level=logging.INFO) 14 | logger = logging.getLogger(__name__) 15 | 16 | class LLMClient: 17 | """ 18 | Client for communicating with a local LLM API. 19 | 20 | This class handles requests to a locally hosted LLM API that follows 21 | the OpenAI API format. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | api_endpoint: str = "http://127.0.0.1:1234/v1/chat/completions", 27 | model: str = "default", 28 | temperature: float = 0.7, 29 | max_tokens: int = 2048, 30 | timeout: int = 60 31 | ): 32 | """ 33 | Initialize the LLM client. 34 | 35 | Args: 36 | api_endpoint: URL of the local LLM API 37 | model: Model name to use (or 'default' for API default) 38 | temperature: Sampling temperature (0.0 to 1.0) 39 | max_tokens: Maximum tokens to generate 40 | timeout: Request timeout in seconds 41 | """ 42 | self.api_endpoint = api_endpoint 43 | self.model = model 44 | self.temperature = temperature 45 | self.max_tokens = max_tokens 46 | self.timeout = timeout 47 | 48 | # State tracking 49 | self.is_processing = False 50 | self.conversation_history = [] 51 | 52 | logger.info(f"Initialized LLM Client with endpoint={api_endpoint}") 53 | 54 | def add_to_history(self, role: str, content: str) -> None: 55 | """ 56 | Add a message to the conversation history. 57 | 58 | Args: 59 | role: Message role ('system', 'user', or 'assistant') 60 | content: Message content 61 | """ 62 | self.conversation_history.append({ 63 | "role": role, 64 | "content": content 65 | }) 66 | 67 | # Allow deeper history for models with large context windows 68 | if len(self.conversation_history) > 50: 69 | # Always keep the system message if it exists 70 | if self.conversation_history[0]["role"] == "system": 71 | self.conversation_history = ( 72 | [self.conversation_history[0]] + 73 | self.conversation_history[-49:] 74 | ) 75 | else: 76 | self.conversation_history = self.conversation_history[-50:] 77 | 78 | def get_response(self, user_input: str, system_prompt: Optional[str] = None, 79 | add_to_history: bool = True, temperature: Optional[float] = None) -> Dict[str, Any]: 80 | """ 81 | Get a response from the LLM for the given user input. 
82 | 83 | Args: 84 | user_input: User's text input 85 | system_prompt: Optional system prompt to set context 86 | add_to_history: Whether to add this exchange to conversation history 87 | temperature: Optional temperature override (0.0 to 1.0) 88 | 89 | Returns: 90 | Dictionary containing the LLM response and metadata 91 | """ 92 | self.is_processing = True 93 | start_time = logging.Formatter.converter() 94 | 95 | try: 96 | # Prepare messages 97 | messages = [] 98 | 99 | # Add system prompt if provided and not already in history 100 | if system_prompt: 101 | messages.append({ 102 | "role": "system", 103 | "content": system_prompt 104 | }) 105 | 106 | # Add user input to history if it's not empty and add_to_history is True 107 | if user_input.strip() and add_to_history: 108 | self.add_to_history("user", user_input) 109 | 110 | # Add conversation history (which now includes the user input if add_to_history=True) 111 | messages.extend(self.conversation_history) 112 | 113 | # Only add user input directly if not adding to history 114 | # This ensures special cases (greetings/followups) work while preventing duplication for normal speech 115 | if user_input.strip() and not add_to_history: 116 | messages.append({ 117 | "role": "user", 118 | "content": user_input 119 | }) 120 | 121 | # Prepare request payload with custom temperature if provided 122 | payload = { 123 | "model": self.model if self.model != "default" else None, 124 | "messages": messages, 125 | "temperature": temperature if temperature is not None else self.temperature, 126 | "max_tokens": self.max_tokens 127 | } 128 | 129 | # Remove None values 130 | payload = {k: v for k, v in payload.items() if v is not None} 131 | 132 | # Log the full payload (truncated for readability) 133 | payload_str = json.dumps(payload) 134 | logger.info(f"Sending request to LLM API with {len(messages)} messages") 135 | 136 | # Add more detailed logging to help debug message duplication 137 | message_roles = [msg["role"] for msg in messages] 138 | user_message_count = message_roles.count("user") 139 | logger.info(f"Message roles: {message_roles}, user messages: {user_message_count}") 140 | 141 | if len(payload_str) > 500: 142 | logger.debug(f"Payload (truncated): {payload_str[:500]}...") 143 | else: 144 | logger.debug(f"Payload: {payload_str}") 145 | 146 | # Send request to LLM API 147 | response = requests.post( 148 | self.api_endpoint, 149 | json=payload, 150 | timeout=self.timeout 151 | ) 152 | 153 | # Check if request was successful 154 | response.raise_for_status() 155 | 156 | # Parse response 157 | result = response.json() 158 | 159 | # Extract assistant response 160 | assistant_message = result.get("choices", [{}])[0].get("message", {}).get("content", "") 161 | 162 | # Add assistant response to history (only if we added the user input) 163 | if assistant_message and add_to_history: 164 | self.add_to_history("assistant", assistant_message) 165 | 166 | # Calculate processing time 167 | end_time = logging.Formatter.converter() 168 | processing_time = end_time[0] - start_time[0] 169 | 170 | logger.info(f"Received response from LLM API after {processing_time:.2f}s") 171 | 172 | return { 173 | "text": assistant_message, 174 | "processing_time": processing_time, 175 | "finish_reason": result.get("choices", [{}])[0].get("finish_reason"), 176 | "model": result.get("model", "unknown") 177 | } 178 | 179 | except requests.RequestException as e: 180 | logger.error(f"LLM API request error: {e}") 181 | error_response = f"I'm sorry, I encountered a problem 
connecting to my language model. {str(e)}" 182 | 183 | # Add the error to history if requested and clear history on 400 errors 184 | # to prevent the same error from happening repeatedly 185 | if add_to_history: 186 | self.add_to_history("assistant", error_response) 187 | 188 | # If we get a 400 Bad Request, the context might be corrupt 189 | if isinstance(e, requests.exceptions.HTTPError) and e.response.status_code == 400: 190 | logger.warning("Received 400 error, clearing conversation history to recover") 191 | # Keep only system prompt if it exists 192 | self.clear_history(keep_system_prompt=True) 193 | 194 | return { 195 | "text": error_response, 196 | "error": str(e) 197 | } 198 | except Exception as e: 199 | logger.error(f"LLM processing error: {e}") 200 | error_response = "I'm sorry, I encountered an unexpected error. Please try again." 201 | self.add_to_history("assistant", error_response) 202 | return { 203 | "text": error_response, 204 | "error": str(e) 205 | } 206 | finally: 207 | self.is_processing = False 208 | 209 | def clear_history(self, keep_system_prompt: bool = True) -> None: 210 | """ 211 | Clear conversation history. 212 | 213 | Args: 214 | keep_system_prompt: Whether to keep the system prompt if it exists 215 | """ 216 | if keep_system_prompt and self.conversation_history and self.conversation_history[0]["role"] == "system": 217 | self.conversation_history = [self.conversation_history[0]] 218 | else: 219 | self.conversation_history = [] 220 | 221 | def get_config(self) -> Dict[str, Any]: 222 | """ 223 | Get the current configuration. 224 | 225 | Returns: 226 | Dict containing the current configuration 227 | """ 228 | return { 229 | "api_endpoint": self.api_endpoint, 230 | "model": self.model, 231 | "temperature": self.temperature, 232 | "max_tokens": self.max_tokens, 233 | "timeout": self.timeout, 234 | "is_processing": self.is_processing, 235 | "history_length": len(self.conversation_history) 236 | } 237 | -------------------------------------------------------------------------------- /backend/services/transcription.py: -------------------------------------------------------------------------------- 1 | """ 2 | Speech-to-Text Transcription Service 3 | 4 | Uses Faster Whisper to transcribe speech audio. 5 | """ 6 | 7 | import numpy as np 8 | import logging 9 | import io # For BytesIO 10 | from typing import Dict, Any, List, Optional, Tuple 11 | from faster_whisper import WhisperModel 12 | import time 13 | import torch # For CUDA availability check 14 | 15 | # Configure logging 16 | logging.basicConfig(level=logging.INFO) 17 | logger = logging.getLogger(__name__) 18 | 19 | class WhisperTranscriber: 20 | """ 21 | Speech-to-Text service using Faster Whisper. 22 | 23 | This class handles transcription of speech audio segments. 24 | """ 25 | 26 | def __init__( 27 | self, 28 | model_size: str = "base", 29 | device: str = None, 30 | compute_type: str = None, 31 | beam_size: int = 2, 32 | sample_rate: int = 44100 33 | ): 34 | """ 35 | Initialize the transcription service. 
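        Illustrative usage (assumes mono float32 audio in a NumPy array; the one-second silent clip is only a stand-in):

            import numpy as np

            transcriber = WhisperTranscriber(model_size="base", sample_rate=16000)
            audio = np.zeros(16000, dtype=np.float32)   # 1 s of silence at 16 kHz
            text, meta = transcriber.transcribe(audio)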
36 | 37 | Args: 38 | model_size: Whisper model size (tiny.en, base.en, small.en, medium.en, large) 39 | device: Device to run model on ('cpu' or 'cuda'), if None will auto-detect 40 | compute_type: Model computation type (int8, int16, float16, float32), if None will select based on device 41 | beam_size: Beam size for decoding 42 | sample_rate: Audio sample rate in Hz 43 | """ 44 | self.model_size = model_size 45 | 46 | # Auto-detect device if not specified 47 | if device is None: 48 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 49 | else: 50 | self.device = device 51 | 52 | # Select appropriate compute type based on device if not specified 53 | if compute_type is None: 54 | self.compute_type = "float16" if self.device == "cuda" else "int8" 55 | else: 56 | self.compute_type = compute_type 57 | 58 | self.beam_size = beam_size 59 | self.sample_rate = sample_rate 60 | 61 | # Initialize model 62 | self._initialize_model() 63 | 64 | # State tracking 65 | self.is_processing = False 66 | 67 | logger.info(f"Initialized Whisper Transcriber with model={model_size}, " 68 | f"device={self.device}, compute_type={self.compute_type}") 69 | 70 | def _initialize_model(self): 71 | """Initialize Whisper model.""" 72 | try: 73 | # Load the model 74 | self.model = WhisperModel( 75 | self.model_size, # Pass as positional argument, not keyword 76 | device=self.device, 77 | compute_type=self.compute_type 78 | ) 79 | logger.info(f"Successfully loaded Whisper model: {self.model_size}") 80 | except Exception as e: 81 | logger.error(f"Failed to load Whisper model: {e}") 82 | raise 83 | 84 | def transcribe(self, audio: np.ndarray) -> Tuple[str, Dict[str, Any]]: 85 | """ 86 | Transcribe audio data to text. 87 | 88 | Args: 89 | audio: Audio data as numpy array 90 | 91 | Returns: 92 | Tuple[str, Dict[str, Any]]: 93 | - Transcribed text 94 | - Dictionary with additional information (confidence, language, etc.) 
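        Example of the two input forms this method accepts (illustrative; "clip.wav" is a placeholder path and `transcriber` is an already-constructed instance):

            import numpy as np

            # 1) Raw float samples - peak-normalised before transcription
            samples = np.random.randn(16000).astype(np.float32)
            text, meta = transcriber.transcribe(samples)

            # 2) A complete WAV file passed as raw bytes wrapped in a uint8 array
            with open("clip.wav", "rb") as f:
                wav = np.frombuffer(f.read(), dtype=np.uint8)
            text, meta = transcriber.transcribe(wav)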
95 | """ 96 | start_time = time.time() 97 | self.is_processing = True 98 | 99 | try: 100 | # Handle WAV data (if audio is in uint8 format, it contains WAV headers) 101 | if audio.dtype == np.uint8: 102 | # First check the RIFF header to confirm this is WAV data 103 | header = bytes(audio[:44]) 104 | if header[:4] == b'RIFF' and header[8:12] == b'WAVE': 105 | # Create a file-like object that Whisper can read from 106 | audio_file = io.BytesIO(bytes(audio)) 107 | # The transcribe method expects a file-like object with read method 108 | audio = audio_file 109 | else: 110 | # Not a proper WAV header 111 | logger.warning("Received audio data with incorrect WAV header") 112 | # Attempt to process as raw data 113 | audio = audio.astype(np.float32) / np.max(np.abs(audio)) if np.max(np.abs(audio)) > 0 else audio 114 | else: 115 | # Normalize audio if it's raw float data 116 | audio = audio.astype(np.float32) / np.max(np.abs(audio)) if np.max(np.abs(audio)) > 0 else audio 117 | 118 | # Transcribe 119 | segments, info = self.model.transcribe( 120 | audio, 121 | beam_size=self.beam_size, 122 | language="en", # Force English language 123 | vad_filter=False # Disable VAD filter since we handle it in the frontend 124 | ) 125 | 126 | # Collect all segment texts 127 | text_segments = [segment.text for segment in segments] 128 | full_text = " ".join(text_segments).strip() 129 | 130 | # Calculate processing time 131 | processing_time = time.time() - start_time 132 | logger.info(f"Transcription completed in {processing_time:.2f}s: {full_text[:50]}...") 133 | 134 | metadata = { 135 | "confidence": getattr(info, "avg_logprob", 0), 136 | "language": getattr(info, "language", "en"), 137 | "processing_time": processing_time, 138 | "segments_count": len(text_segments) 139 | } 140 | 141 | return full_text, metadata 142 | 143 | except Exception as e: 144 | logger.error(f"Transcription error: {e}") 145 | return "", {"error": str(e)} 146 | finally: 147 | self.is_processing = False 148 | 149 | def transcribe_streaming(self, audio_generator): 150 | """ 151 | Stream transcription results from an audio generator. 152 | 153 | Args: 154 | audio_generator: Generator yielding audio chunks 155 | 156 | Yields: 157 | Partial transcription results as they become available 158 | """ 159 | self.is_processing = True 160 | 161 | try: 162 | # Process the streaming transcription 163 | segments = self.model.transcribe_with_vad( 164 | audio_generator, 165 | language="en" 166 | ) 167 | 168 | # Yield each segment as it's transcribed 169 | for segment in segments: 170 | yield { 171 | "text": segment.text, 172 | "start": segment.start, 173 | "end": segment.end, 174 | "confidence": segment.avg_logprob 175 | } 176 | 177 | except Exception as e: 178 | logger.error(f"Streaming transcription error: {e}") 179 | yield {"error": str(e)} 180 | finally: 181 | self.is_processing = False 182 | 183 | def get_config(self) -> Dict[str, Any]: 184 | """ 185 | Get the current configuration. 
186 | 187 | Returns: 188 | Dict containing the current configuration 189 | """ 190 | return { 191 | "model_size": self.model_size, 192 | "device": self.device, 193 | "compute_type": self.compute_type, 194 | "beam_size": self.beam_size, 195 | "sample_rate": self.sample_rate, 196 | "is_processing": self.is_processing 197 | } 198 | -------------------------------------------------------------------------------- /backend/services/tts.py: -------------------------------------------------------------------------------- 1 | """ 2 | Text-to-Speech Service 3 | 4 | Handles communication with the local TTS API endpoint. 5 | """ 6 | 7 | import json 8 | import requests 9 | import logging 10 | import io 11 | import time 12 | import base64 13 | import asyncio 14 | from typing import Dict, Any, List, Optional, BinaryIO, Generator, AsyncGenerator 15 | 16 | # Configure logging 17 | logging.basicConfig(level=logging.INFO) 18 | logger = logging.getLogger(__name__) 19 | 20 | class TTSClient: 21 | """ 22 | Client for communicating with a local TTS API. 23 | 24 | This class handles requests to a locally hosted TTS API that follows 25 | the OpenAI API format for text-to-speech generation. 26 | """ 27 | 28 | def __init__( 29 | self, 30 | api_endpoint: str = "http://localhost:5005/v1/audio/speech", 31 | model: str = "tts-1", 32 | voice: str = "tara", 33 | output_format: str = "wav", 34 | speed: float = 1.0, 35 | timeout: int = 60, 36 | chunk_size: int = 4096 37 | ): 38 | """ 39 | Initialize the TTS client. 40 | 41 | Args: 42 | api_endpoint: URL of the local TTS API 43 | model: TTS model name to use 44 | voice: Voice to use for synthesis 45 | output_format: Output audio format (mp3, opus, aac, flac) 46 | speed: Speech speed multiplier (0.25 to 4.0) 47 | timeout: Request timeout in seconds 48 | chunk_size: Size of audio chunks to stream in bytes 49 | """ 50 | self.api_endpoint = api_endpoint 51 | self.model = model 52 | self.voice = voice 53 | self.output_format = output_format 54 | self.speed = speed 55 | self.timeout = timeout 56 | self.chunk_size = chunk_size 57 | 58 | # State tracking 59 | self.is_processing = False 60 | self.last_processing_time = 0 61 | 62 | logger.info(f"Initialized TTS Client with endpoint={api_endpoint}, " 63 | f"model={model}, voice={voice}") 64 | 65 | def text_to_speech(self, text: str) -> bytes: 66 | """ 67 | Convert text to speech audio. 
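        Illustrative usage (endpoint and voice mirror the defaults above; "reply.wav" is a placeholder output path):

            tts = TTSClient(
                api_endpoint="http://localhost:5005/v1/audio/speech",
                voice="tara",
                output_format="wav",
            )
            audio_bytes = tts.text_to_speech("Hello! How can I help today?")
            with open("reply.wav", "wb") as f:
                f.write(audio_bytes)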
68 | 69 | Args: 70 | text: Text to convert to speech 71 | 72 | Returns: 73 | Audio data as bytes 74 | """ 75 | self.is_processing = True 76 | start_time = time.time() 77 | 78 | try: 79 | # Prepare request payload 80 | payload = { 81 | "model": self.model, 82 | "input": text, 83 | "voice": self.voice, 84 | "response_format": self.output_format, 85 | "speed": self.speed 86 | } 87 | 88 | logger.info(f"Sending TTS request with {len(text)} characters of text") 89 | 90 | # Send request to TTS API 91 | response = requests.post( 92 | self.api_endpoint, 93 | json=payload, 94 | timeout=self.timeout 95 | ) 96 | 97 | # Check if request was successful 98 | response.raise_for_status() 99 | 100 | # Get audio content 101 | audio_data = response.content 102 | 103 | # Calculate processing time 104 | self.last_processing_time = time.time() - start_time 105 | 106 | logger.info(f"Received TTS response after {self.last_processing_time:.2f}s, " 107 | f"size: {len(audio_data)} bytes") 108 | 109 | return audio_data 110 | 111 | except requests.RequestException as e: 112 | logger.error(f"TTS API request error: {e}") 113 | raise 114 | except Exception as e: 115 | logger.error(f"TTS processing error: {e}") 116 | raise 117 | finally: 118 | self.is_processing = False 119 | 120 | def stream_text_to_speech(self, text: str) -> Generator[bytes, None, None]: 121 | """ 122 | Stream audio data from the TTS API. 123 | 124 | Args: 125 | text: Text to convert to speech 126 | 127 | Yields: 128 | Chunks of audio data 129 | """ 130 | self.is_processing = True 131 | start_time = time.time() 132 | 133 | try: 134 | # Prepare request payload 135 | payload = { 136 | "model": self.model, 137 | "input": text, 138 | "voice": self.voice, 139 | "response_format": self.output_format, 140 | "speed": self.speed 141 | } 142 | 143 | logger.info(f"Sending streaming TTS request with {len(text)} characters of text") 144 | 145 | # Send request to TTS API 146 | with requests.post( 147 | self.api_endpoint, 148 | json=payload, 149 | timeout=self.timeout, 150 | stream=True 151 | ) as response: 152 | response.raise_for_status() 153 | 154 | # Check if streaming is supported by the API 155 | is_chunked = response.headers.get('transfer-encoding', '') == 'chunked' 156 | 157 | if is_chunked: 158 | # The API supports streaming 159 | for chunk in response.iter_content(chunk_size=self.chunk_size): 160 | if chunk: 161 | yield chunk 162 | else: 163 | # The API doesn't support streaming, but we'll fake it by 164 | # splitting the response into chunks 165 | audio_data = response.content 166 | total_chunks = (len(audio_data) + self.chunk_size - 1) // self.chunk_size 167 | 168 | for i in range(total_chunks): 169 | start_idx = i * self.chunk_size 170 | end_idx = min(start_idx + self.chunk_size, len(audio_data)) 171 | yield audio_data[start_idx:end_idx] 172 | 173 | # Calculate processing time 174 | self.last_processing_time = time.time() - start_time 175 | logger.info(f"Completed TTS streaming after {self.last_processing_time:.2f}s") 176 | 177 | except requests.RequestException as e: 178 | logger.error(f"TTS API streaming request error: {e}") 179 | raise 180 | except Exception as e: 181 | logger.error(f"TTS streaming error: {e}") 182 | raise 183 | finally: 184 | self.is_processing = False 185 | 186 | async def async_text_to_speech(self, text: str) -> bytes: 187 | """ 188 | Asynchronously generate audio data from the TTS API. 189 | 190 | This method provides asynchronous TTS capability by running 191 | the synchronous method in a thread. 
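        Illustrative sketch of calling this from async code (the surrounding async WebSocket handler is assumed):

            async def speak(tts: TTSClient, text: str) -> bytes:
                # asyncio.to_thread keeps the blocking HTTP request off the event loop
                return await tts.async_text_to_speech(text)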
192 | 193 | Args: 194 | text: Text to convert to speech 195 | 196 | Returns: 197 | Complete audio data as bytes 198 | """ 199 | self.is_processing = True 200 | 201 | try: 202 | # Get complete audio data 203 | audio_data = await asyncio.to_thread(self.text_to_speech, text) 204 | return audio_data 205 | except Exception as e: 206 | logger.error(f"Async TTS error: {e}") 207 | raise 208 | finally: 209 | self.is_processing = False 210 | 211 | def get_config(self) -> Dict[str, Any]: 212 | """ 213 | Get the current configuration. 214 | 215 | Returns: 216 | Dict containing the current configuration 217 | """ 218 | return { 219 | "api_endpoint": self.api_endpoint, 220 | "model": self.model, 221 | "voice": self.voice, 222 | "output_format": self.output_format, 223 | "speed": self.speed, 224 | "timeout": self.timeout, 225 | "chunk_size": self.chunk_size, 226 | "is_processing": self.is_processing, 227 | "last_processing_time": self.last_processing_time 228 | } 229 | -------------------------------------------------------------------------------- /backend/services/vision.py: -------------------------------------------------------------------------------- 1 | """ 2 | Vision service for image processing using SmolVLM 3 | 4 | Handles loading and initializing the vision model for image understanding. 5 | """ 6 | 7 | import logging 8 | from transformers import AutoProcessor, AutoModelForVision2Seq 9 | 10 | # Configure logging 11 | logging.basicConfig(level=logging.INFO) 12 | logger = logging.getLogger(__name__) 13 | 14 | class VisionService: 15 | """ 16 | Service for processing images with vision models. 17 | Currently uses SmolVLM-256M-Instruct for lightweight image understanding. 18 | """ 19 | 20 | def __init__(self): 21 | """Initialize the service with empty model references.""" 22 | self.processor = None 23 | self.model = None 24 | self.initialized = False 25 | self.model_name = "HuggingFaceTB/SmolVLM-256M-Instruct" 26 | self.default_prompt = "Describe this image in detail. Include information about objects, people, scenes, text, and any notable elements." 27 | 28 | def initialize(self): 29 | """ 30 | Initialize the model, downloading it if necessary. 31 | This will be called on server startup. 32 | 33 | Returns: 34 | bool: Whether initialization was successful 35 | """ 36 | if self.initialized: 37 | logger.info("Vision model already initialized") 38 | return True 39 | 40 | try: 41 | import torch 42 | 43 | # Determine device (use CUDA if available) 44 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 45 | logger.info(f"Using device for vision model: {self.device}") 46 | 47 | logger.info(f"Loading vision model {self.model_name} (this may take a while on first run)...") 48 | 49 | # These calls will trigger the download if the model isn't cached locally 50 | self.processor = AutoProcessor.from_pretrained(self.model_name) 51 | self.model = AutoModelForVision2Seq.from_pretrained(self.model_name) 52 | 53 | # Move model to GPU if available 54 | self.model = self.model.to(self.device) 55 | 56 | self.initialized = True 57 | logger.info(f"Vision model loaded successfully on {self.device}") 58 | return True 59 | except Exception as e: 60 | logger.error(f"Error loading vision model: {e}") 61 | return False 62 | 63 | def process_image(self, image_base64: str, prompt: str = None): 64 | """ 65 | Process an image with SmolVLM and return a description. 
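        Illustrative usage (assumes vision_service.initialize() has already run, as the backend does at startup; "photo.jpg" is a placeholder path):

            import base64

            with open("photo.jpg", "rb") as f:
                image_b64 = base64.b64encode(f.read()).decode("utf-8")

            description = vision_service.process_image(
                image_b64,
                prompt="What is shown in this photo?",
            )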
66 | 67 | Args: 68 | image_base64: Base64-encoded image data 69 | prompt: Prompt to guide image description (uses default if None) 70 | 71 | Returns: 72 | str: Image description 73 | """ 74 | if not self.is_ready(): 75 | raise RuntimeError("Vision model not initialized") 76 | 77 | try: 78 | # Decode base64 image 79 | import base64 80 | from io import BytesIO 81 | from PIL import Image 82 | import torch 83 | 84 | # Use default prompt if none provided 85 | if prompt is None: 86 | prompt = self.default_prompt 87 | 88 | # Format the prompt to include the token 89 | formatted_prompt = f"User uploaded this image: \n{prompt}" 90 | 91 | # Convert base64 to image 92 | image_data = base64.b64decode(image_base64) 93 | image = Image.open(BytesIO(image_data)).convert('RGB') 94 | 95 | # Prepare inputs for the model with the correct token format 96 | inputs = self.processor(text=[formatted_prompt], images=[image], return_tensors="pt") 97 | 98 | # Move inputs to the same device as the model 99 | inputs = {k: v.to(self.device) for k, v in inputs.items()} 100 | 101 | # Generate description 102 | with torch.no_grad(): 103 | output_ids = self.model.generate( 104 | **inputs, 105 | max_new_tokens=256, 106 | do_sample=False 107 | ) 108 | 109 | # Decode the output 110 | description = self.processor.batch_decode(output_ids, skip_special_tokens=True)[0] 111 | 112 | return description.strip() 113 | 114 | except Exception as e: 115 | logger.error(f"Error processing image with vision model: {e}") 116 | return f"Error analyzing image: {str(e)}" 117 | 118 | def is_ready(self): 119 | """ 120 | Check if the model is initialized and ready. 121 | 122 | Returns: 123 | bool: Whether the model is ready for use 124 | """ 125 | return self.initialized 126 | 127 | # Create singleton instance 128 | vision_service = VisionService() 129 | -------------------------------------------------------------------------------- /docs/Demonstration_Video.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lex-au/Vocalis/136fea2d32cf55eaf4b2f368cad8ecdad07bb4be/docs/Demonstration_Video.png -------------------------------------------------------------------------------- /docs/Vocalis_Demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lex-au/Vocalis/136fea2d32cf55eaf4b2f368cad8ecdad07bb4be/docs/Vocalis_Demo.png -------------------------------------------------------------------------------- /docs/Vocalis_Header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lex-au/Vocalis/136fea2d32cf55eaf4b2f368cad8ecdad07bb4be/docs/Vocalis_Header.png -------------------------------------------------------------------------------- /docs/Vocalis_Visual_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lex-au/Vocalis/136fea2d32cf55eaf4b2f368cad8ecdad07bb4be/docs/Vocalis_Visual_demo.gif -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Vocalis - Demo Assets 7 | 39 | 40 | 41 |

Vocalis - Demo Assets

42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 57 | 58 | 59 | 60 | 64 | 65 | 66 | 67 | 71 | 72 | 73 |
File Type File
Project Banner (PNG) 54 | Vocalis_Demo.png
55 | Vocalis Banner 56 |
Project Header (PNG) 61 | Vocalis_Header.png
62 | Vocalis Header Image 63 |
Visual Demonstration (GIF) 68 | Vocalis_Visual_demo.gif
69 | Vocalis Visual Demo 70 |
74 | 75 | -------------------------------------------------------------------------------- /frontend/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lex-au/Vocalis/136fea2d32cf55eaf4b2f368cad8ecdad07bb4be/frontend/favicon.ico -------------------------------------------------------------------------------- /frontend/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Vocalis - AI Speech Assistant 8 | 9 | 10 | 11 |
12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "vocalis-frontend", 3 | "private": true, 4 | "version": "0.1.0", 5 | "type": "module", 6 | "scripts": { 7 | "dev": "vite", 8 | "build": "tsc && vite build", 9 | "lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0", 10 | "preview": "vite preview" 11 | }, 12 | "dependencies": { 13 | "lucide-react": "^0.344.0", 14 | "react": "^18.3.1", 15 | "react-dom": "^18.3.1" 16 | }, 17 | "devDependencies": { 18 | "@types/react": "^18.3.5", 19 | "@types/react-dom": "^18.3.0", 20 | "@typescript-eslint/eslint-plugin": "^7.2.0", 21 | "@typescript-eslint/parser": "^7.2.0", 22 | "@vitejs/plugin-react": "^4.3.1", 23 | "autoprefixer": "^10.4.18", 24 | "eslint": "^8.57.0", 25 | "eslint-plugin-react-hooks": "^4.6.0", 26 | "eslint-plugin-react-refresh": "^0.4.5", 27 | "postcss": "^8.4.35", 28 | "tailwindcss": "^3.4.1", 29 | "typescript": "^5.5.3", 30 | "vite": "^5.4.2" 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /frontend/postcss.config.js: -------------------------------------------------------------------------------- 1 | export default { 2 | plugins: { 3 | tailwindcss: {}, 4 | autoprefixer: {}, 5 | }, 6 | } 7 | -------------------------------------------------------------------------------- /frontend/src/App.tsx: -------------------------------------------------------------------------------- 1 | import React, { useState, useEffect } from 'react'; 2 | import ChatInterface from './components/ChatInterface'; 3 | import Sidebar from './components/Sidebar'; 4 | import { Menu } from 'lucide-react'; 5 | import websocketService, { ConnectionState } from './services/websocket'; 6 | 7 | function App() { 8 | const [isSidebarOpen, setIsSidebarOpen] = useState(false); 9 | const [isConnected, setIsConnected] = useState(false); 10 | 11 | // Track WebSocket connection 12 | useEffect(() => { 13 | const handleConnectionChange = () => { 14 | const state = websocketService.getConnectionState(); 15 | setIsConnected(state === ConnectionState.CONNECTED); 16 | }; 17 | 18 | // Set up event listeners 19 | websocketService.addEventListener('open', handleConnectionChange); 20 | websocketService.addEventListener('close', handleConnectionChange); 21 | websocketService.addEventListener('error', handleConnectionChange); 22 | 23 | // Initial check 24 | handleConnectionChange(); 25 | 26 | // Cleanup 27 | return () => { 28 | websocketService.removeEventListener('open', handleConnectionChange); 29 | websocketService.removeEventListener('close', handleConnectionChange); 30 | websocketService.removeEventListener('error', handleConnectionChange); 31 | }; 32 | }, []); 33 | 34 | return ( 35 |
36 | {/* Toggle Button */} 37 | 44 | 45 | {/* Sidebar */} 46 |
51 | setIsSidebarOpen(false)} 53 | isConnected={isConnected} 54 | onReconnect={() => websocketService.connect()} 55 | onClearHistory={() => websocketService.clearHistory()} 56 | /> 57 |
58 | 59 | {/* Main Content */} 60 |
64 | 65 |
66 |
67 | ); 68 | } 69 | 70 | export default App; 71 | -------------------------------------------------------------------------------- /frontend/src/components/AssistantOrb.tsx: -------------------------------------------------------------------------------- 1 | import React, { useRef, useEffect, useState, useMemo, useCallback } from 'react'; 2 | 3 | interface AssistantOrbProps { 4 | state: 'idle' | 'greeting' | 'listening' | 'processing' | 'speaking' | 'vision_file' | 'vision_processing' | 'vision_asr'; 5 | } 6 | 7 | const AssistantOrb: React.FC = ({ state }) => { 8 | const canvasRef = useRef(null); 9 | const starsCanvasRef = useRef(null); 10 | const [dimensions, setDimensions] = useState({ width: 208, height: 208 }); 11 | 12 | // Adjust size based on screen size 13 | useEffect(() => { 14 | const updateSize = () => { 15 | const isMobile = window.innerWidth < 768; 16 | setDimensions({ 17 | width: isMobile ? 156 : 208, 18 | height: isMobile ? 156 : 208 19 | }); 20 | }; 21 | 22 | updateSize(); 23 | window.addEventListener('resize', updateSize); 24 | 25 | return () => { 26 | window.removeEventListener('resize', updateSize); 27 | }; 28 | }, []); 29 | 30 | // Create stars with memoization 31 | const stars = useMemo(() => { 32 | const starCount = 75; 33 | return Array.from({ length: starCount }, () => ({ 34 | x: Math.random() * 400 - 100, // Wider distribution 35 | y: Math.random() * 400 - 100, // Wider distribution 36 | size: Math.random() * 0.8 + 0.2, // Smaller size range 37 | twinkleSpeed: Math.random() * 2 + 1, 38 | moveSpeed: Math.random() * 0.2 + 0.1 39 | })); 40 | }, []); 41 | 42 | // Star animation function 43 | const drawStars = useCallback((ctx: CanvasRenderingContext2D, width: number, height: number, time: number) => { 44 | ctx.clearRect(0, 0, width, height); 45 | 46 | stars.forEach((star: any) => { 47 | // Update position 48 | star.x += star.moveSpeed; 49 | if (star.x > width + 100) star.x = -100; 50 | 51 | // Calculate twinkle 52 | const twinkle = Math.sin(time * star.twinkleSpeed) * 0.5 + 0.5; 53 | let alpha = twinkle * 0.4; // Base alpha 54 | 55 | // Enhance stars based on state 56 | if (state === 'listening') { 57 | alpha *= 1.5; // Brighter during listening 58 | } else if (state === 'greeting') { 59 | alpha *= 1.4; // Almost as bright as listening during greeting 60 | } else if (state === 'speaking') { 61 | alpha *= 1 + (0.5 * Math.sin(time * 5)); // Pulsing during speaking 62 | } 63 | 64 | // Draw star with subtle glow 65 | ctx.beginPath(); 66 | const gradient = ctx.createRadialGradient( 67 | star.x, star.y, 0, 68 | star.x, star.y, star.size * 2 69 | ); 70 | gradient.addColorStop(0, `rgba(255, 255, 255, ${alpha})`); 71 | gradient.addColorStop(1, 'rgba(255, 255, 255, 0)'); 72 | ctx.fillStyle = gradient; 73 | ctx.arc(star.x, star.y, star.size, 0, Math.PI * 2); 74 | ctx.fill(); 75 | }); 76 | }, [stars, state]); 77 | 78 | // Aurora effect animation 79 | const drawAurora = useCallback((ctx: CanvasRenderingContext2D, width: number, height: number, time: number) => { 80 | // Clear canvas completely each frame for clean animation 81 | ctx.clearRect(0, 0, width, height); 82 | 83 | // Create base gradient for the ethereal background 84 | let baseGradient; 85 | 86 | // Change gradient colors based on state 87 | if (state === 'vision_file') { 88 | // Light blue/cyan for vision file 89 | baseGradient = ctx.createLinearGradient(0, 0, width, height); 90 | baseGradient.addColorStop(0, 'rgba(125, 211, 252, 0.15)'); // sky-300 91 | baseGradient.addColorStop(0.5, 'rgba(186, 230, 253, 0.1)'); // 
sky-200 92 | baseGradient.addColorStop(1, 'rgba(224, 242, 254, 0.12)'); // sky-100 93 | } else if (state === 'vision_asr') { 94 | // Bright green for vision ASR (matching listening) 95 | baseGradient = ctx.createLinearGradient(0, 0, width, height); 96 | baseGradient.addColorStop(0, 'rgba(72, 255, 167, 0.15)'); 97 | baseGradient.addColorStop(0.5, 'rgba(135, 206, 235, 0.1)'); 98 | baseGradient.addColorStop(1, 'rgba(186, 85, 255, 0.08)'); 99 | } else if (state === 'vision_processing') { 100 | // Teal for vision processing 101 | baseGradient = ctx.createLinearGradient(0, 0, width, height); 102 | baseGradient.addColorStop(0, 'rgba(183, 245, 235, 0.15)'); 103 | baseGradient.addColorStop(0.5, 'rgba(153, 235, 225, 0.12)'); 104 | baseGradient.addColorStop(1, 'rgba(45, 212, 191, 0.15)'); 105 | } else if (state === 'listening') { 106 | // Bright green for listening 107 | baseGradient = ctx.createLinearGradient(0, 0, width, height); 108 | baseGradient.addColorStop(0, 'rgba(72, 255, 167, 0.15)'); 109 | baseGradient.addColorStop(0.5, 'rgba(135, 206, 235, 0.1)'); 110 | baseGradient.addColorStop(1, 'rgba(186, 85, 255, 0.08)'); 111 | } else if (state === 'greeting') { 112 | // Blue for greeting 113 | baseGradient = ctx.createLinearGradient(0, 0, width, height); 114 | baseGradient.addColorStop(0, 'rgba(59, 130, 246, 0.15)'); // Blue 115 | baseGradient.addColorStop(0.5, 'rgba(96, 165, 250, 0.1)'); // Lighter blue 116 | baseGradient.addColorStop(1, 'rgba(59, 130, 246, 0.08)'); // Blue again 117 | } else if (state === 'processing') { 118 | // Purple/blue for processing 119 | baseGradient = ctx.createLinearGradient(0, 0, width, height); 120 | baseGradient.addColorStop(0, 'rgba(72, 209, 255, 0.1)'); 121 | baseGradient.addColorStop(0.5, 'rgba(135, 150, 235, 0.1)'); 122 | baseGradient.addColorStop(1, 'rgba(186, 85, 255, 0.15)'); 123 | } else if (state === 'speaking') { 124 | // Gold/amber for speaking 125 | baseGradient = ctx.createLinearGradient(0, 0, width, height); 126 | baseGradient.addColorStop(0, 'rgba(255, 223, 72, 0.1)'); 127 | baseGradient.addColorStop(0.5, 'rgba(255, 167, 72, 0.08)'); 128 | baseGradient.addColorStop(1, 'rgba(255, 109, 72, 0.12)'); 129 | } else { 130 | // Default/idle state - subtle green 131 | baseGradient = ctx.createLinearGradient(0, 0, width, height); 132 | baseGradient.addColorStop(0, 'rgba(72, 255, 167, 0.1)'); 133 | baseGradient.addColorStop(0.5, 'rgba(135, 206, 235, 0.1)'); 134 | baseGradient.addColorStop(1, 'rgba(186, 85, 255, 0.1)'); 135 | } 136 | 137 | // Fill with gradient 138 | ctx.fillStyle = 'rgba(72, 255, 167, 0.06)'; 139 | ctx.fillRect(0, 0, width, height); 140 | ctx.fillStyle = baseGradient; 141 | ctx.fillRect(0, 0, width, height); 142 | 143 | // Create flowing aurora effect 144 | const numWaves = 3; 145 | for (let wave = 0; wave < numWaves; wave++) { 146 | const waveOffset = wave * (Math.PI / numWaves); 147 | 148 | ctx.beginPath(); 149 | 150 | // Start from the left edge 151 | ctx.moveTo(-width * 0.1, height / 2); 152 | 153 | // Create smooth wave path 154 | for (let x = -width * 0.1; x <= width * 1.1; x += 1) { 155 | const progress = (x + width * 0.1) / (width * 1.2); 156 | const amplitude = height * 0.15; // Reduced amplitude to prevent overflow 157 | 158 | // Wave speed modifier based on state 159 | let speedMod = 1.0; 160 | if (state === 'listening') speedMod = 1.5; 161 | if (state === 'greeting') speedMod = 1.8; // Slightly faster than listening 162 | if (state === 'processing') speedMod = 2.5; 163 | if (state === 'speaking') speedMod = 2.0; 164 | 165 | // Complex 
wave function for organic movement 166 | const y = height / 2 + 167 | Math.sin(progress * 4 + time * speedMod + waveOffset) * amplitude * 0.5 + 168 | Math.sin(progress * 7 + time * 0.5 * speedMod) * amplitude * 0.3 + 169 | Math.sin(progress * 2 - time * 0.7 * speedMod) * amplitude * 0.2; 170 | 171 | ctx.lineTo(x, y); 172 | } 173 | 174 | // Complete the path 175 | ctx.lineTo(width, height * 1.1); // Extend slightly beyond bottom 176 | ctx.lineTo(0, height * 1.1); // Extend slightly beyond bottom 177 | ctx.closePath(); 178 | 179 | // Create gradient for each wave 180 | const gradient = ctx.createLinearGradient(0, 0, 0, height); 181 | const alpha = 0.18 - wave * 0.04; // Slightly increased contrast between waves 182 | 183 | // Color based on state 184 | let baseHue; 185 | if (state === 'vision_file') { 186 | baseHue = wave === 0 ? 195 : wave === 1 ? 200 : 205; // Light blue/cyan range 187 | } else if (state === 'vision_asr') { 188 | baseHue = wave === 0 ? 145 : wave === 1 ? 160 : 175; // Green range (matching listening) 189 | } else if (state === 'vision_processing') { 190 | baseHue = wave === 0 ? 175 : wave === 1 ? 165 : 180; // Teal range 191 | } else if (state === 'listening') { 192 | baseHue = wave === 0 ? 145 : wave === 1 ? 160 : 175; // Green range 193 | } else if (state === 'greeting') { 194 | baseHue = wave === 0 ? 210 : wave === 1 ? 220 : 200; // Blue range 195 | } else if (state === 'processing') { 196 | baseHue = wave === 0 ? 260 : wave === 1 ? 240 : 220; // Purple/Blue range 197 | } else if (state === 'speaking') { 198 | baseHue = wave === 0 ? 30 : wave === 1 ? 45 : 60; // Gold/Amber range 199 | } else { 200 | baseHue = wave === 0 ? 145 : wave === 1 ? 190 : 290; // Default range 201 | } 202 | 203 | // Ethereal color transitions 204 | const hueShift = Math.sin(time * 0.5 + wave) * 15; 205 | gradient.addColorStop(0, `hsla(${baseHue + hueShift}, 85%, 75%, 0)`); // Start transparent 206 | gradient.addColorStop(0.4, `hsla(${baseHue + hueShift}, 90%, 85%, ${alpha * 1.6})`); // Intense peak 207 | gradient.addColorStop(0.8, `hsla(${baseHue + hueShift}, 85%, 75%, ${alpha})`); // Maintain intensity 208 | gradient.addColorStop(1, `hsla(${baseHue + hueShift}, 85%, 75%, 0)`); // End transparent 209 | 210 | ctx.fillStyle = gradient; 211 | 212 | // Apply gaussian blur for soft edges 213 | ctx.filter = 'blur(15px)'; 214 | ctx.fill(); 215 | ctx.filter = 'none'; 216 | 217 | // Add subtle highlight 218 | ctx.strokeStyle = `hsla(${baseHue + hueShift}, 95%, 85%, ${alpha * 0.7})`; 219 | ctx.lineWidth = 2; 220 | ctx.stroke(); 221 | } 222 | 223 | // Add subtle noise texture 224 | const imageData = ctx.getImageData(0, 0, width, height); 225 | const data = imageData.data; 226 | const noiseIntensity = 3; // Reduced noise intensity for cleaner look 227 | for (let i = 0; i < data.length; i += 4) { 228 | const noise = (Math.random() - 0.5) * noiseIntensity; 229 | data[i] += noise; 230 | data[i + 1] += noise; 231 | data[i + 2] += noise; 232 | } 233 | ctx.putImageData(imageData, 0, 0); 234 | 235 | // Add final glow layer based on state 236 | ctx.beginPath(); 237 | const glow = ctx.createRadialGradient( 238 | width / 2, height / 2, 0, 239 | width / 2, height / 2, width * 0.7 240 | ); 241 | 242 | // Color based on state 243 | if (state === 'vision_file') { 244 | // Light blue/cyan glow for vision file 245 | const pulseIntensity = 0.25 + Math.sin(time * 2) * 0.05; 246 | glow.addColorStop(0, `rgba(125, 211, 252, ${pulseIntensity})`); // sky-300 247 | glow.addColorStop(0.5, `rgba(186, 230, 253, ${pulseIntensity * 
0.4})`); // sky-200 248 | glow.addColorStop(1, 'rgba(224, 242, 254, 0)'); // sky-100 249 | } else if (state === 'vision_processing') { 250 | // Rotating teal glow for vision processing 251 | const rotationX = Math.cos(time * 2) * width * 0.2; 252 | const rotationY = Math.sin(time * 2) * height * 0.2; 253 | ctx.ellipse( 254 | width/2 + rotationX, height/2 + rotationY, 255 | width * 0.3, height * 0.3, 256 | time, 0, Math.PI * 2 257 | ); 258 | ctx.filter = 'blur(30px)'; 259 | ctx.fillStyle = 'rgba(45, 212, 191, 0.1)'; 260 | ctx.fill(); 261 | ctx.filter = 'none'; 262 | 263 | glow.addColorStop(0, 'rgba(45, 212, 191, 0.15)'); 264 | glow.addColorStop(0.5, 'rgba(45, 212, 191, 0.05)'); 265 | glow.addColorStop(1, 'rgba(45, 212, 191, 0)'); 266 | } else if (state === 'vision_asr') { 267 | // Emerald pulsing glow for vision ASR (matching listening state) 268 | const pulseIntensity = 0.25 + Math.sin(time * 2.5) * 0.07; 269 | glow.addColorStop(0, `rgba(72, 255, 167, ${pulseIntensity})`); 270 | glow.addColorStop(0.5, `rgba(72, 255, 167, ${pulseIntensity * 0.4})`); 271 | glow.addColorStop(1, 'rgba(72, 255, 167, 0)'); 272 | } else if (state === 'listening') { 273 | // Pulsing green glow for listening 274 | const pulseIntensity = 0.2 + Math.sin(time * 3) * 0.1; 275 | glow.addColorStop(0, `rgba(72, 255, 167, ${pulseIntensity})`); 276 | glow.addColorStop(0.5, `rgba(72, 255, 167, ${pulseIntensity * 0.4})`); 277 | glow.addColorStop(1, 'rgba(72, 255, 167, 0)'); 278 | } else if (state === 'greeting') { 279 | // Pulsing blue glow for greeting 280 | const pulseIntensity = 0.2 + Math.sin(time * 2.5) * 0.1; 281 | glow.addColorStop(0, `rgba(59, 130, 246, ${pulseIntensity})`); 282 | glow.addColorStop(0.5, `rgba(59, 130, 246, ${pulseIntensity * 0.4})`); 283 | glow.addColorStop(1, 'rgba(59, 130, 246, 0)'); 284 | } else if (state === 'processing') { 285 | // Rotating purple glow for processing 286 | const rotationX = Math.cos(time * 2) * width * 0.2; 287 | const rotationY = Math.sin(time * 2) * height * 0.2; 288 | ctx.ellipse( 289 | width/2 + rotationX, height/2 + rotationY, 290 | width * 0.3, height * 0.3, 291 | time, 0, Math.PI * 2 292 | ); 293 | ctx.filter = 'blur(30px)'; 294 | ctx.fillStyle = 'rgba(186, 85, 255, 0.1)'; 295 | ctx.fill(); 296 | ctx.filter = 'none'; 297 | 298 | glow.addColorStop(0, 'rgba(186, 85, 255, 0.15)'); 299 | glow.addColorStop(0.5, 'rgba(186, 85, 255, 0.05)'); 300 | glow.addColorStop(1, 'rgba(186, 85, 255, 0)'); 301 | } else if (state === 'speaking') { 302 | // Rippling amber glow for speaking 303 | const ripple = Math.sin(time * 5) * 0.1; 304 | glow.addColorStop(0, `rgba(255, 167, 72, 0.2)`); 305 | glow.addColorStop(0.4 + ripple, `rgba(255, 167, 72, 0.1)`); 306 | glow.addColorStop(0.7 + ripple, `rgba(255, 167, 72, 0.05)`); 307 | glow.addColorStop(1, 'rgba(255, 167, 72, 0)'); 308 | } else { 309 | // Subtle glow for idle 310 | glow.addColorStop(0, 'rgba(72, 255, 167, 0.2)'); 311 | glow.addColorStop(0.5, 'rgba(72, 255, 167, 0.08)'); 312 | glow.addColorStop(1, 'rgba(72, 255, 167, 0)'); 313 | } 314 | 315 | ctx.fillStyle = glow; 316 | ctx.fillRect(0, 0, width, height); 317 | }, [state]); 318 | 319 | // Set up canvas for aurora effect 320 | useEffect(() => { 321 | const canvas = canvasRef.current; 322 | if (!canvas) return; 323 | 324 | const ctx = canvas.getContext('2d'); 325 | if (!ctx) return; 326 | 327 | let animationFrame: number; 328 | let startTime = Date.now(); 329 | 330 | const animate = () => { 331 | const time = (Date.now() - startTime) * 0.001; 332 | drawAurora(ctx, canvas.width, 
canvas.height, time); 333 | animationFrame = requestAnimationFrame(animate); 334 | }; 335 | 336 | animate(); 337 | 338 | return () => { 339 | cancelAnimationFrame(animationFrame); 340 | }; 341 | }, [drawAurora]); 342 | 343 | // Set up canvas for stars 344 | useEffect(() => { 345 | const canvas = starsCanvasRef.current; 346 | if (!canvas) return; 347 | 348 | const ctx = canvas.getContext('2d'); 349 | if (!ctx) return; 350 | 351 | let animationFrame: number; 352 | let startTime = Date.now(); 353 | 354 | const animate = () => { 355 | const time = (Date.now() - startTime) * 0.001; 356 | drawStars(ctx, canvas.width, canvas.height, time); 357 | animationFrame = requestAnimationFrame(animate); 358 | }; 359 | 360 | animate(); 361 | 362 | return () => { 363 | cancelAnimationFrame(animationFrame); 364 | }; 365 | }, [drawStars]); 366 | 367 | return ( 368 |
384 | {/* Soft gaussian ambient glow */} 385 |
386 |
397 |
398 | 399 |
400 | 406 |
407 | 408 | {/* Aurora effect */} 409 |
422 | 428 |
429 |
430 | ); 431 | }; 432 | 433 | export default AssistantOrb; 434 | -------------------------------------------------------------------------------- /frontend/src/components/BackgroundStars.tsx: -------------------------------------------------------------------------------- 1 | import React, { useRef, useEffect, useMemo, useCallback } from 'react'; 2 | 3 | const BackgroundStars: React.FC = () => { 4 | const canvasRef = useRef(null); 5 | 6 | // Generate stars with memoization to prevent regeneration on renders 7 | const stars = useMemo(() => { 8 | const starCount = 200; 9 | return Array.from({ length: starCount }, () => ({ 10 | x: Math.random() * window.innerWidth, 11 | y: Math.random() * window.innerHeight, 12 | size: Math.random() * 1.5 + 0.5, 13 | twinkleSpeed: Math.random() * 2 + 1, 14 | moveSpeed: Math.random() * 0.05 + 0.02, 15 | angle: Math.random() * Math.PI * 2 16 | })); 17 | }, []); 18 | 19 | const drawStars = useCallback((ctx: CanvasRenderingContext2D, width: number, height: number, time: number) => { 20 | ctx.clearRect(0, 0, width, height); 21 | 22 | stars.forEach((star) => { 23 | // Circular motion 24 | const radius = 1; 25 | star.angle += star.moveSpeed * 0.01; 26 | star.x += Math.cos(star.angle) * radius * 0.1; 27 | star.y += Math.sin(star.angle) * radius * 0.1; 28 | 29 | // Wrap around screen 30 | if (star.x < 0) star.x = width; 31 | if (star.x > width) star.x = 0; 32 | if (star.y < 0) star.y = height; 33 | if (star.y > height) star.y = 0; 34 | 35 | // Calculate twinkle 36 | const twinkle = Math.sin(time * star.twinkleSpeed + star.x * 0.01) * 0.5 + 0.5; 37 | const alpha = twinkle * 0.3; 38 | 39 | // Draw star with subtle glow 40 | ctx.beginPath(); 41 | const gradient = ctx.createRadialGradient( 42 | star.x, star.y, 0, 43 | star.x, star.y, star.size * 3 44 | ); 45 | gradient.addColorStop(0, `rgba(255, 255, 255, ${alpha})`); 46 | gradient.addColorStop(0.5, `rgba(255, 255, 255, ${alpha * 0.3})`); 47 | gradient.addColorStop(1, 'rgba(255, 255, 255, 0)'); 48 | ctx.fillStyle = gradient; 49 | ctx.arc(star.x, star.y, star.size, 0, Math.PI * 2); 50 | ctx.fill(); 51 | }); 52 | }, [stars]); 53 | 54 | useEffect(() => { 55 | const canvas = canvasRef.current; 56 | if (!canvas) return; 57 | 58 | const handleResize = () => { 59 | canvas.width = window.innerWidth; 60 | canvas.height = window.innerHeight; 61 | }; 62 | 63 | handleResize(); 64 | window.addEventListener('resize', handleResize); 65 | 66 | const ctx = canvas.getContext('2d'); 67 | if (!ctx) return; 68 | 69 | let animationFrame: number; 70 | let startTime = Date.now(); 71 | 72 | const animate = () => { 73 | const time = (Date.now() - startTime) * 0.001; 74 | drawStars(ctx, canvas.width, canvas.height, time); 75 | animationFrame = requestAnimationFrame(animate); 76 | }; 77 | 78 | animate(); 79 | 80 | return () => { 81 | cancelAnimationFrame(animationFrame); 82 | window.removeEventListener('resize', handleResize); 83 | }; 84 | }, [drawStars]); 85 | 86 | return ( 87 | 91 | ); 92 | }; 93 | 94 | export default BackgroundStars; 95 | -------------------------------------------------------------------------------- /frontend/src/components/PreferencesModal.tsx: -------------------------------------------------------------------------------- 1 | import React, { useState, useEffect } from 'react'; 2 | import { User, Sparkles, Eye } from 'lucide-react'; 3 | import websocketService, { MessageType } from '../services/websocket'; 4 | 5 | interface PreferencesModalProps { 6 | isOpen: boolean; 7 | onClose: () => void; 8 | } 9 | 10 | const 
--------------------------------------------------------------------------------
/frontend/src/components/PreferencesModal.tsx:
--------------------------------------------------------------------------------
1 | import React, { useState, useEffect } from 'react';
2 | import { User, Sparkles, Eye } from 'lucide-react';
3 | import websocketService, { MessageType } from '../services/websocket';
4 | 
5 | interface PreferencesModalProps {
6 |   isOpen: boolean;
7 |   onClose: () => void;
8 | }
9 | 
10 | const PreferencesModal: React.FC<PreferencesModalProps> = ({ isOpen, onClose }) => {
11 |   const [systemPrompt, setSystemPrompt] = useState('');
12 |   const [userName, setUserName] = useState('');
13 |   const [isSaving, setIsSaving] = useState(false);
14 |   const [saveError, setSaveError] = useState<string | null>(null);
15 |   const [activeTab, setActiveTab] = useState<'profile' | 'system'>('profile');
16 |   const [isVisionEnabled, setIsVisionEnabled] = useState(false);
17 | 
18 |   useEffect(() => {
19 |     if (isOpen) {
20 |       // Reset state when modal opens
21 |       setSaveError(null);
22 | 
23 |       // Fetch current system prompt, user profile, and vision settings
24 |       const handleSystemPrompt = (data: any) => {
25 |         if (data && data.prompt) {
26 |           setSystemPrompt(data.prompt);
27 |         }
28 |       };
29 | 
30 |       const handleUserProfile = (data: any) => {
31 |         if (data && data.name !== undefined) {
32 |           setUserName(data.name);
33 |         }
34 |       };
35 | 
36 |       const handleVisionSettings = (data: any) => {
37 |         if (data && data.enabled !== undefined) {
38 |           setIsVisionEnabled(data.enabled);
39 |         }
40 |       };
41 | 
42 |       // Listen for responses
43 |       websocketService.addEventListener(MessageType.SYSTEM_PROMPT, handleSystemPrompt);
44 |       websocketService.addEventListener(MessageType.USER_PROFILE, handleUserProfile);
45 |       websocketService.addEventListener(MessageType.VISION_SETTINGS as any, handleVisionSettings);
46 | 
47 |       // Request data
48 |       websocketService.getSystemPrompt();
49 |       websocketService.getUserProfile();
50 |       websocketService.getVisionSettings();
51 | 
52 |       console.log('Requested preferences data');
53 | 
54 |       return () => {
55 |         websocketService.removeEventListener(MessageType.SYSTEM_PROMPT, handleSystemPrompt);
56 |         websocketService.removeEventListener(MessageType.USER_PROFILE, handleUserProfile);
57 |         websocketService.removeEventListener(MessageType.VISION_SETTINGS as any, handleVisionSettings);
58 |       };
59 |     }
60 |   }, [isOpen]);
61 | 
62 |   // Listen for update confirmations
63 |   useEffect(() => {
64 |     let updateCount = 0;
65 |     const expectedUpdateCount = 3; // Always expect 3 updates: system prompt, user profile, and vision
66 |     let success = true;
67 | 
68 |     const handlePromptUpdated = (data: any) => {
69 |       updateCount++;
70 |       if (!(data && data.success)) {
71 |         success = false;
72 |         setSaveError('Failed to update system prompt. Please try again.');
73 |       }
74 | 
75 |       if (updateCount >= expectedUpdateCount) {
76 |         setIsSaving(false);
77 |         if (success) {
78 |           // Close modal only if all updates succeeded
79 |           onClose();
80 |         }
81 |       }
82 |     };
83 | 
84 |     const handleProfileUpdated = (data: any) => {
85 |       updateCount++;
86 |       if (!(data && data.success)) {
87 |         success = false;
88 |         setSaveError('Failed to update user profile. Please try again.');
89 |       }
90 | 
91 |       if (updateCount >= expectedUpdateCount) {
92 |         setIsSaving(false);
93 |         if (success) {
94 |           // Close modal only if all updates succeeded
95 |           onClose();
96 |         }
97 |       }
98 |     };
99 | 
100 |     const handleVisionSettingsUpdated = (data: any) => {
101 |       updateCount++;
102 |       if (!(data && data.success)) {
103 |         success = false;
104 |         setSaveError('Failed to update vision settings. Please try again.');
105 |       }
106 | 
107 |       if (updateCount >= expectedUpdateCount) {
108 |         setIsSaving(false);
109 |         if (success) {
110 |           // Close modal only if all updates succeeded
111 |           onClose();
112 |         }
113 |       }
114 |     };
115 | 
116 |     websocketService.addEventListener(MessageType.SYSTEM_PROMPT_UPDATED, handlePromptUpdated);
117 |     websocketService.addEventListener(MessageType.USER_PROFILE_UPDATED, handleProfileUpdated);
118 |     websocketService.addEventListener(MessageType.VISION_SETTINGS_UPDATED as any, handleVisionSettingsUpdated);
119 | 
120 |     return () => {
121 |       websocketService.removeEventListener(MessageType.SYSTEM_PROMPT_UPDATED, handlePromptUpdated);
122 |       websocketService.removeEventListener(MessageType.USER_PROFILE_UPDATED, handleProfileUpdated);
123 |       websocketService.removeEventListener(MessageType.VISION_SETTINGS_UPDATED as any, handleVisionSettingsUpdated);
124 |     };
125 |   }, [onClose, activeTab]);
126 | 
127 |   const handleSave = () => {
128 |     // Check if system prompt is empty when in system tab
129 |     if (activeTab === 'system' && !systemPrompt.trim()) {
130 |       setSaveError('System prompt cannot be empty');
131 |       return;
132 |     }
133 | 
134 |     setIsSaving(true);
135 |     setSaveError(null);
136 | 
137 |     // Always update all settings
138 |     websocketService.updateSystemPrompt(systemPrompt);
139 |     websocketService.updateUserProfile(userName);
140 |     websocketService.updateVisionSettings(isVisionEnabled);
141 |   };
142 | 
143 |   // backticks, in my code, in the year of our lord, 2025? no.
144 |   const handleRestore = () => {
145 |     setSystemPrompt(
146 |       "You are a helpful, friendly, and concise voice assistant. " +
147 |       "Respond to user queries in a natural, conversational manner. " +
148 |       "Keep responses brief and to the point, as you're communicating via voice. " +
149 |       "When providing information, focus on the most relevant details. " +
150 |       "If you don't know something, admit it rather than making up an answer." +
151 |       "\n\n" +
152 |       "Through the webapp, you can receive and understand photographs and pictures." +
153 |       "\n\n" +
154 |       "When the user sends a message like '[silent]', '[no response]', or '[still waiting]', it means they've gone quiet or haven't responded. " +
155 |       "When you see these signals, continue the conversation naturally based on the previous topic and context. " +
156 |       "Stay on topic, be helpful, and don't mention that they were silent - just carry on the conversation as if you're gently following up."
157 |     );
158 |   };
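  /*
   * Note on the save flow above: handleSave() fires three WebSocket updates (system prompt,
   * user profile, vision settings) and the confirmation effect counts the three *_UPDATED
   * replies before closing the modal. Purely as an illustration, the same round trip could be
   * wrapped in a promise (updateAndWait below is hypothetical and not part of websocketService):
   *
   *   const updateAndWait = (send: () => void, doneEvent: MessageType) =>
   *     new Promise<boolean>((resolve) => {
   *       const onDone = (data: any) => {
   *         websocketService.removeEventListener(doneEvent, onDone);
   *         resolve(Boolean(data && data.success));
   *       };
   *       websocketService.addEventListener(doneEvent, onDone);
   *       send();
   *     });
   *
   *   // e.g.: await updateAndWait(() => websocketService.updateSystemPrompt(systemPrompt),
   *   //                           MessageType.SYSTEM_PROMPT_UPDATED);
   */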
159 | 
160 |   // Tab rendering helpers
161 |   const renderVisionTab = () => (
162 |     <div className="space-y-4">
163 |       {/* NOTE: the original tags and class names in these tab renderers were lost in
164 |           extraction; the markup below is an approximate reconstruction. */}
165 |       <div>
166 |         <div className="flex items-center gap-3">
167 |           <div
168 |             className={isVisionEnabled ? 'w-12 h-6 rounded-full bg-emerald-500 cursor-pointer' : 'w-12 h-6 rounded-full bg-slate-700 cursor-pointer'}
169 |             onClick={() =>
170 |               setIsVisionEnabled(!isVisionEnabled)}
171 |           >
172 |             {/* Toggle knob */}
173 |             <div className={isVisionEnabled
174 |               ? 'h-5 w-5 mt-0.5 translate-x-6 rounded-full bg-white transition-transform'
175 |               : 'h-5 w-5 mt-0.5 translate-x-1 rounded-full bg-white transition-transform'}
176 |             />
177 |           </div>
178 |           <span className="text-sm text-slate-300">
179 |             {isVisionEnabled ? 'Enabled' : 'Disabled'}
180 |           </span>
181 |         </div>
182 |         <p className="text-sm text-slate-400 mt-3">
183 |           When enabled, Vocalis will use computer vision to analyze images and provide visual context to your conversations.
184 |         </p>
185 |       </div>
186 |       <div className="bg-slate-800/50 border border-slate-700 rounded-lg p-4 text-sm text-slate-400">
187 |         Coming Soon: Vision capabilities will allow Vocalis to see and describe images,
188 |         analyze documents, interpret charts, and provide visual assistance during your conversations.
189 |       </div>
190 | 
191 | 
192 |     </div>
193 |   );
194 | 
195 |   const renderProfileTab = () => (
196 |     <div className="space-y-4">
197 |       <div>
198 |         <label className="block text-sm text-slate-400 mb-2">Your Name</label>
199 |         <div className="relative">
200 |           <User className="absolute left-3 top-3.5 text-slate-500" size={16} />
201 |           <input
202 |             type="text" value={userName} onChange={(e) => setUserName(e.target.value)}
203 |             className="w-full bg-slate-800/50 border border-slate-700 rounded-lg pl-10 p-3 text-slate-300 text-sm focus:outline-none focus:ring-1 focus:ring-emerald-500"
204 |             placeholder="Enter your name (optional)"
205 |           />
206 |         </div>
207 | 
208 |         <p className="text-xs text-slate-500 mt-2">
209 |           Your name will be used to personalize greetings and make the conversation feel more natural.
210 |         </p>
211 |       </div>
212 |     </div>
213 |   );
214 | 
215 |   const renderSystemTab = () => (
216 |     <div className="space-y-4">
217 |       {/* System prompt editor (markup reconstructed, as above) */}
218 |       <label className="block text-sm text-slate-400 mb-2">System Prompt</label>
219 |       <textarea
220 |         value={systemPrompt}
221 |         onChange={(e) => setSystemPrompt(e.target.value)}
222 |         rows={10}
223 |         className="w-full bg-slate-800/50 border border-slate-700 rounded-lg p-3 text-slate-300 text-sm focus:outline-none focus:ring-1 focus:ring-emerald-500"
224 |         placeholder="Enter the system prompt"
225 |       />
226 | 