├── LICENSE ├── README.md ├── backend ├── .env ├── __init__.py ├── config.py ├── main.py ├── prompts │ ├── system_prompt.md │ ├── user_profile.json │ └── vision_settings.json ├── requirements.txt ├── routes │ ├── __init__.py │ └── websocket.py └── services │ ├── __init__.py │ ├── conversation_storage.py │ ├── llm.py │ ├── transcription.py │ ├── tts.py │ └── vision.py ├── docs ├── Demonstration_Video.png ├── Vocalis_Demo.png ├── Vocalis_Header.png ├── Vocalis_Visual_demo.gif └── index.html ├── frontend ├── favicon.ico ├── index.html ├── package.json ├── postcss.config.js ├── src │ ├── App.tsx │ ├── components │ │ ├── AssistantOrb.tsx │ │ ├── BackgroundStars.tsx │ │ ├── ChatInterface.tsx │ │ ├── PreferencesModal.tsx │ │ ├── SessionManager.tsx │ │ └── Sidebar.tsx │ ├── index.css │ ├── main.tsx │ ├── services │ │ ├── audio.ts │ │ └── websocket.ts │ ├── utils │ │ └── hooks.ts │ └── vite-env.d.ts ├── tailwind.config.js ├── tsconfig.json ├── tsconfig.node.json └── vite.config.ts ├── install-deps.bat ├── install-deps.sh ├── run.bat ├── run.sh ├── setup.bat └── setup.sh /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Vocalis - Speech-to-Speech AI Assistant](https://lex-au.github.io/Vocalis/Vocalis_Header.png) 2 | 3 | # Vocalis 4 | 5 | [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 6 | [![React](https://img.shields.io/badge/React-18-61DAFB.svg?logo=react&logoColor=white)](https://reactjs.org/) 7 | [![FastAPI](https://img.shields.io/badge/FastAPI-0.109.2-009688.svg?logo=fastapi&logoColor=white)](https://fastapi.tiangolo.com/) 8 | [![Whisper](https://img.shields.io/badge/Whisper-Faster--Whisper-yellow.svg)](https://github.com/guillaumekln/faster-whisper) 9 | [![Python](https://img.shields.io/badge/Python-3.10-3776AB.svg?logo=python&logoColor=white)](https://www.python.org/) 10 | 11 | A sophisticated AI assistant with speech-to-speech capabilities built on a modern React frontend with a FastAPI backend. Vocalis provides a responsive, low-latency conversational experience with advanced visual feedback. 12 | 13 | ## Video Demonstration of Setup and Usage 14 | 15 | [![Watch the video](https://lex-au.github.io/Vocalis/Demonstration_Video.png)](https://www.youtube.com/watch?v=2slWwsHTNIA) 16 | 17 | ## Changelog 18 | 19 | **v1.5.0** (Vision Update) - April 12, 2025 20 | - 🔍 New image analysis capability powered by [SmolVLM-256M-Instruct model](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct) 21 | - 🖼️ Seamless image upload and processing interface 22 | - 🔄 Contextual conversation continuation based on image understanding 23 | - 🧩 Multi-modal conversation support (text, speech, and images) 24 | - 💾 Advanced session management for saving and retrieving conversations 25 | - 🎨 Improved UI with central call button and cleaner control layout 26 | - 🔌 Simplified sidebar without redundant controls 27 | 28 | **v1.0.0** (Initial Release) - March 31, 2025 29 | - ✨ Revolutionary barge-in technology for natural conversation flow 30 | - 🔊 Ultra low-latency audio streaming with adaptive buffering 31 | - 🤖 AI-initiated greetings and follow-ups for natural conversations 32 | - 🎨 Dynamic visual feedback system with state-aware animations 33 | - 🔄 Streaming TTS with chunk-based delivery for immediate responses 34 | - 🚀 Cross-platform support with optimised setup scripts 35 | - 💻 CUDA acceleration with fallback for CPU-only systems 36 | 37 | ## Features 38 | 39 | ### 🎯 Advanced Conversation Capabilities 40 | 41 | - **🗣️ Barge-In Interruption** - Interrupt the AI mid-speech for a truly natural conversation experience 42 | - **👋 AI-Initiated Greetings** - Assistant automatically welcomes users with a contextual greeting 43 | - **💬 Intelligent Follow-Ups** - System detects silence and continues conversation with natural follow-up questions 44 | - **🔄 Conversation Memory** - Maintains context throughout the conversation session 45 | - **🧠 Contextual Understanding** - Processes conversation history for coherent, relevant responses 46 | - **🖼️ Image Analysis** - Upload and discuss images with integrated visual understanding 47 | - **💾 Session Management** - Save, load, and manage conversation sessions with customisable titles 48 | 49 | ### ⚡ Ultra-Responsive Performance 50 | 51 | - **⏱️ Low-Latency Processing** - End-to-end latency under 500ms for immediate response perception 52 | - **🔊 Streaming Audio** - Begin playback before full response is generated 53 | - 
**📦 Adaptive Buffering** - Dynamically adjust audio buffer size based on network conditions 54 | - **🔌 Efficient WebSocket Protocol** - Bidirectional real-time audio streaming 55 | - **🔄 Parallel Processing** - Multi-stage pipeline for concurrent audio handling 56 | 57 | ### 🎨 Interactive Visual Experience 58 | 59 | - **🔮 Dynamic Assistant Orb** - Visual representation with state-aware animations: 60 | - Pulsing glow during listening 61 | - Particle animations during processing 62 | - Wave-like motion during speaking 63 | - **📝 Live Transcription** - Real-time display of recognised speech 64 | - **🚦 Status Indicators** - Clear visual cues for system state 65 | - **🌈 Smooth Transitions** - Fluid state changes with appealing animations 66 | - **🌙 Dark Theme** - Eye-friendly interface with cosmic aesthetic 67 | 68 | ### 🛠️ Technical Excellence 69 | 70 | - **🔍 High-Accuracy VAD** - Superior voice activity detection using custom-built VAD 71 | - **🗣️ Optimised Whisper Integration** - Faster-Whisper for rapid transcription 72 | - **🔊 Real-Time TTS** - Chunked audio delivery for immediate playback 73 | - **🖥️ Hardware Flexibility** - CUDA acceleration with CPU fallback options 74 | - **🔧 Easy Configuration** - Environment variables and user-friendly setup 75 | 76 | ## Quick Start 77 | 78 | ### Prerequisites 79 | 80 | #### Windows 81 | - Python 3.10+ installed and in your PATH 82 | - Node.js and npm installed 83 | 84 | #### macOS 85 | - Python 3.10+ installed 86 | - Install Homebrew (if not already installed): 87 | ```bash 88 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" 89 | ``` 90 | - Install Node.js and npm: 91 | ```bash 92 | brew install node 93 | ``` 94 | - **Apple Silicon (M1/M2/M3/M4) Notes**: 95 | - The setup will automatically install a compatible PyTorch version 96 | - If you encounter any PyTorch-related errors, you may need to manually install it: 97 | ```bash 98 | pip install torch 99 | ``` 100 | Then continue with the regular setup. 101 | 102 | ### One-Click Setup (Recommended) 103 | 104 | #### Windows 105 | 1. Run `setup.bat` to initialise the project (one-time setup) 106 | - Includes option for CUDA or CPU-only PyTorch installation 107 | 2. Run `run.bat` to start both frontend and backend servers 108 | 3. If you need to update dependencies later, use `install-deps.bat` 109 | 110 | #### macOS/Linux 111 | 1. Make scripts executable: `chmod +x *.sh` 112 | 2. Run `./setup.sh` to initialise the project (one-time setup) 113 | - Includes option for CUDA or CPU-only PyTorch installation 114 | 3. Run `./run.sh` to start both frontend and backend servers 115 | 4. If you need to update dependencies later, use `./install-deps.sh` 116 | 117 | ### Manual Setup (Alternative) 118 | 119 | If you prefer to set up the project manually, follow these steps: 120 | 121 | #### Backend Setup 122 | 1. Create a Python virtual environment: 123 | ```bash 124 | cd backend 125 | python -m venv env 126 | # Windows: 127 | .\env\Scripts\activate 128 | # macOS/Linux: 129 | source env/bin/activate 130 | ``` 131 | 132 | 2. Install the Python dependencies: 133 | ```bash 134 | pip install -r requirements.txt 135 | ``` 136 | 137 | 3. If you need CUDA support, install PyTorch with CUDA: 138 | ```bash 139 | pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 140 | ``` 141 | 142 | 4. Return to the project root (one level above `backend/`) and start the backend server: 143 | ```bash 144 | python -m backend.main 145 | ``` 146 | 147 | #### Frontend Setup 148 | 1. 
Install Node.js dependencies: 149 | ```bash 150 | cd frontend 151 | npm install 152 | ``` 153 | 154 | 2. Start the development server: 155 | ```bash 156 | npm run dev 157 | ``` 158 | 159 | ### Personalising Vocalis 160 | 161 | After launching Vocalis, you can customise your experience through the sidebar: 162 | 163 | 1. Click the sidebar icon to open the navigation panel 164 | 2. Under the "Settings" tab, click "Preferences" to access personalisation options 165 | 166 | The preferences modal offers several ways to tailor Vocalis to your needs: 167 | 168 | #### User Profile 169 | - **Your Name**: Enter your name to personalise greetings and make conversations more natural 170 | - This helps Vocalis address you properly during interactions 171 | 172 | #### System Prompt 173 | - Modify the AI's behaviour by editing the system prompt 174 | - The default prompt is optimised for natural voice interaction, but you can customise it for specific use cases 175 | - Use the "Restore Default" button to revert to the original prompt if needed 176 | 177 | #### Vision Capabilities 178 | - Toggle vision capabilities on/off using the switch at the bottom of the preferences panel 179 | - When enabled, Vocalis can analyse images shared during conversations 180 | - This feature allows for rich multi-modal interactions where you can discuss visual content 181 | 182 | These settings are saved automatically and persist between sessions, ensuring a consistent experience tailored to your preferences. 183 | 184 | ## External Services 185 | 186 | Vocalis is designed to work with OpenAI-compatible API endpoints for both LLM and TTS services: 187 | 188 | - **LLM (Language Model)**: By default, the backend is configured to use [LM Studio](https://lmstudio.ai/) running locally. This provides a convenient way to run local language models compatible with OpenAI's API format. 189 | 190 | **Custom Vocalis Model**: For optimal performance, Vocalis includes a purpose-built fine-tuned model: [lex-au/Vocalis-Q4_K_M.gguf](https://huggingface.co/lex-au/Vocalis-Q4_K_M.gguf). This model is based on Meta's LLaMA 3 8B Instruct and specifically optimised for immersive conversational experiences with: 191 | - Enhanced spatial and temporal context tracking 192 | - Low-latency response generation 193 | - Rich, descriptive language capabilities 194 | - Efficient resource utilisation through Q4_K_M quantisation 195 | - Seamless integration with the Vocalis speech-to-speech pipeline 196 | 197 | - **Text-to-Speech (TTS)**: For voice generation, the system works out of the box with: 198 | - [Orpheus-FASTAPI](https://github.com/Lex-au/Orpheus-FastAPI): A high-quality TTS server with OpenAI-compatible endpoints providing rich, expressive voices. 199 | 200 | You can point the TTS endpoint in `.env` at any open-source TTS project. For a faster alternative: 201 | - [Kokoro-FastAPI](https://github.com/remsky/Kokoro-FastAPI): A lightning-fast TTS alternative, optimised for minimal latency when speed is the priority over maximum expressiveness. 202 | 203 | Both services can be configured in the `backend/.env` file. The system requires these external services to function properly, as Vocalis acts as an orchestration layer combining speech recognition, language model inference, and speech synthesis. 
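
For reference, both endpoints are plain keys in `backend/.env`; the values below mirror the configuration shipped with the project, and you only need to swap in the host and port of whichever OpenAI-compatible servers you run:

```bash
# backend/.env — external service endpoints (project defaults)
LLM_API_ENDPOINT=http://127.0.0.1:1234/v1/chat/completions   # LM Studio's OpenAI-compatible chat endpoint
TTS_API_ENDPOINT=http://localhost:5005/v1/audio/speech       # Orpheus-FASTAPI (or another /v1/audio/speech server)
```

Any server that exposes the OpenAI-style `/v1/chat/completions` and `/v1/audio/speech` routes can be substituted here without code changes.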
204 | 205 | ## Visual Demo 206 | 207 | ![Assistant Interface](https://lex-au.github.io/Vocalis/Vocalis_Demo.png) 208 | 209 | ## Session Management 210 | 211 | Vocalis includes a robust session management system that allows users to save, load, and organise their conversations: 212 | 213 | ### Key Features 214 | 215 | - **Save Conversations**: Save the current conversation state with a custom title 216 | - **Load Previous Sessions**: Return to any saved conversation exactly as you left it 217 | - **Edit Session Titles**: Rename sessions for better organisation 218 | - **Delete Unwanted Sessions**: Remove conversations you no longer need 219 | - **Session Metadata**: View additional information like message count 220 | - **Automatic Timestamps**: Sessions track both creation and last update times 221 | 222 | ### Technical Implementation 223 | 224 | The session system uses a two-part architecture: 225 | 226 | 1. **Backend Storage**: 227 | - Conversations are stored as JSON files in a dedicated directory 228 | - Each session maintains its complete message history 229 | - Asynchronous file I/O prevents performance impacts 230 | - UUID-based session identification ensures uniqueness 231 | 232 | 2. **Frontend Interface**: 233 | - Intuitive sidebar UI for session management 234 | - Real-time session status updates 235 | - Active session indicator 236 | - Session creation with optional custom titles 237 | 238 | ### Usage Flow 239 | 240 | 1. Start a new conversation with the assistant 241 | 2. Click "Save As New Conversation" to preserve the current state 242 | 3. Continue your conversation or load a different session 243 | 4. Return to any saved session at any time to continue where you left off 244 | 5. Edit session titles or delete unwanted sessions as needed 245 | 246 | This persistent storage system ensures you never lose valuable conversations and can maintain separate contexts for different topics or projects. 
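
As a concrete illustration, each saved session in the `conversations/` directory is a single JSON file shaped roughly like this (the field names follow `backend/services/conversation_storage.py`; the ID, timestamps, and messages below are invented for the example):

```json
{
  "id": "b1e4c2d8-7f3a-4a52-9c0e-2d5f8a6b1c44",
  "title": "Hi, how are you doing today?...",
  "created_at": "2025-04-12T10:15:02.481923",
  "updated_at": "2025-04-12T10:21:37.905114",
  "messages": [
    { "role": "user", "content": "Hi, how are you doing today?" },
    { "role": "assistant", "content": "I'm doing well, thanks for asking! What would you like to chat about?" }
  ],
  "metadata": {}
}
```

The title defaults to roughly the first 30 characters of the first user message, and `updated_at` is refreshed each time the session is saved again.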
247 | 248 | ## Architecture Overview 249 | 250 | ```mermaid 251 | graph TB 252 | subgraph "Frontend (React)" 253 | AudioCapture[Audio Capture] 254 | AudioVisualizer[Audio Visualizer] 255 | WebSocket[WebSocket Client] 256 | AudioOutput[Audio Output] 257 | UIState[UI State Management] 258 | ImageUpload[Image Upload] 259 | SessionManager[Session Manager] 260 | end 261 | 262 | subgraph "Backend (FastAPI)" 263 | WSServer[WebSocket Server] 264 | VAD[Custom Voice Activity Detection] 265 | WhisperSTT[Faster Whisper] 266 | LLMClient[LLM Client] 267 | TTSClient[TTS Client] 268 | AudioProcessing[Audio Processing] 269 | VisionService[SmolVLM Vision Service] 270 | StorageService[Conversation Storage] 271 | EnvConfig[Environment Config] 272 | end 273 | 274 | subgraph "Local API Services" 275 | LLMEndpoint["LLM API (127.0.0.1:1234)"] 276 | TTSEndpoint["TTS API (localhost:5005)"] 277 | end 278 | 279 | subgraph "Storage" 280 | SessionFiles["Session JSON Files"] 281 | end 282 | 283 | AudioCapture -->|Audio Stream| WebSocket 284 | ImageUpload -->|Image Data| WebSocket 285 | SessionManager -->|Session Commands| WebSocket 286 | WebSocket <-->|WebSocket Protocol| WSServer 287 | WSServer --> VAD 288 | VAD -->|Audio with Speech| WhisperSTT 289 | WhisperSTT -->|Transcribed Text| LLMClient 290 | 291 | WebSocket -->|Image Data| WSServer 292 | WSServer -->|Process Image| VisionService 293 | VisionService -->|Image Description| LLMClient 294 | 295 | WebSocket -->|Session Operations| WSServer 296 | WSServer -->|Store/Load/List/Delete| StorageService 297 | StorageService <-->|Read/Write JSON| SessionFiles 298 | 299 | LLMClient -->|API Request| LLMEndpoint 300 | LLMEndpoint -->|Response Text| LLMClient 301 | LLMClient -->|Response Text| TTSClient 302 | TTSClient -->|API Request| TTSEndpoint 303 | TTSEndpoint -->|Audio Data| TTSClient 304 | TTSClient --> WSServer 305 | WSServer -->|Audio Response| WebSocket 306 | WebSocket --> AudioOutput 307 | EnvConfig -->|Configuration| WhisperSTT 308 | EnvConfig -->|Configuration| LLMClient 309 | EnvConfig -->|Configuration| TTSClient 310 | EnvConfig -->|Configuration| VisionService 311 | EnvConfig -->|Configuration| StorageService 312 | UIState <--> WebSocket 313 | ``` 314 | 315 | ## Detailed System Architecture 316 | 317 | The following diagram provides a comprehensive view of Vocalis's architecture, highlighting the advanced conversation features and interrupt handling systems that enable its natural conversational capabilities: 318 | 319 | ```mermaid 320 | graph TD 321 | %% Client Side 322 | subgraph "Frontend (React + TypeScript + Vite)" 323 | FE_Audio[Audio Capture/Playback] 324 | FE_WebSocket[WebSocket Client] 325 | FE_UI[UI Components] 326 | FE_State[State Management] 327 | FE_InterruptDetector[Interrupt Detector] 328 | FE_SilenceDetector[Silence Detector] 329 | FE_ImageUpload[Image Upload Handler] 330 | FE_SessionUI[Session Manager UI] 331 | 332 | subgraph "UI Components" 333 | UI_Orb[AssistantOrb] 334 | UI_Stars[BackgroundStars] 335 | UI_Chat[ChatInterface] 336 | UI_Prefs[PreferencesModal] 337 | UI_Sidebar[Sidebar] 338 | UI_Sessions[SessionManager] 339 | end 340 | 341 | subgraph "Services" 342 | FE_AudioService[Audio Service] 343 | FE_WebSocketService[WebSocket Service] 344 | end 345 | end 346 | 347 | %% Server Side 348 | subgraph "Backend (FastAPI + Python)" 349 | BE_Main[Main App] 350 | BE_Config[Configuration] 351 | BE_WebSocket[WebSocket Handler] 352 | BE_InterruptHandler[Interrupt Handler] 353 | BE_ConversationManager[Conversation Manager] 354 | 355 | subgraph 
"Services" 356 | BE_Transcription[Speech Transcription & VAD] 357 | BE_LLM[LLM Client] 358 | BE_TTS[TTS Client] 359 | BE_Vision[SmolVLM Vision Service] 360 | BE_Storage[Conversation Storage] 361 | end 362 | 363 | subgraph "Conversation Features" 364 | BE_GreetingSystem[AI Greeting System] 365 | BE_FollowUpSystem[Follow-Up Generator] 366 | BE_ContextMemory[Context Memory] 367 | BE_VisionContext[Image Context Manager] 368 | BE_SessionMgmt[Session Management] 369 | end 370 | end 371 | 372 | %% External Services & Storage 373 | subgraph "External Services" 374 | LLM_API[LM Studio OpenAI-compatible API] 375 | TTS_API[Orpheus-FASTAPI TTS] 376 | end 377 | 378 | subgraph "Persistent Storage" 379 | JSON_Files[Session JSON Files] 380 | end 381 | 382 | %% Data Flow - Main Path 383 | FE_Audio -->|Audio Stream| FE_AudioService 384 | FE_AudioService -->|Process Audio| FE_WebSocketService 385 | FE_WebSocketService -->|Binary Audio Data| FE_WebSocket 386 | FE_WebSocket <-->|WebSocket Protocol| BE_WebSocket 387 | 388 | BE_WebSocket -->|Audio Chunks| BE_Transcription 389 | BE_Transcription -->|Voice Activity Detection| BE_Transcription 390 | BE_Transcription -->|Transcribed Text| BE_ConversationManager 391 | BE_ConversationManager -->|Format Prompt| BE_LLM 392 | BE_LLM -->|API Request| LLM_API 393 | LLM_API -->|Response Text| BE_LLM 394 | BE_LLM -->|Response Text| BE_TTS 395 | BE_TTS -->|API Request| TTS_API 396 | TTS_API -->|Audio Data| BE_TTS 397 | BE_TTS -->|Processed Audio| BE_WebSocket 398 | 399 | BE_WebSocket -->|Audio Response| FE_WebSocket 400 | FE_WebSocket -->|Audio Data| FE_AudioService 401 | FE_AudioService -->|Playback| FE_Audio 402 | 403 | %% Session Management Flow 404 | FE_SessionUI -->|Save/Load/List/Delete| FE_WebSocketService 405 | FE_WebSocketService -->|Session Commands| FE_WebSocket 406 | FE_WebSocket -->|Session Operations| BE_WebSocket 407 | BE_WebSocket -->|Session Management| BE_SessionMgmt 408 | BE_SessionMgmt -->|Store/Retrieve| BE_Storage 409 | BE_Storage <-->|Persist Data| JSON_Files 410 | BE_Storage -->|Session Response| BE_WebSocket 411 | BE_WebSocket -->|Session Status| FE_WebSocket 412 | FE_WebSocket -->|Update UI| FE_SessionUI 413 | 414 | %% Vision Flow 415 | FE_ImageUpload -->|Image Data| FE_WebSocketService 416 | FE_WebSocketService -->|Image Base64| FE_WebSocket 417 | FE_WebSocket -->|Image Data| BE_WebSocket 418 | BE_WebSocket -->|Process Image| BE_Vision 419 | BE_Vision -->|Image Description| BE_VisionContext 420 | BE_VisionContext -->|Augmented Context| BE_ConversationManager 421 | 422 | %% Advanced Feature Paths 423 | 424 | %% 1. Interrupt System 425 | FE_Audio -->|Voice Activity| FE_InterruptDetector 426 | FE_InterruptDetector -->|Interrupt Signal| FE_WebSocket 427 | FE_WebSocket -->|Interrupt Command| BE_WebSocket 428 | BE_WebSocket -->|Cancel Processing| BE_InterruptHandler 429 | BE_InterruptHandler -.->|Stop Generation| BE_LLM 430 | BE_InterruptHandler -.->|Clear Buffer| BE_TTS 431 | BE_InterruptHandler -.->|Reset State| BE_ConversationManager 432 | 433 | %% 2. AI-Initiated Greetings 434 | BE_GreetingSystem -->|Initial Greeting| BE_ConversationManager 435 | BE_ConversationManager -->|Greeting Text| BE_LLM 436 | 437 | %% 3. Silence-based Follow-ups 438 | FE_SilenceDetector -->|Silence Detected| FE_WebSocket 439 | FE_WebSocket -->|Silence Notification| BE_WebSocket 440 | BE_WebSocket -->|Trigger Follow-up| BE_FollowUpSystem 441 | BE_FollowUpSystem -->|Generate Follow-up| BE_ConversationManager 442 | 443 | %% 4. 
Context Management 444 | BE_ConversationManager <-->|Store/Retrieve Context| BE_ContextMemory 445 | BE_SessionMgmt <-->|Save/Load Messages| BE_ContextMemory 446 | 447 | %% UI Interactions 448 | FE_State <-->|State Updates| FE_UI 449 | FE_WebSocketService -->|Connection Status| FE_State 450 | FE_AudioService -->|Audio Status| FE_State 451 | FE_InterruptDetector -->|Interrupt Status| FE_State 452 | FE_ImageUpload -->|Upload Status| FE_State 453 | 454 | %% Configuration 455 | BE_Config -->|Environment Settings| BE_Main 456 | BE_Config -->|API Settings| BE_LLM 457 | BE_Config -->|API Settings| BE_TTS 458 | BE_Config -->|Model Config| BE_Transcription 459 | BE_Config -->|Vision Settings| BE_Vision 460 | BE_Config -->|Storage Settings| BE_Storage 461 | BE_Config -->|Conversation Settings| BE_GreetingSystem 462 | BE_Config -->|Follow-up Settings| BE_FollowUpSystem 463 | 464 | %% UI Component Links 465 | FE_UI -->|Renders| UI_Orb 466 | UI_Orb -->|Visualises States| FE_State 467 | FE_UI -->|Renders| UI_Stars 468 | FE_UI -->|Renders| UI_Chat 469 | UI_Chat -->|Displays Transcript| FE_State 470 | FE_UI -->|Renders| UI_Prefs 471 | FE_UI -->|Renders| UI_Sidebar 472 | FE_UI -->|Renders| UI_Sessions 473 | UI_Sessions -->|Manages Sessions| FE_SessionUI 474 | 475 | %% Technology Labels 476 | classDef frontend fill:#61DAFB,color:#000,stroke:#61DAFB 477 | classDef backend fill:#009688,color:#fff,stroke:#009688 478 | classDef external fill:#FF9800,color:#000,stroke:#FF9800 479 | classDef feature fill:#E91E63,color:#fff,stroke:#E91E63 480 | classDef storage fill:#9C27B0,color:#fff,stroke:#9C27B0 481 | 482 | class FE_Audio,FE_WebSocket,FE_UI,FE_State,FE_AudioService,FE_WebSocketService,UI_Orb,UI_Stars,UI_Chat,UI_Prefs,UI_Sidebar,FE_ImageUpload,FE_SessionUI,UI_Sessions frontend 483 | class BE_Main,BE_Config,BE_WebSocket,BE_Transcription,BE_LLM,BE_TTS,BE_Vision,BE_Storage backend 484 | class LLM_API,TTS_API external 485 | class FE_InterruptDetector,FE_SilenceDetector,BE_InterruptHandler,BE_GreetingSystem,BE_FollowUpSystem,BE_ConversationManager,BE_ContextMemory,BE_VisionContext,BE_SessionMgmt feature 486 | class JSON_Files storage 487 | ``` 488 | 489 | ## Low-Latency TTS Streaming Architecture 490 | 491 | For achieving true low-latency in the speech system, we implement streaming TTS with chunked delivery and barge-in capability: 492 | 493 | ```mermaid 494 | sequenceDiagram 495 | participant Frontend 496 | participant AudioBuffer as Frontend Audio Buffer 497 | participant SilenceDetector as Frontend Silence Detector 498 | participant InterruptDetector as Frontend Interrupt Detector 499 | participant SessionMgr as Session Manager 500 | participant Backend as FastAPI Backend 501 | participant IntHandler as Backend Interrupt Handler 502 | participant Transcription as Speech Transcription & VAD 503 | participant VisionService as Vision Service (SmolVLM) 504 | participant StorageService as Conversation Storage 505 | participant LLM as LLM API (LM Studio) 506 | participant TTS as TTS API (Orpheus-FASTAPI) 507 | 508 | Note over Frontend,TTS: Normal Speech Flow 509 | 510 | Frontend->>Backend: Audio stream (chunks) 511 | Backend->>Transcription: Process audio 512 | Transcription->>Transcription: Voice activity detection 513 | Transcription->>Transcription: Speech-to-text 514 | Transcription->>Backend: Transcribed text 515 | Backend->>LLM: Text request with context 516 | activate LLM 517 | LLM-->>Backend: Text response (streaming) 518 | deactivate LLM 519 | Note over Backend: Begin TTS processing 520 | Backend->>TTS: 
Request TTS 521 | activate TTS 522 | 523 | %% Show parallel processing 524 | par Streaming audio playback 525 | TTS-->>Backend: Audio chunk 1 526 | Backend-->>Frontend: Audio chunk 1 527 | Frontend->>AudioBuffer: Queue chunk 528 | AudioBuffer->>Frontend: Begin playback 529 | 530 | TTS-->>Backend: Audio chunk 2 531 | Backend-->>Frontend: Audio chunk 2 532 | Frontend->>AudioBuffer: Queue chunk 533 | AudioBuffer->>Frontend: Continue playback 534 | 535 | TTS-->>Backend: Audio chunk n 536 | Backend-->>Frontend: Audio chunk n 537 | Frontend->>AudioBuffer: Queue chunk 538 | AudioBuffer->>Frontend: Continue playback 539 | end 540 | deactivate TTS 541 | 542 | Note over Frontend,TTS: Session Management Flow 543 | 544 | SessionMgr->>Backend: Save current session 545 | Backend->>StorageService: Store conversation 546 | StorageService-->>Backend: Session ID 547 | Backend-->>SessionMgr: Session saved confirmation 548 | 549 | SessionMgr->>Backend: Load specific session 550 | Backend->>StorageService: Retrieve session data 551 | StorageService-->>Backend: Conversation history 552 | Backend->>Backend: Restore conversation context 553 | Backend-->>SessionMgr: Session loaded confirmation 554 | 555 | Note over Frontend,TTS: Vision Processing Flow 556 | 557 | Frontend->>Backend: Upload image 558 | Backend->>VisionService: Process image 559 | activate VisionService 560 | VisionService-->>Backend: Image description 561 | deactivate VisionService 562 | Backend->>Backend: Add to conversation context 563 | Frontend->>Backend: Audio question about image 564 | Backend->>Transcription: Process audio 565 | Transcription->>Backend: Transcribed text 566 | Backend->>LLM: Text request with image context 567 | activate LLM 568 | LLM-->>Backend: Image-informed response 569 | deactivate LLM 570 | Backend->>TTS: Request TTS 571 | activate TTS 572 | TTS-->>Backend: Audio response 573 | Backend-->>Frontend: Stream audio response 574 | deactivate TTS 575 | 576 | Note over Frontend,TTS: Interrupt Flow (Barge-in) 577 | 578 | par Interrupt handling during speech 579 | Frontend->>InterruptDetector: User begins speaking 580 | InterruptDetector->>Frontend: Detect interrupt 581 | Frontend->>Backend: Send interrupt signal 582 | Backend->>IntHandler: Process interrupt 583 | 584 | IntHandler->>LLM: Cancel generation 585 | IntHandler->>TTS: Stop audio generation 586 | IntHandler->>Backend: Clear processing pipeline 587 | 588 | Backend->>Frontend: Stop audio signal 589 | Frontend->>AudioBuffer: Clear buffer 590 | AudioBuffer->>Frontend: Stop playback immediately 591 | end 592 | 593 | Note over Frontend,TTS: Silence Handling (AI Follow-ups) 594 | 595 | par AI-initiated follow-ups 596 | Frontend->>SilenceDetector: No user speech detected 597 | SilenceDetector->>Frontend: Silence timeout (3-5s) 598 | Frontend->>Backend: Silence notification 599 | Backend->>Backend: Generate follow-up 600 | Backend->>LLM: Request contextual follow-up 601 | activate LLM 602 | LLM-->>Backend: Follow-up response 603 | deactivate LLM 604 | Backend->>TTS: Convert to speech 605 | activate TTS 606 | TTS-->>Backend: Follow-up audio 607 | Backend-->>Frontend: Stream follow-up audio 608 | deactivate TTS 609 | Frontend->>AudioBuffer: Play follow-up 610 | end 611 | ``` 612 | 613 | ### Image Analysis Process 614 | 615 | Vocalis now includes visual understanding capabilities through the SmolVLM-256M-Instruct model: 616 | 617 | 1. 
**Image Upload**: 618 | - Users can click the vision button in the interface 619 | - A file picker allows selecting images up to 5MB 620 | - Images are encoded as base64 and sent to the backend 621 | 622 | 2. **Vision Processing**: 623 | - The SmolVLM model processes the image with transformers 624 | - The model generates a detailed description of the image contents 625 | - This description is added to the conversation context 626 | 627 | 3. **Contextual Continuation**: 628 | - After image processing, users can ask questions about the image 629 | - The system maintains awareness of the image context 630 | - Responses are generated with understanding of the visual content 631 | 632 | 4. **Multi-Modal Integration**: 633 | - The interface provides visual feedback during image processing 634 | - Transcripts and responses flow naturally between text and visual content 635 | - The conversation maintains coherence across modalities 636 | 637 | ### Streaming Architecture Features 638 | 639 | 1. **Parallel Processing**: 640 | - Simultaneous audio generation, transmission, and playback 641 | - Non-blocking pipeline for maximum responsiveness 642 | - Client-side buffer management with dynamic sizing 643 | 644 | 2. **Barge-in Capability**: 645 | - Real-time voice activity detection during AI speech 646 | - Multi-level interrupt system with priority handling 647 | - Immediate pipeline clearing for zero-latency response to interruptions 648 | 649 | 3. **Audio Buffer Management**: 650 | - Adaptive buffer sizes based on network conditions (20-50ms chunks) 651 | - Buffer health monitoring with automatic adjustments 652 | - Efficient audio format selection (Opus for compression, PCM for quality) 653 | 654 | 4. **Silence Response System**: 655 | - Time-based silence detection with configurable thresholds 656 | - Context-aware follow-up generation 657 | - Natural cadence for conversation flow maintenance 658 | 659 | ### Implementation Details: 660 | 661 | 1. **Backend TTS Integration**: 662 | - Configure TTS API with streaming support if available 663 | - Implement custom chunking if necessary 664 | 665 | 2. **Custom Streaming Implementation**: 666 | - Set up an async generator in FastAPI 667 | - Split audio into small chunks (10-50ms) 668 | - Send each chunk immediately through WebSocket 669 | 670 | 3. **WebSocket Protocol Enhancement**: 671 | - Add message types for different audio events: 672 | - `audio_chunk`: A piece of TTS audio to play immediately 673 | - `audio_start`: Signal to prepare audio context 674 | - `audio_end`: Signal that the complete utterance is finished 675 | 676 | 4. **Frontend Audio Handling**: 677 | - Use Web Audio API for low-latency playback 678 | - Implement buffer queue system for smooth playback 679 | 680 | ### Technical Considerations: 681 | 682 | 1. **Chunk Size Tuning**: 683 | - Find optimal balance between network overhead and latency 684 | 685 | 2. **Buffer Management**: 686 | - Avoid buffer underrun and excessive buffering 687 | 688 | 3. **Format Efficiency**: 689 | - Use efficient audio formats for streaming (Opus, WebM, or raw PCM) 690 | 691 | 4. **Abort Capability**: 692 | - Implement clean interruption for new user input 693 | 694 | ## Buffer Management Approach 695 | 696 | ### 1. Adaptive Buffer Sizing 697 | - Start with small buffers (20-30ms) 698 | - Monitor playback stability 699 | - Dynamically adjust buffer size based on network conditions 700 | 701 | ### 2. 
Parallel Processing Pipeline 702 | - Process audio in parallel streams where possible 703 | - Begin TTS playback as soon as first chunk is available 704 | - Continue processing subsequent chunks during playback 705 | 706 | ### 3. Interrupt Handling 707 | - Implement a "barge-in" capability where new user speech cancels ongoing TTS 708 | - Clear audio buffers immediately on interruption 709 | 710 | ## Latency Optimisation 711 | 712 | Vocalis achieves exceptional low-latency performance through carefully optimised components: 713 | 714 | ### Speech Recognition Performance 715 | 716 | The system uses Faster-Whisper with the `base.en` model and a beam size of 2, striking an optimal balance between accuracy and speed. This configuration achieves: 717 | 718 | - **ASR Processing**: ~0.43 seconds for typical utterances 719 | - **Response Generation**: ~0.18 seconds 720 | - **Total Round-Trip Latency**: ~0.61 seconds 721 | 722 | Real-world example from system logs: 723 | ``` 724 | INFO:faster_whisper:Processing audio with duration 00:02.229 725 | INFO:backend.services.transcription:Transcription completed in 0.51s: Hi, how are you doing today?... 726 | INFO:backend.services.tts:Sending TTS request with 147 characters of text 727 | INFO:backend.services.tts:Received TTS response after 0.16s, size: 390102 bytes 728 | ``` 729 | 730 | ### Customising Performance 731 | 732 | You can adjust these settings to optimise for your specific needs: 733 | 734 | 1. **Model Size**: In `.env`, modify `WHISPER_MODEL=base.en` 735 | - Options: tiny.en, base.en, small.en, medium.en, large 736 | - Smaller models = faster processing, potentially lower accuracy 737 | - Larger models = more accurate, but increased latency 738 | 739 | 2. **Beam Size**: In `backend/services/transcription.py`, modify the `beam_size` parameter 740 | - Default: 2 741 | - Range: 1-5 (1 = fastest, 5 = most accurate) 742 | - Located in the `__init__` method of the `WhisperTranscriber` class 743 | 744 | ### Latency vs. Accuracy Trade-offs 745 | 746 | | Model | Beam Size | Approximate ASR Time | Accuracy | 747 | |------|-----------|---------------------|----------| 748 | | tiny.en | 1 | ~0.01s | Lower | 749 | | base.en | 2 | ~0.03s | Good | 750 | | small.en | 3 | ~0.10s | Better | 751 | | medium.en | 4 | ~0.25s | Very Good | 752 | | large | 5 | ~0.50s | Best | 753 | 754 | With optimisations in place, Vocalis can achieve total processing latencies well under 250ms when using smaller models, which is typically perceived as "immediate" by users. 
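
The snippet below is a minimal sketch of how those two settings map onto the faster-whisper API; it is not the actual `WhisperTranscriber` (which also handles VAD and streaming buffers), and the class and variable names here are illustrative only:

```python
from faster_whisper import WhisperModel


class MinimalTranscriber:
    """Illustrative stand-in for backend/services/transcription.py."""

    def __init__(self, model_size: str = "base.en", beam_size: int = 2):
        # model_size corresponds to WHISPER_MODEL in .env; beam_size is the
        # speed/accuracy knob discussed above (1 = fastest, 5 = most accurate).
        self.model = WhisperModel(model_size, device="auto", compute_type="default")
        self.beam_size = beam_size

    def transcribe(self, audio_path: str) -> str:
        # faster-whisper returns a generator of segments plus transcription info
        segments, _info = self.model.transcribe(audio_path, beam_size=self.beam_size)
        return " ".join(segment.text.strip() for segment in segments)


# Favour raw speed over accuracy:
fast_transcriber = MinimalTranscriber(model_size="tiny.en", beam_size=1)
```

Dropping to `tiny.en` with `beam_size=1` mirrors the fastest row of the table above, at the cost of some accuracy.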
755 | 756 | ## Project Structure 757 | 758 | ``` 759 | Vocalis/ 760 | ├── README.md 761 | ├── setup.bat # Windows one-time setup script 762 | ├── run.bat # Windows run script 763 | ├── install-deps.bat # Windows dependency update script 764 | ├── setup.sh # Unix one-time setup script 765 | ├── run.sh # Unix run script 766 | ├── install-deps.sh # Unix dependency update script 767 | ├── conversations/ # Directory for saved session files 768 | ├── backend/ 769 | │ ├── .env 770 | │ ├── main.py 771 | │ ├── config.py 772 | │ ├── requirements.txt 773 | │ ├── services/ 774 | │ │ ├── __init__.py 775 | │ │ ├── conversation_storage.py 776 | │ │ ├── llm.py 777 | │ │ ├── transcription.py # Includes VAD functionality 778 | │ │ ├── tts.py 779 | │ │ ├── vision.py 780 | │ ├── routes/ 781 | │ │ ├── __init__.py 782 | │ │ ├── websocket.py 783 | ├── frontend/ 784 | │ ├── public/ 785 | │ ├── src/ 786 | │ │ ├── components/ 787 | │ │ │ ├── AssistantOrb.tsx 788 | │ │ │ ├── BackgroundStars.tsx 789 | │ │ │ ├── ChatInterface.tsx 790 | │ │ │ ├── PreferencesModal.tsx 791 | │ │ │ ├── SessionManager.tsx 792 | │ │ │ ├── Sidebar.tsx 793 | │ │ ├── services/ 794 | │ │ │ ├── audio.ts 795 | │ │ │ ├── websocket.ts 796 | │ │ ├── utils/ 797 | │ │ │ ├── hooks.ts 798 | │ │ ├── App.tsx 799 | │ │ ├── main.tsx 800 | │ │ ├── index.css 801 | │ │ ├── vite-env.d.ts 802 | │ ├── package.json 803 | │ ├── tsconfig.json 804 | │ ├── tsconfig.node.json 805 | │ ├── vite.config.ts 806 | │ ├── tailwind.config.js 807 | │ ├── postcss.config.js 808 | ``` 809 | 810 | ## Dependencies 811 | 812 | ### Backend (Python) 813 | ``` 814 | fastapi==0.109.2 815 | uvicorn==0.27.1 816 | python-dotenv==1.0.1 817 | websockets==12.0 818 | numpy==1.26.4 819 | transformers 820 | faster-whisper==1.1.1 821 | requests==2.31.0 822 | python-multipart==0.0.9 823 | torch==2.0.1 824 | ctranslate2==3.10.0 825 | ffmpeg-python==0.2.0 826 | ``` 827 | 828 | ### Frontend 829 | ``` 830 | react 831 | typescript 832 | tailwindcss 833 | lucide-react 834 | websocket 835 | web-audio-api 836 | ``` 837 | 838 | ## Technical Decisions 839 | 840 | - **Audio Format**: Web Audio API (44.1kHz, 16-bit PCM) 841 | - **Browser Compatibility**: Targeting modern Chrome browsers 842 | - **Error Handling**: Graceful degradation with user-friendly messages 843 | - **Microphone Permissions**: Standard browser permission flow with clear guidance 844 | - **Conversation Model**: Multi-turn with context preservation 845 | - **State Management**: React hooks with custom state machine 846 | - **Animation System**: CSS transitions with hardware acceleration 847 | - **Vision Processing**: SmolVLM-256M-Instruct for efficient image understanding 848 | - **Session Storage**: Asynchronous JSON file-based persistence with UUID identifiers 849 | 850 | ## License 851 | 852 | This project is licensed under the Apache License 2.0 - see the LICENSE file for details. 
-------------------------------------------------------------------------------- /backend/.env: -------------------------------------------------------------------------------- 1 | # Vocalis backend configuration 2 | 3 | # API Endpoints 4 | LLM_API_ENDPOINT=http://127.0.0.1:1234/v1/chat/completions # Place your local LLM API endpoint here (default is LM Studio) 5 | TTS_API_ENDPOINT=http://localhost:5005/v1/audio/speech # Place your local TTS API endpoint here (default is the Orpheus-FASTAPI native python launcher) - If you're using the Orpheus-FASTAPI Docker container instead of the native python launcher, replace "localhost" with "127.0.0.1" (keeping the :5005 port) 6 | 7 | # Whisper Model Configuration 8 | WHISPER_MODEL=base.en # Options: tiny.en, base.en, small.en, medium.en, large 9 | 10 | # TTS Configuration 11 | TTS_MODEL=tts-1 12 | TTS_VOICE=tara 13 | TTS_FORMAT=wav # Format for TTS output (wav, mp3, opus, flac) 14 | 15 | # WebSocket Server Configuration 16 | WEBSOCKET_HOST=0.0.0.0 17 | WEBSOCKET_PORT=8000 18 | 19 | # Audio Processing 20 | VAD_THRESHOLD=0.1 # Voice activity detection threshold (0.0-1.0) 21 | VAD_BUFFER_SIZE=30 # Buffer size in milliseconds 22 | AUDIO_SAMPLE_RATE=44100 # Sample rate in Hz 23 | -------------------------------------------------------------------------------- /backend/__init__.py: -------------------------------------------------------------------------------- 1 | # Backend package initialization 2 | # This file makes the 'backend' directory a Python package 3 | -------------------------------------------------------------------------------- /backend/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Vocalis Configuration Module 3 | 4 | Loads and provides access to configuration settings from environment variables 5 | and the .env file. 6 | """ 7 | 8 | import os 9 | from dotenv import load_dotenv 10 | from typing import Dict, Any 11 | 12 | # Load environment variables from .env file 13 | load_dotenv() 14 | 15 | # API Endpoints 16 | LLM_API_ENDPOINT = os.getenv("LLM_API_ENDPOINT", "http://127.0.0.1:1234/v1/chat/completions") 17 | TTS_API_ENDPOINT = os.getenv("TTS_API_ENDPOINT", "http://localhost:5005/v1/audio/speech") 18 | 19 | # Whisper Model Configuration 20 | WHISPER_MODEL = os.getenv("WHISPER_MODEL", "tiny.en") 21 | 22 | # TTS Configuration 23 | TTS_MODEL = os.getenv("TTS_MODEL", "tts-1") 24 | TTS_VOICE = os.getenv("TTS_VOICE", "tara") 25 | TTS_FORMAT = os.getenv("TTS_FORMAT", "wav") 26 | 27 | # WebSocket Server Configuration 28 | WEBSOCKET_HOST = os.getenv("WEBSOCKET_HOST", "0.0.0.0") 29 | WEBSOCKET_PORT = int(os.getenv("WEBSOCKET_PORT", 8000)) 30 | 31 | # Audio Processing 32 | VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", 0.5)) 33 | VAD_BUFFER_SIZE = int(os.getenv("VAD_BUFFER_SIZE", 30)) 34 | AUDIO_SAMPLE_RATE = int(os.getenv("AUDIO_SAMPLE_RATE", 48000)) 35 | 36 | def get_config() -> Dict[str, Any]: 37 | """ 38 | Returns all configuration settings as a dictionary. 
39 | 40 | Returns: 41 | Dict[str, Any]: Dictionary containing all configuration settings 42 | """ 43 | return { 44 | "llm_api_endpoint": LLM_API_ENDPOINT, 45 | "tts_api_endpoint": TTS_API_ENDPOINT, 46 | "whisper_model": WHISPER_MODEL, 47 | "tts_model": TTS_MODEL, 48 | "tts_voice": TTS_VOICE, 49 | "tts_format": TTS_FORMAT, 50 | "websocket_host": WEBSOCKET_HOST, 51 | "websocket_port": WEBSOCKET_PORT, 52 | "vad_threshold": VAD_THRESHOLD, 53 | "vad_buffer_size": VAD_BUFFER_SIZE, 54 | "audio_sample_rate": AUDIO_SAMPLE_RATE, 55 | } 56 | -------------------------------------------------------------------------------- /backend/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Vocalis Backend Server 3 | 4 | FastAPI application entry point. 5 | """ 6 | 7 | import logging 8 | import uvicorn 9 | from fastapi import FastAPI, WebSocket, Depends, HTTPException 10 | from fastapi.middleware.cors import CORSMiddleware 11 | from contextlib import asynccontextmanager 12 | 13 | # Import configuration 14 | from . import config 15 | 16 | # Import services 17 | from .services.transcription import WhisperTranscriber 18 | from .services.llm import LLMClient 19 | from .services.tts import TTSClient 20 | from .services.vision import vision_service 21 | 22 | # Import routes 23 | from .routes.websocket import websocket_endpoint 24 | 25 | # Configure logging 26 | logging.basicConfig( 27 | level=logging.INFO, 28 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 29 | ) 30 | logger = logging.getLogger(__name__) 31 | 32 | # Global service instances 33 | transcription_service = None 34 | llm_service = None 35 | tts_service = None 36 | # Vision service is a singleton already initialized in its module 37 | 38 | @asynccontextmanager 39 | async def lifespan(app: FastAPI): 40 | """ 41 | Startup and shutdown events for the FastAPI application. 
42 | """ 43 | # Load configuration 44 | cfg = config.get_config() 45 | 46 | # Initialize services on startup 47 | logger.info("Initializing services...") 48 | 49 | global transcription_service, llm_service, tts_service 50 | 51 | # Initialize transcription service 52 | transcription_service = WhisperTranscriber( 53 | model_size=cfg["whisper_model"], 54 | sample_rate=cfg["audio_sample_rate"] 55 | ) 56 | 57 | # Initialize LLM service 58 | llm_service = LLMClient( 59 | api_endpoint=cfg["llm_api_endpoint"] 60 | ) 61 | 62 | # Initialize TTS service 63 | tts_service = TTSClient( 64 | api_endpoint=cfg["tts_api_endpoint"], 65 | model=cfg["tts_model"], 66 | voice=cfg["tts_voice"], 67 | output_format=cfg["tts_format"] 68 | ) 69 | 70 | # Initialize vision service (will download model if not cached) 71 | logger.info("Initializing vision service...") 72 | vision_service.initialize() 73 | 74 | logger.info("All services initialized successfully") 75 | 76 | yield 77 | 78 | # Cleanup on shutdown 79 | logger.info("Shutting down services...") 80 | 81 | # No specific cleanup needed for these services, 82 | # but we could add resource release code here if needed (maybe in a future release lex 31/03/25) 83 | 84 | logger.info("Shutdown complete") 85 | 86 | # Create FastAPI application 87 | app = FastAPI( 88 | title="Vocalis Backend", 89 | description="Speech-to-Speech AI Assistant Backend", 90 | version="0.1.0", 91 | lifespan=lifespan 92 | ) 93 | 94 | # Configure CORS 95 | app.add_middleware( 96 | CORSMiddleware, 97 | allow_origins=["*"], # Allow all origins for development 98 | allow_credentials=True, 99 | allow_methods=["*"], 100 | allow_headers=["*"], 101 | ) 102 | 103 | # Service dependency functions 104 | def get_transcription_service(): 105 | return transcription_service 106 | 107 | def get_llm_service(): 108 | return llm_service 109 | 110 | def get_tts_service(): 111 | return tts_service 112 | 113 | # API routes 114 | @app.get("/") 115 | async def root(): 116 | """Root endpoint for health check.""" 117 | return {"status": "ok", "message": "Vocalis backend is running"} 118 | 119 | @app.get("/health") 120 | async def health_check(): 121 | """Health check endpoint.""" 122 | return { 123 | "status": "ok", 124 | "services": { 125 | "transcription": transcription_service is not None, 126 | "llm": llm_service is not None, 127 | "tts": tts_service is not None, 128 | "vision": vision_service.is_ready() 129 | }, 130 | "config": { 131 | "whisper_model": config.WHISPER_MODEL, 132 | "tts_voice": config.TTS_VOICE, 133 | "websocket_port": config.WEBSOCKET_PORT 134 | } 135 | } 136 | 137 | @app.get("/config") 138 | async def get_full_config(): 139 | """Get full configuration.""" 140 | if not all([transcription_service, llm_service, tts_service]) or not vision_service.is_ready(): 141 | raise HTTPException(status_code=503, detail="Services not initialized") 142 | 143 | return { 144 | "transcription": transcription_service.get_config(), 145 | "llm": llm_service.get_config(), 146 | "tts": tts_service.get_config(), 147 | "system": config.get_config() 148 | } 149 | 150 | # WebSocket route 151 | @app.websocket("/ws") 152 | async def websocket_route(websocket: WebSocket): 153 | """WebSocket endpoint for bidirectional audio streaming.""" 154 | await websocket_endpoint( 155 | websocket, 156 | transcription_service, 157 | llm_service, 158 | tts_service 159 | ) 160 | 161 | # Run server directly if executed as script 162 | if __name__ == "__main__": 163 | uvicorn.run( 164 | "backend.main:app", 165 | host=config.WEBSOCKET_HOST, 166 | 
port=config.WEBSOCKET_PORT, 167 | reload=True 168 | ) 169 | -------------------------------------------------------------------------------- /backend/prompts/system_prompt.md: -------------------------------------------------------------------------------- 1 | You are a helpful, friendly, and concise voice assistant. 2 | Respond to user queries in a natural, conversational manner. 3 | Keep responses brief and to the point, as you're communicating via voice. 4 | When providing information, focus on the most relevant details. 5 | If you don't know something, admit it rather than making up an answer. 6 | 7 | Through the webapp, you can receive and understand photographs and pictures. 8 | 9 | When the user sends a message like '[silent]', '[no response]', or '[still waiting]', it means they've gone quiet or haven't responded. When you see these signals, continue the conversation naturally based on the previous topic and context. Stay on topic, be helpful, and don't mention that they were silent - just carry on the conversation as if you're gently following up. 10 | -------------------------------------------------------------------------------- /backend/prompts/user_profile.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "", 3 | "preferences": {} 4 | } 5 | -------------------------------------------------------------------------------- /backend/prompts/vision_settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "enabled": false 3 | } 4 | -------------------------------------------------------------------------------- /backend/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.109.2 2 | uvicorn==0.27.1 3 | python-dotenv==1.0.1 4 | websockets==12.0 5 | numpy==1.26.4 6 | faster-whisper==1.1.1 7 | requests==2.31.0 8 | python-multipart==0.0.9 9 | torch>=2.0.1 10 | ffmpeg-python==0.2.0 11 | transformers>=4.31.0 12 | -------------------------------------------------------------------------------- /backend/routes/__init__.py: -------------------------------------------------------------------------------- 1 | # Routes package initialization 2 | # This file makes the 'routes' directory a Python package 3 | -------------------------------------------------------------------------------- /backend/services/__init__.py: -------------------------------------------------------------------------------- 1 | # Services package initialization 2 | # This file makes the 'services' directory a Python package 3 | -------------------------------------------------------------------------------- /backend/services/conversation_storage.py: -------------------------------------------------------------------------------- 1 | """ 2 | Conversation Storage Service 3 | 4 | Handles saving and loading conversation sessions to/from JSON files. 5 | """ 6 | 7 | import os 8 | import json 9 | import uuid 10 | import logging 11 | import asyncio # Import asyncio 12 | from typing import Dict, List, Optional, Any 13 | from datetime import datetime 14 | 15 | # Configure logging 16 | logging.basicConfig(level=logging.INFO) 17 | logger = logging.getLogger(__name__) 18 | 19 | class ConversationStorage: 20 | """ 21 | Service for storing and retrieving conversation sessions. 22 | """ 23 | 24 | def __init__(self, storage_dir: str = "conversations"): 25 | """ 26 | Initialize the conversation storage service. 
27 | 28 | Args: 29 | storage_dir: Directory to store conversation files 30 | """ 31 | self.storage_dir = storage_dir 32 | os.makedirs(self.storage_dir, exist_ok=True) 33 | logger.info(f"Initialized ConversationStorage with directory: {storage_dir}") 34 | 35 | async def save_session(self, messages: List[Dict], 36 | title: Optional[str] = None, 37 | session_id: Optional[str] = None, 38 | metadata: Optional[Dict[str, Any]] = None) -> str: 39 | """ 40 | Save a conversation session to a JSON file. 41 | 42 | Args: 43 | messages: List of conversation messages 44 | title: Optional title for the conversation (auto-generated if None) 45 | session_id: Optional ID for the session (new UUID if None) 46 | metadata: Optional metadata to store with the session 47 | 48 | Returns: 49 | str: The session ID 50 | """ 51 | # Generate ID if not provided 52 | if not session_id: 53 | session_id = str(uuid.uuid4()) 54 | 55 | # Generate title if not provided (from first user message or timestamp) 56 | if not title: 57 | # Try to find first user message 58 | for msg in messages: 59 | if msg.get('role') == 'user' and msg.get('content', '').strip(): 60 | # Use first ~30 chars of first user message 61 | title = msg['content'][:30] + ('...' if len(msg['content']) > 30 else '') 62 | break 63 | 64 | # Fallback to timestamp if no user messages found 65 | if not title: 66 | title = f"Conversation {datetime.now().strftime('%Y-%m-%d %H:%M')}" 67 | 68 | # Prepare session data 69 | now = datetime.now().isoformat() 70 | session = { 71 | "id": session_id, 72 | "title": title, 73 | "created_at": now, 74 | "updated_at": now, 75 | "messages": messages, 76 | "metadata": metadata or {} 77 | } 78 | 79 | # Define the synchronous file writing part 80 | def _write_file(): 81 | file_path = os.path.join(self.storage_dir, f"{session_id}.json") 82 | # Check if file exists to determine if created_at should be preserved 83 | created_time = now # Default to current time 84 | if os.path.exists(file_path): 85 | try: 86 | with open(file_path, 'r', encoding='utf-8') as f_read: 87 | existing_data = json.load(f_read) 88 | created_time = existing_data.get("created_at", now) # Use existing if found 89 | except Exception as read_err: 90 | logger.warning(f"Could not read existing session {session_id} to preserve created_at: {read_err}") 91 | pass # Ignore errors reading existing, just use 'now' 92 | 93 | session["created_at"] = created_time # Preserve original creation time or use current if new/error 94 | 95 | # Ensure directory exists (this is quick, maybe okay sync) 96 | # os.makedirs(os.path.dirname(file_path), exist_ok=True) # Already done in __init__ 97 | with open(file_path, 'w', encoding='utf-8') as f: 98 | json.dump(session, f, indent=2, ensure_ascii=False) 99 | 100 | # Run the synchronous file writing in a separate thread 101 | try: 102 | await asyncio.to_thread(_write_file) # Now _write_file is defined 103 | logger.info(f"Saved conversation session (async): {session_id}") 104 | return session_id 105 | except Exception as e: 106 | logger.error(f"Error writing session file {session_id}: {e}") 107 | raise # Re-raise the exception to be handled upstream 108 | 109 | async def load_session(self, session_id: str) -> Optional[Dict]: 110 | """ 111 | Load a conversation session from a JSON file. 
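        Like save_session above, the blocking JSON read here is pushed off the event loop with asyncio.to_thread. A minimal standalone sketch of that pattern, with "conversations/example.json" as a placeholder path:

            import asyncio
            import json

            def _read(path: str) -> dict:
                # Plain blocking file I/O, safe to run in a worker thread
                with open(path, "r", encoding="utf-8") as f:
                    return json.load(f)

            async def main() -> None:
                data = await asyncio.to_thread(_read, "conversations/example.json")
                print(data.get("title"))

            asyncio.run(main())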
112 | 113 | Args: 114 | session_id: ID of the session to load 115 | 116 | Returns: 117 | Optional[Dict]: The session data, or None if not found 118 | """ 119 | file_path = os.path.join(self.storage_dir, f"{session_id}.json") 120 | def _read_file(): 121 | if os.path.exists(file_path): 122 | with open(file_path, 'r', encoding='utf-8') as f: 123 | return json.load(f) 124 | return None 125 | 126 | try: 127 | session = await asyncio.to_thread(_read_file) 128 | if session: 129 | logger.info(f"Loaded conversation session (async): {session_id}") 130 | return session 131 | else: 132 | logger.warning(f"Session not found (async): {session_id}") 133 | return None 134 | except Exception as e: 135 | logger.error(f"Error loading session {session_id} (async): {e}") 136 | return None 137 | 138 | async def list_sessions(self) -> List[Dict]: 139 | """ 140 | List all available conversation sessions. 141 | 142 | Returns: 143 | List[Dict]: List of session metadata 144 | """ 145 | sessions = [] 146 | def _read_dir_and_files(): 147 | session_list = [] 148 | try: 149 | filenames = os.listdir(self.storage_dir) 150 | except Exception as e: 151 | logger.error(f"Error listing directory {self.storage_dir}: {e}") 152 | return [] # Return empty list if directory listing fails 153 | 154 | for filename in filenames: 155 | if filename.endswith('.json'): 156 | try: 157 | file_path = os.path.join(self.storage_dir, filename) 158 | with open(file_path, 'r', encoding='utf-8') as f: 159 | session_data = json.load(f) 160 | 161 | # Include only metadata for listing 162 | session_list.append({ 163 | "id": session_data.get("id"), 164 | "title": session_data.get("title"), 165 | "created_at": session_data.get("created_at"), 166 | "updated_at": session_data.get("updated_at"), 167 | "metadata": session_data.get("metadata", {}) 168 | }) 169 | except Exception as e: 170 | logger.error(f"Error loading session list item from {filename}: {e}") 171 | return session_list 172 | try: 173 | sessions = await asyncio.to_thread(_read_dir_and_files) 174 | # Sort by most recent first 175 | sessions.sort(key=lambda s: s.get("updated_at", ""), reverse=True) 176 | return sessions 177 | except Exception as e: 178 | logger.error(f"Error listing sessions (async): {e}") 179 | return [] 180 | 181 | async def delete_session(self, session_id: str) -> bool: 182 | """ 183 | Delete a conversation session. 184 | 185 | Args: 186 | session_id: ID of the session to delete 187 | 188 | Returns: 189 | bool: True if deleted successfully, False otherwise 190 | """ 191 | file_path = os.path.join(self.storage_dir, f"{session_id}.json") 192 | def _remove_file(): 193 | if os.path.exists(file_path): 194 | os.remove(file_path) 195 | return True 196 | return False 197 | 198 | try: 199 | deleted = await asyncio.to_thread(_remove_file) 200 | if deleted: 201 | logger.info(f"Deleted conversation session (async): {session_id}") 202 | return True 203 | else: 204 | logger.warning(f"Session not found for deletion (async): {session_id}") 205 | return False 206 | except Exception as e: 207 | logger.error(f"Error deleting session {session_id} (async): {e}") 208 | return False 209 | -------------------------------------------------------------------------------- /backend/services/llm.py: -------------------------------------------------------------------------------- 1 | """ 2 | LLM Service 3 | 4 | Handles communication with the local LLM API endpoint. 
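The endpoint is expected to speak the OpenAI chat-completions format. A rough sketch of the request this client builds and the field it reads back (the URL, model name and message contents are placeholders):

    import requests

    payload = {
        "model": "default",
        "messages": [
            {"role": "system", "content": "You are a concise voice assistant."},
            {"role": "user", "content": "Hello"},
        ],
        "temperature": 0.7,
        "max_tokens": 2048,
    }
    response = requests.post(
        "http://127.0.0.1:1234/v1/chat/completions", json=payload, timeout=60
    )
    text = response.json()["choices"][0]["message"]["content"]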
5 | """ 6 | 7 | import json 8 | import requests 9 | import logging 10 | from typing import Dict, Any, List, Optional 11 | 12 | # Configure logging 13 | logging.basicConfig(level=logging.INFO) 14 | logger = logging.getLogger(__name__) 15 | 16 | class LLMClient: 17 | """ 18 | Client for communicating with a local LLM API. 19 | 20 | This class handles requests to a locally hosted LLM API that follows 21 | the OpenAI API format. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | api_endpoint: str = "http://127.0.0.1:1234/v1/chat/completions", 27 | model: str = "default", 28 | temperature: float = 0.7, 29 | max_tokens: int = 2048, 30 | timeout: int = 60 31 | ): 32 | """ 33 | Initialize the LLM client. 34 | 35 | Args: 36 | api_endpoint: URL of the local LLM API 37 | model: Model name to use (or 'default' for API default) 38 | temperature: Sampling temperature (0.0 to 1.0) 39 | max_tokens: Maximum tokens to generate 40 | timeout: Request timeout in seconds 41 | """ 42 | self.api_endpoint = api_endpoint 43 | self.model = model 44 | self.temperature = temperature 45 | self.max_tokens = max_tokens 46 | self.timeout = timeout 47 | 48 | # State tracking 49 | self.is_processing = False 50 | self.conversation_history = [] 51 | 52 | logger.info(f"Initialized LLM Client with endpoint={api_endpoint}") 53 | 54 | def add_to_history(self, role: str, content: str) -> None: 55 | """ 56 | Add a message to the conversation history. 57 | 58 | Args: 59 | role: Message role ('system', 'user', or 'assistant') 60 | content: Message content 61 | """ 62 | self.conversation_history.append({ 63 | "role": role, 64 | "content": content 65 | }) 66 | 67 | # Allow deeper history for models with large context windows 68 | if len(self.conversation_history) > 50: 69 | # Always keep the system message if it exists 70 | if self.conversation_history[0]["role"] == "system": 71 | self.conversation_history = ( 72 | [self.conversation_history[0]] + 73 | self.conversation_history[-49:] 74 | ) 75 | else: 76 | self.conversation_history = self.conversation_history[-50:] 77 | 78 | def get_response(self, user_input: str, system_prompt: Optional[str] = None, 79 | add_to_history: bool = True, temperature: Optional[float] = None) -> Dict[str, Any]: 80 | """ 81 | Get a response from the LLM for the given user input. 
82 | 83 | Args: 84 | user_input: User's text input 85 | system_prompt: Optional system prompt to set context 86 | add_to_history: Whether to add this exchange to conversation history 87 | temperature: Optional temperature override (0.0 to 1.0) 88 | 89 | Returns: 90 | Dictionary containing the LLM response and metadata 91 | """ 92 | self.is_processing = True 93 | start_time = logging.Formatter.converter() 94 | 95 | try: 96 | # Prepare messages 97 | messages = [] 98 | 99 | # Add system prompt if provided and not already in history 100 | if system_prompt: 101 | messages.append({ 102 | "role": "system", 103 | "content": system_prompt 104 | }) 105 | 106 | # Add user input to history if it's not empty and add_to_history is True 107 | if user_input.strip() and add_to_history: 108 | self.add_to_history("user", user_input) 109 | 110 | # Add conversation history (which now includes the user input if add_to_history=True) 111 | messages.extend(self.conversation_history) 112 | 113 | # Only add user input directly if not adding to history 114 | # This ensures special cases (greetings/followups) work while preventing duplication for normal speech 115 | if user_input.strip() and not add_to_history: 116 | messages.append({ 117 | "role": "user", 118 | "content": user_input 119 | }) 120 | 121 | # Prepare request payload with custom temperature if provided 122 | payload = { 123 | "model": self.model if self.model != "default" else None, 124 | "messages": messages, 125 | "temperature": temperature if temperature is not None else self.temperature, 126 | "max_tokens": self.max_tokens 127 | } 128 | 129 | # Remove None values 130 | payload = {k: v for k, v in payload.items() if v is not None} 131 | 132 | # Log the full payload (truncated for readability) 133 | payload_str = json.dumps(payload) 134 | logger.info(f"Sending request to LLM API with {len(messages)} messages") 135 | 136 | # Add more detailed logging to help debug message duplication 137 | message_roles = [msg["role"] for msg in messages] 138 | user_message_count = message_roles.count("user") 139 | logger.info(f"Message roles: {message_roles}, user messages: {user_message_count}") 140 | 141 | if len(payload_str) > 500: 142 | logger.debug(f"Payload (truncated): {payload_str[:500]}...") 143 | else: 144 | logger.debug(f"Payload: {payload_str}") 145 | 146 | # Send request to LLM API 147 | response = requests.post( 148 | self.api_endpoint, 149 | json=payload, 150 | timeout=self.timeout 151 | ) 152 | 153 | # Check if request was successful 154 | response.raise_for_status() 155 | 156 | # Parse response 157 | result = response.json() 158 | 159 | # Extract assistant response 160 | assistant_message = result.get("choices", [{}])[0].get("message", {}).get("content", "") 161 | 162 | # Add assistant response to history (only if we added the user input) 163 | if assistant_message and add_to_history: 164 | self.add_to_history("assistant", assistant_message) 165 | 166 | # Calculate processing time 167 | end_time = logging.Formatter.converter() 168 | processing_time = end_time[0] - start_time[0] 169 | 170 | logger.info(f"Received response from LLM API after {processing_time:.2f}s") 171 | 172 | return { 173 | "text": assistant_message, 174 | "processing_time": processing_time, 175 | "finish_reason": result.get("choices", [{}])[0].get("finish_reason"), 176 | "model": result.get("model", "unknown") 177 | } 178 | 179 | except requests.RequestException as e: 180 | logger.error(f"LLM API request error: {e}") 181 | error_response = f"I'm sorry, I encountered a problem 
connecting to my language model. {str(e)}" 182 | 183 | # Add the error to history if requested and clear history on 400 errors 184 | # to prevent the same error from happening repeatedly 185 | if add_to_history: 186 | self.add_to_history("assistant", error_response) 187 | 188 | # If we get a 400 Bad Request, the context might be corrupt 189 | if isinstance(e, requests.exceptions.HTTPError) and e.response.status_code == 400: 190 | logger.warning("Received 400 error, clearing conversation history to recover") 191 | # Keep only system prompt if it exists 192 | self.clear_history(keep_system_prompt=True) 193 | 194 | return { 195 | "text": error_response, 196 | "error": str(e) 197 | } 198 | except Exception as e: 199 | logger.error(f"LLM processing error: {e}") 200 | error_response = "I'm sorry, I encountered an unexpected error. Please try again." 201 | self.add_to_history("assistant", error_response) 202 | return { 203 | "text": error_response, 204 | "error": str(e) 205 | } 206 | finally: 207 | self.is_processing = False 208 | 209 | def clear_history(self, keep_system_prompt: bool = True) -> None: 210 | """ 211 | Clear conversation history. 212 | 213 | Args: 214 | keep_system_prompt: Whether to keep the system prompt if it exists 215 | """ 216 | if keep_system_prompt and self.conversation_history and self.conversation_history[0]["role"] == "system": 217 | self.conversation_history = [self.conversation_history[0]] 218 | else: 219 | self.conversation_history = [] 220 | 221 | def get_config(self) -> Dict[str, Any]: 222 | """ 223 | Get the current configuration. 224 | 225 | Returns: 226 | Dict containing the current configuration 227 | """ 228 | return { 229 | "api_endpoint": self.api_endpoint, 230 | "model": self.model, 231 | "temperature": self.temperature, 232 | "max_tokens": self.max_tokens, 233 | "timeout": self.timeout, 234 | "is_processing": self.is_processing, 235 | "history_length": len(self.conversation_history) 236 | } 237 | -------------------------------------------------------------------------------- /backend/services/transcription.py: -------------------------------------------------------------------------------- 1 | """ 2 | Speech-to-Text Transcription Service 3 | 4 | Uses Faster Whisper to transcribe speech audio. 5 | """ 6 | 7 | import numpy as np 8 | import logging 9 | import io # For BytesIO 10 | from typing import Dict, Any, List, Optional, Tuple 11 | from faster_whisper import WhisperModel 12 | import time 13 | import torch # For CUDA availability check 14 | 15 | # Configure logging 16 | logging.basicConfig(level=logging.INFO) 17 | logger = logging.getLogger(__name__) 18 | 19 | class WhisperTranscriber: 20 | """ 21 | Speech-to-Text service using Faster Whisper. 22 | 23 | This class handles transcription of speech audio segments. 24 | """ 25 | 26 | def __init__( 27 | self, 28 | model_size: str = "base", 29 | device: str = None, 30 | compute_type: str = None, 31 | beam_size: int = 2, 32 | sample_rate: int = 44100 33 | ): 34 | """ 35 | Initialize the transcription service. 
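        Illustrative usage (assumes mono float32 audio in a NumPy array; the one-second silent clip is only a stand-in):

            import numpy as np

            transcriber = WhisperTranscriber(model_size="base", sample_rate=16000)
            audio = np.zeros(16000, dtype=np.float32)   # 1 s of silence at 16 kHz
            text, meta = transcriber.transcribe(audio)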
36 | 37 | Args: 38 | model_size: Whisper model size (tiny.en, base.en, small.en, medium.en, large) 39 | device: Device to run model on ('cpu' or 'cuda'), if None will auto-detect 40 | compute_type: Model computation type (int8, int16, float16, float32), if None will select based on device 41 | beam_size: Beam size for decoding 42 | sample_rate: Audio sample rate in Hz 43 | """ 44 | self.model_size = model_size 45 | 46 | # Auto-detect device if not specified 47 | if device is None: 48 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 49 | else: 50 | self.device = device 51 | 52 | # Select appropriate compute type based on device if not specified 53 | if compute_type is None: 54 | self.compute_type = "float16" if self.device == "cuda" else "int8" 55 | else: 56 | self.compute_type = compute_type 57 | 58 | self.beam_size = beam_size 59 | self.sample_rate = sample_rate 60 | 61 | # Initialize model 62 | self._initialize_model() 63 | 64 | # State tracking 65 | self.is_processing = False 66 | 67 | logger.info(f"Initialized Whisper Transcriber with model={model_size}, " 68 | f"device={self.device}, compute_type={self.compute_type}") 69 | 70 | def _initialize_model(self): 71 | """Initialize Whisper model.""" 72 | try: 73 | # Load the model 74 | self.model = WhisperModel( 75 | self.model_size, # Pass as positional argument, not keyword 76 | device=self.device, 77 | compute_type=self.compute_type 78 | ) 79 | logger.info(f"Successfully loaded Whisper model: {self.model_size}") 80 | except Exception as e: 81 | logger.error(f"Failed to load Whisper model: {e}") 82 | raise 83 | 84 | def transcribe(self, audio: np.ndarray) -> Tuple[str, Dict[str, Any]]: 85 | """ 86 | Transcribe audio data to text. 87 | 88 | Args: 89 | audio: Audio data as numpy array 90 | 91 | Returns: 92 | Tuple[str, Dict[str, Any]]: 93 | - Transcribed text 94 | - Dictionary with additional information (confidence, language, etc.) 
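        Example of the two input forms this method accepts (illustrative; "clip.wav" is a placeholder path and `transcriber` is an already-constructed instance):

            import numpy as np

            # 1) Raw float samples - peak-normalised before transcription
            samples = np.random.randn(16000).astype(np.float32)
            text, meta = transcriber.transcribe(samples)

            # 2) A complete WAV file passed as raw bytes wrapped in a uint8 array
            with open("clip.wav", "rb") as f:
                wav = np.frombuffer(f.read(), dtype=np.uint8)
            text, meta = transcriber.transcribe(wav)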
95 | """ 96 | start_time = time.time() 97 | self.is_processing = True 98 | 99 | try: 100 | # Handle WAV data (if audio is in uint8 format, it contains WAV headers) 101 | if audio.dtype == np.uint8: 102 | # First check the RIFF header to confirm this is WAV data 103 | header = bytes(audio[:44]) 104 | if header[:4] == b'RIFF' and header[8:12] == b'WAVE': 105 | # Create a file-like object that Whisper can read from 106 | audio_file = io.BytesIO(bytes(audio)) 107 | # The transcribe method expects a file-like object with read method 108 | audio = audio_file 109 | else: 110 | # Not a proper WAV header 111 | logger.warning("Received audio data with incorrect WAV header") 112 | # Attempt to process as raw data 113 | audio = audio.astype(np.float32) / np.max(np.abs(audio)) if np.max(np.abs(audio)) > 0 else audio 114 | else: 115 | # Normalize audio if it's raw float data 116 | audio = audio.astype(np.float32) / np.max(np.abs(audio)) if np.max(np.abs(audio)) > 0 else audio 117 | 118 | # Transcribe 119 | segments, info = self.model.transcribe( 120 | audio, 121 | beam_size=self.beam_size, 122 | language="en", # Force English language 123 | vad_filter=False # Disable VAD filter since we handle it in the frontend 124 | ) 125 | 126 | # Collect all segment texts 127 | text_segments = [segment.text for segment in segments] 128 | full_text = " ".join(text_segments).strip() 129 | 130 | # Calculate processing time 131 | processing_time = time.time() - start_time 132 | logger.info(f"Transcription completed in {processing_time:.2f}s: {full_text[:50]}...") 133 | 134 | metadata = { 135 | "confidence": getattr(info, "avg_logprob", 0), 136 | "language": getattr(info, "language", "en"), 137 | "processing_time": processing_time, 138 | "segments_count": len(text_segments) 139 | } 140 | 141 | return full_text, metadata 142 | 143 | except Exception as e: 144 | logger.error(f"Transcription error: {e}") 145 | return "", {"error": str(e)} 146 | finally: 147 | self.is_processing = False 148 | 149 | def transcribe_streaming(self, audio_generator): 150 | """ 151 | Stream transcription results from an audio generator. 152 | 153 | Args: 154 | audio_generator: Generator yielding audio chunks 155 | 156 | Yields: 157 | Partial transcription results as they become available 158 | """ 159 | self.is_processing = True 160 | 161 | try: 162 | # Process the streaming transcription 163 | segments = self.model.transcribe_with_vad( 164 | audio_generator, 165 | language="en" 166 | ) 167 | 168 | # Yield each segment as it's transcribed 169 | for segment in segments: 170 | yield { 171 | "text": segment.text, 172 | "start": segment.start, 173 | "end": segment.end, 174 | "confidence": segment.avg_logprob 175 | } 176 | 177 | except Exception as e: 178 | logger.error(f"Streaming transcription error: {e}") 179 | yield {"error": str(e)} 180 | finally: 181 | self.is_processing = False 182 | 183 | def get_config(self) -> Dict[str, Any]: 184 | """ 185 | Get the current configuration. 
186 | 187 | Returns: 188 | Dict containing the current configuration 189 | """ 190 | return { 191 | "model_size": self.model_size, 192 | "device": self.device, 193 | "compute_type": self.compute_type, 194 | "beam_size": self.beam_size, 195 | "sample_rate": self.sample_rate, 196 | "is_processing": self.is_processing 197 | } 198 | -------------------------------------------------------------------------------- /backend/services/tts.py: -------------------------------------------------------------------------------- 1 | """ 2 | Text-to-Speech Service 3 | 4 | Handles communication with the local TTS API endpoint. 5 | """ 6 | 7 | import json 8 | import requests 9 | import logging 10 | import io 11 | import time 12 | import base64 13 | import asyncio 14 | from typing import Dict, Any, List, Optional, BinaryIO, Generator, AsyncGenerator 15 | 16 | # Configure logging 17 | logging.basicConfig(level=logging.INFO) 18 | logger = logging.getLogger(__name__) 19 | 20 | class TTSClient: 21 | """ 22 | Client for communicating with a local TTS API. 23 | 24 | This class handles requests to a locally hosted TTS API that follows 25 | the OpenAI API format for text-to-speech generation. 26 | """ 27 | 28 | def __init__( 29 | self, 30 | api_endpoint: str = "http://localhost:5005/v1/audio/speech", 31 | model: str = "tts-1", 32 | voice: str = "tara", 33 | output_format: str = "wav", 34 | speed: float = 1.0, 35 | timeout: int = 60, 36 | chunk_size: int = 4096 37 | ): 38 | """ 39 | Initialize the TTS client. 40 | 41 | Args: 42 | api_endpoint: URL of the local TTS API 43 | model: TTS model name to use 44 | voice: Voice to use for synthesis 45 | output_format: Output audio format (mp3, opus, aac, flac) 46 | speed: Speech speed multiplier (0.25 to 4.0) 47 | timeout: Request timeout in seconds 48 | chunk_size: Size of audio chunks to stream in bytes 49 | """ 50 | self.api_endpoint = api_endpoint 51 | self.model = model 52 | self.voice = voice 53 | self.output_format = output_format 54 | self.speed = speed 55 | self.timeout = timeout 56 | self.chunk_size = chunk_size 57 | 58 | # State tracking 59 | self.is_processing = False 60 | self.last_processing_time = 0 61 | 62 | logger.info(f"Initialized TTS Client with endpoint={api_endpoint}, " 63 | f"model={model}, voice={voice}") 64 | 65 | def text_to_speech(self, text: str) -> bytes: 66 | """ 67 | Convert text to speech audio. 
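        Illustrative usage (endpoint and voice mirror the defaults above; "reply.wav" is a placeholder output path):

            tts = TTSClient(
                api_endpoint="http://localhost:5005/v1/audio/speech",
                voice="tara",
                output_format="wav",
            )
            audio_bytes = tts.text_to_speech("Hello! How can I help today?")
            with open("reply.wav", "wb") as f:
                f.write(audio_bytes)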
68 | 69 | Args: 70 | text: Text to convert to speech 71 | 72 | Returns: 73 | Audio data as bytes 74 | """ 75 | self.is_processing = True 76 | start_time = time.time() 77 | 78 | try: 79 | # Prepare request payload 80 | payload = { 81 | "model": self.model, 82 | "input": text, 83 | "voice": self.voice, 84 | "response_format": self.output_format, 85 | "speed": self.speed 86 | } 87 | 88 | logger.info(f"Sending TTS request with {len(text)} characters of text") 89 | 90 | # Send request to TTS API 91 | response = requests.post( 92 | self.api_endpoint, 93 | json=payload, 94 | timeout=self.timeout 95 | ) 96 | 97 | # Check if request was successful 98 | response.raise_for_status() 99 | 100 | # Get audio content 101 | audio_data = response.content 102 | 103 | # Calculate processing time 104 | self.last_processing_time = time.time() - start_time 105 | 106 | logger.info(f"Received TTS response after {self.last_processing_time:.2f}s, " 107 | f"size: {len(audio_data)} bytes") 108 | 109 | return audio_data 110 | 111 | except requests.RequestException as e: 112 | logger.error(f"TTS API request error: {e}") 113 | raise 114 | except Exception as e: 115 | logger.error(f"TTS processing error: {e}") 116 | raise 117 | finally: 118 | self.is_processing = False 119 | 120 | def stream_text_to_speech(self, text: str) -> Generator[bytes, None, None]: 121 | """ 122 | Stream audio data from the TTS API. 123 | 124 | Args: 125 | text: Text to convert to speech 126 | 127 | Yields: 128 | Chunks of audio data 129 | """ 130 | self.is_processing = True 131 | start_time = time.time() 132 | 133 | try: 134 | # Prepare request payload 135 | payload = { 136 | "model": self.model, 137 | "input": text, 138 | "voice": self.voice, 139 | "response_format": self.output_format, 140 | "speed": self.speed 141 | } 142 | 143 | logger.info(f"Sending streaming TTS request with {len(text)} characters of text") 144 | 145 | # Send request to TTS API 146 | with requests.post( 147 | self.api_endpoint, 148 | json=payload, 149 | timeout=self.timeout, 150 | stream=True 151 | ) as response: 152 | response.raise_for_status() 153 | 154 | # Check if streaming is supported by the API 155 | is_chunked = response.headers.get('transfer-encoding', '') == 'chunked' 156 | 157 | if is_chunked: 158 | # The API supports streaming 159 | for chunk in response.iter_content(chunk_size=self.chunk_size): 160 | if chunk: 161 | yield chunk 162 | else: 163 | # The API doesn't support streaming, but we'll fake it by 164 | # splitting the response into chunks 165 | audio_data = response.content 166 | total_chunks = (len(audio_data) + self.chunk_size - 1) // self.chunk_size 167 | 168 | for i in range(total_chunks): 169 | start_idx = i * self.chunk_size 170 | end_idx = min(start_idx + self.chunk_size, len(audio_data)) 171 | yield audio_data[start_idx:end_idx] 172 | 173 | # Calculate processing time 174 | self.last_processing_time = time.time() - start_time 175 | logger.info(f"Completed TTS streaming after {self.last_processing_time:.2f}s") 176 | 177 | except requests.RequestException as e: 178 | logger.error(f"TTS API streaming request error: {e}") 179 | raise 180 | except Exception as e: 181 | logger.error(f"TTS streaming error: {e}") 182 | raise 183 | finally: 184 | self.is_processing = False 185 | 186 | async def async_text_to_speech(self, text: str) -> bytes: 187 | """ 188 | Asynchronously generate audio data from the TTS API. 189 | 190 | This method provides asynchronous TTS capability by running 191 | the synchronous method in a thread. 
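        Illustrative sketch of calling this from async code (the surrounding async WebSocket handler is assumed):

            async def speak(tts: TTSClient, text: str) -> bytes:
                # asyncio.to_thread keeps the blocking HTTP request off the event loop
                return await tts.async_text_to_speech(text)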
192 | 193 | Args: 194 | text: Text to convert to speech 195 | 196 | Returns: 197 | Complete audio data as bytes 198 | """ 199 | self.is_processing = True 200 | 201 | try: 202 | # Get complete audio data 203 | audio_data = await asyncio.to_thread(self.text_to_speech, text) 204 | return audio_data 205 | except Exception as e: 206 | logger.error(f"Async TTS error: {e}") 207 | raise 208 | finally: 209 | self.is_processing = False 210 | 211 | def get_config(self) -> Dict[str, Any]: 212 | """ 213 | Get the current configuration. 214 | 215 | Returns: 216 | Dict containing the current configuration 217 | """ 218 | return { 219 | "api_endpoint": self.api_endpoint, 220 | "model": self.model, 221 | "voice": self.voice, 222 | "output_format": self.output_format, 223 | "speed": self.speed, 224 | "timeout": self.timeout, 225 | "chunk_size": self.chunk_size, 226 | "is_processing": self.is_processing, 227 | "last_processing_time": self.last_processing_time 228 | } 229 | -------------------------------------------------------------------------------- /backend/services/vision.py: -------------------------------------------------------------------------------- 1 | """ 2 | Vision service for image processing using SmolVLM 3 | 4 | Handles loading and initializing the vision model for image understanding. 5 | """ 6 | 7 | import logging 8 | from transformers import AutoProcessor, AutoModelForVision2Seq 9 | 10 | # Configure logging 11 | logging.basicConfig(level=logging.INFO) 12 | logger = logging.getLogger(__name__) 13 | 14 | class VisionService: 15 | """ 16 | Service for processing images with vision models. 17 | Currently uses SmolVLM-256M-Instruct for lightweight image understanding. 18 | """ 19 | 20 | def __init__(self): 21 | """Initialize the service with empty model references.""" 22 | self.processor = None 23 | self.model = None 24 | self.initialized = False 25 | self.model_name = "HuggingFaceTB/SmolVLM-256M-Instruct" 26 | self.default_prompt = "Describe this image in detail. Include information about objects, people, scenes, text, and any notable elements." 27 | 28 | def initialize(self): 29 | """ 30 | Initialize the model, downloading it if necessary. 31 | This will be called on server startup. 32 | 33 | Returns: 34 | bool: Whether initialization was successful 35 | """ 36 | if self.initialized: 37 | logger.info("Vision model already initialized") 38 | return True 39 | 40 | try: 41 | import torch 42 | 43 | # Determine device (use CUDA if available) 44 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 45 | logger.info(f"Using device for vision model: {self.device}") 46 | 47 | logger.info(f"Loading vision model {self.model_name} (this may take a while on first run)...") 48 | 49 | # These calls will trigger the download if the model isn't cached locally 50 | self.processor = AutoProcessor.from_pretrained(self.model_name) 51 | self.model = AutoModelForVision2Seq.from_pretrained(self.model_name) 52 | 53 | # Move model to GPU if available 54 | self.model = self.model.to(self.device) 55 | 56 | self.initialized = True 57 | logger.info(f"Vision model loaded successfully on {self.device}") 58 | return True 59 | except Exception as e: 60 | logger.error(f"Error loading vision model: {e}") 61 | return False 62 | 63 | def process_image(self, image_base64: str, prompt: str = None): 64 | """ 65 | Process an image with SmolVLM and return a description. 
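        Illustrative usage (assumes vision_service.initialize() has already run, as the backend does at startup; "photo.jpg" is a placeholder path):

            import base64

            with open("photo.jpg", "rb") as f:
                image_b64 = base64.b64encode(f.read()).decode("utf-8")

            description = vision_service.process_image(
                image_b64,
                prompt="What is shown in this photo?",
            )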
66 | 67 | Args: 68 | image_base64: Base64-encoded image data 69 | prompt: Prompt to guide image description (uses default if None) 70 | 71 | Returns: 72 | str: Image description 73 | """ 74 | if not self.is_ready(): 75 | raise RuntimeError("Vision model not initialized") 76 | 77 | try: 78 | # Decode base64 image 79 | import base64 80 | from io import BytesIO 81 | from PIL import Image 82 | import torch 83 | 84 | # Use default prompt if none provided 85 | if prompt is None: 86 | prompt = self.default_prompt 87 | 88 | # Format the prompt to include the token 89 | formatted_prompt = f"User uploaded this image: \n{prompt}" 90 | 91 | # Convert base64 to image 92 | image_data = base64.b64decode(image_base64) 93 | image = Image.open(BytesIO(image_data)).convert('RGB') 94 | 95 | # Prepare inputs for the model with the correct token format 96 | inputs = self.processor(text=[formatted_prompt], images=[image], return_tensors="pt") 97 | 98 | # Move inputs to the same device as the model 99 | inputs = {k: v.to(self.device) for k, v in inputs.items()} 100 | 101 | # Generate description 102 | with torch.no_grad(): 103 | output_ids = self.model.generate( 104 | **inputs, 105 | max_new_tokens=256, 106 | do_sample=False 107 | ) 108 | 109 | # Decode the output 110 | description = self.processor.batch_decode(output_ids, skip_special_tokens=True)[0] 111 | 112 | return description.strip() 113 | 114 | except Exception as e: 115 | logger.error(f"Error processing image with vision model: {e}") 116 | return f"Error analyzing image: {str(e)}" 117 | 118 | def is_ready(self): 119 | """ 120 | Check if the model is initialized and ready. 121 | 122 | Returns: 123 | bool: Whether the model is ready for use 124 | """ 125 | return self.initialized 126 | 127 | # Create singleton instance 128 | vision_service = VisionService() 129 | -------------------------------------------------------------------------------- /docs/Demonstration_Video.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lex-au/Vocalis/136fea2d32cf55eaf4b2f368cad8ecdad07bb4be/docs/Demonstration_Video.png -------------------------------------------------------------------------------- /docs/Vocalis_Demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lex-au/Vocalis/136fea2d32cf55eaf4b2f368cad8ecdad07bb4be/docs/Vocalis_Demo.png -------------------------------------------------------------------------------- /docs/Vocalis_Header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lex-au/Vocalis/136fea2d32cf55eaf4b2f368cad8ecdad07bb4be/docs/Vocalis_Header.png -------------------------------------------------------------------------------- /docs/Vocalis_Visual_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lex-au/Vocalis/136fea2d32cf55eaf4b2f368cad8ecdad07bb4be/docs/Vocalis_Visual_demo.gif -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Vocalis - Demo Assets 7 | 39 | 40 | 41 |

Vocalis - Demo Assets

42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 57 | 58 | 59 | 60 | 64 | 65 | 66 | 67 | 71 | 72 | 73 |
File Type File
Project Banner (PNG) 54 | Vocalis_Demo.png
55 | Vocalis Banner 56 |
Project Header (PNG) 61 | Vocalis_Header.png
62 | Vocalis Header Image 63 |
Visual Demonstration (GIF) 68 | Vocalis_Visual_demo.gif
69 | Vocalis Visual Demo 70 |
74 | 75 | -------------------------------------------------------------------------------- /frontend/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lex-au/Vocalis/136fea2d32cf55eaf4b2f368cad8ecdad07bb4be/frontend/favicon.ico -------------------------------------------------------------------------------- /frontend/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Vocalis - AI Speech Assistant 8 | 9 | 10 | 11 |
12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "vocalis-frontend", 3 | "private": true, 4 | "version": "0.1.0", 5 | "type": "module", 6 | "scripts": { 7 | "dev": "vite", 8 | "build": "tsc && vite build", 9 | "lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0", 10 | "preview": "vite preview" 11 | }, 12 | "dependencies": { 13 | "lucide-react": "^0.344.0", 14 | "react": "^18.3.1", 15 | "react-dom": "^18.3.1" 16 | }, 17 | "devDependencies": { 18 | "@types/react": "^18.3.5", 19 | "@types/react-dom": "^18.3.0", 20 | "@typescript-eslint/eslint-plugin": "^7.2.0", 21 | "@typescript-eslint/parser": "^7.2.0", 22 | "@vitejs/plugin-react": "^4.3.1", 23 | "autoprefixer": "^10.4.18", 24 | "eslint": "^8.57.0", 25 | "eslint-plugin-react-hooks": "^4.6.0", 26 | "eslint-plugin-react-refresh": "^0.4.5", 27 | "postcss": "^8.4.35", 28 | "tailwindcss": "^3.4.1", 29 | "typescript": "^5.5.3", 30 | "vite": "^5.4.2" 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /frontend/postcss.config.js: -------------------------------------------------------------------------------- 1 | export default { 2 | plugins: { 3 | tailwindcss: {}, 4 | autoprefixer: {}, 5 | }, 6 | } 7 | -------------------------------------------------------------------------------- /frontend/src/App.tsx: -------------------------------------------------------------------------------- 1 | import React, { useState, useEffect } from 'react'; 2 | import ChatInterface from './components/ChatInterface'; 3 | import Sidebar from './components/Sidebar'; 4 | import { Menu } from 'lucide-react'; 5 | import websocketService, { ConnectionState } from './services/websocket'; 6 | 7 | function App() { 8 | const [isSidebarOpen, setIsSidebarOpen] = useState(false); 9 | const [isConnected, setIsConnected] = useState(false); 10 | 11 | // Track WebSocket connection 12 | useEffect(() => { 13 | const handleConnectionChange = () => { 14 | const state = websocketService.getConnectionState(); 15 | setIsConnected(state === ConnectionState.CONNECTED); 16 | }; 17 | 18 | // Set up event listeners 19 | websocketService.addEventListener('open', handleConnectionChange); 20 | websocketService.addEventListener('close', handleConnectionChange); 21 | websocketService.addEventListener('error', handleConnectionChange); 22 | 23 | // Initial check 24 | handleConnectionChange(); 25 | 26 | // Cleanup 27 | return () => { 28 | websocketService.removeEventListener('open', handleConnectionChange); 29 | websocketService.removeEventListener('close', handleConnectionChange); 30 | websocketService.removeEventListener('error', handleConnectionChange); 31 | }; 32 | }, []); 33 | 34 | return ( 35 |
36 | {/* Toggle Button */} 37 | 44 | 45 | {/* Sidebar */} 46 |
51 | setIsSidebarOpen(false)} 53 | isConnected={isConnected} 54 | onReconnect={() => websocketService.connect()} 55 | onClearHistory={() => websocketService.clearHistory()} 56 | /> 57 |
58 | 59 | {/* Main Content */} 60 |
64 | 65 |
66 |
67 | ); 68 | } 69 | 70 | export default App; 71 | -------------------------------------------------------------------------------- /frontend/src/components/AssistantOrb.tsx: -------------------------------------------------------------------------------- 1 | import React, { useRef, useEffect, useState, useMemo, useCallback } from 'react'; 2 | 3 | interface AssistantOrbProps { 4 | state: 'idle' | 'greeting' | 'listening' | 'processing' | 'speaking' | 'vision_file' | 'vision_processing' | 'vision_asr'; 5 | } 6 | 7 | const AssistantOrb: React.FC = ({ state }) => { 8 | const canvasRef = useRef(null); 9 | const starsCanvasRef = useRef(null); 10 | const [dimensions, setDimensions] = useState({ width: 208, height: 208 }); 11 | 12 | // Adjust size based on screen size 13 | useEffect(() => { 14 | const updateSize = () => { 15 | const isMobile = window.innerWidth < 768; 16 | setDimensions({ 17 | width: isMobile ? 156 : 208, 18 | height: isMobile ? 156 : 208 19 | }); 20 | }; 21 | 22 | updateSize(); 23 | window.addEventListener('resize', updateSize); 24 | 25 | return () => { 26 | window.removeEventListener('resize', updateSize); 27 | }; 28 | }, []); 29 | 30 | // Create stars with memoization 31 | const stars = useMemo(() => { 32 | const starCount = 75; 33 | return Array.from({ length: starCount }, () => ({ 34 | x: Math.random() * 400 - 100, // Wider distribution 35 | y: Math.random() * 400 - 100, // Wider distribution 36 | size: Math.random() * 0.8 + 0.2, // Smaller size range 37 | twinkleSpeed: Math.random() * 2 + 1, 38 | moveSpeed: Math.random() * 0.2 + 0.1 39 | })); 40 | }, []); 41 | 42 | // Star animation function 43 | const drawStars = useCallback((ctx: CanvasRenderingContext2D, width: number, height: number, time: number) => { 44 | ctx.clearRect(0, 0, width, height); 45 | 46 | stars.forEach((star: any) => { 47 | // Update position 48 | star.x += star.moveSpeed; 49 | if (star.x > width + 100) star.x = -100; 50 | 51 | // Calculate twinkle 52 | const twinkle = Math.sin(time * star.twinkleSpeed) * 0.5 + 0.5; 53 | let alpha = twinkle * 0.4; // Base alpha 54 | 55 | // Enhance stars based on state 56 | if (state === 'listening') { 57 | alpha *= 1.5; // Brighter during listening 58 | } else if (state === 'greeting') { 59 | alpha *= 1.4; // Almost as bright as listening during greeting 60 | } else if (state === 'speaking') { 61 | alpha *= 1 + (0.5 * Math.sin(time * 5)); // Pulsing during speaking 62 | } 63 | 64 | // Draw star with subtle glow 65 | ctx.beginPath(); 66 | const gradient = ctx.createRadialGradient( 67 | star.x, star.y, 0, 68 | star.x, star.y, star.size * 2 69 | ); 70 | gradient.addColorStop(0, `rgba(255, 255, 255, ${alpha})`); 71 | gradient.addColorStop(1, 'rgba(255, 255, 255, 0)'); 72 | ctx.fillStyle = gradient; 73 | ctx.arc(star.x, star.y, star.size, 0, Math.PI * 2); 74 | ctx.fill(); 75 | }); 76 | }, [stars, state]); 77 | 78 | // Aurora effect animation 79 | const drawAurora = useCallback((ctx: CanvasRenderingContext2D, width: number, height: number, time: number) => { 80 | // Clear canvas completely each frame for clean animation 81 | ctx.clearRect(0, 0, width, height); 82 | 83 | // Create base gradient for the ethereal background 84 | let baseGradient; 85 | 86 | // Change gradient colors based on state 87 | if (state === 'vision_file') { 88 | // Light blue/cyan for vision file 89 | baseGradient = ctx.createLinearGradient(0, 0, width, height); 90 | baseGradient.addColorStop(0, 'rgba(125, 211, 252, 0.15)'); // sky-300 91 | baseGradient.addColorStop(0.5, 'rgba(186, 230, 253, 0.1)'); // 
sky-200 92 | baseGradient.addColorStop(1, 'rgba(224, 242, 254, 0.12)'); // sky-100 93 | } else if (state === 'vision_asr') { 94 | // Bright green for vision ASR (matching listening) 95 | baseGradient = ctx.createLinearGradient(0, 0, width, height); 96 | baseGradient.addColorStop(0, 'rgba(72, 255, 167, 0.15)'); 97 | baseGradient.addColorStop(0.5, 'rgba(135, 206, 235, 0.1)'); 98 | baseGradient.addColorStop(1, 'rgba(186, 85, 255, 0.08)'); 99 | } else if (state === 'vision_processing') { 100 | // Teal for vision processing 101 | baseGradient = ctx.createLinearGradient(0, 0, width, height); 102 | baseGradient.addColorStop(0, 'rgba(183, 245, 235, 0.15)'); 103 | baseGradient.addColorStop(0.5, 'rgba(153, 235, 225, 0.12)'); 104 | baseGradient.addColorStop(1, 'rgba(45, 212, 191, 0.15)'); 105 | } else if (state === 'listening') { 106 | // Bright green for listening 107 | baseGradient = ctx.createLinearGradient(0, 0, width, height); 108 | baseGradient.addColorStop(0, 'rgba(72, 255, 167, 0.15)'); 109 | baseGradient.addColorStop(0.5, 'rgba(135, 206, 235, 0.1)'); 110 | baseGradient.addColorStop(1, 'rgba(186, 85, 255, 0.08)'); 111 | } else if (state === 'greeting') { 112 | // Blue for greeting 113 | baseGradient = ctx.createLinearGradient(0, 0, width, height); 114 | baseGradient.addColorStop(0, 'rgba(59, 130, 246, 0.15)'); // Blue 115 | baseGradient.addColorStop(0.5, 'rgba(96, 165, 250, 0.1)'); // Lighter blue 116 | baseGradient.addColorStop(1, 'rgba(59, 130, 246, 0.08)'); // Blue again 117 | } else if (state === 'processing') { 118 | // Purple/blue for processing 119 | baseGradient = ctx.createLinearGradient(0, 0, width, height); 120 | baseGradient.addColorStop(0, 'rgba(72, 209, 255, 0.1)'); 121 | baseGradient.addColorStop(0.5, 'rgba(135, 150, 235, 0.1)'); 122 | baseGradient.addColorStop(1, 'rgba(186, 85, 255, 0.15)'); 123 | } else if (state === 'speaking') { 124 | // Gold/amber for speaking 125 | baseGradient = ctx.createLinearGradient(0, 0, width, height); 126 | baseGradient.addColorStop(0, 'rgba(255, 223, 72, 0.1)'); 127 | baseGradient.addColorStop(0.5, 'rgba(255, 167, 72, 0.08)'); 128 | baseGradient.addColorStop(1, 'rgba(255, 109, 72, 0.12)'); 129 | } else { 130 | // Default/idle state - subtle green 131 | baseGradient = ctx.createLinearGradient(0, 0, width, height); 132 | baseGradient.addColorStop(0, 'rgba(72, 255, 167, 0.1)'); 133 | baseGradient.addColorStop(0.5, 'rgba(135, 206, 235, 0.1)'); 134 | baseGradient.addColorStop(1, 'rgba(186, 85, 255, 0.1)'); 135 | } 136 | 137 | // Fill with gradient 138 | ctx.fillStyle = 'rgba(72, 255, 167, 0.06)'; 139 | ctx.fillRect(0, 0, width, height); 140 | ctx.fillStyle = baseGradient; 141 | ctx.fillRect(0, 0, width, height); 142 | 143 | // Create flowing aurora effect 144 | const numWaves = 3; 145 | for (let wave = 0; wave < numWaves; wave++) { 146 | const waveOffset = wave * (Math.PI / numWaves); 147 | 148 | ctx.beginPath(); 149 | 150 | // Start from the left edge 151 | ctx.moveTo(-width * 0.1, height / 2); 152 | 153 | // Create smooth wave path 154 | for (let x = -width * 0.1; x <= width * 1.1; x += 1) { 155 | const progress = (x + width * 0.1) / (width * 1.2); 156 | const amplitude = height * 0.15; // Reduced amplitude to prevent overflow 157 | 158 | // Wave speed modifier based on state 159 | let speedMod = 1.0; 160 | if (state === 'listening') speedMod = 1.5; 161 | if (state === 'greeting') speedMod = 1.8; // Slightly faster than listening 162 | if (state === 'processing') speedMod = 2.5; 163 | if (state === 'speaking') speedMod = 2.0; 164 | 165 | // Complex 
wave function for organic movement 166 | const y = height / 2 + 167 | Math.sin(progress * 4 + time * speedMod + waveOffset) * amplitude * 0.5 + 168 | Math.sin(progress * 7 + time * 0.5 * speedMod) * amplitude * 0.3 + 169 | Math.sin(progress * 2 - time * 0.7 * speedMod) * amplitude * 0.2; 170 | 171 | ctx.lineTo(x, y); 172 | } 173 | 174 | // Complete the path 175 | ctx.lineTo(width, height * 1.1); // Extend slightly beyond bottom 176 | ctx.lineTo(0, height * 1.1); // Extend slightly beyond bottom 177 | ctx.closePath(); 178 | 179 | // Create gradient for each wave 180 | const gradient = ctx.createLinearGradient(0, 0, 0, height); 181 | const alpha = 0.18 - wave * 0.04; // Slightly increased contrast between waves 182 | 183 | // Color based on state 184 | let baseHue; 185 | if (state === 'vision_file') { 186 | baseHue = wave === 0 ? 195 : wave === 1 ? 200 : 205; // Light blue/cyan range 187 | } else if (state === 'vision_asr') { 188 | baseHue = wave === 0 ? 145 : wave === 1 ? 160 : 175; // Green range (matching listening) 189 | } else if (state === 'vision_processing') { 190 | baseHue = wave === 0 ? 175 : wave === 1 ? 165 : 180; // Teal range 191 | } else if (state === 'listening') { 192 | baseHue = wave === 0 ? 145 : wave === 1 ? 160 : 175; // Green range 193 | } else if (state === 'greeting') { 194 | baseHue = wave === 0 ? 210 : wave === 1 ? 220 : 200; // Blue range 195 | } else if (state === 'processing') { 196 | baseHue = wave === 0 ? 260 : wave === 1 ? 240 : 220; // Purple/Blue range 197 | } else if (state === 'speaking') { 198 | baseHue = wave === 0 ? 30 : wave === 1 ? 45 : 60; // Gold/Amber range 199 | } else { 200 | baseHue = wave === 0 ? 145 : wave === 1 ? 190 : 290; // Default range 201 | } 202 | 203 | // Ethereal color transitions 204 | const hueShift = Math.sin(time * 0.5 + wave) * 15; 205 | gradient.addColorStop(0, `hsla(${baseHue + hueShift}, 85%, 75%, 0)`); // Start transparent 206 | gradient.addColorStop(0.4, `hsla(${baseHue + hueShift}, 90%, 85%, ${alpha * 1.6})`); // Intense peak 207 | gradient.addColorStop(0.8, `hsla(${baseHue + hueShift}, 85%, 75%, ${alpha})`); // Maintain intensity 208 | gradient.addColorStop(1, `hsla(${baseHue + hueShift}, 85%, 75%, 0)`); // End transparent 209 | 210 | ctx.fillStyle = gradient; 211 | 212 | // Apply gaussian blur for soft edges 213 | ctx.filter = 'blur(15px)'; 214 | ctx.fill(); 215 | ctx.filter = 'none'; 216 | 217 | // Add subtle highlight 218 | ctx.strokeStyle = `hsla(${baseHue + hueShift}, 95%, 85%, ${alpha * 0.7})`; 219 | ctx.lineWidth = 2; 220 | ctx.stroke(); 221 | } 222 | 223 | // Add subtle noise texture 224 | const imageData = ctx.getImageData(0, 0, width, height); 225 | const data = imageData.data; 226 | const noiseIntensity = 3; // Reduced noise intensity for cleaner look 227 | for (let i = 0; i < data.length; i += 4) { 228 | const noise = (Math.random() - 0.5) * noiseIntensity; 229 | data[i] += noise; 230 | data[i + 1] += noise; 231 | data[i + 2] += noise; 232 | } 233 | ctx.putImageData(imageData, 0, 0); 234 | 235 | // Add final glow layer based on state 236 | ctx.beginPath(); 237 | const glow = ctx.createRadialGradient( 238 | width / 2, height / 2, 0, 239 | width / 2, height / 2, width * 0.7 240 | ); 241 | 242 | // Color based on state 243 | if (state === 'vision_file') { 244 | // Light blue/cyan glow for vision file 245 | const pulseIntensity = 0.25 + Math.sin(time * 2) * 0.05; 246 | glow.addColorStop(0, `rgba(125, 211, 252, ${pulseIntensity})`); // sky-300 247 | glow.addColorStop(0.5, `rgba(186, 230, 253, ${pulseIntensity * 
0.4})`); // sky-200 248 | glow.addColorStop(1, 'rgba(224, 242, 254, 0)'); // sky-100 249 | } else if (state === 'vision_processing') { 250 | // Rotating teal glow for vision processing 251 | const rotationX = Math.cos(time * 2) * width * 0.2; 252 | const rotationY = Math.sin(time * 2) * height * 0.2; 253 | ctx.ellipse( 254 | width/2 + rotationX, height/2 + rotationY, 255 | width * 0.3, height * 0.3, 256 | time, 0, Math.PI * 2 257 | ); 258 | ctx.filter = 'blur(30px)'; 259 | ctx.fillStyle = 'rgba(45, 212, 191, 0.1)'; 260 | ctx.fill(); 261 | ctx.filter = 'none'; 262 | 263 | glow.addColorStop(0, 'rgba(45, 212, 191, 0.15)'); 264 | glow.addColorStop(0.5, 'rgba(45, 212, 191, 0.05)'); 265 | glow.addColorStop(1, 'rgba(45, 212, 191, 0)'); 266 | } else if (state === 'vision_asr') { 267 | // Emerald pulsing glow for vision ASR (matching listening state) 268 | const pulseIntensity = 0.25 + Math.sin(time * 2.5) * 0.07; 269 | glow.addColorStop(0, `rgba(72, 255, 167, ${pulseIntensity})`); 270 | glow.addColorStop(0.5, `rgba(72, 255, 167, ${pulseIntensity * 0.4})`); 271 | glow.addColorStop(1, 'rgba(72, 255, 167, 0)'); 272 | } else if (state === 'listening') { 273 | // Pulsing green glow for listening 274 | const pulseIntensity = 0.2 + Math.sin(time * 3) * 0.1; 275 | glow.addColorStop(0, `rgba(72, 255, 167, ${pulseIntensity})`); 276 | glow.addColorStop(0.5, `rgba(72, 255, 167, ${pulseIntensity * 0.4})`); 277 | glow.addColorStop(1, 'rgba(72, 255, 167, 0)'); 278 | } else if (state === 'greeting') { 279 | // Pulsing blue glow for greeting 280 | const pulseIntensity = 0.2 + Math.sin(time * 2.5) * 0.1; 281 | glow.addColorStop(0, `rgba(59, 130, 246, ${pulseIntensity})`); 282 | glow.addColorStop(0.5, `rgba(59, 130, 246, ${pulseIntensity * 0.4})`); 283 | glow.addColorStop(1, 'rgba(59, 130, 246, 0)'); 284 | } else if (state === 'processing') { 285 | // Rotating purple glow for processing 286 | const rotationX = Math.cos(time * 2) * width * 0.2; 287 | const rotationY = Math.sin(time * 2) * height * 0.2; 288 | ctx.ellipse( 289 | width/2 + rotationX, height/2 + rotationY, 290 | width * 0.3, height * 0.3, 291 | time, 0, Math.PI * 2 292 | ); 293 | ctx.filter = 'blur(30px)'; 294 | ctx.fillStyle = 'rgba(186, 85, 255, 0.1)'; 295 | ctx.fill(); 296 | ctx.filter = 'none'; 297 | 298 | glow.addColorStop(0, 'rgba(186, 85, 255, 0.15)'); 299 | glow.addColorStop(0.5, 'rgba(186, 85, 255, 0.05)'); 300 | glow.addColorStop(1, 'rgba(186, 85, 255, 0)'); 301 | } else if (state === 'speaking') { 302 | // Rippling amber glow for speaking 303 | const ripple = Math.sin(time * 5) * 0.1; 304 | glow.addColorStop(0, `rgba(255, 167, 72, 0.2)`); 305 | glow.addColorStop(0.4 + ripple, `rgba(255, 167, 72, 0.1)`); 306 | glow.addColorStop(0.7 + ripple, `rgba(255, 167, 72, 0.05)`); 307 | glow.addColorStop(1, 'rgba(255, 167, 72, 0)'); 308 | } else { 309 | // Subtle glow for idle 310 | glow.addColorStop(0, 'rgba(72, 255, 167, 0.2)'); 311 | glow.addColorStop(0.5, 'rgba(72, 255, 167, 0.08)'); 312 | glow.addColorStop(1, 'rgba(72, 255, 167, 0)'); 313 | } 314 | 315 | ctx.fillStyle = glow; 316 | ctx.fillRect(0, 0, width, height); 317 | }, [state]); 318 | 319 | // Set up canvas for aurora effect 320 | useEffect(() => { 321 | const canvas = canvasRef.current; 322 | if (!canvas) return; 323 | 324 | const ctx = canvas.getContext('2d'); 325 | if (!ctx) return; 326 | 327 | let animationFrame: number; 328 | let startTime = Date.now(); 329 | 330 | const animate = () => { 331 | const time = (Date.now() - startTime) * 0.001; 332 | drawAurora(ctx, canvas.width, 
canvas.height, time); 333 | animationFrame = requestAnimationFrame(animate); 334 | }; 335 | 336 | animate(); 337 | 338 | return () => { 339 | cancelAnimationFrame(animationFrame); 340 | }; 341 | }, [drawAurora]); 342 | 343 | // Set up canvas for stars 344 | useEffect(() => { 345 | const canvas = starsCanvasRef.current; 346 | if (!canvas) return; 347 | 348 | const ctx = canvas.getContext('2d'); 349 | if (!ctx) return; 350 | 351 | let animationFrame: number; 352 | let startTime = Date.now(); 353 | 354 | const animate = () => { 355 | const time = (Date.now() - startTime) * 0.001; 356 | drawStars(ctx, canvas.width, canvas.height, time); 357 | animationFrame = requestAnimationFrame(animate); 358 | }; 359 | 360 | animate(); 361 | 362 | return () => { 363 | cancelAnimationFrame(animationFrame); 364 | }; 365 | }, [drawStars]); 366 | 367 | return ( 368 |
384 | {/* Soft gaussian ambient glow */} 385 |
386 |
397 |
398 | 399 |
400 | 406 |
407 | 408 | {/* Aurora effect */} 409 |
422 | 428 |
429 |
430 | ); 431 | }; 432 | 433 | export default AssistantOrb; 434 | -------------------------------------------------------------------------------- /frontend/src/components/BackgroundStars.tsx: -------------------------------------------------------------------------------- 1 | import React, { useRef, useEffect, useMemo, useCallback } from 'react'; 2 | 3 | const BackgroundStars: React.FC = () => { 4 | const canvasRef = useRef(null); 5 | 6 | // Generate stars with memoization to prevent regeneration on renders 7 | const stars = useMemo(() => { 8 | const starCount = 200; 9 | return Array.from({ length: starCount }, () => ({ 10 | x: Math.random() * window.innerWidth, 11 | y: Math.random() * window.innerHeight, 12 | size: Math.random() * 1.5 + 0.5, 13 | twinkleSpeed: Math.random() * 2 + 1, 14 | moveSpeed: Math.random() * 0.05 + 0.02, 15 | angle: Math.random() * Math.PI * 2 16 | })); 17 | }, []); 18 | 19 | const drawStars = useCallback((ctx: CanvasRenderingContext2D, width: number, height: number, time: number) => { 20 | ctx.clearRect(0, 0, width, height); 21 | 22 | stars.forEach((star) => { 23 | // Circular motion 24 | const radius = 1; 25 | star.angle += star.moveSpeed * 0.01; 26 | star.x += Math.cos(star.angle) * radius * 0.1; 27 | star.y += Math.sin(star.angle) * radius * 0.1; 28 | 29 | // Wrap around screen 30 | if (star.x < 0) star.x = width; 31 | if (star.x > width) star.x = 0; 32 | if (star.y < 0) star.y = height; 33 | if (star.y > height) star.y = 0; 34 | 35 | // Calculate twinkle 36 | const twinkle = Math.sin(time * star.twinkleSpeed + star.x * 0.01) * 0.5 + 0.5; 37 | const alpha = twinkle * 0.3; 38 | 39 | // Draw star with subtle glow 40 | ctx.beginPath(); 41 | const gradient = ctx.createRadialGradient( 42 | star.x, star.y, 0, 43 | star.x, star.y, star.size * 3 44 | ); 45 | gradient.addColorStop(0, `rgba(255, 255, 255, ${alpha})`); 46 | gradient.addColorStop(0.5, `rgba(255, 255, 255, ${alpha * 0.3})`); 47 | gradient.addColorStop(1, 'rgba(255, 255, 255, 0)'); 48 | ctx.fillStyle = gradient; 49 | ctx.arc(star.x, star.y, star.size, 0, Math.PI * 2); 50 | ctx.fill(); 51 | }); 52 | }, [stars]); 53 | 54 | useEffect(() => { 55 | const canvas = canvasRef.current; 56 | if (!canvas) return; 57 | 58 | const handleResize = () => { 59 | canvas.width = window.innerWidth; 60 | canvas.height = window.innerHeight; 61 | }; 62 | 63 | handleResize(); 64 | window.addEventListener('resize', handleResize); 65 | 66 | const ctx = canvas.getContext('2d'); 67 | if (!ctx) return; 68 | 69 | let animationFrame: number; 70 | let startTime = Date.now(); 71 | 72 | const animate = () => { 73 | const time = (Date.now() - startTime) * 0.001; 74 | drawStars(ctx, canvas.width, canvas.height, time); 75 | animationFrame = requestAnimationFrame(animate); 76 | }; 77 | 78 | animate(); 79 | 80 | return () => { 81 | cancelAnimationFrame(animationFrame); 82 | window.removeEventListener('resize', handleResize); 83 | }; 84 | }, [drawStars]); 85 | 86 | return ( 87 | 91 | ); 92 | }; 93 | 94 | export default BackgroundStars; 95 | -------------------------------------------------------------------------------- /frontend/src/components/PreferencesModal.tsx: -------------------------------------------------------------------------------- 1 | import React, { useState, useEffect } from 'react'; 2 | import { User, Sparkles, Eye } from 'lucide-react'; 3 | import websocketService, { MessageType } from '../services/websocket'; 4 | 5 | interface PreferencesModalProps { 6 | isOpen: boolean; 7 | onClose: () => void; 8 | } 9 | 10 | const 
--------------------------------------------------------------------------------
/frontend/src/components/PreferencesModal.tsx:
--------------------------------------------------------------------------------
1 | import React, { useState, useEffect } from 'react';
2 | import { User, Sparkles, Eye } from 'lucide-react';
3 | import websocketService, { MessageType } from '../services/websocket';
4 | 
5 | interface PreferencesModalProps {
6 |   isOpen: boolean;
7 |   onClose: () => void;
8 | }
9 | 
10 | const PreferencesModal: React.FC<PreferencesModalProps> = ({ isOpen, onClose }) => {
11 |   const [systemPrompt, setSystemPrompt] = useState('');
12 |   const [userName, setUserName] = useState('');
13 |   const [isSaving, setIsSaving] = useState(false);
14 |   const [saveError, setSaveError] = useState<string | null>(null);
15 |   const [activeTab, setActiveTab] = useState<'profile' | 'system'>('profile');
16 |   const [isVisionEnabled, setIsVisionEnabled] = useState(false);
17 | 
18 |   useEffect(() => {
19 |     if (isOpen) {
20 |       // Reset state when modal opens
21 |       setSaveError(null);
22 | 
23 |       // Fetch current system prompt, user profile, and vision settings
24 |       const handleSystemPrompt = (data: any) => {
25 |         if (data && data.prompt) {
26 |           setSystemPrompt(data.prompt);
27 |         }
28 |       };
29 | 
30 |       const handleUserProfile = (data: any) => {
31 |         if (data && data.name !== undefined) {
32 |           setUserName(data.name);
33 |         }
34 |       };
35 | 
36 |       const handleVisionSettings = (data: any) => {
37 |         if (data && data.enabled !== undefined) {
38 |           setIsVisionEnabled(data.enabled);
39 |         }
40 |       };
41 | 
42 |       // Listen for responses
43 |       websocketService.addEventListener(MessageType.SYSTEM_PROMPT, handleSystemPrompt);
44 |       websocketService.addEventListener(MessageType.USER_PROFILE, handleUserProfile);
45 |       websocketService.addEventListener(MessageType.VISION_SETTINGS as any, handleVisionSettings);
46 | 
47 |       // Request data
48 |       websocketService.getSystemPrompt();
49 |       websocketService.getUserProfile();
50 |       websocketService.getVisionSettings();
51 | 
52 |       console.log('Requested preferences data');
53 | 
54 |       return () => {
55 |         websocketService.removeEventListener(MessageType.SYSTEM_PROMPT, handleSystemPrompt);
56 |         websocketService.removeEventListener(MessageType.USER_PROFILE, handleUserProfile);
57 |         websocketService.removeEventListener(MessageType.VISION_SETTINGS as any, handleVisionSettings);
58 |       };
59 |     }
60 |   }, [isOpen]);
61 | 
62 |   // Listen for update confirmations
63 |   useEffect(() => {
64 |     let updateCount = 0;
65 |     const expectedUpdateCount = 3; // Always expect 3 updates: system prompt, user profile, and vision
66 |     let success = true;
67 | 
68 |     const handlePromptUpdated = (data: any) => {
69 |       updateCount++;
70 |       if (!(data && data.success)) {
71 |         success = false;
72 |         setSaveError('Failed to update system prompt. Please try again.');
73 |       }
74 | 
75 |       if (updateCount >= expectedUpdateCount) {
76 |         setIsSaving(false);
77 |         if (success) {
78 |           // Close modal only if all updates succeeded
79 |           onClose();
80 |         }
81 |       }
82 |     };
83 | 
84 |     const handleProfileUpdated = (data: any) => {
85 |       updateCount++;
86 |       if (!(data && data.success)) {
87 |         success = false;
88 |         setSaveError('Failed to update user profile. Please try again.');
89 |       }
90 | 
91 |       if (updateCount >= expectedUpdateCount) {
92 |         setIsSaving(false);
93 |         if (success) {
94 |           // Close modal only if all updates succeeded
95 |           onClose();
96 |         }
97 |       }
98 |     };
99 | 
100 |     const handleVisionSettingsUpdated = (data: any) => {
101 |       updateCount++;
102 |       if (!(data && data.success)) {
103 |         success = false;
104 |         setSaveError('Failed to update vision settings. Please try again.');
105 |       }
106 | 
107 |       if (updateCount >= expectedUpdateCount) {
108 |         setIsSaving(false);
109 |         if (success) {
110 |           // Close modal only if all updates succeeded
111 |           onClose();
112 |         }
113 |       }
114 |     };
115 | 
116 |     websocketService.addEventListener(MessageType.SYSTEM_PROMPT_UPDATED, handlePromptUpdated);
117 |     websocketService.addEventListener(MessageType.USER_PROFILE_UPDATED, handleProfileUpdated);
118 |     websocketService.addEventListener(MessageType.VISION_SETTINGS_UPDATED as any, handleVisionSettingsUpdated);
119 | 
120 |     return () => {
121 |       websocketService.removeEventListener(MessageType.SYSTEM_PROMPT_UPDATED, handlePromptUpdated);
122 |       websocketService.removeEventListener(MessageType.USER_PROFILE_UPDATED, handleProfileUpdated);
123 |       websocketService.removeEventListener(MessageType.VISION_SETTINGS_UPDATED as any, handleVisionSettingsUpdated);
124 |     };
125 |   }, [onClose, activeTab]);
126 | 
127 |   const handleSave = () => {
128 |     // Check if system prompt is empty when in system tab
129 |     if (activeTab === 'system' && !systemPrompt.trim()) {
130 |       setSaveError('System prompt cannot be empty');
131 |       return;
132 |     }
133 | 
134 |     setIsSaving(true);
135 |     setSaveError(null);
136 | 
137 |     // Always update all settings
138 |     websocketService.updateSystemPrompt(systemPrompt);
139 |     websocketService.updateUserProfile(userName);
140 |     websocketService.updateVisionSettings(isVisionEnabled);
141 |   };
142 | 
143 |   // backticks, in my code, in the year of our lord, 2025? no.
144 |   const handleRestore = () => {
145 |     setSystemPrompt(
146 |       "You are a helpful, friendly, and concise voice assistant. " +
147 |       "Respond to user queries in a natural, conversational manner. " +
148 |       "Keep responses brief and to the point, as you're communicating via voice. " +
149 |       "When providing information, focus on the most relevant details. " +
150 |       "If you don't know something, admit it rather than making up an answer." +
151 |       "\n\n" +
152 |       "Through the webapp, you can receive and understand photographs and pictures." +
153 |       "\n\n" +
154 |       "When the user sends a message like '[silent]', '[no response]', or '[still waiting]', it means they've gone quiet or haven't responded. " +
155 |       "When you see these signals, continue the conversation naturally based on the previous topic and context. " +
156 |       "Stay on topic, be helpful, and don't mention that they were silent - just carry on the conversation as if you're gently following up."
157 |     );
158 |   };
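  /*
   * Note on the save flow above: handleSave() fires three WebSocket updates (system prompt,
   * user profile, vision settings) and the confirmation effect counts the three *_UPDATED
   * replies before closing the modal. Purely as an illustration, the same round trip could be
   * wrapped in a promise (updateAndWait below is hypothetical and not part of websocketService):
   *
   *   const updateAndWait = (send: () => void, doneEvent: MessageType) =>
   *     new Promise<boolean>((resolve) => {
   *       const onDone = (data: any) => {
   *         websocketService.removeEventListener(doneEvent, onDone);
   *         resolve(Boolean(data && data.success));
   *       };
   *       websocketService.addEventListener(doneEvent, onDone);
   *       send();
   *     });
   *
   *   // e.g.: await updateAndWait(() => websocketService.updateSystemPrompt(systemPrompt),
   *   //                           MessageType.SYSTEM_PROMPT_UPDATED);
   */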
159 | 
160 |   // Tab rendering helpers
161 |   const renderVisionTab = () => (
162 |     <div className="space-y-4">
163 |       {/* NOTE: the original tags and class names in these tab renderers were lost in
164 |           extraction; the markup below is an approximate reconstruction. */}
165 |       <div>
166 |         <div className="flex items-center gap-3">
167 |           <div
168 |             className={isVisionEnabled ? 'w-12 h-6 rounded-full bg-emerald-500 cursor-pointer' : 'w-12 h-6 rounded-full bg-slate-700 cursor-pointer'}
169 |             onClick={() =>
170 |               setIsVisionEnabled(!isVisionEnabled)}
171 |           >
172 |             {/* Toggle knob */}
173 |             <div className={isVisionEnabled
174 |               ? 'h-5 w-5 mt-0.5 translate-x-6 rounded-full bg-white transition-transform'
175 |               : 'h-5 w-5 mt-0.5 translate-x-1 rounded-full bg-white transition-transform'}
176 |             />
177 |           </div>
178 |           <span className="text-sm text-slate-300">
179 |             {isVisionEnabled ? 'Enabled' : 'Disabled'}
180 |           </span>
181 |         </div>
182 |         <p className="text-sm text-slate-400 mt-3">
183 |           When enabled, Vocalis will use computer vision to analyze images and provide visual context to your conversations.
184 |         </p>
185 |       </div>
186 |       <div className="bg-slate-800/50 border border-slate-700 rounded-lg p-4 text-sm text-slate-400">
187 |         Coming Soon: Vision capabilities will allow Vocalis to see and describe images,
188 |         analyze documents, interpret charts, and provide visual assistance during your conversations.
189 |       </div>
190 | 
191 | 
192 |     </div>
193 |   );
194 | 
195 |   const renderProfileTab = () => (
196 |     <div className="space-y-4">
197 |       <div>
198 |         <label className="block text-sm text-slate-400 mb-2">Your Name</label>
199 |         <div className="relative">
200 |           <User className="absolute left-3 top-3.5 text-slate-500" size={16} />
201 |           <input
202 |             type="text" value={userName} onChange={(e) => setUserName(e.target.value)}
203 |             className="w-full bg-slate-800/50 border border-slate-700 rounded-lg pl-10 p-3 text-slate-300 text-sm focus:outline-none focus:ring-1 focus:ring-emerald-500"
204 |             placeholder="Enter your name (optional)"
205 |           />
206 |         </div>
207 | 
208 |         <p className="text-xs text-slate-500 mt-2">
209 |           Your name will be used to personalize greetings and make the conversation feel more natural.
210 |         </p>
211 |       </div>
212 |     </div>
213 |   );
214 | 
215 |   const renderSystemTab = () => (
216 |     <div className="space-y-4">
217 |       {/* System prompt editor (markup reconstructed, as above) */}
218 |       <label className="block text-sm text-slate-400 mb-2">System Prompt</label>
219 |       <textarea
220 |         value={systemPrompt}
221 |         onChange={(e) => setSystemPrompt(e.target.value)}
222 |         rows={10}
223 |         className="w-full bg-slate-800/50 border border-slate-700 rounded-lg p-3 text-slate-300 text-sm focus:outline-none focus:ring-1 focus:ring-emerald-500"
224 |         placeholder="Enter the system prompt"
225 |       />
226 | 