├── .env.example ├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── app.py ├── images └── screen.png ├── index.html ├── requirements.txt ├── run.sh ├── settings.py └── speech.py /.env.example: -------------------------------------------------------------------------------- 1 | LLM_BASE_URL="http://192.168.1.111:8880/v1" 2 | LLM_MODEL="llama-3.2-3b-instruct-q4_k_m" 3 | LLM_API_KEY="sk-XXXX" 4 | 5 | STT_BASE_URL="http://192.168.1.111:8880/v1" 6 | STT_API_KEY="sk-XXXX" 7 | STT_MODEL="whisper-base" 8 | STT_RESPONSE_FORMAT="verbose_json" 9 | LANGUAGE="en" 10 | 11 | TTS_BASE_URL="http://192.168.1.111:8884/v1" 12 | TTS_API_KEY="dummy" 13 | TTS_MODEL="kokoro" 14 | TTS_VOICE="af_heart" 15 | TTS_BACKEND="kokoro" 16 | TTS_AUDIO_FORMAT="pcm" 17 | 18 | MODE="UI" -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | *.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.pth filter=lfs diff=lfs merge=lfs -text 24 | *.rar filter=lfs diff=lfs merge=lfs -text 25 | *.safetensors filter=lfs diff=lfs merge=lfs -text 26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 27 | *.tar.* filter=lfs diff=lfs merge=lfs -text 28 | *.tar filter=lfs diff=lfs merge=lfs -text 29 | *.tflite filter=lfs diff=lfs merge=lfs -text 30 | *.tgz filter=lfs diff=lfs merge=lfs -text 31 | *.wasm filter=lfs diff=lfs merge=lfs -text 32 | *.xz filter=lfs diff=lfs merge=lfs -text 33 | *.zip filter=lfs diff=lfs merge=lfs -text 34 | *.zst filter=lfs diff=lfs merge=lfs -text 35 | *tfevents* filter=lfs diff=lfs merge=lfs -text 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .env.local 133 | .env.cloud 134 | .venv 135 | env/ 136 | venv/ 137 | ENV/ 138 | env.bak/ 139 | venv.bak/ 140 | 141 | # Spyder project settings 142 | .spyderproject 143 | .spyproject 144 | 145 | # Rope project settings 146 | .ropeproject 147 | 148 | # mkdocs documentation 149 | /site 150 | 151 | # mypy 152 | .mypy_cache/ 153 | .dmypy.json 154 | dmypy.json 155 | 156 | # Pyre type checker 157 | .pyre/ 158 | 159 | # pytype static type analyzer 160 | .pytype/ 161 | 162 | # Cython debug symbols 163 | cython_debug/ 164 | 165 | # PyCharm 166 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 167 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 168 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 169 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 170 | #.idea/ 171 | 172 | # PyPI configuration file 173 | .pypirc 174 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Lim Chee Kin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Talk To AI with FastRTC 2 | 3 | A real-time voice conversation application powered by [FastRTC](https://fastrtc.org/) that enables interactive audio communication with both Local and Cloud AI models. Inspired by [Talk To Claude](https://huggingface.co/spaces/fastrtc/talk-to-claude), this project transforms text-based AI interactions into natural voice conversations. 4 | 5 | ## Overview 6 | 7 | This application creates a seamless voice interface to interact with AI models. It provides: 8 | 9 | - Real-time speech-to-text conversion using various STT models 10 | - Enables text generation using local or cloud-based language models through an OpenAI-compatible API 11 | - High-quality text-to-speech synthesis 12 | - Interactive web interface with audio visualization 13 | - Flexibility to use either Local or Cloud APIs with simple configuration changes 14 | 15 | ![Talk to AI Screen](images/screen.png) 16 | 17 | ## System Architecture 18 | 19 | The application follows a modular architecture with the following components: 20 | 21 | ``` 22 | ┌────────────────┐ ┌────────────────┐ ┌────────────────┐ 23 | │ Web Browser │<────>│ FastRTC API │<────>│ OpenAI API │ 24 | │ (WebRTC+UI) │ │ (Python App) │ │(Local or Cloud)│ 25 | └────────────────┘ └────────────────┘ └────────────────┘ 26 | ▲ 27 | │───────────────────────│───────────────────────│ 28 | ▼ ▼ ▼ 29 | ┌────────────────┐ ┌────────────────┐ ┌────────────────┐ 30 | │ STT Server │ │ LLM Server │ │ TTS Server │ 31 | │(Local or Cloud)│ │(Local or Cloud)│ │(Local or Cloud)│ 32 | └────────────────┘ └────────────────┘ └────────────────┘ 33 | ``` 34 | 35 | ## API Compatibility 36 | 37 | The application has been tested with the following API combinations: 38 | 39 | ### 1. 
Local APIs 40 | - **STT**: [LocalAI with whisper.cpp backend](https://localai.io/features/audio-to-text/), [FastWhisperAPI](https://github.com/3choff/FastWhisperAPI) 41 | - **LLM**: [LocalAI with llama.cpp backend](https://localai.io/features/text-generation/), [MLC LLM](https://llm.mlc.ai/) 42 | - **TTS**: [LocalAI with Piper backend](https://localai.io/features/text-to-audio/), [FastKoko](https://github.com/remsky/Kokoro-FastAPI/) 43 | 44 | ### 2. Cloud APIs 45 | - **STT**: [Groq](https://console.groq.com/docs/speech-to-text) 46 | - **LLM**: [Groq](https://console.groq.com/docs/text-chat) 47 | - **TTS**: [Microsoft Edge TTS with openai-edge-tts](https://github.com/travisvn/openai-edge-tts) 48 | 49 | ## Features 50 | 51 | - **API Flexibility**: Switch between local and cloud APIs with simple .env file changes 52 | - **Real-time Voice Interaction**: Speak naturally and receive AI responses as audio 53 | - **WebRTC Integration**: Low-latency audio streaming with network traversal capability 54 | - **Progressive TTS Playback**: Audio responses begin playing as soon as sentences are completed 55 | - **Responsive Audio Visualization**: Visual feedback of audio input/output 56 | - **Configurable AI Models**: Easily switch between different AI models 57 | - **Customizable Voice Settings**: Configure voice, language and audio format 58 | - **Multiple Deployment Options**: UI, API, or phone integration 59 | 60 | ## Prerequisites 61 | 62 | - Python 3.8+ 63 | - Local AI instance or Cloud API credentials 64 | - FastRTC-compatible environment 65 | - Modern web browser with WebRTC support 66 | 67 | ## Installation 68 | 69 | 1. Clone the repository: 70 | ```bash 71 | git clone https://github.com/limcheekin/talk-to-ai.git 72 | cd talk-to-ai 73 | ``` 74 | 75 | 2. Create and activate a virtual environment: 76 | ```bash 77 | python -m venv .venv 78 | source .venv/bin/activate # On Windows: .venv\Scripts\activate 79 | ``` 80 | 81 | 3. Install dependencies: 82 | ```bash 83 | pip install -r requirements.txt 84 | ``` 85 | 86 | 4. Configure environment variables: 87 | ```bash 88 | cp .env.example .env 89 | # Edit .env with your settings 90 | ``` 91 | 92 | ## Configuration 93 | 94 | Edit the `.env` file to configure the following settings. 
You can easily switch between local and cloud providers by updating these settings: 95 | 96 | ### LLM Configuration 97 | - `LLM_BASE_URL`: URL of your AI instance (e.g., "http://192.168.1.111:8880/v1" for LocalAI or "https://api.groq.com/openai/v1" for Groq) 98 | - `LLM_MODEL`: Name of the language model to use (e.g., "llama-3.2-3b-instruct-q4_k_m" for LocalAI or "llama3-8b-8192" for Groq) 99 | - `LLM_API_KEY`: API key for your AI service (required for cloud APIs, `dummy_api_key` for local) 100 | 101 | ### Speech-to-Text Configuration 102 | - `STT_BASE_URL`: URL of your STT service (can be LocalAI or cloud service like Groq) 103 | - `STT_API_KEY`: API key for STT service (required for cloud APIs) 104 | - `STT_MODEL`: Model to use for speech recognition (e.g., "whisper-base" for LocalAI or "whisper-large-v3" for Groq) 105 | - `STT_RESPONSE_FORMAT`: Format for STT responses (e.g., "verbose_json") 106 | - `LANGUAGE`: Language code for speech recognition (e.g., "en") 107 | 108 | ### Text-to-Speech Configuration 109 | - `TTS_BASE_URL`: URL of your TTS service (LocalAI, FastKoko, or cloud services like Edge TTS) 110 | - `TTS_API_KEY`: API key for TTS service (required for cloud APIs) 111 | - `TTS_MODEL`: TTS model to use (e.g., "kokoro" or "tts-1-hd" or "tts-1") 112 | - `TTS_VOICE`: Voice ID to use (e.g., "af_heart" for Kokoro or "en-US-AriaNeural" for Edge TTS) 113 | - `TTS_BACKEND`: TTS backend identifier (e.g., "kokoro" or "edge-tts") 114 | - `TTS_AUDIO_FORMAT`: Output audio format (e.g., "pcm") 115 | 116 | ### Application Mode 117 | - `MODE`: Deployment mode ("UI", "PHONE", or "API") 118 | 119 | ## Running the Application 120 | 121 | Start the application using the provided shell script: 122 | 123 | ```bash 124 | chmod +x run.sh 125 | ./run.sh 126 | ``` 127 | 128 | Or run it directly with Python: 129 | 130 | ```bash 131 | python app.py 132 | ``` 133 | 134 | The application will be available at: 135 | - UI mode: `http://localhost:7860` 136 | - API mode: `http://localhost:7860/` 137 | 138 | ## Usage 139 | 140 | 1. Open the web interface in your browser 141 | 2. Click the microphone icon or "Click to Access Microphone", allow microphone access when prompted 142 | 3. Click the "Record" button to initialize the WebRTC connection 143 | 4. Speak naturally after the connection is established 144 | 5. The application will convert your speech to text, process it with the AI model, and provide an audio response 145 | 6. 
The conversation history will be displayed in the chat window 146 | 147 | ## Technical Details 148 | 149 | ### Components 150 | 151 | - **app.py**: Main application server handling WebRTC connections and API endpoints 152 | - **speech.py**: Client for speech-to-text and text-to-speech services 153 | - **settings.py**: Configuration management using Pydantic 154 | - **index.html**: Web interface with WebRTC client implementation 155 | - **requirements.txt**: Python dependencies 156 | - **run.sh**: Convenience script to run the application 157 | 158 | ### Key Technologies 159 | 160 | - **FastRTC**: Handles WebRTC connections and audio streaming 161 | - **OpenAI API Client**: Used for compatible interfaces with local APIs and cloud services 162 | - **Gradio**: Provides UI components and server functionality 163 | - **Pydantic**: Configuration and settings management 164 | - **WebRTC**: Browser-based real-time communication 165 | 166 | ## Customization 167 | 168 | ### Switching Between Local and Cloud APIs 169 | 170 | Simply update the `.env` file with appropriate URLs and API keys: 171 | 172 | #### For Local API Setup: 173 | ``` 174 | LLM_BASE_URL="http://192.168.1.111:8880/v1" 175 | LLM_MODEL="llama-3.2-3b-instruct-q4_k_m" 176 | LLM_API_KEY="sk-1" # dummy api key required by openai package 177 | 178 | STT_BASE_URL="http://192.168.1.111:8880/v1" # or your FastWhisperAPI instance 179 | STT_MODEL="whisper-base" # or "small.en" 180 | 181 | TTS_BASE_URL="http://192.168.1.111:8880/v1" # or your FastKoko instance 182 | TTS_MODEL="en-us-ryan-high.onnx" # or "kokoro" 183 | TTS_VOICE="en-us-ryan-high.onnx" # or "af_heart" 184 | TTS_BACKEND="piper" # or "kokoro" 185 | ``` 186 | 187 | #### For Cloud API Setup: 188 | ``` 189 | LLM_BASE_URL="https://api.groq.com/openai/v1" 190 | LLM_MODEL="llama3-8b-8192" 191 | LLM_API_KEY="your-groq-api-key" 192 | 193 | STT_BASE_URL="https://api.groq.com/openai/v1" 194 | STT_MODEL="whisper-large-v3" 195 | STT_API_KEY="your-groq-api-key" 196 | 197 | TTS_BASE_URL="https://your-edge-tts-server/v1" 198 | TTS_MODEL="tts-1-hd" 199 | TTS_VOICE="en-US-AriaNeural" 200 | ``` 201 | 202 | ### Voice Customization 203 | 204 | Modify the TTS settings in `.env` to change voice characteristics: 205 | 206 | ``` 207 | TTS_VOICE="different_voice" # Voice ID depends on your TTS provider 208 | ``` 209 | 210 | ### UI Customization 211 | 212 | The web interface can be customized by editing the `index.html` file. The interface uses standard HTML, CSS, and JavaScript. 
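
### Programmatic Access in API Mode

When `MODE` is set to "API", the FastAPI endpoints defined in `app.py` can also be driven by external clients. The snippet below is a minimal, illustrative sketch (it is not part of the repository): it seeds the chat history through `/input_hook` and then listens to the `/outputs` server-sent-events stream for new messages. It assumes the server is reachable at `http://localhost:7860`, that the `requests` package is installed (it is not listed in `requirements.txt`), and that `webrtc_id` has already been obtained from a WebRTC session negotiated by the browser client in `index.html`.

```python
import json

import requests  # assumed to be installed separately; not in requirements.txt

BASE_URL = "http://localhost:7860"
webrtc_id = "replace-with-your-session-id"  # placeholder: comes from the WebRTC handshake

# Seed (or reset) the conversation history for this session.
requests.post(
    f"{BASE_URL}/input_hook",
    json={"webrtc_id": webrtc_id, "chatbot": []},
    timeout=10,
)

# Stream server-sent events and print each chat message as it arrives.
with requests.get(
    f"{BASE_URL}/outputs",
    params={"webrtc_id": webrtc_id},
    stream=True,
    timeout=None,
) as response:
    for line in response.iter_lines(decode_unicode=True):
        if line and line.startswith("data: "):
            message = json.loads(line[len("data: "):])
            print(f"{message['role']}: {message['content']}")
```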
213 | 214 | ## Troubleshooting 215 | 216 | ### Connection Issues 217 | - Ensure your AI services (local or cloud) are accessible 218 | - Check if the provided URLs in `.env` are correct 219 | - Verify API keys are valid for cloud services 220 | - Verify that WebRTC is supported in your browser 221 | - If behind a firewall, ensure WebRTC traffic is allowed 222 | 223 | ### Audio Problems 224 | - Check microphone permissions in your browser 225 | - Ensure audio output is enabled and volume is up 226 | - Try a different browser if issues persist 227 | - For local APIs, verify the models are properly loaded 228 | 229 | ### API-Specific Issues 230 | - **Local APIs**: Ensure sufficient system resources for running models 231 | - **Cloud APIs**: Check API quotas and rate limits 232 | - Verify API endpoint formatting is correct for your chosen provider 233 | 234 | ### Performance Considerations 235 | - STT and TTS processing can be resource-intensive for local setups 236 | - Smaller models may provide faster responses at the cost of quality 237 | - Consider adjusting the concurrent user limit based on your server capacity 238 | - Cloud APIs typically offer better performance but at a cost 239 | 240 | ## Contributing 241 | Contributions are welcome! To contribute: 242 | 1. Fork the repository. 243 | 2. Create a new branch: 244 | ```bash 245 | git checkout -b feature-name 246 | ``` 247 | 3. Commit your changes: 248 | ```bash 249 | git commit -m "Add feature description" 250 | ``` 251 | 4. Push to the branch: 252 | ```bash 253 | git push origin feature-name 254 | ``` 255 | 5. Open a pull request. 256 | 257 | ## License 258 | 259 | This project is open source and available under the [MIT License](LICENSE). 260 | 261 | ## Acknowledgements 262 | 263 | This project builds upon and integrates numerous open-source projects and commercial APIs: 264 | 265 | ### Core Technology 266 | - [FastRTC](https://fastrtc.org/) by Hugging Face - For real-time communication capabilities 267 | - [WebRTC](https://webrtc.org/) - For browser-based real-time communication protocol 268 | - [Gradio](https://gradio.app/) - For UI components and server functionality 269 | - [Pydantic](https://pydantic.dev/opensource) - For configuration and validation 270 | 271 | ### Inspiration 272 | - [Talk To Claude](https://huggingface.co/spaces/fastrtc/talk-to-claude) - The original project that inspired this adaptation 273 | 274 | ### Local AI Solutions 275 | - [LocalAI](https://localai.io/) - For local deployment of AI models 276 | - [whisper.cpp](https://github.com/ggerganov/whisper.cpp) - Backend for speech-to-text 277 | - [llama.cpp](https://github.com/ggerganov/llama.cpp) - Backend for text generation 278 | - [Piper](https://github.com/rhasspy/piper) - Backend for text-to-speech 279 | - [FastWhisperAPI](https://github.com/3choff/FastWhisperAPI) - For efficient speech recognition 280 | - [MLC LLM](https://llm.mlc.ai/) - For local deployment of language models 281 | - [FastKoko](https://github.com/remsky/Kokoro-FastAPI/) - For local text-to-speech synthesis 282 | - [Kokoro](https://huggingface.co/hexgrad/Kokoro-82M) - The underlying TTS model for FastKoko 283 | 284 | ### Cloud Services 285 | - [Groq](https://groq.com/) - For cloud LLM and STT services 286 | - Groq's [Speech-to-Text API](https://console.groq.com/docs/speech-to-text) 287 | - Groq's [Text Chat API](https://console.groq.com/docs/text-chat) 288 | - Microsoft Edge TTS - For cloud text-to-speech 289 | - [openai-edge-tts](https://github.com/travisvn/openai-edge-tts) - For 
OpenAI-compatible interface to Edge TTS 290 | 291 | ### Additional Libraries 292 | - Please refer to the [requirements.txt](requirements.txt) 293 | 294 | We extend our sincere appreciation to all the developers and organizations that have made their work available for integration into this project. -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from pathlib import Path 4 | 5 | import gradio as gr 6 | import numpy as np 7 | from fastapi import FastAPI 8 | from fastapi.responses import HTMLResponse, StreamingResponse 9 | from fastrtc import ( 10 | AdditionalOutputs, 11 | ReplyOnPause, 12 | Stream, 13 | get_twilio_turn_credentials, 14 | ) 15 | from fastrtc.utils import audio_to_bytes 16 | from gradio.utils import get_space 17 | from pydantic import BaseModel 18 | from speech import SpeechClient 19 | from settings import get_settings 20 | from openai import AsyncOpenAI 21 | import asyncio 22 | 23 | settings = get_settings() 24 | speech_client = SpeechClient( 25 | stt_base_url=settings.STT_BASE_URL, 26 | stt_model=settings.STT_MODEL, 27 | stt_api_key=settings.STT_API_KEY, 28 | stt_response_format=settings.STT_RESPONSE_FORMAT, 29 | tts_base_url=settings.TTS_BASE_URL, 30 | tts_api_key=settings.TTS_API_KEY, 31 | tts_model=settings.TTS_MODEL, 32 | tts_voice=settings.TTS_VOICE, 33 | tts_backend=settings.TTS_BACKEND, 34 | tts_audio_format=settings.TTS_AUDIO_FORMAT, 35 | language=settings.LANGUAGE, 36 | ) 37 | llm_client = AsyncOpenAI(api_key=settings.LLM_API_KEY.get_secret_value(), base_url=settings.LLM_BASE_URL) 38 | curr_dir = Path(__file__).parent 39 | 40 | async def async_response(audio, chatbot=None): 41 | """Asynchronous response function with optimized streaming.""" 42 | chatbot = chatbot or [] 43 | messages = [{"role": d["role"], "content": d["content"]} for d in chatbot] 44 | 45 | # Process STT 46 | prompt = await speech_client.speech_to_text(("audio-file.mp3", audio_to_bytes(audio))) 47 | chatbot.append({"role": "user", "content": prompt}) 48 | yield AdditionalOutputs(chatbot) 49 | messages.append({"role": "user", "content": prompt}) 50 | 51 | # Set up streaming response 52 | start = time.time() 53 | print("starting response pipeline", start) 54 | 55 | # Buffer for collecting the complete response 56 | complete_response = "" 57 | sentence_buffer = "" 58 | 59 | # Start LLM streaming 60 | stream = await llm_client.chat.completions.create( 61 | model=settings.LLM_MODEL, 62 | max_tokens=512, 63 | messages=messages, 64 | stream=True, 65 | ) 66 | 67 | async for chunk in stream: 68 | # Extract content from the chunk 69 | content = chunk.choices[0].delta.content 70 | if content is None: 71 | continue 72 | 73 | complete_response += content 74 | sentence_buffer += content 75 | 76 | # Check if we have a complete sentence or significant phrase 77 | if ('.' in content or '!' in content or '?' 
in content or '\n' in content) and len(sentence_buffer) > 15: 78 | # Process this sentence for TTS - use async for to iterate through yielded chunks 79 | async for audio_data in speech_client.text_to_speech_stream(sentence_buffer): 80 | yield audio_data 81 | 82 | sentence_buffer = "" 83 | 84 | # Process any remaining text in the buffer 85 | if sentence_buffer: 86 | async for audio_data in speech_client.text_to_speech_stream(sentence_buffer): 87 | yield audio_data 88 | 89 | # Update chat history 90 | chatbot.append({"role": "assistant", "content": complete_response}) 91 | yield AdditionalOutputs(chatbot) 92 | print("finished response pipeline", time.time() - start) 93 | 94 | def response(audio: tuple[int, np.ndarray], chatbot: list[dict] | None = None): 95 | """Synchronous wrapper for the asynchronous response generator.""" 96 | loop = asyncio.new_event_loop() 97 | asyncio.set_event_loop(loop) 98 | 99 | try: 100 | agen = async_response(audio, chatbot) 101 | 102 | while True: 103 | try: 104 | # Get the next item from the async generator 105 | item = loop.run_until_complete(agen.__anext__()) 106 | yield item 107 | except StopAsyncIteration: 108 | # Exit loop when the async generator is exhausted 109 | break 110 | except Exception as e: 111 | print(f"Error in response generator: {e}") 112 | # Continue with the next iteration rather than breaking completely 113 | continue 114 | finally: 115 | loop.close() 116 | 117 | chatbot = gr.Chatbot(type="messages") 118 | stream = Stream( 119 | modality="audio", 120 | mode="send-receive", 121 | handler=ReplyOnPause(response), 122 | additional_outputs_handler=lambda a, b: b, 123 | additional_inputs=[chatbot], 124 | additional_outputs=[chatbot], 125 | rtc_configuration=get_twilio_turn_credentials() if get_space() else None, 126 | concurrency_limit=5 if get_space() else None, 127 | time_limit=90 if get_space() else None, 128 | ) 129 | 130 | 131 | class Message(BaseModel): 132 | role: str 133 | content: str 134 | 135 | 136 | class InputData(BaseModel): 137 | webrtc_id: str 138 | chatbot: list[Message] 139 | 140 | 141 | app = FastAPI() 142 | stream.mount(app) 143 | 144 | 145 | @app.get("/") 146 | async def _(): 147 | rtc_config = get_twilio_turn_credentials() if get_space() else None 148 | html_content = (curr_dir / "index.html").read_text() 149 | html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config)) 150 | return HTMLResponse(content=html_content, status_code=200) 151 | 152 | 153 | @app.post("/input_hook") 154 | async def _(body: InputData): 155 | stream.set_input(body.webrtc_id, body.model_dump()["chatbot"]) 156 | return {"status": "ok"} 157 | 158 | 159 | @app.get("/outputs") 160 | def _(webrtc_id: str): 161 | async def output_stream(): 162 | async for output in stream.output_stream(webrtc_id): 163 | chatbot = output.args[0] 164 | yield f"event: output\ndata: {json.dumps(chatbot[-1])}\n\n" 165 | 166 | return StreamingResponse(output_stream(), media_type="text/event-stream") 167 | 168 | 169 | if __name__ == "__main__": 170 | if (mode := settings.MODE) == "UI": 171 | stream.ui.launch(server_port=7860, server_name="0.0.0.0") 172 | elif mode == "PHONE": 173 | stream.fastphone(host="0.0.0.0", port=7860) 174 | else: 175 | import uvicorn 176 | 177 | uvicorn.run(app, host="0.0.0.0", port=7860) -------------------------------------------------------------------------------- /images/screen.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/limcheekin/talk-to-ai/a6365e7cedd55d24df775959ec8912e7ca0f5836/images/screen.png -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 7 | Talk To LocalAI
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastrtc[vad] 2 | twilio 3 | openai 4 | gradio 5 | pydantic_settings 6 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the virtual environment is activated 4 | if [[ -z "$VIRTUAL_ENV" ]]; then 5 | echo "Activating virtual environment..." 6 | source .venv/bin/activate 7 | else 8 | echo "Virtual environment already activated." 9 | fi 10 | 11 | python app.py 12 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | from pydantic import SecretStr, Field 2 | from pydantic_settings import BaseSettings, SettingsConfigDict 3 | 4 | class Settings(BaseSettings): 5 | """ 6 | Centralized application configuration. 7 | Environment variables are loaded automatically from a .env file. 8 | """ 9 | 10 | # LLM configuration 11 | LLM_BASE_URL: str = Field(..., description="Base URL for the LLM service") 12 | LLM_MODEL: str = Field(..., description="Model name for the LLM service") 13 | LLM_API_KEY: SecretStr = Field(..., description="API key for the LLM service") 14 | 15 | # STT configuration 16 | STT_BASE_URL: str = Field(..., description="Base URL for the STT service") 17 | STT_API_KEY: SecretStr = Field("", description="API key for the STT service") 18 | STT_MODEL: str = Field(..., description="Model name for the STT service") 19 | STT_RESPONSE_FORMAT: str = Field(..., description="Response format for the STT service") 20 | LANGUAGE: str = Field("en", description="Language setting for the STT service") 21 | 22 | # TTS configuration 23 | TTS_BASE_URL: str = Field(..., description="Base URL for the TTS service") 24 | TTS_API_KEY: SecretStr = Field(..., description="API key for the TTS service") 25 | TTS_MODEL: str = Field(..., description="Model name for the TTS service") 26 | TTS_VOICE: str = Field(..., description="Voice(s) to use for the TTS service") 27 | TTS_BACKEND: str = Field(..., description="Backend identifier for TTS processing") 28 | TTS_AUDIO_FORMAT: str = Field(..., description="Audio format for TTS service") 29 | 30 | # Application mode 31 | MODE: str = Field(..., description="Mode of the application, e.g. 'UI', 'PHONE'") 32 | 33 | # Load environment variables from .env file using UTF-8 encoding.
34 | model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8") 35 | 36 | # Optionally, for use in web frameworks, wrap settings retrieval in a function with caching: 37 | from functools import lru_cache 38 | 39 | @lru_cache() 40 | def get_settings() -> Settings: 41 | return Settings() 42 | -------------------------------------------------------------------------------- /speech.py: -------------------------------------------------------------------------------- 1 | from openai import AsyncOpenAI 2 | from pydantic import SecretStr 3 | import numpy as np 4 | 5 | class SpeechClient: 6 | def __init__(self, 7 | stt_base_url: str, 8 | stt_api_key: SecretStr, 9 | stt_model: str, 10 | stt_response_format: str, 11 | tts_base_url: str, 12 | tts_api_key: SecretStr, 13 | tts_model: str, 14 | tts_voice: str, 15 | tts_backend: str, 16 | tts_audio_format: str, 17 | language: str = 'en'): 18 | self.__stt_model = stt_model 19 | self.__stt_response_format = stt_response_format 20 | self.__language = language 21 | self.__tts_model = tts_model 22 | self.__tts_voice = tts_voice 23 | self.__tts_backend = tts_backend 24 | self.__tts_audio_format = tts_audio_format 25 | # Initialize async clients 26 | self.__tts_client = AsyncOpenAI(api_key=tts_api_key.get_secret_value(), base_url=tts_base_url) 27 | self.__stt_client = AsyncOpenAI(api_key=stt_api_key.get_secret_value(), base_url=stt_base_url) 28 | 29 | async def speech_to_text(self, audio_file: tuple) -> str: 30 | """Asynchronous version of speech-to-text conversion""" 31 | response = await self.__stt_client.audio.transcriptions.create( 32 | model=self.__stt_model, 33 | file=audio_file, 34 | language=self.__language, 35 | response_format=self.__stt_response_format, 36 | ) 37 | return response.text 38 | 39 | async def text_to_speech_stream(self, text: str): 40 | """Process a text chunk and yield audio data sequentially. 41 | 42 | This function creates a streaming TTS response and yields each audio chunk 43 | as it becomes available, allowing for real-time audio playback. 44 | """ 45 | try: 46 | async with self.__tts_client.audio.speech.with_streaming_response.create( 47 | model=self.__tts_model, 48 | voice=self.__tts_voice, 49 | input=text, 50 | response_format=self.__tts_audio_format, 51 | extra_body={"backend": self.__tts_backend, "language": self.__language}, 52 | ) as stream_audio: 53 | # Iterate through all audio chunks in the stream 54 | print("\nProcessing audio chunk...") 55 | async for audio_chunk in stream_audio.iter_bytes(chunk_size=1024): 56 | print(".", end="") 57 | audio_array = np.frombuffer(audio_chunk, dtype=np.int16).reshape(1, -1) 58 | yield (24000, audio_array) 59 | #audio_array = np.frombuffer(audio_chunk, dtype=np.uint8).reshape(1, -1) 60 | #yield (48000, audio_array) 61 | print() 62 | except Exception as e: 63 | print(f"Error in TTS processing: {e}") 64 | 65 | 66 | --------------------------------------------------------------------------------
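
The following is a minimal sketch (not part of the repository) showing how `SpeechClient.text_to_speech_stream` can be exercised on its own. It reuses the same `.env`-driven settings as `app.py`, assumes a reachable TTS endpoint with `TTS_AUDIO_FORMAT="pcm"`, and collects the streamed 24 kHz, 16-bit mono chunks into a WAV file; the output file name and test sentence are arbitrary.

```python
# Minimal sketch: drive SpeechClient's TTS stream directly and save the result.
# Assumes the .env file used by app.py is present and the TTS endpoint is reachable.
import asyncio
import wave

import numpy as np

from settings import get_settings
from speech import SpeechClient


async def main():
    settings = get_settings()
    client = SpeechClient(
        stt_base_url=settings.STT_BASE_URL,
        stt_api_key=settings.STT_API_KEY,
        stt_model=settings.STT_MODEL,
        stt_response_format=settings.STT_RESPONSE_FORMAT,
        tts_base_url=settings.TTS_BASE_URL,
        tts_api_key=settings.TTS_API_KEY,
        tts_model=settings.TTS_MODEL,
        tts_voice=settings.TTS_VOICE,
        tts_backend=settings.TTS_BACKEND,
        tts_audio_format=settings.TTS_AUDIO_FORMAT,
        language=settings.LANGUAGE,
    )

    sample_rate = 24000  # the generator yields (24000, int16 array) tuples
    chunks = []
    async for rate, audio in client.text_to_speech_stream("Hello from Talk To AI."):
        sample_rate = rate
        chunks.append(audio.flatten())

    if not chunks:
        raise RuntimeError("No audio was returned; check the TTS settings in .env")

    # Write the collected PCM frames as a mono 16-bit WAV file.
    with wave.open("tts-test.wav", "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)  # int16
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(np.concatenate(chunks).tobytes())


if __name__ == "__main__":
    asyncio.run(main())
```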