├── .env.example ├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── app.py ├── images └── screen.png ├── index.html ├── requirements.txt ├── run.sh ├── settings.py └── speech.py /.env.example: -------------------------------------------------------------------------------- 1 | LLM_BASE_URL="http://192.168.1.111:8880/v1" 2 | LLM_MODEL="llama-3.2-3b-instruct-q4_k_m" 3 | LLM_API_KEY="sk-XXXX" 4 | 5 | STT_BASE_URL="http://192.168.1.111:8880/v1" 6 | STT_API_KEY="sk-XXXX" 7 | STT_MODEL="whisper-base" 8 | STT_RESPONSE_FORMAT="verbose_json" 9 | LANGUAGE="en" 10 | 11 | TTS_BASE_URL="http://192.168.1.111:8884/v1" 12 | TTS_API_KEY="dummy" 13 | TTS_MODEL="kokoro" 14 | TTS_VOICE="af_heart" 15 | TTS_BACKEND="kokoro" 16 | TTS_AUDIO_FORMAT="pcm" 17 | 18 | MODE="UI" -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | *.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.pth filter=lfs diff=lfs merge=lfs -text 24 | *.rar filter=lfs diff=lfs merge=lfs -text 25 | *.safetensors filter=lfs diff=lfs merge=lfs -text 26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 27 | *.tar.* filter=lfs diff=lfs merge=lfs -text 28 | *.tar filter=lfs diff=lfs merge=lfs -text 29 | *.tflite filter=lfs diff=lfs merge=lfs -text 30 | *.tgz filter=lfs diff=lfs merge=lfs -text 31 | *.wasm filter=lfs diff=lfs merge=lfs -text 32 | *.xz filter=lfs diff=lfs merge=lfs -text 33 | *.zip filter=lfs diff=lfs merge=lfs -text 34 | *.zst filter=lfs diff=lfs merge=lfs -text 35 | *tfevents* filter=lfs diff=lfs merge=lfs -text 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .env.local 133 | .env.cloud 134 | .venv 135 | env/ 136 | venv/ 137 | ENV/ 138 | env.bak/ 139 | venv.bak/ 140 | 141 | # Spyder project settings 142 | .spyderproject 143 | .spyproject 144 | 145 | # Rope project settings 146 | .ropeproject 147 | 148 | # mkdocs documentation 149 | /site 150 | 151 | # mypy 152 | .mypy_cache/ 153 | .dmypy.json 154 | dmypy.json 155 | 156 | # Pyre type checker 157 | .pyre/ 158 | 159 | # pytype static type analyzer 160 | .pytype/ 161 | 162 | # Cython debug symbols 163 | cython_debug/ 164 | 165 | # PyCharm 166 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 167 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 168 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 169 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 170 | #.idea/ 171 | 172 | # PyPI configuration file 173 | .pypirc 174 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Lim Chee Kin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Talk To AI with FastRTC 2 | 3 | A real-time voice conversation application powered by [FastRTC](https://fastrtc.org/) that enables interactive audio communication with both Local and Cloud AI models. Inspired by [Talk To Claude](https://huggingface.co/spaces/fastrtc/talk-to-claude), this project transforms text-based AI interactions into natural voice conversations. 4 | 5 | ## Overview 6 | 7 | This application creates a seamless voice interface to interact with AI models. It provides: 8 | 9 | - Real-time speech-to-text conversion using various STT models 10 | - Enables text generation using local or cloud-based language models through an OpenAI-compatible API 11 | - High-quality text-to-speech synthesis 12 | - Interactive web interface with audio visualization 13 | - Flexibility to use either Local or Cloud APIs with simple configuration changes 14 | 15 | ![Talk to AI Screen](images/screen.png) 16 | 17 | ## System Architecture 18 | 19 | The application follows a modular architecture with the following components: 20 | 21 | ``` 22 | ┌────────────────┐ ┌────────────────┐ ┌────────────────┐ 23 | │ Web Browser │<────>│ FastRTC API │<────>│ OpenAI API │ 24 | │ (WebRTC+UI) │ │ (Python App) │ │(Local or Cloud)│ 25 | └────────────────┘ └────────────────┘ └────────────────┘ 26 | ▲ 27 | │───────────────────────│───────────────────────│ 28 | ▼ ▼ ▼ 29 | ┌────────────────┐ ┌────────────────┐ ┌────────────────┐ 30 | │ STT Server │ │ LLM Server │ │ TTS Server │ 31 | │(Local or Cloud)│ │(Local or Cloud)│ │(Local or Cloud)│ 32 | └────────────────┘ └────────────────┘ └────────────────┘ 33 | ``` 34 | 35 | ## API Compatibility 36 | 37 | The application has been tested with the following API combinations: 38 | 39 | ### 1. 
Local APIs 40 | - **STT**: [LocalAI with whisper.cpp backend](https://localai.io/features/audio-to-text/), [FastWhisperAPI](https://github.com/3choff/FastWhisperAPI) 41 | - **LLM**: [LocalAI with llama.cpp backend](https://localai.io/features/text-generation/), [MLC LLM](https://llm.mlc.ai/) 42 | - **TTS**: [LocalAI with Piper backend](https://localai.io/features/text-to-audio/), [FastKoko](https://github.com/remsky/Kokoro-FastAPI/) 43 | 44 | ### 2. Cloud APIs 45 | - **STT**: [Groq](https://console.groq.com/docs/speech-to-text) 46 | - **LLM**: [Groq](https://console.groq.com/docs/text-chat) 47 | - **TTS**: [Microsoft Edge TTS with openai-edge-tts](https://github.com/travisvn/openai-edge-tts) 48 | 49 | ## Features 50 | 51 | - **API Flexibility**: Switch between local and cloud APIs with simple .env file changes 52 | - **Real-time Voice Interaction**: Speak naturally and receive AI responses as audio 53 | - **WebRTC Integration**: Low-latency audio streaming with network traversal capability 54 | - **Progressive TTS Playback**: Audio responses begin playing as soon as sentences are completed 55 | - **Responsive Audio Visualization**: Visual feedback of audio input/output 56 | - **Configurable AI Models**: Easily switch between different AI models 57 | - **Customizable Voice Settings**: Configure voice, language and audio format 58 | - **Multiple Deployment Options**: UI, API, or phone integration 59 | 60 | ## Prerequisites 61 | 62 | - Python 3.8+ 63 | - Local AI instance or Cloud API credentials 64 | - FastRTC-compatible environment 65 | - Modern web browser with WebRTC support 66 | 67 | ## Installation 68 | 69 | 1. Clone the repository: 70 | ```bash 71 | git clone https://github.com/limcheekin/talk-to-ai.git 72 | cd talk-to-ai 73 | ``` 74 | 75 | 2. Create and activate a virtual environment: 76 | ```bash 77 | python -m venv .venv 78 | source .venv/bin/activate # On Windows: .venv\Scripts\activate 79 | ``` 80 | 81 | 3. Install dependencies: 82 | ```bash 83 | pip install -r requirements.txt 84 | ``` 85 | 86 | 4. Configure environment variables: 87 | ```bash 88 | cp .env.example .env 89 | # Edit .env with your settings 90 | ``` 91 | 92 | ## Configuration 93 | 94 | Edit the `.env` file to configure the following settings. 
You can easily switch between local and cloud providers by updating these settings: 95 | 96 | ### LLM Configuration 97 | - `LLM_BASE_URL`: URL of your AI instance (e.g., "http://192.168.1.111:8880/v1" for LocalAI or "https://api.groq.com/openai/v1" for Groq) 98 | - `LLM_MODEL`: Name of the language model to use (e.g., "llama-3.2-3b-instruct-q4_k_m" for LocalAI or "llama3-8b-8192" for Groq) 99 | - `LLM_API_KEY`: API key for your AI service (required for cloud APIs, `dummy_api_key` for local) 100 | 101 | ### Speech-to-Text Configuration 102 | - `STT_BASE_URL`: URL of your STT service (can be LocalAI or cloud service like Groq) 103 | - `STT_API_KEY`: API key for STT service (required for cloud APIs) 104 | - `STT_MODEL`: Model to use for speech recognition (e.g., "whisper-base" for LocalAI or "whisper-large-v3" for Groq) 105 | - `STT_RESPONSE_FORMAT`: Format for STT responses (e.g., "verbose_json") 106 | - `LANGUAGE`: Language code for speech recognition (e.g., "en") 107 | 108 | ### Text-to-Speech Configuration 109 | - `TTS_BASE_URL`: URL of your TTS service (LocalAI, FastKoko, or cloud services like Edge TTS) 110 | - `TTS_API_KEY`: API key for TTS service (required for cloud APIs) 111 | - `TTS_MODEL`: TTS model to use (e.g., "kokoro" or "tts-1-hd" or "tts-1") 112 | - `TTS_VOICE`: Voice ID to use (e.g., "af_heart" for Kokoro or "en-US-AriaNeural" for Edge TTS) 113 | - `TTS_BACKEND`: TTS backend identifier (e.g., "kokoro" or "edge-tts") 114 | - `TTS_AUDIO_FORMAT`: Output audio format (e.g., "pcm") 115 | 116 | ### Application Mode 117 | - `MODE`: Deployment mode ("UI", "PHONE", or "API") 118 | 119 | ## Running the Application 120 | 121 | Start the application using the provided shell script: 122 | 123 | ```bash 124 | chmod +x run.sh 125 | ./run.sh 126 | ``` 127 | 128 | Or run it directly with Python: 129 | 130 | ```bash 131 | python app.py 132 | ``` 133 | 134 | The application will be available at: 135 | - UI mode: `http://localhost:7860` 136 | - API mode: `http://localhost:7860/` 137 | 138 | ## Usage 139 | 140 | 1. Open the web interface in your browser 141 | 2. Click the microphone icon or "Click to Access Microphone", allow microphone access when prompted 142 | 3. Click the "Record" button to initialize the WebRTC connection 143 | 4. Speak naturally after the connection is established 144 | 5. The application will convert your speech to text, process it with the AI model, and provide an audio response 145 | 6. 
The conversation history will be displayed in the chat window 146 | 147 | ## Technical Details 148 | 149 | ### Components 150 | 151 | - **app.py**: Main application server handling WebRTC connections and API endpoints 152 | - **speech.py**: Client for speech-to-text and text-to-speech services 153 | - **settings.py**: Configuration management using Pydantic 154 | - **index.html**: Web interface with WebRTC client implementation 155 | - **requirements.txt**: Python dependencies 156 | - **run.sh**: Convenience script to run the application 157 | 158 | ### Key Technologies 159 | 160 | - **FastRTC**: Handles WebRTC connections and audio streaming 161 | - **OpenAI API Client**: Used for compatible interfaces with local APIs and cloud services 162 | - **Gradio**: Provides UI components and server functionality 163 | - **Pydantic**: Configuration and settings management 164 | - **WebRTC**: Browser-based real-time communication 165 | 166 | ## Customization 167 | 168 | ### Switching Between Local and Cloud APIs 169 | 170 | Simply update the `.env` file with appropriate URLs and API keys: 171 | 172 | #### For Local API Setup: 173 | ``` 174 | LLM_BASE_URL="http://192.168.1.111:8880/v1" 175 | LLM_MODEL="llama-3.2-3b-instruct-q4_k_m" 176 | LLM_API_KEY="sk-1" # dummy api key required by openai package 177 | 178 | STT_BASE_URL="http://192.168.1.111:8880/v1" # or your FastWhisperAPI instance 179 | STT_MODEL="whisper-base" # or "small.en" 180 | 181 | TTS_BASE_URL="http://192.168.1.111:8880/v1" # or your FastKoko instance 182 | TTS_MODEL="en-us-ryan-high.onnx" # or "kokoro" 183 | TTS_VOICE="en-us-ryan-high.onnx" # or "af_heart" 184 | TTS_BACKEND="piper" # or "kokoro" 185 | ``` 186 | 187 | #### For Cloud API Setup: 188 | ``` 189 | LLM_BASE_URL="https://api.groq.com/openai/v1" 190 | LLM_MODEL="llama3-8b-8192" 191 | LLM_API_KEY="your-groq-api-key" 192 | 193 | STT_BASE_URL="https://api.groq.com/openai/v1" 194 | STT_MODEL="whisper-large-v3" 195 | STT_API_KEY="your-groq-api-key" 196 | 197 | TTS_BASE_URL="https://your-edge-tts-server/v1" 198 | TTS_MODEL="tts-1-hd" 199 | TTS_VOICE="en-US-AriaNeural" 200 | ``` 201 | 202 | ### Voice Customization 203 | 204 | Modify the TTS settings in `.env` to change voice characteristics: 205 | 206 | ``` 207 | TTS_VOICE="different_voice" # Voice ID depends on your TTS provider 208 | ``` 209 | 210 | ### UI Customization 211 | 212 | The web interface can be customized by editing the `index.html` file. The interface uses standard HTML, CSS, and JavaScript. 
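
### Programmatic Access in API Mode

When `MODE` is set to "API", the FastAPI endpoints defined in `app.py` can also be driven by external clients. The snippet below is a minimal, illustrative sketch (it is not part of the repository): it seeds the chat history through `/input_hook` and then listens to the `/outputs` server-sent-events stream for new messages. It assumes the server is reachable at `http://localhost:7860`, that the `requests` package is installed (it is not listed in `requirements.txt`), and that `webrtc_id` has already been obtained from a WebRTC session negotiated by the browser client in `index.html`.

```python
import json

import requests  # assumed to be installed separately; not in requirements.txt

BASE_URL = "http://localhost:7860"
webrtc_id = "replace-with-your-session-id"  # placeholder: comes from the WebRTC handshake

# Seed (or reset) the conversation history for this session.
requests.post(
    f"{BASE_URL}/input_hook",
    json={"webrtc_id": webrtc_id, "chatbot": []},
    timeout=10,
)

# Stream server-sent events and print each chat message as it arrives.
with requests.get(
    f"{BASE_URL}/outputs",
    params={"webrtc_id": webrtc_id},
    stream=True,
    timeout=None,
) as response:
    for line in response.iter_lines(decode_unicode=True):
        if line and line.startswith("data: "):
            message = json.loads(line[len("data: "):])
            print(f"{message['role']}: {message['content']}")
```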
213 | 214 | ## Troubleshooting 215 | 216 | ### Connection Issues 217 | - Ensure your AI services (local or cloud) are accessible 218 | - Check if the provided URLs in `.env` are correct 219 | - Verify API keys are valid for cloud services 220 | - Verify that WebRTC is supported in your browser 221 | - If behind a firewall, ensure WebRTC traffic is allowed 222 | 223 | ### Audio Problems 224 | - Check microphone permissions in your browser 225 | - Ensure audio output is enabled and volume is up 226 | - Try a different browser if issues persist 227 | - For local APIs, verify the models are properly loaded 228 | 229 | ### API-Specific Issues 230 | - **Local APIs**: Ensure sufficient system resources for running models 231 | - **Cloud APIs**: Check API quotas and rate limits 232 | - Verify API endpoint formatting is correct for your chosen provider 233 | 234 | ### Performance Considerations 235 | - STT and TTS processing can be resource-intensive for local setups 236 | - Smaller models may provide faster responses at the cost of quality 237 | - Consider adjusting the concurrent user limit based on your server capacity 238 | - Cloud APIs typically offer better performance but at a cost 239 | 240 | ## Contributing 241 | Contributions are welcome! To contribute: 242 | 1. Fork the repository. 243 | 2. Create a new branch: 244 | ```bash 245 | git checkout -b feature-name 246 | ``` 247 | 3. Commit your changes: 248 | ```bash 249 | git commit -m "Add feature description" 250 | ``` 251 | 4. Push to the branch: 252 | ```bash 253 | git push origin feature-name 254 | ``` 255 | 5. Open a pull request. 256 | 257 | ## License 258 | 259 | This project is open source and available under the [MIT License](LICENSE). 260 | 261 | ## Acknowledgements 262 | 263 | This project builds upon and integrates numerous open-source projects and commercial APIs: 264 | 265 | ### Core Technology 266 | - [FastRTC](https://fastrtc.org/) by Hugging Face - For real-time communication capabilities 267 | - [WebRTC](https://webrtc.org/) - For browser-based real-time communication protocol 268 | - [Gradio](https://gradio.app/) - For UI components and server functionality 269 | - [Pydantic](https://pydantic.dev/opensource) - For configuration and validation 270 | 271 | ### Inspiration 272 | - [Talk To Claude](https://huggingface.co/spaces/fastrtc/talk-to-claude) - The original project that inspired this adaptation 273 | 274 | ### Local AI Solutions 275 | - [LocalAI](https://localai.io/) - For local deployment of AI models 276 | - [whisper.cpp](https://github.com/ggerganov/whisper.cpp) - Backend for speech-to-text 277 | - [llama.cpp](https://github.com/ggerganov/llama.cpp) - Backend for text generation 278 | - [Piper](https://github.com/rhasspy/piper) - Backend for text-to-speech 279 | - [FastWhisperAPI](https://github.com/3choff/FastWhisperAPI) - For efficient speech recognition 280 | - [MLC LLM](https://llm.mlc.ai/) - For local deployment of language models 281 | - [FastKoko](https://github.com/remsky/Kokoro-FastAPI/) - For local text-to-speech synthesis 282 | - [Kokoro](https://huggingface.co/hexgrad/Kokoro-82M) - The underlying TTS model for FastKoko 283 | 284 | ### Cloud Services 285 | - [Groq](https://groq.com/) - For cloud LLM and STT services 286 | - Groq's [Speech-to-Text API](https://console.groq.com/docs/speech-to-text) 287 | - Groq's [Text Chat API](https://console.groq.com/docs/text-chat) 288 | - Microsoft Edge TTS - For cloud text-to-speech 289 | - [openai-edge-tts](https://github.com/travisvn/openai-edge-tts) - For 
OpenAI-compatible interface to Edge TTS 290 | 291 | ### Additional Libraries 292 | - Please refer to the [requirements.txt](requirements.txt) 293 | 294 | We extend our sincere appreciation to all the developers and organizations that have made their work available for integration into this project. -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from pathlib import Path 4 | 5 | import gradio as gr 6 | import numpy as np 7 | from fastapi import FastAPI 8 | from fastapi.responses import HTMLResponse, StreamingResponse 9 | from fastrtc import ( 10 | AdditionalOutputs, 11 | ReplyOnPause, 12 | Stream, 13 | get_twilio_turn_credentials, 14 | ) 15 | from fastrtc.utils import audio_to_bytes 16 | from gradio.utils import get_space 17 | from pydantic import BaseModel 18 | from speech import SpeechClient 19 | from settings import get_settings 20 | from openai import AsyncOpenAI 21 | import asyncio 22 | 23 | settings = get_settings() 24 | speech_client = SpeechClient( 25 | stt_base_url=settings.STT_BASE_URL, 26 | stt_model=settings.STT_MODEL, 27 | stt_api_key=settings.STT_API_KEY, 28 | stt_response_format=settings.STT_RESPONSE_FORMAT, 29 | tts_base_url=settings.TTS_BASE_URL, 30 | tts_api_key=settings.TTS_API_KEY, 31 | tts_model=settings.TTS_MODEL, 32 | tts_voice=settings.TTS_VOICE, 33 | tts_backend=settings.TTS_BACKEND, 34 | tts_audio_format=settings.TTS_AUDIO_FORMAT, 35 | language=settings.LANGUAGE, 36 | ) 37 | llm_client = AsyncOpenAI(api_key=settings.LLM_API_KEY.get_secret_value(), base_url=settings.LLM_BASE_URL) 38 | curr_dir = Path(__file__).parent 39 | 40 | async def async_response(audio, chatbot=None): 41 | """Asynchronous response function with optimized streaming.""" 42 | chatbot = chatbot or [] 43 | messages = [{"role": d["role"], "content": d["content"]} for d in chatbot] 44 | 45 | # Process STT 46 | prompt = await speech_client.speech_to_text(("audio-file.mp3", audio_to_bytes(audio))) 47 | chatbot.append({"role": "user", "content": prompt}) 48 | yield AdditionalOutputs(chatbot) 49 | messages.append({"role": "user", "content": prompt}) 50 | 51 | # Set up streaming response 52 | start = time.time() 53 | print("starting response pipeline", start) 54 | 55 | # Buffer for collecting the complete response 56 | complete_response = "" 57 | sentence_buffer = "" 58 | 59 | # Start LLM streaming 60 | stream = await llm_client.chat.completions.create( 61 | model=settings.LLM_MODEL, 62 | max_tokens=512, 63 | messages=messages, 64 | stream=True, 65 | ) 66 | 67 | async for chunk in stream: 68 | # Extract content from the chunk 69 | content = chunk.choices[0].delta.content 70 | if content is None: 71 | continue 72 | 73 | complete_response += content 74 | sentence_buffer += content 75 | 76 | # Check if we have a complete sentence or significant phrase 77 | if ('.' in content or '!' in content or '?' 
in content or '\n' in content) and len(sentence_buffer) > 15: 78 | # Process this sentence for TTS - use async for to iterate through yielded chunks 79 | async for audio_data in speech_client.text_to_speech_stream(sentence_buffer): 80 | yield audio_data 81 | 82 | sentence_buffer = "" 83 | 84 | # Process any remaining text in the buffer 85 | if sentence_buffer: 86 | async for audio_data in speech_client.text_to_speech_stream(sentence_buffer): 87 | yield audio_data 88 | 89 | # Update chat history 90 | chatbot.append({"role": "assistant", "content": complete_response}) 91 | yield AdditionalOutputs(chatbot) 92 | print("finished response pipeline", time.time() - start) 93 | 94 | def response(audio: tuple[int, np.ndarray], chatbot: list[dict] | None = None): 95 | """Synchronous wrapper for the asynchronous response generator.""" 96 | loop = asyncio.new_event_loop() 97 | asyncio.set_event_loop(loop) 98 | 99 | try: 100 | agen = async_response(audio, chatbot) 101 | 102 | while True: 103 | try: 104 | # Get the next item from the async generator 105 | item = loop.run_until_complete(agen.__anext__()) 106 | yield item 107 | except StopAsyncIteration: 108 | # Exit loop when the async generator is exhausted 109 | break 110 | except Exception as e: 111 | print(f"Error in response generator: {e}") 112 | # Continue with the next iteration rather than breaking completely 113 | continue 114 | finally: 115 | loop.close() 116 | 117 | chatbot = gr.Chatbot(type="messages") 118 | stream = Stream( 119 | modality="audio", 120 | mode="send-receive", 121 | handler=ReplyOnPause(response), 122 | additional_outputs_handler=lambda a, b: b, 123 | additional_inputs=[chatbot], 124 | additional_outputs=[chatbot], 125 | rtc_configuration=get_twilio_turn_credentials() if get_space() else None, 126 | concurrency_limit=5 if get_space() else None, 127 | time_limit=90 if get_space() else None, 128 | ) 129 | 130 | 131 | class Message(BaseModel): 132 | role: str 133 | content: str 134 | 135 | 136 | class InputData(BaseModel): 137 | webrtc_id: str 138 | chatbot: list[Message] 139 | 140 | 141 | app = FastAPI() 142 | stream.mount(app) 143 | 144 | 145 | @app.get("/") 146 | async def _(): 147 | rtc_config = get_twilio_turn_credentials() if get_space() else None 148 | html_content = (curr_dir / "index.html").read_text() 149 | html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config)) 150 | return HTMLResponse(content=html_content, status_code=200) 151 | 152 | 153 | @app.post("/input_hook") 154 | async def _(body: InputData): 155 | stream.set_input(body.webrtc_id, body.model_dump()["chatbot"]) 156 | return {"status": "ok"} 157 | 158 | 159 | @app.get("/outputs") 160 | def _(webrtc_id: str): 161 | async def output_stream(): 162 | async for output in stream.output_stream(webrtc_id): 163 | chatbot = output.args[0] 164 | yield f"event: output\ndata: {json.dumps(chatbot[-1])}\n\n" 165 | 166 | return StreamingResponse(output_stream(), media_type="text/event-stream") 167 | 168 | 169 | if __name__ == "__main__": 170 | if (mode := settings.MODE) == "UI": 171 | stream.ui.launch(server_port=7860, server_name="0.0.0.0") 172 | elif mode == "PHONE": 173 | stream.fastphone(host="0.0.0.0", port=7860) 174 | else: 175 | import uvicorn 176 | 177 | uvicorn.run(app, host="0.0.0.0", port=7860) -------------------------------------------------------------------------------- /images/screen.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/limcheekin/talk-to-ai/a6365e7cedd55d24df775959ec8912e7ca0f5836/images/screen.png -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 7 | Talk To LocalAI
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastrtc[vad] 2 | twilio 3 | openai 4 | gradio 5 | pydantic_settings 6 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the virtual environment is activated 4 | if [[ -z "$VIRTUAL_ENV" ]]; then 5 | echo "Activating virtual environment..." 6 | source .venv/bin/activate 7 | else 8 | echo "Virtual environment already activated." 9 | fi 10 | 11 | python app.py 12 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | from pydantic import SecretStr, Field 2 | from pydantic_settings import BaseSettings, SettingsConfigDict 3 | 4 | class Settings(BaseSettings): 5 | """ 6 | Centralized application configuration. 7 | Environment variables are loaded automatically from a .env file. 8 | """ 9 | 10 | # LLM configuration 11 | LLM_BASE_URL: str = Field(..., description="Base URL for the LLM service") 12 | LLM_MODEL: str = Field(..., description="Model name for the LLM service") 13 | LLM_API_KEY: SecretStr = Field(..., description="API key for the LLM service") 14 | 15 | # STT configuration 16 | STT_BASE_URL: str = Field(..., description="Base URL for the STT service") 17 | STT_API_KEY: SecretStr = Field("", description="API key for the STT service") 18 | STT_MODEL: str = Field(..., description="Model name for the STT service") 19 | STT_RESPONSE_FORMAT: str = Field(..., description="Response format for the STT service") 20 | LANGUAGE: str = Field("en", description="Language setting for the STT service") 21 | 22 | # TTS configuration 23 | TTS_BASE_URL: str = Field(..., description="Base URL for the TTS service") 24 | TTS_API_KEY: SecretStr = Field(..., description="API key for the TTS service") 25 | TTS_MODEL: str = Field(..., description="Model name for the TTS service") 26 | TTS_VOICE: str = Field(..., description="Voice(s) to use for the TTS service") 27 | TTS_BACKEND: str = Field(..., description="Backend identifier for TTS processing") 28 | TTS_AUDIO_FORMAT: str = Field(..., description="Audio format for TTS service") 29 | 30 | # Application mode 31 | MODE: str = Field(..., description="Mode of the application, e.g. 'UI', 'PHONE'") 32 | 33 | # Load environment variables from .env file using UTF-8 encoding.
34 | model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8") 35 | 36 | # Optionally, for use in web frameworks, wrap settings retrieval in a function with caching: 37 | from functools import lru_cache 38 | 39 | @lru_cache() 40 | def get_settings() -> Settings: 41 | return Settings() 42 | -------------------------------------------------------------------------------- /speech.py: -------------------------------------------------------------------------------- 1 | from openai import AsyncOpenAI 2 | from pydantic import SecretStr 3 | import numpy as np 4 | 5 | class SpeechClient: 6 | def __init__(self, 7 | stt_base_url: str, 8 | stt_api_key: SecretStr, 9 | stt_model: str, 10 | stt_response_format: str, 11 | tts_base_url: str, 12 | tts_api_key: SecretStr, 13 | tts_model: str, 14 | tts_voice: str, 15 | tts_backend: str, 16 | tts_audio_format: str, 17 | language: str = 'en'): 18 | self.__stt_model = stt_model 19 | self.__stt_response_format = stt_response_format 20 | self.__language = language 21 | self.__tts_model = tts_model 22 | self.__tts_voice = tts_voice 23 | self.__tts_backend = tts_backend 24 | self.__tts_audio_format = tts_audio_format 25 | # Initialize async clients 26 | self.__tts_client = AsyncOpenAI(api_key=tts_api_key.get_secret_value(), base_url=tts_base_url) 27 | self.__stt_client = AsyncOpenAI(api_key=stt_api_key.get_secret_value(), base_url=stt_base_url) 28 | 29 | async def speech_to_text(self, audio_file: tuple) -> str: 30 | """Asynchronous version of speech-to-text conversion""" 31 | response = await self.__stt_client.audio.transcriptions.create( 32 | model=self.__stt_model, 33 | file=audio_file, 34 | language=self.__language, 35 | response_format=self.__stt_response_format, 36 | ) 37 | return response.text 38 | 39 | async def text_to_speech_stream(self, text: str): 40 | """Process a text chunk and yield audio data sequentially. 41 | 42 | This function creates a streaming TTS response and yields each audio chunk 43 | as it becomes available, allowing for real-time audio playback. 44 | """ 45 | try: 46 | async with self.__tts_client.audio.speech.with_streaming_response.create( 47 | model=self.__tts_model, 48 | voice=self.__tts_voice, 49 | input=text, 50 | response_format=self.__tts_audio_format, 51 | extra_body={"backend": self.__tts_backend, "language": self.__language}, 52 | ) as stream_audio: 53 | # Iterate through all audio chunks in the stream 54 | print("\nProcessing audio chunk...") 55 | async for audio_chunk in stream_audio.iter_bytes(chunk_size=1024): 56 | print(".", end="") 57 | audio_array = np.frombuffer(audio_chunk, dtype=np.int16).reshape(1, -1) 58 | yield (24000, audio_array) 59 | #audio_array = np.frombuffer(audio_chunk, dtype=np.uint8).reshape(1, -1) 60 | #yield (48000, audio_array) 61 | print() 62 | except Exception as e: 63 | print(f"Error in TTS processing: {e}") 64 | 65 | 66 | --------------------------------------------------------------------------------
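
The following is a minimal sketch (not part of the repository) showing how `SpeechClient.text_to_speech_stream` can be exercised on its own. It reuses the same `.env`-driven settings as `app.py`, assumes a reachable TTS endpoint with `TTS_AUDIO_FORMAT="pcm"`, and collects the streamed 24 kHz, 16-bit mono chunks into a WAV file; the output file name and test sentence are arbitrary.

```python
# Minimal sketch: drive SpeechClient's TTS stream directly and save the result.
# Assumes the .env file used by app.py is present and the TTS endpoint is reachable.
import asyncio
import wave

import numpy as np

from settings import get_settings
from speech import SpeechClient


async def main():
    settings = get_settings()
    client = SpeechClient(
        stt_base_url=settings.STT_BASE_URL,
        stt_api_key=settings.STT_API_KEY,
        stt_model=settings.STT_MODEL,
        stt_response_format=settings.STT_RESPONSE_FORMAT,
        tts_base_url=settings.TTS_BASE_URL,
        tts_api_key=settings.TTS_API_KEY,
        tts_model=settings.TTS_MODEL,
        tts_voice=settings.TTS_VOICE,
        tts_backend=settings.TTS_BACKEND,
        tts_audio_format=settings.TTS_AUDIO_FORMAT,
        language=settings.LANGUAGE,
    )

    sample_rate = 24000  # the generator yields (24000, int16 array) tuples
    chunks = []
    async for rate, audio in client.text_to_speech_stream("Hello from Talk To AI."):
        sample_rate = rate
        chunks.append(audio.flatten())

    if not chunks:
        raise RuntimeError("No audio was returned; check the TTS settings in .env")

    # Write the collected PCM frames as a mono 16-bit WAV file.
    with wave.open("tts-test.wav", "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)  # int16
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(np.concatenate(chunks).tobytes())


if __name__ == "__main__":
    asyncio.run(main())
```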