├── .gitattributes ├── requirements.txt ├── requirementstest.txt ├── LICENSE ├── environment_backup.yml ├── run_voice_changer.bat ├── README.md └── kokoro_api.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nodeblackbox/Kokoro-Voice-Api/HEAD/requirements.txt -------------------------------------------------------------------------------- /requirementstest.txt: -------------------------------------------------------------------------------- 1 | # Core Dependencies 2 | flask>=2.3.0,<3.0.0 3 | flask-cors>=4.0.0,<5.0.0 4 | torch>=2.0.0,<3.0.0 5 | torchaudio>=2.0.0,<3.0.0 6 | 7 | # Kokoro TTS Model (install from source or pip if available) 8 | # kokoro>=0.1.0 # Uncomment if available via pip 9 | 10 | # Audio Processing 11 | librosa>=0.10.0,<1.0.0 12 | numpy>=1.24.0,<2.0.0 13 | scipy>=1.10.0,<2.0.0 14 | soundfile>=0.12.0,<1.0.0 15 | pydub>=0.25.0,<1.0.0 16 | resampy>=0.4.0,<1.0.0 17 | 18 | # Text Processing (Enhanced Features) 19 | num2words>=0.5.12,<1.0.0 20 | markdown>=3.4.0,<4.0.0 21 | unidecode>=1.3.0,<2.0.0 22 | regex>=2023.0.0,<2024.0.0 23 | 24 | # Web Server & API 25 | gunicorn>=21.0.0,<22.0.0 26 | requests>=2.31.0,<3.0.0 27 | 28 | # Utilities 29 | pyyaml>=6.0.0,<7.0.0 30 | 31 | # Optional Dependencies for Enhanced Features 32 | # Uncomment these for additional functionality: 33 | # markdown[extra]>=3.4.0,<4.0.0 # For advanced markdown processing 34 | # tinytag>=1.10.0,<2.0.0 # For audio file metadata 35 | # mutagen>=1.47.0,<2.0.0 # Alternative audio metadata library 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 NASA 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /environment_backup.yml: -------------------------------------------------------------------------------- 1 | name: randnameko3 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - bzip2=1.0.8=h2466b09_7 7 | - ca-certificates=2025.4.26=h4c7d964_0 8 | - libexpat=2.7.0=he0c23c2_0 9 | - libffi=3.4.6=h537db12_1 10 | - liblzma=5.8.1=h2466b09_2 11 | - libsqlite=3.50.1=h67fdade_0 12 | - libzlib=1.3.1=h2466b09_2 13 | - openssl=3.5.0=ha4e3fda_1 14 | - pip=25.1.1=pyh8b19718_0 15 | - python=3.12.11=h3f84c4b_0_cpython 16 | - setuptools=80.9.0=pyhff2d567_0 17 | - tk=8.6.13=h2c6b04d_2 18 | - tzdata=2025b=h78e105d_0 19 | - ucrt=10.0.22621.0=h57928b3_1 20 | - vc=14.3=h2b53caa_26 21 | - vc14_runtime=14.42.34438=hfd919c2_26 22 | - wheel=0.45.1=pyhd8ed1ab_1 23 | - pip: 24 | - certifi==2025.4.26 25 | - cffi==1.17.1 26 | - charset-normalizer==3.4.2 27 | - colorama==0.4.6 28 | - distro==1.9.0 29 | - en-core-web-sm==3.8.0 30 | - idna==3.10 31 | - numpy==2.2.6 32 | - packaging==25.0 33 | - platformdirs==4.3.8 34 | - pycparser==2.22 35 | - pydantic==2.11.5 36 | - pydantic-core==2.33.2 37 | - pysocks==1.7.1 38 | - spacy==3.8.7 39 | - tqdm==4.67.1 40 | - urllib3==2.4.0 41 | prefix: C:\Users\nasan\.conda\envs\randnameko3 42 | -------------------------------------------------------------------------------- /run_voice_changer.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | echo Starting Voice Changer API... 3 | echo. 4 | 5 | REM Check if conda is installed 6 | if not exist "C:\ProgramData\miniconda3\condabin\conda.bat" ( 7 | echo ERROR: Miniconda not found at C:\ProgramData\miniconda3 8 | echo Please install Miniconda or update the path 9 | pause 10 | exit /b 1 11 | ) 12 | 13 | REM Activate the specific environment 14 | call C:\ProgramData\miniconda3\condabin\conda.bat activate randnameko3 15 | if %ERRORLEVEL% neq 0 ( 16 | echo ERROR: Failed to activate environment randnameko3 17 | pause 18 | exit /b 1 19 | ) 20 | 21 | REM Check if PyTorch is installed and CUDA availability 22 | echo Checking PyTorch and CUDA... 23 | python -c "import torch; print('PyTorch version:', torch.__version__); print('CUDA available:', torch.cuda.is_available())" 2>nul 24 | if %ERRORLEVEL% neq 0 ( 25 | echo ERROR: PyTorch not found or import failed. Install with: pip install torch 26 | pause 27 | exit /b 1 28 | ) 29 | 30 | REM Run the voice changer API 31 | echo Starting voicechangerapiV8.py... 32 | echo. 33 | if not exist "voicechangerapiV8.py" ( 34 | echo ERROR: voicechangerapiV8.py not found in current directory 35 | pause 36 | exit /b 1 37 | ) 38 | python voicechangerapiV8.py 39 | if %ERRORLEVEL% neq 0 ( 40 | echo ERROR: voicechangerapiV8.py failed to run 41 | pause 42 | exit /b 1 43 | ) 44 | 45 | REM Keep the window open if the script exits 46 | echo. 47 | echo Script ended. Press any key to close this window... 
48 | pause >nul 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🎙️ Complete Kokoro TTS API 2 | 3 | [![Python](https://img.shields.io/badge/python-3.8+-blue.svg)](https://python.org) 4 | [![Flask](https://img.shields.io/badge/flask-2.3+-green.svg)](https://flask.palletsprojects.com) 5 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 6 | [![PyTorch](https://img.shields.io/badge/PyTorch-2.0+-red.svg)](https://pytorch.org) 7 | 8 | > **A lightning-fast, production-grade text-to-speech server with OpenAI-style quality, robust text processing, and accessibility-first design. Processing time: ~1 second with nearly instant output.** 9 | 10 | *Created by [nodeblackbox](https://github.com/nodeblackbox) - Making accessibility available for everyone.* 11 | 12 | ## ✨ Overview 13 | 14 | The Complete Kokoro TTS API delivers **OpenAI-style text-to-speech quality** with exceptional performance and accessibility features. Designed with a commitment that **accessibility should be for everyone**, this API provides crystal-clear voices especially suitable for dyslexic users and assistive technology integration. 15 | 16 | **🚀 Performance Highlights:** 17 | - **~1 second total processing time** 18 | - **Nearly instant audio output** 19 | - **GPU acceleration available** 20 | - **Real-time streaming capabilities** 21 | 22 | ## 🎯 Accessibility & Integration 23 | 24 | ### 🔗 Read Aloud Chrome Extension Integration 25 | 26 | This API seamlessly integrates with the **[Read Aloud](https://chromewebstore.google.com/detail/read-aloud-a-text-to-spee/hdhinadidafjejdhmfkjgnolgimiaplp)** Chrome extension, providing an excellent solution for dyslexic users and anyone who benefits from text-to-speech technology. 27 | 28 | **Setup Instructions:** 29 | 30 | 1. **Install the Extension**: Add [Read Aloud](https://chromewebstore.google.com/detail/read-aloud-a-text-to-spee/hdhinadidafjejdhmfkjgnolgimiaplp) to Chrome 31 | 2. **Configure API Endpoint**: `http://127.0.0.1:5000/v1` 32 | 3. **API Key**: `your-secret-key` 33 | 4. 
**Select from 28 High-Quality Voices** (see voice configuration below) 34 | 35 | ### 🎤 Available Voices for Read Aloud 36 | 37 | ```json 38 | [ 39 | { "lang": "en-US", "model": "tts-1", "voice": "af_heart" }, 40 | { "lang": "en-US", "model": "tts-1", "voice": "af_bella" }, 41 | { "lang": "en-US", "model": "tts-1", "voice": "af_nicole" }, 42 | { "lang": "en-US", "model": "tts-1", "voice": "af_aoede" }, 43 | { "lang": "en-US", "model": "tts-1", "voice": "af_kore" }, 44 | { "lang": "en-US", "model": "tts-1", "voice": "af_sarah" }, 45 | { "lang": "en-US", "model": "tts-1", "voice": "af_nova" }, 46 | { "lang": "en-US", "model": "tts-1", "voice": "af_sky" }, 47 | { "lang": "en-US", "model": "tts-1", "voice": "af_alloy" }, 48 | { "lang": "en-US", "model": "tts-1", "voice": "af_jessica" }, 49 | { "lang": "en-US", "model": "tts-1", "voice": "af_river" }, 50 | { "lang": "en-US", "model": "tts-1", "voice": "am_michael" }, 51 | { "lang": "en-US", "model": "tts-1", "voice": "am_fenrir" }, 52 | { "lang": "en-US", "model": "tts-1", "voice": "am_puck" }, 53 | { "lang": "en-US", "model": "tts-1", "voice": "am_echo" }, 54 | { "lang": "en-US", "model": "tts-1", "voice": "am_eric" }, 55 | { "lang": "en-US", "model": "tts-1", "voice": "am_liam" }, 56 | { "lang": "en-US", "model": "tts-1", "voice": "am_onyx" }, 57 | { "lang": "en-US", "model": "tts-1", "voice": "am_santa" }, 58 | { "lang": "en-US", "model": "tts-1", "voice": "am_adam" }, 59 | { "lang": "en-GB", "model": "tts-1", "voice": "bf_emma" }, 60 | { "lang": "en-GB", "model": "tts-1", "voice": "bf_isabella" }, 61 | { "lang": "en-GB", "model": "tts-1", "voice": "bf_alice" }, 62 | { "lang": "en-GB", "model": "tts-1", "voice": "bf_lily" }, 63 | { "lang": "en-GB", "model": "tts-1", "voice": "bm_george" }, 64 | { "lang": "en-GB", "model": "tts-1", "voice": "bm_fable" }, 65 | { "lang": "en-GB", "model": "tts-1", "voice": "bm_lewis" }, 66 | { "lang": "en-GB", "model": "tts-1", "voice": "bm_daniel" } 67 | ] 68 | ``` 69 | 70 | ## 🚀 Features 71 | 72 | ### Core Features 73 | - **🔧 Robust Text Processing**: Intelligent handling of markdown, Unicode characters, numbers, abbreviations, and special formatting 74 | - **⚡ Ultra-Fast Performance**: ~1 second total processing with nearly instant output 75 | - **🎚️ Zero-Default Effects**: Clean audio output with effects only when explicitly configured 76 | - **🎵 Local Playback Control**: Built-in audio playback with interrupt capability and session management 77 | - **📡 Real-time Streaming**: Live audio streaming support for compatible clients 78 | - **🎼 Advanced Audio Effects**: FIXED and robust pitch shifting with librosa compatibility 79 | - **🌐 Browser Integration**: Full CORS support for browser extensions and web applications 80 | - **♿ Accessibility First**: Crystal-clear voices optimized for dyslexic users and assistive technology 81 | 82 | ### Audio Processing 83 | - **28 High-quality voices** (20 US English, 8 British English) 84 | - **OpenAI-style TTS quality** with superior clarity 85 | - Various audio format outputs 86 | - Pitch shifting and formant modification 87 | - Dynamic range compression 88 | - Professional-grade audio processing 89 | 90 | ### Developer Experience 91 | - RESTful API design 92 | - OpenAPI specification 93 | - Comprehensive error handling 94 | - Session-based playback management 95 | - Easy integration with existing applications 96 | - Chrome extension compatibility 97 | 98 | ## 📋 Table of Contents 99 | 100 | - [🚀 Features](#-features) 101 | - [⚡ Quick Start](#-quick-start) 102 | - [📦 
Installation](#-installation) 103 | - [🔧 Configuration](#-configuration) 104 | - [📡 API Endpoints](#-api-endpoints) 105 | - [💡 Usage Examples](#-usage-examples) 106 | - [🎨 Audio Effects](#-audio-effects) 107 | - [📚 API Documentation](#-api-documentation) 108 | - [🤝 Contributing](#-contributing) 109 | - [📄 License](#-license) 110 | 111 | ## ⚡ Quick Start 112 | 113 | ```bash 114 | # Clone the repository 115 | git clone https://github.com/your-username/Kokoro-Voice-Api.git 116 | cd Kokoro-Voice-Api 117 | 118 | # Install dependencies 119 | pip install -r requirements.txt 120 | 121 | # Run the server 122 | python kokoro_api.py 123 | 124 | # Test the API 125 | curl -X POST \ 126 | -H "Content-Type: application/json" \ 127 | -d '{"input": "Hello, world!", "voice": "af_heart"}' \ 128 | http://localhost:5000/v1/audio/speech 129 | ``` 130 | 131 | ## 📦 Installation 132 | 133 | ### Prerequisites 134 | - **Python 3.8+** (recommended: Python 3.10+) 135 | - **PyTorch** with CUDA support (optional, for GPU acceleration) 136 | - **Git** for cloning the repository 137 | 138 | ### Step-by-Step Installation 139 | 140 | 1. **Clone the Repository** 141 | ```bash 142 | git clone https://github.com/your-username/Kokoro-Voice-Api.git 143 | cd Kokoro-Voice-Api 144 | ``` 145 | 146 | 2. **Create Virtual Environment** (Recommended) 147 | ```bash 148 | python -m venv venv 149 | 150 | # On Windows 151 | venv\Scripts\activate 152 | 153 | # On macOS/Linux 154 | source venv/bin/activate 155 | ``` 156 | 157 | 3. **Install Dependencies** 158 | ```bash 159 | pip install -r requirements.txt 160 | ``` 161 | 162 | 4. **Verify Installation** 163 | ```bash 164 | python -c "import torch, librosa, flask; print('✅ All dependencies installed successfully!')" 165 | ``` 166 | 167 | ## 🔧 Configuration 168 | 169 | ### Environment Variables 170 | ```bash 171 | # Server Configuration 172 | export TTS_HOST=0.0.0.0 173 | export TTS_PORT=5000 174 | export TTS_DEBUG=false 175 | 176 | # Audio Configuration 177 | export TTS_SAMPLE_RATE=22050 178 | export TTS_AUDIO_FORMAT=wav 179 | 180 | # Performance 181 | export TTS_MAX_TEXT_LENGTH=1000 182 | export TTS_CACHE_SIZE=100 183 | ``` 184 | 185 | ### Configuration File 186 | Create a `config.yaml` file in the project root: 187 | ```yaml 188 | server: 189 | host: "0.0.0.0" 190 | port: 5000 191 | debug: false 192 | 193 | audio: 194 | sample_rate: 22050 195 | format: "wav" 196 | quality: "high" 197 | 198 | processing: 199 | max_text_length: 1000 200 | cache_enabled: true 201 | cache_size: 100 202 | ``` 203 | 204 | ## 📡 API Endpoints 205 | 206 | ### Speech Generation 207 | 208 | | Endpoint | Method | Description | 209 | |----------|--------|-------------| 210 | | `/v1/audio/speech` | POST | Standard speech generation with clean zero-default effects | 211 | | `/v1/audio/speech/robust` | POST | Enhanced speech generation with advanced text processing | 212 | | `/v1/audio/speech/stream` | POST | Real-time streaming speech generation | 213 | 214 | ### Playback Control 215 | 216 | | Endpoint | Method | Description | 217 | |----------|--------|-------------| 218 | | `/v1/audio/speech/play` | POST | Local playback with session control | 219 | | `/v1/audio/speech/stop` | POST | Stop/interrupt current playback | 220 | | `/v1/audio/speech/status` | GET | Get current playback status | 221 | 222 | ### System Information 223 | 224 | | Endpoint | Method | Description | 225 | |----------|--------|-------------| 226 | | `/health` | GET | Health check endpoint | 227 | | `/voices` | GET | List available voices | 228 | | 
`/version` | GET | API version information | 229 | 230 | ## 💡 Usage Examples 231 | 232 | ### Basic Text-to-Speech 233 | ```bash 234 | curl -X POST \ 235 | -H "Content-Type: application/json" \ 236 | -d '{ 237 | "input": "Hello, world! This is a test of the Kokoro TTS API.", 238 | "voice": "af_heart" 239 | }' \ 240 | http://localhost:5000/v1/audio/speech \ 241 | --output hello.wav 242 | ``` 243 | 244 | ### Squeaky Voice Effect 245 | ```bash 246 | curl -X POST \ 247 | -H "Content-Type: application/json" \ 248 | -d '{ 249 | "input": "I sound like a chipmunk!", 250 | "voice": "af_heart", 251 | "effects": { 252 | "pitch": { 253 | "semitone_shift": 8.0 254 | } 255 | } 256 | }' \ 257 | http://localhost:5000/v1/audio/speech \ 258 | --output squeaky.wav 259 | ``` 260 | 261 | ### Deep Voice Effect 262 | ```bash 263 | curl -X POST \ 264 | -H "Content-Type: application/json" \ 265 | -d '{ 266 | "input": "I have a very deep voice now.", 267 | "voice": "af_heart", 268 | "effects": { 269 | "pitch": { 270 | "semitone_shift": -6.0 271 | } 272 | } 273 | }' \ 274 | http://localhost:5000/v1/audio/speech \ 275 | --output deep.wav 276 | ``` 277 | 278 | ### Robust Text Processing 279 | ```bash 280 | curl -X POST \ 281 | -H "Content-Type: application/json" \ 282 | -d '{ 283 | "input": "Process this: **bold text**, _italic_, numbers: 123, $50.99, and 50% off!", 284 | "voice": "af_heart", 285 | "robust_processing": true 286 | }' \ 287 | http://localhost:5000/v1/audio/speech/robust \ 288 | --output processed.wav 289 | ``` 290 | 291 | ### Streaming Audio 292 | ```bash 293 | curl -X POST \ 294 | -H "Content-Type: application/json" \ 295 | -d '{ 296 | "input": "This will be streamed in real-time as it is generated.", 297 | "voice": "af_heart", 298 | "stream": true 299 | }' \ 300 | http://localhost:5000/v1/audio/speech/stream \ 301 | --output stream.wav 302 | ``` 303 | 304 | ### Local Playback Control 305 | ```bash 306 | # Start playback 307 | curl -X POST \ 308 | -H "Content-Type: application/json" \ 309 | -d '{ 310 | "input": "This will play locally on the server.", 311 | "voice": "af_heart", 312 | "session_id": "my-session" 313 | }' \ 314 | http://localhost:5000/v1/audio/speech/play 315 | 316 | # Check status 317 | curl http://localhost:5000/v1/audio/speech/status 318 | 319 | # Stop playback 320 | curl -X POST \ 321 | -H "Content-Type: application/json" \ 322 | -d '{"session_id": "my-session"}' \ 323 | http://localhost:5000/v1/audio/speech/stop 324 | ``` 325 | 326 | ## 🎨 Audio Effects 327 | 328 | ### Pitch Modification 329 | ```json 330 | { 331 | "effects": { 332 | "pitch": { 333 | "semitone_shift": 4.0, // Shift by semitones (-12 to +12) 334 | "preserve_formants": true // Maintain voice character 335 | } 336 | } 337 | } 338 | ``` 339 | 340 | ### Dynamic Range Compression 341 | ```json 342 | { 343 | "effects": { 344 | "compression": { 345 | "ratio": 4.0, // Compression ratio 346 | "threshold": -20.0, // Threshold in dB 347 | "attack": 0.003, // Attack time in seconds 348 | "release": 0.1 // Release time in seconds 349 | } 350 | } 351 | } 352 | ``` 353 | 354 | ### Multiple Effects 355 | ```json 356 | { 357 | "effects": { 358 | "pitch": { 359 | "semitone_shift": 2.0 360 | }, 361 | "compression": { 362 | "ratio": 2.0, 363 | "threshold": -18.0 364 | }, 365 | "reverb": { 366 | "room_size": 0.3, 367 | "damping": 0.5, 368 | "wet_level": 0.2 369 | } 370 | } 371 | } 372 | ``` 373 | 374 | ## 🛠️ Development 375 | 376 | ### Running in Development Mode 377 | ```bash 378 | # Enable debug mode 379 | export FLASK_ENV=development 380 
| export TTS_DEBUG=true 381 | 382 | # Run with auto-reload 383 | python kokoro_api.py 384 | ``` 385 | 386 | ### Running Tests 387 | ```bash 388 | # Install test dependencies 389 | pip install pytest pytest-cov 390 | 391 | # Run tests 392 | pytest tests/ -v --cov=kokoro_api 393 | ``` 394 | 395 | ### Docker Support 396 | ```bash 397 | # Build Docker image 398 | docker build -t kokoro-tts-api . 399 | 400 | # Run container 401 | docker run -p 5000:5000 kokoro-tts-api 402 | ``` 403 | 404 | ## 📚 API Documentation 405 | 406 | ### OpenAPI Specification 407 | The complete API documentation is available in OpenAPI format: 408 | - **Specification File**: [`openapi.yaml`](openapi.yaml) 409 | - **Interactive Documentation**: Visit `/docs` when the server is running 410 | - **Redoc Documentation**: Visit `/redoc` when the server is running 411 | 412 | ### Response Formats 413 | All endpoints return standardized responses: 414 | 415 | **Success Response:** 416 | ```json 417 | { 418 | "success": true, 419 | "data": { 420 | "audio_url": "/generated/audio.wav", 421 | "duration": 2.5, 422 | "sample_rate": 22050 423 | }, 424 | "metadata": { 425 | "voice": "af_heart", 426 | "effects_applied": ["pitch_shift"], 427 | "processing_time": 0.85 428 | } 429 | } 430 | ``` 431 | 432 | **Error Response:** 433 | ```json 434 | { 435 | "success": false, 436 | "error": { 437 | "code": "INVALID_VOICE", 438 | "message": "The specified voice 'invalid_voice' is not available", 439 | "details": { 440 | "available_voices": ["af_heart", "af_bella", "af_sarah"] 441 | } 442 | } 443 | } 444 | ``` 445 | 446 | ## 🤝 Contributing 447 | 448 | We welcome contributions! Please follow these steps: 449 | 450 | 1. **Fork the Repository** on GitHub, then clone your fork 451 | ```bash 452 | git clone https://github.com/your-username/Kokoro-Voice-Api.git 453 | ``` 454 | 455 | 2. **Create a Feature Branch** 456 | ```bash 457 | git checkout -b feature/amazing-new-feature 458 | ``` 459 | 460 | 3. **Make Your Changes** 461 | - Follow PEP 8 style guidelines 462 | - Add tests for new functionality 463 | - Update documentation as needed 464 | 465 | 4. **Run Tests** 466 | ```bash 467 | pytest tests/ -v 468 | black kokoro_api.py 469 | flake8 kokoro_api.py 470 | ``` 471 | 472 | 5. 
**Submit a Pull Request** 473 | - Provide a clear description of your changes 474 | - Reference any related issues 475 | - Ensure all tests pass 476 | 477 | ### Development Guidelines 478 | - **Code Style**: Follow PEP 8 and use `black` for formatting 479 | - **Testing**: Maintain >90% test coverage 480 | - **Documentation**: Update docstrings and README for new features 481 | - **Performance**: Profile code for optimization opportunities 482 | 483 | ## 🔒 Security 484 | 485 | - **API Keys**: Never hardcode API keys; use environment variables 486 | - **Input Validation**: All inputs are sanitized and validated 487 | - **Rate Limiting**: Built-in rate limiting to prevent abuse 488 | - **CORS**: Configurable CORS settings for web integration 489 | 490 | ## 📊 Performance 491 | 492 | ### Benchmarks 493 | - **Average Response Time**: ~1 second for 50-word text 494 | - **Output Latency**: Nearly instant audio delivery 495 | - **Concurrent Requests**: Supports up to 10 simultaneous requests 496 | - **Memory Usage**: ~200MB baseline + ~50MB per active session 497 | - **GPU Acceleration**: 3x faster processing with CUDA-enabled PyTorch 498 | - **Voice Quality**: OpenAI-comparable clarity and naturalness 499 | 500 | ### Optimization Tips 501 | - Use GPU acceleration when available for fastest processing 502 | - Enable caching for repeated requests 503 | - Batch multiple requests when possible 504 | - Use streaming for long-form content 505 | - Perfect for real-time applications and accessibility tools 506 | 507 | ## 🌟 Accessibility Statement 508 | 509 | **We believe accessibility should be for everyone.** This API is specifically designed with dyslexic users and assistive technology in mind, providing: 510 | 511 | - **Crystal-clear voice quality** optimized for comprehension 512 | - **Multiple accent options** (US and British English) 513 | - **Fast processing** for responsive user experience 514 | - **Browser extension compatibility** for seamless web integration 515 | - **Professional-grade audio** without distortion or artifacts 516 | 517 | ### Perfect for: 518 | - 📚 **Dyslexic students and professionals** 519 | - 👩‍🦯 **Users with visual impairments** 520 | - 🧠 **People with learning differences** 521 | - 👥 **Anyone who benefits from audio content** 522 | - 🌐 **Web accessibility implementations** 523 | 524 | ## 🙏 Acknowledgments 525 | 526 | - **[nodeblackbox](https://github.com/nodeblackbox)** - Project creator and maintainer 527 | - **Kokoro TTS Team** for the underlying neural TTS technology 528 | - **PyTorch Team** for the deep learning framework 529 | - **Librosa Contributors** for audio processing capabilities 530 | - **Flask Community** for the web framework 531 | - **Accessibility advocates** who inspire inclusive technology 532 | 533 | --- 534 | 535 |
536 | 537 | **[⬆ Back to Top](#-complete-kokoro-tts-api)** 538 | 539 | Made with ❤️ for accessibility by [nodeblackbox](https://github.com/nodeblackbox) 540 | 541 | *"Accessibility should be for everyone"* 542 | 543 |
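
### 🐍 Python Client Example

The curl examples above translate directly to Python. The snippet below is a minimal sketch using the `requests` package (already listed in `requirements.txt`); it assumes the server started with `python kokoro_api.py` is listening on the default `http://localhost:5000`, and the output filename is arbitrary.

```python
import requests

# Request speech for a short sentence using one of the built-in voices.
payload = {
    "input": "Hello from the Python client!",
    "voice": "af_heart",
}

response = requests.post(
    "http://localhost:5000/v1/audio/speech",
    json=payload,
    timeout=60,
)
response.raise_for_status()

# The endpoint returns the audio bytes in the response body (as in the
# curl examples that use --output); write them straight to a WAV file.
with open("hello.wav", "wb") as f:
    f.write(response.content)

print(f"Saved {len(response.content)} bytes to hello.wav")
```

The same pattern applies to the `/v1/audio/speech/robust` and `/v1/audio/speech/stream` endpoints; only the URL and the payload fields change.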
544 | -------------------------------------------------------------------------------- /kokoro_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from kokoro import KModel, KPipeline 4 | import scipy.io.wavfile as wavfile 5 | from pydub import AudioSegment 6 | import tempfile 7 | import logging 8 | import io 9 | import html 10 | import unicodedata 11 | from typing import Dict, Optional, Tuple, List, Any, Union 12 | from functools import wraps 13 | import numpy as np 14 | import librosa 15 | import librosa.display 16 | from scipy import signal 17 | import math 18 | import re 19 | from datetime import datetime 20 | from dataclasses import dataclass 21 | from enum import Enum 22 | import hashlib 23 | import threading 24 | from concurrent.futures import ThreadPoolExecutor 25 | import time 26 | 27 | # Flask imports 28 | from flask import Flask, request, Response, jsonify, stream_with_context 29 | from flask_cors import CORS 30 | 31 | # Additional imports for enhanced text processing 32 | try: 33 | import num2words 34 | HAS_NUM2WORDS = True 35 | except ImportError: 36 | HAS_NUM2WORDS = False 37 | logging.warning("num2words not available - using basic number processing") 38 | 39 | try: 40 | import markdown 41 | from markdown.extensions import codehilite, fenced_code, tables 42 | HAS_MARKDOWN = True 43 | except ImportError: 44 | HAS_MARKDOWN = False 45 | logging.warning("markdown not available - using regex-based processing") 46 | 47 | # --- Configuration ---------------------------------------------------------- 48 | 49 | KOKORO_VOICES = { 50 | 'af_heart': {'lang': 'en-US', 'gender': 'female', 'description': 'Heart ❤️'}, 51 | 'af_bella': {'lang': 'en-US', 'gender': 'female', 'description': 'Bella 🔥'}, 52 | 'af_nicole': {'lang': 'en-US', 'gender': 'female', 'description': 'Nicole 🎧'}, 53 | 'af_aoede': {'lang': 'en-US', 'gender': 'female', 'description': 'Aoede'}, 54 | 'af_kore': {'lang': 'en-US', 'gender': 'female', 'description': 'Kore'}, 55 | 'af_sarah': {'lang': 'en-US', 'gender': 'female', 'description': 'Sarah'}, 56 | 'af_nova': {'lang': 'en-US', 'gender': 'female', 'description': 'Nova'}, 57 | 'af_sky': {'lang': 'en-US', 'gender': 'female', 'description': 'Sky'}, 58 | 'af_alloy': {'lang': 'en-US', 'gender': 'female', 'description': 'Alloy'}, 59 | 'af_jessica': {'lang': 'en-US', 'gender': 'female', 'description': 'Jessica'}, 60 | 'af_river': {'lang': 'en-US', 'gender': 'female', 'description': 'River'}, 61 | 'am_michael': {'lang': 'en-US', 'gender': 'male', 'description': 'Michael'}, 62 | 'am_fenrir': {'lang': 'en-US', 'gender': 'male', 'description': 'Fenrir'}, 63 | 'am_puck': {'lang': 'en-US', 'gender': 'male', 'description': 'Puck'}, 64 | 'am_echo': {'lang': 'en-US', 'gender': 'male', 'description': 'Echo'}, 65 | 'am_eric': {'lang': 'en-US', 'gender': 'male', 'description': 'Eric'}, 66 | 'am_liam': {'lang': 'en-US', 'gender': 'male', 'description': 'Liam'}, 67 | 'am_onyx': {'lang': 'en-US', 'gender': 'male', 'description': 'Onyx'}, 68 | 'am_santa': {'lang': 'en-US', 'gender': 'male', 'description': 'Santa'}, 69 | 'am_adam': {'lang': 'en-US', 'gender': 'male', 'description': 'Adam'}, 70 | 'bf_emma': {'lang': 'en-GB', 'gender': 'female', 'description': 'Emma'}, 71 | 'bf_isabella': {'lang': 'en-GB', 'gender': 'female', 'description': 'Isabella'}, 72 | 'bf_alice': {'lang': 'en-GB', 'gender': 'female', 'description': 'Alice'}, 73 | 'bf_lily': {'lang': 'en-GB', 'gender': 'female', 'description': 'Lily'}, 74 | 'bm_george': 
{'lang': 'en-GB', 'gender': 'male', 'description': 'George'}, 75 | 'bm_fable': {'lang': 'en-GB', 'gender': 'male', 'description': 'Fable'}, 76 | 'bm_lewis': {'lang': 'en-GB', 'gender': 'male', 'description': 'Lewis'}, 77 | 'bm_daniel': {'lang': 'en-GB', 'gender': 'male', 'description': 'Daniel'}, 78 | } 79 | 80 | DEFAULT_EFFECT_ORDER = [ 81 | 'volume', 'equalizer', 'compression', 'distortion', 'pitch', 82 | 'formant', 'voice_character', 'reverb' 83 | ] 84 | 85 | DEFAULT_VOICE = 'af_heart' 86 | SAMPLE_RATE = 24000 87 | API_PORT = 5000 88 | API_HOST = '0.0.0.0' 89 | 90 | ENABLE_CORS = os.getenv('ENABLE_CORS', 'true').lower() == 'true' 91 | LOG_AUTH_ATTEMPTS = True 92 | 93 | # Enhanced configuration 94 | MAX_TEXT_LENGTH = 10000 95 | DEFAULT_CHUNK_SIZE = 400 96 | MIN_CHUNK_SIZE = 50 97 | MAX_CHUNKS = 50 98 | CACHE_SIZE = 1000 99 | 100 | # --- Setup logging ---------------------------------------------------------- 101 | 102 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 103 | logger = logging.getLogger(__name__) 104 | 105 | # --- GPU setup -------------------------------------------------------------- 106 | 107 | CUDA_AVAILABLE = torch.cuda.is_available() 108 | GPU_DEVICE_COUNT = torch.cuda.device_count() if CUDA_AVAILABLE else 0 109 | GPU_DEVICE_NAME = torch.cuda.get_device_name(0) if CUDA_AVAILABLE else "N/A" 110 | 111 | def get_gpu_memory_info(): 112 | """Get GPU memory information if CUDA is available.""" 113 | if not CUDA_AVAILABLE: 114 | return {"total": 0, "allocated": 0, "free": 0} 115 | 116 | try: 117 | total_memory = torch.cuda.get_device_properties(0).total_memory 118 | allocated_memory = torch.cuda.memory_allocated(0) 119 | free_memory = total_memory - allocated_memory 120 | 121 | return { 122 | "total": total_memory / (1024**3), # Convert to GB 123 | "allocated": allocated_memory / (1024**3), 124 | "free": free_memory / (1024**3) 125 | } 126 | except Exception as e: 127 | logger.warning(f"Failed to get GPU memory info: {e}") 128 | return {"total": 0, "allocated": 0, "free": 0} 129 | 130 | # Log detailed GPU information 131 | logger.info(f"CUDA Available: {CUDA_AVAILABLE}") 132 | if CUDA_AVAILABLE: 133 | logger.info(f"GPU Device Count: {GPU_DEVICE_COUNT}") 134 | logger.info(f"GPU Device Name: {GPU_DEVICE_NAME}") 135 | memory_info = get_gpu_memory_info() 136 | logger.info(f"GPU Memory - Total: {memory_info['total']:.2f} GB, Available: {memory_info['free']:.2f} GB") 137 | else: 138 | logger.warning("CUDA not available - will use CPU for inference (slower)") 139 | 140 | # Function to log current GPU usage 141 | def log_gpu_status(context: str = ""): 142 | """Log current GPU usage status.""" 143 | if CUDA_AVAILABLE: 144 | memory_info = get_gpu_memory_info() 145 | logger.info(f"GPU Status {context}: {memory_info['allocated']:.2f} GB used / {memory_info['total']:.2f} GB total") 146 | else: 147 | logger.info(f"GPU Status {context}: Using CPU (CUDA not available)") 148 | 149 | # --- Audio Playback Control ------------------------------------------------- 150 | 151 | class AudioPlaybackController: 152 | """Global controller for managing audio playback state and interruptions.""" 153 | 154 | def __init__(self): 155 | self.is_playing = False 156 | self.should_stop = False 157 | self.current_session_id = None 158 | self.playback_start_time = None 159 | self.interrupted_at = None 160 | self.total_duration = None 161 | self.lock = threading.Lock() 162 | 163 | def start_playback(self, session_id: str, duration: float): 164 | """Start a new playback 
session.""" 165 | with self.lock: 166 | self.is_playing = True 167 | self.should_stop = False 168 | self.current_session_id = session_id 169 | self.playback_start_time = time.time() 170 | self.interrupted_at = None 171 | self.total_duration = duration 172 | logger.info(f"Started playback session: {session_id}, duration: {duration:.2f}s") 173 | 174 | def stop_playback(self, session_id: str = None): 175 | """Stop the current playback session.""" 176 | with self.lock: 177 | if session_id and session_id != self.current_session_id: 178 | return False # Wrong session 179 | 180 | if self.is_playing: 181 | current_time = time.time() 182 | self.interrupted_at = current_time - self.playback_start_time if self.playback_start_time else 0 183 | logger.info(f"Stopped playback session: {self.current_session_id}, interrupted at: {self.interrupted_at:.2f}s") 184 | 185 | self.is_playing = False 186 | self.should_stop = True 187 | return True 188 | 189 | def finish_playback(self): 190 | """Mark playback as naturally finished.""" 191 | with self.lock: 192 | self.is_playing = False 193 | self.should_stop = False 194 | self.interrupted_at = None 195 | logger.info(f"Finished playback session: {self.current_session_id}") 196 | 197 | def get_status(self): 198 | """Get current playback status.""" 199 | with self.lock: 200 | current_time = time.time() 201 | elapsed = current_time - self.playback_start_time if self.playback_start_time else 0 202 | 203 | return { 204 | "is_playing": self.is_playing, 205 | "session_id": self.current_session_id, 206 | "elapsed_time": elapsed, 207 | "total_duration": self.total_duration, 208 | "interrupted_at": self.interrupted_at, 209 | "should_stop": self.should_stop 210 | } 211 | 212 | # Global playback controller 213 | playback_controller = AudioPlaybackController() 214 | 215 | # --- Enhanced Text Processing Classes --------------------------------------- 216 | 217 | @dataclass 218 | class TextChunk: 219 | """Represents a processed text chunk with metadata.""" 220 | text: str 221 | original_text: str 222 | chunk_id: int 223 | total_chunks: int 224 | processing_time: float = 0.0 225 | char_count: int = 0 226 | 227 | def __post_init__(self): 228 | self.char_count = len(self.text) 229 | 230 | class TextProcessingMode(Enum): 231 | """Text processing modes for different input types.""" 232 | PLAIN = "plain" 233 | MARKDOWN = "markdown" 234 | HTML = "html" 235 | SSML = "ssml" 236 | 237 | class ProductionTextProcessor: 238 | """Production-grade text processor with comprehensive normalization.""" 239 | 240 | def __init__(self): 241 | # Enhanced character replacements (comprehensive Unicode mapping) 242 | self.char_replacements = { 243 | # Smart quotes and apostrophes 244 | '"': '"', '"': '"', ''': "'", ''': "'", 245 | '‚': ',', '„': '"', '‹': '<', '›': '>', 246 | '«': '"', '»': '"', 247 | 248 | # Dashes and hyphens 249 | '–': '-', '—': '-', '―': '-', '‒': '-', 250 | 251 | # Mathematical and special symbols 252 | '×': ' times ', '÷': ' divided by ', '±': ' plus or minus ', 253 | '≤': ' less than or equal to ', '≥': ' greater than or equal to ', 254 | '≠': ' not equal to ', '≈': ' approximately ', 255 | '∞': ' infinity ', '√': ' square root of ', 256 | 257 | # Currency symbols 258 | '€': ' euros ', '£': ' pounds ', '¥': ' yen ', 259 | '₹': ' rupees ', '₽': ' rubles ', '₩': ' won ', 260 | 261 | # Other symbols 262 | '©': ' copyright ', '®': ' registered ', '™': ' trademark ', 263 | '§': ' section ', '¶': ' paragraph ', '†': ' dagger ', 264 | '‡': ' double dagger ', '•': ' bullet ', '‰': ' per 
mille ', 265 | '…': '...', '⋯': '...', '⋮': '...', 266 | 267 | # Fractions 268 | '½': ' one half ', '⅓': ' one third ', '⅔': ' two thirds ', 269 | '¼': ' one quarter ', '¾': ' three quarters ', '⅕': ' one fifth ', 270 | '⅖': ' two fifths ', '⅗': ' three fifths ', '⅘': ' four fifths ', 271 | '⅙': ' one sixth ', '⅚': ' five sixths ', '⅛': ' one eighth ', 272 | '⅜': ' three eighths ', '⅝': ' five eighths ', '⅞': ' seven eighths ', 273 | } 274 | 275 | # Context-aware abbreviations with disambiguation 276 | self.abbreviations = { 277 | # Titles 278 | 'Dr.': {'default': 'Doctor', 'context': {'street': 'Drive'}}, 279 | 'Mr.': {'default': 'Mister'}, 280 | 'Mrs.': {'default': 'Missus'}, 281 | 'Ms.': {'default': 'Miss'}, 282 | 'Prof.': {'default': 'Professor'}, 283 | 284 | # Places and directions 285 | 'St.': {'default': 'Saint', 'context': {'address': 'Street'}}, 286 | 'Ave.': {'default': 'Avenue'}, 287 | 'Blvd.': {'default': 'Boulevard'}, 288 | 'Rd.': {'default': 'Road'}, 289 | 'Ln.': {'default': 'Lane'}, 290 | 'Ct.': {'default': 'Court'}, 291 | 'Pl.': {'default': 'Place'}, 292 | 'Sq.': {'default': 'Square'}, 293 | 'N.': {'default': 'North', 'context': {'name': 'N'}}, 294 | 'S.': {'default': 'South', 'context': {'name': 'S'}}, 295 | 'E.': {'default': 'East', 'context': {'name': 'E'}}, 296 | 'W.': {'default': 'West', 'context': {'name': 'W'}}, 297 | 298 | # Common abbreviations 299 | 'etc.': {'default': 'etcetera'}, 300 | 'vs.': {'default': 'versus'}, 301 | 'e.g.': {'default': 'for example'}, 302 | 'i.e.': {'default': 'that is'}, 303 | 'cf.': {'default': 'compare'}, 304 | 'et al.': {'default': 'and others'}, 305 | 'ibid.': {'default': 'in the same place'}, 306 | 'op. cit.': {'default': 'in the work cited'}, 307 | 308 | # Business 309 | 'Inc.': {'default': 'Incorporated'}, 310 | 'Corp.': {'default': 'Corporation'}, 311 | 'Ltd.': {'default': 'Limited'}, 312 | 'Co.': {'default': 'Company'}, 313 | 'LLC': {'default': 'Limited Liability Company'}, 314 | 'LLP': {'default': 'Limited Liability Partnership'}, 315 | 316 | # Time and dates 317 | 'Jan.': {'default': 'January'}, 318 | 'Feb.': {'default': 'February'}, 319 | 'Mar.': {'default': 'March'}, 320 | 'Apr.': {'default': 'April'}, 321 | 'Jun.': {'default': 'June'}, 322 | 'Jul.': {'default': 'July'}, 323 | 'Aug.': {'default': 'August'}, 324 | 'Sep.': {'default': 'September'}, 325 | 'Sept.': {'default': 'September'}, 326 | 'Oct.': {'default': 'October'}, 327 | 'Nov.': {'default': 'November'}, 328 | 'Dec.': {'default': 'December'}, 329 | 330 | 'Mon.': {'default': 'Monday'}, 331 | 'Tue.': {'default': 'Tuesday'}, 332 | 'Wed.': {'default': 'Wednesday'}, 333 | 'Thu.': {'default': 'Thursday'}, 334 | 'Fri.': {'default': 'Friday'}, 335 | 'Sat.': {'default': 'Saturday'}, 336 | 'Sun.': {'default': 'Sunday'}, 337 | 338 | 'AM': {'default': 'A M'}, 339 | 'PM': {'default': 'P M'}, 340 | 'a.m.': {'default': 'A M'}, 341 | 'p.m.': {'default': 'P M'}, 342 | } 343 | 344 | # Enhanced markdown patterns 345 | self.markdown_patterns = [ 346 | # Code blocks (must come first) 347 | (r'\`\`\`[\s\S]*?\`\`\`', ' [code block] '), 348 | (r'`([^`]+)`', r'\1'), 349 | 350 | # Headers 351 | (r'^#{1,6}\s+(.+)$', r'\1', re.MULTILINE), 352 | 353 | # Links and images 354 | (r'!\[([^\]]*)\]$$[^)]+$$', r'\1'), # Images - use alt text 355 | (r'\[([^\]]+)\]$$[^)]+$$', r'\1'), # Links - use link text 356 | 357 | # Emphasis 358 | (r'\*\*\*(.+?)\*\*\*', r'\1'), # Bold italic 359 | (r'\*\*(.+?)\*\*', r'\1'), # Bold 360 | (r'\*(.+?)\*', r'\1'), # Italic 361 | (r'__(.+?)__', r'\1'), # Bold alt 362 | 
(r'_(.+?)_', r'\1'), # Italic alt 363 | (r'~~(.+?)~~', r'\1'), # Strikethrough 364 | 365 | # Lists 366 | (r'^\s*[-*+]\s+(.+)$', r'\1', re.MULTILINE), # Unordered lists 367 | (r'^\s*\d+\.\s+(.+)$', r'\1', re.MULTILINE), # Ordered lists 368 | 369 | # Blockquotes 370 | (r'^\s*>\s*(.+)$', r'\1', re.MULTILINE), 371 | 372 | # Horizontal rules 373 | (r'^[-*_]{3,}$', '', re.MULTILINE), 374 | 375 | # Tables (remove pipe separators) 376 | (r'\|', ' '), 377 | ] 378 | 379 | # Number processing patterns 380 | self.number_patterns = [ 381 | # Currency with amounts 382 | (r'(\$|USD|€|EUR|£|GBP|¥|JPY|₹|INR)\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)', self._expand_currency), 383 | 384 | # Percentages 385 | (r'(\d+(?:\.\d+)?)\s*%', r'\1 percent'), 386 | 387 | # Temperatures 388 | (r'(\d+(?:\.\d+)?)\s*°([CF])', self._expand_temperature), 389 | 390 | # Measurements 391 | (r'(\d+(?:\.\d+)?)\s*(km|m|cm|mm|ft|in|mi|kg|g|lb|oz)', self._expand_measurement), 392 | 393 | # Years (4 digits) 394 | (r'\b(19|20)\d{2}\b', self._expand_year), 395 | 396 | # Large numbers with commas 397 | (r'\b(\d{1,3}(?:,\d{3})+)\b', self._expand_large_number), 398 | 399 | # Decimals 400 | (r'\b(\d+)\.(\d+)\b', self._expand_decimal), 401 | 402 | # Ordinals 403 | (r'\b(\d+)(st|nd|rd|th)\b', self._expand_ordinal), 404 | 405 | # Phone numbers (basic pattern) 406 | (r'\b(\d{3})-(\d{3})-(\d{4})\b', r'\1 \2 \3'), 407 | 408 | # Time 409 | (r'\b(\d{1,2}):(\d{2})\s*(AM|PM|am|pm)?\b', self._expand_time), 410 | ] 411 | 412 | # Sentence boundary patterns for smart chunking 413 | self.sentence_boundaries = re.compile( 414 | r'(?<=[.!?])\s+(?=[A-Z])|' # Period/exclamation/question + space + capital 415 | r'(?<=[.!?])\s*\n\s*(?=[A-Z])|' # Same with newline 416 | r'(?<=\.)\s+(?=["\'"]?[A-Z])' # Period + space + optional quote + capital 417 | ) 418 | 419 | # Compile regex patterns for performance 420 | self._compile_patterns() 421 | 422 | # Cache for processed text 423 | self._cache = {} 424 | self._cache_lock = threading.Lock() 425 | 426 | def _compile_patterns(self): 427 | """Compile regex patterns for better performance.""" 428 | self.compiled_markdown = [] 429 | for pattern in self.markdown_patterns: 430 | if len(pattern) == 3: 431 | self.compiled_markdown.append((re.compile(pattern[0], pattern[2]), pattern[1])) 432 | else: 433 | self.compiled_markdown.append((re.compile(pattern[0]), pattern[1])) 434 | 435 | def _expand_currency(self, match): 436 | """Expand currency amounts.""" 437 | symbol = match.group(1) 438 | amount = match.group(2) 439 | 440 | # Currency symbol mapping 441 | currency_map = { 442 | '$': 'dollars', 'USD': 'dollars', 443 | '€': 'euros', 'EUR': 'euros', 444 | '£': 'pounds', 'GBP': 'pounds', 445 | '¥': 'yen', 'JPY': 'yen', 446 | '₹': 'rupees', 'INR': 'rupees' 447 | } 448 | 449 | currency_name = currency_map.get(symbol, 'units') 450 | 451 | if HAS_NUM2WORDS: 452 | try: 453 | # Remove commas and convert to float 454 | amount_float = float(amount.replace(',', '')) 455 | if amount_float == int(amount_float): 456 | # Whole number 457 | amount_words = num2words.num2words(int(amount_float)) 458 | else: 459 | # Has decimal places 460 | dollars = int(amount_float) 461 | cents = int((amount_float - dollars) * 100) 462 | amount_words = f"{num2words.num2words(dollars)} {currency_name}" 463 | if cents > 0: 464 | amount_words += f" and {num2words.num2words(cents)} cents" 465 | return amount_words 466 | return f"{amount_words} {currency_name}" 467 | except: 468 | pass 469 | 470 | return f"{amount} {currency_name}" 471 | 472 | def _expand_temperature(self, 
match): 473 | """Expand temperature readings.""" 474 | temp = match.group(1) 475 | scale = match.group(2) 476 | scale_name = 'Celsius' if scale.upper() == 'C' else 'Fahrenheit' 477 | return f"{temp} degrees {scale_name}" 478 | 479 | def _expand_measurement(self, match): 480 | """Expand measurements.""" 481 | value = match.group(1) 482 | unit = match.group(2) 483 | 484 | unit_map = { 485 | 'km': 'kilometers', 'm': 'meters', 'cm': 'centimeters', 'mm': 'millimeters', 486 | 'ft': 'feet', 'in': 'inches', 'mi': 'miles', 487 | 'kg': 'kilograms', 'g': 'grams', 'lb': 'pounds', 'oz': 'ounces' 488 | } 489 | 490 | unit_name = unit_map.get(unit, unit) 491 | return f"{value} {unit_name}" 492 | 493 | def _expand_year(self, match): 494 | """Expand years for better pronunciation.""" 495 | year = match.group(0) 496 | if HAS_NUM2WORDS: 497 | try: 498 | return num2words.num2words(int(year)) 499 | except: 500 | pass 501 | return year 502 | 503 | def _expand_large_number(self, match): 504 | """Expand large numbers with commas.""" 505 | number = match.group(1).replace(',', '') 506 | if HAS_NUM2WORDS: 507 | try: 508 | return num2words.num2words(int(number)) 509 | except: 510 | pass 511 | return number 512 | 513 | def _expand_decimal(self, match): 514 | """Expand decimal numbers.""" 515 | whole = match.group(1) 516 | decimal = match.group(2) 517 | 518 | if HAS_NUM2WORDS: 519 | try: 520 | whole_words = num2words.num2words(int(whole)) 521 | decimal_words = ' '.join([num2words.num2words(int(d)) for d in decimal]) 522 | return f"{whole_words} point {decimal_words}" 523 | except: 524 | pass 525 | 526 | return f"{whole} point {' '.join(decimal)}" 527 | 528 | def _expand_ordinal(self, match): 529 | """Expand ordinal numbers.""" 530 | number = match.group(1) 531 | suffix = match.group(2) 532 | 533 | if HAS_NUM2WORDS: 534 | try: 535 | return num2words.num2words(int(number), ordinal=True) 536 | except: 537 | pass 538 | 539 | return f"{number}{suffix}" 540 | 541 | def _expand_time(self, match): 542 | """Expand time expressions.""" 543 | hour = int(match.group(1)) 544 | minute = match.group(2) 545 | period = match.group(3) if match.group(3) else "" 546 | 547 | if HAS_NUM2WORDS: 548 | try: 549 | hour_words = num2words.num2words(hour) 550 | if minute == "00": 551 | time_words = f"{hour_words} o'clock" 552 | else: 553 | minute_words = num2words.num2words(int(minute)) 554 | time_words = f"{hour_words} {minute_words}" 555 | 556 | if period: 557 | time_words += f" {period.upper()}" 558 | 559 | return time_words 560 | except: 561 | pass 562 | 563 | return f"{hour} {minute} {period}".strip() 564 | 565 | def normalize_unicode(self, text: str) -> str: 566 | """Normalize Unicode characters while preserving important accents.""" 567 | # First, unescape HTML entities 568 | text = html.unescape(text) 569 | 570 | # Replace special characters 571 | for char, replacement in self.char_replacements.items(): 572 | text = text.replace(char, replacement) 573 | 574 | # Normalize Unicode but preserve accented characters in names 575 | # This is a balance between ASCII conversion and preserving pronunciation 576 | normalized = unicodedata.normalize('NFC', text) 577 | 578 | return normalized 579 | 580 | def clean_markdown(self, text: str) -> str: 581 | """Clean markdown formatting using proper parsing when available.""" 582 | if HAS_MARKDOWN: 583 | try: 584 | # Convert markdown to HTML, then extract text 585 | md = markdown.Markdown(extensions=['fenced_code', 'tables', 'codehilite']) 586 | html_content = md.convert(text) 587 | 588 | # Simple HTML tag 
removal (more robust than regex for basic cases) 589 | import re 590 | clean_text = re.sub(r'<[^>]+>', '', html_content) 591 | clean_text = html.unescape(clean_text) 592 | return clean_text 593 | except Exception as e: 594 | logger.warning(f"Markdown parsing failed, falling back to regex: {e}") 595 | 596 | # Fallback to regex-based cleaning 597 | cleaned = text 598 | for pattern, replacement in self.compiled_markdown: 599 | cleaned = pattern.sub(replacement, cleaned) 600 | 601 | return cleaned 602 | 603 | def expand_abbreviations(self, text: str) -> str: 604 | """Expand abbreviations with context awareness.""" 605 | result = text 606 | 607 | for abbrev, expansion_data in self.abbreviations.items(): 608 | if isinstance(expansion_data, dict): 609 | default_expansion = expansion_data['default'] 610 | # For now, use default expansion 611 | # TODO: Implement context detection for better disambiguation 612 | expansion = default_expansion 613 | else: 614 | expansion = expansion_data 615 | 616 | # Use word boundaries to avoid partial matches 617 | pattern = r'\b' + re.escape(abbrev) + r'\b' 618 | result = re.sub(pattern, expansion, result, flags=re.IGNORECASE) 619 | 620 | return result 621 | 622 | def process_numbers(self, text: str) -> str: 623 | """Process numbers, currency, and measurements.""" 624 | result = text 625 | 626 | for pattern, replacement in self.number_patterns: 627 | if callable(replacement): 628 | result = re.sub(pattern, replacement, result) 629 | else: 630 | result = re.sub(pattern, replacement, result) 631 | 632 | return result 633 | 634 | def normalize_punctuation(self, text: str) -> str: 635 | """Normalize punctuation for better TTS processing.""" 636 | # Multiple punctuation marks 637 | text = re.sub(r'\.{2,}', '...', text) # Multiple dots to ellipsis 638 | text = re.sub(r'[!]{2,}', '!', text) # Multiple exclamations 639 | text = re.sub(r'[?]{2,}', '?', text) # Multiple questions 640 | 641 | # Normalize spacing around punctuation 642 | text = re.sub(r'\s*([,.!?;:])\s*', r'\1 ', text) 643 | 644 | # Multiple spaces to single space 645 | text = re.sub(r'\s+', ' ', text) 646 | 647 | # Clean up extra whitespace 648 | text = text.strip() 649 | 650 | return text 651 | 652 | def preserve_case_markers(self, text: str) -> Tuple[str, List[Tuple[int, int, str]]]: 653 | """Identify and preserve important case information.""" 654 | case_preservations = [] 655 | 656 | # Find acronyms (2+ consecutive uppercase letters) 657 | for match in re.finditer(r'\b[A-Z]{2,}\b', text): 658 | case_preservations.append((match.start(), match.end(), match.group())) 659 | 660 | # Find proper nouns at sentence beginnings 661 | for match in re.finditer(r'(?:^|[.!?]\s+)([A-Z][a-z]+)', text): 662 | start = match.start(1) 663 | end = match.end(1) 664 | case_preservations.append((start, end, match.group(1))) 665 | 666 | return text, case_preservations 667 | 668 | def smart_chunk_text(self, text: str, max_length: int = DEFAULT_CHUNK_SIZE) -> List[str]: 669 | """Intelligently chunk text at sentence boundaries.""" 670 | if len(text) <= max_length: 671 | return [text] 672 | 673 | # First, try to split at sentence boundaries 674 | sentences = self.sentence_boundaries.split(text) 675 | if not sentences: 676 | sentences = [text] 677 | 678 | chunks = [] 679 | current_chunk = "" 680 | 681 | for sentence in sentences: 682 | sentence = sentence.strip() 683 | if not sentence: 684 | continue 685 | 686 | # If adding this sentence would exceed max_length 687 | if len(current_chunk + sentence) > max_length: 688 | if 
current_chunk: 689 | chunks.append(current_chunk.strip()) 690 | current_chunk = sentence + " " 691 | else: 692 | # Single sentence is too long, split at commas or other punctuation 693 | sub_parts = re.split(r'(?<=[,;:])\s+', sentence) 694 | for part in sub_parts: 695 | if len(current_chunk + part) > max_length: 696 | if current_chunk: 697 | chunks.append(current_chunk.strip()) 698 | current_chunk = part + " " 699 | else: 700 | current_chunk += part + " " 701 | else: 702 | current_chunk += sentence + " " 703 | 704 | if current_chunk.strip(): 705 | chunks.append(current_chunk.strip()) 706 | 707 | # Filter out chunks that are too short 708 | valid_chunks = [chunk for chunk in chunks if len(chunk.strip()) >= MIN_CHUNK_SIZE] 709 | 710 | return valid_chunks[:MAX_CHUNKS] # Limit total chunks 711 | 712 | def detect_input_mode(self, text: str) -> TextProcessingMode: 713 | """Detect the input text format.""" 714 | # Check for SSML 715 | if '<speak' in text or '<prosody' in text or '<voice' in text: 716 | return TextProcessingMode.SSML 717 | 718 | # Check for HTML 719 | if '<html' in text or '<body' in text or '<div
' in text or '<p>
' in text: 720 | return TextProcessingMode.HTML 721 | 722 | # Check for Markdown 723 | markdown_indicators = [ 724 | r'^#{1,6}\s', # Headers 725 | r'\*\*.*?\*\*', # Bold 726 | r'\[.*?\]$$.*?$$', # Links 727 | r'\`\`\`', # Code blocks 728 | r'^\s*[-*+]\s', # Lists 729 | r'^\s*\d+\.\s', # Numbered lists 730 | ] 731 | 732 | for pattern in markdown_indicators: 733 | if re.search(pattern, text, re.MULTILINE): 734 | return TextProcessingMode.MARKDOWN 735 | 736 | return TextProcessingMode.PLAIN 737 | 738 | def process_text(self, text: str, mode: Optional[TextProcessingMode] = None, 739 | max_chunk_length: int = DEFAULT_CHUNK_SIZE) -> List[TextChunk]: 740 | """Main text processing pipeline.""" 741 | start_time = time.time() 742 | 743 | if not text or not text.strip(): 744 | return [] 745 | 746 | # Check cache first 747 | cache_key = hashlib.md5(f"{text}_{max_chunk_length}".encode()).hexdigest() 748 | with self._cache_lock: 749 | if cache_key in self._cache: 750 | logger.info("Using cached text processing result") 751 | return self._cache[cache_key] 752 | 753 | original_text = text 754 | logger.info(f"Processing text: {text[:100]}...") 755 | 756 | # Auto-detect mode if not provided 757 | if mode is None: 758 | mode = self.detect_input_mode(text) 759 | 760 | logger.info(f"Detected input mode: {mode.value}") 761 | 762 | # Step 1: Normalize Unicode and HTML entities 763 | processed = self.normalize_unicode(text) 764 | 765 | # Step 2: Handle different input formats 766 | if mode == TextProcessingMode.MARKDOWN: 767 | processed = self.clean_markdown(processed) 768 | elif mode == TextProcessingMode.HTML: 769 | # Basic HTML cleaning 770 | processed = re.sub(r'<[^>]+>', '', processed) 771 | processed = html.unescape(processed) 772 | elif mode == TextProcessingMode.SSML: 773 | # For SSML, we might want to preserve some tags 774 | # For now, just clean basic HTML-like tags 775 | processed = re.sub(r'<(?!speak|voice|prosody|break|emphasis)[^>]+>', '', processed) 776 | 777 | # Step 3: Preserve case information 778 | processed, case_info = self.preserve_case_markers(processed) 779 | 780 | # Step 4: Expand abbreviations 781 | processed = self.expand_abbreviations(processed) 782 | 783 | # Step 5: Process numbers and special formats 784 | processed = self.process_numbers(processed) 785 | 786 | # Step 6: Normalize punctuation 787 | processed = self.normalize_punctuation(processed) 788 | 789 | # Step 7: Smart chunking 790 | text_chunks = self.smart_chunk_text(processed, max_chunk_length) 791 | 792 | # Step 8: Create TextChunk objects 793 | chunks = [] 794 | processing_time = time.time() - start_time 795 | 796 | for i, chunk_text in enumerate(text_chunks): 797 | if len(chunk_text.strip()) >= 2: # Minimum viable chunk size 798 | chunk = TextChunk( 799 | text=chunk_text.strip(), 800 | original_text=original_text, 801 | chunk_id=i, 802 | total_chunks=len(text_chunks), 803 | processing_time=processing_time / len(text_chunks) 804 | ) 805 | chunks.append(chunk) 806 | 807 | # Cache the result 808 | with self._cache_lock: 809 | if len(self._cache) >= CACHE_SIZE: 810 | # Simple cache eviction - remove oldest entries 811 | oldest_keys = list(self._cache.keys())[:CACHE_SIZE // 2] 812 | for key in oldest_keys: 813 | del self._cache[key] 814 | self._cache[cache_key] = chunks 815 | 816 | logger.info(f"Text processed into {len(chunks)} chunks in {processing_time:.2f}s") 817 | return chunks 818 | 819 | # --- Note to Frequency Conversion -------------------------------------------- 820 | 821 | NOTE_PATTERN = 
re.compile(r"^([A-G])([#b]?)(\d)$") 822 | NOTE_INDEX = { 823 | "C": 0, "C#": 1, "Db": 1, "D": 2, "D#": 3, "Eb": 3, "E": 4, 824 | "F": 5, "F#": 6, "Gb": 6, "G": 7, "G#": 8, "Ab": 8, "A": 9, 825 | "A#": 10, "Bb": 10, "B": 11, 826 | } 827 | 828 | def note_to_freq(note: str) -> float: 829 | """Convert scientific pitch (e.g., A4) → frequency in Hz.""" 830 | m = NOTE_PATTERN.match(note) 831 | if not m: 832 | raise ValueError(f"Invalid note: {note}") 833 | letter, accidental, octave = m.groups() 834 | key = letter + accidental 835 | semitone = NOTE_INDEX[key] 836 | octave = int(octave) 837 | midi = semitone + 12 * (octave + 1) 838 | return 440.0 * 2 ** ((midi - 69) / 12) 839 | 840 | def available_notes() -> List[str]: 841 | """Generate a list of available musical notes.""" 842 | names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"] 843 | return ["None"] + [f"{n}{o}" for o in range(2, 7) for n in names] 844 | 845 | # --- Audio Processing Functions with ZERO DEFAULTS ------------------------- 846 | 847 | def estimate_pitch(audio: np.ndarray, sr: int) -> Optional[float]: 848 | """Estimate the fundamental frequency of audio.""" 849 | try: 850 | f0 = librosa.yin(y=audio, fmin=65, fmax=1047, sr=sr) 851 | median_f0 = np.nanmedian(f0) 852 | return float(median_f0) if np.any(~np.isnan(f0)) and not np.isnan(median_f0) else None 853 | except Exception as e: 854 | logger.warning(f"Pitch estimation failed: {e}") 855 | return None 856 | 857 | def apply_volume(audio: np.ndarray, gain: float = 1.0, gain_db: Optional[float] = None) -> np.ndarray: 858 | """Apply volume adjustment with linear or dB gain.""" 859 | if gain_db is not None: 860 | gain = 10 ** (gain_db / 20.0) 861 | gain = np.clip(gain, 0.0, 2.0) 862 | if abs(gain - 1.0) < 0.01: 863 | return audio 864 | logger.info(f"Applying volume gain: {gain:.2f} (linear), {20 * math.log10(max(gain, 1e-10)):.2f}dB") 865 | try: 866 | result = audio * gain 867 | 868 | return result 869 | except Exception as e: 870 | logger.error(f"Volume adjustment failed: {e}") 871 | return audio 872 | 873 | def shift_to_target(audio: np.ndarray, sr: int, target_note: Optional[str], semitone_shift: float, preserve_formants: bool = False) -> np.ndarray: 874 | """ 875 | FIXED: Shift audio to target note or by semitones with robust librosa compatibility and true formant preservation. 876 | """ 877 | actual_semitone_shift = semitone_shift 878 | 879 | # Handle target note conversion 880 | if target_note and target_note.lower() not in ["none", "", "null"]: 881 | current_pitch = estimate_pitch(audio, sr) 882 | if current_pitch is None: 883 | logger.warning("Could not estimate current pitch. 
Using semitone_shift if provided.") 884 | else: 885 | try: 886 | target_freq = note_to_freq(target_note) 887 | # Calculate semitone shift needed to get from current pitch to target frequency 888 | actual_semitone_shift = 12 * math.log2(target_freq / current_pitch) 889 | logger.info(f"Pitch shift: Current={current_pitch:.2f}Hz, Target={target_note}({target_freq:.2f}Hz), Semitones={actual_semitone_shift:.2f}") 890 | except (ValueError, ZeroDivisionError) as e: 891 | logger.warning(f"Invalid target note or pitch estimation '{target_note}': {e}") 892 | actual_semitone_shift = semitone_shift 893 | 894 | # Skip if no significant shift 895 | if abs(actual_semitone_shift) < 0.01: 896 | return audio 897 | 898 | logger.info(f"Applying pitch shift of {actual_semitone_shift:.2f} semitones, preserve_formants={preserve_formants}") 899 | 900 | try: 901 | # Step 1: Apply pitch shift using librosa 902 | import inspect 903 | pitch_shift_params = inspect.signature(librosa.effects.pitch_shift).parameters 904 | 905 | if 'res_type' in pitch_shift_params: 906 | shifted_audio = librosa.effects.pitch_shift( 907 | y=audio, 908 | sr=sr, 909 | n_steps=actual_semitone_shift, 910 | res_type='kaiser_best' 911 | ) 912 | else: 913 | shifted_audio = librosa.effects.pitch_shift( 914 | y=audio, 915 | sr=sr, 916 | n_steps=actual_semitone_shift 917 | ) 918 | 919 | # Step 2: If preserving formants, apply a corrective formant shift 920 | if preserve_formants: 921 | logger.info("Applying corrective formant shift to preserve voice character.") 922 | # The pitch shift moved the formants. We need to shift them back. 923 | # The ratio of the pitch shift is 2^(semitones/12). 924 | # To reverse the formant shift, we need to shift by the inverse ratio. 925 | formant_correction_ratio = 2 ** (-actual_semitone_shift / 12.0) 926 | 927 | # Our apply_formant_shift function takes a factor where 1.0 is no change. 928 | # We can directly use this ratio. 
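# Illustrative example (numbers not from the original source): a +12 semitone shift
# doubles the pitch, so the corrective formant ratio is 2 ** (-12 / 12.0) = 0.5,
# which scales the spectral envelope back toward its original position.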
929 | final_audio = apply_formant_shift(shifted_audio, sr, shift_factor=formant_correction_ratio) 930 | return final_audio 931 | else: 932 | return shifted_audio 933 | 934 | except Exception as e: 935 | logger.error(f"Pitch shift failed: {e}") 936 | return audio 937 | 938 | def apply_formant_shift(audio: np.ndarray, sr: int, shift_factor: float, scale: float = 1.0) -> np.ndarray: 939 | """Apply formant shifting with intensity scaling.""" 940 | if abs(shift_factor - 1.0) < 0.01: 941 | return audio 942 | shift_factor = np.clip(shift_factor, 0.5, 1.5) 943 | scale = np.clip(scale, 0.5, 2.0) 944 | effective_shift = 1.0 + (shift_factor - 1.0) * scale 945 | logger.info(f"Applying formant shift: factor={shift_factor:.2f}, scale={scale:.2f}, effective={effective_shift:.2f}") 946 | try: 947 | audio_float32 = audio.astype(np.float32) 948 | stft_result = librosa.stft(audio_float32) 949 | magnitude = np.abs(stft_result) 950 | phase = np.angle(stft_result) 951 | shifted_magnitude = np.zeros_like(magnitude) 952 | n_freq_bins = magnitude.shape[0] 953 | for i in range(magnitude.shape[1]): 954 | freq_profile = magnitude[:, i] 955 | source_freq_coords = np.arange(n_freq_bins, dtype=float) / effective_shift 956 | shifted_profile_frame = np.interp( 957 | source_freq_coords, 958 | np.arange(n_freq_bins, dtype=float), 959 | freq_profile, 960 | left=freq_profile[0] if len(freq_profile) > 0 else 0.0, 961 | right=freq_profile[-1] if len(freq_profile) > 0 else 0.0 962 | ) 963 | shifted_magnitude[:, i] = shifted_profile_frame 964 | audio_stft_shifted = shifted_magnitude * np.exp(1j * phase) 965 | audio_shifted = librosa.istft(audio_stft_shifted, length=len(audio_float32)) 966 | 967 | return audio_shifted.astype(audio.dtype) 968 | except Exception as e: 969 | logger.error(f"Formant shift failed: {e}") 970 | return audio 971 | 972 | def apply_reverb(audio: np.ndarray, sr: int, room_size: float = 0.0, damping: float = 0.5, pre_delay_ms: float = 0.0, stereo_width: float = 0.0) -> np.ndarray: 973 | """Apply reverb with pre-delay and stereo width. 
Only applies if room_size > 0.""" 974 | if room_size <= 0.001: 975 | return audio 976 | room_size = np.clip(room_size, 0.0, 1.0) 977 | damping = np.clip(damping, 0.0, 1.0) 978 | pre_delay_ms = np.clip(pre_delay_ms, 0.0, 100.0) 979 | stereo_width = np.clip(stereo_width, 0.0, 1.0) 980 | logger.info(f"Applying reverb: RoomSize={room_size:.2f}, Damping={damping:.2f}, PreDelay={pre_delay_ms:.1f}ms, StereoWidth={stereo_width:.2f}") 981 | try: 982 | reverb_length_sec = room_size * 1.5 983 | pre_delay_samples = int(sr * pre_delay_ms / 1000) 984 | reverb_length_samples = int(sr * reverb_length_sec) + pre_delay_samples 985 | if reverb_length_samples <= 0: 986 | return audio 987 | time_points = np.arange(reverb_length_samples) / sr 988 | decay_rate = 5.0 + (1.0 - damping) * 15.0 989 | decay_envelope = np.exp(-decay_rate * time_points) 990 | impulse = np.random.randn(reverb_length_samples) * decay_envelope 991 | impulse_energy_sq = np.sum(impulse**2) 992 | if impulse_energy_sq > 1e-12: 993 | impulse = impulse / np.sqrt(impulse_energy_sq) 994 | else: 995 | return audio 996 | if stereo_width > 0: 997 | impulse_l = impulse * np.sqrt(1 - stereo_width / 2) 998 | impulse_r = np.random.randn(reverb_length_samples) * decay_envelope * np.sqrt(stereo_width / 2) 999 | impulse_r_energy = np.sum(impulse_r**2) 1000 | if impulse_r_energy > 1e-12: 1001 | impulse_r = impulse_r / np.sqrt(impulse_r_energy) 1002 | reverb_l = signal.convolve(audio, impulse_l, mode='full')[:len(audio)] 1003 | reverb_r = signal.convolve(audio, impulse_r, mode='full')[:len(audio)] 1004 | reverb_audio = (reverb_l + reverb_r) / 2 1005 | else: 1006 | reverb_audio = signal.convolve(audio, impulse, mode='full')[:len(audio)] 1007 | 1008 | dry_gain = 0.7 1009 | wet_gain = 0.1 + room_size * 0.4 1010 | result = dry_gain * audio + wet_gain * reverb_audio 1011 | 1012 | return result 1013 | except Exception as e: 1014 | logger.error(f"Reverb failed: {e}") 1015 | return audio 1016 | 1017 | def apply_eq(audio: np.ndarray, sr: int, bands: List[Dict[str, Any]]) -> np.ndarray: 1018 | """Apply parametric equalizer with multiple bands. 
Only applies if bands are provided.""" 1019 | if not bands: 1020 | return audio 1021 | logger.info(f"Applying EQ with {len(bands)} bands") 1022 | try: 1023 | processed_audio = audio.copy() 1024 | nyquist = sr / 2.0 1025 | for band in bands: 1026 | freq = np.clip(band.get('frequency_hz', 1000.0), 20.0, nyquist - 1e-5) 1027 | gain_db = np.clip(band.get('gain_db', 0.0), -24.0, 24.0) 1028 | q_factor = np.clip(band.get('q_factor', 1.0), 0.1, 10.0) 1029 | band_type = band.get('type', 'peak').lower() 1030 | if abs(gain_db) < 0.1: 1031 | continue 1032 | logger.info(f"EQ Band: Type={band_type}, Freq={freq:.1f}Hz, Gain={gain_db:.1f}dB, Q={q_factor:.2f}") 1033 | try: 1034 | if band_type == 'peak': 1035 | sos = signal.iirpeak(w0=freq, Q=q_factor, gain_db=gain_db, fs=sr) 1036 | processed_audio = signal.sosfiltfilt(sos, processed_audio) 1037 | elif band_type == 'low_shelf': 1038 | try: 1039 | sos = signal.iirshelf(w0=freq, Q=0.707, gain_db=gain_db, fs=sr, ftype='AB') 1040 | processed_audio = signal.sosfiltfilt(sos, processed_audio) 1041 | except AttributeError: 1042 | logger.info(f"Fallback to Butterworth for low_shelf at {freq}Hz") 1043 | gain_linear = 10 ** (gain_db / 20.0) 1044 | norm_freq = freq / nyquist 1045 | sos = signal.butter(2, norm_freq, btype='lowpass', output='sos') 1046 | low_freq = signal.sosfiltfilt(sos, processed_audio) 1047 | processed_audio = processed_audio + (gain_linear - 1.0) * low_freq 1048 | elif band_type == 'high_shelf': 1049 | try: 1050 | sos = signal.iirshelf(w0=freq, Q=0.707, gain_db=gain_db, fs=sr, ftype='AB') 1051 | processed_audio = signal.sosfiltfilt(sos, processed_audio) 1052 | except AttributeError: 1053 | logger.info(f"Fallback to Butterworth for high_shelf at {freq}Hz") 1054 | gain_linear = 10 ** (gain_db / 20.0) 1055 | norm_freq = freq / nyquist 1056 | sos = signal.butter(2, norm_freq, btype='highpass', output='sos') 1057 | high_freq = signal.sosfiltfilt(sos, processed_audio) 1058 | processed_audio = processed_audio + (gain_linear - 1.0) * high_freq 1059 | else: 1060 | logger.warning(f"Unknown EQ band type: {band_type}") 1061 | except Exception as e: 1062 | logger.warning(f"EQ band failed: {e}") 1063 | 1064 | return processed_audio 1065 | except Exception as e: 1066 | logger.error(f"EQ failed: {e}") 1067 | return audio 1068 | 1069 | def apply_distortion(audio: np.ndarray, drive_db: float = 0.0, dist_type: str = 'tanh', mix: float = 0.0) -> np.ndarray: 1070 | """Apply distortion effect. 
Only applies if drive_db > 0 and mix > 0.""" 1071 | drive_db = np.clip(drive_db, 0.0, 36.0) 1072 | mix = np.clip(mix, 0.0, 1.0) 1073 | dist_type = dist_type.lower() 1074 | if drive_db < 0.1 or mix < 0.001: 1075 | return audio 1076 | logger.info(f"Applying distortion: Type={dist_type}, Drive={drive_db:.1f}dB, Mix={mix:.2f}") 1077 | try: 1078 | drive_gain = 10 ** (drive_db / 20.0) 1079 | distorted = audio * drive_gain 1080 | if dist_type == 'tanh': 1081 | distorted = np.tanh(distorted * 2.0) * 0.8 1082 | elif dist_type == 'soft': 1083 | distorted = np.clip(distorted, -1.0, 1.0) 1084 | elif dist_type == 'hard': 1085 | distorted = np.sign(distorted) * np.minimum(np.abs(distorted), 1.0) 1086 | else: 1087 | logger.warning(f"Unknown distortion type: {dist_type}, using tanh") 1088 | distorted = np.tanh(distorted * 2.0) * 0.8 1089 | 1090 | result = (1.0 - mix) * audio + mix * distorted 1091 | 1092 | return result 1093 | except Exception as e: 1094 | logger.error(f"Distortion failed: {e}") 1095 | return audio 1096 | 1097 | def apply_chorus(audio: np.ndarray, sr: int, delay_ms: float = 0.0, depth: float = 0.0, rate_hz: float = 0.0, mix: float = 0.0) -> np.ndarray: 1098 | """Apply chorus effect. Only applies if depth > 0 and mix > 0.""" 1099 | delay_ms = np.clip(delay_ms, 5.0, 50.0) 1100 | depth = np.clip(depth, 0.0, 0.1) 1101 | rate_hz = np.clip(rate_hz, 0.1, 5.0) 1102 | mix = np.clip(mix, 0.0, 1.0) 1103 | 1104 | # Early return if effect should not be applied 1105 | if depth < 0.001 or mix < 0.001: 1106 | return audio 1107 | 1108 | logger.info(f"Applying chorus: Delay={delay_ms:.1f}ms, Depth={depth:.3f}, Rate={rate_hz:.2f}Hz, Mix={mix:.2f}") 1109 | try: 1110 | delay_samples = int(sr * delay_ms / 1000) 1111 | if delay_samples <= 0: 1112 | return audio 1113 | 1114 | t = np.arange(len(audio)) / sr 1115 | mod = depth * np.sin(2 * np.pi * rate_hz * t) 1116 | indices = np.arange(len(audio)) - delay_samples * (1.0 + mod) 1117 | indices = np.clip(indices, 0, len(audio) - 1) 1118 | chorus_audio = np.interp(np.arange(len(audio)), indices, audio) 1119 | 1120 | 1121 | result = (1.0 - mix) * audio + mix * chorus_audio 1122 | 1123 | return result 1124 | except Exception as e: 1125 | logger.error(f"Chorus failed: {e}") 1126 | return audio 1127 | 1128 | def apply_flanger(audio: np.ndarray, sr: int, delay_ms: float = 0.0, depth: float = 0.0, rate_hz: float = 0.0, feedback: float = 0.0, mix: float = 0.0) -> np.ndarray: 1129 | """Apply flanger effect. 
Only applies if depth > 0 and mix > 0.""" 1130 | delay_ms = np.clip(delay_ms, 0.1, 10.0) 1131 | depth = np.clip(depth, 0.0, 0.05) 1132 | rate_hz = np.clip(rate_hz, 0.1, 10.0) 1133 | feedback = np.clip(feedback, 0.0, 0.9) 1134 | mix = np.clip(mix, 0.0, 1.0) 1135 | 1136 | # Early return if effect should not be applied 1137 | if depth < 0.001 or mix < 0.001: 1138 | return audio 1139 | 1140 | logger.info(f"Applying flanger: Delay={delay_ms:.1f}ms, Depth={depth:.3f}, Rate={rate_hz:.2f}Hz, Feedback={feedback:.2f}, Mix={mix:.2f}") 1141 | try: 1142 | delay_samples = int(sr * delay_ms / 1000) 1143 | if delay_samples <= 0: 1144 | return audio 1145 | 1146 | t = np.arange(len(audio)) / sr 1147 | mod = depth * np.sin(2 * np.pi * rate_hz * t) 1148 | output = audio.copy() 1149 | delay_buffer = np.zeros(len(audio) + delay_samples) 1150 | delay_buffer[:len(audio)] = audio 1151 | 1152 | for i in range(len(audio)): 1153 | delay_time = delay_samples * (1.0 + mod[i]) 1154 | idx = i - delay_time 1155 | if idx >= 0: 1156 | interp_idx = int(idx) 1157 | frac = idx - interp_idx 1158 | if interp_idx + 1 < len(delay_buffer): 1159 | delayed_sample = (1 - frac) * delay_buffer[interp_idx] + frac * delay_buffer[interp_idx + 1] 1160 | output[i] += feedback * delayed_sample 1161 | delay_buffer[i + delay_samples] += feedback * delayed_sample 1162 | 1163 | 1164 | result = (1.0 - mix) * audio + mix * output 1165 | 1166 | return result 1167 | except Exception as e: 1168 | logger.error(f"Flanger failed: {e}") 1169 | return audio 1170 | 1171 | def apply_compression(audio: np.ndarray, sr: int, threshold_db: float = 0.0, ratio: float = 1.0, attack_ms: float = 5.0, release_ms: float = 200.0) -> np.ndarray: 1172 | """Apply dynamic range compression. Only applies if ratio > 1.0 and threshold_db < 0.""" 1173 | threshold_db = np.clip(threshold_db, -60.0, 0.0) 1174 | ratio = np.clip(ratio, 1.0, 20.0) 1175 | attack_ms = np.clip(attack_ms, 0.1, 100.0) 1176 | release_ms = np.clip(release_ms, 10.0, 1000.0) 1177 | 1178 | # Early return if compression should not be applied 1179 | if ratio <= 1.01 or threshold_db >= -0.1: 1180 | return audio 1181 | 1182 | logger.info(f"Applying compression: Threshold={threshold_db:.1f}dB, Ratio={ratio:.1f}, Attack={attack_ms:.1f}ms, Release={release_ms:.1f}ms") 1183 | try: 1184 | threshold = 10 ** (threshold_db / 20.0) 1185 | attack_coeff = np.exp(-1.0 / (sr * attack_ms / 1000)) 1186 | release_coeff = np.exp(-1.0 / (sr * release_ms / 1000)) 1187 | envelope = np.zeros_like(audio) 1188 | gain = np.ones_like(audio) 1189 | 1190 | for i in range(len(audio)): 1191 | envelope[i] = abs(audio[i]) if i == 0 else (1 - attack_coeff) * abs(audio[i]) + attack_coeff * envelope[i - 1] 1192 | if envelope[i] > threshold: 1193 | excess = envelope[i] / threshold 1194 | gain_reduction = threshold * (excess ** (1 / ratio - 1)) 1195 | target_gain = gain_reduction / envelope[i] if envelope[i] > 1e-6 else 1.0 1196 | else: 1197 | target_gain = 1.0 1198 | gain[i] = (1 - release_coeff) * target_gain + release_coeff * (gain[i - 1] if i > 0 else 1.0) 1199 | 1200 | result = audio * gain 1201 | 1202 | return result 1203 | except Exception as e: 1204 | logger.error(f"Compression failed: {e}") 1205 | return audio 1206 | 1207 | def apply_voice_character(audio: np.ndarray, sr: int, character: str, params: Optional[Dict] = None) -> np.ndarray: 1208 | """Apply voice character transformation. 
Only applies if character is not 'none'.""" 1209 | if character == "none" or not character: 1210 | return audio 1211 | params = params or {} 1212 | logger.info(f"Applying voice character: {character} with params {params}") 1213 | try: 1214 | result = audio.copy() 1215 | if character == "child": 1216 | pitch_shift = params.get('pitch_shift', 3.0) 1217 | speed = params.get('speed', 1.1) 1218 | formant_shift = params.get('formant_shift', 1.2) 1219 | result = shift_to_target(result, sr, None, pitch_shift, False) 1220 | result = librosa.effects.time_stretch(y=result, rate=speed) 1221 | result = apply_formant_shift(result, sr, formant_shift) 1222 | elif character == "robot": 1223 | pitch_shift = params.get('pitch_shift', 0.0) 1224 | if abs(pitch_shift) > 0.01: 1225 | result = shift_to_target(result, sr, None, pitch_shift, False) 1226 | t = np.arange(len(result)) / sr 1227 | carrier = np.sin(2 * np.pi * params.get('carrier_freq', 80.0) * t) 1228 | result = result * carrier 1229 | result = np.tanh(result * params.get('distortion_factor', 2.5)) * 0.8 1230 | elif character == "deep": 1231 | pitch_shift = params.get('pitch_shift', -4.0) 1232 | speed = params.get('speed', 0.9) 1233 | formant_shift = params.get('formant_shift', 0.8) 1234 | result = shift_to_target(result, sr, None, pitch_shift, False) 1235 | result = librosa.effects.time_stretch(y=result, rate=speed) 1236 | result = apply_formant_shift(result, sr, formant_shift) 1237 | elif character == "whisper": 1238 | b, a = signal.butter(4, 3000 / (sr / 2), 'low') 1239 | result = signal.filtfilt(b, a, result) 1240 | noise = np.random.normal(0, params.get('noise_level', 0.03), len(result)).astype(result.dtype) 1241 | result = result * params.get('signal_level', 0.6) + noise 1242 | result = np.tanh(result * params.get('compression_factor', 1.2)) * 0.9 1243 | elif character == "alien": 1244 | pitch_shift = params.get('pitch_shift', 2.0) 1245 | result = shift_to_target(result, sr, None, pitch_shift, False) 1246 | result = apply_flanger(result, sr, delay_ms=params.get('flanger_delay_ms', 5.0), depth=0.02, rate_hz=0.3, feedback=0.6, mix=0.6) 1247 | elif character == "monster": 1248 | pitch_shift = params.get('pitch_shift', -6.0) 1249 | formant_shift = params.get('formant_shift', 0.7) 1250 | result = shift_to_target(result, sr, None, pitch_shift, False) 1251 | result = apply_formant_shift(result, sr, formant_shift) 1252 | result = apply_distortion(result, drive_db=params.get('distortion_drive', 12.0), dist_type='tanh', mix=0.7) 1253 | elif character == "echo": 1254 | result = apply_reverb(result, sr, room_size=0.6, damping=0.3, pre_delay_ms=params.get('pre_delay_ms', 50.0), stereo_width=0.8) 1255 | else: 1256 | logger.warning(f"Unknown voice character: {character}") 1257 | return audio 1258 | 1259 | return result 1260 | except Exception as e: 1261 | logger.error(f"Voice character transformation failed: {e}") 1262 | return audio 1263 | 1264 | # --- Enhanced Model Management ----------------------------------------------- 1265 | 1266 | class EnhancedModelManager: 1267 | """Production-grade model manager with robust text processing and caching.""" 1268 | 1269 | def __init__(self): 1270 | logger.info("Initializing EnhancedModelManager...") 1271 | 1272 | # Initialize models with detailed logging 1273 | self.models: Dict[bool, KModel] = {} 1274 | 1275 | # CPU model (always available) 1276 | logger.info("Loading CPU model...") 1277 | self.models[False] = KModel().to('cpu').eval() 1278 | logger.info("✓ CPU model loaded successfully") 1279 | 1280 | # GPU model (if 
available) 1281 | if CUDA_AVAILABLE: 1282 | try: 1283 | logger.info("Loading GPU model...") 1284 | log_gpu_status("before model loading") 1285 | self.models[True] = KModel().to('cuda').eval() 1286 | log_gpu_status("after model loading") 1287 | logger.info("✓ GPU model loaded successfully") 1288 | except Exception as e: 1289 | logger.error(f"Failed to load GPU model: {e}") 1290 | logger.warning("GPU model unavailable - falling back to CPU only") 1291 | else: 1292 | logger.info("GPU model not loaded (CUDA not available)") 1293 | 1294 | self.pipelines: Dict[str, KPipeline] = { 1295 | 'a': KPipeline(lang_code='a'), 1296 | 'b': KPipeline(lang_code='b') 1297 | } 1298 | 1299 | # Enhanced lexicon 1300 | self.pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO' 1301 | self.pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ' 1302 | 1303 | # Initialize text processor 1304 | self.text_processor = ProductionTextProcessor() 1305 | 1306 | # Audio cache for processed chunks 1307 | self.audio_cache = {} 1308 | self.cache_lock = threading.Lock() 1309 | 1310 | # Thread pool for parallel processing 1311 | self.executor = ThreadPoolExecutor(max_workers=4) 1312 | 1313 | logger.info("Pre-loading all voices...") 1314 | for voice_name in KOKORO_VOICES.keys(): 1315 | lang_code = voice_name[0] 1316 | if lang_code in self.pipelines: 1317 | self.pipelines[lang_code].load_voice(voice_name) 1318 | else: 1319 | logger.error(f"Unknown lang code '{lang_code}' for voice '{voice_name}'") 1320 | logger.info("All voices loaded.") 1321 | 1322 | # Log final model status 1323 | available_devices = [] 1324 | if False in self.models: 1325 | available_devices.append("CPU") 1326 | if True in self.models: 1327 | available_devices.append("GPU") 1328 | logger.info(f"Available inference devices: {', '.join(available_devices)}") 1329 | 1330 | def generate_audio_for_chunk(self, chunk: TextChunk, voice_name: str, speed: float = 1.0, use_gpu: bool = True) -> torch.Tensor: 1331 | """Generate audio for a single text chunk.""" 1332 | # Create cache key 1333 | cache_key = hashlib.md5(f"{chunk.text}_{voice_name}_{speed}".encode()).hexdigest() 1334 | 1335 | with self.cache_lock: 1336 | if cache_key in self.audio_cache: 1337 | logger.info(f"Using cached audio for chunk {chunk.chunk_id}") 1338 | return self.audio_cache[cache_key] 1339 | 1340 | if not chunk.text or len(chunk.text.strip()) < 2: 1341 | raise ValueError(f"Chunk {chunk.chunk_id} text is too short or empty") 1342 | 1343 | lang_code = voice_name[0] 1344 | if lang_code not in self.pipelines: 1345 | raise ValueError(f"No pipeline for lang code '{lang_code}'") 1346 | 1347 | pipeline = self.pipelines[lang_code] 1348 | pack = pipeline.load_voice(voice_name) 1349 | effective_use_gpu = use_gpu and CUDA_AVAILABLE and (True in self.models) 1350 | 1351 | # Detailed device selection logging 1352 | if use_gpu and not CUDA_AVAILABLE: 1353 | logger.warning(f"Chunk {chunk.chunk_id}: GPU requested but CUDA not available - using CPU") 1354 | elif use_gpu and CUDA_AVAILABLE and (True not in self.models): 1355 | logger.warning(f"Chunk {chunk.chunk_id}: GPU requested but GPU model failed to load - using CPU") 1356 | 1357 | device_name = "GPU" if effective_use_gpu else "CPU" 1358 | logger.info(f"Generating chunk {chunk.chunk_id}/{chunk.total_chunks} with {device_name} for voice {voice_name}") 1359 | 1360 | # Log GPU memory before inference if using GPU 1361 | if effective_use_gpu: 1362 | log_gpu_status(f"before chunk {chunk.chunk_id}") 1363 | 1364 | for _, ps, _ in pipeline(chunk.text, voice_name, speed): 
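# Descriptive note: `ps` is the phoneme sequence the pipeline produced for this chunk;
# the voice pack appears to be indexed by phoneme count (len(ps) - 1) so that the
# reference style vector matches the length of the sequence passed to the model.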
1365 | ref_s = pack[len(ps) - 1] 1366 | try: 1367 | audio_tensor = self.models[effective_use_gpu](ps, ref_s, speed) 1368 | 1369 | # Log GPU memory after inference if using GPU 1370 | if effective_use_gpu: 1371 | log_gpu_status(f"after chunk {chunk.chunk_id}") 1372 | 1373 | # Cache the result 1374 | with self.cache_lock: 1375 | if len(self.audio_cache) >= CACHE_SIZE: 1376 | # Simple cache eviction 1377 | oldest_keys = list(self.audio_cache.keys())[:CACHE_SIZE // 2] 1378 | for key in oldest_keys: 1379 | del self.audio_cache[key] 1380 | self.audio_cache[cache_key] = audio_tensor 1381 | 1382 | return audio_tensor 1383 | except Exception as e: 1384 | error_device = "GPU" if effective_use_gpu else "CPU" 1385 | logger.error(f"Error on {error_device} for chunk {chunk.chunk_id}: {e}") 1386 | 1387 | if effective_use_gpu: 1388 | logger.warning(f"GPU inference failed for chunk {chunk.chunk_id}, attempting CPU fallback...") 1389 | try: 1390 | log_gpu_status("before CPU fallback") 1391 | audio_tensor = self.models[False](ps, ref_s, speed) 1392 | logger.info(f"✓ CPU fallback successful for chunk {chunk.chunk_id}") 1393 | 1394 | with self.cache_lock: 1395 | if len(self.audio_cache) >= CACHE_SIZE: 1396 | oldest_keys = list(self.audio_cache.keys())[:CACHE_SIZE // 2] 1397 | for key in oldest_keys: 1398 | del self.audio_cache[key] 1399 | self.audio_cache[cache_key] = audio_tensor 1400 | 1401 | return audio_tensor 1402 | except Exception as cpu_error: 1403 | logger.error(f"CPU fallback also failed for chunk {chunk.chunk_id}: {cpu_error}") 1404 | raise RuntimeError(f"Both GPU and CPU inference failed for chunk {chunk.chunk_id}") 1405 | else: 1406 | logger.error(f"CPU inference failed for chunk {chunk.chunk_id} - no fallback available") 1407 | raise 1408 | 1409 | raise RuntimeError(f"Kokoro TTS pipeline yielded no audio frames for chunk {chunk.chunk_id}") 1410 | 1411 | def generate_audio_robust(self, text: str, voice_name: str, speed: float = 1.0, 1412 | use_gpu: bool = True, max_chunk_length: int = DEFAULT_CHUNK_SIZE, 1413 | processing_mode: Optional[TextProcessingMode] = None) -> List[torch.Tensor]: 1414 | """ 1415 | Generate audio with robust text processing and chunking support. 1416 | 1417 | Returns list of audio tensors for each processed chunk. 1418 | """ 1419 | if len(text) > MAX_TEXT_LENGTH: 1420 | raise ValueError(f"Text too long. 
Maximum length is {MAX_TEXT_LENGTH} characters.") 1421 | 1422 | # Process text into manageable chunks 1423 | text_chunks = self.text_processor.process_text( 1424 | text, 1425 | mode=processing_mode, 1426 | max_chunk_length=max_chunk_length 1427 | ) 1428 | 1429 | if not text_chunks: 1430 | raise ValueError("No valid text chunks after processing") 1431 | 1432 | logger.info(f"Processing {len(text_chunks)} chunks for voice {voice_name}") 1433 | 1434 | audio_tensors = [] 1435 | failed_chunks = [] 1436 | 1437 | # Process chunks sequentially for now (could be parallelized) 1438 | for chunk in text_chunks: 1439 | try: 1440 | audio_tensor = self.generate_audio_for_chunk(chunk, voice_name, speed, use_gpu) 1441 | audio_tensors.append(audio_tensor) 1442 | logger.info(f"Successfully generated audio for chunk {chunk.chunk_id + 1}/{len(text_chunks)}") 1443 | except Exception as e: 1444 | logger.error(f"Failed to generate audio for chunk {chunk.chunk_id + 1}: {e}") 1445 | failed_chunks.append(chunk.chunk_id) 1446 | continue 1447 | 1448 | if not audio_tensors: 1449 | raise RuntimeError("Failed to generate audio for any text chunks") 1450 | 1451 | if failed_chunks: 1452 | logger.warning(f"Failed to process chunks: {failed_chunks}") 1453 | 1454 | return audio_tensors 1455 | 1456 | def concatenate_audio_tensors(self, audio_tensors: List[torch.Tensor], silence_duration: float = 0.2) -> torch.Tensor: 1457 | """Concatenate multiple audio tensors with configurable silence between them.""" 1458 | if len(audio_tensors) == 1: 1459 | return audio_tensors[0] 1460 | 1461 | # Adaptive silence based on content 1462 | if len(audio_tensors) > 10: 1463 | silence_duration = min(silence_duration, 0.15) # Shorter pauses for many chunks 1464 | 1465 | # Create silence tensor 1466 | silence_samples = int(SAMPLE_RATE * silence_duration) 1467 | silence = torch.zeros(silence_samples, dtype=audio_tensors[0].dtype, device=audio_tensors[0].device) 1468 | 1469 | # Concatenate with silence 1470 | result_parts = [] 1471 | for i, tensor in enumerate(audio_tensors): 1472 | result_parts.append(tensor) 1473 | if i < len(audio_tensors) - 1: # Don't add silence after last chunk 1474 | result_parts.append(silence) 1475 | 1476 | return torch.cat(result_parts, dim=0) 1477 | 1478 | def generate_audio(self, text: str, voice_name: str, speed: float = 1.0, use_gpu: bool = True) -> torch.Tensor: 1479 | """Legacy method for backward compatibility.""" 1480 | if not text or len(text.strip()) < 2: 1481 | raise ValueError("Input text is too short or empty") 1482 | 1483 | lang_code = voice_name[0] 1484 | if lang_code not in self.pipelines: 1485 | raise ValueError(f"No pipeline for lang code '{lang_code}'") 1486 | 1487 | pipeline = self.pipelines[lang_code] 1488 | pack = pipeline.load_voice(voice_name) 1489 | effective_use_gpu = use_gpu and CUDA_AVAILABLE and (True in self.models) 1490 | 1491 | # Detailed device selection logging 1492 | if use_gpu and not CUDA_AVAILABLE: 1493 | logger.warning("GPU requested but CUDA not available - using CPU") 1494 | elif use_gpu and CUDA_AVAILABLE and (True not in self.models): 1495 | logger.warning("GPU requested but GPU model failed to load - using CPU") 1496 | 1497 | device_name = "GPU" if effective_use_gpu else "CPU" 1498 | logger.info(f"Generating audio with {device_name} for voice {voice_name}") 1499 | 1500 | # Log GPU memory before inference if using GPU 1501 | if effective_use_gpu: 1502 | log_gpu_status("before audio generation") 1503 | 1504 | for _, ps, _ in pipeline(text, voice_name, speed): 1505 | ref_s = 
pack[len(ps) - 1] 1506 | try: 1507 | audio_tensor = self.models[effective_use_gpu](ps, ref_s, speed) 1508 | 1509 | # Log GPU memory after inference if using GPU 1510 | if effective_use_gpu: 1511 | log_gpu_status("after audio generation") 1512 | 1513 | return audio_tensor 1514 | except Exception as e: 1515 | error_device = "GPU" if effective_use_gpu else "CPU" 1516 | logger.error(f"Error on {error_device}: {e}") 1517 | 1518 | if effective_use_gpu: 1519 | logger.warning("GPU inference failed, attempting CPU fallback...") 1520 | try: 1521 | log_gpu_status("before CPU fallback") 1522 | audio_tensor = self.models[False](ps, ref_s, speed) 1523 | logger.info("✓ CPU fallback successful") 1524 | return audio_tensor 1525 | except Exception as cpu_error: 1526 | logger.error(f"CPU fallback also failed: {cpu_error}") 1527 | raise RuntimeError("Both GPU and CPU inference failed") 1528 | else: 1529 | logger.error("CPU inference failed - no fallback available") 1530 | raise 1531 | 1532 | raise RuntimeError("Kokoro TTS pipeline yielded no audio frames") 1533 | 1534 | # Initialize singleton model manager 1535 | enhanced_model_manager = EnhancedModelManager() 1536 | 1537 | # --- Flask App Setup -------------------------------------------------------- 1538 | 1539 | app = Flask(__name__) 1540 | if ENABLE_CORS: 1541 | CORS(app, resources={r"/*": {"origins": "*"}}) # Allow all origins for read-aloud compatibility 1542 | logger.info("CORS enabled for all routes") 1543 | else: 1544 | logger.info("CORS disabled") 1545 | 1546 | # --- Authentication Middleware ---------------------------------------------- 1547 | 1548 | def check_auth(f): 1549 | @wraps(f) 1550 | def decorated(*args, **kwargs): 1551 | if LOG_AUTH_ATTEMPTS: 1552 | auth_header = request.headers.get('Authorization') 1553 | if auth_header: 1554 | logger.info(f"Auth header: {auth_header[:30]}...") 1555 | else: 1556 | logger.info("No authentication provided") 1557 | return f(*args, **kwargs) 1558 | return decorated 1559 | 1560 | # --- Helper Functions ------------------------------------------------------- 1561 | 1562 | def get_kokoro_voice(voice_id: str) -> str: 1563 | if voice_id in KOKORO_VOICES: 1564 | return voice_id 1565 | logger.warning(f"Unknown voice '{voice_id}', using default '{DEFAULT_VOICE}'") 1566 | return DEFAULT_VOICE 1567 | 1568 | def tensor_to_numpy(audio_tensor: torch.Tensor) -> np.ndarray: 1569 | """Convert PyTorch tensor to NumPy array.""" 1570 | audio_numpy = audio_tensor.cpu().numpy().squeeze() 1571 | if audio_numpy.dtype != np.float32: 1572 | audio_numpy = audio_numpy.astype(np.float32) 1573 | max_val = np.max(np.abs(audio_numpy)) 1574 | if max_val > 1.0: 1575 | audio_numpy /= max_val 1576 | return audio_numpy 1577 | 1578 | def numpy_to_format(audio_numpy: np.ndarray, sr: int, audio_format: str) -> bytes: 1579 | """Convert NumPy audio array to specified format.""" 1580 | audio_int16 = (audio_numpy * 32767).astype(np.int16) 1581 | if audio_format == 'wav': 1582 | buffer = io.BytesIO() 1583 | wavfile.write(buffer, sr, audio_int16) 1584 | return buffer.getvalue() 1585 | elif audio_format == 'mp3': 1586 | audio_segment = AudioSegment( 1587 | data=audio_int16.tobytes(), 1588 | sample_width=2, 1589 | frame_rate=sr, 1590 | channels=1 1591 | ) 1592 | buffer = io.BytesIO() 1593 | audio_segment.export(buffer, format="mp3", bitrate="192k") 1594 | return buffer.getvalue() 1595 | else: 1596 | raise ValueError(f"Unsupported audio format: {audio_format}") 1597 | 1598 | def calculate_audio_duration(audio_numpy: np.ndarray, sr: int) -> float: 1599 | 
"""Calculate audio duration in seconds.""" 1600 | return len(audio_numpy) / sr 1601 | 1602 | def play_audio_windows_with_interrupt(audio_numpy: np.ndarray, sr: int, session_id: str): 1603 | """Play audio on Windows with interrupt capability.""" 1604 | try: 1605 | import winsound 1606 | 1607 | # Calculate duration and start playback session 1608 | duration = calculate_audio_duration(audio_numpy, sr) 1609 | playback_controller.start_playback(session_id, duration) 1610 | 1611 | # Convert to WAV bytes 1612 | wav_bytes = numpy_to_format(audio_numpy, sr, 'wav') 1613 | 1614 | # Check for interruption before playing 1615 | if playback_controller.should_stop: 1616 | playback_controller.finish_playback() 1617 | return 1618 | 1619 | # Play audio (this is blocking) 1620 | winsound.PlaySound(wav_bytes, winsound.SND_MEMORY | winsound.SND_NODEFAULT) 1621 | 1622 | # Mark as finished if not interrupted 1623 | if not playback_controller.should_stop: 1624 | playback_controller.finish_playback() 1625 | logger.info("Audio playback completed successfully") 1626 | else: 1627 | logger.info("Audio playback was interrupted") 1628 | 1629 | except ImportError: 1630 | logger.error("winsound module not found") 1631 | playback_controller.finish_playback() 1632 | except Exception as e: 1633 | logger.error(f"Error playing audio: {e}") 1634 | playback_controller.finish_playback() 1635 | 1636 | def apply_effects_pipeline(audio_numpy: np.ndarray, effects_params: Dict, effect_order: List[str]) -> np.ndarray: 1637 | """Apply effects pipeline to audio with proper zero-default handling.""" 1638 | for effect in effect_order: 1639 | if effect == 'volume': 1640 | volume_params = effects_params.get('volume', {}) 1641 | gain = volume_params.get('gain', 1.0) 1642 | gain_db = volume_params.get('gain_db') 1643 | # Only apply if gain != 1.0 or gain_db is specified 1644 | if gain != 1.0 or gain_db is not None: 1645 | audio_numpy = apply_volume(audio_numpy, gain, gain_db) 1646 | 1647 | elif effect == 'equalizer': 1648 | eq_params = effects_params.get('equalizer', {}) 1649 | bands = eq_params.get('bands', []) 1650 | # Only apply if bands are provided 1651 | if bands: 1652 | audio_numpy = apply_eq(audio_numpy, SAMPLE_RATE, bands) 1653 | 1654 | elif effect == 'compression': 1655 | comp_params = effects_params.get('compression', {}) 1656 | threshold_db = comp_params.get('threshold_db', 0.0) 1657 | ratio = comp_params.get('ratio', 1.0) 1658 | attack_ms = comp_params.get('attack_ms', 0.1) 1659 | release_ms = comp_params.get('release_ms', 10.0) 1660 | # Only apply if compression parameters indicate it should be used 1661 | if threshold_db < 0.0 and ratio > 1.0: 1662 | audio_numpy = apply_compression(audio_numpy, SAMPLE_RATE, threshold_db, ratio, attack_ms, release_ms) 1663 | 1664 | elif effect == 'distortion': 1665 | dist_params = effects_params.get('distortion', {}) 1666 | drive_db = dist_params.get('drive_db', 0.0) 1667 | dist_type = dist_params.get('type', 'tanh') 1668 | mix = dist_params.get('mix', 0.0) 1669 | # Only apply if drive_db > 0 and mix > 0 1670 | if drive_db > 0.0 and mix > 0.0: 1671 | audio_numpy = apply_distortion(audio_numpy, drive_db, dist_type, mix) 1672 | 1673 | elif effect == 'pitch': 1674 | pitch_params = effects_params.get('pitch', {}) 1675 | target_note = pitch_params.get('target_note') 1676 | semitone_shift = float(pitch_params.get('semitone_shift', 0.0)) 1677 | preserve_formants = pitch_params.get('preserve_formants', False) 1678 | # Only apply if target_note is specified or semitone_shift != 0 1679 | if target_note or 
abs(semitone_shift) > 0.01: 1680 | audio_numpy = shift_to_target(audio_numpy, SAMPLE_RATE, target_note, semitone_shift, preserve_formants) 1681 | 1682 | elif effect == 'formant': 1683 | formant_params = effects_params.get('formant', {}) 1684 | shift_percent = float(formant_params.get('shift_percent', 0.0)) 1685 | scale = float(formant_params.get('scale', 1.0)) 1686 | # Only apply if shift_percent != 0 1687 | if abs(shift_percent) > 0.01: 1688 | shift_factor = 1.0 + (np.clip(shift_percent, -100.0, 100.0) / 200.0) 1689 | audio_numpy = apply_formant_shift(audio_numpy, SAMPLE_RATE, shift_factor, scale) 1690 | 1691 | elif effect == 'voice_character': 1692 | char_params = effects_params.get('voice_character', {}) 1693 | char_type = char_params.get('type', 'none') 1694 | char_custom_params = char_params.get('params', {}) 1695 | # Only apply if character type is not 'none' 1696 | if char_type != 'none': 1697 | audio_numpy = apply_voice_character(audio_numpy, SAMPLE_RATE, char_type, char_custom_params) 1698 | 1699 | elif effect == 'chorus': 1700 | chorus_params = effects_params.get('chorus', {}) 1701 | delay_ms = chorus_params.get('delay_ms', 0.0) 1702 | depth = chorus_params.get('depth', 0.0) 1703 | rate_hz = chorus_params.get('rate_hz', 0.0) 1704 | mix = chorus_params.get('mix', 0.0) 1705 | # Only apply if depth > 0 and mix > 0 1706 | if depth > 0.0 and mix > 0.0: 1707 | audio_numpy = apply_chorus(audio_numpy, SAMPLE_RATE, delay_ms, depth, rate_hz, mix) 1708 | 1709 | elif effect == 'flanger': 1710 | flanger_params = effects_params.get('flanger', {}) 1711 | delay_ms = flanger_params.get('delay_ms', 0.0) 1712 | depth = flanger_params.get('depth', 0.0) 1713 | rate_hz = flanger_params.get('rate_hz', 0.0) 1714 | feedback = flanger_params.get('feedback', 0.0) 1715 | mix = flanger_params.get('mix', 0.0) 1716 | # Only apply if depth > 0 and mix > 0 1717 | if depth > 0.0 and mix > 0.0: 1718 | audio_numpy = apply_flanger(audio_numpy, SAMPLE_RATE, delay_ms, depth, rate_hz, feedback, mix) 1719 | 1720 | elif effect == 'reverb': 1721 | reverb_params = effects_params.get('reverb', {}) 1722 | room_size = float(reverb_params.get('room_size_percent', 0.0)) / 100.0 1723 | damping = float(reverb_params.get('damping_percent', 50.0)) / 100.0 1724 | pre_delay_ms = float(reverb_params.get('pre_delay_ms', 0.0)) 1725 | stereo_width = float(reverb_params.get('stereo_width', 0.0)) 1726 | # Only apply if room_size > 0 1727 | if room_size > 0.01: 1728 | audio_numpy = apply_reverb(audio_numpy, SAMPLE_RATE, room_size, damping, pre_delay_ms, stereo_width) 1729 | else: 1730 | logger.warning(f"Unknown effect in order: {effect}") 1731 | 1732 | return audio_numpy 1733 | 1734 | # --- API Endpoints ---------------------------------------------------------- 1735 | 1736 | @app.route('/ping', methods=['GET']) 1737 | def ping_route(): 1738 | return jsonify({"status": "ok", "timestamp": datetime.utcnow().isoformat() + 'Z'}) 1739 | 1740 | @app.route('/v1/audio/speech', methods=['POST']) 1741 | @check_auth 1742 | def create_speech_route(): 1743 | """Standard speech generation endpoint with clean zero-default effects.""" 1744 | try: 1745 | data = request.get_json() 1746 | if not data: 1747 | return jsonify({"error": "No JSON data provided"}), 400 1748 | 1749 | text = data.get('input', '').strip() 1750 | if not text: 1751 | return jsonify({"error": "Missing or empty 'input' field"}), 400 1752 | 1753 | voice_id = data.get('voice', DEFAULT_VOICE) 1754 | speed = float(data.get('speed', 1.0)) 1755 | response_format = data.get('response_format', 
'mp3').lower() 1756 | use_gpu_flag = data.get('use_gpu', True) 1757 | 1758 | # Default effect order - but effects won't apply unless explicitly configured 1759 | effect_order = data.get('effects', {}).get('order', DEFAULT_EFFECT_ORDER) 1760 | 1761 | if not (0.25 <= speed <= 4.0): 1762 | return jsonify({"error": "Speed must be between 0.25 and 4.0"}), 400 1763 | 1764 | supported_formats = ['mp3', 'wav'] 1765 | if response_format not in supported_formats: 1766 | logger.warning(f"Unsupported format '{response_format}', defaulting to mp3") 1767 | response_format = 'mp3' 1768 | 1769 | kokoro_voice_id = get_kokoro_voice(voice_id) 1770 | logger.info(f"Request: voice={kokoro_voice_id}, speed={speed}, format={response_format}, text='{text[:50]}...'") 1771 | 1772 | # Generate base audio 1773 | audio_tensor = enhanced_model_manager.generate_audio(text, kokoro_voice_id, speed, use_gpu_flag) 1774 | audio_numpy = tensor_to_numpy(audio_tensor) 1775 | 1776 | # Apply effects pipeline (only applies effects that are explicitly configured) 1777 | effects_params = data.get('effects', {}) 1778 | audio_numpy = apply_effects_pipeline(audio_numpy, effects_params, effect_order) 1779 | 1780 | # Final normalization 1781 | max_val = np.max(np.abs(audio_numpy)) 1782 | if max_val > 1e-6: # Avoid division by zero or tiny numbers 1783 | audio_numpy = audio_numpy / max_val 1784 | 1785 | # Convert to desired format 1786 | audio_bytes = numpy_to_format(audio_numpy, SAMPLE_RATE, response_format) 1787 | mime_type = 'audio/mpeg' if response_format == 'mp3' else f'audio/{response_format}' 1788 | 1789 | return Response( 1790 | audio_bytes, 1791 | mimetype=mime_type, 1792 | headers={ 1793 | 'Content-Type': mime_type, 1794 | 'Content-Length': str(len(audio_bytes)), 1795 | 'Access-Control-Allow-Origin': '*', # For read-aloud compatibility 1796 | 'Access-Control-Allow-Methods': 'POST, GET, OPTIONS', 1797 | 'Access-Control-Allow-Headers': 'Content-Type, Authorization' 1798 | } 1799 | ) 1800 | except ValueError as ve: 1801 | logger.error(f"ValueError: {ve}") 1802 | return jsonify({"error": str(ve)}), 400 1803 | except RuntimeError as re: 1804 | logger.error(f"RuntimeError: {re}") 1805 | return jsonify({"error": str(re)}), 500 1806 | except Exception as e: 1807 | logger.error(f"Unexpected error: {e}", exc_info=True) 1808 | return jsonify({"error": "Internal server error"}), 500 1809 | 1810 | @app.route('/v1/audio/speech/robust', methods=['POST']) 1811 | @check_auth 1812 | def create_speech_robust_route(): 1813 | """Enhanced speech generation endpoint with robust text processing.""" 1814 | try: 1815 | data = request.get_json() 1816 | if not data: 1817 | return jsonify({"error": "No JSON data provided"}), 400 1818 | 1819 | text = data.get('input', '').strip() 1820 | if not text: 1821 | return jsonify({"error": "Missing or empty 'input' field"}), 400 1822 | 1823 | voice_id = data.get('voice', DEFAULT_VOICE) 1824 | speed = float(data.get('speed', 1.0)) 1825 | response_format = data.get('response_format', 'mp3').lower() 1826 | use_gpu_flag = data.get('use_gpu', True) 1827 | max_chunk_length = int(data.get('max_chunk_length', DEFAULT_CHUNK_SIZE)) 1828 | silence_between_chunks = float(data.get('silence_between_chunks', 0.2)) 1829 | processing_mode = data.get('processing_mode') 1830 | 1831 | # Convert string mode to enum 1832 | if processing_mode: 1833 | try: 1834 | processing_mode = TextProcessingMode(processing_mode.lower()) 1835 | except ValueError: 1836 | processing_mode = None 1837 | 1838 | effect_order = data.get('effects', 
{}).get('order', DEFAULT_EFFECT_ORDER) 1839 | 1840 | if not (0.25 <= speed <= 4.0): 1841 | return jsonify({"error": "Speed must be between 0.25 and 4.0"}), 400 1842 | 1843 | if not (MIN_CHUNK_SIZE <= max_chunk_length <= 1000): 1844 | return jsonify({"error": f"max_chunk_length must be between {MIN_CHUNK_SIZE} and 1000"}), 400 1845 | 1846 | supported_formats = ['mp3', 'wav'] 1847 | if response_format not in supported_formats: 1848 | logger.warning(f"Unsupported format '{response_format}', defaulting to mp3") 1849 | response_format = 'mp3' 1850 | 1851 | kokoro_voice_id = get_kokoro_voice(voice_id) 1852 | 1853 | logger.info(f"Robust request: voice={kokoro_voice_id}, speed={speed}, " 1854 | f"format={response_format}, max_chunk={max_chunk_length}, " 1855 | f"mode={processing_mode}, text='{text[:50]}...'") 1856 | 1857 | # Generate audio for all chunks using robust processing 1858 | audio_tensors = enhanced_model_manager.generate_audio_robust( 1859 | text, kokoro_voice_id, speed, use_gpu_flag, max_chunk_length, processing_mode 1860 | ) 1861 | 1862 | # Concatenate all audio chunks 1863 | combined_audio = enhanced_model_manager.concatenate_audio_tensors( 1864 | audio_tensors, silence_between_chunks 1865 | ) 1866 | 1867 | # Convert to numpy and apply effects 1868 | audio_numpy = tensor_to_numpy(combined_audio) 1869 | effects_params = data.get('effects', {}) 1870 | audio_numpy = apply_effects_pipeline(audio_numpy, effects_params, effect_order) 1871 | 1872 | # Final normalization 1873 | max_val = np.max(np.abs(audio_numpy)) 1874 | if max_val > 1e-6: 1875 | audio_numpy = audio_numpy / max_val 1876 | 1877 | # Convert to desired format 1878 | audio_bytes = numpy_to_format(audio_numpy, SAMPLE_RATE, response_format) 1879 | mime_type = 'audio/mpeg' if response_format == 'mp3' else f'audio/{response_format}' 1880 | 1881 | return Response( 1882 | audio_bytes, 1883 | mimetype=mime_type, 1884 | headers={ 1885 | 'Content-Type': mime_type, 1886 | 'Content-Length': str(len(audio_bytes)), 1887 | 'X-Chunks-Processed': str(len(audio_tensors)), 1888 | 'X-Processing-Mode': processing_mode.value if processing_mode else 'auto', 1889 | 'Access-Control-Allow-Origin': '*', 1890 | 'Access-Control-Allow-Methods': 'POST, GET, OPTIONS', 1891 | 'Access-Control-Allow-Headers': 'Content-Type, Authorization' 1892 | } 1893 | ) 1894 | 1895 | except ValueError as ve: 1896 | logger.error(f"ValueError: {ve}") 1897 | return jsonify({"error": str(ve)}), 400 1898 | except RuntimeError as re: 1899 | logger.error(f"RuntimeError: {re}") 1900 | return jsonify({"error": str(re)}), 500 1901 | except Exception as e: 1902 | logger.error(f"Unexpected error: {e}", exc_info=True) 1903 | return jsonify({"error": "Internal server error"}), 500 1904 | 1905 | @app.route('/v1/audio/speech/stream', methods=['POST']) 1906 | @check_auth 1907 | def create_speech_stream_route(): 1908 | """Streaming speech generation endpoint.""" 1909 | try: 1910 | data = request.get_json() 1911 | if not data: 1912 | return jsonify({"error": "No JSON data provided"}), 400 1913 | 1914 | text = data.get('input', '').strip() 1915 | if not text: 1916 | return jsonify({"error": "Missing or empty 'input' field"}), 400 1917 | 1918 | voice_id = data.get('voice', DEFAULT_VOICE) 1919 | speed = float(data.get('speed', 1.0)) 1920 | response_format = data.get('response_format', 'mp3').lower() 1921 | use_gpu_flag = data.get('use_gpu', True) 1922 | max_chunk_length = int(data.get('max_chunk_length', DEFAULT_CHUNK_SIZE)) 1923 | processing_mode = data.get('processing_mode') 1924 | 1925 | if 
processing_mode: 1926 | try: 1927 | processing_mode = TextProcessingMode(processing_mode.lower()) 1928 | except ValueError: 1929 | processing_mode = None 1930 | 1931 | if not (0.25 <= speed <= 4.0): 1932 | return jsonify({"error": "Speed must be between 0.25 and 4.0"}), 400 1933 | 1934 | kokoro_voice_id = get_kokoro_voice(voice_id) 1935 | 1936 | def generate_audio_stream(): 1937 | try: 1938 | # Process text into chunks 1939 | text_chunks = enhanced_model_manager.text_processor.process_text( 1940 | text, mode=processing_mode, max_chunk_length=max_chunk_length 1941 | ) 1942 | 1943 | effects_params = data.get('effects', {}) 1944 | effect_order = effects_params.get('order', DEFAULT_EFFECT_ORDER) 1945 | 1946 | for chunk in text_chunks: 1947 | try: 1948 | # Generate audio for chunk 1949 | audio_tensor = enhanced_model_manager.generate_audio_for_chunk( 1950 | chunk, kokoro_voice_id, speed, use_gpu_flag 1951 | ) 1952 | 1953 | # Convert to numpy and apply effects 1954 | audio_numpy = tensor_to_numpy(audio_tensor) 1955 | audio_numpy = apply_effects_pipeline(audio_numpy, effects_params, effect_order) 1956 | 1957 | # Final normalization for the chunk 1958 | max_val = np.max(np.abs(audio_numpy)) 1959 | if max_val > 1e-6: 1960 | audio_numpy = audio_numpy / max_val 1961 | 1962 | # Convert to desired format 1963 | audio_bytes = numpy_to_format(audio_numpy, SAMPLE_RATE, response_format) 1964 | 1965 | yield audio_bytes 1966 | 1967 | except Exception as e: 1968 | logger.error(f"Error processing chunk {chunk.chunk_id}: {e}") 1969 | continue 1970 | 1971 | except Exception as e: 1972 | logger.error(f"Streaming error: {e}") 1973 | yield b'' # Empty response on error 1974 | 1975 | mime_type = 'audio/mpeg' if response_format == 'mp3' else f'audio/{response_format}' 1976 | 1977 | return Response( 1978 | stream_with_context(generate_audio_stream()), 1979 | mimetype=mime_type, 1980 | headers={ 1981 | 'Content-Type': mime_type, 1982 | 'Transfer-Encoding': 'chunked', 1983 | 'Access-Control-Allow-Origin': '*', 1984 | 'Access-Control-Allow-Methods': 'POST, GET, OPTIONS', 1985 | 'Access-Control-Allow-Headers': 'Content-Type, Authorization' 1986 | } 1987 | ) 1988 | 1989 | except Exception as e: 1990 | logger.error(f"Streaming setup error: {e}", exc_info=True) 1991 | return jsonify({"error": "Internal server error"}), 500 1992 | 1993 | @app.route('/v1/audio/speech/play', methods=['POST']) 1994 | @check_auth 1995 | def play_speech_route(): 1996 | """Play speech locally with interrupt capability (Windows only).""" 1997 | try: 1998 | data = request.get_json() 1999 | if not data: 2000 | return jsonify({"error": "No JSON data provided"}), 400 2001 | 2002 | text = data.get('input', '').strip() 2003 | if not text: 2004 | return jsonify({"error": "Missing or empty 'input' field"}), 400 2005 | 2006 | voice_id = data.get('voice', DEFAULT_VOICE) 2007 | speed = float(data.get('speed', 1.0)) 2008 | use_gpu_flag = data.get('use_gpu', True) 2009 | use_robust = data.get('use_robust_processing', False) 2010 | 2011 | # Generate unique session ID 2012 | session_id = hashlib.md5(f"{text}_{voice_id}_{speed}_{time.time()}".encode()).hexdigest()[:8] 2013 | 2014 | effect_order = data.get('effects', {}).get('order', DEFAULT_EFFECT_ORDER) 2015 | 2016 | if not (0.25 <= speed <= 4.0): 2017 | return jsonify({"error": "Speed must be between 0.25 and 4.0"}), 400 2018 | 2019 | kokoro_voice_id = get_kokoro_voice(voice_id) 2020 | logger.info(f"Play request: voice={kokoro_voice_id}, speed={speed}, robust={use_robust}, session={session_id}, 
text='{text[:50]}...'") 2021 | 2022 | if use_robust: 2023 | # Use robust processing 2024 | max_chunk_length = int(data.get('max_chunk_length', DEFAULT_CHUNK_SIZE)) 2025 | processing_mode = data.get('processing_mode') 2026 | if processing_mode: 2027 | try: 2028 | processing_mode = TextProcessingMode(processing_mode.lower()) 2029 | except ValueError: 2030 | processing_mode = None 2031 | 2032 | audio_tensors = enhanced_model_manager.generate_audio_robust( 2033 | text, kokoro_voice_id, speed, use_gpu_flag, max_chunk_length, processing_mode 2034 | ) 2035 | combined_audio = enhanced_model_manager.concatenate_audio_tensors(audio_tensors, 0.2) 2036 | audio_numpy = tensor_to_numpy(combined_audio) 2037 | else: 2038 | # Use legacy method 2039 | audio_tensor = enhanced_model_manager.generate_audio(text, kokoro_voice_id, speed, use_gpu_flag) 2040 | audio_numpy = tensor_to_numpy(audio_tensor) 2041 | 2042 | # Apply effects 2043 | effects_params = data.get('effects', {}) 2044 | audio_numpy = apply_effects_pipeline(audio_numpy, effects_params, effect_order) 2045 | 2046 | # Final normalization 2047 | max_val = np.max(np.abs(audio_numpy)) 2048 | if max_val > 1e-6: 2049 | audio_numpy = audio_numpy / max_val 2050 | 2051 | # Start playback in a separate thread to avoid blocking 2052 | def play_in_thread(): 2053 | play_audio_windows_with_interrupt(audio_numpy, SAMPLE_RATE, session_id) 2054 | 2055 | playback_thread = threading.Thread(target=play_in_thread, daemon=True) 2056 | playback_thread.start() 2057 | 2058 | return jsonify({ 2059 | "status": "success", 2060 | "message": "Audio playback started", 2061 | "session_id": session_id, 2062 | "robust_processing": use_robust, 2063 | "duration": calculate_audio_duration(audio_numpy, SAMPLE_RATE) 2064 | }), 200 2065 | 2066 | except ValueError as ve: 2067 | logger.error(f"ValueError: {ve}") 2068 | return jsonify({"error": str(ve)}), 400 2069 | except RuntimeError as re: 2070 | logger.error(f"RuntimeError: {re}") 2071 | return jsonify({"error": str(re)}), 500 2072 | except Exception as e: 2073 | logger.error(f"Unexpected error: {e}", exc_info=True) 2074 | return jsonify({"error": "Internal server error"}), 500 2075 | 2076 | @app.route('/v1/audio/speech/stop', methods=['POST']) 2077 | @check_auth 2078 | def stop_speech_route(): 2079 | """Stop/interrupt current audio playback.""" 2080 | try: 2081 | data = request.get_json() 2082 | session_id = data.get('session_id') if data else None 2083 | 2084 | success = playback_controller.stop_playback(session_id) 2085 | status = playback_controller.get_status() 2086 | 2087 | if success: 2088 | return jsonify({ 2089 | "status": "success", 2090 | "message": "Audio playback stopped", 2091 | "session_id": status["session_id"], 2092 | "interrupted_at": status["interrupted_at"], 2093 | "total_duration": status["total_duration"] 2094 | }), 200 2095 | else: 2096 | return jsonify({ 2097 | "status": "error", 2098 | "message": "No active playback session or wrong session ID", 2099 | "current_session": status["session_id"] 2100 | }), 400 2101 | 2102 | except Exception as e: 2103 | logger.error(f"Stop playback error: {e}", exc_info=True) 2104 | return jsonify({"error": "Internal server error"}), 500 2105 | 2106 | @app.route('/v1/audio/speech/status', methods=['GET']) 2107 | @check_auth 2108 | def playback_status_route(): 2109 | """Get current playback status.""" 2110 | try: 2111 | status = playback_controller.get_status() 2112 | return jsonify({ 2113 | "status": "success", 2114 | "playback": status 2115 | }), 200 2116 | 2117 | except Exception as 
e: 2118 | logger.error(f"Status check error: {e}", exc_info=True) 2119 | return jsonify({"error": "Internal server error"}), 500 2120 | 2121 | @app.route('/v1/models', methods=['GET']) 2122 | @check_auth 2123 | def list_models_route(): 2124 | return jsonify({ 2125 | "object": "list", 2126 | "data": [{ 2127 | "id": "kokoro-tts-1", 2128 | "object": "model", 2129 | "created": int(datetime.now().timestamp()), 2130 | "owned_by": "kokoro-project" 2131 | }] 2132 | }) 2133 | 2134 | @app.route('/v1/voices', methods=['GET']) 2135 | @check_auth 2136 | def list_voices_route(): 2137 | voices_list = [] 2138 | for voice_id, info in KOKORO_VOICES.items(): 2139 | voices_list.append({ 2140 | "id": voice_id, 2141 | "name": info.get('description', voice_id), 2142 | "gender": info.get('gender', 'unknown'), 2143 | "language_code": info.get('lang', 'unknown'), 2144 | "model_id": "kokoro-tts-1" 2145 | }) 2146 | return jsonify({"object": "list", "data": voices_list}) 2147 | 2148 | @app.route('/v1/effects', methods=['GET']) 2149 | @check_auth 2150 | def list_effects_route(): 2151 | return jsonify({ 2152 | "available_effects": { 2153 | "volume": { 2154 | "gain": {"type": "float", "range": [0.0, 2.0], "default": 1.0, "description": "Linear gain multiplier"}, 2155 | "gain_db": {"type": "float", "range": [-60.0, 12.0], "default": None, "description": "Gain in dB, overrides gain"} 2156 | }, 2157 | "pitch": { 2158 | "target_note": { 2159 | "type": "string", 2160 | "description": "Musical note (e.g., 'C4', 'A#5') or null", 2161 | "available_notes": available_notes() 2162 | }, 2163 | "semitone_shift": {"type": "float", "range": [-12, 12], "default": 0.0, "description": "Pitch shift in semitones"}, 2164 | "preserve_formants": {"type": "boolean", "default": False, "description": "Preserve formants during pitch shift"} 2165 | }, 2166 | "voice_character": { 2167 | "type": {"type": "string", "options": ["none", "child", "robot", "deep", "whisper", "alien", "monster", "echo"], "default": "none", "description": "Voice character transformation"}, 2168 | "params": { 2169 | "pitch_shift": {"type": "float", "range": [-12, 12], "description": "Custom pitch shift for character"}, 2170 | "formant_shift": {"type": "float", "range": [0.5, 1.5], "description": "Custom formant shift factor"}, 2171 | "speed": {"type": "float", "range": [0.5, 2.0], "description": "Custom time stretch factor"}, 2172 | "carrier_freq": {"type": "float", "range": [50, 200], "description": "Carrier frequency for robot effect"}, 2173 | "distortion_factor": {"type": "float", "range": [0.5, 5.0], "description": "Distortion intensity for robot/monster"}, 2174 | "noise_level": {"type": "float", "range": [0.0, 0.1], "description": "Noise level for whisper"}, 2175 | "signal_level": {"type": "float", "range": [0.0, 1.0], "description": "Signal level for whisper"}, 2176 | "compression_factor": {"type": "float", "range": [0.5, 2.0], "description": "Compression intensity for whisper"}, 2177 | "flanger_delay_ms": {"type": "float", "range": [0.1, 10.0], "description": "Flanger delay for alien"}, 2178 | "distortion_drive": {"type": "float", "range": [0.0, 36.0], "description": "Distortion drive for monster"}, 2179 | "pre_delay_ms": {"type": "float", "range": [0.0, 100.0], "description": "Pre-delay for echo"} 2180 | } 2181 | }, 2182 | "equalizer": { 2183 | "bands": { 2184 | "type": "array", 2185 | "items": { 2186 | "frequency_hz": {"type": "float", "range": [20, 20000], "description": "Center frequency in Hz"}, 2187 | "gain_db": {"type": "float", "range": [-24, 24], 
"description": "Gain in dB"}, 2188 | "q_factor": {"type": "float", "range": [0.1, 10], "default": 1.0, "description": "Bandwidth control"}, 2189 | "type": {"type": "string", "options": ["peak", "low_shelf", "high_shelf"], "description": "Filter type"} 2190 | } 2191 | } 2192 | }, 2193 | "reverb": { 2194 | "room_size_percent": {"type": "float", "range": [0, 100], "default": 0.0, "description": "Room size percentage"}, 2195 | "damping_percent": {"type": "float", "range": [0, 100], "default": 50.0, "description": "High frequency damping percentage"}, 2196 | "pre_delay_ms": {"type": "float", "range": [0, 100], "default": 0.0, "description": "Pre-delay in milliseconds"}, 2197 | "stereo_width": {"type": "float", "range": [0, 1], "default": 0.0, "description": "Stereo width for reverb"} 2198 | }, 2199 | "formant": { 2200 | "shift_percent": {"type": "float", "range": [-100, 100], "default": 0.0, "description": "Formant shift percentage"}, 2201 | "scale": {"type": "float", "range": [0.5, 2.0], "default": 1.0, "description": "Intensity of formant shift"} 2202 | }, 2203 | "distortion": { 2204 | "drive_db": {"type": "float", "range": [0, 36], "default": 0.0, "description": "Drive gain in dB"}, 2205 | "type": {"type": "string", "options": ["soft", "hard", "tanh"], "default": "tanh", "description": "Clipping type"}, 2206 | "mix": {"type": "float", "range": [0, 1], "default": 0.0, "description": "Dry/wet mix"} 2207 | }, 2208 | "chorus": { 2209 | "delay_ms": {"type": "float", "range": [5, 50], "default": 0.0, "description": "Base delay in milliseconds"}, 2210 | "depth": {"type": "float", "range": [0, 0.1], "default": 0.0, "description": "Modulation depth"}, 2211 | "rate_hz": {"type": "float", "range": [0.1, 5], "default": 0.0, "description": "Modulation rate in Hz"}, 2212 | "mix": {"type": "float", "range": [0, 1], "default": 0.0, "description": "Dry/wet mix"} 2213 | }, 2214 | "flanger": { 2215 | "delay_ms": {"type": "float", "range": [0.1, 10], "default": 0.0, "description": "Base delay in milliseconds"}, 2216 | "depth": {"type": "float", "range": [0, 0.05], "default": 0.0, "description": "Modulation depth"}, 2217 | "rate_hz": {"type": "float", "range": [0.1, 10], "default": 0.0, "description": "LFO rate in Hz"}, 2218 | "feedback": {"type": "float", "range": [0, 0.9], "default": 0.0, "description": "Feedback amount"}, 2219 | "mix": {"type": "float", "range": [0, 1], "default": 0.0, "description": "Dry/wet mix"} 2220 | }, 2221 | "compression": { 2222 | "threshold_db": {"type": "float", "range": [-60, 0], "default": 0.0, "description": "Threshold in dB"}, 2223 | "ratio": {"type": "float", "range": [1, 20], "default": 1.0, "description": "Compression ratio"}, 2224 | "attack_ms": {"type": "float", "range": [0.1, 100], "default": 0.1, "description": "Attack time in milliseconds"}, 2225 | "release_ms": {"type": "float", "range": [10, 1000], "default": 10.0, "description": "Release time in milliseconds"} 2226 | }, 2227 | "order": { 2228 | "type": "array", 2229 | "items": {"type": "string", "options": ["volume", "equalizer", "compression", "distortion", "pitch", "formant", "voice_character", "chorus", "flanger", "reverb"]}, 2230 | "default": ["volume", "equalizer", "compression", "distortion", "pitch", "formant", "voice_character", "chorus", "flanger", "reverb"], 2231 | "description": "Order of effect application" 2232 | } 2233 | } 2234 | }) 2235 | 2236 | @app.route('/v1/text/process', methods=['POST']) 2237 | @check_auth 2238 | def process_text_route(): 2239 | """Text processing endpoint for testing and 
debugging.""" 2240 | try: 2241 | data = request.get_json() 2242 | if not data: 2243 | return jsonify({"error": "No JSON data provided"}), 400 2244 | 2245 | text = data.get('input', '').strip() 2246 | if not text: 2247 | return jsonify({"error": "Missing or empty 'input' field"}), 400 2248 | 2249 | max_chunk_length = int(data.get('max_chunk_length', DEFAULT_CHUNK_SIZE)) 2250 | processing_mode = data.get('processing_mode') 2251 | 2252 | if processing_mode: 2253 | try: 2254 | processing_mode = TextProcessingMode(processing_mode.lower()) 2255 | except ValueError: 2256 | processing_mode = None 2257 | 2258 | # Process text 2259 | text_chunks = enhanced_model_manager.text_processor.process_text( 2260 | text, mode=processing_mode, max_chunk_length=max_chunk_length 2261 | ) 2262 | 2263 | # Format response 2264 | chunks_data = [] 2265 | for chunk in text_chunks: 2266 | chunks_data.append({ 2267 | "chunk_id": chunk.chunk_id, 2268 | "text": chunk.text, 2269 | "char_count": chunk.char_count, 2270 | "processing_time": chunk.processing_time 2271 | }) 2272 | 2273 | return jsonify({ 2274 | "original_text": text, 2275 | "processing_mode": processing_mode.value if processing_mode else "auto", 2276 | "total_chunks": len(text_chunks), 2277 | "chunks": chunks_data 2278 | }) 2279 | 2280 | except Exception as e: 2281 | logger.error(f"Text processing error: {e}", exc_info=True) 2282 | return jsonify({"error": "Internal server error"}), 500 2283 | 2284 | @app.route('/health', methods=['GET']) 2285 | def health_check_route(): 2286 | # Get GPU memory info for health check 2287 | gpu_memory = get_gpu_memory_info() 2288 | 2289 | gpu_status = { 2290 | "cuda_available": CUDA_AVAILABLE, 2291 | "device_count": GPU_DEVICE_COUNT, 2292 | "device_name": GPU_DEVICE_NAME, 2293 | "memory": { 2294 | "total_gb": gpu_memory['total'], 2295 | "allocated_gb": gpu_memory['allocated'], 2296 | "free_gb": gpu_memory['free'] 2297 | } if CUDA_AVAILABLE else None, 2298 | "models_loaded": { 2299 | "cpu": False in enhanced_model_manager.models, 2300 | "gpu": True in enhanced_model_manager.models 2301 | } 2302 | } 2303 | 2304 | return jsonify({ 2305 | "status": "healthy", 2306 | "cuda_available": CUDA_AVAILABLE, # Keep for backward compatibility 2307 | "gpu_status": gpu_status, 2308 | "voices_loaded": len(KOKORO_VOICES), 2309 | "default_voice": DEFAULT_VOICE, 2310 | "effects_available": ["volume", "pitch", "voice_character", "equalizer", "reverb", "formant", "distortion", "chorus", "flanger", "compression"], 2311 | "features": { 2312 | "robust_text_processing": True, 2313 | "zero_default_effects": True, 2314 | "playback_control": True, 2315 | "interrupt_capability": True, 2316 | "streaming": True, 2317 | "caching": True, 2318 | "markdown_support": HAS_MARKDOWN, 2319 | "num2words_support": HAS_NUM2WORDS, 2320 | "read_aloud_compatible": True, 2321 | "pitch_shifting_fixed": True, 2322 | "enhanced_gpu_logging": True 2323 | }, 2324 | "cache_stats": { 2325 | "text_cache_size": len(enhanced_model_manager.text_processor._cache), 2326 | "audio_cache_size": len(enhanced_model_manager.audio_cache) 2327 | }, 2328 | "playback_status": playback_controller.get_status(), 2329 | "timestamp": datetime.utcnow().isoformat() + 'Z' 2330 | }) 2331 | 2332 | @app.route('/', methods=['GET']) 2333 | def index_route(): 2334 | return jsonify({ 2335 | "service": "Complete Kokoro TTS API", 2336 | "version": "3.3.0", 2337 | "description": "Production-grade TTS API with FIXED pitch shifting, read-aloud compatibility, and zero-default effects", 2338 | "status": "running", 2339 | 
"default_voice": DEFAULT_VOICE, 2340 | "cuda_available": CUDA_AVAILABLE, 2341 | "key_features": { 2342 | "zero_default_effects": "Effects only apply when explicitly configured", 2343 | "robust_text_processing": "Handles markdown, unicode, numbers, abbreviations", 2344 | "playback_control": "Local audio playback with interrupt capability", 2345 | "session_management": "Track and control audio playback sessions", 2346 | "streaming_support": "Real-time audio streaming", 2347 | "divide_by_zero_safe": "All calculations protected against mathematical errors", 2348 | "pitch_shifting_fixed": "Robust pitch shifting with librosa compatibility", 2349 | "read_aloud_compatible": "Full CORS support for browser extensions" 2350 | }, 2351 | "endpoints": { 2352 | "speech_generation": "/v1/audio/speech (POST) - Clean zero-default effects", 2353 | "robust_speech_generation": "/v1/audio/speech/robust (POST) - Enhanced with text processing", 2354 | "streaming_speech": "/v1/audio/speech/stream (POST) - Streaming generation", 2355 | "speech_playback": "/v1/audio/speech/play (POST) - Local playback with session control", 2356 | "stop_playback": "/v1/audio/speech/stop (POST) - Stop/interrupt current playback", 2357 | "playback_status": "/v1/audio/speech/status (GET) - Get current playback status", 2358 | "text_processing": "/v1/text/process (POST) - Text processing testing", 2359 | "list_models": "/v1/models (GET)", 2360 | "list_voices": "/v1/voices (GET)", 2361 | "list_effects": "/v1/effects (GET)", 2362 | "health_check": "/health (GET)", 2363 | "ping": "/ping (GET)" 2364 | }, 2365 | "example_pitch_requests": { 2366 | "squeaky_voice": { 2367 | "input": "Hello world! This is a test of the squeaky voice effect.", 2368 | "voice": "af_heart", 2369 | "speed": 1.0, 2370 | "effects": { 2371 | "pitch": {"semitone_shift": 8.0} 2372 | } 2373 | }, 2374 | "deep_voice": { 2375 | "input": "Hello world! This is a test of the deep voice effect.", 2376 | "voice": "af_heart", 2377 | "speed": 1.0, 2378 | "effects": { 2379 | "pitch": {"semitone_shift": -6.0} 2380 | } 2381 | }, 2382 | "child_character": { 2383 | "input": "Hello world! This is a test of the child voice character.", 2384 | "voice": "af_heart", 2385 | "speed": 1.0, 2386 | "effects": { 2387 | "voice_character": {"type": "child"} 2388 | } 2389 | }, 2390 | "monster_character": { 2391 | "input": "Hello world! 
This is a test of the monster voice character.", 2392 | "voice": "af_heart", 2393 | "speed": 1.0, 2394 | "effects": { 2395 | "voice_character": {"type": "monster"} 2396 | } 2397 | } 2398 | }, 2399 | "read_aloud_compatibility": { 2400 | "cors_enabled": True, 2401 | "supported_formats": ["mp3", "wav"], 2402 | "standard_endpoint": "/v1/audio/speech", 2403 | "example_request": { 2404 | "method": "POST", 2405 | "url": "http://localhost:5000/v1/audio/speech", 2406 | "headers": {"Content-Type": "application/json"}, 2407 | "body": { 2408 | "input": "Text to speak", 2409 | "voice": "af_heart", 2410 | "response_format": "mp3" 2411 | } 2412 | } 2413 | }, 2414 | "playback_features": { 2415 | "interrupt_support": "Stop playback at any time", 2416 | "session_tracking": "Unique session IDs for each playback", 2417 | "timing_info": "Get interrupted time and total duration", 2418 | "status_monitoring": "Real-time playback status" 2419 | } 2420 | }) 2421 | 2422 | # Handle OPTIONS requests for CORS preflight 2423 | @app.route('/v1/audio/speech', methods=['OPTIONS']) 2424 | @app.route('/v1/audio/speech/robust', methods=['OPTIONS']) 2425 | @app.route('/v1/audio/speech/stream', methods=['OPTIONS']) 2426 | @app.route('/v1/audio/speech/play', methods=['OPTIONS']) 2427 | @app.route('/v1/audio/speech/stop', methods=['OPTIONS']) 2428 | def handle_options(): 2429 | """Handle CORS preflight requests for read-aloud compatibility.""" 2430 | return '', 200, { 2431 | 'Access-Control-Allow-Origin': '*', 2432 | 'Access-Control-Allow-Methods': 'POST, GET, OPTIONS', 2433 | 'Access-Control-Allow-Headers': 'Content-Type, Authorization', 2434 | 'Access-Control-Max-Age': '86400' 2435 | } 2436 | 2437 | # --- Error Handlers --------------------------------------------------------- 2438 | 2439 | @app.errorhandler(404) 2440 | def not_found_error(error): 2441 | return jsonify({"error": "Endpoint not found"}), 404 2442 | 2443 | @app.errorhandler(500) 2444 | def internal_server_error(error): 2445 | logger.error(f"500 Internal Server Error: {error}", exc_info=True) 2446 | return jsonify({"error": "Internal server error"}), 500 2447 | 2448 | # --- Main ------------------------------------------------------------------- 2449 | 2450 | def main(): 2451 | logger.info("=" * 80) 2452 | logger.info("Complete Kokoro TTS API Server v3.3.0 - ENHANCED GPU LOGGING") 2453 | logger.info("=" * 80) 2454 | 2455 | # Enhanced GPU Information 2456 | logger.info("HARDWARE CONFIGURATION:") 2457 | logger.info(f" CUDA Available: {CUDA_AVAILABLE}") 2458 | if CUDA_AVAILABLE: 2459 | logger.info(f" GPU Device Count: {GPU_DEVICE_COUNT}") 2460 | logger.info(f" GPU Device Name: {GPU_DEVICE_NAME}") 2461 | memory_info = get_gpu_memory_info() 2462 | logger.info(f" GPU Memory Total: {memory_info['total']:.2f} GB") 2463 | logger.info(f" GPU Memory Available: {memory_info['free']:.2f} GB") 2464 | logger.info(f" PyTorch CUDA Version: {torch.version.cuda}") 2465 | else: 2466 | logger.info(" GPU Status: Not available - using CPU only") 2467 | 2468 | logger.info(f" PyTorch Version: {torch.__version__}") 2469 | logger.info("=" * 40) 2470 | 2471 | logger.info("APPLICATION CONFIGURATION:") 2472 | logger.info(f" Total Voices: {len(KOKORO_VOICES)}") 2473 | logger.info(f" Default Voice: {DEFAULT_VOICE}") 2474 | logger.info(f" CORS Enabled: {ENABLE_CORS}") 2475 | logger.info(f" Server Address: http://{API_HOST}:{API_PORT}") 2476 | logger.info(f" Markdown Support: {HAS_MARKDOWN}") 2477 | logger.info(f" Num2Words Support: {HAS_NUM2WORDS}") 2478 | logger.info("=" * 40) 2479 | 2480 | 
logger.info("KEY FEATURES:") 2481 | logger.info(" ✓ Enhanced GPU/CPU detection and logging") 2482 | logger.info(" ✓ Automatic GPU fallback to CPU on errors") 2483 | logger.info(" ✓ Real-time GPU memory monitoring") 2484 | logger.info(" ✓ Zero-default effects (clean slate approach)") 2485 | logger.info(" ✓ Playback control with interrupt capability") 2486 | logger.info(" ✓ Session management and status tracking") 2487 | logger.info(" ✓ Robust text processing and streaming") 2488 | logger.info(" ✓ Divide-by-zero safe calculations") 2489 | logger.info(" ✓ FIXED pitch shifting with librosa compatibility") 2490 | logger.info(" ✓ Full read-aloud extension compatibility") 2491 | logger.info("=" * 40) 2492 | 2493 | logger.info("PITCH SHIFTING EXAMPLES:") 2494 | logger.info(" Squeaky: {'effects': {'pitch': {'semitone_shift': 8.0}}}") 2495 | logger.info(" Deep: {'effects': {'pitch': {'semitone_shift': -6.0}}}") 2496 | logger.info(" Child: {'effects': {'voice_character': {'type': 'child'}}}") 2497 | logger.info(" Monster: {'effects': {'voice_character': {'type': 'monster'}}}") 2498 | logger.info("=" * 80) 2499 | 2500 | # Final GPU status check before starting server 2501 | log_gpu_status("server startup") 2502 | logger.info("Starting Flask server...") 2503 | 2504 | app.run(host=API_HOST, port=API_PORT, debug=False, threaded=True) 2505 | 2506 | if __name__ == "__main__": 2507 | main() 2508 | --------------------------------------------------------------------------------