├── .gitattributes ├── requirements.txt ├── requirementstest.txt ├── LICENSE ├── environment_backup.yml ├── run_voice_changer.bat ├── README.md └── kokoro_api.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nodeblackbox/Kokoro-Voice-Api/HEAD/requirements.txt -------------------------------------------------------------------------------- /requirementstest.txt: -------------------------------------------------------------------------------- 1 | # Core Dependencies 2 | flask>=2.3.0,<3.0.0 3 | flask-cors>=4.0.0,<5.0.0 4 | torch>=2.0.0,<3.0.0 5 | torchaudio>=2.0.0,<3.0.0 6 | 7 | # Kokoro TTS Model (install from source or pip if available) 8 | # kokoro>=0.1.0 # Uncomment if available via pip 9 | 10 | # Audio Processing 11 | librosa>=0.10.0,<1.0.0 12 | numpy>=1.24.0,<2.0.0 13 | scipy>=1.10.0,<2.0.0 14 | soundfile>=0.12.0,<1.0.0 15 | pydub>=0.25.0,<1.0.0 16 | resampy>=0.4.0,<1.0.0 17 | 18 | # Text Processing (Enhanced Features) 19 | num2words>=0.5.12,<1.0.0 20 | markdown>=3.4.0,<4.0.0 21 | unidecode>=1.3.0,<2.0.0 22 | regex>=2023.0.0,<2024.0.0 23 | 24 | # Web Server & API 25 | gunicorn>=21.0.0,<22.0.0 26 | requests>=2.31.0,<3.0.0 27 | 28 | # Utilities 29 | pyyaml>=6.0.0,<7.0.0 30 | 31 | # Optional Dependencies for Enhanced Features 32 | # Uncomment these for additional functionality: 33 | # markdown[extra]>=3.4.0,<4.0.0 # For advanced markdown processing 34 | # tinytag>=1.10.0,<2.0.0 # For audio file metadata 35 | # mutagen>=1.47.0,<2.0.0 # Alternative audio metadata library 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 NASA 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /environment_backup.yml: -------------------------------------------------------------------------------- 1 | name: randnameko3 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - bzip2=1.0.8=h2466b09_7 7 | - ca-certificates=2025.4.26=h4c7d964_0 8 | - libexpat=2.7.0=he0c23c2_0 9 | - libffi=3.4.6=h537db12_1 10 | - liblzma=5.8.1=h2466b09_2 11 | - libsqlite=3.50.1=h67fdade_0 12 | - libzlib=1.3.1=h2466b09_2 13 | - openssl=3.5.0=ha4e3fda_1 14 | - pip=25.1.1=pyh8b19718_0 15 | - python=3.12.11=h3f84c4b_0_cpython 16 | - setuptools=80.9.0=pyhff2d567_0 17 | - tk=8.6.13=h2c6b04d_2 18 | - tzdata=2025b=h78e105d_0 19 | - ucrt=10.0.22621.0=h57928b3_1 20 | - vc=14.3=h2b53caa_26 21 | - vc14_runtime=14.42.34438=hfd919c2_26 22 | - wheel=0.45.1=pyhd8ed1ab_1 23 | - pip: 24 | - certifi==2025.4.26 25 | - cffi==1.17.1 26 | - charset-normalizer==3.4.2 27 | - colorama==0.4.6 28 | - distro==1.9.0 29 | - en-core-web-sm==3.8.0 30 | - idna==3.10 31 | - numpy==2.2.6 32 | - packaging==25.0 33 | - platformdirs==4.3.8 34 | - pycparser==2.22 35 | - pydantic==2.11.5 36 | - pydantic-core==2.33.2 37 | - pysocks==1.7.1 38 | - spacy==3.8.7 39 | - tqdm==4.67.1 40 | - urllib3==2.4.0 41 | prefix: C:\Users\nasan\.conda\envs\randnameko3 42 | -------------------------------------------------------------------------------- /run_voice_changer.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | echo Starting Voice Changer API... 3 | echo. 4 | 5 | REM Check if conda is installed 6 | if not exist "C:\ProgramData\miniconda3\condabin\conda.bat" ( 7 | echo ERROR: Miniconda not found at C:\ProgramData\miniconda3 8 | echo Please install Miniconda or update the path 9 | pause 10 | exit /b 1 11 | ) 12 | 13 | REM Activate the specific environment 14 | call C:\ProgramData\miniconda3\condabin\conda.bat activate randnameko3 15 | if %ERRORLEVEL% neq 0 ( 16 | echo ERROR: Failed to activate environment randnameko3 17 | pause 18 | exit /b 1 19 | ) 20 | 21 | REM Check if PyTorch is installed and CUDA availability 22 | echo Checking PyTorch and CUDA... 23 | python -c "import torch; print('PyTorch version:', torch.__version__); print('CUDA available:', torch.cuda.is_available())" 2>nul 24 | if %ERRORLEVEL% neq 0 ( 25 | echo ERROR: PyTorch not found or import failed. Install with: pip install torch 26 | pause 27 | exit /b 1 28 | ) 29 | 30 | REM Run the voice changer API 31 | echo Starting voicechangerapiV8.py... 32 | echo. 33 | if not exist "voicechangerapiV8.py" ( 34 | echo ERROR: voicechangerapiV8.py not found in current directory 35 | pause 36 | exit /b 1 37 | ) 38 | python voicechangerapiV8.py 39 | if %ERRORLEVEL% neq 0 ( 40 | echo ERROR: voicechangerapiV8.py failed to run 41 | pause 42 | exit /b 1 43 | ) 44 | 45 | REM Keep the window open if the script exits 46 | echo. 47 | echo Script ended. Press any key to close this window... 
48 | pause >nul 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🎙️ Complete Kokoro TTS API 2 | 3 | [![Python](https://img.shields.io/badge/python-3.8+-blue.svg)](https://python.org) 4 | [![Flask](https://img.shields.io/badge/flask-2.3+-green.svg)](https://flask.palletsprojects.com) 5 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 6 | [![PyTorch](https://img.shields.io/badge/PyTorch-2.0+-red.svg)](https://pytorch.org) 7 | 8 | > **A lightning-fast, production-grade text-to-speech server with OpenAI-style quality, robust text processing, and accessibility-first design. Processing time: ~1 second with nearly instant output.** 9 | 10 | *Created by [nodeblackbox](https://github.com/nodeblackbox) - Making accessibility available for everyone.* 11 | 12 | ## ✨ Overview 13 | 14 | The Complete Kokoro TTS API delivers **OpenAI-style text-to-speech quality** with exceptional performance and accessibility features. Designed with a commitment that **accessibility should be for everyone**, this API provides crystal-clear voices especially suitable for dyslexic users and assistive technology integration. 15 | 16 | **🚀 Performance Highlights:** 17 | - **~1 second total processing time** 18 | - **Nearly instant audio output** 19 | - **GPU acceleration available** 20 | - **Real-time streaming capabilities** 21 | 22 | ## 🎯 Accessibility & Integration 23 | 24 | ### 🔗 Read Aloud Chrome Extension Integration 25 | 26 | This API seamlessly integrates with the **[Read Aloud](https://chromewebstore.google.com/detail/read-aloud-a-text-to-spee/hdhinadidafjejdhmfkjgnolgimiaplp)** Chrome extension, providing an excellent solution for dyslexic users and anyone who benefits from text-to-speech technology. 27 | 28 | **Setup Instructions:** 29 | 30 | 1. **Install the Extension**: Add [Read Aloud](https://chromewebstore.google.com/detail/read-aloud-a-text-to-spee/hdhinadidafjejdhmfkjgnolgimiaplp) to Chrome 31 | 2. **Configure API Endpoint**: `http://127.0.0.1:5000/v1` 32 | 3. **API Key**: `your-secret-key` 33 | 4. 
**Select from 28 High-Quality Voices** (see voice configuration below) 34 | 35 | ### 🎤 Available Voices for Read Aloud 36 | 37 | ```json 38 | [ 39 | { "lang": "en-US", "model": "tts-1", "voice": "af_heart" }, 40 | { "lang": "en-US", "model": "tts-1", "voice": "af_bella" }, 41 | { "lang": "en-US", "model": "tts-1", "voice": "af_nicole" }, 42 | { "lang": "en-US", "model": "tts-1", "voice": "af_aoede" }, 43 | { "lang": "en-US", "model": "tts-1", "voice": "af_kore" }, 44 | { "lang": "en-US", "model": "tts-1", "voice": "af_sarah" }, 45 | { "lang": "en-US", "model": "tts-1", "voice": "af_nova" }, 46 | { "lang": "en-US", "model": "tts-1", "voice": "af_sky" }, 47 | { "lang": "en-US", "model": "tts-1", "voice": "af_alloy" }, 48 | { "lang": "en-US", "model": "tts-1", "voice": "af_jessica" }, 49 | { "lang": "en-US", "model": "tts-1", "voice": "af_river" }, 50 | { "lang": "en-US", "model": "tts-1", "voice": "am_michael" }, 51 | { "lang": "en-US", "model": "tts-1", "voice": "am_fenrir" }, 52 | { "lang": "en-US", "model": "tts-1", "voice": "am_puck" }, 53 | { "lang": "en-US", "model": "tts-1", "voice": "am_echo" }, 54 | { "lang": "en-US", "model": "tts-1", "voice": "am_eric" }, 55 | { "lang": "en-US", "model": "tts-1", "voice": "am_liam" }, 56 | { "lang": "en-US", "model": "tts-1", "voice": "am_onyx" }, 57 | { "lang": "en-US", "model": "tts-1", "voice": "am_santa" }, 58 | { "lang": "en-US", "model": "tts-1", "voice": "am_adam" }, 59 | { "lang": "en-GB", "model": "tts-1", "voice": "bf_emma" }, 60 | { "lang": "en-GB", "model": "tts-1", "voice": "bf_isabella" }, 61 | { "lang": "en-GB", "model": "tts-1", "voice": "bf_alice" }, 62 | { "lang": "en-GB", "model": "tts-1", "voice": "bf_lily" }, 63 | { "lang": "en-GB", "model": "tts-1", "voice": "bm_george" }, 64 | { "lang": "en-GB", "model": "tts-1", "voice": "bm_fable" }, 65 | { "lang": "en-GB", "model": "tts-1", "voice": "bm_lewis" }, 66 | { "lang": "en-GB", "model": "tts-1", "voice": "bm_daniel" } 67 | ] 68 | ``` 69 | 70 | ## 🚀 Features 71 | 72 | ### Core Features 73 | - **🔧 Robust Text Processing**: Intelligent handling of markdown, Unicode characters, numbers, abbreviations, and special formatting 74 | - **⚡ Ultra-Fast Performance**: ~1 second total processing with nearly instant output 75 | - **🎚️ Zero-Default Effects**: Clean audio output with effects only when explicitly configured 76 | - **🎵 Local Playback Control**: Built-in audio playback with interrupt capability and session management 77 | - **📡 Real-time Streaming**: Live audio streaming support for compatible clients 78 | - **🎼 Advanced Audio Effects**: FIXED and robust pitch shifting with librosa compatibility 79 | - **🌐 Browser Integration**: Full CORS support for browser extensions and web applications 80 | - **♿ Accessibility First**: Crystal-clear voices optimized for dyslexic users and assistive technology 81 | 82 | ### Audio Processing 83 | - **28 High-quality voices** (20 US English, 8 British English) 84 | - **OpenAI-style TTS quality** with superior clarity 85 | - Various audio format outputs 86 | - Pitch shifting and formant modification 87 | - Dynamic range compression 88 | - Professional-grade audio processing 89 | 90 | ### Developer Experience 91 | - RESTful API design 92 | - OpenAPI specification 93 | - Comprehensive error handling 94 | - Session-based playback management 95 | - Easy integration with existing applications 96 | - Chrome extension compatibility 97 | 98 | ## 📋 Table of Contents 99 | 100 | - [🚀 Features](#-features) 101 | - [⚡ Quick Start](#-quick-start) 102 | - [📦 
Installation](#-installation) 103 | - [🔧 Configuration](#-configuration) 104 | - [📡 API Endpoints](#-api-endpoints) 105 | - [💡 Usage Examples](#-usage-examples) 106 | - [🎨 Audio Effects](#-audio-effects) 107 | - [📚 API Documentation](#-api-documentation) 108 | - [🤝 Contributing](#-contributing) 109 | - [📄 License](#-license) 110 | 111 | ## ⚡ Quick Start 112 | 113 | ```bash 114 | # Clone the repository 115 | git clone https://github.com/your-username/Kokoro-Voice-Api.git 116 | cd Kokoro-Voice-Api 117 | 118 | # Install dependencies 119 | pip install -r requirements.txt 120 | 121 | # Run the server 122 | python kokoro_api.py 123 | 124 | # Test the API 125 | curl -X POST \ 126 | -H "Content-Type: application/json" \ 127 | -d '{"input": "Hello, world!", "voice": "af_heart"}' \ 128 | http://localhost:5000/v1/audio/speech 129 | ``` 130 | 131 | ## 📦 Installation 132 | 133 | ### Prerequisites 134 | - **Python 3.8+** (recommended: Python 3.10+) 135 | - **PyTorch** with CUDA support (optional, for GPU acceleration) 136 | - **Git** for cloning the repository 137 | 138 | ### Step-by-Step Installation 139 | 140 | 1. **Clone the Repository** 141 | ```bash 142 | git clone https://github.com/your-username/Kokoro-Voice-Api.git 143 | cd Kokoro-Voice-Api 144 | ``` 145 | 146 | 2. **Create Virtual Environment** (Recommended) 147 | ```bash 148 | python -m venv venv 149 | 150 | # On Windows 151 | venv\Scripts\activate 152 | 153 | # On macOS/Linux 154 | source venv/bin/activate 155 | ``` 156 | 157 | 3. **Install Dependencies** 158 | ```bash 159 | pip install -r requirements.txt 160 | ``` 161 | 162 | 4. **Verify Installation** 163 | ```bash 164 | python -c "import torch, librosa, flask; print('✅ All dependencies installed successfully!')" 165 | ``` 166 | 167 | ## 🔧 Configuration 168 | 169 | ### Environment Variables 170 | ```bash 171 | # Server Configuration 172 | export TTS_HOST=0.0.0.0 173 | export TTS_PORT=5000 174 | export TTS_DEBUG=false 175 | 176 | # Audio Configuration 177 | export TTS_SAMPLE_RATE=22050 178 | export TTS_AUDIO_FORMAT=wav 179 | 180 | # Performance 181 | export TTS_MAX_TEXT_LENGTH=1000 182 | export TTS_CACHE_SIZE=100 183 | ``` 184 | 185 | ### Configuration File 186 | Create a `config.yaml` file in the project root: 187 | ```yaml 188 | server: 189 | host: "0.0.0.0" 190 | port: 5000 191 | debug: false 192 | 193 | audio: 194 | sample_rate: 22050 195 | format: "wav" 196 | quality: "high" 197 | 198 | processing: 199 | max_text_length: 1000 200 | cache_enabled: true 201 | cache_size: 100 202 | ``` 203 | 204 | ## 📡 API Endpoints 205 | 206 | ### Speech Generation 207 | 208 | | Endpoint | Method | Description | 209 | |----------|--------|-------------| 210 | | `/v1/audio/speech` | POST | Standard speech generation with clean zero-default effects | 211 | | `/v1/audio/speech/robust` | POST | Enhanced speech generation with advanced text processing | 212 | | `/v1/audio/speech/stream` | POST | Real-time streaming speech generation | 213 | 214 | ### Playback Control 215 | 216 | | Endpoint | Method | Description | 217 | |----------|--------|-------------| 218 | | `/v1/audio/speech/play` | POST | Local playback with session control | 219 | | `/v1/audio/speech/stop` | POST | Stop/interrupt current playback | 220 | | `/v1/audio/speech/status` | GET | Get current playback status | 221 | 222 | ### System Information 223 | 224 | | Endpoint | Method | Description | 225 | |----------|--------|-------------| 226 | | `/health` | GET | Health check endpoint | 227 | | `/voices` | GET | List available voices | 228 | | 
`/version` | GET | API version information | 229 | 230 | ## 💡 Usage Examples 231 | 232 | ### Basic Text-to-Speech 233 | ```bash 234 | curl -X POST \ 235 | -H "Content-Type: application/json" \ 236 | -d '{ 237 | "input": "Hello, world! This is a test of the Kokoro TTS API.", 238 | "voice": "af_heart" 239 | }' \ 240 | http://localhost:5000/v1/audio/speech \ 241 | --output hello.wav 242 | ``` 243 | 244 | ### Squeaky Voice Effect 245 | ```bash 246 | curl -X POST \ 247 | -H "Content-Type: application/json" \ 248 | -d '{ 249 | "input": "I sound like a chipmunk!", 250 | "voice": "af_heart", 251 | "effects": { 252 | "pitch": { 253 | "semitone_shift": 8.0 254 | } 255 | } 256 | }' \ 257 | http://localhost:5000/v1/audio/speech \ 258 | --output squeaky.wav 259 | ``` 260 | 261 | ### Deep Voice Effect 262 | ```bash 263 | curl -X POST \ 264 | -H "Content-Type: application/json" \ 265 | -d '{ 266 | "input": "I have a very deep voice now.", 267 | "voice": "af_heart", 268 | "effects": { 269 | "pitch": { 270 | "semitone_shift": -6.0 271 | } 272 | } 273 | }' \ 274 | http://localhost:5000/v1/audio/speech \ 275 | --output deep.wav 276 | ``` 277 | 278 | ### Robust Text Processing 279 | ```bash 280 | curl -X POST \ 281 | -H "Content-Type: application/json" \ 282 | -d '{ 283 | "input": "Process this: **bold text**, _italic_, numbers: 123, $50.99, and 50% off!", 284 | "voice": "af_heart", 285 | "robust_processing": true 286 | }' \ 287 | http://localhost:5000/v1/audio/speech/robust \ 288 | --output processed.wav 289 | ``` 290 | 291 | ### Streaming Audio 292 | ```bash 293 | curl -X POST \ 294 | -H "Content-Type: application/json" \ 295 | -d '{ 296 | "input": "This will be streamed in real-time as it is generated.", 297 | "voice": "af_heart", 298 | "stream": true 299 | }' \ 300 | http://localhost:5000/v1/audio/speech/stream \ 301 | --output stream.wav 302 | ``` 303 | 304 | ### Local Playback Control 305 | ```bash 306 | # Start playback 307 | curl -X POST \ 308 | -H "Content-Type: application/json" \ 309 | -d '{ 310 | "input": "This will play locally on the server.", 311 | "voice": "af_heart", 312 | "session_id": "my-session" 313 | }' \ 314 | http://localhost:5000/v1/audio/speech/play 315 | 316 | # Check status 317 | curl http://localhost:5000/v1/audio/speech/status 318 | 319 | # Stop playback 320 | curl -X POST \ 321 | -H "Content-Type: application/json" \ 322 | -d '{"session_id": "my-session"}' \ 323 | http://localhost:5000/v1/audio/speech/stop 324 | ``` 325 | 326 | ## 🎨 Audio Effects 327 | 328 | ### Pitch Modification 329 | ```json 330 | { 331 | "effects": { 332 | "pitch": { 333 | "semitone_shift": 4.0, // Shift by semitones (-12 to +12) 334 | "preserve_formants": true // Maintain voice character 335 | } 336 | } 337 | } 338 | ``` 339 | 340 | ### Dynamic Range Compression 341 | ```json 342 | { 343 | "effects": { 344 | "compression": { 345 | "ratio": 4.0, // Compression ratio 346 | "threshold": -20.0, // Threshold in dB 347 | "attack": 0.003, // Attack time in seconds 348 | "release": 0.1 // Release time in seconds 349 | } 350 | } 351 | } 352 | ``` 353 | 354 | ### Multiple Effects 355 | ```json 356 | { 357 | "effects": { 358 | "pitch": { 359 | "semitone_shift": 2.0 360 | }, 361 | "compression": { 362 | "ratio": 2.0, 363 | "threshold": -18.0 364 | }, 365 | "reverb": { 366 | "room_size": 0.3, 367 | "damping": 0.5, 368 | "wet_level": 0.2 369 | } 370 | } 371 | } 372 | ``` 373 | 374 | ## 🛠️ Development 375 | 376 | ### Running in Development Mode 377 | ```bash 378 | # Enable debug mode 379 | export FLASK_ENV=development 380 
| export TTS_DEBUG=true 381 | 382 | # Run with auto-reload 383 | python kokoro_api.py 384 | ``` 385 | 386 | ### Running Tests 387 | ```bash 388 | # Install test dependencies 389 | pip install pytest pytest-cov 390 | 391 | # Run tests 392 | pytest tests/ -v --cov=kokoro_api 393 | ``` 394 | 395 | ### Docker Support 396 | ```bash 397 | # Build Docker image 398 | docker build -t kokoro-tts-api . 399 | 400 | # Run container 401 | docker run -p 5000:5000 kokoro-tts-api 402 | ``` 403 | 404 | ## 📚 API Documentation 405 | 406 | ### OpenAPI Specification 407 | The complete API documentation is available in OpenAPI format: 408 | - **Specification File**: [`openapi.yaml`](openapi.yaml) 409 | - **Interactive Documentation**: Visit `/docs` when the server is running 410 | - **Redoc Documentation**: Visit `/redoc` when the server is running 411 | 412 | ### Response Formats 413 | All endpoints return standardized responses: 414 | 415 | **Success Response:** 416 | ```json 417 | { 418 | "success": true, 419 | "data": { 420 | "audio_url": "/generated/audio.wav", 421 | "duration": 2.5, 422 | "sample_rate": 22050 423 | }, 424 | "metadata": { 425 | "voice": "af_heart", 426 | "effects_applied": ["pitch_shift"], 427 | "processing_time": 0.85 428 | } 429 | } 430 | ``` 431 | 432 | **Error Response:** 433 | ```json 434 | { 435 | "success": false, 436 | "error": { 437 | "code": "INVALID_VOICE", 438 | "message": "The specified voice 'invalid_voice' is not available", 439 | "details": { 440 | "available_voices": ["af_heart", "af_bella", "af_sarah"] 441 | } 442 | } 443 | } 444 | ``` 445 | 446 | ## 🤝 Contributing 447 | 448 | We welcome contributions! Please follow these steps: 449 | 450 | 1. **Fork the Repository** on GitHub, then clone your fork 451 | ```bash 452 | git clone https://github.com/your-username/Kokoro-Voice-Api.git 453 | ``` 454 | 455 | 2. **Create a Feature Branch** 456 | ```bash 457 | git checkout -b feature/amazing-new-feature 458 | ``` 459 | 460 | 3. **Make Your Changes** 461 | - Follow PEP 8 style guidelines 462 | - Add tests for new functionality 463 | - Update documentation as needed 464 | 465 | 4. **Run Tests** 466 | ```bash 467 | pytest tests/ -v 468 | black kokoro_api.py 469 | flake8 kokoro_api.py 470 | ``` 471 | 472 | 5. 
**Submit a Pull Request** 473 | - Provide a clear description of your changes 474 | - Reference any related issues 475 | - Ensure all tests pass 476 | 477 | ### Development Guidelines 478 | - **Code Style**: Follow PEP 8 and use `black` for formatting 479 | - **Testing**: Maintain >90% test coverage 480 | - **Documentation**: Update docstrings and README for new features 481 | - **Performance**: Profile code for optimization opportunities 482 | 483 | ## 🔒 Security 484 | 485 | - **API Keys**: Never hardcode API keys; use environment variables 486 | - **Input Validation**: All inputs are sanitized and validated 487 | - **Rate Limiting**: Built-in rate limiting to prevent abuse 488 | - **CORS**: Configurable CORS settings for web integration 489 | 490 | ## 📊 Performance 491 | 492 | ### Benchmarks 493 | - **Average Response Time**: ~1 second for 50-word text 494 | - **Output Latency**: Nearly instant audio delivery 495 | - **Concurrent Requests**: Supports up to 10 simultaneous requests 496 | - **Memory Usage**: ~200MB baseline + ~50MB per active session 497 | - **GPU Acceleration**: 3x faster processing with CUDA-enabled PyTorch 498 | - **Voice Quality**: OpenAI-comparable clarity and naturalness 499 | 500 | ### Optimization Tips 501 | - Use GPU acceleration when available for fastest processing 502 | - Enable caching for repeated requests 503 | - Batch multiple requests when possible 504 | - Use streaming for long-form content 505 | - Perfect for real-time applications and accessibility tools 506 | 507 | ## 🌟 Accessibility Statement 508 | 509 | **We believe accessibility should be for everyone.** This API is specifically designed with dyslexic users and assistive technology in mind, providing: 510 | 511 | - **Crystal-clear voice quality** optimized for comprehension 512 | - **Multiple accent options** (US and British English) 513 | - **Fast processing** for responsive user experience 514 | - **Browser extension compatibility** for seamless web integration 515 | - **Professional-grade audio** without distortion or artifacts 516 | 517 | ### Perfect for: 518 | - 📚 **Dyslexic students and professionals** 519 | - 👩‍🦯 **Users with visual impairments** 520 | - 🧠 **People with learning differences** 521 | - 👥 **Anyone who benefits from audio content** 522 | - 🌐 **Web accessibility implementations** 523 | 524 | ## 🙏 Acknowledgments 525 | 526 | - **[nodeblackbox](https://github.com/nodeblackbox)** - Project creator and maintainer 527 | - **Kokoro TTS Team** for the underlying neural TTS technology 528 | - **PyTorch Team** for the deep learning framework 529 | - **Librosa Contributors** for audio processing capabilities 530 | - **Flask Community** for the web framework 531 | - **Accessibility advocates** who inspire inclusive technology 532 | 533 | --- 534 | 535 |
536 | 537 | **[⬆ Back to Top](#-complete-kokoro-tts-api)** 538 | 539 | Made with ❤️ for accessibility by [nodeblackbox](https://github.com/nodeblackbox) 540 | 541 | *"Accessibility should be for everyone"* 542 | 543 |
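
### 🐍 Python Client Example

The curl examples above translate directly to Python. The snippet below is a minimal sketch using the `requests` package (already listed in `requirements.txt`); it assumes the server started with `python kokoro_api.py` is listening on the default `http://localhost:5000`, and the output filename is arbitrary.

```python
import requests

# Request speech for a short sentence using one of the built-in voices.
payload = {
    "input": "Hello from the Python client!",
    "voice": "af_heart",
}

response = requests.post(
    "http://localhost:5000/v1/audio/speech",
    json=payload,
    timeout=60,
)
response.raise_for_status()

# The endpoint returns the audio bytes in the response body (as in the
# curl examples that use --output); write them straight to a WAV file.
with open("hello.wav", "wb") as f:
    f.write(response.content)

print(f"Saved {len(response.content)} bytes to hello.wav")
```

The same pattern applies to the `/v1/audio/speech/robust` and `/v1/audio/speech/stream` endpoints; only the URL and the payload fields change.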
544 | -------------------------------------------------------------------------------- /kokoro_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from kokoro import KModel, KPipeline 4 | import scipy.io.wavfile as wavfile 5 | from pydub import AudioSegment 6 | import tempfile 7 | import logging 8 | import io 9 | import html 10 | import unicodedata 11 | from typing import Dict, Optional, Tuple, List, Any, Union 12 | from functools import wraps 13 | import numpy as np 14 | import librosa 15 | import librosa.display 16 | from scipy import signal 17 | import math 18 | import re 19 | from datetime import datetime 20 | from dataclasses import dataclass 21 | from enum import Enum 22 | import hashlib 23 | import threading 24 | from concurrent.futures import ThreadPoolExecutor 25 | import time 26 | 27 | # Flask imports 28 | from flask import Flask, request, Response, jsonify, stream_with_context 29 | from flask_cors import CORS 30 | 31 | # Additional imports for enhanced text processing 32 | try: 33 | import num2words 34 | HAS_NUM2WORDS = True 35 | except ImportError: 36 | HAS_NUM2WORDS = False 37 | logging.warning("num2words not available - using basic number processing") 38 | 39 | try: 40 | import markdown 41 | from markdown.extensions import codehilite, fenced_code, tables 42 | HAS_MARKDOWN = True 43 | except ImportError: 44 | HAS_MARKDOWN = False 45 | logging.warning("markdown not available - using regex-based processing") 46 | 47 | # --- Configuration ---------------------------------------------------------- 48 | 49 | KOKORO_VOICES = { 50 | 'af_heart': {'lang': 'en-US', 'gender': 'female', 'description': 'Heart ❤️'}, 51 | 'af_bella': {'lang': 'en-US', 'gender': 'female', 'description': 'Bella 🔥'}, 52 | 'af_nicole': {'lang': 'en-US', 'gender': 'female', 'description': 'Nicole 🎧'}, 53 | 'af_aoede': {'lang': 'en-US', 'gender': 'female', 'description': 'Aoede'}, 54 | 'af_kore': {'lang': 'en-US', 'gender': 'female', 'description': 'Kore'}, 55 | 'af_sarah': {'lang': 'en-US', 'gender': 'female', 'description': 'Sarah'}, 56 | 'af_nova': {'lang': 'en-US', 'gender': 'female', 'description': 'Nova'}, 57 | 'af_sky': {'lang': 'en-US', 'gender': 'female', 'description': 'Sky'}, 58 | 'af_alloy': {'lang': 'en-US', 'gender': 'female', 'description': 'Alloy'}, 59 | 'af_jessica': {'lang': 'en-US', 'gender': 'female', 'description': 'Jessica'}, 60 | 'af_river': {'lang': 'en-US', 'gender': 'female', 'description': 'River'}, 61 | 'am_michael': {'lang': 'en-US', 'gender': 'male', 'description': 'Michael'}, 62 | 'am_fenrir': {'lang': 'en-US', 'gender': 'male', 'description': 'Fenrir'}, 63 | 'am_puck': {'lang': 'en-US', 'gender': 'male', 'description': 'Puck'}, 64 | 'am_echo': {'lang': 'en-US', 'gender': 'male', 'description': 'Echo'}, 65 | 'am_eric': {'lang': 'en-US', 'gender': 'male', 'description': 'Eric'}, 66 | 'am_liam': {'lang': 'en-US', 'gender': 'male', 'description': 'Liam'}, 67 | 'am_onyx': {'lang': 'en-US', 'gender': 'male', 'description': 'Onyx'}, 68 | 'am_santa': {'lang': 'en-US', 'gender': 'male', 'description': 'Santa'}, 69 | 'am_adam': {'lang': 'en-US', 'gender': 'male', 'description': 'Adam'}, 70 | 'bf_emma': {'lang': 'en-GB', 'gender': 'female', 'description': 'Emma'}, 71 | 'bf_isabella': {'lang': 'en-GB', 'gender': 'female', 'description': 'Isabella'}, 72 | 'bf_alice': {'lang': 'en-GB', 'gender': 'female', 'description': 'Alice'}, 73 | 'bf_lily': {'lang': 'en-GB', 'gender': 'female', 'description': 'Lily'}, 74 | 'bm_george': 
{'lang': 'en-GB', 'gender': 'male', 'description': 'George'}, 75 | 'bm_fable': {'lang': 'en-GB', 'gender': 'male', 'description': 'Fable'}, 76 | 'bm_lewis': {'lang': 'en-GB', 'gender': 'male', 'description': 'Lewis'}, 77 | 'bm_daniel': {'lang': 'en-GB', 'gender': 'male', 'description': 'Daniel'}, 78 | } 79 | 80 | DEFAULT_EFFECT_ORDER = [ 81 | 'volume', 'equalizer', 'compression', 'distortion', 'pitch', 82 | 'formant', 'voice_character', 'reverb' 83 | ] 84 | 85 | DEFAULT_VOICE = 'af_heart' 86 | SAMPLE_RATE = 24000 87 | API_PORT = 5000 88 | API_HOST = '0.0.0.0' 89 | 90 | ENABLE_CORS = os.getenv('ENABLE_CORS', 'true').lower() == 'true' 91 | LOG_AUTH_ATTEMPTS = True 92 | 93 | # Enhanced configuration 94 | MAX_TEXT_LENGTH = 10000 95 | DEFAULT_CHUNK_SIZE = 400 96 | MIN_CHUNK_SIZE = 50 97 | MAX_CHUNKS = 50 98 | CACHE_SIZE = 1000 99 | 100 | # --- Setup logging ---------------------------------------------------------- 101 | 102 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 103 | logger = logging.getLogger(__name__) 104 | 105 | # --- GPU setup -------------------------------------------------------------- 106 | 107 | CUDA_AVAILABLE = torch.cuda.is_available() 108 | GPU_DEVICE_COUNT = torch.cuda.device_count() if CUDA_AVAILABLE else 0 109 | GPU_DEVICE_NAME = torch.cuda.get_device_name(0) if CUDA_AVAILABLE else "N/A" 110 | 111 | def get_gpu_memory_info(): 112 | """Get GPU memory information if CUDA is available.""" 113 | if not CUDA_AVAILABLE: 114 | return {"total": 0, "allocated": 0, "free": 0} 115 | 116 | try: 117 | total_memory = torch.cuda.get_device_properties(0).total_memory 118 | allocated_memory = torch.cuda.memory_allocated(0) 119 | free_memory = total_memory - allocated_memory 120 | 121 | return { 122 | "total": total_memory / (1024**3), # Convert to GB 123 | "allocated": allocated_memory / (1024**3), 124 | "free": free_memory / (1024**3) 125 | } 126 | except Exception as e: 127 | logger.warning(f"Failed to get GPU memory info: {e}") 128 | return {"total": 0, "allocated": 0, "free": 0} 129 | 130 | # Log detailed GPU information 131 | logger.info(f"CUDA Available: {CUDA_AVAILABLE}") 132 | if CUDA_AVAILABLE: 133 | logger.info(f"GPU Device Count: {GPU_DEVICE_COUNT}") 134 | logger.info(f"GPU Device Name: {GPU_DEVICE_NAME}") 135 | memory_info = get_gpu_memory_info() 136 | logger.info(f"GPU Memory - Total: {memory_info['total']:.2f} GB, Available: {memory_info['free']:.2f} GB") 137 | else: 138 | logger.warning("CUDA not available - will use CPU for inference (slower)") 139 | 140 | # Function to log current GPU usage 141 | def log_gpu_status(context: str = ""): 142 | """Log current GPU usage status.""" 143 | if CUDA_AVAILABLE: 144 | memory_info = get_gpu_memory_info() 145 | logger.info(f"GPU Status {context}: {memory_info['allocated']:.2f} GB used / {memory_info['total']:.2f} GB total") 146 | else: 147 | logger.info(f"GPU Status {context}: Using CPU (CUDA not available)") 148 | 149 | # --- Audio Playback Control ------------------------------------------------- 150 | 151 | class AudioPlaybackController: 152 | """Global controller for managing audio playback state and interruptions.""" 153 | 154 | def __init__(self): 155 | self.is_playing = False 156 | self.should_stop = False 157 | self.current_session_id = None 158 | self.playback_start_time = None 159 | self.interrupted_at = None 160 | self.total_duration = None 161 | self.lock = threading.Lock() 162 | 163 | def start_playback(self, session_id: str, duration: float): 164 | """Start a new playback 
session.""" 165 | with self.lock: 166 | self.is_playing = True 167 | self.should_stop = False 168 | self.current_session_id = session_id 169 | self.playback_start_time = time.time() 170 | self.interrupted_at = None 171 | self.total_duration = duration 172 | logger.info(f"Started playback session: {session_id}, duration: {duration:.2f}s") 173 | 174 | def stop_playback(self, session_id: str = None): 175 | """Stop the current playback session.""" 176 | with self.lock: 177 | if session_id and session_id != self.current_session_id: 178 | return False # Wrong session 179 | 180 | if self.is_playing: 181 | current_time = time.time() 182 | self.interrupted_at = current_time - self.playback_start_time if self.playback_start_time else 0 183 | logger.info(f"Stopped playback session: {self.current_session_id}, interrupted at: {self.interrupted_at:.2f}s") 184 | 185 | self.is_playing = False 186 | self.should_stop = True 187 | return True 188 | 189 | def finish_playback(self): 190 | """Mark playback as naturally finished.""" 191 | with self.lock: 192 | self.is_playing = False 193 | self.should_stop = False 194 | self.interrupted_at = None 195 | logger.info(f"Finished playback session: {self.current_session_id}") 196 | 197 | def get_status(self): 198 | """Get current playback status.""" 199 | with self.lock: 200 | current_time = time.time() 201 | elapsed = current_time - self.playback_start_time if self.playback_start_time else 0 202 | 203 | return { 204 | "is_playing": self.is_playing, 205 | "session_id": self.current_session_id, 206 | "elapsed_time": elapsed, 207 | "total_duration": self.total_duration, 208 | "interrupted_at": self.interrupted_at, 209 | "should_stop": self.should_stop 210 | } 211 | 212 | # Global playback controller 213 | playback_controller = AudioPlaybackController() 214 | 215 | # --- Enhanced Text Processing Classes --------------------------------------- 216 | 217 | @dataclass 218 | class TextChunk: 219 | """Represents a processed text chunk with metadata.""" 220 | text: str 221 | original_text: str 222 | chunk_id: int 223 | total_chunks: int 224 | processing_time: float = 0.0 225 | char_count: int = 0 226 | 227 | def __post_init__(self): 228 | self.char_count = len(self.text) 229 | 230 | class TextProcessingMode(Enum): 231 | """Text processing modes for different input types.""" 232 | PLAIN = "plain" 233 | MARKDOWN = "markdown" 234 | HTML = "html" 235 | SSML = "ssml" 236 | 237 | class ProductionTextProcessor: 238 | """Production-grade text processor with comprehensive normalization.""" 239 | 240 | def __init__(self): 241 | # Enhanced character replacements (comprehensive Unicode mapping) 242 | self.char_replacements = { 243 | # Smart quotes and apostrophes 244 | '"': '"', '"': '"', ''': "'", ''': "'", 245 | '‚': ',', '„': '"', '‹': '<', '›': '>', 246 | '«': '"', '»': '"', 247 | 248 | # Dashes and hyphens 249 | '–': '-', '—': '-', '―': '-', '‒': '-', 250 | 251 | # Mathematical and special symbols 252 | '×': ' times ', '÷': ' divided by ', '±': ' plus or minus ', 253 | '≤': ' less than or equal to ', '≥': ' greater than or equal to ', 254 | '≠': ' not equal to ', '≈': ' approximately ', 255 | '∞': ' infinity ', '√': ' square root of ', 256 | 257 | # Currency symbols 258 | '€': ' euros ', '£': ' pounds ', '¥': ' yen ', 259 | '₹': ' rupees ', '₽': ' rubles ', '₩': ' won ', 260 | 261 | # Other symbols 262 | '©': ' copyright ', '®': ' registered ', '™': ' trademark ', 263 | '§': ' section ', '¶': ' paragraph ', '†': ' dagger ', 264 | '‡': ' double dagger ', '•': ' bullet ', '‰': ' per 
mille ', 265 | '…': '...', '⋯': '...', '⋮': '...', 266 | 267 | # Fractions 268 | '½': ' one half ', '⅓': ' one third ', '⅔': ' two thirds ', 269 | '¼': ' one quarter ', '¾': ' three quarters ', '⅕': ' one fifth ', 270 | '⅖': ' two fifths ', '⅗': ' three fifths ', '⅘': ' four fifths ', 271 | '⅙': ' one sixth ', '⅚': ' five sixths ', '⅛': ' one eighth ', 272 | '⅜': ' three eighths ', '⅝': ' five eighths ', '⅞': ' seven eighths ', 273 | } 274 | 275 | # Context-aware abbreviations with disambiguation 276 | self.abbreviations = { 277 | # Titles 278 | 'Dr.': {'default': 'Doctor', 'context': {'street': 'Drive'}}, 279 | 'Mr.': {'default': 'Mister'}, 280 | 'Mrs.': {'default': 'Missus'}, 281 | 'Ms.': {'default': 'Miss'}, 282 | 'Prof.': {'default': 'Professor'}, 283 | 284 | # Places and directions 285 | 'St.': {'default': 'Saint', 'context': {'address': 'Street'}}, 286 | 'Ave.': {'default': 'Avenue'}, 287 | 'Blvd.': {'default': 'Boulevard'}, 288 | 'Rd.': {'default': 'Road'}, 289 | 'Ln.': {'default': 'Lane'}, 290 | 'Ct.': {'default': 'Court'}, 291 | 'Pl.': {'default': 'Place'}, 292 | 'Sq.': {'default': 'Square'}, 293 | 'N.': {'default': 'North', 'context': {'name': 'N'}}, 294 | 'S.': {'default': 'South', 'context': {'name': 'S'}}, 295 | 'E.': {'default': 'East', 'context': {'name': 'E'}}, 296 | 'W.': {'default': 'West', 'context': {'name': 'W'}}, 297 | 298 | # Common abbreviations 299 | 'etc.': {'default': 'etcetera'}, 300 | 'vs.': {'default': 'versus'}, 301 | 'e.g.': {'default': 'for example'}, 302 | 'i.e.': {'default': 'that is'}, 303 | 'cf.': {'default': 'compare'}, 304 | 'et al.': {'default': 'and others'}, 305 | 'ibid.': {'default': 'in the same place'}, 306 | 'op. cit.': {'default': 'in the work cited'}, 307 | 308 | # Business 309 | 'Inc.': {'default': 'Incorporated'}, 310 | 'Corp.': {'default': 'Corporation'}, 311 | 'Ltd.': {'default': 'Limited'}, 312 | 'Co.': {'default': 'Company'}, 313 | 'LLC': {'default': 'Limited Liability Company'}, 314 | 'LLP': {'default': 'Limited Liability Partnership'}, 315 | 316 | # Time and dates 317 | 'Jan.': {'default': 'January'}, 318 | 'Feb.': {'default': 'February'}, 319 | 'Mar.': {'default': 'March'}, 320 | 'Apr.': {'default': 'April'}, 321 | 'Jun.': {'default': 'June'}, 322 | 'Jul.': {'default': 'July'}, 323 | 'Aug.': {'default': 'August'}, 324 | 'Sep.': {'default': 'September'}, 325 | 'Sept.': {'default': 'September'}, 326 | 'Oct.': {'default': 'October'}, 327 | 'Nov.': {'default': 'November'}, 328 | 'Dec.': {'default': 'December'}, 329 | 330 | 'Mon.': {'default': 'Monday'}, 331 | 'Tue.': {'default': 'Tuesday'}, 332 | 'Wed.': {'default': 'Wednesday'}, 333 | 'Thu.': {'default': 'Thursday'}, 334 | 'Fri.': {'default': 'Friday'}, 335 | 'Sat.': {'default': 'Saturday'}, 336 | 'Sun.': {'default': 'Sunday'}, 337 | 338 | 'AM': {'default': 'A M'}, 339 | 'PM': {'default': 'P M'}, 340 | 'a.m.': {'default': 'A M'}, 341 | 'p.m.': {'default': 'P M'}, 342 | } 343 | 344 | # Enhanced markdown patterns 345 | self.markdown_patterns = [ 346 | # Code blocks (must come first) 347 | (r'\`\`\`[\s\S]*?\`\`\`', ' [code block] '), 348 | (r'`([^`]+)`', r'\1'), 349 | 350 | # Headers 351 | (r'^#{1,6}\s+(.+)$', r'\1', re.MULTILINE), 352 | 353 | # Links and images 354 | (r'!\[([^\]]*)\]$$[^)]+$$', r'\1'), # Images - use alt text 355 | (r'\[([^\]]+)\]$$[^)]+$$', r'\1'), # Links - use link text 356 | 357 | # Emphasis 358 | (r'\*\*\*(.+?)\*\*\*', r'\1'), # Bold italic 359 | (r'\*\*(.+?)\*\*', r'\1'), # Bold 360 | (r'\*(.+?)\*', r'\1'), # Italic 361 | (r'__(.+?)__', r'\1'), # Bold alt 362 | 
(r'_(.+?)_', r'\1'), # Italic alt 363 | (r'~~(.+?)~~', r'\1'), # Strikethrough 364 | 365 | # Lists 366 | (r'^\s*[-*+]\s+(.+)$', r'\1', re.MULTILINE), # Unordered lists 367 | (r'^\s*\d+\.\s+(.+)$', r'\1', re.MULTILINE), # Ordered lists 368 | 369 | # Blockquotes 370 | (r'^\s*>\s*(.+)$', r'\1', re.MULTILINE), 371 | 372 | # Horizontal rules 373 | (r'^[-*_]{3,}$', '', re.MULTILINE), 374 | 375 | # Tables (remove pipe separators) 376 | (r'\|', ' '), 377 | ] 378 | 379 | # Number processing patterns 380 | self.number_patterns = [ 381 | # Currency with amounts 382 | (r'(\$|USD|€|EUR|£|GBP|¥|JPY|₹|INR)\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)', self._expand_currency), 383 | 384 | # Percentages 385 | (r'(\d+(?:\.\d+)?)\s*%', r'\1 percent'), 386 | 387 | # Temperatures 388 | (r'(\d+(?:\.\d+)?)\s*°([CF])', self._expand_temperature), 389 | 390 | # Measurements 391 | (r'(\d+(?:\.\d+)?)\s*(km|m|cm|mm|ft|in|mi|kg|g|lb|oz)', self._expand_measurement), 392 | 393 | # Years (4 digits) 394 | (r'\b(19|20)\d{2}\b', self._expand_year), 395 | 396 | # Large numbers with commas 397 | (r'\b(\d{1,3}(?:,\d{3})+)\b', self._expand_large_number), 398 | 399 | # Decimals 400 | (r'\b(\d+)\.(\d+)\b', self._expand_decimal), 401 | 402 | # Ordinals 403 | (r'\b(\d+)(st|nd|rd|th)\b', self._expand_ordinal), 404 | 405 | # Phone numbers (basic pattern) 406 | (r'\b(\d{3})-(\d{3})-(\d{4})\b', r'\1 \2 \3'), 407 | 408 | # Time 409 | (r'\b(\d{1,2}):(\d{2})\s*(AM|PM|am|pm)?\b', self._expand_time), 410 | ] 411 | 412 | # Sentence boundary patterns for smart chunking 413 | self.sentence_boundaries = re.compile( 414 | r'(?<=[.!?])\s+(?=[A-Z])|' # Period/exclamation/question + space + capital 415 | r'(?<=[.!?])\s*\n\s*(?=[A-Z])|' # Same with newline 416 | r'(?<=\.)\s+(?=["\'"]?[A-Z])' # Period + space + optional quote + capital 417 | ) 418 | 419 | # Compile regex patterns for performance 420 | self._compile_patterns() 421 | 422 | # Cache for processed text 423 | self._cache = {} 424 | self._cache_lock = threading.Lock() 425 | 426 | def _compile_patterns(self): 427 | """Compile regex patterns for better performance.""" 428 | self.compiled_markdown = [] 429 | for pattern in self.markdown_patterns: 430 | if len(pattern) == 3: 431 | self.compiled_markdown.append((re.compile(pattern[0], pattern[2]), pattern[1])) 432 | else: 433 | self.compiled_markdown.append((re.compile(pattern[0]), pattern[1])) 434 | 435 | def _expand_currency(self, match): 436 | """Expand currency amounts.""" 437 | symbol = match.group(1) 438 | amount = match.group(2) 439 | 440 | # Currency symbol mapping 441 | currency_map = { 442 | '$': 'dollars', 'USD': 'dollars', 443 | '€': 'euros', 'EUR': 'euros', 444 | '£': 'pounds', 'GBP': 'pounds', 445 | '¥': 'yen', 'JPY': 'yen', 446 | '₹': 'rupees', 'INR': 'rupees' 447 | } 448 | 449 | currency_name = currency_map.get(symbol, 'units') 450 | 451 | if HAS_NUM2WORDS: 452 | try: 453 | # Remove commas and convert to float 454 | amount_float = float(amount.replace(',', '')) 455 | if amount_float == int(amount_float): 456 | # Whole number 457 | amount_words = num2words.num2words(int(amount_float)) 458 | else: 459 | # Has decimal places 460 | dollars = int(amount_float) 461 | cents = int((amount_float - dollars) * 100) 462 | amount_words = f"{num2words.num2words(dollars)} {currency_name}" 463 | if cents > 0: 464 | amount_words += f" and {num2words.num2words(cents)} cents" 465 | return amount_words 466 | return f"{amount_words} {currency_name}" 467 | except: 468 | pass 469 | 470 | return f"{amount} {currency_name}" 471 | 472 | def _expand_temperature(self, 
match): 473 | """Expand temperature readings.""" 474 | temp = match.group(1) 475 | scale = match.group(2) 476 | scale_name = 'Celsius' if scale.upper() == 'C' else 'Fahrenheit' 477 | return f"{temp} degrees {scale_name}" 478 | 479 | def _expand_measurement(self, match): 480 | """Expand measurements.""" 481 | value = match.group(1) 482 | unit = match.group(2) 483 | 484 | unit_map = { 485 | 'km': 'kilometers', 'm': 'meters', 'cm': 'centimeters', 'mm': 'millimeters', 486 | 'ft': 'feet', 'in': 'inches', 'mi': 'miles', 487 | 'kg': 'kilograms', 'g': 'grams', 'lb': 'pounds', 'oz': 'ounces' 488 | } 489 | 490 | unit_name = unit_map.get(unit, unit) 491 | return f"{value} {unit_name}" 492 | 493 | def _expand_year(self, match): 494 | """Expand years for better pronunciation.""" 495 | year = match.group(0) 496 | if HAS_NUM2WORDS: 497 | try: 498 | return num2words.num2words(int(year)) 499 | except: 500 | pass 501 | return year 502 | 503 | def _expand_large_number(self, match): 504 | """Expand large numbers with commas.""" 505 | number = match.group(1).replace(',', '') 506 | if HAS_NUM2WORDS: 507 | try: 508 | return num2words.num2words(int(number)) 509 | except: 510 | pass 511 | return number 512 | 513 | def _expand_decimal(self, match): 514 | """Expand decimal numbers.""" 515 | whole = match.group(1) 516 | decimal = match.group(2) 517 | 518 | if HAS_NUM2WORDS: 519 | try: 520 | whole_words = num2words.num2words(int(whole)) 521 | decimal_words = ' '.join([num2words.num2words(int(d)) for d in decimal]) 522 | return f"{whole_words} point {decimal_words}" 523 | except: 524 | pass 525 | 526 | return f"{whole} point {' '.join(decimal)}" 527 | 528 | def _expand_ordinal(self, match): 529 | """Expand ordinal numbers.""" 530 | number = match.group(1) 531 | suffix = match.group(2) 532 | 533 | if HAS_NUM2WORDS: 534 | try: 535 | return num2words.num2words(int(number), ordinal=True) 536 | except: 537 | pass 538 | 539 | return f"{number}{suffix}" 540 | 541 | def _expand_time(self, match): 542 | """Expand time expressions.""" 543 | hour = int(match.group(1)) 544 | minute = match.group(2) 545 | period = match.group(3) if match.group(3) else "" 546 | 547 | if HAS_NUM2WORDS: 548 | try: 549 | hour_words = num2words.num2words(hour) 550 | if minute == "00": 551 | time_words = f"{hour_words} o'clock" 552 | else: 553 | minute_words = num2words.num2words(int(minute)) 554 | time_words = f"{hour_words} {minute_words}" 555 | 556 | if period: 557 | time_words += f" {period.upper()}" 558 | 559 | return time_words 560 | except: 561 | pass 562 | 563 | return f"{hour} {minute} {period}".strip() 564 | 565 | def normalize_unicode(self, text: str) -> str: 566 | """Normalize Unicode characters while preserving important accents.""" 567 | # First, unescape HTML entities 568 | text = html.unescape(text) 569 | 570 | # Replace special characters 571 | for char, replacement in self.char_replacements.items(): 572 | text = text.replace(char, replacement) 573 | 574 | # Normalize Unicode but preserve accented characters in names 575 | # This is a balance between ASCII conversion and preserving pronunciation 576 | normalized = unicodedata.normalize('NFC', text) 577 | 578 | return normalized 579 | 580 | def clean_markdown(self, text: str) -> str: 581 | """Clean markdown formatting using proper parsing when available.""" 582 | if HAS_MARKDOWN: 583 | try: 584 | # Convert markdown to HTML, then extract text 585 | md = markdown.Markdown(extensions=['fenced_code', 'tables', 'codehilite']) 586 | html_content = md.convert(text) 587 | 588 | # Simple HTML tag 
removal (more robust than regex for basic cases) 589 | import re 590 | clean_text = re.sub(r'<[^>]+>', '', html_content) 591 | clean_text = html.unescape(clean_text) 592 | return clean_text 593 | except Exception as e: 594 | logger.warning(f"Markdown parsing failed, falling back to regex: {e}") 595 | 596 | # Fallback to regex-based cleaning 597 | cleaned = text 598 | for pattern, replacement in self.compiled_markdown: 599 | cleaned = pattern.sub(replacement, cleaned) 600 | 601 | return cleaned 602 | 603 | def expand_abbreviations(self, text: str) -> str: 604 | """Expand abbreviations with context awareness.""" 605 | result = text 606 | 607 | for abbrev, expansion_data in self.abbreviations.items(): 608 | if isinstance(expansion_data, dict): 609 | default_expansion = expansion_data['default'] 610 | # For now, use default expansion 611 | # TODO: Implement context detection for better disambiguation 612 | expansion = default_expansion 613 | else: 614 | expansion = expansion_data 615 | 616 | # Use word boundaries to avoid partial matches 617 | pattern = r'\b' + re.escape(abbrev) + r'\b' 618 | result = re.sub(pattern, expansion, result, flags=re.IGNORECASE) 619 | 620 | return result 621 | 622 | def process_numbers(self, text: str) -> str: 623 | """Process numbers, currency, and measurements.""" 624 | result = text 625 | 626 | for pattern, replacement in self.number_patterns: 627 | if callable(replacement): 628 | result = re.sub(pattern, replacement, result) 629 | else: 630 | result = re.sub(pattern, replacement, result) 631 | 632 | return result 633 | 634 | def normalize_punctuation(self, text: str) -> str: 635 | """Normalize punctuation for better TTS processing.""" 636 | # Multiple punctuation marks 637 | text = re.sub(r'\.{2,}', '...', text) # Multiple dots to ellipsis 638 | text = re.sub(r'[!]{2,}', '!', text) # Multiple exclamations 639 | text = re.sub(r'[?]{2,}', '?', text) # Multiple questions 640 | 641 | # Normalize spacing around punctuation 642 | text = re.sub(r'\s*([,.!?;:])\s*', r'\1 ', text) 643 | 644 | # Multiple spaces to single space 645 | text = re.sub(r'\s+', ' ', text) 646 | 647 | # Clean up extra whitespace 648 | text = text.strip() 649 | 650 | return text 651 | 652 | def preserve_case_markers(self, text: str) -> Tuple[str, List[Tuple[int, int, str]]]: 653 | """Identify and preserve important case information.""" 654 | case_preservations = [] 655 | 656 | # Find acronyms (2+ consecutive uppercase letters) 657 | for match in re.finditer(r'\b[A-Z]{2,}\b', text): 658 | case_preservations.append((match.start(), match.end(), match.group())) 659 | 660 | # Find proper nouns at sentence beginnings 661 | for match in re.finditer(r'(?:^|[.!?]\s+)([A-Z][a-z]+)', text): 662 | start = match.start(1) 663 | end = match.end(1) 664 | case_preservations.append((start, end, match.group(1))) 665 | 666 | return text, case_preservations 667 | 668 | def smart_chunk_text(self, text: str, max_length: int = DEFAULT_CHUNK_SIZE) -> List[str]: 669 | """Intelligently chunk text at sentence boundaries.""" 670 | if len(text) <= max_length: 671 | return [text] 672 | 673 | # First, try to split at sentence boundaries 674 | sentences = self.sentence_boundaries.split(text) 675 | if not sentences: 676 | sentences = [text] 677 | 678 | chunks = [] 679 | current_chunk = "" 680 | 681 | for sentence in sentences: 682 | sentence = sentence.strip() 683 | if not sentence: 684 | continue 685 | 686 | # If adding this sentence would exceed max_length 687 | if len(current_chunk + sentence) > max_length: 688 | if 
current_chunk: 689 | chunks.append(current_chunk.strip()) 690 | current_chunk = sentence + " " 691 | else: 692 | # Single sentence is too long, split at commas or other punctuation 693 | sub_parts = re.split(r'(?<=[,;:])\s+', sentence) 694 | for part in sub_parts: 695 | if len(current_chunk + part) > max_length: 696 | if current_chunk: 697 | chunks.append(current_chunk.strip()) 698 | current_chunk = part + " " 699 | else: 700 | current_chunk += part + " " 701 | else: 702 | current_chunk += sentence + " " 703 | 704 | if current_chunk.strip(): 705 | chunks.append(current_chunk.strip()) 706 | 707 | # Filter out chunks that are too short 708 | valid_chunks = [chunk for chunk in chunks if len(chunk.strip()) >= MIN_CHUNK_SIZE] 709 | 710 | return valid_chunks[:MAX_CHUNKS] # Limit total chunks 711 | 712 | def detect_input_mode(self, text: str) -> TextProcessingMode: 713 | """Detect the input text format.""" 714 | # Check for SSML 715 | if '<speak' in text or '<prosody' in text or '<voice' in text: 716 | return TextProcessingMode.SSML 717 | 718 | # Check for HTML 719 | if '<html' in text or '<body' in text or '<div
' in text or '<p>
' in text: 720 | return TextProcessingMode.HTML 721 | 722 | # Check for Markdown 723 | markdown_indicators = [ 724 | r'^#{1,6}\s', # Headers 725 | r'\*\*.*?\*\*', # Bold 726 | r'\[.*?\]$$.*?$$', # Links 727 | r'\`\`\`', # Code blocks 728 | r'^\s*[-*+]\s', # Lists 729 | r'^\s*\d+\.\s', # Numbered lists 730 | ] 731 | 732 | for pattern in markdown_indicators: 733 | if re.search(pattern, text, re.MULTILINE): 734 | return TextProcessingMode.MARKDOWN 735 | 736 | return TextProcessingMode.PLAIN 737 | 738 | def process_text(self, text: str, mode: Optional[TextProcessingMode] = None, 739 | max_chunk_length: int = DEFAULT_CHUNK_SIZE) -> List[TextChunk]: 740 | """Main text processing pipeline.""" 741 | start_time = time.time() 742 | 743 | if not text or not text.strip(): 744 | return [] 745 | 746 | # Check cache first 747 | cache_key = hashlib.md5(f"{text}_{max_chunk_length}".encode()).hexdigest() 748 | with self._cache_lock: 749 | if cache_key in self._cache: 750 | logger.info("Using cached text processing result") 751 | return self._cache[cache_key] 752 | 753 | original_text = text 754 | logger.info(f"Processing text: {text[:100]}...") 755 | 756 | # Auto-detect mode if not provided 757 | if mode is None: 758 | mode = self.detect_input_mode(text) 759 | 760 | logger.info(f"Detected input mode: {mode.value}") 761 | 762 | # Step 1: Normalize Unicode and HTML entities 763 | processed = self.normalize_unicode(text) 764 | 765 | # Step 2: Handle different input formats 766 | if mode == TextProcessingMode.MARKDOWN: 767 | processed = self.clean_markdown(processed) 768 | elif mode == TextProcessingMode.HTML: 769 | # Basic HTML cleaning 770 | processed = re.sub(r'<[^>]+>', '', processed) 771 | processed = html.unescape(processed) 772 | elif mode == TextProcessingMode.SSML: 773 | # For SSML, we might want to preserve some tags 774 | # For now, just clean basic HTML-like tags 775 | processed = re.sub(r'<(?!speak|voice|prosody|break|emphasis)[^>]+>', '', processed) 776 | 777 | # Step 3: Preserve case information 778 | processed, case_info = self.preserve_case_markers(processed) 779 | 780 | # Step 4: Expand abbreviations 781 | processed = self.expand_abbreviations(processed) 782 | 783 | # Step 5: Process numbers and special formats 784 | processed = self.process_numbers(processed) 785 | 786 | # Step 6: Normalize punctuation 787 | processed = self.normalize_punctuation(processed) 788 | 789 | # Step 7: Smart chunking 790 | text_chunks = self.smart_chunk_text(processed, max_chunk_length) 791 | 792 | # Step 8: Create TextChunk objects 793 | chunks = [] 794 | processing_time = time.time() - start_time 795 | 796 | for i, chunk_text in enumerate(text_chunks): 797 | if len(chunk_text.strip()) >= 2: # Minimum viable chunk size 798 | chunk = TextChunk( 799 | text=chunk_text.strip(), 800 | original_text=original_text, 801 | chunk_id=i, 802 | total_chunks=len(text_chunks), 803 | processing_time=processing_time / len(text_chunks) 804 | ) 805 | chunks.append(chunk) 806 | 807 | # Cache the result 808 | with self._cache_lock: 809 | if len(self._cache) >= CACHE_SIZE: 810 | # Simple cache eviction - remove oldest entries 811 | oldest_keys = list(self._cache.keys())[:CACHE_SIZE // 2] 812 | for key in oldest_keys: 813 | del self._cache[key] 814 | self._cache[cache_key] = chunks 815 | 816 | logger.info(f"Text processed into {len(chunks)} chunks in {processing_time:.2f}s") 817 | return chunks 818 | 819 | # --- Note to Frequency Conversion -------------------------------------------- 820 | 821 | NOTE_PATTERN = 
re.compile(r"^([A-G])([#b]?)(\d)$") 822 | NOTE_INDEX = { 823 | "C": 0, "C#": 1, "Db": 1, "D": 2, "D#": 3, "Eb": 3, "E": 4, 824 | "F": 5, "F#": 6, "Gb": 6, "G": 7, "G#": 8, "Ab": 8, "A": 9, 825 | "A#": 10, "Bb": 10, "B": 11, 826 | } 827 | 828 | def note_to_freq(note: str) -> float: 829 | """Convert scientific pitch (e.g., A4) → frequency in Hz.""" 830 | m = NOTE_PATTERN.match(note) 831 | if not m: 832 | raise ValueError(f"Invalid note: {note}") 833 | letter, accidental, octave = m.groups() 834 | key = letter + accidental 835 | semitone = NOTE_INDEX[key] 836 | octave = int(octave) 837 | midi = semitone + 12 * (octave + 1) 838 | return 440.0 * 2 ** ((midi - 69) / 12) 839 | 840 | def available_notes() -> List[str]: 841 | """Generate a list of available musical notes.""" 842 | names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"] 843 | return ["None"] + [f"{n}{o}" for o in range(2, 7) for n in names] 844 | 845 | # --- Audio Processing Functions with ZERO DEFAULTS ------------------------- 846 | 847 | def estimate_pitch(audio: np.ndarray, sr: int) -> Optional[float]: 848 | """Estimate the fundamental frequency of audio.""" 849 | try: 850 | f0 = librosa.yin(y=audio, fmin=65, fmax=1047, sr=sr) 851 | median_f0 = np.nanmedian(f0) 852 | return float(median_f0) if np.any(~np.isnan(f0)) and not np.isnan(median_f0) else None 853 | except Exception as e: 854 | logger.warning(f"Pitch estimation failed: {e}") 855 | return None 856 | 857 | def apply_volume(audio: np.ndarray, gain: float = 1.0, gain_db: Optional[float] = None) -> np.ndarray: 858 | """Apply volume adjustment with linear or dB gain.""" 859 | if gain_db is not None: 860 | gain = 10 ** (gain_db / 20.0) 861 | gain = np.clip(gain, 0.0, 2.0) 862 | if abs(gain - 1.0) < 0.01: 863 | return audio 864 | logger.info(f"Applying volume gain: {gain:.2f} (linear), {20 * math.log10(max(gain, 1e-10)):.2f}dB") 865 | try: 866 | result = audio * gain 867 | 868 | return result 869 | except Exception as e: 870 | logger.error(f"Volume adjustment failed: {e}") 871 | return audio 872 | 873 | def shift_to_target(audio: np.ndarray, sr: int, target_note: Optional[str], semitone_shift: float, preserve_formants: bool = False) -> np.ndarray: 874 | """ 875 | FIXED: Shift audio to target note or by semitones with robust librosa compatibility and true formant preservation. 876 | """ 877 | actual_semitone_shift = semitone_shift 878 | 879 | # Handle target note conversion 880 | if target_note and target_note.lower() not in ["none", "", "null"]: 881 | current_pitch = estimate_pitch(audio, sr) 882 | if current_pitch is None: 883 | logger.warning("Could not estimate current pitch. 
Using semitone_shift if provided.") 884 | else: 885 | try: 886 | target_freq = note_to_freq(target_note) 887 | # Calculate semitone shift needed to get from current pitch to target frequency 888 | actual_semitone_shift = 12 * math.log2(target_freq / current_pitch) 889 | logger.info(f"Pitch shift: Current={current_pitch:.2f}Hz, Target={target_note}({target_freq:.2f}Hz), Semitones={actual_semitone_shift:.2f}") 890 | except (ValueError, ZeroDivisionError) as e: 891 | logger.warning(f"Invalid target note or pitch estimation '{target_note}': {e}") 892 | actual_semitone_shift = semitone_shift 893 | 894 | # Skip if no significant shift 895 | if abs(actual_semitone_shift) < 0.01: 896 | return audio 897 | 898 | logger.info(f"Applying pitch shift of {actual_semitone_shift:.2f} semitones, preserve_formants={preserve_formants}") 899 | 900 | try: 901 | # Step 1: Apply pitch shift using librosa 902 | import inspect 903 | pitch_shift_params = inspect.signature(librosa.effects.pitch_shift).parameters 904 | 905 | if 'res_type' in pitch_shift_params: 906 | shifted_audio = librosa.effects.pitch_shift( 907 | y=audio, 908 | sr=sr, 909 | n_steps=actual_semitone_shift, 910 | res_type='kaiser_best' 911 | ) 912 | else: 913 | shifted_audio = librosa.effects.pitch_shift( 914 | y=audio, 915 | sr=sr, 916 | n_steps=actual_semitone_shift 917 | ) 918 | 919 | # Step 2: If preserving formants, apply a corrective formant shift 920 | if preserve_formants: 921 | logger.info("Applying corrective formant shift to preserve voice character.") 922 | # The pitch shift moved the formants. We need to shift them back. 923 | # The ratio of the pitch shift is 2^(semitones/12). 924 | # To reverse the formant shift, we need to shift by the inverse ratio. 925 | formant_correction_ratio = 2 ** (-actual_semitone_shift / 12.0) 926 | 927 | # Our apply_formant_shift function takes a factor where 1.0 is no change. 928 | # We can directly use this ratio. 
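# Illustrative example (numbers not from the original source): a +12 semitone shift
# doubles the pitch, so the corrective formant ratio is 2 ** (-12 / 12.0) = 0.5,
# which scales the spectral envelope back toward its original position.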
929 | final_audio = apply_formant_shift(shifted_audio, sr, shift_factor=formant_correction_ratio) 930 | return final_audio 931 | else: 932 | return shifted_audio 933 | 934 | except Exception as e: 935 | logger.error(f"Pitch shift failed: {e}") 936 | return audio 937 | 938 | def apply_formant_shift(audio: np.ndarray, sr: int, shift_factor: float, scale: float = 1.0) -> np.ndarray: 939 | """Apply formant shifting with intensity scaling.""" 940 | if abs(shift_factor - 1.0) < 0.01: 941 | return audio 942 | shift_factor = np.clip(shift_factor, 0.5, 1.5) 943 | scale = np.clip(scale, 0.5, 2.0) 944 | effective_shift = 1.0 + (shift_factor - 1.0) * scale 945 | logger.info(f"Applying formant shift: factor={shift_factor:.2f}, scale={scale:.2f}, effective={effective_shift:.2f}") 946 | try: 947 | audio_float32 = audio.astype(np.float32) 948 | stft_result = librosa.stft(audio_float32) 949 | magnitude = np.abs(stft_result) 950 | phase = np.angle(stft_result) 951 | shifted_magnitude = np.zeros_like(magnitude) 952 | n_freq_bins = magnitude.shape[0] 953 | for i in range(magnitude.shape[1]): 954 | freq_profile = magnitude[:, i] 955 | source_freq_coords = np.arange(n_freq_bins, dtype=float) / effective_shift 956 | shifted_profile_frame = np.interp( 957 | source_freq_coords, 958 | np.arange(n_freq_bins, dtype=float), 959 | freq_profile, 960 | left=freq_profile[0] if len(freq_profile) > 0 else 0.0, 961 | right=freq_profile[-1] if len(freq_profile) > 0 else 0.0 962 | ) 963 | shifted_magnitude[:, i] = shifted_profile_frame 964 | audio_stft_shifted = shifted_magnitude * np.exp(1j * phase) 965 | audio_shifted = librosa.istft(audio_stft_shifted, length=len(audio_float32)) 966 | 967 | return audio_shifted.astype(audio.dtype) 968 | except Exception as e: 969 | logger.error(f"Formant shift failed: {e}") 970 | return audio 971 | 972 | def apply_reverb(audio: np.ndarray, sr: int, room_size: float = 0.0, damping: float = 0.5, pre_delay_ms: float = 0.0, stereo_width: float = 0.0) -> np.ndarray: 973 | """Apply reverb with pre-delay and stereo width. 
Only applies if room_size > 0.""" 974 | if room_size <= 0.001: 975 | return audio 976 | room_size = np.clip(room_size, 0.0, 1.0) 977 | damping = np.clip(damping, 0.0, 1.0) 978 | pre_delay_ms = np.clip(pre_delay_ms, 0.0, 100.0) 979 | stereo_width = np.clip(stereo_width, 0.0, 1.0) 980 | logger.info(f"Applying reverb: RoomSize={room_size:.2f}, Damping={damping:.2f}, PreDelay={pre_delay_ms:.1f}ms, StereoWidth={stereo_width:.2f}") 981 | try: 982 | reverb_length_sec = room_size * 1.5 983 | pre_delay_samples = int(sr * pre_delay_ms / 1000) 984 | reverb_length_samples = int(sr * reverb_length_sec) + pre_delay_samples 985 | if reverb_length_samples <= 0: 986 | return audio 987 | time_points = np.arange(reverb_length_samples) / sr 988 | decay_rate = 5.0 + (1.0 - damping) * 15.0 989 | decay_envelope = np.exp(-decay_rate * time_points) 990 | impulse = np.random.randn(reverb_length_samples) * decay_envelope 991 | impulse_energy_sq = np.sum(impulse**2) 992 | if impulse_energy_sq > 1e-12: 993 | impulse = impulse / np.sqrt(impulse_energy_sq) 994 | else: 995 | return audio 996 | if stereo_width > 0: 997 | impulse_l = impulse * np.sqrt(1 - stereo_width / 2) 998 | impulse_r = np.random.randn(reverb_length_samples) * decay_envelope * np.sqrt(stereo_width / 2) 999 | impulse_r_energy = np.sum(impulse_r**2) 1000 | if impulse_r_energy > 1e-12: 1001 | impulse_r = impulse_r / np.sqrt(impulse_r_energy) 1002 | reverb_l = signal.convolve(audio, impulse_l, mode='full')[:len(audio)] 1003 | reverb_r = signal.convolve(audio, impulse_r, mode='full')[:len(audio)] 1004 | reverb_audio = (reverb_l + reverb_r) / 2 1005 | else: 1006 | reverb_audio = signal.convolve(audio, impulse, mode='full')[:len(audio)] 1007 | 1008 | dry_gain = 0.7 1009 | wet_gain = 0.1 + room_size * 0.4 1010 | result = dry_gain * audio + wet_gain * reverb_audio 1011 | 1012 | return result 1013 | except Exception as e: 1014 | logger.error(f"Reverb failed: {e}") 1015 | return audio 1016 | 1017 | def apply_eq(audio: np.ndarray, sr: int, bands: List[Dict[str, Any]]) -> np.ndarray: 1018 | """Apply parametric equalizer with multiple bands. 
Only applies if bands are provided.""" 1019 | if not bands: 1020 | return audio 1021 | logger.info(f"Applying EQ with {len(bands)} bands") 1022 | try: 1023 | processed_audio = audio.copy() 1024 | nyquist = sr / 2.0 1025 | for band in bands: 1026 | freq = np.clip(band.get('frequency_hz', 1000.0), 20.0, nyquist - 1e-5) 1027 | gain_db = np.clip(band.get('gain_db', 0.0), -24.0, 24.0) 1028 | q_factor = np.clip(band.get('q_factor', 1.0), 0.1, 10.0) 1029 | band_type = band.get('type', 'peak').lower() 1030 | if abs(gain_db) < 0.1: 1031 | continue 1032 | logger.info(f"EQ Band: Type={band_type}, Freq={freq:.1f}Hz, Gain={gain_db:.1f}dB, Q={q_factor:.2f}") 1033 | try: 1034 | if band_type == 'peak': 1035 | sos = signal.iirpeak(w0=freq, Q=q_factor, gain_db=gain_db, fs=sr) 1036 | processed_audio = signal.sosfiltfilt(sos, processed_audio) 1037 | elif band_type == 'low_shelf': 1038 | try: 1039 | sos = signal.iirshelf(w0=freq, Q=0.707, gain_db=gain_db, fs=sr, ftype='AB') 1040 | processed_audio = signal.sosfiltfilt(sos, processed_audio) 1041 | except AttributeError: 1042 | logger.info(f"Fallback to Butterworth for low_shelf at {freq}Hz") 1043 | gain_linear = 10 ** (gain_db / 20.0) 1044 | norm_freq = freq / nyquist 1045 | sos = signal.butter(2, norm_freq, btype='lowpass', output='sos') 1046 | low_freq = signal.sosfiltfilt(sos, processed_audio) 1047 | processed_audio = processed_audio + (gain_linear - 1.0) * low_freq 1048 | elif band_type == 'high_shelf': 1049 | try: 1050 | sos = signal.iirshelf(w0=freq, Q=0.707, gain_db=gain_db, fs=sr, ftype='AB') 1051 | processed_audio = signal.sosfiltfilt(sos, processed_audio) 1052 | except AttributeError: 1053 | logger.info(f"Fallback to Butterworth for high_shelf at {freq}Hz") 1054 | gain_linear = 10 ** (gain_db / 20.0) 1055 | norm_freq = freq / nyquist 1056 | sos = signal.butter(2, norm_freq, btype='highpass', output='sos') 1057 | high_freq = signal.sosfiltfilt(sos, processed_audio) 1058 | processed_audio = processed_audio + (gain_linear - 1.0) * high_freq 1059 | else: 1060 | logger.warning(f"Unknown EQ band type: {band_type}") 1061 | except Exception as e: 1062 | logger.warning(f"EQ band failed: {e}") 1063 | 1064 | return processed_audio 1065 | except Exception as e: 1066 | logger.error(f"EQ failed: {e}") 1067 | return audio 1068 | 1069 | def apply_distortion(audio: np.ndarray, drive_db: float = 0.0, dist_type: str = 'tanh', mix: float = 0.0) -> np.ndarray: 1070 | """Apply distortion effect. 
Only applies if drive_db > 0 and mix > 0.""" 1071 | drive_db = np.clip(drive_db, 0.0, 36.0) 1072 | mix = np.clip(mix, 0.0, 1.0) 1073 | dist_type = dist_type.lower() 1074 | if drive_db < 0.1 or mix < 0.001: 1075 | return audio 1076 | logger.info(f"Applying distortion: Type={dist_type}, Drive={drive_db:.1f}dB, Mix={mix:.2f}") 1077 | try: 1078 | drive_gain = 10 ** (drive_db / 20.0) 1079 | distorted = audio * drive_gain 1080 | if dist_type == 'tanh': 1081 | distorted = np.tanh(distorted * 2.0) * 0.8 1082 | elif dist_type == 'soft': 1083 | distorted = np.clip(distorted, -1.0, 1.0) 1084 | elif dist_type == 'hard': 1085 | distorted = np.sign(distorted) * np.minimum(np.abs(distorted), 1.0) 1086 | else: 1087 | logger.warning(f"Unknown distortion type: {dist_type}, using tanh") 1088 | distorted = np.tanh(distorted * 2.0) * 0.8 1089 | 1090 | result = (1.0 - mix) * audio + mix * distorted 1091 | 1092 | return result 1093 | except Exception as e: 1094 | logger.error(f"Distortion failed: {e}") 1095 | return audio 1096 | 1097 | def apply_chorus(audio: np.ndarray, sr: int, delay_ms: float = 0.0, depth: float = 0.0, rate_hz: float = 0.0, mix: float = 0.0) -> np.ndarray: 1098 | """Apply chorus effect. Only applies if depth > 0 and mix > 0.""" 1099 | delay_ms = np.clip(delay_ms, 5.0, 50.0) 1100 | depth = np.clip(depth, 0.0, 0.1) 1101 | rate_hz = np.clip(rate_hz, 0.1, 5.0) 1102 | mix = np.clip(mix, 0.0, 1.0) 1103 | 1104 | # Early return if effect should not be applied 1105 | if depth < 0.001 or mix < 0.001: 1106 | return audio 1107 | 1108 | logger.info(f"Applying chorus: Delay={delay_ms:.1f}ms, Depth={depth:.3f}, Rate={rate_hz:.2f}Hz, Mix={mix:.2f}") 1109 | try: 1110 | delay_samples = int(sr * delay_ms / 1000) 1111 | if delay_samples <= 0: 1112 | return audio 1113 | 1114 | t = np.arange(len(audio)) / sr 1115 | mod = depth * np.sin(2 * np.pi * rate_hz * t) 1116 | indices = np.arange(len(audio)) - delay_samples * (1.0 + mod) 1117 | indices = np.clip(indices, 0, len(audio) - 1) 1118 | chorus_audio = np.interp(np.arange(len(audio)), indices, audio) 1119 | 1120 | 1121 | result = (1.0 - mix) * audio + mix * chorus_audio 1122 | 1123 | return result 1124 | except Exception as e: 1125 | logger.error(f"Chorus failed: {e}") 1126 | return audio 1127 | 1128 | def apply_flanger(audio: np.ndarray, sr: int, delay_ms: float = 0.0, depth: float = 0.0, rate_hz: float = 0.0, feedback: float = 0.0, mix: float = 0.0) -> np.ndarray: 1129 | """Apply flanger effect. 
Only applies if depth > 0 and mix > 0.""" 1130 | delay_ms = np.clip(delay_ms, 0.1, 10.0) 1131 | depth = np.clip(depth, 0.0, 0.05) 1132 | rate_hz = np.clip(rate_hz, 0.1, 10.0) 1133 | feedback = np.clip(feedback, 0.0, 0.9) 1134 | mix = np.clip(mix, 0.0, 1.0) 1135 | 1136 | # Early return if effect should not be applied 1137 | if depth < 0.001 or mix < 0.001: 1138 | return audio 1139 | 1140 | logger.info(f"Applying flanger: Delay={delay_ms:.1f}ms, Depth={depth:.3f}, Rate={rate_hz:.2f}Hz, Feedback={feedback:.2f}, Mix={mix:.2f}") 1141 | try: 1142 | delay_samples = int(sr * delay_ms / 1000) 1143 | if delay_samples <= 0: 1144 | return audio 1145 | 1146 | t = np.arange(len(audio)) / sr 1147 | mod = depth * np.sin(2 * np.pi * rate_hz * t) 1148 | output = audio.copy() 1149 | delay_buffer = np.zeros(len(audio) + delay_samples) 1150 | delay_buffer[:len(audio)] = audio 1151 | 1152 | for i in range(len(audio)): 1153 | delay_time = delay_samples * (1.0 + mod[i]) 1154 | idx = i - delay_time 1155 | if idx >= 0: 1156 | interp_idx = int(idx) 1157 | frac = idx - interp_idx 1158 | if interp_idx + 1 < len(delay_buffer): 1159 | delayed_sample = (1 - frac) * delay_buffer[interp_idx] + frac * delay_buffer[interp_idx + 1] 1160 | output[i] += feedback * delayed_sample 1161 | delay_buffer[i + delay_samples] += feedback * delayed_sample 1162 | 1163 | 1164 | result = (1.0 - mix) * audio + mix * output 1165 | 1166 | return result 1167 | except Exception as e: 1168 | logger.error(f"Flanger failed: {e}") 1169 | return audio 1170 | 1171 | def apply_compression(audio: np.ndarray, sr: int, threshold_db: float = 0.0, ratio: float = 1.0, attack_ms: float = 5.0, release_ms: float = 200.0) -> np.ndarray: 1172 | """Apply dynamic range compression. Only applies if ratio > 1.0 and threshold_db < 0.""" 1173 | threshold_db = np.clip(threshold_db, -60.0, 0.0) 1174 | ratio = np.clip(ratio, 1.0, 20.0) 1175 | attack_ms = np.clip(attack_ms, 0.1, 100.0) 1176 | release_ms = np.clip(release_ms, 10.0, 1000.0) 1177 | 1178 | # Early return if compression should not be applied 1179 | if ratio <= 1.01 or threshold_db >= -0.1: 1180 | return audio 1181 | 1182 | logger.info(f"Applying compression: Threshold={threshold_db:.1f}dB, Ratio={ratio:.1f}, Attack={attack_ms:.1f}ms, Release={release_ms:.1f}ms") 1183 | try: 1184 | threshold = 10 ** (threshold_db / 20.0) 1185 | attack_coeff = np.exp(-1.0 / (sr * attack_ms / 1000)) 1186 | release_coeff = np.exp(-1.0 / (sr * release_ms / 1000)) 1187 | envelope = np.zeros_like(audio) 1188 | gain = np.ones_like(audio) 1189 | 1190 | for i in range(len(audio)): 1191 | envelope[i] = abs(audio[i]) if i == 0 else (1 - attack_coeff) * abs(audio[i]) + attack_coeff * envelope[i - 1] 1192 | if envelope[i] > threshold: 1193 | excess = envelope[i] / threshold 1194 | gain_reduction = threshold * (excess ** (1 / ratio - 1)) 1195 | target_gain = gain_reduction / envelope[i] if envelope[i] > 1e-6 else 1.0 1196 | else: 1197 | target_gain = 1.0 1198 | gain[i] = (1 - release_coeff) * target_gain + release_coeff * (gain[i - 1] if i > 0 else 1.0) 1199 | 1200 | result = audio * gain 1201 | 1202 | return result 1203 | except Exception as e: 1204 | logger.error(f"Compression failed: {e}") 1205 | return audio 1206 | 1207 | def apply_voice_character(audio: np.ndarray, sr: int, character: str, params: Optional[Dict] = None) -> np.ndarray: 1208 | """Apply voice character transformation. 
Only applies if character is not 'none'.""" 1209 | if character == "none" or not character: 1210 | return audio 1211 | params = params or {} 1212 | logger.info(f"Applying voice character: {character} with params {params}") 1213 | try: 1214 | result = audio.copy() 1215 | if character == "child": 1216 | pitch_shift = params.get('pitch_shift', 3.0) 1217 | speed = params.get('speed', 1.1) 1218 | formant_shift = params.get('formant_shift', 1.2) 1219 | result = shift_to_target(result, sr, None, pitch_shift, False) 1220 | result = librosa.effects.time_stretch(y=result, rate=speed) 1221 | result = apply_formant_shift(result, sr, formant_shift) 1222 | elif character == "robot": 1223 | pitch_shift = params.get('pitch_shift', 0.0) 1224 | if abs(pitch_shift) > 0.01: 1225 | result = shift_to_target(result, sr, None, pitch_shift, False) 1226 | t = np.arange(len(result)) / sr 1227 | carrier = np.sin(2 * np.pi * params.get('carrier_freq', 80.0) * t) 1228 | result = result * carrier 1229 | result = np.tanh(result * params.get('distortion_factor', 2.5)) * 0.8 1230 | elif character == "deep": 1231 | pitch_shift = params.get('pitch_shift', -4.0) 1232 | speed = params.get('speed', 0.9) 1233 | formant_shift = params.get('formant_shift', 0.8) 1234 | result = shift_to_target(result, sr, None, pitch_shift, False) 1235 | result = librosa.effects.time_stretch(y=result, rate=speed) 1236 | result = apply_formant_shift(result, sr, formant_shift) 1237 | elif character == "whisper": 1238 | b, a = signal.butter(4, 3000 / (sr / 2), 'low') 1239 | result = signal.filtfilt(b, a, result) 1240 | noise = np.random.normal(0, params.get('noise_level', 0.03), len(result)).astype(result.dtype) 1241 | result = result * params.get('signal_level', 0.6) + noise 1242 | result = np.tanh(result * params.get('compression_factor', 1.2)) * 0.9 1243 | elif character == "alien": 1244 | pitch_shift = params.get('pitch_shift', 2.0) 1245 | result = shift_to_target(result, sr, None, pitch_shift, False) 1246 | result = apply_flanger(result, sr, delay_ms=params.get('flanger_delay_ms', 5.0), depth=0.02, rate_hz=0.3, feedback=0.6, mix=0.6) 1247 | elif character == "monster": 1248 | pitch_shift = params.get('pitch_shift', -6.0) 1249 | formant_shift = params.get('formant_shift', 0.7) 1250 | result = shift_to_target(result, sr, None, pitch_shift, False) 1251 | result = apply_formant_shift(result, sr, formant_shift) 1252 | result = apply_distortion(result, drive_db=params.get('distortion_drive', 12.0), dist_type='tanh', mix=0.7) 1253 | elif character == "echo": 1254 | result = apply_reverb(result, sr, room_size=0.6, damping=0.3, pre_delay_ms=params.get('pre_delay_ms', 50.0), stereo_width=0.8) 1255 | else: 1256 | logger.warning(f"Unknown voice character: {character}") 1257 | return audio 1258 | 1259 | return result 1260 | except Exception as e: 1261 | logger.error(f"Voice character transformation failed: {e}") 1262 | return audio 1263 | 1264 | # --- Enhanced Model Management ----------------------------------------------- 1265 | 1266 | class EnhancedModelManager: 1267 | """Production-grade model manager with robust text processing and caching.""" 1268 | 1269 | def __init__(self): 1270 | logger.info("Initializing EnhancedModelManager...") 1271 | 1272 | # Initialize models with detailed logging 1273 | self.models: Dict[bool, KModel] = {} 1274 | 1275 | # CPU model (always available) 1276 | logger.info("Loading CPU model...") 1277 | self.models[False] = KModel().to('cpu').eval() 1278 | logger.info("✓ CPU model loaded successfully") 1279 | 1280 | # GPU model (if 
available) 1281 | if CUDA_AVAILABLE: 1282 | try: 1283 | logger.info("Loading GPU model...") 1284 | log_gpu_status("before model loading") 1285 | self.models[True] = KModel().to('cuda').eval() 1286 | log_gpu_status("after model loading") 1287 | logger.info("✓ GPU model loaded successfully") 1288 | except Exception as e: 1289 | logger.error(f"Failed to load GPU model: {e}") 1290 | logger.warning("GPU model unavailable - falling back to CPU only") 1291 | else: 1292 | logger.info("GPU model not loaded (CUDA not available)") 1293 | 1294 | self.pipelines: Dict[str, KPipeline] = { 1295 | 'a': KPipeline(lang_code='a'), 1296 | 'b': KPipeline(lang_code='b') 1297 | } 1298 | 1299 | # Enhanced lexicon 1300 | self.pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO' 1301 | self.pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ' 1302 | 1303 | # Initialize text processor 1304 | self.text_processor = ProductionTextProcessor() 1305 | 1306 | # Audio cache for processed chunks 1307 | self.audio_cache = {} 1308 | self.cache_lock = threading.Lock() 1309 | 1310 | # Thread pool for parallel processing 1311 | self.executor = ThreadPoolExecutor(max_workers=4) 1312 | 1313 | logger.info("Pre-loading all voices...") 1314 | for voice_name in KOKORO_VOICES.keys(): 1315 | lang_code = voice_name[0] 1316 | if lang_code in self.pipelines: 1317 | self.pipelines[lang_code].load_voice(voice_name) 1318 | else: 1319 | logger.error(f"Unknown lang code '{lang_code}' for voice '{voice_name}'") 1320 | logger.info("All voices loaded.") 1321 | 1322 | # Log final model status 1323 | available_devices = [] 1324 | if False in self.models: 1325 | available_devices.append("CPU") 1326 | if True in self.models: 1327 | available_devices.append("GPU") 1328 | logger.info(f"Available inference devices: {', '.join(available_devices)}") 1329 | 1330 | def generate_audio_for_chunk(self, chunk: TextChunk, voice_name: str, speed: float = 1.0, use_gpu: bool = True) -> torch.Tensor: 1331 | """Generate audio for a single text chunk.""" 1332 | # Create cache key 1333 | cache_key = hashlib.md5(f"{chunk.text}_{voice_name}_{speed}".encode()).hexdigest() 1334 | 1335 | with self.cache_lock: 1336 | if cache_key in self.audio_cache: 1337 | logger.info(f"Using cached audio for chunk {chunk.chunk_id}") 1338 | return self.audio_cache[cache_key] 1339 | 1340 | if not chunk.text or len(chunk.text.strip()) < 2: 1341 | raise ValueError(f"Chunk {chunk.chunk_id} text is too short or empty") 1342 | 1343 | lang_code = voice_name[0] 1344 | if lang_code not in self.pipelines: 1345 | raise ValueError(f"No pipeline for lang code '{lang_code}'") 1346 | 1347 | pipeline = self.pipelines[lang_code] 1348 | pack = pipeline.load_voice(voice_name) 1349 | effective_use_gpu = use_gpu and CUDA_AVAILABLE and (True in self.models) 1350 | 1351 | # Detailed device selection logging 1352 | if use_gpu and not CUDA_AVAILABLE: 1353 | logger.warning(f"Chunk {chunk.chunk_id}: GPU requested but CUDA not available - using CPU") 1354 | elif use_gpu and CUDA_AVAILABLE and (True not in self.models): 1355 | logger.warning(f"Chunk {chunk.chunk_id}: GPU requested but GPU model failed to load - using CPU") 1356 | 1357 | device_name = "GPU" if effective_use_gpu else "CPU" 1358 | logger.info(f"Generating chunk {chunk.chunk_id}/{chunk.total_chunks} with {device_name} for voice {voice_name}") 1359 | 1360 | # Log GPU memory before inference if using GPU 1361 | if effective_use_gpu: 1362 | log_gpu_status(f"before chunk {chunk.chunk_id}") 1363 | 1364 | for _, ps, _ in pipeline(chunk.text, voice_name, speed): 
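# Descriptive note: `ps` is the phoneme sequence the pipeline produced for this chunk;
# the voice pack appears to be indexed by phoneme count (len(ps) - 1) so that the
# reference style vector matches the length of the sequence passed to the model.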
1365 | ref_s = pack[len(ps) - 1] 1366 | try: 1367 | audio_tensor = self.models[effective_use_gpu](ps, ref_s, speed) 1368 | 1369 | # Log GPU memory after inference if using GPU 1370 | if effective_use_gpu: 1371 | log_gpu_status(f"after chunk {chunk.chunk_id}") 1372 | 1373 | # Cache the result 1374 | with self.cache_lock: 1375 | if len(self.audio_cache) >= CACHE_SIZE: 1376 | # Simple cache eviction 1377 | oldest_keys = list(self.audio_cache.keys())[:CACHE_SIZE // 2] 1378 | for key in oldest_keys: 1379 | del self.audio_cache[key] 1380 | self.audio_cache[cache_key] = audio_tensor 1381 | 1382 | return audio_tensor 1383 | except Exception as e: 1384 | error_device = "GPU" if effective_use_gpu else "CPU" 1385 | logger.error(f"Error on {error_device} for chunk {chunk.chunk_id}: {e}") 1386 | 1387 | if effective_use_gpu: 1388 | logger.warning(f"GPU inference failed for chunk {chunk.chunk_id}, attempting CPU fallback...") 1389 | try: 1390 | log_gpu_status("before CPU fallback") 1391 | audio_tensor = self.models[False](ps, ref_s, speed) 1392 | logger.info(f"✓ CPU fallback successful for chunk {chunk.chunk_id}") 1393 | 1394 | with self.cache_lock: 1395 | if len(self.audio_cache) >= CACHE_SIZE: 1396 | oldest_keys = list(self.audio_cache.keys())[:CACHE_SIZE // 2] 1397 | for key in oldest_keys: 1398 | del self.audio_cache[key] 1399 | self.audio_cache[cache_key] = audio_tensor 1400 | 1401 | return audio_tensor 1402 | except Exception as cpu_error: 1403 | logger.error(f"CPU fallback also failed for chunk {chunk.chunk_id}: {cpu_error}") 1404 | raise RuntimeError(f"Both GPU and CPU inference failed for chunk {chunk.chunk_id}") 1405 | else: 1406 | logger.error(f"CPU inference failed for chunk {chunk.chunk_id} - no fallback available") 1407 | raise 1408 | 1409 | raise RuntimeError(f"Kokoro TTS pipeline yielded no audio frames for chunk {chunk.chunk_id}") 1410 | 1411 | def generate_audio_robust(self, text: str, voice_name: str, speed: float = 1.0, 1412 | use_gpu: bool = True, max_chunk_length: int = DEFAULT_CHUNK_SIZE, 1413 | processing_mode: Optional[TextProcessingMode] = None) -> List[torch.Tensor]: 1414 | """ 1415 | Generate audio with robust text processing and chunking support. 1416 | 1417 | Returns list of audio tensors for each processed chunk. 1418 | """ 1419 | if len(text) > MAX_TEXT_LENGTH: 1420 | raise ValueError(f"Text too long. 
Maximum length is {MAX_TEXT_LENGTH} characters.") 1421 | 1422 | # Process text into manageable chunks 1423 | text_chunks = self.text_processor.process_text( 1424 | text, 1425 | mode=processing_mode, 1426 | max_chunk_length=max_chunk_length 1427 | ) 1428 | 1429 | if not text_chunks: 1430 | raise ValueError("No valid text chunks after processing") 1431 | 1432 | logger.info(f"Processing {len(text_chunks)} chunks for voice {voice_name}") 1433 | 1434 | audio_tensors = [] 1435 | failed_chunks = [] 1436 | 1437 | # Process chunks sequentially for now (could be parallelized) 1438 | for chunk in text_chunks: 1439 | try: 1440 | audio_tensor = self.generate_audio_for_chunk(chunk, voice_name, speed, use_gpu) 1441 | audio_tensors.append(audio_tensor) 1442 | logger.info(f"Successfully generated audio for chunk {chunk.chunk_id + 1}/{len(text_chunks)}") 1443 | except Exception as e: 1444 | logger.error(f"Failed to generate audio for chunk {chunk.chunk_id + 1}: {e}") 1445 | failed_chunks.append(chunk.chunk_id) 1446 | continue 1447 | 1448 | if not audio_tensors: 1449 | raise RuntimeError("Failed to generate audio for any text chunks") 1450 | 1451 | if failed_chunks: 1452 | logger.warning(f"Failed to process chunks: {failed_chunks}") 1453 | 1454 | return audio_tensors 1455 | 1456 | def concatenate_audio_tensors(self, audio_tensors: List[torch.Tensor], silence_duration: float = 0.2) -> torch.Tensor: 1457 | """Concatenate multiple audio tensors with configurable silence between them.""" 1458 | if len(audio_tensors) == 1: 1459 | return audio_tensors[0] 1460 | 1461 | # Adaptive silence based on content 1462 | if len(audio_tensors) > 10: 1463 | silence_duration = min(silence_duration, 0.15) # Shorter pauses for many chunks 1464 | 1465 | # Create silence tensor 1466 | silence_samples = int(SAMPLE_RATE * silence_duration) 1467 | silence = torch.zeros(silence_samples, dtype=audio_tensors[0].dtype, device=audio_tensors[0].device) 1468 | 1469 | # Concatenate with silence 1470 | result_parts = [] 1471 | for i, tensor in enumerate(audio_tensors): 1472 | result_parts.append(tensor) 1473 | if i < len(audio_tensors) - 1: # Don't add silence after last chunk 1474 | result_parts.append(silence) 1475 | 1476 | return torch.cat(result_parts, dim=0) 1477 | 1478 | def generate_audio(self, text: str, voice_name: str, speed: float = 1.0, use_gpu: bool = True) -> torch.Tensor: 1479 | """Legacy method for backward compatibility.""" 1480 | if not text or len(text.strip()) < 2: 1481 | raise ValueError("Input text is too short or empty") 1482 | 1483 | lang_code = voice_name[0] 1484 | if lang_code not in self.pipelines: 1485 | raise ValueError(f"No pipeline for lang code '{lang_code}'") 1486 | 1487 | pipeline = self.pipelines[lang_code] 1488 | pack = pipeline.load_voice(voice_name) 1489 | effective_use_gpu = use_gpu and CUDA_AVAILABLE and (True in self.models) 1490 | 1491 | # Detailed device selection logging 1492 | if use_gpu and not CUDA_AVAILABLE: 1493 | logger.warning("GPU requested but CUDA not available - using CPU") 1494 | elif use_gpu and CUDA_AVAILABLE and (True not in self.models): 1495 | logger.warning("GPU requested but GPU model failed to load - using CPU") 1496 | 1497 | device_name = "GPU" if effective_use_gpu else "CPU" 1498 | logger.info(f"Generating audio with {device_name} for voice {voice_name}") 1499 | 1500 | # Log GPU memory before inference if using GPU 1501 | if effective_use_gpu: 1502 | log_gpu_status("before audio generation") 1503 | 1504 | for _, ps, _ in pipeline(text, voice_name, speed): 1505 | ref_s = 
pack[len(ps) - 1] 1506 | try: 1507 | audio_tensor = self.models[effective_use_gpu](ps, ref_s, speed) 1508 | 1509 | # Log GPU memory after inference if using GPU 1510 | if effective_use_gpu: 1511 | log_gpu_status("after audio generation") 1512 | 1513 | return audio_tensor 1514 | except Exception as e: 1515 | error_device = "GPU" if effective_use_gpu else "CPU" 1516 | logger.error(f"Error on {error_device}: {e}") 1517 | 1518 | if effective_use_gpu: 1519 | logger.warning("GPU inference failed, attempting CPU fallback...") 1520 | try: 1521 | log_gpu_status("before CPU fallback") 1522 | audio_tensor = self.models[False](ps, ref_s, speed) 1523 | logger.info("✓ CPU fallback successful") 1524 | return audio_tensor 1525 | except Exception as cpu_error: 1526 | logger.error(f"CPU fallback also failed: {cpu_error}") 1527 | raise RuntimeError("Both GPU and CPU inference failed") 1528 | else: 1529 | logger.error("CPU inference failed - no fallback available") 1530 | raise 1531 | 1532 | raise RuntimeError("Kokoro TTS pipeline yielded no audio frames") 1533 | 1534 | # Initialize singleton model manager 1535 | enhanced_model_manager = EnhancedModelManager() 1536 | 1537 | # --- Flask App Setup -------------------------------------------------------- 1538 | 1539 | app = Flask(__name__) 1540 | if ENABLE_CORS: 1541 | CORS(app, resources={r"/*": {"origins": "*"}}) # Allow all origins for read-aloud compatibility 1542 | logger.info("CORS enabled for all routes") 1543 | else: 1544 | logger.info("CORS disabled") 1545 | 1546 | # --- Authentication Middleware ---------------------------------------------- 1547 | 1548 | def check_auth(f): 1549 | @wraps(f) 1550 | def decorated(*args, **kwargs): 1551 | if LOG_AUTH_ATTEMPTS: 1552 | auth_header = request.headers.get('Authorization') 1553 | if auth_header: 1554 | logger.info(f"Auth header: {auth_header[:30]}...") 1555 | else: 1556 | logger.info("No authentication provided") 1557 | return f(*args, **kwargs) 1558 | return decorated 1559 | 1560 | # --- Helper Functions ------------------------------------------------------- 1561 | 1562 | def get_kokoro_voice(voice_id: str) -> str: 1563 | if voice_id in KOKORO_VOICES: 1564 | return voice_id 1565 | logger.warning(f"Unknown voice '{voice_id}', using default '{DEFAULT_VOICE}'") 1566 | return DEFAULT_VOICE 1567 | 1568 | def tensor_to_numpy(audio_tensor: torch.Tensor) -> np.ndarray: 1569 | """Convert PyTorch tensor to NumPy array.""" 1570 | audio_numpy = audio_tensor.cpu().numpy().squeeze() 1571 | if audio_numpy.dtype != np.float32: 1572 | audio_numpy = audio_numpy.astype(np.float32) 1573 | max_val = np.max(np.abs(audio_numpy)) 1574 | if max_val > 1.0: 1575 | audio_numpy /= max_val 1576 | return audio_numpy 1577 | 1578 | def numpy_to_format(audio_numpy: np.ndarray, sr: int, audio_format: str) -> bytes: 1579 | """Convert NumPy audio array to specified format.""" 1580 | audio_int16 = (audio_numpy * 32767).astype(np.int16) 1581 | if audio_format == 'wav': 1582 | buffer = io.BytesIO() 1583 | wavfile.write(buffer, sr, audio_int16) 1584 | return buffer.getvalue() 1585 | elif audio_format == 'mp3': 1586 | audio_segment = AudioSegment( 1587 | data=audio_int16.tobytes(), 1588 | sample_width=2, 1589 | frame_rate=sr, 1590 | channels=1 1591 | ) 1592 | buffer = io.BytesIO() 1593 | audio_segment.export(buffer, format="mp3", bitrate="192k") 1594 | return buffer.getvalue() 1595 | else: 1596 | raise ValueError(f"Unsupported audio format: {audio_format}") 1597 | 1598 | def calculate_audio_duration(audio_numpy: np.ndarray, sr: int) -> float: 1599 | 
"""Calculate audio duration in seconds.""" 1600 | return len(audio_numpy) / sr 1601 | 1602 | def play_audio_windows_with_interrupt(audio_numpy: np.ndarray, sr: int, session_id: str): 1603 | """Play audio on Windows with interrupt capability.""" 1604 | try: 1605 | import winsound 1606 | 1607 | # Calculate duration and start playback session 1608 | duration = calculate_audio_duration(audio_numpy, sr) 1609 | playback_controller.start_playback(session_id, duration) 1610 | 1611 | # Convert to WAV bytes 1612 | wav_bytes = numpy_to_format(audio_numpy, sr, 'wav') 1613 | 1614 | # Check for interruption before playing 1615 | if playback_controller.should_stop: 1616 | playback_controller.finish_playback() 1617 | return 1618 | 1619 | # Play audio (this is blocking) 1620 | winsound.PlaySound(wav_bytes, winsound.SND_MEMORY | winsound.SND_NODEFAULT) 1621 | 1622 | # Mark as finished if not interrupted 1623 | if not playback_controller.should_stop: 1624 | playback_controller.finish_playback() 1625 | logger.info("Audio playback completed successfully") 1626 | else: 1627 | logger.info("Audio playback was interrupted") 1628 | 1629 | except ImportError: 1630 | logger.error("winsound module not found") 1631 | playback_controller.finish_playback() 1632 | except Exception as e: 1633 | logger.error(f"Error playing audio: {e}") 1634 | playback_controller.finish_playback() 1635 | 1636 | def apply_effects_pipeline(audio_numpy: np.ndarray, effects_params: Dict, effect_order: List[str]) -> np.ndarray: 1637 | """Apply effects pipeline to audio with proper zero-default handling.""" 1638 | for effect in effect_order: 1639 | if effect == 'volume': 1640 | volume_params = effects_params.get('volume', {}) 1641 | gain = volume_params.get('gain', 1.0) 1642 | gain_db = volume_params.get('gain_db') 1643 | # Only apply if gain != 1.0 or gain_db is specified 1644 | if gain != 1.0 or gain_db is not None: 1645 | audio_numpy = apply_volume(audio_numpy, gain, gain_db) 1646 | 1647 | elif effect == 'equalizer': 1648 | eq_params = effects_params.get('equalizer', {}) 1649 | bands = eq_params.get('bands', []) 1650 | # Only apply if bands are provided 1651 | if bands: 1652 | audio_numpy = apply_eq(audio_numpy, SAMPLE_RATE, bands) 1653 | 1654 | elif effect == 'compression': 1655 | comp_params = effects_params.get('compression', {}) 1656 | threshold_db = comp_params.get('threshold_db', 0.0) 1657 | ratio = comp_params.get('ratio', 1.0) 1658 | attack_ms = comp_params.get('attack_ms', 0.1) 1659 | release_ms = comp_params.get('release_ms', 10.0) 1660 | # Only apply if compression parameters indicate it should be used 1661 | if threshold_db < 0.0 and ratio > 1.0: 1662 | audio_numpy = apply_compression(audio_numpy, SAMPLE_RATE, threshold_db, ratio, attack_ms, release_ms) 1663 | 1664 | elif effect == 'distortion': 1665 | dist_params = effects_params.get('distortion', {}) 1666 | drive_db = dist_params.get('drive_db', 0.0) 1667 | dist_type = dist_params.get('type', 'tanh') 1668 | mix = dist_params.get('mix', 0.0) 1669 | # Only apply if drive_db > 0 and mix > 0 1670 | if drive_db > 0.0 and mix > 0.0: 1671 | audio_numpy = apply_distortion(audio_numpy, drive_db, dist_type, mix) 1672 | 1673 | elif effect == 'pitch': 1674 | pitch_params = effects_params.get('pitch', {}) 1675 | target_note = pitch_params.get('target_note') 1676 | semitone_shift = float(pitch_params.get('semitone_shift', 0.0)) 1677 | preserve_formants = pitch_params.get('preserve_formants', False) 1678 | # Only apply if target_note is specified or semitone_shift != 0 1679 | if target_note or 
abs(semitone_shift) > 0.01: 1680 | audio_numpy = shift_to_target(audio_numpy, SAMPLE_RATE, target_note, semitone_shift, preserve_formants) 1681 | 1682 | elif effect == 'formant': 1683 | formant_params = effects_params.get('formant', {}) 1684 | shift_percent = float(formant_params.get('shift_percent', 0.0)) 1685 | scale = float(formant_params.get('scale', 1.0)) 1686 | # Only apply if shift_percent != 0 1687 | if abs(shift_percent) > 0.01: 1688 | shift_factor = 1.0 + (np.clip(shift_percent, -100.0, 100.0) / 200.0) 1689 | audio_numpy = apply_formant_shift(audio_numpy, SAMPLE_RATE, shift_factor, scale) 1690 | 1691 | elif effect == 'voice_character': 1692 | char_params = effects_params.get('voice_character', {}) 1693 | char_type = char_params.get('type', 'none') 1694 | char_custom_params = char_params.get('params', {}) 1695 | # Only apply if character type is not 'none' 1696 | if char_type != 'none': 1697 | audio_numpy = apply_voice_character(audio_numpy, SAMPLE_RATE, char_type, char_custom_params) 1698 | 1699 | elif effect == 'chorus': 1700 | chorus_params = effects_params.get('chorus', {}) 1701 | delay_ms = chorus_params.get('delay_ms', 0.0) 1702 | depth = chorus_params.get('depth', 0.0) 1703 | rate_hz = chorus_params.get('rate_hz', 0.0) 1704 | mix = chorus_params.get('mix', 0.0) 1705 | # Only apply if depth > 0 and mix > 0 1706 | if depth > 0.0 and mix > 0.0: 1707 | audio_numpy = apply_chorus(audio_numpy, SAMPLE_RATE, delay_ms, depth, rate_hz, mix) 1708 | 1709 | elif effect == 'flanger': 1710 | flanger_params = effects_params.get('flanger', {}) 1711 | delay_ms = flanger_params.get('delay_ms', 0.0) 1712 | depth = flanger_params.get('depth', 0.0) 1713 | rate_hz = flanger_params.get('rate_hz', 0.0) 1714 | feedback = flanger_params.get('feedback', 0.0) 1715 | mix = flanger_params.get('mix', 0.0) 1716 | # Only apply if depth > 0 and mix > 0 1717 | if depth > 0.0 and mix > 0.0: 1718 | audio_numpy = apply_flanger(audio_numpy, SAMPLE_RATE, delay_ms, depth, rate_hz, feedback, mix) 1719 | 1720 | elif effect == 'reverb': 1721 | reverb_params = effects_params.get('reverb', {}) 1722 | room_size = float(reverb_params.get('room_size_percent', 0.0)) / 100.0 1723 | damping = float(reverb_params.get('damping_percent', 50.0)) / 100.0 1724 | pre_delay_ms = float(reverb_params.get('pre_delay_ms', 0.0)) 1725 | stereo_width = float(reverb_params.get('stereo_width', 0.0)) 1726 | # Only apply if room_size > 0 1727 | if room_size > 0.01: 1728 | audio_numpy = apply_reverb(audio_numpy, SAMPLE_RATE, room_size, damping, pre_delay_ms, stereo_width) 1729 | else: 1730 | logger.warning(f"Unknown effect in order: {effect}") 1731 | 1732 | return audio_numpy 1733 | 1734 | # --- API Endpoints ---------------------------------------------------------- 1735 | 1736 | @app.route('/ping', methods=['GET']) 1737 | def ping_route(): 1738 | return jsonify({"status": "ok", "timestamp": datetime.utcnow().isoformat() + 'Z'}) 1739 | 1740 | @app.route('/v1/audio/speech', methods=['POST']) 1741 | @check_auth 1742 | def create_speech_route(): 1743 | """Standard speech generation endpoint with clean zero-default effects.""" 1744 | try: 1745 | data = request.get_json() 1746 | if not data: 1747 | return jsonify({"error": "No JSON data provided"}), 400 1748 | 1749 | text = data.get('input', '').strip() 1750 | if not text: 1751 | return jsonify({"error": "Missing or empty 'input' field"}), 400 1752 | 1753 | voice_id = data.get('voice', DEFAULT_VOICE) 1754 | speed = float(data.get('speed', 1.0)) 1755 | response_format = data.get('response_format', 
'mp3').lower() 1756 | use_gpu_flag = data.get('use_gpu', True) 1757 | 1758 | # Default effect order - but effects won't apply unless explicitly configured 1759 | effect_order = data.get('effects', {}).get('order', DEFAULT_EFFECT_ORDER) 1760 | 1761 | if not (0.25 <= speed <= 4.0): 1762 | return jsonify({"error": "Speed must be between 0.25 and 4.0"}), 400 1763 | 1764 | supported_formats = ['mp3', 'wav'] 1765 | if response_format not in supported_formats: 1766 | logger.warning(f"Unsupported format '{response_format}', defaulting to mp3") 1767 | response_format = 'mp3' 1768 | 1769 | kokoro_voice_id = get_kokoro_voice(voice_id) 1770 | logger.info(f"Request: voice={kokoro_voice_id}, speed={speed}, format={response_format}, text='{text[:50]}...'") 1771 | 1772 | # Generate base audio 1773 | audio_tensor = enhanced_model_manager.generate_audio(text, kokoro_voice_id, speed, use_gpu_flag) 1774 | audio_numpy = tensor_to_numpy(audio_tensor) 1775 | 1776 | # Apply effects pipeline (only applies effects that are explicitly configured) 1777 | effects_params = data.get('effects', {}) 1778 | audio_numpy = apply_effects_pipeline(audio_numpy, effects_params, effect_order) 1779 | 1780 | # Final normalization 1781 | max_val = np.max(np.abs(audio_numpy)) 1782 | if max_val > 1e-6: # Avoid division by zero or tiny numbers 1783 | audio_numpy = audio_numpy / max_val 1784 | 1785 | # Convert to desired format 1786 | audio_bytes = numpy_to_format(audio_numpy, SAMPLE_RATE, response_format) 1787 | mime_type = 'audio/mpeg' if response_format == 'mp3' else f'audio/{response_format}' 1788 | 1789 | return Response( 1790 | audio_bytes, 1791 | mimetype=mime_type, 1792 | headers={ 1793 | 'Content-Type': mime_type, 1794 | 'Content-Length': str(len(audio_bytes)), 1795 | 'Access-Control-Allow-Origin': '*', # For read-aloud compatibility 1796 | 'Access-Control-Allow-Methods': 'POST, GET, OPTIONS', 1797 | 'Access-Control-Allow-Headers': 'Content-Type, Authorization' 1798 | } 1799 | ) 1800 | except ValueError as ve: 1801 | logger.error(f"ValueError: {ve}") 1802 | return jsonify({"error": str(ve)}), 400 1803 | except RuntimeError as re: 1804 | logger.error(f"RuntimeError: {re}") 1805 | return jsonify({"error": str(re)}), 500 1806 | except Exception as e: 1807 | logger.error(f"Unexpected error: {e}", exc_info=True) 1808 | return jsonify({"error": "Internal server error"}), 500 1809 | 1810 | @app.route('/v1/audio/speech/robust', methods=['POST']) 1811 | @check_auth 1812 | def create_speech_robust_route(): 1813 | """Enhanced speech generation endpoint with robust text processing.""" 1814 | try: 1815 | data = request.get_json() 1816 | if not data: 1817 | return jsonify({"error": "No JSON data provided"}), 400 1818 | 1819 | text = data.get('input', '').strip() 1820 | if not text: 1821 | return jsonify({"error": "Missing or empty 'input' field"}), 400 1822 | 1823 | voice_id = data.get('voice', DEFAULT_VOICE) 1824 | speed = float(data.get('speed', 1.0)) 1825 | response_format = data.get('response_format', 'mp3').lower() 1826 | use_gpu_flag = data.get('use_gpu', True) 1827 | max_chunk_length = int(data.get('max_chunk_length', DEFAULT_CHUNK_SIZE)) 1828 | silence_between_chunks = float(data.get('silence_between_chunks', 0.2)) 1829 | processing_mode = data.get('processing_mode') 1830 | 1831 | # Convert string mode to enum 1832 | if processing_mode: 1833 | try: 1834 | processing_mode = TextProcessingMode(processing_mode.lower()) 1835 | except ValueError: 1836 | processing_mode = None 1837 | 1838 | effect_order = data.get('effects', 
{}).get('order', DEFAULT_EFFECT_ORDER) 1839 | 1840 | if not (0.25 <= speed <= 4.0): 1841 | return jsonify({"error": "Speed must be between 0.25 and 4.0"}), 400 1842 | 1843 | if not (MIN_CHUNK_SIZE <= max_chunk_length <= 1000): 1844 | return jsonify({"error": f"max_chunk_length must be between {MIN_CHUNK_SIZE} and 1000"}), 400 1845 | 1846 | supported_formats = ['mp3', 'wav'] 1847 | if response_format not in supported_formats: 1848 | logger.warning(f"Unsupported format '{response_format}', defaulting to mp3") 1849 | response_format = 'mp3' 1850 | 1851 | kokoro_voice_id = get_kokoro_voice(voice_id) 1852 | 1853 | logger.info(f"Robust request: voice={kokoro_voice_id}, speed={speed}, " 1854 | f"format={response_format}, max_chunk={max_chunk_length}, " 1855 | f"mode={processing_mode}, text='{text[:50]}...'") 1856 | 1857 | # Generate audio for all chunks using robust processing 1858 | audio_tensors = enhanced_model_manager.generate_audio_robust( 1859 | text, kokoro_voice_id, speed, use_gpu_flag, max_chunk_length, processing_mode 1860 | ) 1861 | 1862 | # Concatenate all audio chunks 1863 | combined_audio = enhanced_model_manager.concatenate_audio_tensors( 1864 | audio_tensors, silence_between_chunks 1865 | ) 1866 | 1867 | # Convert to numpy and apply effects 1868 | audio_numpy = tensor_to_numpy(combined_audio) 1869 | effects_params = data.get('effects', {}) 1870 | audio_numpy = apply_effects_pipeline(audio_numpy, effects_params, effect_order) 1871 | 1872 | # Final normalization 1873 | max_val = np.max(np.abs(audio_numpy)) 1874 | if max_val > 1e-6: 1875 | audio_numpy = audio_numpy / max_val 1876 | 1877 | # Convert to desired format 1878 | audio_bytes = numpy_to_format(audio_numpy, SAMPLE_RATE, response_format) 1879 | mime_type = 'audio/mpeg' if response_format == 'mp3' else f'audio/{response_format}' 1880 | 1881 | return Response( 1882 | audio_bytes, 1883 | mimetype=mime_type, 1884 | headers={ 1885 | 'Content-Type': mime_type, 1886 | 'Content-Length': str(len(audio_bytes)), 1887 | 'X-Chunks-Processed': str(len(audio_tensors)), 1888 | 'X-Processing-Mode': processing_mode.value if processing_mode else 'auto', 1889 | 'Access-Control-Allow-Origin': '*', 1890 | 'Access-Control-Allow-Methods': 'POST, GET, OPTIONS', 1891 | 'Access-Control-Allow-Headers': 'Content-Type, Authorization' 1892 | } 1893 | ) 1894 | 1895 | except ValueError as ve: 1896 | logger.error(f"ValueError: {ve}") 1897 | return jsonify({"error": str(ve)}), 400 1898 | except RuntimeError as re: 1899 | logger.error(f"RuntimeError: {re}") 1900 | return jsonify({"error": str(re)}), 500 1901 | except Exception as e: 1902 | logger.error(f"Unexpected error: {e}", exc_info=True) 1903 | return jsonify({"error": "Internal server error"}), 500 1904 | 1905 | @app.route('/v1/audio/speech/stream', methods=['POST']) 1906 | @check_auth 1907 | def create_speech_stream_route(): 1908 | """Streaming speech generation endpoint.""" 1909 | try: 1910 | data = request.get_json() 1911 | if not data: 1912 | return jsonify({"error": "No JSON data provided"}), 400 1913 | 1914 | text = data.get('input', '').strip() 1915 | if not text: 1916 | return jsonify({"error": "Missing or empty 'input' field"}), 400 1917 | 1918 | voice_id = data.get('voice', DEFAULT_VOICE) 1919 | speed = float(data.get('speed', 1.0)) 1920 | response_format = data.get('response_format', 'mp3').lower() 1921 | use_gpu_flag = data.get('use_gpu', True) 1922 | max_chunk_length = int(data.get('max_chunk_length', DEFAULT_CHUNK_SIZE)) 1923 | processing_mode = data.get('processing_mode') 1924 | 1925 | if 
processing_mode: 1926 | try: 1927 | processing_mode = TextProcessingMode(processing_mode.lower()) 1928 | except ValueError: 1929 | processing_mode = None 1930 | 1931 | if not (0.25 <= speed <= 4.0): 1932 | return jsonify({"error": "Speed must be between 0.25 and 4.0"}), 400 1933 | 1934 | kokoro_voice_id = get_kokoro_voice(voice_id) 1935 | 1936 | def generate_audio_stream(): 1937 | try: 1938 | # Process text into chunks 1939 | text_chunks = enhanced_model_manager.text_processor.process_text( 1940 | text, mode=processing_mode, max_chunk_length=max_chunk_length 1941 | ) 1942 | 1943 | effects_params = data.get('effects', {}) 1944 | effect_order = effects_params.get('order', DEFAULT_EFFECT_ORDER) 1945 | 1946 | for chunk in text_chunks: 1947 | try: 1948 | # Generate audio for chunk 1949 | audio_tensor = enhanced_model_manager.generate_audio_for_chunk( 1950 | chunk, kokoro_voice_id, speed, use_gpu_flag 1951 | ) 1952 | 1953 | # Convert to numpy and apply effects 1954 | audio_numpy = tensor_to_numpy(audio_tensor) 1955 | audio_numpy = apply_effects_pipeline(audio_numpy, effects_params, effect_order) 1956 | 1957 | # Final normalization for the chunk 1958 | max_val = np.max(np.abs(audio_numpy)) 1959 | if max_val > 1e-6: 1960 | audio_numpy = audio_numpy / max_val 1961 | 1962 | # Convert to desired format 1963 | audio_bytes = numpy_to_format(audio_numpy, SAMPLE_RATE, response_format) 1964 | 1965 | yield audio_bytes 1966 | 1967 | except Exception as e: 1968 | logger.error(f"Error processing chunk {chunk.chunk_id}: {e}") 1969 | continue 1970 | 1971 | except Exception as e: 1972 | logger.error(f"Streaming error: {e}") 1973 | yield b'' # Empty response on error 1974 | 1975 | mime_type = 'audio/mpeg' if response_format == 'mp3' else f'audio/{response_format}' 1976 | 1977 | return Response( 1978 | stream_with_context(generate_audio_stream()), 1979 | mimetype=mime_type, 1980 | headers={ 1981 | 'Content-Type': mime_type, 1982 | 'Transfer-Encoding': 'chunked', 1983 | 'Access-Control-Allow-Origin': '*', 1984 | 'Access-Control-Allow-Methods': 'POST, GET, OPTIONS', 1985 | 'Access-Control-Allow-Headers': 'Content-Type, Authorization' 1986 | } 1987 | ) 1988 | 1989 | except Exception as e: 1990 | logger.error(f"Streaming setup error: {e}", exc_info=True) 1991 | return jsonify({"error": "Internal server error"}), 500 1992 | 1993 | @app.route('/v1/audio/speech/play', methods=['POST']) 1994 | @check_auth 1995 | def play_speech_route(): 1996 | """Play speech locally with interrupt capability (Windows only).""" 1997 | try: 1998 | data = request.get_json() 1999 | if not data: 2000 | return jsonify({"error": "No JSON data provided"}), 400 2001 | 2002 | text = data.get('input', '').strip() 2003 | if not text: 2004 | return jsonify({"error": "Missing or empty 'input' field"}), 400 2005 | 2006 | voice_id = data.get('voice', DEFAULT_VOICE) 2007 | speed = float(data.get('speed', 1.0)) 2008 | use_gpu_flag = data.get('use_gpu', True) 2009 | use_robust = data.get('use_robust_processing', False) 2010 | 2011 | # Generate unique session ID 2012 | session_id = hashlib.md5(f"{text}_{voice_id}_{speed}_{time.time()}".encode()).hexdigest()[:8] 2013 | 2014 | effect_order = data.get('effects', {}).get('order', DEFAULT_EFFECT_ORDER) 2015 | 2016 | if not (0.25 <= speed <= 4.0): 2017 | return jsonify({"error": "Speed must be between 0.25 and 4.0"}), 400 2018 | 2019 | kokoro_voice_id = get_kokoro_voice(voice_id) 2020 | logger.info(f"Play request: voice={kokoro_voice_id}, speed={speed}, robust={use_robust}, session={session_id}, 
text='{text[:50]}...'") 2021 | 2022 | if use_robust: 2023 | # Use robust processing 2024 | max_chunk_length = int(data.get('max_chunk_length', DEFAULT_CHUNK_SIZE)) 2025 | processing_mode = data.get('processing_mode') 2026 | if processing_mode: 2027 | try: 2028 | processing_mode = TextProcessingMode(processing_mode.lower()) 2029 | except ValueError: 2030 | processing_mode = None 2031 | 2032 | audio_tensors = enhanced_model_manager.generate_audio_robust( 2033 | text, kokoro_voice_id, speed, use_gpu_flag, max_chunk_length, processing_mode 2034 | ) 2035 | combined_audio = enhanced_model_manager.concatenate_audio_tensors(audio_tensors, 0.2) 2036 | audio_numpy = tensor_to_numpy(combined_audio) 2037 | else: 2038 | # Use legacy method 2039 | audio_tensor = enhanced_model_manager.generate_audio(text, kokoro_voice_id, speed, use_gpu_flag) 2040 | audio_numpy = tensor_to_numpy(audio_tensor) 2041 | 2042 | # Apply effects 2043 | effects_params = data.get('effects', {}) 2044 | audio_numpy = apply_effects_pipeline(audio_numpy, effects_params, effect_order) 2045 | 2046 | # Final normalization 2047 | max_val = np.max(np.abs(audio_numpy)) 2048 | if max_val > 1e-6: 2049 | audio_numpy = audio_numpy / max_val 2050 | 2051 | # Start playback in a separate thread to avoid blocking 2052 | def play_in_thread(): 2053 | play_audio_windows_with_interrupt(audio_numpy, SAMPLE_RATE, session_id) 2054 | 2055 | playback_thread = threading.Thread(target=play_in_thread, daemon=True) 2056 | playback_thread.start() 2057 | 2058 | return jsonify({ 2059 | "status": "success", 2060 | "message": "Audio playback started", 2061 | "session_id": session_id, 2062 | "robust_processing": use_robust, 2063 | "duration": calculate_audio_duration(audio_numpy, SAMPLE_RATE) 2064 | }), 200 2065 | 2066 | except ValueError as ve: 2067 | logger.error(f"ValueError: {ve}") 2068 | return jsonify({"error": str(ve)}), 400 2069 | except RuntimeError as re: 2070 | logger.error(f"RuntimeError: {re}") 2071 | return jsonify({"error": str(re)}), 500 2072 | except Exception as e: 2073 | logger.error(f"Unexpected error: {e}", exc_info=True) 2074 | return jsonify({"error": "Internal server error"}), 500 2075 | 2076 | @app.route('/v1/audio/speech/stop', methods=['POST']) 2077 | @check_auth 2078 | def stop_speech_route(): 2079 | """Stop/interrupt current audio playback.""" 2080 | try: 2081 | data = request.get_json() 2082 | session_id = data.get('session_id') if data else None 2083 | 2084 | success = playback_controller.stop_playback(session_id) 2085 | status = playback_controller.get_status() 2086 | 2087 | if success: 2088 | return jsonify({ 2089 | "status": "success", 2090 | "message": "Audio playback stopped", 2091 | "session_id": status["session_id"], 2092 | "interrupted_at": status["interrupted_at"], 2093 | "total_duration": status["total_duration"] 2094 | }), 200 2095 | else: 2096 | return jsonify({ 2097 | "status": "error", 2098 | "message": "No active playback session or wrong session ID", 2099 | "current_session": status["session_id"] 2100 | }), 400 2101 | 2102 | except Exception as e: 2103 | logger.error(f"Stop playback error: {e}", exc_info=True) 2104 | return jsonify({"error": "Internal server error"}), 500 2105 | 2106 | @app.route('/v1/audio/speech/status', methods=['GET']) 2107 | @check_auth 2108 | def playback_status_route(): 2109 | """Get current playback status.""" 2110 | try: 2111 | status = playback_controller.get_status() 2112 | return jsonify({ 2113 | "status": "success", 2114 | "playback": status 2115 | }), 200 2116 | 2117 | except Exception as 
e: 2118 | logger.error(f"Status check error: {e}", exc_info=True) 2119 | return jsonify({"error": "Internal server error"}), 500 2120 | 2121 | @app.route('/v1/models', methods=['GET']) 2122 | @check_auth 2123 | def list_models_route(): 2124 | return jsonify({ 2125 | "object": "list", 2126 | "data": [{ 2127 | "id": "kokoro-tts-1", 2128 | "object": "model", 2129 | "created": int(datetime.now().timestamp()), 2130 | "owned_by": "kokoro-project" 2131 | }] 2132 | }) 2133 | 2134 | @app.route('/v1/voices', methods=['GET']) 2135 | @check_auth 2136 | def list_voices_route(): 2137 | voices_list = [] 2138 | for voice_id, info in KOKORO_VOICES.items(): 2139 | voices_list.append({ 2140 | "id": voice_id, 2141 | "name": info.get('description', voice_id), 2142 | "gender": info.get('gender', 'unknown'), 2143 | "language_code": info.get('lang', 'unknown'), 2144 | "model_id": "kokoro-tts-1" 2145 | }) 2146 | return jsonify({"object": "list", "data": voices_list}) 2147 | 2148 | @app.route('/v1/effects', methods=['GET']) 2149 | @check_auth 2150 | def list_effects_route(): 2151 | return jsonify({ 2152 | "available_effects": { 2153 | "volume": { 2154 | "gain": {"type": "float", "range": [0.0, 2.0], "default": 1.0, "description": "Linear gain multiplier"}, 2155 | "gain_db": {"type": "float", "range": [-60.0, 12.0], "default": None, "description": "Gain in dB, overrides gain"} 2156 | }, 2157 | "pitch": { 2158 | "target_note": { 2159 | "type": "string", 2160 | "description": "Musical note (e.g., 'C4', 'A#5') or null", 2161 | "available_notes": available_notes() 2162 | }, 2163 | "semitone_shift": {"type": "float", "range": [-12, 12], "default": 0.0, "description": "Pitch shift in semitones"}, 2164 | "preserve_formants": {"type": "boolean", "default": False, "description": "Preserve formants during pitch shift"} 2165 | }, 2166 | "voice_character": { 2167 | "type": {"type": "string", "options": ["none", "child", "robot", "deep", "whisper", "alien", "monster", "echo"], "default": "none", "description": "Voice character transformation"}, 2168 | "params": { 2169 | "pitch_shift": {"type": "float", "range": [-12, 12], "description": "Custom pitch shift for character"}, 2170 | "formant_shift": {"type": "float", "range": [0.5, 1.5], "description": "Custom formant shift factor"}, 2171 | "speed": {"type": "float", "range": [0.5, 2.0], "description": "Custom time stretch factor"}, 2172 | "carrier_freq": {"type": "float", "range": [50, 200], "description": "Carrier frequency for robot effect"}, 2173 | "distortion_factor": {"type": "float", "range": [0.5, 5.0], "description": "Distortion intensity for robot/monster"}, 2174 | "noise_level": {"type": "float", "range": [0.0, 0.1], "description": "Noise level for whisper"}, 2175 | "signal_level": {"type": "float", "range": [0.0, 1.0], "description": "Signal level for whisper"}, 2176 | "compression_factor": {"type": "float", "range": [0.5, 2.0], "description": "Compression intensity for whisper"}, 2177 | "flanger_delay_ms": {"type": "float", "range": [0.1, 10.0], "description": "Flanger delay for alien"}, 2178 | "distortion_drive": {"type": "float", "range": [0.0, 36.0], "description": "Distortion drive for monster"}, 2179 | "pre_delay_ms": {"type": "float", "range": [0.0, 100.0], "description": "Pre-delay for echo"} 2180 | } 2181 | }, 2182 | "equalizer": { 2183 | "bands": { 2184 | "type": "array", 2185 | "items": { 2186 | "frequency_hz": {"type": "float", "range": [20, 20000], "description": "Center frequency in Hz"}, 2187 | "gain_db": {"type": "float", "range": [-24, 24], 
"description": "Gain in dB"}, 2188 | "q_factor": {"type": "float", "range": [0.1, 10], "default": 1.0, "description": "Bandwidth control"}, 2189 | "type": {"type": "string", "options": ["peak", "low_shelf", "high_shelf"], "description": "Filter type"} 2190 | } 2191 | } 2192 | }, 2193 | "reverb": { 2194 | "room_size_percent": {"type": "float", "range": [0, 100], "default": 0.0, "description": "Room size percentage"}, 2195 | "damping_percent": {"type": "float", "range": [0, 100], "default": 50.0, "description": "High frequency damping percentage"}, 2196 | "pre_delay_ms": {"type": "float", "range": [0, 100], "default": 0.0, "description": "Pre-delay in milliseconds"}, 2197 | "stereo_width": {"type": "float", "range": [0, 1], "default": 0.0, "description": "Stereo width for reverb"} 2198 | }, 2199 | "formant": { 2200 | "shift_percent": {"type": "float", "range": [-100, 100], "default": 0.0, "description": "Formant shift percentage"}, 2201 | "scale": {"type": "float", "range": [0.5, 2.0], "default": 1.0, "description": "Intensity of formant shift"} 2202 | }, 2203 | "distortion": { 2204 | "drive_db": {"type": "float", "range": [0, 36], "default": 0.0, "description": "Drive gain in dB"}, 2205 | "type": {"type": "string", "options": ["soft", "hard", "tanh"], "default": "tanh", "description": "Clipping type"}, 2206 | "mix": {"type": "float", "range": [0, 1], "default": 0.0, "description": "Dry/wet mix"} 2207 | }, 2208 | "chorus": { 2209 | "delay_ms": {"type": "float", "range": [5, 50], "default": 0.0, "description": "Base delay in milliseconds"}, 2210 | "depth": {"type": "float", "range": [0, 0.1], "default": 0.0, "description": "Modulation depth"}, 2211 | "rate_hz": {"type": "float", "range": [0.1, 5], "default": 0.0, "description": "Modulation rate in Hz"}, 2212 | "mix": {"type": "float", "range": [0, 1], "default": 0.0, "description": "Dry/wet mix"} 2213 | }, 2214 | "flanger": { 2215 | "delay_ms": {"type": "float", "range": [0.1, 10], "default": 0.0, "description": "Base delay in milliseconds"}, 2216 | "depth": {"type": "float", "range": [0, 0.05], "default": 0.0, "description": "Modulation depth"}, 2217 | "rate_hz": {"type": "float", "range": [0.1, 10], "default": 0.0, "description": "LFO rate in Hz"}, 2218 | "feedback": {"type": "float", "range": [0, 0.9], "default": 0.0, "description": "Feedback amount"}, 2219 | "mix": {"type": "float", "range": [0, 1], "default": 0.0, "description": "Dry/wet mix"} 2220 | }, 2221 | "compression": { 2222 | "threshold_db": {"type": "float", "range": [-60, 0], "default": 0.0, "description": "Threshold in dB"}, 2223 | "ratio": {"type": "float", "range": [1, 20], "default": 1.0, "description": "Compression ratio"}, 2224 | "attack_ms": {"type": "float", "range": [0.1, 100], "default": 0.1, "description": "Attack time in milliseconds"}, 2225 | "release_ms": {"type": "float", "range": [10, 1000], "default": 10.0, "description": "Release time in milliseconds"} 2226 | }, 2227 | "order": { 2228 | "type": "array", 2229 | "items": {"type": "string", "options": ["volume", "equalizer", "compression", "distortion", "pitch", "formant", "voice_character", "chorus", "flanger", "reverb"]}, 2230 | "default": ["volume", "equalizer", "compression", "distortion", "pitch", "formant", "voice_character", "chorus", "flanger", "reverb"], 2231 | "description": "Order of effect application" 2232 | } 2233 | } 2234 | }) 2235 | 2236 | @app.route('/v1/text/process', methods=['POST']) 2237 | @check_auth 2238 | def process_text_route(): 2239 | """Text processing endpoint for testing and 
debugging.""" 2240 | try: 2241 | data = request.get_json() 2242 | if not data: 2243 | return jsonify({"error": "No JSON data provided"}), 400 2244 | 2245 | text = data.get('input', '').strip() 2246 | if not text: 2247 | return jsonify({"error": "Missing or empty 'input' field"}), 400 2248 | 2249 | max_chunk_length = int(data.get('max_chunk_length', DEFAULT_CHUNK_SIZE)) 2250 | processing_mode = data.get('processing_mode') 2251 | 2252 | if processing_mode: 2253 | try: 2254 | processing_mode = TextProcessingMode(processing_mode.lower()) 2255 | except ValueError: 2256 | processing_mode = None 2257 | 2258 | # Process text 2259 | text_chunks = enhanced_model_manager.text_processor.process_text( 2260 | text, mode=processing_mode, max_chunk_length=max_chunk_length 2261 | ) 2262 | 2263 | # Format response 2264 | chunks_data = [] 2265 | for chunk in text_chunks: 2266 | chunks_data.append({ 2267 | "chunk_id": chunk.chunk_id, 2268 | "text": chunk.text, 2269 | "char_count": chunk.char_count, 2270 | "processing_time": chunk.processing_time 2271 | }) 2272 | 2273 | return jsonify({ 2274 | "original_text": text, 2275 | "processing_mode": processing_mode.value if processing_mode else "auto", 2276 | "total_chunks": len(text_chunks), 2277 | "chunks": chunks_data 2278 | }) 2279 | 2280 | except Exception as e: 2281 | logger.error(f"Text processing error: {e}", exc_info=True) 2282 | return jsonify({"error": "Internal server error"}), 500 2283 | 2284 | @app.route('/health', methods=['GET']) 2285 | def health_check_route(): 2286 | # Get GPU memory info for health check 2287 | gpu_memory = get_gpu_memory_info() 2288 | 2289 | gpu_status = { 2290 | "cuda_available": CUDA_AVAILABLE, 2291 | "device_count": GPU_DEVICE_COUNT, 2292 | "device_name": GPU_DEVICE_NAME, 2293 | "memory": { 2294 | "total_gb": gpu_memory['total'], 2295 | "allocated_gb": gpu_memory['allocated'], 2296 | "free_gb": gpu_memory['free'] 2297 | } if CUDA_AVAILABLE else None, 2298 | "models_loaded": { 2299 | "cpu": False in enhanced_model_manager.models, 2300 | "gpu": True in enhanced_model_manager.models 2301 | } 2302 | } 2303 | 2304 | return jsonify({ 2305 | "status": "healthy", 2306 | "cuda_available": CUDA_AVAILABLE, # Keep for backward compatibility 2307 | "gpu_status": gpu_status, 2308 | "voices_loaded": len(KOKORO_VOICES), 2309 | "default_voice": DEFAULT_VOICE, 2310 | "effects_available": ["volume", "pitch", "voice_character", "equalizer", "reverb", "formant", "distortion", "chorus", "flanger", "compression"], 2311 | "features": { 2312 | "robust_text_processing": True, 2313 | "zero_default_effects": True, 2314 | "playback_control": True, 2315 | "interrupt_capability": True, 2316 | "streaming": True, 2317 | "caching": True, 2318 | "markdown_support": HAS_MARKDOWN, 2319 | "num2words_support": HAS_NUM2WORDS, 2320 | "read_aloud_compatible": True, 2321 | "pitch_shifting_fixed": True, 2322 | "enhanced_gpu_logging": True 2323 | }, 2324 | "cache_stats": { 2325 | "text_cache_size": len(enhanced_model_manager.text_processor._cache), 2326 | "audio_cache_size": len(enhanced_model_manager.audio_cache) 2327 | }, 2328 | "playback_status": playback_controller.get_status(), 2329 | "timestamp": datetime.utcnow().isoformat() + 'Z' 2330 | }) 2331 | 2332 | @app.route('/', methods=['GET']) 2333 | def index_route(): 2334 | return jsonify({ 2335 | "service": "Complete Kokoro TTS API", 2336 | "version": "3.3.0", 2337 | "description": "Production-grade TTS API with FIXED pitch shifting, read-aloud compatibility, and zero-default effects", 2338 | "status": "running", 2339 | 
"default_voice": DEFAULT_VOICE, 2340 | "cuda_available": CUDA_AVAILABLE, 2341 | "key_features": { 2342 | "zero_default_effects": "Effects only apply when explicitly configured", 2343 | "robust_text_processing": "Handles markdown, unicode, numbers, abbreviations", 2344 | "playback_control": "Local audio playback with interrupt capability", 2345 | "session_management": "Track and control audio playback sessions", 2346 | "streaming_support": "Real-time audio streaming", 2347 | "divide_by_zero_safe": "All calculations protected against mathematical errors", 2348 | "pitch_shifting_fixed": "Robust pitch shifting with librosa compatibility", 2349 | "read_aloud_compatible": "Full CORS support for browser extensions" 2350 | }, 2351 | "endpoints": { 2352 | "speech_generation": "/v1/audio/speech (POST) - Clean zero-default effects", 2353 | "robust_speech_generation": "/v1/audio/speech/robust (POST) - Enhanced with text processing", 2354 | "streaming_speech": "/v1/audio/speech/stream (POST) - Streaming generation", 2355 | "speech_playback": "/v1/audio/speech/play (POST) - Local playback with session control", 2356 | "stop_playback": "/v1/audio/speech/stop (POST) - Stop/interrupt current playback", 2357 | "playback_status": "/v1/audio/speech/status (GET) - Get current playback status", 2358 | "text_processing": "/v1/text/process (POST) - Text processing testing", 2359 | "list_models": "/v1/models (GET)", 2360 | "list_voices": "/v1/voices (GET)", 2361 | "list_effects": "/v1/effects (GET)", 2362 | "health_check": "/health (GET)", 2363 | "ping": "/ping (GET)" 2364 | }, 2365 | "example_pitch_requests": { 2366 | "squeaky_voice": { 2367 | "input": "Hello world! This is a test of the squeaky voice effect.", 2368 | "voice": "af_heart", 2369 | "speed": 1.0, 2370 | "effects": { 2371 | "pitch": {"semitone_shift": 8.0} 2372 | } 2373 | }, 2374 | "deep_voice": { 2375 | "input": "Hello world! This is a test of the deep voice effect.", 2376 | "voice": "af_heart", 2377 | "speed": 1.0, 2378 | "effects": { 2379 | "pitch": {"semitone_shift": -6.0} 2380 | } 2381 | }, 2382 | "child_character": { 2383 | "input": "Hello world! This is a test of the child voice character.", 2384 | "voice": "af_heart", 2385 | "speed": 1.0, 2386 | "effects": { 2387 | "voice_character": {"type": "child"} 2388 | } 2389 | }, 2390 | "monster_character": { 2391 | "input": "Hello world! 
This is a test of the monster voice character.", 2392 | "voice": "af_heart", 2393 | "speed": 1.0, 2394 | "effects": { 2395 | "voice_character": {"type": "monster"} 2396 | } 2397 | } 2398 | }, 2399 | "read_aloud_compatibility": { 2400 | "cors_enabled": True, 2401 | "supported_formats": ["mp3", "wav"], 2402 | "standard_endpoint": "/v1/audio/speech", 2403 | "example_request": { 2404 | "method": "POST", 2405 | "url": "http://localhost:5000/v1/audio/speech", 2406 | "headers": {"Content-Type": "application/json"}, 2407 | "body": { 2408 | "input": "Text to speak", 2409 | "voice": "af_heart", 2410 | "response_format": "mp3" 2411 | } 2412 | } 2413 | }, 2414 | "playback_features": { 2415 | "interrupt_support": "Stop playback at any time", 2416 | "session_tracking": "Unique session IDs for each playback", 2417 | "timing_info": "Get interrupted time and total duration", 2418 | "status_monitoring": "Real-time playback status" 2419 | } 2420 | }) 2421 | 2422 | # Handle OPTIONS requests for CORS preflight 2423 | @app.route('/v1/audio/speech', methods=['OPTIONS']) 2424 | @app.route('/v1/audio/speech/robust', methods=['OPTIONS']) 2425 | @app.route('/v1/audio/speech/stream', methods=['OPTIONS']) 2426 | @app.route('/v1/audio/speech/play', methods=['OPTIONS']) 2427 | @app.route('/v1/audio/speech/stop', methods=['OPTIONS']) 2428 | def handle_options(): 2429 | """Handle CORS preflight requests for read-aloud compatibility.""" 2430 | return '', 200, { 2431 | 'Access-Control-Allow-Origin': '*', 2432 | 'Access-Control-Allow-Methods': 'POST, GET, OPTIONS', 2433 | 'Access-Control-Allow-Headers': 'Content-Type, Authorization', 2434 | 'Access-Control-Max-Age': '86400' 2435 | } 2436 | 2437 | # --- Error Handlers --------------------------------------------------------- 2438 | 2439 | @app.errorhandler(404) 2440 | def not_found_error(error): 2441 | return jsonify({"error": "Endpoint not found"}), 404 2442 | 2443 | @app.errorhandler(500) 2444 | def internal_server_error(error): 2445 | logger.error(f"500 Internal Server Error: {error}", exc_info=True) 2446 | return jsonify({"error": "Internal server error"}), 500 2447 | 2448 | # --- Main ------------------------------------------------------------------- 2449 | 2450 | def main(): 2451 | logger.info("=" * 80) 2452 | logger.info("Complete Kokoro TTS API Server v3.3.0 - ENHANCED GPU LOGGING") 2453 | logger.info("=" * 80) 2454 | 2455 | # Enhanced GPU Information 2456 | logger.info("HARDWARE CONFIGURATION:") 2457 | logger.info(f" CUDA Available: {CUDA_AVAILABLE}") 2458 | if CUDA_AVAILABLE: 2459 | logger.info(f" GPU Device Count: {GPU_DEVICE_COUNT}") 2460 | logger.info(f" GPU Device Name: {GPU_DEVICE_NAME}") 2461 | memory_info = get_gpu_memory_info() 2462 | logger.info(f" GPU Memory Total: {memory_info['total']:.2f} GB") 2463 | logger.info(f" GPU Memory Available: {memory_info['free']:.2f} GB") 2464 | logger.info(f" PyTorch CUDA Version: {torch.version.cuda}") 2465 | else: 2466 | logger.info(" GPU Status: Not available - using CPU only") 2467 | 2468 | logger.info(f" PyTorch Version: {torch.__version__}") 2469 | logger.info("=" * 40) 2470 | 2471 | logger.info("APPLICATION CONFIGURATION:") 2472 | logger.info(f" Total Voices: {len(KOKORO_VOICES)}") 2473 | logger.info(f" Default Voice: {DEFAULT_VOICE}") 2474 | logger.info(f" CORS Enabled: {ENABLE_CORS}") 2475 | logger.info(f" Server Address: http://{API_HOST}:{API_PORT}") 2476 | logger.info(f" Markdown Support: {HAS_MARKDOWN}") 2477 | logger.info(f" Num2Words Support: {HAS_NUM2WORDS}") 2478 | logger.info("=" * 40) 2479 | 2480 | 
logger.info("KEY FEATURES:") 2481 | logger.info(" ✓ Enhanced GPU/CPU detection and logging") 2482 | logger.info(" ✓ Automatic GPU fallback to CPU on errors") 2483 | logger.info(" ✓ Real-time GPU memory monitoring") 2484 | logger.info(" ✓ Zero-default effects (clean slate approach)") 2485 | logger.info(" ✓ Playback control with interrupt capability") 2486 | logger.info(" ✓ Session management and status tracking") 2487 | logger.info(" ✓ Robust text processing and streaming") 2488 | logger.info(" ✓ Divide-by-zero safe calculations") 2489 | logger.info(" ✓ FIXED pitch shifting with librosa compatibility") 2490 | logger.info(" ✓ Full read-aloud extension compatibility") 2491 | logger.info("=" * 40) 2492 | 2493 | logger.info("PITCH SHIFTING EXAMPLES:") 2494 | logger.info(" Squeaky: {'effects': {'pitch': {'semitone_shift': 8.0}}}") 2495 | logger.info(" Deep: {'effects': {'pitch': {'semitone_shift': -6.0}}}") 2496 | logger.info(" Child: {'effects': {'voice_character': {'type': 'child'}}}") 2497 | logger.info(" Monster: {'effects': {'voice_character': {'type': 'monster'}}}") 2498 | logger.info("=" * 80) 2499 | 2500 | # Final GPU status check before starting server 2501 | log_gpu_status("server startup") 2502 | logger.info("Starting Flask server...") 2503 | 2504 | app.run(host=API_HOST, port=API_PORT, debug=False, threaded=True) 2505 | 2506 | if __name__ == "__main__": 2507 | main() 2508 | --------------------------------------------------------------------------------