├── public
    ├── mia.wav
    ├── arya.wav
    ├── logo.png
    ├── saul.wav
    └── tommy.wav
├── examples
    └── basic
    │   ├── generation
    │       ├── __init__.py
    │       └── generator.py
    │   ├── audio
    │       ├── __init__.py
    │       ├── player.py
    │       └── streaming.py
    │   ├── config.py
    │   ├── main.py
    │   ├── setup.sh
    │   ├── README.md
    │   ├── server.py
    │   └── client.html
├── .gitignore
├── README.md
└── LICENSE


/public/mia.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nineninesix-ai/kani-tts/HEAD/public/mia.wav


--------------------------------------------------------------------------------
/public/arya.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nineninesix-ai/kani-tts/HEAD/public/arya.wav


--------------------------------------------------------------------------------
/public/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nineninesix-ai/kani-tts/HEAD/public/logo.png


--------------------------------------------------------------------------------
/public/saul.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nineninesix-ai/kani-tts/HEAD/public/saul.wav


--------------------------------------------------------------------------------
/public/tommy.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nineninesix-ai/kani-tts/HEAD/public/tommy.wav


--------------------------------------------------------------------------------
/examples/basic/generation/__init__.py:
--------------------------------------------------------------------------------
1 | """Text-to-speech generation modules"""
2 | 
3 | from .generator import TTSGenerator
4 | 
5 | __all__ = ['TTSGenerator']
6 | 


--------------------------------------------------------------------------------
/examples/basic/audio/__init__.py:
--------------------------------------------------------------------------------
1 | """Audio processing modules for Kani TTS"""
2 | 
3 | from .player import LLMAudioPlayer
4 | from .streaming import StreamingAudioWriter
5 | 
6 | __all__ = ['LLMAudioPlayer', 'StreamingAudioWriter']
7 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | *.so
 6 | .Python
 7 | venv/
 8 | env/
 9 | ENV/
10 | 
11 | # OS
12 | .DS_Store
13 | .DS_Store?
14 | ._*
15 | .Spotlight-V100
16 | .Trashes
17 | 
18 | *.safetensors
19 | *.bin
20 | 
21 | 
22 | # IDE
23 | .vscode/
24 | .idea/
25 | *.swp
26 | *.swo
27 | *~
28 | 
29 | # Logs
30 | *.log
31 | *.wav
32 | .venv
33 | 


--------------------------------------------------------------------------------
/examples/basic/config.py:
--------------------------------------------------------------------------------
 1 | """Configuration and constants for Kani TTS"""
 2 | 
 3 | # Tokenizer configuration
 4 | TOKENIZER_LENGTH = 64400
 5 | 
 6 | # Special tokens
 7 | START_OF_TEXT = 1
 8 | END_OF_TEXT = 2
 9 | START_OF_SPEECH = TOKENIZER_LENGTH + 1
10 | END_OF_SPEECH = TOKENIZER_LENGTH + 2
11 | START_OF_HUMAN = TOKENIZER_LENGTH + 3
12 | END_OF_HUMAN = TOKENIZER_LENGTH + 4
13 | START_OF_AI = TOKENIZER_LENGTH + 5
14 | END_OF_AI = TOKENIZER_LENGTH + 6
15 | PAD_TOKEN = TOKENIZER_LENGTH + 7
16 | AUDIO_TOKENS_START = TOKENIZER_LENGTH + 10
17 | 
18 | # Audio configuration
19 | CODEBOOK_SIZE = 4032
20 | SAMPLE_RATE = 22050
21 | 
22 | # Streaming configuration
23 | CHUNK_SIZE = 25  # Number of new frames to output per iteration
24 | LOOKBACK_FRAMES = 15  # Number of frames to include from previous context
25 | 
26 | # Generation configuration
27 | TEMPERATURE = 0.6
28 | TOP_P = 0.95
29 | REPETITION_PENALTY = 1.1
30 | REPETITION_CONTEXT_SIZE = 20
31 | MAX_TOKENS = 1200
32 | 
33 | # Model paths
34 | MODEL_NAME = "nineninesix/kani-tts-370m"
35 | CODEC_MODEL_NAME = "nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps"
36 | 


--------------------------------------------------------------------------------
/examples/basic/main.py:
--------------------------------------------------------------------------------
 1 | """Kani TTS - Text to Speech Generation"""
 2 | 
 3 | import time
 4 | from audio import LLMAudioPlayer, StreamingAudioWriter
 5 | from generation import TTSGenerator
 6 | from config import CHUNK_SIZE, LOOKBACK_FRAMES
 7 | 
 8 | from nemo.utils.nemo_logging import Logger
 9 | 
10 | nemo_logger = Logger()
11 | nemo_logger.remove_stream_handlers()
12 | 
13 | 
14 | def time_report(point_1, point_2, point_3):
15 |     model_request = point_2 - point_1
16 |     player_time = point_3 - point_2
17 |     total_time = point_3 - point_1
18 |     report = f"SPEECH TOKENS: {model_request:.2f}\nCODEC: {player_time:.2f}\nTOTAL: {total_time:.2f}"
19 |     return report
20 | 
21 | 
22 | def main():
23 |     # Initialize generator and audio player
24 |     generator = TTSGenerator()
25 |     player = LLMAudioPlayer(generator.tokenizer)
26 | 
27 |     # Set prompt
28 |     prompt = "katie: Oh, yeah. I mean did you want to get a quick snack together or maybe something before you go?"
29 | 
30 |     # Create streaming audio writer with sliding window decoder
31 |     # Uses lookback context from previous frames to maintain codec continuity
32 |     audio_writer = StreamingAudioWriter(
33 |         player,
34 |         'output.wav',
35 |         chunk_size=CHUNK_SIZE,        # Output 25 new frames (2.0s) per iteration
36 |         lookback_frames=LOOKBACK_FRAMES    # Include 15 previous frames (1.2s) for context
37 |     )
38 |     audio_writer.start()
39 | 
40 |     # Generate speech
41 |     result = generator.generate(prompt, audio_writer)
42 | 
43 |     # Finalize and write audio file
44 |     audio = audio_writer.finalize()
45 | 
46 |     point_3 = time.time()
47 | 
48 |     # Print results
49 |     print(time_report(result['point_1'], result['point_2'], point_3))
50 |     # print(f"\n[DEBUG] First 100 chars of generated text: {result['generated_text'][:100]}")
51 |     # print(f"[DEBUG] Last 100 chars of generated text: {result['generated_text'][-100:]}")
52 | 
53 | 
54 | if __name__ == "__main__":
55 |     main()
56 | 
57 | 


--------------------------------------------------------------------------------
/examples/basic/setup.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e
 3 | 
 4 | echo "=== KaniTTS Setup ==="
 5 | echo ""
 6 | 
 7 | # Check if Python is available
 8 | if ! command -v python3 &> /dev/null; then
 9 |     echo "Error: Python 3 is not installed. Please install Python 3.10+ first."
10 |     exit 1
11 | fi
12 | 
13 | # Check Python version
14 | PYTHON_VERSION=$(python3 --version 2>&1 | awk '{print $2}')
15 | echo "Python version: $PYTHON_VERSION"
16 | 
17 | # Validate Python version (only 3.10-3.12 supported)
18 | PYTHON_MAJOR=$(echo $PYTHON_VERSION | cut -d. -f1)
19 | PYTHON_MINOR=$(echo $PYTHON_VERSION | cut -d. -f2)
20 | 
21 | if [ "$PYTHON_MAJOR" -ne 3 ] || [ "$PYTHON_MINOR" -lt 10 ] || [ "$PYTHON_MINOR" -gt 12 ]; then
22 |     echo "Error: This project requires Python 3.10, 3.11, or 3.12"
23 |     echo "Current version: $PYTHON_VERSION"
24 |     exit 1
25 | fi
26 | echo "Python version is supported"
27 | 
28 | # Create virtual environment if it doesn't exist
29 | if [ ! -d "venv" ]; then
30 |     echo ""
31 |     echo "Creating virtual environment..."
32 |     python3 -m venv venv
33 |     echo "Virtual environment created successfully"
34 | else
35 |     echo ""
36 |     echo "Virtual environment already exists"
37 | fi
38 | 
39 | # Activate virtual environment
40 | echo "Activating virtual environment..."
41 | source venv/bin/activate
42 | 
43 | # Upgrade pip
44 | echo "Upgrading pip..."
45 | pip install --upgrade pip
46 | 
47 | # Add dependencies
48 | echo ""
49 | echo "Adding dependencies..."
50 | 
51 | # Install FastAPI and Uvicorn
52 | echo "Installing FastAPI and Uvicorn..."
53 | pip install fastapi uvicorn
54 | 
55 | # Install nemo-toolkit (which will install transformers 4.53)
56 | echo ""
57 | echo "Installing nemo-toolkit[tts]..."
58 | pip install "nemo-toolkit[tts]==2.4.0"
59 | 
60 | # Force reinstall transformers to 4.57.1 (required for model compatibility)
61 | echo ""
62 | echo "Upgrading transformers to 4.57.1..."
63 | echo "Note: nemo-toolkit[tts] requires transformers==4.53, but we need 4.57.1 for model compatibility"
64 | pip install "transformers==4.57.1"
65 | 
66 | # Verify installation
67 | echo ""
68 | echo "=== Verifying Installation ==="
69 | echo ""
70 | 
71 | python -c "import torch; print(f'PyTorch version: {torch.__version__}')"
72 | python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
73 | python -c "import torch; print(f'CUDA version: {torch.version.cuda}') if torch.cuda.is_available() else print('CUDA not available')"
74 | python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
75 | python -c "import transformers; print(f'Transformers version: {transformers.__version__}')"
76 | python -c "import fastapi; print(f'FastAPI version: {fastapi.__version__}')"
77 | 
78 | echo ""
79 | echo "=== Setup Complete ==="
80 | echo ""
81 | echo "You can now start the server with:"
82 | echo "  source venv/bin/activate"
83 | echo "  python server.py"
84 | echo ""
85 | echo "Note: Models will be automatically downloaded on first run (~1.5GB)"
86 | 


--------------------------------------------------------------------------------
/examples/basic/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # Basic example for KaniTTS
  3 | 
  4 | ## Installation
  5 | ### Prerequisites
  6 | 
  7 | - Python 3.10
  8 | - Git
  9 | 
 10 | ### Setup
 11 | 
 12 | ```bash
 13 | chmod +x setup.sh
 14 | ./setup.sh
 15 | ```
 16 | 
 17 | ## Usage
 18 | 
 19 | ### Option 1: Standalone Generation (Local WAV File)
 20 | 
 21 | Generate audio and save it to a WAV file:
 22 | 
 23 | ```bash
 24 | python main.py
 25 | ```
 26 | 
 27 | This will:
 28 | - Generate speech from the prompt in [main.py:28](main.py#L28)
 29 | - Save output to `output.wav`
 30 | - Display timing metrics for performance analysis
 31 | 
 32 | ### Option 2: FastAPI Server + Web Interface
 33 | 
 34 | 1. Start the server:
 35 | ```bash
 36 | python server.py
 37 | ```
 38 | 
 39 | The server will start on `http://localhost:8000`
 40 | 
 41 | 2. Open the web interface:
 42 | ```bash
 43 | open client.html
 44 | ```
 45 | 
 46 | Or navigate to `http://localhost:8000` in your browser
 47 | 
 48 | ## API Endpoints
 49 | 
 50 | ### `POST /tts`
 51 | Generate complete audio file (non-streaming)
 52 | 
 53 | **Request:**
 54 | ```json
 55 | {
 56 |   "text": "Hello world!",
 57 |   "temperature": 0.6,
 58 |   "max_tokens": 1200,
 59 |   "top_p": 0.95,
 60 |   "chunk_size": 25,
 61 |   "lookback_frames": 15
 62 | }
 63 | ```
 64 | 
 65 | **Response:** WAV audio file
 66 | 
 67 | ### `POST /stream-tts`
 68 | Stream audio chunks for immediate playback
 69 | 
 70 | **Request:** Same as `/tts`
 71 | 
 72 | **Response:** Streaming PCM audio chunks with metadata headers
 73 | 
 74 | ## Configuration
 75 | 
 76 | Edit [config.py](config.py) to customize:
 77 | 
 78 | ```python
 79 | # Audio settings
 80 | CHUNK_SIZE = 25 # Frames per streaming iteration (2.0s)
 81 | LOOKBACK_FRAMES = 15 # Context frames for continuity (1.2s)
 82 | 
 83 | # Generation parameters
 84 | TEMPERATURE = 0.6
 85 | TOP_P = 0.95
 86 | REPETITION_PENALTY = 1.1
 87 | MAX_TOKENS = 1200
 88 | 
 89 | # Model configuration
 90 | MODEL_NAME = "nineninesix/kani-tts-370m"
 91 | CODEC_MODEL_NAME = "nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps"
 92 | ```
 93 | 
 94 | ## Technical Details
 95 | 
 96 | ### Streaming Architecture
 97 | 
 98 | The system uses a **sliding window decoder** for smooth audio generation:
 99 | 
100 | 1. **Chunk Size (25 frames)** - Outputs ~2.0 seconds of new audio per iteration
101 | 2. **Lookback Frames (15 frames)** - Includes ~1.2 seconds of context from previous output
102 | 
103 | ### Tested on
104 | 
105 | - NVIDIA GeForce RTX 5080
106 | - Driver Version: 570.169
107 | - CUDA Version: 12.8
108 | - 16GB GPU memory
109 | - Python: 3.12
110 | - Transformers: 4.57.1
111 | 
112 | In order to generate 15sec audio it takes ~1sec and ~2Gb GPU VRAM
113 | 
114 | > **Note:** If you experience audio breaks during streaming, try increasing `CHUNK_SIZE` in [config.py](config.py) to buffer more frames per chunk.
115 | 
116 | ## Models
117 | 
118 | - **TTS Model:** [nineninesix/kani-tts-370m](https://huggingface.co/nineninesix/kani-tts-370m)
119 | - **Codec Model:** [nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps](https://huggingface.co/nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps)
120 | 
121 | Models are automatically downloaded from Hugging Face on first run.
122 | 
123 | ## Browser Compatibility
124 | 
125 | The web interface requires a modern browser with support for:
126 | - Web Audio API
127 | - Fetch API with streaming
128 | 
129 | ## License
130 | Apache 2
131 | 
132 | ## Contributing
133 | 
134 | Contributions are welcome! Please feel free to submit issues or pull requests.
135 | 
136 | 
137 | 
138 | 
139 | 
140 | 
141 | 


--------------------------------------------------------------------------------
/examples/basic/generation/generator.py:
--------------------------------------------------------------------------------
  1 | """Text-to-speech generation logic"""
  2 | 
  3 | import time
  4 | import torch
  5 | from transformers import AutoModelForCausalLM, AutoTokenizer
  6 | from transformers.generation.streamers import BaseStreamer
  7 | from threading import Thread
  8 | 
  9 | from config import (
 10 |     MODEL_NAME, START_OF_HUMAN, END_OF_TEXT, END_OF_HUMAN, END_OF_AI,
 11 |     TEMPERATURE, TOP_P, REPETITION_PENALTY, REPETITION_CONTEXT_SIZE, MAX_TOKENS
 12 | )
 13 | 
 14 | 
 15 | class TokenIDStreamer(BaseStreamer):
 16 |     """Custom streamer that yields token IDs"""
 17 |     def __init__(self, callback):
 18 |         self.callback = callback
 19 | 
 20 |     def put(self, value):
 21 |         """Called by model.generate() with token IDs"""
 22 |         if len(value.shape) > 1:
 23 |             token_ids = value[0].tolist()
 24 |         else:
 25 |             token_ids = value.tolist()
 26 | 
 27 |         for token_id in token_ids:
 28 |             self.callback(token_id)
 29 | 
 30 |     def end(self):
 31 |         """Called when generation is complete"""
 32 |         pass
 33 | 
 34 | 
 35 | class TTSGenerator:
 36 |     def __init__(self):
 37 |         self.model = AutoModelForCausalLM.from_pretrained(
 38 |             MODEL_NAME,
 39 |             torch_dtype=torch.bfloat16,
 40 |             device_map="auto",
 41 |         )
 42 |         self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 43 | 
 44 |         if torch.cuda.is_available():
 45 |             self.device = 'cuda'
 46 |         elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
 47 |             self.device = 'mps'
 48 |         else:
 49 |             self.device = 'cpu'
 50 | 
 51 |     def prepare_input(self, prompt):
 52 |         """Build custom input_ids with special tokens"""
 53 |         input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
 54 |         start_token = torch.tensor([[START_OF_HUMAN]], dtype=torch.int64)
 55 |         end_tokens = torch.tensor([[END_OF_TEXT, END_OF_HUMAN]], dtype=torch.int64)
 56 |         modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
 57 |         modified_input_ids = modified_input_ids.to(self.device)
 58 | 
 59 |         attention_mask = torch.ones(1, modified_input_ids.shape[1], dtype=torch.int64)
 60 |         attention_mask = attention_mask.to(self.device)
 61 | 
 62 |         return modified_input_ids, attention_mask
 63 | 
 64 |     def generate(self, prompt, audio_writer, max_tokens=MAX_TOKENS):
 65 |         """Generate speech tokens from text prompt"""
 66 |         modified_input_ids, attention_mask = self.prepare_input(prompt)
 67 | 
 68 |         point_1 = time.time()
 69 | 
 70 |         # Stream tokens from LLM
 71 |         all_token_ids = []
 72 | 
 73 |         def on_token_generated(token_id):
 74 |             """Callback for each generated token"""
 75 |             all_token_ids.append(token_id)
 76 |             # print(f"[LLM] Token {len(all_token_ids)}: {token_id}")
 77 |             audio_writer.add_token(token_id)
 78 | 
 79 |         streamer = TokenIDStreamer(callback=on_token_generated)
 80 | 
 81 |         generation_kwargs = dict(
 82 |             input_ids=modified_input_ids,
 83 |             attention_mask=attention_mask,
 84 |             max_new_tokens=max_tokens,
 85 |             do_sample=True,
 86 |             temperature=TEMPERATURE,
 87 |             top_p=TOP_P,
 88 |             repetition_penalty=REPETITION_PENALTY,
 89 |             num_return_sequences=1,
 90 |             eos_token_id=END_OF_AI,
 91 |             streamer=streamer,
 92 |         )
 93 | 
 94 |         thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
 95 |         thread.start()
 96 |         thread.join()
 97 | 
 98 |         point_2 = time.time()
 99 | 
100 |         print(f"\n[MAIN] Generation complete. Total tokens: {len(all_token_ids)}")
101 | 
102 |         # Decode generated text from token IDs
103 |         generated_text = self.tokenizer.decode(all_token_ids, skip_special_tokens=True)
104 | 
105 |         return {
106 |             'generated_text': generated_text,
107 |             'all_token_ids': all_token_ids,
108 |             'generation_time': point_2 - point_1,
109 |             'point_1': point_1,
110 |             'point_2': point_2
111 |         }
112 | 


--------------------------------------------------------------------------------
/examples/basic/audio/player.py:
--------------------------------------------------------------------------------
  1 | """Audio player for LLM-generated speech tokens"""
  2 | 
  3 | import torch
  4 | import numpy as np
  5 | from nemo.collections.tts.models import AudioCodecModel
  6 | 
  7 | from config import (
  8 |     TOKENIZER_LENGTH, START_OF_TEXT, END_OF_TEXT,
  9 |     START_OF_SPEECH, END_OF_SPEECH, START_OF_HUMAN, END_OF_HUMAN,
 10 |     START_OF_AI, END_OF_AI, PAD_TOKEN, AUDIO_TOKENS_START, CODEBOOK_SIZE
 11 | )
 12 | 
 13 | 
 14 | class LLMAudioPlayer:
 15 |     def __init__(self, tokenizer) -> None:
 16 |         self.nemo_codec_model = AudioCodecModel\
 17 |                 .from_pretrained("nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps").eval()
 18 | 
 19 |         if torch.cuda.is_available():
 20 |             self.device = 'cuda'
 21 |         elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
 22 |             self.device = 'mps'
 23 |         else:
 24 |             self.device = 'cpu'
 25 | 
 26 |         self.nemo_codec_model.to(self.device)
 27 |         self.tokenizer = tokenizer
 28 | 
 29 |         self.tokeniser_length = TOKENIZER_LENGTH
 30 |         self.start_of_text = START_OF_TEXT
 31 |         self.end_of_text = END_OF_TEXT
 32 |         self.start_of_speech = START_OF_SPEECH
 33 |         self.end_of_speech = END_OF_SPEECH
 34 |         self.start_of_human = START_OF_HUMAN
 35 |         self.end_of_human = END_OF_HUMAN
 36 |         self.start_of_ai = START_OF_AI
 37 |         self.end_of_ai = END_OF_AI
 38 |         self.pad_token = PAD_TOKEN
 39 |         self.audio_tokens_start = AUDIO_TOKENS_START
 40 |         self.codebook_size = CODEBOOK_SIZE
 41 | 
 42 |     def output_validation(self, out_ids):
 43 |         start_of_speech_flag = self.start_of_speech in out_ids
 44 |         end_of_speech_flag = self.end_of_speech in out_ids
 45 |         if not (start_of_speech_flag and end_of_speech_flag):
 46 |             raise ValueError('Special speech tokens not exist!')
 47 | 
 48 |     def get_nano_codes(self, out_ids):
 49 |         start_a_idx = (out_ids == self.start_of_speech).nonzero(as_tuple=True)[0].item()
 50 |         end_a_idx   = (out_ids == self.end_of_speech).nonzero(as_tuple=True)[0].item()
 51 |         if start_a_idx >= end_a_idx:
 52 |             raise ValueError('Invalid audio codes sequence!')
 53 | 
 54 |         audio_codes = out_ids[start_a_idx+1 : end_a_idx]
 55 |         if len(audio_codes) % 4:
 56 |             raise ValueError('The length of the sequence must be a multiple of 4!')
 57 |         audio_codes = audio_codes.reshape(-1, 4)
 58 |         audio_codes = audio_codes - torch.tensor([self.codebook_size * i for i in range(4)])
 59 |         audio_codes = audio_codes - self.audio_tokens_start
 60 |         if (audio_codes < 0).sum().item() > 0:
 61 |             raise ValueError('Invalid audio tokens!')
 62 | 
 63 |         audio_codes = audio_codes.T.unsqueeze(0)
 64 |         len_ = torch.tensor([audio_codes.shape[-1]])
 65 |         return audio_codes, len_
 66 | 
 67 |     def get_text(self, out_ids):
 68 |         try:
 69 |             start_t_idx = (out_ids == self.start_of_text).tolist().index(True)
 70 |             end_t_idx   = (out_ids == self.end_of_text).tolist().index(True)
 71 |             txt_tokens = out_ids[start_t_idx : end_t_idx+1]
 72 |             text = self.tokenizer.decode(txt_tokens, skip_special_tokens=True)
 73 |             return text
 74 |         except ValueError:
 75 |             return None
 76 | 
 77 |     def get_waveform(self, out_ids):
 78 |         out_ids = out_ids.flatten()
 79 |         self.output_validation(out_ids)
 80 |         audio_codes, len_ = self.get_nano_codes(out_ids)
 81 |         audio_codes, len_ = audio_codes.to(self.device), len_.to(self.device)
 82 |         with torch.inference_mode():
 83 |             reconstructed_audio, _ = self.nemo_codec_model.decode(tokens=audio_codes, tokens_len=len_)
 84 |             output_audio = reconstructed_audio.cpu().detach().numpy().squeeze()
 85 | 
 86 |         text = self.get_text(out_ids)
 87 |         return output_audio, text
 88 | 
 89 |     def decode_audio_chunk(self, audio_codes):
 90 |         """Decode a chunk of audio codes (shape: [num_frames, 4])"""
 91 |         if len(audio_codes) == 0:
 92 |             return None
 93 | 
 94 |         # Process audio codes: subtract offsets for each codebook
 95 |         audio_codes = torch.tensor(audio_codes, device=self.device)
 96 |         audio_codes = audio_codes - torch.tensor([self.codebook_size * i for i in range(4)], device=self.device)
 97 |         audio_codes = audio_codes - self.audio_tokens_start
 98 | 
 99 |         if (audio_codes < 0).sum().item() > 0:
100 |             return None  # Invalid tokens, skip
101 | 
102 |         # Shape: (1, 4, num_frames) - batch_size=1, num_codebooks=4, num_frames
103 |         audio_codes = audio_codes.T.unsqueeze(0)
104 |         len_ = torch.tensor([audio_codes.shape[-1]], device=self.device)
105 | 
106 |         with torch.inference_mode():
107 |             reconstructed_audio, _ = self.nemo_codec_model.decode(tokens=audio_codes, tokens_len=len_)
108 |             output_audio = reconstructed_audio.cpu().detach().numpy().squeeze()
109 | 
110 |         return output_audio
111 | 


--------------------------------------------------------------------------------
/examples/basic/server.py:
--------------------------------------------------------------------------------
  1 | """FastAPI server for Kani TTS with streaming support"""
  2 | 
  3 | import io
  4 | import time
  5 | from fastapi import FastAPI, HTTPException
  6 | from fastapi.middleware.cors import CORSMiddleware
  7 | from fastapi.responses import StreamingResponse, Response
  8 | from pydantic import BaseModel
  9 | from typing import Optional
 10 | import numpy as np
 11 | from scipy.io.wavfile import write as wav_write
 12 | 
 13 | from audio import LLMAudioPlayer, StreamingAudioWriter
 14 | from generation import TTSGenerator
 15 | from config import CHUNK_SIZE, LOOKBACK_FRAMES, TEMPERATURE, TOP_P, MAX_TOKENS
 16 | 
 17 | from nemo.utils.nemo_logging import Logger
 18 | 
 19 | nemo_logger = Logger()
 20 | nemo_logger.remove_stream_handlers()
 21 | 
 22 | 
 23 | app = FastAPI(title="Kani TTS API", version="1.0.0")
 24 | 
 25 | # Add CORS middleware to allow client.html to connect
 26 | app.add_middleware(
 27 |     CORSMiddleware,
 28 |     allow_origins=["*"],  # In production, specify your frontend domain
 29 |     allow_credentials=True,
 30 |     allow_methods=["*"],
 31 |     allow_headers=["*"],
 32 | )
 33 | 
 34 | # Global instances (initialized on startup)
 35 | generator = None
 36 | player = None
 37 | 
 38 | 
 39 | class TTSRequest(BaseModel):
 40 |     text: str
 41 |     temperature: Optional[float] = TEMPERATURE
 42 |     max_tokens: Optional[int] = MAX_TOKENS
 43 |     top_p: Optional[float] = TOP_P
 44 |     chunk_size: Optional[int] = CHUNK_SIZE
 45 |     lookback_frames: Optional[int] = LOOKBACK_FRAMES
 46 | 
 47 | 
 48 | @app.on_event("startup")
 49 | async def startup_event():
 50 |     """Initialize models on startup"""
 51 |     global generator, player
 52 |     print("🚀 Initializing TTS models...")
 53 |     generator = TTSGenerator()
 54 |     player = LLMAudioPlayer(generator.tokenizer)
 55 |     print("✅ TTS models initialized successfully!")
 56 | 
 57 | 
 58 | @app.get("/health")
 59 | async def health_check():
 60 |     """Check if server is ready"""
 61 |     return {
 62 |         "status": "healthy",
 63 |         "tts_initialized": generator is not None and player is not None
 64 |     }
 65 | 
 66 | 
 67 | @app.post("/tts")
 68 | async def generate_speech(request: TTSRequest):
 69 |     """Generate complete audio file (non-streaming)"""
 70 |     if not generator or not player:
 71 |         raise HTTPException(status_code=503, detail="TTS models not initialized")
 72 | 
 73 |     try:
 74 |         # Create audio writer
 75 |         audio_writer = StreamingAudioWriter(
 76 |             player,
 77 |             output_file=None,  # We won't write to file
 78 |             chunk_size=request.chunk_size,
 79 |             lookback_frames=request.lookback_frames
 80 |         )
 81 |         audio_writer.start()
 82 | 
 83 |         # Generate speech
 84 |         result = generator.generate(
 85 |             request.text,
 86 |             audio_writer,
 87 |             max_tokens=request.max_tokens
 88 |         )
 89 | 
 90 |         # Finalize and get audio
 91 |         audio_writer.finalize()
 92 | 
 93 |         if not audio_writer.audio_chunks:
 94 |             raise HTTPException(status_code=500, detail="No audio generated")
 95 | 
 96 |         # Concatenate all chunks
 97 |         full_audio = np.concatenate(audio_writer.audio_chunks)
 98 | 
 99 |         # Convert to WAV bytes
100 |         wav_buffer = io.BytesIO()
101 |         wav_write(wav_buffer, 22050, full_audio)
102 |         wav_buffer.seek(0)
103 | 
104 |         return Response(
105 |             content=wav_buffer.read(),
106 |             media_type="audio/wav",
107 |             headers={
108 |                 "Content-Disposition": "attachment; filename=speech.wav"
109 |             }
110 |         )
111 | 
112 |     except Exception as e:
113 |         raise HTTPException(status_code=500, detail=str(e))
114 | 
115 | 
116 | @app.post("/stream-tts")
117 | async def stream_speech(request: TTSRequest):
118 |     """Stream audio chunks as they're generated for immediate playback"""
119 |     if not generator or not player:
120 |         raise HTTPException(status_code=503, detail="TTS models not initialized")
121 | 
122 |     import queue
123 |     import threading
124 |     import struct
125 | 
126 |     async def audio_chunk_generator():
127 |         """Yield audio chunks as raw PCM data with length prefix"""
128 |         chunk_queue = queue.Queue()
129 | 
130 |         # Create a custom list wrapper that pushes chunks to queue
131 |         class ChunkList(list):
132 |             def append(self, chunk):
133 |                 super().append(chunk)
134 |                 chunk_queue.put(("chunk", chunk))
135 | 
136 |         audio_writer = StreamingAudioWriter(
137 |             player,
138 |             output_file=None,
139 |             chunk_size=request.chunk_size,
140 |             lookback_frames=request.lookback_frames
141 |         )
142 | 
143 |         # Replace audio_chunks list with our custom one
144 |         audio_writer.audio_chunks = ChunkList()
145 | 
146 |         # Start generation in background thread
147 |         def generate():
148 |             try:
149 |                 audio_writer.start()
150 |                 generator.generate(
151 |                     request.text,
152 |                     audio_writer,
153 |                     max_tokens=request.max_tokens
154 |                 )
155 |                 audio_writer.finalize()
156 |                 chunk_queue.put(("done", None))  # Signal completion
157 |             except Exception as e:
158 |                 print(f"Generation error: {e}")
159 |                 chunk_queue.put(("error", str(e)))
160 | 
161 |         gen_thread = threading.Thread(target=generate)
162 |         gen_thread.start()
163 | 
164 |         # Stream chunks as they arrive
165 |         try:
166 |             while True:
167 |                 msg_type, data = chunk_queue.get(timeout=30)  # 30s timeout
168 | 
169 |                 if msg_type == "chunk":
170 |                     # Convert numpy array to int16 PCM
171 |                     pcm_data = (data * 32767).astype(np.int16)
172 |                     chunk_bytes = pcm_data.tobytes()
173 | 
174 |                     # Send chunk length (4 bytes) + chunk data
175 |                     length_prefix = struct.pack('<I', len(chunk_bytes))
176 |                     print(f"[STREAM] Sending chunk: {len(chunk_bytes)} bytes ({len(data)/22050:.2f}s)")
177 |                     yield length_prefix + chunk_bytes
178 | 
179 |                 elif msg_type == "done":
180 |                     # Send end marker (length = 0)
181 |                     yield struct.pack('<I', 0)
182 |                     break
183 | 
184 |                 elif msg_type == "error":
185 |                     # Send error marker (length = 0xFFFFFFFF)
186 |                     yield struct.pack('<I', 0xFFFFFFFF)
187 |                     break
188 | 
189 |         finally:
190 |             gen_thread.join()
191 | 
192 |     return StreamingResponse(
193 |         audio_chunk_generator(),
194 |         media_type="application/octet-stream",
195 |         headers={
196 |             "X-Sample-Rate": "22050",
197 |             "X-Channels": "1",
198 |             "X-Bit-Depth": "16"
199 |         }
200 |     )
201 | 
202 | 
203 | @app.get("/")
204 | async def root():
205 |     """Root endpoint with API info"""
206 |     return {
207 |         "name": "Kani TTS API",
208 |         "version": "1.0.0",
209 |         "endpoints": {
210 |             "/tts": "POST - Generate complete audio",
211 |             "/stream-tts": "POST - Stream audio chunks",
212 |             "/health": "GET - Health check"
213 |         }
214 |     }
215 | 
216 | 
217 | if __name__ == "__main__":
218 |     import uvicorn
219 |     print("🎤 Starting Kani TTS Server...")
220 |     uvicorn.run(app, host="0.0.0.0", port=8000)
221 | 


--------------------------------------------------------------------------------
/examples/basic/audio/streaming.py:
--------------------------------------------------------------------------------
  1 | """Streaming audio writer with sliding window decoder"""
  2 | 
  3 | import threading
  4 | import queue
  5 | import numpy as np
  6 | from scipy.io.wavfile import write
  7 | 
  8 | from config import SAMPLE_RATE, CHUNK_SIZE, LOOKBACK_FRAMES
  9 | 
 10 | 
 11 | class StreamingAudioWriter:
 12 |     def __init__(self, player, output_file, sample_rate=SAMPLE_RATE,
 13 |                  chunk_size=CHUNK_SIZE, lookback_frames=LOOKBACK_FRAMES):
 14 |         """
 15 |         Sliding window decoder with lookback context.
 16 | 
 17 |         Args:
 18 |             player: LLMAudioPlayer instance
 19 |             output_file: Output WAV file path
 20 |             sample_rate: Audio sample rate (22050 Hz for nanocodec)
 21 |             chunk_size: Number of NEW frames to output per iteration
 22 |             lookback_frames: Number of frames to include from previous context for continuity
 23 |         """
 24 |         self.player = player
 25 |         self.output_file = output_file
 26 |         self.sample_rate = sample_rate
 27 |         self.chunk_size = chunk_size
 28 |         self.lookback_frames = lookback_frames
 29 |         self.token_queue = queue.Queue()
 30 |         self.audio_chunks = []
 31 |         self.running = True
 32 |         self.inside_speech = False
 33 |         self.audio_token_buffer = []
 34 |         self.all_tokens = []  # Store all audio tokens for sliding window decoding
 35 |         self.frames_decoded = 0  # Track how many frames we've already output
 36 | 
 37 |     def decoder_worker(self):
 38 |         """Background thread that decodes audio chunks as they arrive"""
 39 |         speech_ended = False
 40 | 
 41 |         while self.running or not self.token_queue.empty():
 42 |             try:
 43 |                 token_id = self.token_queue.get(timeout=0.1)
 44 | 
 45 |                 # Check for start/end of speech markers
 46 |                 if token_id == self.player.start_of_speech:
 47 |                     print(f"[DECODER] START_OF_SPEECH detected")
 48 |                     self.inside_speech = True
 49 |                     speech_ended = False
 50 |                     self.audio_token_buffer = []
 51 |                     continue
 52 | 
 53 |                 if token_id == self.player.end_of_speech:
 54 |                     if speech_ended:
 55 |                         print(f"[DECODER] Warning: Duplicate END_OF_SPEECH detected, ignoring")
 56 |                         continue
 57 | 
 58 |                     print(f"[DECODER] END_OF_SPEECH detected")
 59 | 
 60 |                     # Decode any remaining frames with sliding window
 61 |                     total_frames = len(self.all_tokens) // 4
 62 |                     remaining_frames = total_frames - self.frames_decoded
 63 | 
 64 |                     if remaining_frames >= 1:
 65 |                         # Decode from lookback point to end
 66 |                         start_frame = max(0, self.frames_decoded - self.lookback_frames)
 67 |                         start_token = start_frame * 4
 68 | 
 69 |                         tokens_to_decode = self.all_tokens[start_token:]
 70 |                         num_frames = len(tokens_to_decode) // 4
 71 | 
 72 |                         if num_frames > 0:
 73 |                             codes = np.array(tokens_to_decode[:num_frames * 4]).reshape(-1, 4)
 74 |                             audio_chunk = self.player.decode_audio_chunk(codes)
 75 | 
 76 |                             if audio_chunk is not None:
 77 |                                 samples_per_frame = len(audio_chunk) // num_frames
 78 | 
 79 |                                 # Skip lookback portion, only save new frames
 80 |                                 lookback_skip = min(self.frames_decoded, self.lookback_frames)
 81 |                                 skip_samples = lookback_skip * samples_per_frame
 82 |                                 new_audio = audio_chunk[skip_samples:]
 83 | 
 84 |                                 self.audio_chunks.append(new_audio)
 85 |                                 print(f"[DECODER] Final chunk: {remaining_frames} frames ({remaining_frames/12.5:.2f}s audio)")
 86 | 
 87 |                     self.inside_speech = False
 88 |                     speech_ended = True
 89 |                     self.audio_token_buffer = []
 90 |                     continue
 91 | 
 92 |                 # Accumulate audio tokens (only if speech hasn't ended)
 93 |                 if self.inside_speech and not speech_ended:
 94 |                     self.audio_token_buffer.append(token_id)
 95 |                     self.all_tokens.append(token_id)  # Keep all tokens for sliding window
 96 | 
 97 |                     # Decode when we have enough NEW frames to process
 98 |                     total_frames = len(self.all_tokens) // 4
 99 |                     new_frames = total_frames - self.frames_decoded
100 | 
101 |                     if new_frames >= self.chunk_size:
102 |                         # Calculate sliding window: include lookback_frames from previous context
103 |                         start_frame = max(0, self.frames_decoded - self.lookback_frames)
104 |                         start_token = start_frame * 4
105 | 
106 |                         # Decode from start_frame to current end
107 |                         tokens_to_decode = self.all_tokens[start_token:]
108 |                         num_frames = len(tokens_to_decode) // 4
109 | 
110 |                         codes = np.array(tokens_to_decode[:num_frames * 4]).reshape(-1, 4)
111 |                         audio_chunk = self.player.decode_audio_chunk(codes)
112 | 
113 |                         if audio_chunk is not None:
114 |                             samples_per_frame = len(audio_chunk) // num_frames
115 | 
116 |                             # Skip the lookback portion - only save the NEW frames
117 |                             lookback_skip = min(self.frames_decoded, self.lookback_frames)
118 |                             skip_samples = lookback_skip * samples_per_frame
119 | 
120 |                             # Extract only the new chunk_size frames worth of audio
121 |                             new_samples = self.chunk_size * samples_per_frame
122 |                             new_audio = audio_chunk[skip_samples:skip_samples + new_samples]
123 | 
124 |                             self.audio_chunks.append(new_audio)
125 |                             self.frames_decoded += self.chunk_size
126 | 
127 |                             print(f"[DECODER] Decoded {self.chunk_size} frames ({self.chunk_size/12.5:.2f}s audio) with {self.lookback_frames}-frame lookback context")
128 | 
129 |                         # Clear buffer (we've stored everything in all_tokens)
130 |                         self.audio_token_buffer = []
131 | 
132 |             except queue.Empty:
133 |                 continue
134 | 
135 |     def add_token(self, token_id):
136 |         """Add a token to the processing queue"""
137 |         self.token_queue.put(token_id)
138 | 
139 |     def finalize(self):
140 |         """Stop the decoder thread and write final audio file"""
141 |         self.running = False
142 |         self.decoder_thread.join()
143 | 
144 |         if self.audio_chunks:
145 |             # Concatenate all audio chunks
146 |             full_audio = np.concatenate(self.audio_chunks)
147 | 
148 |             # Only write to file if output_file is specified
149 |             if self.output_file:
150 |                 write(self.output_file, self.sample_rate, full_audio)
151 |                 print(f"[WRITER] Wrote {len(full_audio)/self.sample_rate:.2f}s of audio to {self.output_file}")
152 | 
153 |             return full_audio
154 |         return None
155 | 
156 |     def start(self):
157 |         """Start the decoder thread"""
158 |         self.decoder_thread = threading.Thread(target=self.decoder_worker)
159 |         self.decoder_thread.start()
160 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <div align="center">
  2 |   <img src="public/logo.png" alt="Kani TTS Logo" width="150"/>
  3 | 
  4 |   [![](https://dcbadge.limes.pink/api/server/https://discord.gg/NzP3rjB4SB?style=flat)](https://discord.gg/NzP3rjB4SB) [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
  5 | 
  6 |   # Kani TTS
  7 | A fast, modular and human-like TTS that generates high-quality speech from text input.
  8 | </div>
  9 | 
 10 | ## Models
 11 | 
 12 | - [kani-tts-400m-en](https://huggingface.co/nineninesix/kani-tts-400m-en) - English
 13 | 
 14 | - [kani-tts-400m-zh](https://huggingface.co/nineninesix/kani-tts-400m-zh) - Chinese
 15 | 
 16 | - [kani-tts-400m-de](https://huggingface.co/nineninesix/kani-tts-400m-de) - German
 17 | 
 18 | - [kani-tts-400m-ar](https://huggingface.co/nineninesix/kani-tts-400m-ar) - Arabic
 19 | 
 20 | - [kani-tts-400m-es](https://huggingface.co/nineninesix/kani-tts-400m-es) - Spanish
 21 | 
 22 | - [kani-tts-400m-ko](https://huggingface.co/nineninesix/kani-tts-400m-ko) - Korean
 23 | 
 24 | - [kani-tts-370m-expo2025-osaka-ja](https://huggingface.co/nineninesix/kani-tts-370m-expo2025-osaka-ja) - Japanese
 25 | 
 26 | - [kani-tts-400m-0.3-pt](https://huggingface.co/nineninesix/kani-tts-400m-0.3-pt) - Pretrained checkpoint v0.3
 27 | 
 28 | - [kani-tts-370m multilingual](https://huggingface.co/nineninesix/kani-tts-370m) - English, Spanish, Chinese, German, Korean, Arabic
 29 | 
 30 | - [kani-tts-370m-mlx](https://huggingface.co/nineninesix/kani-tts-370m-MLX) - Multilingual model for Apple Silicon
 31 | 
 32 | - [kani-tts-450m-0.2-pt](https://huggingface.co/nineninesix/kani-tts-450m-0.2-pt) - Pretrained checkpoint v0.2 for posttraining and fine-tuning on custom datasets.
 33 | 
 34 | - [nemo-nano-codec-22khz-0.6kbps-12.5fps-MLX](https://huggingface.co/nineninesix/nemo-nano-codec-22khz-0.6kbps-12.5fps-MLX) - MLX implementation of NVIDIA NeMo NanoCodec, a lightweight neural audio codec.
 35 | 
 36 | 
 37 | **Notes:**
 38 | - Primarily optimized for English
 39 | - Performance degrades with inputs >1000 tokens
 40 | - Limited emotional expressivity without fine-tuning
 41 | 
 42 | 
 43 | ## Inference
 44 | 
 45 | Kani TTS offers multiple inference options optimized for different hardware:
 46 | 
 47 | ### Basic Example (GPU/CPU)
 48 | The basic inference example runs on both GPU and CPU, making it accessible for various hardware setups. Check the `examples/basic` in this repository for getting started.
 49 | You can use the PyPi package `kani-tts`. [More detailes...](https://pypi.org/project/kani-tts/)
 50 | 
 51 | ### vLLM (NVIDIA GPU)
 52 | For high-performance inference on NVIDIA GPUs, use [KaniTTS-vLLM](https://github.com/nineninesix-ai/kanitts-vllm). This option is super fast and provides an OpenAI compatible API, making it easy to integrate with existing tools and workflows.
 53 | 
 54 | ### MLX (Apple Silicon)
 55 | For Apple Silicon users, we provide an optimized [KaniTTS-MLX](https://github.com/nineninesix-ai/kani-mlx) that takes full advantage of the unified memory architecture and Neural Engine on M1/M2/M3 chips.
 56 | 
 57 | 
 58 | ### NeMo NanoCodec
 59 | - **Base Model:** [NVIDIA NeMo NanoCodec](https://developer.nvidia.com/nemo)
 60 | - **License:** [NVIDIA Open Model License](https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf)
 61 | - **Sample Rate:** 22.05 kHz
 62 | - **Purpose:** Neural audio compression/decompression for TTS pipelines
 63 | 
 64 | ---
 65 | 
 66 | #### GPU Benchmark Results
 67 | 
 68 | | GPU Model | VRAM | Cost ($/hr) | RTF |
 69 | |-----------|------|-------------|-----|
 70 | | RTX 5090 | 32GB | $0.423 | 0.190 |
 71 | | RTX 4080 | 16GB | $0.220 | 0.200 |
 72 | | RTX 5060 Ti | 16GB | $0.138 | 0.529 |
 73 | | RTX 4060 Ti | 16GB | $0.122 | 0.537 |
 74 | | RTX 3060 | 12GB | $0.093 | 0.600 |
 75 | 
 76 | *Lower RTF is better (< 1.0 means faster than real-time). Benchmarks conducted on [Vast AI](https://vast.ai/).*
 77 | 
 78 | ---
 79 | 
 80 | ## Dataset Preparation
 81 | 
 82 | ### 1. Audio Dataset Collection
 83 | 
 84 | You can prepare your audio dataset using [Datamio](https://app.datamio.dev/), our active community members. Datamio provides tools to help you collect, organize, and manage high-quality audio datasets for TTS training.
 85 | 
 86 | ### 2. Audio Processing Pipeline
 87 | 
 88 | After collecting your raw audio dataset, you need to process it for training. Check out this audio processing pipeline: [nano-codec-dataset-pipeline](https://github.com/nineninesix-ai/nano-codec-dataset-pipeline)
 89 | 
 90 | This pipeline handles:
 91 | - Audio preprocessing and normalization
 92 | - Feature extraction
 93 | - Dataset formatting for training
 94 | - Quality validation
 95 | 
 96 | ---
 97 | 
 98 | ## Finetuning
 99 | 
100 | For finetuning KaniTTS on your own dataset, check out this comprehensive finetuning pipeline: [KaniTTS-Finetune-pipeline](https://github.com/nineninesix-ai/KaniTTS-Finetune-pipeline)
101 | 
102 | This pipeline provides:
103 | - Step-by-step finetuning guides
104 | - Configuration templates
105 | - Training scripts optimized for different hardware setups
106 | - Evaluation code to assess model performance
107 | - Best practices for achieving high-quality results
108 | 
109 | ---
110 | 
111 | ## App Examples
112 | 
113 | - [ComfyUI node](https://github.com/wildminder/ComfyUI-KaniTTS) by [WildAi](https://github.com/wildminder)
114 | 
115 | - [NextJS basic app](https://github.com/nineninesix-ai/open-audio). It uses the OpenAI npm package to connect to the OpenAI-compatible server API provided by [kanitts-vllm](https://github.com/nineninesix-ai/kanitts-vllm).
116 | 
117 | - [Livekit Agent](https://github.com/nineninesix-ai/livekit-agent) - A real-time voice AI assistant built with LiveKit Agents framework, featuring speech-to-text, language processing, and text-to-speech capabilities.
118 | 
119 | ---
120 | 
121 | ## Areas of improvement
122 | 
123 | We're continuously working to enhance KaniTTS. Here are key areas where we're focusing our efforts:
124 | 
125 | ### Core Architecture
126 | - **Create new LLM for TTS exclusively** - Develop a specialized LLM designed specifically for text-to-speech generation, optimized for audio token prediction rather than adapted from general-purpose LLMs
127 | 
128 | ### Model Enhancements
129 | - **Add more languages** - Expand support beyond the current languages to cover more language families and dialects
130 | - **Add more speakers** - Increase speaker diversity with different accents, age groups, and voice characteristics
131 | - **Voice cloning examples** - Provide tutorials and code examples for cloning custom voices from audio samples
132 | 
133 | ### Audio Codec Improvements
134 | - **Fine-tune codec** - Optimize the existing NanoCodec for better audio quality and compression efficiency
135 | - **Create new codec** - Develop a next-generation neural audio codec with improved naturalness and lower latency
136 | 
137 | ### Dataset Development
138 | Build and release high-quality, diverse audio datasets for training and fine-tuning.
139 | - Multi-speaker datasets across different languages
140 | - Domain-specific datasets (conversational, storytelling, professional voice-over)
141 | - Benchmark datasets for evaluation
142 | 
143 | 
144 | If you're interested in contributing to any of these areas, please check our [Contributing](#contributing) section and join our [Discord server](https://discord.gg/NzP3rjB4SB).
145 | 
146 | ---
147 | 
148 | ## License
149 | 
150 | Apache 2. See [LICENSE](LICENSE) file for details.
151 | 
152 | ---
153 | 
154 | ## Contributing
155 | 
156 | We're **open for community contributions**! KaniTTS is built with the community, and we welcome contributions of all kinds:
157 | 
158 | - **Code contributions** - Bug fixes, new features, optimizations, and documentation improvements
159 | - **Model contributions** - Fine-tuned models, voice clones, and language-specific adaptations
160 | - **Dataset contributions** - High-quality audio datasets for training and evaluation
161 | - **Examples and tutorials** - Integration examples, use cases, and guides
162 | - **Bug reports and feature requests** - Help us improve by reporting issues and suggesting enhancements
163 | 
164 | **How to contribute:**
165 | 1. Check our [Areas of improvement](#areas-of-improvement) section for current priorities
166 | 2. Join our [Discord server](https://discord.gg/NzP3rjB4SB) to discuss ideas and get support
167 | 3. Submit issues or pull requests on GitHub
168 | 4. Share your projects and use cases with the community
169 | 
170 | 
171 | 
172 | 
173 | 
174 | 
175 | 
176 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Apache License
  2 | Version 2.0, January 2004
  3 | http://www.apache.org/licenses/
  4 | 
  5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 | 1. Definitions.
  8 | 
  9 | "License" shall mean the terms and conditions for use, reproduction,
 10 | and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 | "Licensor" shall mean the copyright owner or entity granting the License.
 13 | 
 14 | "Legal Entity" shall mean the union of the acting entity and all
 15 | other entities that control, are controlled by, or are under common
 16 | control with that entity. For the purposes of this definition,
 17 | "control" means (i) the power, direct or indirect, to cause the
 18 | direction or management of such entity, whether by contract or
 19 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
 20 | outstanding shares, or (iii) beneficial ownership of such entity.
 21 | 
 22 | "You" (or "Your") shall mean an individual or Legal Entity
 23 | exercising permissions granted by this License.
 24 | 
 25 | "Source" shall mean the preferred form for making modifications,
 26 | including but not limited to software source code, documentation
 27 | source, and configuration files.
 28 | 
 29 | "Object" shall mean any form resulting from mechanical
 30 | transformation or translation of a Source form, including but
 31 | not limited to compiled object code, generated documentation,
 32 | and conversions to other media types.
 33 | 
 34 | "Work" shall mean the work of authorship, whether in Source or
 35 | Object form, made available under the License, as indicated by a
 36 | copyright notice that is included in or attached to the work
 37 | (which shall not include communications that are solely in the
 38 | nature of comments, suggestions or bug reports, but does not
 39 | include comments that are part of a notice similar to the Exemplary
 40 | legal notices set forth below).
 41 | 
 42 | "Derivative Works" shall mean any work, whether in Source or Object
 43 | form, that is based upon (or derived from) the Work and for which the
 44 | editorial revisions, annotations, elaborations, or other modifications
 45 | represent, as a whole, an original work of authorship. For the purposes
 46 | of this License, Derivative Works shall not include works that remain
 47 | separable from, or merely link (or bind by name) to the interfaces of,
 48 | the Work and derivative works thereof.
 49 | 
 50 | "Contributor" shall mean Licensor and any individual or Legal Entity
 51 | on behalf of whom a Contribution has been received by Licensor and
 52 | subsequently incorporated within the Work.
 53 | 
 54 | "Contribution" shall mean any work of authorship, including
 55 | the original version of the Work and any modifications or additions
 56 | to that Work or Derivative Works thereof, that is intentionally
 57 | submitted to Licensor for inclusion in the Work by the copyright
 58 | owner or by an individual or Legal Entity authorized to submit on
 59 | behalf of the copyright owner. For the purposes of this definition,
 60 | "submitted" means any form of electronic, verbal, or written
 61 | communication sent to the Licensor or its representatives, including
 62 | but not limited to communication on electronic mailing lists,
 63 | source code control systems, and issue tracking systems that are
 64 | managed by, or on behalf of, the Licensor for the purpose of
 65 | discussing and improving the Work, but excluding communication that
 66 | is conspicuously marked or otherwise designated in writing by the
 67 | copyright owner as "Not a Contribution."
 68 | 
 69 | 2. Grant of Copyright License. Subject to the terms and conditions of
 70 | this License, each Contributor hereby grants to You a perpetual,
 71 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 72 | copyright license to use, reproduce, modify, display, perform,
 73 | sublicense, and distribute the Work and such Derivative Works in
 74 | Source or Object form.
 75 | 
 76 | 3. Grant of Patent License. Subject to the terms and conditions of
 77 | this License, each Contributor hereby grants to You a perpetual,
 78 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 79 | (except as stated in this section) patent license to make, have made,
 80 | use, offer to sell, sell, import, and otherwise transfer the Work,
 81 | where such license applies only to those patent claims licensable
 82 | by such Contributor that are necessarily infringed by their
 83 | Contribution(s) alone or by combination of their Contribution(s)
 84 | with the Work to which such Contribution(s) was submitted. If You
 85 | institute patent litigation against any entity (including a
 86 | cross-claim or counterclaim in a lawsuit) alleging that the Work
 87 | or a Contribution incorporated within the Work constitutes direct
 88 | or contributory patent infringement, then any patent licenses
 89 | granted to You under this License for that Work shall terminate
 90 | as of the date such litigation is filed.
 91 | 
 92 | 4. Redistribution. You may reproduce and distribute copies of the
 93 | Work or Derivative Works thereof in any medium, with or without
 94 | modifications, and in Source or Object form, provided that You
 95 | meet the following conditions:
 96 | 
 97 | (a) You must give any other recipients of the Work or
 98 | Derivative Works a copy of this License; and
 99 | 
100 | (b) You must cause any modified files to carry prominent notices
101 | stating that You changed the files; and
102 | 
103 | (c) You must retain, in the Source form of any Derivative Works
104 | that You distribute, all copyright, patent, trademark, and
105 | attribution notices from the Source form of the Work,
106 | excluding those notices that do not pertain to any part of
107 | the Derivative Works; and
108 | 
109 | (d) If the Work includes a "NOTICE" text file as part of its
110 | distribution, then any Derivative Works that You distribute must
111 | include a readable copy of the attribution notices contained
112 | within such NOTICE file, excluding those notices that do not
113 | pertain to any part of the Derivative Works, in at least one
114 | of the following places: within a NOTICE text file distributed
115 | as part of the Derivative Works; within the Source form or
116 | documentation, if provided along with the Derivative Works; or,
117 | within a display generated by the Derivative Works, if and
118 | wherever such third-party notices normally appear. The contents
119 | of the NOTICE file are for informational purposes only and
120 | do not modify the License. You may add Your own attribution
121 | notices within Derivative Works that You distribute, alongside
122 | or as an addendum to the NOTICE text from the Work, provided
123 | that such additional attribution notices cannot be construed
124 | as modifying the License.
125 | 
126 | You may add Your own copyright notice and may provide additional or
127 | different license terms and conditions for use, reproduction, or
128 | distribution of Your Derivative Works as a whole, provided Your use,
129 | reproduction, and distribution of the Work otherwise complies with
130 | the conditions stated in this License.
131 | 
132 | 5. Submission of Contributions. Unless You explicitly state otherwise,
133 | any Contribution intentionally submitted for inclusion in the Work
134 | by You to the Licensor shall be under the terms and conditions of
135 | this License, without any additional terms or conditions.
136 | Notwithstanding the above, nothing herein shall supersede or modify
137 | the terms of any separate license agreement you may have executed
138 | with Licensor regarding such Contributions.
139 | 
140 | 6. Trademarks. This License does not grant permission to use the trade
141 | names, trademarks, service marks, or product names of the Licensor,
142 | except as required for reasonable and customary use in describing the
143 | origin of the Work and reproducing the content of the NOTICE file.
144 | 
145 | 7. Disclaimer of Warranty. Unless required by applicable law or
146 | agreed to in writing, Licensor provides the Work (and each
147 | Contributor provides its Contributions) on an "AS IS" BASIS,
148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
149 | implied, including, without limitation, any warranties or conditions
150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
151 | PARTICULAR PURPOSE. You are solely responsible for determining the
152 | appropriateness of using or redistributing the Work and assume any
153 | risks associated with Your exercise of permissions under this License.
154 | 
155 | 8. Limitation of Liability. In no event and under no legal theory,
156 | whether in tort (including negligence), contract, or otherwise,
157 | unless required by applicable law (such as deliberate and grossly
158 | negligent acts) or agreed to in writing, shall any Contributor be
159 | liable to You for damages, including any direct, indirect, special,
160 | incidental, or consequential damages of any character arising as a
161 | result of this License or out of the use or inability to use the
162 | Work (including but not limited to damages for loss of goodwill,
163 | work stoppage, computer failure or malfunction, or any and all
164 | other commercial damages or losses), even if such Contributor
165 | has been advised of the possibility of such damages.
166 | 
167 | 9. Accepting Warranty or Support. You may choose to offer, and to
168 | charge a fee for, warranty, support, indemnity, or other liability
169 | obligations and/or rights consistent with this License. However, in
170 | accepting such obligations, You may act only on Your own behalf and on
171 | Your sole responsibility, not on behalf of any other Contributor, and
172 | only if You agree to indemnify, defend, and hold each Contributor
173 | harmless for any liability incurred by, or claims asserted against,
174 | such Contributor by reason of your accepting any such warranty or support.
175 | 
176 | END OF TERMS AND CONDITIONS
177 | 
178 | APPENDIX: How to apply the Apache License to your work.
179 | 
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!)  The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same page as the copyright notice for easier identification within
187 | third-party archives.
188 | 
189 | Copyright [yyyy] [name of copyright owner]
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.


--------------------------------------------------------------------------------
/examples/basic/client.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en">
  3 | <head>
  4 |     <meta charset="UTF-8">
  5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
  6 |     <title>Kani TTS Client</title>
  7 |     <style>
  8 |         body {
  9 |             font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
 10 |             max-width: 800px;
 11 |             margin: 0 auto;
 12 |             padding: 20px;
 13 |             background-color: #f5f5f5;
 14 |         }
 15 |         
 16 |         .container {
 17 |             background: white;
 18 |             padding: 30px;
 19 |             border-radius: 10px;
 20 |             box-shadow: 0 2px 10px rgba(0,0,0,0.1);
 21 |         }
 22 |         
 23 |         h1 {
 24 |             color: #333;
 25 |             text-align: center;
 26 |             margin-bottom: 30px;
 27 |         }
 28 |         
 29 |         .form-group {
 30 |             margin-bottom: 20px;
 31 |         }
 32 |         
 33 |         label {
 34 |             display: block;
 35 |             margin-bottom: 5px;
 36 |             font-weight: bold;
 37 |             color: #555;
 38 |         }
 39 |         
 40 |         textarea {
 41 |             width: 100%;
 42 |             height: 120px;
 43 |             padding: 12px;
 44 |             border: 2px solid #ddd;
 45 |             border-radius: 5px;
 46 |             font-size: 16px;
 47 |             resize: vertical;
 48 |             box-sizing: border-box;
 49 |         }
 50 |         
 51 |         textarea:focus {
 52 |             border-color: #4CAF50;
 53 |             outline: none;
 54 |         }
 55 |         
 56 |         .controls {
 57 |             display: flex;
 58 |             gap: 15px;
 59 |             align-items: center;
 60 |             flex-wrap: wrap;
 61 |         }
 62 |         
 63 |         .control-group {
 64 |             display: flex;
 65 |             flex-direction: column;
 66 |             min-width: 120px;
 67 |         }
 68 |         
 69 |         .control-group label {
 70 |             font-size: 14px;
 71 |             margin-bottom: 3px;
 72 |         }
 73 |         
 74 |         input[type="number"], input[type="range"] {
 75 |             padding: 8px;
 76 |             border: 2px solid #ddd;
 77 |             border-radius: 3px;
 78 |             font-size: 14px;
 79 |         }
 80 |         
 81 |         input[type="range"] {
 82 |             width: 100px;
 83 |         }
 84 |         
 85 |         .buttons {
 86 |             display: flex;
 87 |             gap: 10px;
 88 |             margin-top: 20px;
 89 |         }
 90 |         
 91 |         button {
 92 |             padding: 12px 24px;
 93 |             font-size: 16px;
 94 |             border: none;
 95 |             border-radius: 5px;
 96 |             cursor: pointer;
 97 |             transition: background-color 0.3s;
 98 |         }
 99 |         
100 |         .btn-primary {
101 |             background-color: #4CAF50;
102 |             color: white;
103 |         }
104 |         
105 |         .btn-primary:hover {
106 |             background-color: #45a049;
107 |         }
108 |         
109 |         .btn-secondary {
110 |             background-color: #2196F3;
111 |             color: white;
112 |         }
113 |         
114 |         .btn-secondary:hover {
115 |             background-color: #1976D2;
116 |         }
117 |         
118 |         .btn-danger {
119 |             background-color: #f44336;
120 |             color: white;
121 |         }
122 |         
123 |         .btn-danger:hover {
124 |             background-color: #d32f2f;
125 |         }
126 |         
127 |         button:disabled {
128 |             background-color: #cccccc;
129 |             cursor: not-allowed;
130 |         }
131 |         
132 |         .audio-container {
133 |             margin-top: 20px;
134 |             padding: 20px;
135 |             background-color: #f9f9f9;
136 |             border-radius: 5px;
137 |             border: 1px solid #ddd;
138 |         }
139 |         
140 |         .audio-container h3 {
141 |             margin-top: 0;
142 |             color: #333;
143 |         }
144 |         
145 |         audio {
146 |             width: 100%;
147 |             margin-top: 10px;
148 |         }
149 |         
150 |         .status {
151 |             margin-top: 15px;
152 |             padding: 10px;
153 |             border-radius: 3px;
154 |             font-weight: bold;
155 |         }
156 |         
157 |         .status.loading {
158 |             background-color: #fff3cd;
159 |             color: #856404;
160 |             border: 1px solid #ffeaa7;
161 |         }
162 |         
163 |         .status.success {
164 |             background-color: #d4edda;
165 |             color: #155724;
166 |             border: 1px solid #c3e6cb;
167 |         }
168 |         
169 |         .status.error {
170 |             background-color: #f8d7da;
171 |             color: #721c24;
172 |             border: 1px solid #f1b0b7;
173 |         }
174 |         
175 |         .examples {
176 |             margin-top: 30px;
177 |             padding: 20px;
178 |             background-color: #e3f2fd;
179 |             border-radius: 5px;
180 |         }
181 |         
182 |         .examples h3 {
183 |             margin-top: 0;
184 |             color: #1565c0;
185 |         }
186 |         
187 |         .example-btn {
188 |             display: inline-block;
189 |             margin: 5px;
190 |             padding: 8px 12px;
191 |             background-color: #1976d2;
192 |             color: white;
193 |             border: none;
194 |             border-radius: 3px;
195 |             cursor: pointer;
196 |             font-size: 14px;
197 |         }
198 |         
199 |         .example-btn:hover {
200 |             background-color: #1565c0;
201 |         }
202 |         
203 |         .api-info {
204 |             margin-top: 30px;
205 |             padding: 15px;
206 |             background-color: #f0f0f0;
207 |             border-radius: 5px;
208 |             font-size: 14px;
209 |             color: #666;
210 |         }
211 |         img {
212 |             margin-left: auto;
213 |             margin-right: auto;
214 |             display: block;
215 |         }
216 |     </style>
217 | </head>
218 | <body>
219 |     <div class="container">
220 |         <img alt="Logo" width="100px" height="100px" src="logo.png" />
221 |         <h1>Kani TTS</h1>
222 |         
223 |         <div class="form-group">
224 |             <label for="textInput">Enter text to convert to speech:</label>
225 |             <textarea 
226 |                 id="textInput" 
227 |                 placeholder="Type your text here... For example: 'Hello world! My name is Kani, I'm a speech generation model!'"
228 |             >The morning fog rolled across the valley like a gentle gray blanket, slowly revealing the ancient oak trees that had stood sentinel for centuries.</textarea>
229 |         </div>
230 |         
231 |         <div class="form-group">
232 |             <label>Generation Parameters:</label>
233 |             <div class="controls">
234 |                 <div class="control-group">
235 |                     <label for="temperature">Temperature:</label>
236 |                     <input type="range" id="temperature" min="0.1" max="1.5" step="0.1" value="0.6">
237 |                     <span id="tempValue">0.6</span>
238 |                 </div>
239 |                 <div class="control-group">
240 |                     <label for="maxTokens">Max Tokens:</label>
241 |                     <input type="number" id="maxTokens" min="100" max="2000" value="1200">
242 |                 </div>
243 |             </div>
244 |         </div>
245 |         
246 |         <div class="buttons">
247 |             <button id="generateBtn" class="btn-primary">🎵 Generate Speech</button>
248 |             <button id="streamBtn" class="btn-secondary">📡 Stream Speech</button>
249 |             <button id="stopBtn" class="btn-danger" disabled>⏹️ Stop</button>
250 |         </div>
251 |         
252 |         <div id="status"></div>
253 |         
254 |         <div id="audioContainer" class="audio-container" style="display: none;">
255 |             <h3>Generated Audio:</h3>
256 |             <audio id="audioPlayer" controls>
257 |                 Your browser does not support the audio element.
258 |             </audio>
259 |             <div>
260 |                 <button id="downloadBtn" class="btn-secondary" style="margin-top: 10px;">💾 Download Audio</button>
261 |             </div>
262 |         </div>
263 |         
264 |         <div class="examples">
265 |             <h3>Quick Examples:</h3>
266 |             <button class="example-btn" onclick="setExample('Hello world! My name is Kani, I am a speech generation model!')">Greeting</button>
267 |             <button class="example-btn" onclick="setExample('puck: The quick brown fox jumps over the lazy dog.')">Pangram</button>
268 |             <button class="example-btn" onclick="setExample('katie: Welcome to our artificial intelligence powered text to speech system.')">Welcome</button>
269 |             <button class="example-btn" onclick="setExample('ming: In a hole in the ground there lived a hobbit.')">Story</button>
270 |         </div>
271 |         
272 |         <div class="api-info">
273 |             <strong>API Endpoints:</strong><br>
274 |             • POST /tts - Generate complete audio file<br>
275 |             • POST /stream-tts - Stream audio generation<br>
276 |             • GET /health - Check server status<br>
277 |             <br>
278 |             <strong>Server:</strong> <span id="serverUrl">http://localhost:8000</span>
279 |         </div>
280 |     </div>
281 | 
282 |     <script>
283 |         const API_BASE = 'http://localhost:8000';
284 |         let currentAudioUrl = null;
285 |         let isGenerating = false;
286 |         
287 |         // Elements
288 |         const textInput = document.getElementById('textInput');
289 |         const temperatureSlider = document.getElementById('temperature');
290 |         const tempValue = document.getElementById('tempValue');
291 |         const maxTokensInput = document.getElementById('maxTokens');
292 |         const generateBtn = document.getElementById('generateBtn');
293 |         const streamBtn = document.getElementById('streamBtn');
294 |         const stopBtn = document.getElementById('stopBtn');
295 |         const statusDiv = document.getElementById('status');
296 |         const audioContainer = document.getElementById('audioContainer');
297 |         const audioPlayer = document.getElementById('audioPlayer');
298 |         const downloadBtn = document.getElementById('downloadBtn');
299 |         
300 |         // Update temperature display
301 |         temperatureSlider.addEventListener('input', (e) => {
302 |             tempValue.textContent = e.target.value;
303 |         });
304 |         
305 |         // Set example text
306 |         function setExample(text) {
307 |             textInput.value = text;
308 |         }
309 |         
310 |         // Show status message
311 |         function showStatus(message, type = 'info') {
312 |             statusDiv.innerHTML = `<div class="status ${type}">${message}</div>`;
313 |         }
314 |         
315 |         // Clear status
316 |         function clearStatus() {
317 |             statusDiv.innerHTML = '';
318 |         }
319 |         
320 |         // Set loading state
321 |         function setLoading(loading) {
322 |             isGenerating = loading;
323 |             generateBtn.disabled = loading;
324 |             streamBtn.disabled = loading;
325 |             stopBtn.disabled = !loading;
326 |             
327 |             if (loading) {
328 |                 showStatus('🎵 Generating speech... This may take a few seconds.', 'loading');
329 |             }
330 |         }
331 |         
332 |         // Generate speech (regular endpoint)
333 |         async function generateSpeech() {
334 |             const text = textInput.value.trim();
335 |             if (!text) {
336 |                 showStatus('Please enter some text to convert to speech.', 'error');
337 |                 return;
338 |             }
339 |             
340 |             setLoading(true);
341 |             
342 |             try {
343 |                 const response = await fetch(`${API_BASE}/tts`, {
344 |                     method: 'POST',
345 |                     headers: {
346 |                         'Content-Type': 'application/json',
347 |                     },
348 |                     body: JSON.stringify({
349 |                         text: text,
350 |                         temperature: parseFloat(temperatureSlider.value),
351 |                         max_tokens: parseInt(maxTokensInput.value)
352 |                     })
353 |                 });
354 |                 
355 |                 if (!response.ok) {
356 |                     const error = await response.json();
357 |                     throw new Error(error.detail || 'Speech generation failed');
358 |                 }
359 |                 
360 |                 const audioBlob = await response.blob();
361 |                 displayAudio(audioBlob);
362 |                 showStatus('✅ Speech generated successfully!', 'success');
363 |                 
364 |             } catch (error) {
365 |                 console.error('Error:', error);
366 |                 showStatus(`❌ Error: ${error.message}`, 'error');
367 |             } finally {
368 |                 setLoading(false);
369 |             }
370 |         }
371 |         
372 |         // Stream speech with immediate playback
373 |         async function streamSpeech() {
374 |             const text = textInput.value.trim();
375 |             if (!text) {
376 |                 showStatus('Please enter some text to convert to speech.', 'error');
377 |                 return;
378 |             }
379 | 
380 |             setLoading(true);
381 |             showStatus('🎵 Starting audio generation... Audio will begin playing shortly.', 'loading');
382 | 
383 |             try {
384 |                 const response = await fetch(`${API_BASE}/stream-tts`, {
385 |                     method: 'POST',
386 |                     headers: {
387 |                         'Content-Type': 'application/json',
388 |                     },
389 |                     body: JSON.stringify({
390 |                         text: text,
391 |                         temperature: parseFloat(temperatureSlider.value),
392 |                         max_tokens: parseInt(maxTokensInput.value)
393 |                     })
394 |                 });
395 | 
396 |                 if (!response.ok) {
397 |                     const error = await response.json();
398 |                     throw new Error(error.detail || 'Speech streaming failed');
399 |                 }
400 | 
401 |                 // Handle streaming PCM audio
402 |                 await handleStreamingPCM(response);
403 |                 showStatus('✅ Speech streamed and played successfully!', 'success');
404 | 
405 |             } catch (error) {
406 |                 console.error('Error:', error);
407 |                 showStatus(`❌ Error: ${error.message}`, 'error');
408 |             } finally {
409 |                 setLoading(false);
410 |             }
411 |         }
412 | 
413 |         // Handle streaming PCM audio chunks
414 |         async function handleStreamingPCM(response) {
415 |             const reader = response.body.getReader();
416 |             const sampleRate = 22050;
417 |             const channels = 1;
418 | 
419 |             // Create Audio Context for progressive playback
420 |             const audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate });
421 | 
422 |             let nextStartTime = null;
423 |             let hasStartedPlaying = false;
424 |             let chunksScheduled = 0;
425 |             const PREBUFFER_CHUNKS = 1; // Buffer 2 chunks before starting
426 | 
427 |             const allPCMChunks = [];
428 |             let buffer = new Uint8Array(0);
429 | 
430 |             try {
431 |                 while (true) {
432 |                     const { done, value } = await reader.read();
433 | 
434 |                     if (done) break;
435 | 
436 |                     // Append to buffer
437 |                     const newBuffer = new Uint8Array(buffer.length + value.length);
438 |                     newBuffer.set(buffer);
439 |                     newBuffer.set(value, buffer.length);
440 |                     buffer = newBuffer;
441 | 
442 |                     // Process complete chunks (4 bytes length + data)
443 |                     while (buffer.length >= 4) {
444 |                         const dataView = new DataView(buffer.buffer, buffer.byteOffset, buffer.length);
445 |                         const chunkLength = dataView.getUint32(0, true); // little-endian
446 | 
447 |                         if (chunkLength === 0) {
448 |                             // End marker
449 |                             console.log('[STREAM] Received end marker');
450 |                             buffer = buffer.slice(4);
451 |                             break;
452 |                         }
453 | 
454 |                         if (chunkLength === 0xFFFFFFFF) {
455 |                             // Error marker
456 |                             throw new Error('Server error during streaming');
457 |                         }
458 | 
459 |                         if (buffer.length < 4 + chunkLength) {
460 |                             // Not enough data yet
461 |                             break;
462 |                         }
463 | 
464 |                         // Extract PCM chunk
465 |                         const pcmData = buffer.slice(4, 4 + chunkLength);
466 |                         buffer = buffer.slice(4 + chunkLength);
467 | 
468 |                         // Convert Int16 PCM to Float32
469 |                         const int16Array = new Int16Array(pcmData.buffer, pcmData.byteOffset, pcmData.length / 2);
470 |                         const float32Array = new Float32Array(int16Array.length);
471 |                         for (let i = 0; i < int16Array.length; i++) {
472 |                             float32Array[i] = int16Array[i] / 32768.0;
473 |                         }
474 | 
475 |                         allPCMChunks.push(float32Array);
476 | 
477 |                         // Create audio buffer
478 |                         const audioBuffer = audioContext.createBuffer(channels, float32Array.length, sampleRate);
479 |                         audioBuffer.getChannelData(0).set(float32Array);
480 | 
481 |                         const source = audioContext.createBufferSource();
482 |                         source.buffer = audioBuffer;
483 |                         source.connect(audioContext.destination);
484 | 
485 |                         // Start playback only after buffering initial chunks
486 |                         if (!hasStartedPlaying && allPCMChunks.length >= PREBUFFER_CHUNKS) {
487 |                             nextStartTime = audioContext.currentTime + 0.05;
488 |                             hasStartedPlaying = true;
489 |                             showStatus('🔊 Audio playing! Streaming in real-time...', 'loading');
490 |                             console.log(`[STREAM] Starting playback with ${PREBUFFER_CHUNKS} chunks buffered`);
491 |                         }
492 | 
493 |                         // Schedule chunk (only if playback has started)
494 |                         if (hasStartedPlaying) {
495 |                             const now = audioContext.currentTime;
496 |                             const bufferAhead = nextStartTime - now;
497 | 
498 |                             // Dynamic buffer: keep at least 50% of chunk duration buffered
499 |                             const minBuffer = audioBuffer.duration * 0.5;
500 |                             if (bufferAhead < minBuffer) {
501 |                                 nextStartTime = now + minBuffer;
502 |                                 console.warn(`[STREAM] Buffer underrun! Adding ${(minBuffer * 1000).toFixed(0)}ms safety gap`);
503 |                             }
504 | 
505 |                             source.start(nextStartTime);
506 |                             nextStartTime += audioBuffer.duration;
507 |                             chunksScheduled++;
508 |                             console.log(`[STREAM] Scheduled chunk ${chunksScheduled}, duration ${audioBuffer.duration.toFixed(3)}s, next ${nextStartTime.toFixed(3)}s`);
509 |                         } else {
510 |                             console.log(`[STREAM] Buffering chunk ${allPCMChunks.length}/${PREBUFFER_CHUNKS}...`);
511 |                         }
512 |                     }
513 |                 }
514 | 
515 |                 // Create final WAV file for download
516 |                 if (allPCMChunks.length > 0) {
517 |                     const totalLength = allPCMChunks.reduce((sum, chunk) => sum + chunk.length, 0);
518 |                     const finalPCM = new Float32Array(totalLength);
519 |                     let offset = 0;
520 |                     for (const chunk of allPCMChunks) {
521 |                         finalPCM.set(chunk, offset);
522 |                         offset += chunk.length;
523 |                     }
524 | 
525 |                     // Convert to WAV
526 |                     const wavBlob = createWavBlob(finalPCM, sampleRate);
527 |                     displayStreamingAudio(wavBlob, false);
528 |                 }
529 | 
530 |             } finally {
531 |                 reader.releaseLock();
532 |             }
533 |         }
534 | 
535 |         // Create WAV file from PCM data
536 |         function createWavBlob(pcmData, sampleRate) {
537 |             const numChannels = 1;
538 |             const bitsPerSample = 16;
539 |             const bytesPerSample = bitsPerSample / 8;
540 |             const blockAlign = numChannels * bytesPerSample;
541 | 
542 |             const dataLength = pcmData.length * bytesPerSample;
543 |             const buffer = new ArrayBuffer(44 + dataLength);
544 |             const view = new DataView(buffer);
545 | 
546 |             // WAV header
547 |             const writeString = (offset, string) => {
548 |                 for (let i = 0; i < string.length; i++) {
549 |                     view.setUint8(offset + i, string.charCodeAt(i));
550 |                 }
551 |             };
552 | 
553 |             writeString(0, 'RIFF');
554 |             view.setUint32(4, 36 + dataLength, true);
555 |             writeString(8, 'WAVE');
556 |             writeString(12, 'fmt ');
557 |             view.setUint32(16, 16, true); // fmt chunk size
558 |             view.setUint16(20, 1, true); // PCM format
559 |             view.setUint16(22, numChannels, true);
560 |             view.setUint32(24, sampleRate, true);
561 |             view.setUint32(28, sampleRate * blockAlign, true); // byte rate
562 |             view.setUint16(32, blockAlign, true);
563 |             view.setUint16(34, bitsPerSample, true);
564 |             writeString(36, 'data');
565 |             view.setUint32(40, dataLength, true);
566 | 
567 |             // PCM data
568 |             let offset = 44;
569 |             for (let i = 0; i < pcmData.length; i++) {
570 |                 const sample = Math.max(-1, Math.min(1, pcmData[i]));
571 |                 view.setInt16(offset, sample * 0x7FFF, true);
572 |                 offset += 2;
573 |             }
574 | 
575 |             return new Blob([buffer], { type: 'audio/wav' });
576 |         }
577 | 
578 |         // Display streaming audio with option for immediate playback
579 |         function displayStreamingAudio(audioBlob, isStreaming = false) {
580 |             // Clean up previous audio URL
581 |             if (currentAudioUrl) {
582 |                 URL.revokeObjectURL(currentAudioUrl);
583 |             }
584 | 
585 |             // For final audio: update the main player
586 |             currentAudioUrl = URL.createObjectURL(audioBlob);
587 |             audioPlayer.src = currentAudioUrl;
588 |             audioContainer.style.display = 'block';
589 | 
590 |             // Set up download for complete audio
591 |             downloadBtn.onclick = () => {
592 |                 const a = document.createElement('a');
593 |                 a.href = currentAudioUrl;
594 |                 a.download = `tts_audio_${new Date().getTime()}.wav`;
595 |                 a.click();
596 |             };
597 |         }
598 |         
599 |         // Display audio
600 |         function displayAudio(audioBlob) {
601 |             // Clean up previous audio URL
602 |             if (currentAudioUrl) {
603 |                 URL.revokeObjectURL(currentAudioUrl);
604 |             }
605 |             
606 |             currentAudioUrl = URL.createObjectURL(audioBlob);
607 |             audioPlayer.src = currentAudioUrl;
608 |             audioContainer.style.display = 'block';
609 |             
610 |             // Set up download
611 |             downloadBtn.onclick = () => {
612 |                 const a = document.createElement('a');
613 |                 a.href = currentAudioUrl;
614 |                 a.download = `tts_audio_${new Date().getTime()}.wav`;
615 |                 a.click();
616 |             };
617 |         }
618 |         
619 |         // Stop generation (placeholder - would need server-side implementation)
620 |         function stopGeneration() {
621 |             showStatus('Stop functionality would require server-side implementation.', 'info');
622 |             setLoading(false);
623 |         }
624 |         
625 |         // Event listeners
626 |         generateBtn.addEventListener('click', generateSpeech);
627 |         streamBtn.addEventListener('click', streamSpeech);
628 |         stopBtn.addEventListener('click', stopGeneration);
629 |         
630 |         // Enter key support
631 |         textInput.addEventListener('keydown', (e) => {
632 |             if (e.ctrlKey && e.key === 'Enter') {
633 |                 generateSpeech();
634 |             }
635 |         });
636 |         
637 |         // Health check on load
638 |         window.addEventListener('load', async () => {
639 |             try {
640 |                 const response = await fetch(`${API_BASE}/health`);
641 |                 if (response.ok) {
642 |                     const health = await response.json();
643 |                     if (health.tts_initialized) {
644 |                         showStatus('🟢 Server is ready!', 'success');
645 |                         setTimeout(clearStatus, 3000);
646 |                     } else {
647 |                         showStatus('🟡 Server is starting up...', 'loading');
648 |                     }
649 |                 } else {
650 |                     showStatus('🔴 Cannot connect to server. Make sure it\'s running on http://localhost:8000', 'error');
651 |                 }
652 |             } catch (error) {
653 |                 showStatus('🔴 Cannot connect to server. Make sure it\'s running on http://localhost:8000', 'error');
654 |             }
655 |         });
656 |         
657 |         // Cleanup on page unload
658 |         window.addEventListener('beforeunload', () => {
659 |             if (currentAudioUrl) {
660 |                 URL.revokeObjectURL(currentAudioUrl);
661 |             }
662 |         });
663 |     </script>
664 | </body>
665 | </html>


--------------------------------------------------------------------------------