├── public
│   ├── mia.wav
│   ├── arya.wav
│   ├── logo.png
│   ├── saul.wav
│   └── tommy.wav
├── examples
│   └── basic
│       ├── generation
│       │   ├── __init__.py
│       │   └── generator.py
│       ├── audio
│       │   ├── __init__.py
│       │   ├── player.py
│       │   └── streaming.py
│       ├── config.py
│       ├── main.py
│       ├── setup.sh
│       ├── README.md
│       ├── server.py
│       └── client.html
├── .gitignore
├── README.md
└── LICENSE

--------------------------------------------------------------------------------
/public/mia.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nineninesix-ai/kani-tts/HEAD/public/mia.wav

--------------------------------------------------------------------------------
/public/arya.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nineninesix-ai/kani-tts/HEAD/public/arya.wav

--------------------------------------------------------------------------------
/public/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nineninesix-ai/kani-tts/HEAD/public/logo.png

--------------------------------------------------------------------------------
/public/saul.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nineninesix-ai/kani-tts/HEAD/public/saul.wav

--------------------------------------------------------------------------------
/public/tommy.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nineninesix-ai/kani-tts/HEAD/public/tommy.wav

--------------------------------------------------------------------------------
/examples/basic/generation/__init__.py:
--------------------------------------------------------------------------------
1 | """Text-to-speech generation modules"""
2 | 
3 | from .generator import TTSGenerator
4 | 
5 | __all__ = ['TTSGenerator']
6 | 

--------------------------------------------------------------------------------
/examples/basic/audio/__init__.py:
--------------------------------------------------------------------------------
1 | """Audio processing modules for Kani TTS"""
2 | 
3 | from .player import LLMAudioPlayer
4 | from .streaming import StreamingAudioWriter
5 | 
6 | __all__ = ['LLMAudioPlayer', 'StreamingAudioWriter']
7 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.so
6 | .Python
7 | venv/
8 | env/
9 | ENV/
10 | 
11 | # OS
12 | .DS_Store
13 | .DS_Store?
14 | ._*
15 | .Spotlight-V100
16 | .Trashes
17 | 
18 | *.safetensors
19 | *.bin
20 | 
21 | 
22 | # IDE
23 | .vscode/
24 | .idea/
25 | *.swp
26 | *.swo
27 | *~
28 | 
29 | # Logs
30 | *.log
31 | *.wav
32 | .venv
33 | 

--------------------------------------------------------------------------------
/examples/basic/config.py:
--------------------------------------------------------------------------------
1 | """Configuration and constants for Kani TTS"""
2 | 
3 | # Tokenizer configuration
4 | TOKENIZER_LENGTH = 64400
5 | 
6 | # Special tokens
7 | START_OF_TEXT = 1
8 | END_OF_TEXT = 2
9 | START_OF_SPEECH = TOKENIZER_LENGTH + 1
10 | END_OF_SPEECH = TOKENIZER_LENGTH + 2
11 | START_OF_HUMAN = TOKENIZER_LENGTH + 3
12 | END_OF_HUMAN = TOKENIZER_LENGTH + 4
13 | START_OF_AI = TOKENIZER_LENGTH + 5
14 | END_OF_AI = TOKENIZER_LENGTH + 6
15 | PAD_TOKEN = TOKENIZER_LENGTH + 7
16 | AUDIO_TOKENS_START = TOKENIZER_LENGTH + 10
17 | 
18 | # Audio configuration
19 | CODEBOOK_SIZE = 4032
20 | SAMPLE_RATE = 22050
21 | 
22 | # Streaming configuration
23 | CHUNK_SIZE = 25  # Number of new frames to output per iteration
24 | LOOKBACK_FRAMES = 15  # Number of frames to include from previous context
25 | 
26 | # Generation configuration
27 | TEMPERATURE = 0.6
28 | TOP_P = 0.95
29 | REPETITION_PENALTY = 1.1
30 | REPETITION_CONTEXT_SIZE = 20
31 | MAX_TOKENS = 1200
32 | 
33 | # Model paths
34 | MODEL_NAME = "nineninesix/kani-tts-370m"
35 | CODEC_MODEL_NAME = "nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps"
36 | 
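These constants define one flat token-ID space: text tokens occupy 0-64399, the control tokens sit just above `TOKENIZER_LENGTH`, and audio tokens begin at `AUDIO_TOKENS_START`, one `CODEBOOK_SIZE`-wide block per codebook. A minimal sketch of the implied layout (derived from the values above; the packing formula mirrors the offset subtraction in `audio/player.py` and is not itself repo code):

```python
# Sketch: concrete token IDs implied by config.py (illustrative, not repo code).
TOKENIZER_LENGTH = 64400
START_OF_SPEECH = TOKENIZER_LENGTH + 1      # 64401
END_OF_SPEECH = TOKENIZER_LENGTH + 2        # 64402
AUDIO_TOKENS_START = TOKENIZER_LENGTH + 10  # 64410
CODEBOOK_SIZE = 4032

def audio_token_id(codebook: int, code: int) -> int:
    """Map (codebook index 0-3, code 0-4031) to a flat LLM token ID.

    Inverse of the subtraction in player.py:
    audio_codes - codebook_size * i - audio_tokens_start.
    """
    assert 0 <= codebook < 4 and 0 <= code < CODEBOOK_SIZE
    return AUDIO_TOKENS_START + codebook * CODEBOOK_SIZE + code

# One 12.5 fps frame is four consecutive tokens, one per codebook:
frame = [audio_token_id(cb, 0) for cb in range(4)]
print(frame)  # [64410, 68442, 72474, 76506]
```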
--------------------------------------------------------------------------------
/examples/basic/main.py:
--------------------------------------------------------------------------------
1 | """Kani TTS - Text to Speech Generation"""
2 | 
3 | import time
4 | from audio import LLMAudioPlayer, StreamingAudioWriter
5 | from generation import TTSGenerator
6 | from config import CHUNK_SIZE, LOOKBACK_FRAMES
7 | 
8 | from nemo.utils.nemo_logging import Logger
9 | 
10 | nemo_logger = Logger()
11 | nemo_logger.remove_stream_handlers()
12 | 
13 | 
14 | def time_report(point_1, point_2, point_3):
15 |     model_request = point_2 - point_1
16 |     player_time = point_3 - point_2
17 |     total_time = point_3 - point_1
18 |     report = f"SPEECH TOKENS: {model_request:.2f}\nCODEC: {player_time:.2f}\nTOTAL: {total_time:.2f}"
19 |     return report
20 | 
21 | 
22 | def main():
23 |     # Initialize generator and audio player
24 |     generator = TTSGenerator()
25 |     player = LLMAudioPlayer(generator.tokenizer)
26 | 
27 |     # Set prompt
28 |     prompt = "katie: Oh, yeah. I mean did you want to get a quick snack together or maybe something before you go?"
29 | 
30 |     # Create streaming audio writer with sliding window decoder
31 |     # Uses lookback context from previous frames to maintain codec continuity
32 |     audio_writer = StreamingAudioWriter(
33 |         player,
34 |         'output.wav',
35 |         chunk_size=CHUNK_SIZE,  # Output 25 new frames (2.0s) per iteration
36 |         lookback_frames=LOOKBACK_FRAMES  # Include 15 previous frames (1.2s) for context
37 |     )
38 |     audio_writer.start()
39 | 
40 |     # Generate speech
41 |     result = generator.generate(prompt, audio_writer)
42 | 
43 |     # Finalize and write audio file
44 |     audio = audio_writer.finalize()
45 | 
46 |     point_3 = time.time()
47 | 
48 |     # Print results
49 |     print(time_report(result['point_1'], result['point_2'], point_3))
50 |     # print(f"\n[DEBUG] First 100 chars of generated text: {result['generated_text'][:100]}")
51 |     # print(f"[DEBUG] Last 100 chars of generated text: {result['generated_text'][-100:]}")
52 | 
53 | 
54 | if __name__ == "__main__":
55 |     main()
56 | 
57 | 
--------------------------------------------------------------------------------
/examples/basic/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | 
4 | echo "=== KaniTTS Setup ==="
5 | echo ""
6 | 
7 | # Check if Python is available
8 | if ! command -v python3 &> /dev/null; then
9 |     echo "Error: Python 3 is not installed. Please install Python 3.10+ first."
10 |     exit 1
11 | fi
12 | 
13 | # Check Python version
14 | PYTHON_VERSION=$(python3 --version 2>&1 | awk '{print $2}')
15 | echo "Python version: $PYTHON_VERSION"
16 | 
17 | # Validate Python version (only 3.10-3.12 supported)
18 | PYTHON_MAJOR=$(echo $PYTHON_VERSION | cut -d. -f1)
19 | PYTHON_MINOR=$(echo $PYTHON_VERSION | cut -d. -f2)
20 | 
21 | if [ "$PYTHON_MAJOR" -ne 3 ] || [ "$PYTHON_MINOR" -lt 10 ] || [ "$PYTHON_MINOR" -gt 12 ]; then
22 |     echo "Error: This project requires Python 3.10, 3.11, or 3.12"
23 |     echo "Current version: $PYTHON_VERSION"
24 |     exit 1
25 | fi
26 | echo "Python version is supported"
27 | 
28 | # Create virtual environment if it doesn't exist
29 | if [ ! -d "venv" ]; then
30 |     echo ""
31 |     echo "Creating virtual environment..."
32 |     python3 -m venv venv
33 |     echo "Virtual environment created successfully"
34 | else
35 |     echo ""
36 |     echo "Virtual environment already exists"
37 | fi
38 | 
39 | # Activate virtual environment
40 | echo "Activating virtual environment..."
41 | source venv/bin/activate
42 | 
43 | # Upgrade pip
44 | echo "Upgrading pip..."
45 | pip install --upgrade pip
46 | 
47 | # Install dependencies
48 | echo ""
49 | echo "Installing dependencies..."
50 | 
51 | # Install FastAPI and Uvicorn
52 | echo "Installing FastAPI and Uvicorn..."
53 | pip install fastapi uvicorn
54 | 
55 | # Install nemo-toolkit (which will install transformers 4.53)
56 | echo ""
57 | echo "Installing nemo-toolkit[tts]..."
58 | pip install "nemo-toolkit[tts]==2.4.0"
59 | 
60 | # Force reinstall transformers to 4.57.1 (required for model compatibility)
61 | echo ""
62 | echo "Upgrading transformers to 4.57.1..."
63 | echo "Note: nemo-toolkit[tts] requires transformers==4.53, but we need 4.57.1 for model compatibility"
64 | pip install "transformers==4.57.1"
65 | 
66 | # Verify installation
67 | echo ""
68 | echo "=== Verifying Installation ==="
69 | echo ""
70 | 
71 | python -c "import torch; print(f'PyTorch version: {torch.__version__}')"
72 | python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
73 | python -c "import torch; print(f'CUDA version: {torch.version.cuda}') if torch.cuda.is_available() else print('CUDA not available')"
74 | python -c "import vllm; print(f'vLLM version: {vllm.__version__}')" 2>/dev/null || echo "vLLM not installed (optional; this script does not install it, and it is only used by the separate vLLM example)"
75 | python -c "import transformers; print(f'Transformers version: {transformers.__version__}')"
76 | python -c "import fastapi; print(f'FastAPI version: {fastapi.__version__}')"
77 | 
78 | echo ""
79 | echo "=== Setup Complete ==="
80 | echo ""
81 | echo "You can now start the server with:"
82 | echo "  source venv/bin/activate"
83 | echo "  python server.py"
84 | echo ""
85 | echo "Note: Models will be automatically downloaded on first run (~1.5GB)"
86 | 

--------------------------------------------------------------------------------
/examples/basic/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Basic example for KaniTTS
3 | 
4 | ## Installation
5 | ### Prerequisites
6 | 
7 | - Python 3.10-3.12
8 | - Git
9 | 
10 | ### Setup
11 | 
12 | ```bash
13 | chmod +x setup.sh
14 | ./setup.sh
15 | ```
16 | 
17 | ## Usage
18 | 
19 | ### Option 1: Standalone Generation (Local WAV File)
20 | 
21 | Generate audio and save it to a WAV file:
22 | 
23 | ```bash
24 | python main.py
25 | ```
26 | 
27 | This will:
28 | - Generate speech from the prompt in [main.py:28](main.py#L28)
29 | - Save output to `output.wav`
30 | - Display timing metrics for performance analysis
31 | 
32 | ### Option 2: FastAPI Server + Web Interface
33 | 
34 | 1. Start the server:
35 | ```bash
36 | python server.py
37 | ```
38 | 
39 | The server will start on `http://localhost:8000`
40 | 
41 | 2. Open the web interface:
42 | ```bash
43 | open client.html
44 | ```
45 | 
46 | Or navigate to `http://localhost:8000` in your browser
47 | 
48 | ## API Endpoints
49 | 
50 | ### `POST /tts`
51 | Generate complete audio file (non-streaming)
52 | 
53 | **Request:**
54 | ```json
55 | {
56 |   "text": "Hello world!",
57 |   "temperature": 0.6,
58 |   "max_tokens": 1200,
59 |   "top_p": 0.95,
60 |   "chunk_size": 25,
61 |   "lookback_frames": 15
62 | }
63 | ```
64 | 
65 | **Response:** WAV audio file
66 | 
67 | ### `POST /stream-tts`
68 | Stream audio chunks for immediate playback
69 | 
70 | **Request:** Same as `/tts`
71 | 
72 | **Response:** Streaming PCM audio chunks with metadata headers
73 | 
74 | ## Configuration
75 | 
76 | Edit [config.py](config.py) to customize:
77 | 
78 | ```python
79 | # Audio settings
80 | CHUNK_SIZE = 25        # Frames per streaming iteration (2.0s)
81 | LOOKBACK_FRAMES = 15   # Context frames for continuity (1.2s)
82 | 
83 | # Generation parameters
84 | TEMPERATURE = 0.6
85 | TOP_P = 0.95
86 | REPETITION_PENALTY = 1.1
87 | MAX_TOKENS = 1200
88 | 
89 | # Model configuration
90 | MODEL_NAME = "nineninesix/kani-tts-370m"
91 | CODEC_MODEL_NAME = "nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps"
92 | ```
93 | 
94 | ## Technical Details
95 | 
96 | ### Streaming Architecture
97 | 
98 | The system uses a **sliding window decoder** for smooth audio generation:
99 | 
100 | 1. **Chunk Size (25 frames)** - Outputs ~2.0 seconds of new audio per iteration
101 | 2. **Lookback Frames (15 frames)** - Includes ~1.2 seconds of context from previous output (see the sketch below)
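At the codec's 12.5 frames per second these settings translate directly into time. A rough sketch of the window arithmetic (illustrative only; the real logic lives in `audio/streaming.py`):

```python
# Illustrative arithmetic for the sliding-window decoder (not repo code).
FPS = 12.5            # nemo-nano-codec frame rate (frames per second)
CHUNK_SIZE = 25       # new frames emitted per iteration: 25 / 12.5 = 2.0 s
LOOKBACK_FRAMES = 15  # context frames re-decoded each time: 15 / 12.5 = 1.2 s

frames_decoded = 50                                     # e.g. two chunks already out
start_frame = max(0, frames_decoded - LOOKBACK_FRAMES)  # decode window starts at 35
decoded = frames_decoded + CHUNK_SIZE - start_frame     # 40 frames decoded this pass
kept = CHUNK_SIZE / FPS                                 # 2.0 s of new audio kept
discarded = (frames_decoded - start_frame) / FPS        # 1.2 s of context discarded
print(start_frame, decoded, kept, discarded)            # 35 40 2.0 1.2
```

Re-decoding the lookback frames and then discarding them is what keeps chunk boundaries free of codec discontinuities.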
102 | 
103 | ### Tested on
104 | 
105 | - NVIDIA GeForce RTX 5080
106 | - Driver Version: 570.169
107 | - CUDA Version: 12.8
108 | - 16GB GPU memory
109 | - Python: 3.12
110 | - Transformers: 4.57.1
111 | 
112 | Generating 15 seconds of audio takes ~1 second and ~2 GB of GPU VRAM.
113 | 
114 | > **Note:** If you experience audio breaks during streaming, try increasing `CHUNK_SIZE` in [config.py](config.py) to buffer more frames per chunk.
115 | 
116 | ## Models
117 | 
118 | - **TTS Model:** [nineninesix/kani-tts-370m](https://huggingface.co/nineninesix/kani-tts-370m)
119 | - **Codec Model:** [nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps](https://huggingface.co/nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps)
120 | 
121 | Models are automatically downloaded from Hugging Face on first run.
122 | 
123 | ## Browser Compatibility
124 | 
125 | The web interface requires a modern browser with support for:
126 | - Web Audio API
127 | - Fetch API with streaming
128 | 
129 | ## License
130 | Apache 2.0
131 | 
132 | ## Contributing
133 | 
134 | Contributions are welcome! Please feel free to submit issues or pull requests.
135 | 
136 | 
137 | 
138 | 
139 | 
140 | 
141 | 
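In the generator module below, `prepare_input()` frames the tokenized prompt with the control tokens from config.py. A sketch of the sequence the model actually sees (the real code builds the same thing with `torch.cat`):

```python
# Sketch of the input sequence built by TTSGenerator.prepare_input (illustrative).
from transformers import AutoTokenizer

from config import MODEL_NAME, START_OF_HUMAN, END_OF_TEXT, END_OF_HUMAN

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
text_ids = tokenizer("katie: Hello there!").input_ids

# [START_OF_HUMAN] <text tokens> [END_OF_TEXT] [END_OF_HUMAN]
input_ids = [START_OF_HUMAN] + text_ids + [END_OF_TEXT, END_OF_HUMAN]
print(input_ids[0], input_ids[-2:])  # 64403 leads; 2 and 64404 close the turn
```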
--------------------------------------------------------------------------------
/examples/basic/generation/generator.py:
--------------------------------------------------------------------------------
1 | """Text-to-speech generation logic"""
2 | 
3 | import time
4 | import torch
5 | from transformers import AutoModelForCausalLM, AutoTokenizer
6 | from transformers.generation.streamers import BaseStreamer
7 | from threading import Thread
8 | 
9 | from config import (
10 |     MODEL_NAME, START_OF_HUMAN, END_OF_TEXT, END_OF_HUMAN, END_OF_AI,
11 |     TEMPERATURE, TOP_P, REPETITION_PENALTY, REPETITION_CONTEXT_SIZE, MAX_TOKENS
12 | )
13 | 
14 | 
15 | class TokenIDStreamer(BaseStreamer):
16 |     """Custom streamer that yields token IDs"""
17 |     def __init__(self, callback):
18 |         self.callback = callback
19 | 
20 |     def put(self, value):
21 |         """Called by model.generate() with token IDs"""
22 |         if len(value.shape) > 1:
23 |             token_ids = value[0].tolist()
24 |         else:
25 |             token_ids = value.tolist()
26 | 
27 |         for token_id in token_ids:
28 |             self.callback(token_id)
29 | 
30 |     def end(self):
31 |         """Called when generation is complete"""
32 |         pass
33 | 
34 | 
35 | class TTSGenerator:
36 |     def __init__(self):
37 |         self.model = AutoModelForCausalLM.from_pretrained(
38 |             MODEL_NAME,
39 |             torch_dtype=torch.bfloat16,
40 |             device_map="auto",
41 |         )
42 |         self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
43 | 
44 |         if torch.cuda.is_available():
45 |             self.device = 'cuda'
46 |         elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
47 |             self.device = 'mps'
48 |         else:
49 |             self.device = 'cpu'
50 | 
51 |     def prepare_input(self, prompt):
52 |         """Build custom input_ids with special tokens"""
53 |         input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
54 |         start_token = torch.tensor([[START_OF_HUMAN]], dtype=torch.int64)
55 |         end_tokens = torch.tensor([[END_OF_TEXT, END_OF_HUMAN]], dtype=torch.int64)
56 |         modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
57 |         modified_input_ids = modified_input_ids.to(self.device)
58 | 
59 |         attention_mask = torch.ones(1, modified_input_ids.shape[1], dtype=torch.int64)
60 |         attention_mask = attention_mask.to(self.device)
61 | 
62 |         return modified_input_ids, attention_mask
63 | 
64 |     def generate(self, prompt, audio_writer, max_tokens=MAX_TOKENS):
65 |         """Generate speech tokens from text prompt"""
66 |         modified_input_ids, attention_mask = self.prepare_input(prompt)
67 | 
68 |         point_1 = time.time()
69 | 
70 |         # Stream tokens from LLM
71 |         all_token_ids = []
72 | 
73 |         def on_token_generated(token_id):
74 |             """Callback for each generated token"""
75 |             all_token_ids.append(token_id)
76 |             # print(f"[LLM] Token {len(all_token_ids)}: {token_id}")
77 |             audio_writer.add_token(token_id)
78 | 
79 |         streamer = TokenIDStreamer(callback=on_token_generated)
80 | 
81 |         generation_kwargs = dict(
82 |             input_ids=modified_input_ids,
83 |             attention_mask=attention_mask,
84 |             max_new_tokens=max_tokens,
85 |             do_sample=True,
86 |             temperature=TEMPERATURE,
87 |             top_p=TOP_P,
88 |             repetition_penalty=REPETITION_PENALTY,
89 |             num_return_sequences=1,
90 |             eos_token_id=END_OF_AI,
91 |             streamer=streamer,
92 |         )
93 | 
94 |         thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
95 |         thread.start()
96 |         thread.join()
97 | 
98 |         point_2 = time.time()
99 | 
100 |         print(f"\n[MAIN] Generation complete. Total tokens: {len(all_token_ids)}")
101 | 
102 |         # Decode generated text from token IDs
103 |         generated_text = self.tokenizer.decode(all_token_ids, skip_special_tokens=True)
104 | 
105 |         return {
106 |             'generated_text': generated_text,
107 |             'all_token_ids': all_token_ids,
108 |             'generation_time': point_2 - point_1,
109 |             'point_1': point_1,
110 |             'point_2': point_2
111 |         }
112 | 
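`generate()` above touches the writer only through `add_token()`, so it can be driven without the full streaming pipeline, for instance to capture raw speech tokens. A minimal sketch with a hypothetical stub writer (the `NullWriter` class is not part of this repo):

```python
# Sketch: capture raw speech tokens with a stub writer (NullWriter is
# hypothetical). TTSGenerator.generate() only ever calls add_token().
from generation import TTSGenerator

class NullWriter:
    """Minimal stand-in for StreamingAudioWriter."""
    def add_token(self, token_id):
        pass  # discard; generate() still returns every ID in all_token_ids

generator = TTSGenerator()
result = generator.generate("katie: Hello there!", NullWriter(), max_tokens=300)
print(len(result['all_token_ids']), f"{result['generation_time']:.2f}s")
```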
--------------------------------------------------------------------------------
/examples/basic/audio/player.py:
--------------------------------------------------------------------------------
1 | """Audio player for LLM-generated speech tokens"""
2 | 
3 | import torch
4 | import numpy as np
5 | from nemo.collections.tts.models import AudioCodecModel
6 | 
7 | from config import (
8 |     TOKENIZER_LENGTH, START_OF_TEXT, END_OF_TEXT,
9 |     START_OF_SPEECH, END_OF_SPEECH, START_OF_HUMAN, END_OF_HUMAN,
10 |     START_OF_AI, END_OF_AI, PAD_TOKEN, AUDIO_TOKENS_START, CODEBOOK_SIZE
11 | )
12 | 
13 | 
14 | class LLMAudioPlayer:
15 |     def __init__(self, tokenizer) -> None:
16 |         self.nemo_codec_model = AudioCodecModel\
17 |             .from_pretrained("nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps").eval()
18 | 
19 |         if torch.cuda.is_available():
20 |             self.device = 'cuda'
21 |         elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
22 |             self.device = 'mps'
23 |         else:
24 |             self.device = 'cpu'
25 | 
26 |         self.nemo_codec_model.to(self.device)
27 |         self.tokenizer = tokenizer
28 | 
29 |         self.tokeniser_length = TOKENIZER_LENGTH
30 |         self.start_of_text = START_OF_TEXT
31 |         self.end_of_text = END_OF_TEXT
32 |         self.start_of_speech = START_OF_SPEECH
33 |         self.end_of_speech = END_OF_SPEECH
34 |         self.start_of_human = START_OF_HUMAN
35 |         self.end_of_human = END_OF_HUMAN
36 |         self.start_of_ai = START_OF_AI
37 |         self.end_of_ai = END_OF_AI
38 |         self.pad_token = PAD_TOKEN
39 |         self.audio_tokens_start = AUDIO_TOKENS_START
40 |         self.codebook_size = CODEBOOK_SIZE
41 | 
42 |     def output_validation(self, out_ids):
43 |         start_of_speech_flag = self.start_of_speech in out_ids
44 |         end_of_speech_flag = self.end_of_speech in out_ids
45 |         if not (start_of_speech_flag and end_of_speech_flag):
46 |             raise ValueError('Special speech tokens do not exist!')
47 | 
48 |     def get_nano_codes(self, out_ids):
49 |         start_a_idx = (out_ids == self.start_of_speech).nonzero(as_tuple=True)[0].item()
50 |         end_a_idx = (out_ids == self.end_of_speech).nonzero(as_tuple=True)[0].item()
51 |         if start_a_idx >= end_a_idx:
52 |             raise ValueError('Invalid audio codes sequence!')
53 | 
54 |         audio_codes = out_ids[start_a_idx+1 : end_a_idx]
55 |         if len(audio_codes) % 4:
56 |             raise ValueError('The length of the sequence must be a multiple of 4!')
57 |         audio_codes = audio_codes.reshape(-1, 4)
58 |         audio_codes = audio_codes - torch.tensor([self.codebook_size * i for i in range(4)])
59 |         audio_codes = audio_codes - self.audio_tokens_start
60 |         if (audio_codes < 0).sum().item() > 0:
61 |             raise ValueError('Invalid audio tokens!')
62 | 
63 |         audio_codes = audio_codes.T.unsqueeze(0)
64 |         len_ = torch.tensor([audio_codes.shape[-1]])
65 |         return audio_codes, len_
66 | 
67 |     def get_text(self, out_ids):
68 |         try:
69 |             start_t_idx = (out_ids == self.start_of_text).tolist().index(True)
70 |             end_t_idx = (out_ids == self.end_of_text).tolist().index(True)
71 |             txt_tokens = out_ids[start_t_idx : end_t_idx+1]
72 |             text = self.tokenizer.decode(txt_tokens, skip_special_tokens=True)
73 |             return text
74 |         except ValueError:
75 |             return None
76 | 
77 |     def get_waveform(self, out_ids):
78 |         out_ids = out_ids.flatten()
79 |         self.output_validation(out_ids)
80 |         audio_codes, len_ = self.get_nano_codes(out_ids)
81 |         audio_codes, len_ = audio_codes.to(self.device), len_.to(self.device)
82 |         with torch.inference_mode():
83 |             reconstructed_audio, _ = self.nemo_codec_model.decode(tokens=audio_codes, tokens_len=len_)
84 |             output_audio = reconstructed_audio.cpu().detach().numpy().squeeze()
85 | 
86 |         text = self.get_text(out_ids)
87 |         return output_audio, text
88 | 
89 |     def decode_audio_chunk(self, audio_codes):
90 |         """Decode a chunk of audio codes (shape: [num_frames, 4])"""
91 |         if len(audio_codes) == 0:
92 |             return None
93 | 
94 |         # Process audio codes: subtract offsets for each codebook
95 |         audio_codes = torch.tensor(audio_codes, device=self.device)
96 |         audio_codes = audio_codes - torch.tensor([self.codebook_size * i for i in range(4)], device=self.device)
97 |         audio_codes = audio_codes - self.audio_tokens_start
98 | 
99 |         if (audio_codes < 0).sum().item() > 0:
100 |             return None  # Invalid tokens, skip
101 | 
102 |         # Shape: (1, 4, num_frames) - batch_size=1, num_codebooks=4, num_frames
103 |         audio_codes = audio_codes.T.unsqueeze(0)
104 |         len_ = torch.tensor([audio_codes.shape[-1]], device=self.device)
105 | 
106 |         with torch.inference_mode():
107 |             reconstructed_audio, _ = self.nemo_codec_model.decode(tokens=audio_codes, tokens_len=len_)
108 |             output_audio = reconstructed_audio.cpu().detach().numpy().squeeze()
109 | 
110 |         return output_audio
111 | 
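`LLMAudioPlayer` offers two decode paths: `get_waveform()` for a complete token sequence (it locates the speech markers itself) and `decode_audio_chunk()` for pre-sliced `[num_frames, 4]` code arrays. A sketch that continues the stub-writer example after generator.py and decodes offline (assumes the generation completed, so the START_OF_SPEECH / END_OF_SPEECH markers that `get_waveform()` validates are present):

```python
# Sketch: decode the token IDs captured by the stub-writer example above.
import torch
from scipy.io.wavfile import write

from audio import LLMAudioPlayer
from config import SAMPLE_RATE

player = LLMAudioPlayer(generator.tokenizer)    # generator from the sketch above
out_ids = torch.tensor(result['all_token_ids'])  # result from the sketch above
audio, text = player.get_waveform(out_ids)       # raises if speech markers missing
write('decoded.wav', SAMPLE_RATE, audio)
```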
--------------------------------------------------------------------------------
/examples/basic/server.py:
--------------------------------------------------------------------------------
1 | """FastAPI server for Kani TTS with streaming support"""
2 | 
3 | import io
4 | import time
5 | from fastapi import FastAPI, HTTPException
6 | from fastapi.middleware.cors import CORSMiddleware
7 | from fastapi.responses import StreamingResponse, Response
8 | from pydantic import BaseModel
9 | from typing import Optional
10 | import numpy as np
11 | from scipy.io.wavfile import write as wav_write
12 | 
13 | from audio import LLMAudioPlayer, StreamingAudioWriter
14 | from generation import TTSGenerator
15 | from config import CHUNK_SIZE, LOOKBACK_FRAMES, TEMPERATURE, TOP_P, MAX_TOKENS
16 | 
17 | from nemo.utils.nemo_logging import Logger
18 | 
19 | nemo_logger = Logger()
20 | nemo_logger.remove_stream_handlers()
21 | 
22 | 
23 | app = FastAPI(title="Kani TTS API", version="1.0.0")
24 | 
25 | # Add CORS middleware to allow client.html to connect
26 | app.add_middleware(
27 |     CORSMiddleware,
28 |     allow_origins=["*"],  # In production, specify your frontend domain
29 |     allow_credentials=True,
30 |     allow_methods=["*"],
31 |     allow_headers=["*"],
32 | )
33 | 
34 | # Global instances (initialized on startup)
35 | generator = None
36 | player = None
37 | 
38 | 
39 | class TTSRequest(BaseModel):
40 |     text: str
41 |     temperature: Optional[float] = TEMPERATURE
42 |     max_tokens: Optional[int] = MAX_TOKENS
43 |     top_p: Optional[float] = TOP_P
44 |     chunk_size: Optional[int] = CHUNK_SIZE
45 |     lookback_frames: Optional[int] = LOOKBACK_FRAMES
46 | 
47 | 
48 | @app.on_event("startup")
49 | async def startup_event():
50 |     """Initialize models on startup"""
51 |     global generator, player
52 |     print("🚀 Initializing TTS models...")
53 |     generator = TTSGenerator()
54 |     player = LLMAudioPlayer(generator.tokenizer)
55 |     print("✅ TTS models initialized successfully!")
56 | 
57 | 
58 | @app.get("/health")
59 | async def health_check():
60 |     """Check if server is ready"""
61 |     return {
62 |         "status": "healthy",
63 |         "tts_initialized": generator is not None and player is not None
64 |     }
65 | 
66 | 
67 | @app.post("/tts")
68 | async def generate_speech(request: TTSRequest):
69 |     """Generate complete audio file (non-streaming)"""
70 |     if not generator or not player:
71 |         raise HTTPException(status_code=503, detail="TTS models not initialized")
72 | 
73 |     try:
74 |         # Create audio writer
75 |         audio_writer = StreamingAudioWriter(
76 |             player,
77 |             output_file=None,  # We won't write to file
78 |             chunk_size=request.chunk_size,
79 |             lookback_frames=request.lookback_frames
80 |         )
81 |         audio_writer.start()
82 | 
83 |         # Generate speech
84 |         result = generator.generate(
85 |             request.text,
86 |             audio_writer,
87 |             max_tokens=request.max_tokens
88 |         )
89 | 
90 |         # Finalize and get audio
91 |         audio_writer.finalize()
92 | 
93 |         if not audio_writer.audio_chunks:
94 |             raise HTTPException(status_code=500, detail="No audio generated")
95 | 
96 |         # Concatenate all chunks
97 |         full_audio = np.concatenate(audio_writer.audio_chunks)
98 | 
99 |         # Convert to WAV bytes
100 |         wav_buffer = io.BytesIO()
101 |         wav_write(wav_buffer, 22050, full_audio)
102 |         wav_buffer.seek(0)
103 | 
104 |         return Response(
105 |             content=wav_buffer.read(),
106 |             media_type="audio/wav",
107 |             headers={
108 |                 "Content-Disposition": "attachment; filename=speech.wav"
109 |             }
110 |         )
111 | 
112 |     except Exception as e:
113 |         raise HTTPException(status_code=500, detail=str(e))
114 | 
115 | 
116 | @app.post("/stream-tts")
117 | async def stream_speech(request: TTSRequest):
118 |     """Stream audio chunks as they're generated for immediate playback"""
119 |     if not generator or not player:
120 |         raise HTTPException(status_code=503, detail="TTS models not initialized")
121 | 
122 |     import queue
123 |     import threading
124 |     import struct
125 | 
126 |     async def audio_chunk_generator():
127 |         """Yield audio chunks as raw PCM data with length prefix"""
128 |         chunk_queue = queue.Queue()
129 | 
130 |         # Create a custom list wrapper that pushes chunks to queue
131 |         class ChunkList(list):
132 |             def append(self, chunk):
133 |                 super().append(chunk)
134 |                 chunk_queue.put(("chunk", chunk))
135 | 
136 |         audio_writer = StreamingAudioWriter(
137 |             player,
138 |             output_file=None,
139 |             chunk_size=request.chunk_size,
140 |             lookback_frames=request.lookback_frames
141 |         )
142 | 
143 |         # Replace audio_chunks list with our custom one
144 |         audio_writer.audio_chunks = ChunkList()
145 | 
146 |         # Start generation in background thread
147 |         def generate():
148 |             try:
149 |                 audio_writer.start()
150 |                 generator.generate(
151 |                     request.text,
152 |                     audio_writer,
153 |                     max_tokens=request.max_tokens
154 |                 )
155 |                 audio_writer.finalize()
156 |                 chunk_queue.put(("done", None))  # Signal completion
157 |             except Exception as e:
158 |                 print(f"Generation error: {e}")
159 |                 chunk_queue.put(("error", str(e)))
160 | 
161 |         gen_thread = threading.Thread(target=generate)
162 |         gen_thread.start()
163 | 
164 |         # Stream chunks as they arrive
165 |         try:
166 |             while True:
167 |                 msg_type, data = chunk_queue.get(timeout=30)  # 30s timeout
168 | 
169 |                 if msg_type == "chunk":
170 |                     # Convert numpy array to int16 PCM
171 |                     pcm_data = (data * 32767).astype(np.int16)
172 |                     chunk_bytes = pcm_data.tobytes()
173 | 
174 |                     # Send chunk length (4 bytes) + chunk data
175 |                     length_prefix = struct.pack('<I', len(chunk_bytes))

[Extraction gap: the remainder of server.py (presumably yielding the length-prefixed chunks, handling the "done"/"error" messages, returning the StreamingResponse, and the uvicorn entry point) and lines 1-63 of examples/basic/audio/streaming.py (imports, the StreamingAudioWriter class header, and the start of its decoder loop) were swallowed by the extractor, apparently at the '<I' struct literal, and could not be recovered. The listing resumes partway through decoder_worker's end-of-speech handling.]

--------------------------------------------------------------------------------
/examples/basic/audio/streaming.py:
--------------------------------------------------------------------------------
64 |                     if remaining_frames >= 1:
65 |                         # Decode from lookback point to end
66 |                         start_frame = max(0, self.frames_decoded - self.lookback_frames)
67 |                         start_token = start_frame * 4
68 | 
69 |                         tokens_to_decode = self.all_tokens[start_token:]
70 |                         num_frames = len(tokens_to_decode) // 4
71 | 
72 |                         if num_frames > 0:
73 |                             codes = np.array(tokens_to_decode[:num_frames * 4]).reshape(-1, 4)
74 |                             audio_chunk = self.player.decode_audio_chunk(codes)
75 | 
76 |                             if audio_chunk is not None:
77 |                                 samples_per_frame = len(audio_chunk) // num_frames
78 | 
79 |                                 # Skip lookback portion, only save new frames
80 |                                 lookback_skip = min(self.frames_decoded, self.lookback_frames)
81 |                                 skip_samples = lookback_skip * samples_per_frame
82 |                                 new_audio = audio_chunk[skip_samples:]
83 | 
84 |                                 self.audio_chunks.append(new_audio)
85 |                                 print(f"[DECODER] Final chunk: {remaining_frames} frames ({remaining_frames/12.5:.2f}s audio)")
86 | 
87 |                     self.inside_speech = False
88 |                     speech_ended = True
89 |                     self.audio_token_buffer = []
90 |                     continue
91 | 
92 |                 # Accumulate audio tokens (only if speech hasn't ended)
93 |                 if self.inside_speech and not speech_ended:
94 |                     self.audio_token_buffer.append(token_id)
95 |                     self.all_tokens.append(token_id)  # Keep all tokens for sliding window
96 | 
97 |                     # Decode when we have enough NEW frames to process
98 |                     total_frames = len(self.all_tokens) // 4
99 |                     new_frames = total_frames - self.frames_decoded
100 | 
101 |                     if new_frames >= self.chunk_size:
102 |                         # Calculate sliding window: include lookback_frames from previous context
103 |                         start_frame = max(0, self.frames_decoded - self.lookback_frames)
104 |                         start_token = start_frame * 4
105 | 
106 |                         # Decode from start_frame to current end
107 |                         tokens_to_decode = self.all_tokens[start_token:]
108 |                         num_frames = len(tokens_to_decode) // 4
109 | 
110 |                         codes = np.array(tokens_to_decode[:num_frames * 4]).reshape(-1, 4)
111 |                         audio_chunk = self.player.decode_audio_chunk(codes)
112 | 
113 |                         if audio_chunk is not None:
114 |                             samples_per_frame = len(audio_chunk) // num_frames
115 | 
116 |                             # Skip the lookback portion - only save the NEW frames
117 |                             lookback_skip = min(self.frames_decoded, self.lookback_frames)
118 |                             skip_samples = lookback_skip * samples_per_frame
119 | 
120 |                             # Extract only the new chunk_size frames worth of audio
121 |                             new_samples = self.chunk_size * samples_per_frame
122 |                             new_audio = audio_chunk[skip_samples:skip_samples + new_samples]
123 | 
124 |                             self.audio_chunks.append(new_audio)
125 |                             self.frames_decoded += self.chunk_size
126 | 
127 |                             print(f"[DECODER] Decoded {self.chunk_size} frames ({self.chunk_size/12.5:.2f}s audio) with {self.lookback_frames}-frame lookback context")
128 | 
129 |                         # Clear buffer (we've stored everything in all_tokens)
130 |                         self.audio_token_buffer = []
131 | 
132 |             except queue.Empty:
133 |                 continue
134 | 
135 |     def add_token(self, token_id):
136 |         """Add a token to the processing queue"""
137 |         self.token_queue.put(token_id)
138 | 
139 |     def finalize(self):
140 |         """Stop the decoder thread and write final audio file"""
141 |         self.running = False
142 |         self.decoder_thread.join()
143 | 
144 |         if self.audio_chunks:
145 |             # Concatenate all audio chunks
146 |             full_audio = np.concatenate(self.audio_chunks)
147 | 
148 |             # Only write to file if output_file is specified
149 |             if self.output_file:
150 |                 write(self.output_file, self.sample_rate, full_audio)
151 |                 print(f"[WRITER] Wrote {len(full_audio)/self.sample_rate:.2f}s of audio to {self.output_file}")
152 | 
153 |             return full_audio
154 |         return None
155 | 
156 |     def start(self):
157 |         """Start the decoder thread"""
158 |         self.decoder_thread = threading.Thread(target=self.decoder_worker)
159 |         self.decoder_thread.start()
160 | 
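The `/stream-tts` route in server.py frames each streamed message as a 4-byte length prefix followed by int16 PCM at 22,050 Hz, per its comments; the tail of that file was lost in this dump, so treat the exact framing as an assumption. A hedged consumer sketch (uses the `requests` package, which this repo does not itself depend on):

```python
# Sketch: consume /stream-tts assuming 4-byte little-endian length-prefixed
# int16 PCM chunks, as described by the comments in server.py.
import struct
import requests

with requests.post(
    "http://localhost:8000/stream-tts",
    json={"text": "Hello world!"},
    stream=True,
) as resp:
    resp.raise_for_status()
    buf = b""
    for data in resp.iter_content(chunk_size=4096):
        buf += data
        while len(buf) >= 4:
            (length,) = struct.unpack("<I", buf[:4])
            if len(buf) < 4 + length:
                break  # wait for the rest of this chunk
            pcm = buf[4 : 4 + length]  # int16 mono PCM at 22050 Hz
            buf = buf[4 + length :]
            # feed `pcm` to an audio sink here
```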
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | Kani TTS Logo
3 | 
4 | [![](https://dcbadge.limes.pink/api/server/https://discord.gg/NzP3rjB4SB?style=flat)](https://discord.gg/NzP3rjB4SB) [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
5 | 
6 | # Kani TTS
7 | A fast, modular and human-like TTS that generates high-quality speech from text input.
8 | 
9 | 
10 | ## Models
11 | 
12 | - [kani-tts-400m-en](https://huggingface.co/nineninesix/kani-tts-400m-en) - English
13 | 
14 | - [kani-tts-400m-zh](https://huggingface.co/nineninesix/kani-tts-400m-zh) - Chinese
15 | 
16 | - [kani-tts-400m-de](https://huggingface.co/nineninesix/kani-tts-400m-de) - German
17 | 
18 | - [kani-tts-400m-ar](https://huggingface.co/nineninesix/kani-tts-400m-ar) - Arabic
19 | 
20 | - [kani-tts-400m-es](https://huggingface.co/nineninesix/kani-tts-400m-es) - Spanish
21 | 
22 | - [kani-tts-400m-ko](https://huggingface.co/nineninesix/kani-tts-400m-ko) - Korean
23 | 
24 | - [kani-tts-370m-expo2025-osaka-ja](https://huggingface.co/nineninesix/kani-tts-370m-expo2025-osaka-ja) - Japanese
25 | 
26 | - [kani-tts-400m-0.3-pt](https://huggingface.co/nineninesix/kani-tts-400m-0.3-pt) - Pretrained checkpoint v0.3
27 | 
28 | - [kani-tts-370m multilingual](https://huggingface.co/nineninesix/kani-tts-370m) - English, Spanish, Chinese, German, Korean, Arabic
29 | 
30 | - [kani-tts-370m-mlx](https://huggingface.co/nineninesix/kani-tts-370m-MLX) - Multilingual model for Apple Silicon
31 | 
32 | - [kani-tts-450m-0.2-pt](https://huggingface.co/nineninesix/kani-tts-450m-0.2-pt) - Pretrained checkpoint v0.2 for posttraining and fine-tuning on custom datasets.
33 | 
34 | - [nemo-nano-codec-22khz-0.6kbps-12.5fps-MLX](https://huggingface.co/nineninesix/nemo-nano-codec-22khz-0.6kbps-12.5fps-MLX) - MLX implementation of NVIDIA NeMo NanoCodec, a lightweight neural audio codec.
35 | 
36 | 
37 | **Notes:**
38 | - Primarily optimized for English
39 | - Performance degrades with inputs >1000 tokens
40 | - Limited emotional expressivity without fine-tuning
41 | 
42 | 
43 | ## Inference
44 | 
45 | Kani TTS offers multiple inference options optimized for different hardware:
46 | 
47 | ### Basic Example (GPU/CPU)
48 | The basic inference example runs on both GPU and CPU, making it accessible for various hardware setups. Check `examples/basic` in this repository to get started.
49 | You can also use the PyPI package `kani-tts`. [More details...](https://pypi.org/project/kani-tts/)
50 | 
51 | ### vLLM (NVIDIA GPU)
52 | For high-performance inference on NVIDIA GPUs, use [KaniTTS-vLLM](https://github.com/nineninesix-ai/kanitts-vllm). This option is extremely fast and provides an OpenAI-compatible API, making it easy to integrate with existing tools and workflows.
53 | 
54 | ### MLX (Apple Silicon)
55 | For Apple Silicon users, we provide an optimized [KaniTTS-MLX](https://github.com/nineninesix-ai/kani-mlx) that takes full advantage of the unified memory architecture and Neural Engine on M1/M2/M3 chips.
56 | 
57 | 
58 | ### NeMo NanoCodec
59 | - **Base Model:** [NVIDIA NeMo NanoCodec](https://developer.nvidia.com/nemo)
60 | - **License:** [NVIDIA Open Model License](https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf)
61 | - **Sample Rate:** 22.05 kHz
62 | - **Purpose:** Neural audio compression/decompression for TTS pipelines
63 | 
64 | ---
65 | 
66 | #### GPU Benchmark Results
67 | 
68 | | GPU Model | VRAM | Cost ($/hr) | RTF |
69 | |-----------|------|-------------|-----|
70 | | RTX 5090 | 32GB | $0.423 | 0.190 |
71 | | RTX 4080 | 16GB | $0.220 | 0.200 |
72 | | RTX 5060 Ti | 16GB | $0.138 | 0.529 |
73 | | RTX 4060 Ti | 16GB | $0.122 | 0.537 |
74 | | RTX 3060 | 12GB | $0.093 | 0.600 |
75 | 
76 | *Lower RTF is better (< 1.0 means faster than real-time). Benchmarks conducted on [Vast AI](https://vast.ai/).*
77 | 
78 | ---
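RTF (real-time factor) is synthesis time divided by audio duration, so the table converts directly into wall-clock estimates. A quick sanity check using the table's own numbers:

```python
# RTF = synthesis_time / audio_duration; RTF < 1.0 is faster than real time.
audio_seconds = 15.0
for gpu, rtf in [("RTX 5090", 0.190), ("RTX 3060", 0.600)]:
    print(f"{gpu}: {audio_seconds * rtf:.2f}s to synthesize {audio_seconds:.0f}s of audio")
# RTX 5090: 2.85s ...  RTX 3060: 9.00s ...
```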
79 | 
80 | ## Dataset Preparation
81 | 
82 | ### 1. Audio Dataset Collection
83 | 
84 | You can prepare your audio dataset using [Datamio](https://app.datamio.dev/), a tool built by active members of our community. Datamio provides tools to help you collect, organize, and manage high-quality audio datasets for TTS training.
85 | 
86 | ### 2. Audio Processing Pipeline
87 | 
88 | After collecting your raw audio dataset, you need to process it for training. Check out this audio processing pipeline: [nano-codec-dataset-pipeline](https://github.com/nineninesix-ai/nano-codec-dataset-pipeline)
89 | 
90 | This pipeline handles:
91 | - Audio preprocessing and normalization
92 | - Feature extraction
93 | - Dataset formatting for training
94 | - Quality validation
95 | 
96 | ---
97 | 
98 | ## Finetuning
99 | 
100 | For finetuning KaniTTS on your own dataset, check out this comprehensive finetuning pipeline: [KaniTTS-Finetune-pipeline](https://github.com/nineninesix-ai/KaniTTS-Finetune-pipeline)
101 | 
102 | This pipeline provides:
103 | - Step-by-step finetuning guides
104 | - Configuration templates
105 | - Training scripts optimized for different hardware setups
106 | - Evaluation code to assess model performance
107 | - Best practices for achieving high-quality results
108 | 
109 | ---
110 | 
111 | ## App Examples
112 | 
113 | - [ComfyUI node](https://github.com/wildminder/ComfyUI-KaniTTS) by [WildAi](https://github.com/wildminder)
114 | 
115 | - [NextJS basic app](https://github.com/nineninesix-ai/open-audio). It uses the OpenAI npm package to connect to the OpenAI-compatible server API provided by [kanitts-vllm](https://github.com/nineninesix-ai/kanitts-vllm).
116 | 
117 | - [Livekit Agent](https://github.com/nineninesix-ai/livekit-agent) - A real-time voice AI assistant built with the LiveKit Agents framework, featuring speech-to-text, language processing, and text-to-speech capabilities.
118 | 
119 | ---
120 | 
121 | ## Areas of improvement
122 | 
123 | We're continuously working to enhance KaniTTS. Here are key areas where we're focusing our efforts:
124 | 
125 | ### Core Architecture
126 | - **Create a new LLM exclusively for TTS** - Develop a specialized LLM designed specifically for text-to-speech generation, optimized for audio token prediction rather than adapted from general-purpose LLMs
127 | 
128 | ### Model Enhancements
129 | - **Add more languages** - Expand support beyond the current languages to cover more language families and dialects
130 | - **Add more speakers** - Increase speaker diversity with different accents, age groups, and voice characteristics
131 | - **Voice cloning examples** - Provide tutorials and code examples for cloning custom voices from audio samples
132 | 
133 | ### Audio Codec Improvements
134 | - **Fine-tune codec** - Optimize the existing NanoCodec for better audio quality and compression efficiency
135 | - **Create new codec** - Develop a next-generation neural audio codec with improved naturalness and lower latency
136 | 
137 | ### Dataset Development
138 | Build and release high-quality, diverse audio datasets for training and fine-tuning:
139 | - Multi-speaker datasets across different languages
140 | - Domain-specific datasets (conversational, storytelling, professional voice-over)
141 | - Benchmark datasets for evaluation
142 | 
143 | 
144 | If you're interested in contributing to any of these areas, please check our [Contributing](#contributing) section and join our [Discord server](https://discord.gg/NzP3rjB4SB).
145 | 
146 | ---
147 | 
148 | ## License
149 | 
150 | Apache 2.0. See the [LICENSE](LICENSE) file for details.
151 | 
152 | ---
153 | 
154 | ## Contributing
155 | 
156 | We're **open for community contributions**! KaniTTS is built with the community, and we welcome contributions of all kinds:
157 | 
158 | - **Code contributions** - Bug fixes, new features, optimizations, and documentation improvements
159 | - **Model contributions** - Fine-tuned models, voice clones, and language-specific adaptations
160 | - **Dataset contributions** - High-quality audio datasets for training and evaluation
161 | - **Examples and tutorials** - Integration examples, use cases, and guides
162 | - **Bug reports and feature requests** - Help us improve by reporting issues and suggesting enhancements
163 | 
164 | **How to contribute:**
165 | 1. Check our [Areas of improvement](#areas-of-improvement) section for current priorities
166 | 2. Join our [Discord server](https://discord.gg/NzP3rjB4SB) to discuss ideas and get support
167 | 3. Submit issues or pull requests on GitHub
168 | 4. Share your projects and use cases with the community
169 | 
170 | 
171 | 
172 | 
173 | 
174 | 
175 | 
176 | 

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                                  Apache License
2 |                            Version 2.0, January 2004
3 |                         http://www.apache.org/licenses/
4 | 
5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 |    1. Definitions.
8 | 
9 |       "License" shall mean the terms and conditions for use, reproduction,
10 |       and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 |       "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
13 | 
14 |       "Legal Entity" shall mean the union of the acting entity and all
15 |       other entities that control, are controlled by, or are under common
16 |       control with that entity. For the purposes of this definition,
17 |       "control" means (i) the power, direct or indirect, to cause the
18 |       direction or management of such entity, whether by contract or
19 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
20 |       outstanding shares, or (iii) beneficial ownership of such entity.
21 | 
22 |       "You" (or "Your") shall mean an individual or Legal Entity
23 |       exercising permissions granted by this License.
24 | 
25 |       "Source" shall mean the preferred form for making modifications,
26 |       including but not limited to software source code, documentation
27 |       source, and configuration files.
28 | 
29 |       "Object" shall mean any form resulting from mechanical
30 |       transformation or translation of a Source form, including but
31 |       not limited to compiled object code, generated documentation,
32 |       and conversions to other media types.
33 | 
34 |       "Work" shall mean the work of authorship, whether in Source or
35 |       Object form, made available under the License, as indicated by a
36 |       copyright notice that is included in or attached to the work
37 |       (an example is provided in the Appendix below).
38 | 
39 | 
40 | 
41 | 
42 |       "Derivative Works" shall mean any work, whether in Source or Object
43 |       form, that is based upon (or derived from) the Work and for which the
44 |       editorial revisions, annotations, elaborations, or other modifications
45 |       represent, as a whole, an original work of authorship. For the purposes
46 |       of this License, Derivative Works shall not include works that remain
47 |       separable from, or merely link (or bind by name) to the interfaces of,
48 |       the Work and derivative works thereof.
49 | 
50 |       "Contributor" shall mean Licensor and any individual or Legal Entity
51 |       on behalf of whom a Contribution has been received by Licensor and
52 |       subsequently incorporated within the Work.
53 | 
54 |       "Contribution" shall mean any work of authorship, including
55 |       the original version of the Work and any modifications or additions
56 |       to that Work or Derivative Works thereof, that is intentionally
57 |       submitted to Licensor for inclusion in the Work by the copyright
58 |       owner or by an individual or Legal Entity authorized to submit on
59 |       behalf of the copyright owner. For the purposes of this definition,
60 |       "submitted" means any form of electronic, verbal, or written
61 |       communication sent to the Licensor or its representatives, including
62 |       but not limited to communication on electronic mailing lists,
63 |       source code control systems, and issue tracking systems that are
64 |       managed by, or on behalf of, the Licensor for the purpose of
65 |       discussing and improving the Work, but excluding communication that
66 |       is conspicuously marked or otherwise designated in writing by the
67 |       copyright owner as "Not a Contribution."
68 | 
69 |    2. Grant of Copyright License. Subject to the terms and conditions of
70 |       this License, each Contributor hereby grants to You a perpetual,
71 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
72 |       copyright license to reproduce, prepare Derivative Works of,
73 |       publicly display, publicly perform, sublicense, and distribute the
74 |       Work and such Derivative Works in Source or Object form.
75 | 
76 |    3. Grant of Patent License. Subject to the terms and conditions of
77 |       this License, each Contributor hereby grants to You a perpetual,
78 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
79 |       (except as stated in this section) patent license to make, have made,
80 |       use, offer to sell, sell, import, and otherwise transfer the Work,
81 |       where such license applies only to those patent claims licensable
82 |       by such Contributor that are necessarily infringed by their
83 |       Contribution(s) alone or by combination of their Contribution(s)
84 |       with the Work to which such Contribution(s) was submitted. If You
85 |       institute patent litigation against any entity (including a
86 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
87 |       or a Contribution incorporated within the Work constitutes direct
88 |       or contributory patent infringement, then any patent licenses
89 |       granted to You under this License for that Work shall terminate
90 |       as of the date such litigation is filed.
91 | 
92 |    4. Redistribution. You may reproduce and distribute copies of the
93 |       Work or Derivative Works thereof in any medium, with or without
94 |       modifications, and in Source or Object form, provided that You
95 |       meet the following conditions:
96 | 
97 |       (a) You must give any other recipients of the Work or
98 |           Derivative Works a copy of this License; and
99 | 
100 |       (b) You must cause any modified files to carry prominent notices
101 |           stating that You changed the files; and
102 | 
103 |       (c) You must retain, in the Source form of any Derivative Works
104 |           that You distribute, all copyright, patent, trademark, and
105 |           attribution notices from the Source form of the Work,
106 |           excluding those notices that do not pertain to any part of
107 |           the Derivative Works; and
108 | 
109 |       (d) If the Work includes a "NOTICE" text file as part of its
110 |           distribution, then any Derivative Works that You distribute must
111 |           include a readable copy of the attribution notices contained
112 |           within such NOTICE file, excluding those notices that do not
113 |           pertain to any part of the Derivative Works, in at least one
114 |           of the following places: within a NOTICE text file distributed
115 |           as part of the Derivative Works; within the Source form or
116 |           documentation, if provided along with the Derivative Works; or,
117 |           within a display generated by the Derivative Works, if and
118 |           wherever such third-party notices normally appear. The contents
119 |           of the NOTICE file are for informational purposes only and
120 |           do not modify the License. You may add Your own attribution
121 |           notices within Derivative Works that You distribute, alongside
122 |           or as an addendum to the NOTICE text from the Work, provided
123 |           that such additional attribution notices cannot be construed
124 |           as modifying the License.
125 | 
126 |       You may add Your own copyright notice and may provide additional or
127 |       different license terms and conditions for use, reproduction, or
128 |       distribution of Your Derivative Works as a whole, provided Your use,
129 |       reproduction, and distribution of the Work otherwise complies with
130 |       the conditions stated in this License.
131 | 
132 |    5. Submission of Contributions. Unless You explicitly state otherwise,
133 |       any Contribution intentionally submitted for inclusion in the Work
134 |       by You to the Licensor shall be under the terms and conditions of
135 |       this License, without any additional terms or conditions.
136 |       Notwithstanding the above, nothing herein shall supersede or modify
137 |       the terms of any separate license agreement you may have executed
138 |       with Licensor regarding such Contributions.
139 | 
140 |    6. Trademarks. This License does not grant permission to use the trade
141 |       names, trademarks, service marks, or product names of the Licensor,
142 |       except as required for reasonable and customary use in describing the
143 |       origin of the Work and reproducing the content of the NOTICE file.
144 | 
145 |    7. Disclaimer of Warranty. Unless required by applicable law or
146 |       agreed to in writing, Licensor provides the Work (and each
147 |       Contributor provides its Contributions) on an "AS IS" BASIS,
148 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
149 |       implied, including, without limitation, any warranties or conditions
150 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
151 |       PARTICULAR PURPOSE. You are solely responsible for determining the
152 |       appropriateness of using or redistributing the Work and assume any
153 |       risks associated with Your exercise of permissions under this License.
154 | 
155 |    8. Limitation of Liability. In no event and under no legal theory,
156 |       whether in tort (including negligence), contract, or otherwise,
157 |       unless required by applicable law (such as deliberate and grossly
158 |       negligent acts) or agreed to in writing, shall any Contributor be
159 |       liable to You for damages, including any direct, indirect, special,
160 |       incidental, or consequential damages of any character arising as a
161 |       result of this License or out of the use or inability to use the
162 |       Work (including but not limited to damages for loss of goodwill,
163 |       work stoppage, computer failure or malfunction, or any and all
164 |       other commercial damages or losses), even if such Contributor
165 |       has been advised of the possibility of such damages.
166 | 
167 |    9. Accepting Warranty or Support. You may choose to offer, and to
168 |       charge a fee for, warranty, support, indemnity, or other liability
169 |       obligations and/or rights consistent with this License. However, in
170 |       accepting such obligations, You may act only on Your own behalf and on
171 |       Your sole responsibility, not on behalf of any other Contributor, and
172 |       only if You agree to indemnify, defend, and hold each Contributor
173 |       harmless for any liability incurred by, or claims asserted against,
174 |       such Contributor by reason of your accepting any such warranty or support.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!) The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same page as the copyright notice for easier identification within
187 |       third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.

--------------------------------------------------------------------------------
/examples/basic/client.html:
--------------------------------------------------------------------------------
[client.html: a single-page web client of roughly 665 lines; its HTML/CSS/JS markup was stripped during extraction and is not recoverable. The surviving text shows a page titled "Kani TTS Client" with a logo and "Kani TTS" header, a text-input form, a temperature slider (default 0.6), generation controls, a "Quick Examples" row, and an info footer listing the API endpoints: POST /tts (generate complete audio file), POST /stream-tts (stream audio generation), and GET /health (check server status), served from http://localhost:8000.]

--------------------------------------------------------------------------------