├── NeuTTS
│   ├── __init__.py
│   ├── codec.py
│   └── engine.py
├── pyproject.toml
├── docs
│   └── api.md
├── app.py
├── README.md
└── LICENSE

--------------------------------------------------------------------------------
/NeuTTS/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "FastNeuTTS"
version = "0.0.2"
authors = [
  { name="Yatharth Sharma", email="yatharthsharma3501@gmail.com" },
]
description = "Fast batched audio generation with neutts-air"
readme = "README.md"
requires-python = ">=3.8"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
]
dependencies = [
    "lmdeploy",
    "librosa",
    # "fastaudiosr @ git+https://github.com/ysharma3501/FlashSR.git",
    "neucodec",
    "phonemizer",
]

[project.urls]
Homepage = "https://github.com/ysharma3501/FastNeuTTS"
Issues = "https://github.com/ysharma3501/FastNeuTTS/issues"
--------------------------------------------------------------------------------
/docs/api.md:
--------------------------------------------------------------------------------
# FastAPI app usage

This library also provides a rough async FastAPI app. Usage is shown below.

STEP 1: Install the dependencies and run the app:
```
# Install the necessary libraries
pip install fastapi uvicorn

# Run the app, found in app.py
uvicorn app:app --reload
```

The app runs on port 8000 by default.

STEP 2: Register a voice (currently a rough implementation; it will be improved later):
```
# Register the voice file and capture the output (which includes the user_id).
# Replace 'my_reference_audio.wav' with your actual filename.
#
# Manually copy the "user_id" from the output.
# For example, if the output is {"user_id": "929302"}, use that ID below.

curl -X 'GET' \
  'http://127.0.0.1:8000/set_voice/?audio_file=my_reference_audio.wav' \
  -H 'accept: application/json'
```
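If you prefer Python over curl, here is a minimal sketch of the same call using `requests` (an assumption; any HTTP client works, and the filename is illustrative):

```python
import requests

# Register the reference audio and capture the generated user_id.
resp = requests.get(
    "http://127.0.0.1:8000/set_voice/",
    params={"audio_file": "my_reference_audio.wav"},
)
resp.raise_for_status()
user_id = resp.json()["user_id"]
print(user_id)  # e.g. "929302" -- pass this as "voice" in step 3
```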
STEP 3: Run the model:
```
USER_ID="929302" # Use the SAME user_id you got from step 2
TEXT_TO_SAY="Hello, this is my custom cloned voice being streamed in real time."

# POST request to stream the raw PCM audio and save it to a file
curl -X 'POST' \
  'http://127.0.0.1:8000/v1/audio/speech' \
  -H 'Content-Type: application/json' \
  --data-raw '{
    "input": "'"$TEXT_TO_SAY"'",
    "voice": "'"$USER_ID"'",
    "model": "tts-1",
    "response_format": "pcm"
  }' \
  --output "output_audio.raw"

# The API outputs 16-bit, 24 kHz, mono channel audio.
```
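Since the endpoint streams raw PCM, you may want to wrap the saved bytes in a WAV header before opening them in a normal audio player. Here is a minimal sketch using only the standard library (the filenames are illustrative):

```python
import wave

# The API outputs 16-bit, 24 kHz, mono, little-endian PCM.
with open("output_audio.raw", "rb") as f:
    pcm = f.read()

with wave.open("output_audio.wav", "wb") as wav:
    wav.setnchannels(1)      # mono
    wav.setsampwidth(2)      # 16-bit samples
    wav.setframerate(24000)  # 24 kHz
    wav.writeframes(pcm)
```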
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
import numpy as np
from typing import Optional
from NeuTTS.engine import TTSEngine
from fastapi import FastAPI, HTTPException, Body, Query
from fastapi.responses import StreamingResponse


try:
    tts_engine = TTSEngine()
    user_voice_map = {}
    print("✅ TTSEngine loaded successfully.")
except Exception as e:
    print(f"❌ Error loading TTSEngine: {e}")
    tts_engine = None

app = FastAPI(title="Streaming TTS Service", version="1.0")

@app.get("/set_voice/", summary="Register a voice file and get a unique User ID.")
async def set_voice(
    audio_file: str = Query(..., description="The filename of the custom reference audio for the voice."),
    user_id: Optional[str] = Query(None, description="Optional: A preferred unique User ID.")
):
    """
    Registers a new speaker voice using a reference audio file.
    It assigns or uses a unique User ID for this voice profile.

    This is the endpoint that calls `tts_engine.add_speaker(audio_file)`.
    """
    if tts_engine is None:
        raise HTTPException(status_code=503, detail="TTS Engine is not available.")

    try:
        # The engine generates a unique ID if one isn't provided.
        final_user_id = tts_engine.add_speaker(audio_file, speaker_id=user_id)
        user_voice_map[final_user_id] = audio_file  # Store the mapping

        return {
            "message": "Speaker voice registered successfully.",
            "user_id": final_user_id,
            "audio_file": audio_file
        }
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Failed to register speaker: {e}")


async def stream_audio_generator(input_text: str, user_id: str, display_audio: bool):
    """
    An asynchronous generator that yields converted 16-bit PCM audio chunks.
    """
    if tts_engine is None:
        raise RuntimeError("TTS Engine is not initialized.")

    try:
        async for wav_float32 in tts_engine.stream_audio(input_text, user_id, display_audio=display_audio):
            # 1. Convert float32 array (-1.0 to 1.0) to int16 PCM (-32768 to 32767)
            wav_int16 = (wav_float32 * 32767).astype(np.int16)

            # 2. Convert the int16 NumPy array to raw bytes
            yield wav_int16.tobytes()

    except Exception as e:
        print(f"Error during audio generation: {e}")


@app.post("/v1/audio/speech", summary="Stream TTS audio (OpenAI compatible).")
async def tts_stream(
    input: str = Body(..., embed=True, description="The text to generate audio for."),
    voice: str = Body(..., embed=True, description="The 'voice' maps to our custom speaker user_id."),
    model: str = Body("tts-1", embed=True, description="Placeholder model name."),
    response_format: str = Body("pcm", embed=True, description="Desired audio format.")
):
    # Map the OpenAI-style 'voice' field back to our 'user_id'.
    # 'model' and 'response_format' are currently ignored.
    user_id = voice
    try:
        audio_generator = stream_audio_generator(
            input_text=input,
            user_id=user_id,
            display_audio=False
        )

        return StreamingResponse(
            audio_generator,
            media_type="application/octet-stream"
            # The client must know the format (SR=24000, 16-bit, mono, little-endian)
        )
    except RuntimeError as e:
        raise HTTPException(status_code=500, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"TTS generation failed: {e}")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# FastNeuTTS
FastNeuTTS is a highly optimized engine for [NeuTTS-air](https://huggingface.co/neuphonic/neutts-air) that uses [LMdeploy](https://github.com/InternLM/lmdeploy) to generate minutes of audio in just seconds. This repo is similar to the previous [FastMaya](https://github.com/ysharma3501/FastMaya) repo but much faster, and it supports voice cloning as well.
It will soon support multilingual and multi-speaker models too, with streaming and latencies as low as 100ms.

## Key improvements in this repo
* Much faster than the original implementation; can reach over **200x realtime** on consumer GPUs using batching!
* Memory efficient: it runs on **6GB VRAM** GPUs.
* Works with multiple GPUs using tensor parallelism to improve speed further.
* Incredibly low potential latency of just **100ms**.

Speed was tested on a 4070 Ti Super:
- Input text can be found in test.txt
- 2.397 seconds to generate 508 seconds of audio
- Hence **211x realtime**, or an RTF of 0.0047

Simple 2-line installation; requires pip and git, but uv will speed up installation considerably:
```
uv pip install git+https://github.com/ysharma3501/FastNeuTTS.git
sudo apt install espeak-ng -y
```

Usage for a single batch:
```python
from IPython.display import Audio, display
from NeuTTS.engine import TTSEngine

tts_engine = TTSEngine()
text = "Wow. This place looks even better than I imagined. How did they set all this up so perfectly? The lights, the music, everything feels magical. I can't stop smiling right now."

audio_file = "audio_file"  ## custom reference file, should be 3s or more

codes_str, transcript = tts_engine.encode_audio(audio_file)  ## good idea to cache speaker codes and transcripts so there is no need to encode again
audio = tts_engine.batch_generate([text], [codes_str], [transcript])

display(Audio(audio, rate=24000))
```
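Outside a notebook, you can write the returned array to disk instead of displaying it. A minimal sketch, assuming the `soundfile` package is installed:

```python
import soundfile as sf

# `audio` is the float32 numpy array returned by batch_generate, at 24 kHz.
sf.write("output.wav", audio, 24000)
```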
Usage for larger batch sizes:
```python
text = ["Wow. This place looks even better than I imagined. How did they set all this up so perfectly? The lights, the music, everything feels magical. I can't stop smiling right now.", "You dare challenge me, mortal. How amusing. Your kind always thinks they can win!"]
audio_file = "custom_reference_file"  ## should be 3+ seconds

codes_str, transcript = tts_engine.encode_audio(audio_file)  ## good idea to cache speaker codes and transcripts so there is no need to encode again
audio = tts_engine.batch_generate(text, [codes_str], [transcript])

display(Audio(audio, rate=24000))
```

Usage for **auto-splitting text** into sentences and batching (good for paragraphs):
```python
text = """Paris, often affectionately known as the City of Light or La Ville Lumière, is the historic capital of France, globally celebrated as a center of art, fashion, gastronomy, and romance. Situated on the winding Seine River, which divides the city into the Left Bank and Right Bank, Paris offers a captivating blend of magnificent Haussmann architecture, grand boulevards, and charming, intimate neighborhoods. It is home to world-renowned landmarks like the iconic Eiffel Tower, the colossal Louvre Museum housing the Mona Lisa, and the historic Notre-Dame Cathedral. Millions of visitors flock here annually to soak in the cultural richness, from the bohemian streets of Montmartre to the high-fashion boutiques along the Champs-Élysées, making it a perennial top destination for travelers worldwide."""
text = tts_engine.split_sentences(text)

audio_file = "custom_reference_file"  ## should be 3+ seconds

codes_str, transcript = tts_engine.encode_audio(audio_file)  ## good idea to cache speaker codes and transcripts so there is no need to encode again
audio = tts_engine.batch_generate(text, [codes_str], [transcript])

display(Audio(audio, rate=24000))
```

Newly added: async streaming inference that supports multiple users!
```python
input_text = "Wow. This place looks even better than I imagined. How did they set all this up so perfectly? The lights, the music, everything feels magical. I can't stop smiling right now."
display_audio = True

audio_file = "custom_reference_file"
user_id = tts_engine.add_speaker(audio_file)  ## this creates a unique user for this reference file

async for wav in tts_engine.stream_audio(input_text, user_id, display_audio=display_audio):
    ## you can manipulate wav here or just let it be displayed
    pass
```
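The snippet above assumes an environment with top-level `await` (e.g. a notebook). In a plain script, here is a minimal sketch that drives the stream with `asyncio` and saves the result (the filenames are illustrative, and `soundfile` is assumed to be installed):

```python
import asyncio
import numpy as np
import soundfile as sf

from NeuTTS.engine import TTSEngine

async def main():
    tts_engine = TTSEngine()
    user_id = tts_engine.add_speaker("custom_reference_file")

    input_text = "Wow. This place looks even better than I imagined."
    chunks = []
    async for wav in tts_engine.stream_audio(input_text, user_id, display_audio=False):
        chunks.append(wav)  # each chunk is a float32 numpy array at 24 kHz

    sf.write("streamed.wav", np.concatenate(chunks), 24000)

asyncio.run(main())
```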
It is important to note that larger batch sizes lead to **larger speedups**. At batch size 1 generation is roughly 6x realtime, which is still considerably faster than FastMaya, while larger batch sizes reach 200x realtime or more.

Stars would be greatly appreciated, and I would be happy to implement other features as well.

## Next priorities
- [x] Fast streaming generation; current testing shows latencies as low as 200ms
- [ ] Multilingual models (hi, fr, de, etc.)
- [ ] Efficient multi-speaker generation
- [x] Online inference using async LMdeploy. Rough implementation done; will improve later on.
--------------------------------------------------------------------------------
/NeuTTS/codec.py:
--------------------------------------------------------------------------------
# from FastAudioSR import FASR
# from huggingface_hub import snapshot_download
from transformers import pipeline as transformers_pipeline
from phonemizer.backend.espeak.wrapper import EspeakWrapper
from phonemizer.backend import EspeakBackend
from typing import List
import numpy as np
import torch
from torchaudio import transforms as T
from neucodec import DistillNeuCodec
import librosa
import gc
import re

class TTSCodec:
    def __init__(self, espeak_lib=None):

        # The FlashSR upsampler is optional; uncomment these lines (and the imports above) to enable it.
        # decoder_paths = snapshot_download("YatharthS/FlashSR")
        # self.upsampler = FASR(f'{decoder_paths}/upsampler.pth')
        # self.upsampler.model.half().eval()
        self.transcriber = transformers_pipeline("automatic-speech-recognition", model="openai/whisper-small", device='cuda:0', torch_dtype=torch.bfloat16)
        if espeak_lib:
            EspeakWrapper.set_library(espeak_lib)
        self.phonemizer = EspeakBackend(
            language="en-us", preserve_punctuation=True, with_stress=True
        )
        self.codec_encoder = DistillNeuCodec.from_pretrained("neuphonic/distill-neucodec").to("cuda:0").eval()

    @torch.inference_mode()
    def encode_audio(self, audio, duration=8, add_silence=8000):
        """Encodes an audio file into speech tokens and context tokens."""
        audio, sr = librosa.load(audio, duration=duration, sr=16000)
        if add_silence:
            audio = np.concatenate((audio, np.zeros(add_silence)))
        # Transcribe the audio without the appended silence.
        transcript = self.transcriber(audio[:-add_silence] if add_silence else audio)['text'].lstrip()
        audio = torch.from_numpy(audio)[None, None, ...].float()

        context_codes = self.codec_encoder.encode_code(audio).cpu().numpy()
        codes_str = "".join([f"<|speech_{i}|>" for i in context_codes[0][0]])

        return codes_str, transcript

    def format_prompt(self, text, transcript, codes_str):
        """Builds the LM prompt: reference transcript + target text as phonemes, followed by the reference speech codes."""
        transcript_phones = " ".join(self.phonemizer.phonemize([transcript])[0].split())
        text_phones = " ".join(self.phonemizer.phonemize([text])[0].split())

        prompt = f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{transcript_phones} {text_phones}<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
        return prompt

    @torch.inference_mode()
    def decode_tokens_batched(self, tokens, batch=False, upsample=False):
        """Decodes a list of speech-token strings into audio; optionally upsamples to 48kHz for higher quality output (requires the optional FlashSR upsampler)."""
        my_data = [int(num) for token in tokens for num in re.findall(r"<\|speech_(\d+)\|>", token)]

        # Pad to a multiple of 50 tokens so the codes can be reshaped into fixed-size chunks.
        chunk_size = 50
        total_len = len(my_data)
        pad_len = (chunk_size - (total_len % chunk_size)) % chunk_size
        padded_data = my_data + [0] * pad_len
        codes_1d = np.array(padded_data, dtype=np.int32)

        codes_reshaped = codes_1d.reshape(-1, 1, chunk_size)
        codes = torch.from_numpy(codes_reshaped).to('cuda:0')
        recon = self.codec_encoder.decode_code(codes).cpu()
        if upsample:
            recon = T.Resample(24_000, 16_000)(recon.squeeze(1)).half()
            chunks = recon.split(64)

            processed_chunks = [self.upsampler.run(chunk) for chunk in chunks]
            wav = torch.cat(processed_chunks, dim=0)

            return wav, pad_len
        else:
            return recon, pad_len

    @torch.inference_mode()
    def decode_tokens(self, tokens, batch=False, upsample=False):
        """Decodes a single speech-token string into audio; optionally upsamples to 48kHz for higher quality output (requires the optional FlashSR upsampler)."""
        speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", tokens)]
        codes = np.array(speech_ids, dtype=np.int32)[np.newaxis, np.newaxis, :]
        codes = torch.from_numpy(codes).to('cuda:0')

        recon = self.codec_encoder.decode_code(codes).cpu()
        if upsample:
            # recon is already a tensor, so it can be resampled directly.
            recon = T.Resample(24_000, 16_000)(recon.squeeze(1))
            wav = self.upsampler.run(recon.half())
            return wav, 48000
        else:
            return recon, 24000

    def c_cache(self):
        """Clears the CUDA cache and runs garbage collection; very useful between large runs."""
        gc.collect()
        torch.cuda.empty_cache()

def overlap(frames: List[np.ndarray], overlap: int) -> np.ndarray:
    """Crossfades the start of the newest frame with the tail of the previous one to smooth chunk boundaries."""
    if len(frames) <= 1:
        return frames[0] if frames else np.array([])

    last = frames[-1].squeeze()
    prev = frames[-2].squeeze()

    # Calculate stride (hop size) of the previous frame
    stride = prev.shape[-1] - overlap

    # Generate linear fade windows: fade-out for previous, fade-in for last
    t = np.linspace(0.0, 1.0, overlap, dtype=last.dtype)

    # Weighted sum (overlap-add)
    weighted_sum = (prev[stride:] * (1.0 - t)) + (last[:overlap] * t)

    # Replace the overlapped start of the last frame
    result = last.copy()
    result[:overlap] = weighted_sum
    return result
--------------------------------------------------------------------------------
/NeuTTS/engine.py:
--------------------------------------------------------------------------------
import re
import time
import torch
import random
import numpy as np
from itertools import cycle
from IPython.display import Audio, display
from collections import defaultdict
from NeuTTS.codec import TTSCodec, overlap
from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig

def compile_upsampler_with_triton_check(upsampler):
    """
    Checks for Triton and compiles the upsampler's forward pass if found.

    Args:
        upsampler: The model object containing the upsampler structure.
    """
    try:
        # Importing triton is the standard way to check that it is available.
        import triton

        # If the import succeeds, proceed with compilation
        upsampler.model.dec.resblocks[2].forward = torch.compile(
            upsampler.model.dec.resblocks[2].forward,
            mode="reduce-overhead",
            dynamic=True
        )
    except ImportError:
        # Triton is optional; compilation is simply skipped without it.
        print("Triton not found; install triton/triton_windows for faster speed (optional).")

class TTSEngine:
    """
    Uses LMdeploy to run neutts-air with great speed
    """

    def __init__(self, memory_util = 0.1, tp = 1, enable_prefix_caching = True, quant_policy = 0, model="neuphonic/neutts-air"):
        """
        Initializes the model configuration.

        Args:
            memory_util (float): Target fraction of GPU memory usage (0.0 to 1.0). Default: 0.1
            tp (int): Number of Tensor Parallel (TP) ranks. Use for multiple GPUs. Default: 1
            enable_prefix_caching (bool): If True, cache input prefixes. Use for batching. Default: True
            quant_policy (int): KV cache quant bit-width (0 disables, 4 or 8 quantize). Saves VRAM at a slight quality cost. Default: 0
        """
        self.tts_codec = TTSCodec()
        backend_config = TurbomindEngineConfig(cache_max_entry_count=memory_util, tp=tp, enable_prefix_caching=enable_prefix_caching, dtype='bfloat16', quant_policy=quant_policy)
        self.pipe = pipeline(model, backend_config=backend_config)
        self.gen_config = GenerationConfig(top_p=0.95,
                        top_k=50,
                        temperature=1.0,
                        max_new_tokens=1024,
                        repetition_penalty=1.1,
                        do_sample=True,
                        min_p=0.1,
                        min_new_tokens=40
                        )
        self.stored_dict = defaultdict(dict)
        # compile_upsampler_with_triton_check(self.tts_codec.upsampler)  ## optionally compiles the upsampler with triton for a considerable speed boost
    def encode_audio(self, voice):
        """
        Encodes the voice file. This takes time, so it is a good idea to cache the result for later use.

        Args:
            voice (str): audio file path
        """
        codes_str, transcript = self.tts_codec.encode_audio(voice, add_silence=32000)
        return codes_str, transcript

    def split_sentences(self, text):
        """
        Splits paragraphs into a list of sentences.

        Args:
            text (str): input paragraphs
        """
        sentences = re.split(r'(?<=[.!?])\s+', text)
        return sentences

    def decode_audio(self, tokens, batched=False):
        """
        Decodes audio from neucodec tokens.

        Args:
            tokens (list/str): list or str of tokens to decode
            batched (bool): whether to decode tokens as a list or a single string
        """
        if batched:
            recon, pad_len = self.tts_codec.decode_tokens_batched(tokens)
            audio = recon.squeeze(1).flatten().numpy()
            if pad_len:  # guard against pad_len == 0, where audio[:-0] would drop everything
                audio = audio[:-pad_len * 480]
        else:
            audio = self.tts_codec.decode_tokens(tokens)[0][0][0].numpy()

        return audio

    def generate(self, prompt, codes_str, transcript):
        """
        Generates speech from text, for a single batch.

        Args:
            prompt (str): input text for the tts model
            codes_str (str): reference speech codes from encode_audio
            transcript (str): reference transcript from encode_audio
        """
        formatted_prompt = self.tts_codec.format_prompt(prompt, transcript, codes_str)
        responses = self.pipe([formatted_prompt], gen_config=self.gen_config, do_preprocess=False)
        generated_tokens = responses[0].text
        audio = self.decode_audio(generated_tokens)
        return audio

    def batch_generate(self, prompts, codes_strs, transcripts):
        """
        Generates speech from text, for larger batch sizes.

        Args:
            prompts (list): input texts for the tts model
            codes_strs (list): reference speech codes, cycled to match prompts
            transcripts (list): reference transcripts, cycled to match prompts
        """
        formatted_prompts = []
        for prompt, code_str, transcript in zip(prompts, cycle(codes_strs), cycle(transcripts)):
            formatted_prompt = self.tts_codec.format_prompt(prompt, transcript, code_str)
            formatted_prompts.append(formatted_prompt)

        responses = self.pipe(formatted_prompts, gen_config=self.gen_config, do_preprocess=False)
        generated_tokens = [response.text for response in responses]
        audios = self.decode_audio(generated_tokens, batched=True)
        return audios

    async def stream_audio(self, text, user_id, display_audio=True):
        """
        Fast async generator for streaming audio: as low as 100ms latency (depends on the text length and the reference file).

        Args:
            text (str): input text for the tts model, a single prompt
            user_id (int): unique user id for each separate user stored in stored_dict
            display_audio (bool): whether to display the audio at the end
        """
        all_audios = []
        all_tokens = ""
        num_tokens = 0
        first_audio = True
        fade_samples = 100

        transcript = self.stored_dict[f"{user_id}"]['transcript']
        codes_str = self.stored_dict[f"{user_id}"]['codes_str']

        prompt = self.tts_codec.format_prompt(text, transcript, codes_str)

        t0 = time.time()
        async for response in self.pipe.generate(messages=prompt, gen_config=self.gen_config, session_id=user_id, sequence_start=True, sequence_end=True, do_preprocess=False):
            all_tokens += response.response
            num_tokens += 1
            if num_tokens == 50:

                if first_audio:
                    print(f"Latency is {time.time() - t0} seconds.")
                    first_audio = False

                # Decode the 50-token chunk and crossfade it with the previous chunk.
                wav = self.decode_audio(all_tokens, False).astype(np.float32)
                all_audios.append(wav)
                wav = overlap(all_audios, fade_samples)
                all_audios[-1] = wav

                yield wav

                num_tokens = 0
                all_tokens = ""

        # Flush any remaining tokens as a final chunk.
        if num_tokens > 20:
            wav = self.decode_audio(all_tokens, False).astype(np.float32)

            all_audios.append(wav)
            wav = overlap(all_audios, fade_samples)
            all_audios[-1] = wav

            yield wav

        if display_audio:
            display(Audio(np.concatenate(all_audios), rate=24000))
    def add_speaker(self, audio_file, speaker_id=None):
        """
        Adds a new user with a unique speaker transcript and codes; returns the user id to use with the stream_audio function.

        Args:
            audio_file (str): new audio file to encode and create a unique user id for
            speaker_id: optional preferred user id; a random one is generated if not provided
        """
        codes_str, transcript = self.encode_audio(audio_file)

        user_id = speaker_id or random.randint(100000, 999999)
        self.stored_dict[f"{user_id}"]['transcript'] = transcript
        self.stored_dict[f"{user_id}"]['codes_str'] = codes_str
        return user_id
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------