├── NeuTTS
│   ├── __init__.py
│   ├── codec.py
│   └── engine.py
├── pyproject.toml
├── docs
│   └── api.md
├── app.py
├── README.md
└── LICENSE

--------------------------------------------------------------------------------
/NeuTTS/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "FastNeuTTS"
version = "0.0.2"
authors = [
  { name="Yatharth Sharma", email="yatharthsharma3501@gmail.com" },
]
description = "Fast batched audio generation with neutts-air"
readme = "README.md"
requires-python = ">=3.8"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
]
dependencies = [
    "lmdeploy",
    "librosa",
    # "fastaudiosr @ git+https://github.com/ysharma3501/FlashSR.git",
    "neucodec",
    "phonemizer",
]

[project.urls]
Homepage = "https://github.com/ysharma3501/FastNeuTTS"
Issues = "https://github.com/ysharma3501/FastNeuTTS/issues"
--------------------------------------------------------------------------------
/docs/api.md:
--------------------------------------------------------------------------------
# FastAPI app usage

This library also provides a rough async FastAPI app. Usage is shown below.

STEP 1: Install the dependencies and run the app:
```
# Install the necessary libraries
pip install fastapi uvicorn

# Run the app, found in app.py
uvicorn app:app --reload
```

The app runs on port 8000 by default.

STEP 2: Register a voice (currently a rough implementation; it will be improved later):
```
# Register the voice file and capture the output (which includes the user_id).
# Replace 'my_reference_audio.wav' with your actual filename.
#
# Manually copy the "user_id" from the output.
# For example, if the output is {"user_id": "929302"}, use that ID below.

curl -X 'GET' \
  'http://127.0.0.1:8000/set_voice/?audio_file=my_reference_audio.wav' \
  -H 'accept: application/json'
```
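If you prefer Python over curl, here is a minimal sketch of the same call using `requests` (an assumption; any HTTP client works, and the filename is illustrative):

```python
import requests

# Register the reference audio and capture the generated user_id.
resp = requests.get(
    "http://127.0.0.1:8000/set_voice/",
    params={"audio_file": "my_reference_audio.wav"},
)
resp.raise_for_status()
user_id = resp.json()["user_id"]
print(user_id)  # e.g. "929302" -- pass this as "voice" in step 3
```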
STEP 3: Run the model:
```
USER_ID="929302" # Use the SAME user_id you got from step 2
TEXT_TO_SAY="Hello, this is my custom cloned voice being streamed in real time."

# POST request to stream the raw PCM audio and save it to a file
curl -X 'POST' \
  'http://127.0.0.1:8000/v1/audio/speech' \
  -H 'Content-Type: application/json' \
  --data-raw '{
    "input": "'"$TEXT_TO_SAY"'",
    "voice": "'"$USER_ID"'",
    "model": "tts-1",
    "response_format": "pcm"
  }' \
  --output "output_audio.raw"

# The API outputs 16-bit, 24 kHz, mono channel audio.
```
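Since the endpoint streams raw PCM, you may want to wrap the saved bytes in a WAV header before opening them in a normal audio player. Here is a minimal sketch using only the standard library (the filenames are illustrative):

```python
import wave

# The API outputs 16-bit, 24 kHz, mono, little-endian PCM.
with open("output_audio.raw", "rb") as f:
    pcm = f.read()

with wave.open("output_audio.wav", "wb") as wav:
    wav.setnchannels(1)      # mono
    wav.setsampwidth(2)      # 16-bit samples
    wav.setframerate(24000)  # 24 kHz
    wav.writeframes(pcm)
```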
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
import numpy as np
from typing import Optional
from NeuTTS.engine import TTSEngine
from fastapi import FastAPI, HTTPException, Body, Query
from fastapi.responses import StreamingResponse


try:
    tts_engine = TTSEngine()
    user_voice_map = {}
    print("✅ TTSEngine loaded successfully.")
except Exception as e:
    print(f"❌ Error loading TTSEngine: {e}")
    tts_engine = None

app = FastAPI(title="Streaming TTS Service", version="1.0")

@app.get("/set_voice/", summary="Register a voice file and get a unique User ID.")
async def set_voice(
    audio_file: str = Query(..., description="The filename of the custom reference audio for the voice."),
    user_id: Optional[str] = Query(None, description="Optional: A preferred unique User ID.")
):
    """
    Registers a new speaker voice using a reference audio file.
    It assigns or uses a unique User ID for this voice profile.

    This is the endpoint that calls `tts_engine.add_speaker(audio_file)`.
    """
    if tts_engine is None:
        raise HTTPException(status_code=503, detail="TTS Engine is not available.")

    try:
        # The engine generates a unique ID if one isn't provided.
        final_user_id = tts_engine.add_speaker(audio_file, speaker_id=user_id)
        user_voice_map[final_user_id] = audio_file  # Store the mapping

        return {
            "message": "Speaker voice registered successfully.",
            "user_id": final_user_id,
            "audio_file": audio_file
        }
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Failed to register speaker: {e}")


async def stream_audio_generator(input_text: str, user_id: str, display_audio: bool):
    """
    An asynchronous generator that yields converted 16-bit PCM audio chunks.
    """
    if tts_engine is None:
        raise RuntimeError("TTS Engine is not initialized.")

    try:
        async for wav_float32 in tts_engine.stream_audio(input_text, user_id, display_audio=display_audio):
            # 1. Convert float32 array (-1.0 to 1.0) to int16 PCM (-32768 to 32767)
            wav_int16 = (wav_float32 * 32767).astype(np.int16)

            # 2. Convert the int16 NumPy array to raw bytes
            yield wav_int16.tobytes()

    except Exception as e:
        print(f"Error during audio generation: {e}")


@app.post("/v1/audio/speech", summary="Stream TTS audio (OpenAI compatible).")
async def tts_stream(
    input: str = Body(..., embed=True, description="The text to generate audio for."),
    voice: str = Body(..., embed=True, description="The 'voice' maps to our custom speaker user_id."),
    model: str = Body("tts-1", embed=True, description="Placeholder model name."),
    response_format: str = Body("pcm", embed=True, description="Desired audio format.")
):
    # Map the OpenAI-style 'voice' field back to our 'user_id'.
    # 'model' and 'response_format' are currently ignored.
    user_id = voice
    try:
        audio_generator = stream_audio_generator(
            input_text=input,
            user_id=user_id,
            display_audio=False
        )

        return StreamingResponse(
            audio_generator,
            media_type="application/octet-stream"
            # The client must know the format (SR=24000, 16-bit, mono, little-endian)
        )
    except RuntimeError as e:
        raise HTTPException(status_code=500, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"TTS generation failed: {e}")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# FastNeuTTS
FastNeuTTS is a highly optimized engine for [NeuTTS-air](https://huggingface.co/neuphonic/neutts-air) that uses [LMdeploy](https://github.com/InternLM/lmdeploy) to generate minutes of audio in just seconds. This repo is similar to the previous [FastMaya](https://github.com/ysharma3501/FastMaya) repo but much faster, and it supports voice cloning as well.
It will soon support multilingual and multi-speaker models too, with streaming and latencies as low as 100ms.

## Key improvements in this repo
* Much faster than the original implementation; can reach over **200x realtime** on consumer GPUs using batching!
* Memory efficient: it runs on **6GB VRAM** GPUs.
* Works with multiple GPUs using tensor parallelism to improve speed further.
* Incredibly low potential latency of just **100ms**.

Speed was tested on a 4070 Ti Super:
- Input text can be found in test.txt
- 2.397 seconds to generate 508 seconds of audio
- Hence **211x realtime**, or an RTF of 0.0047

Simple 2-line installation; requires pip and git, but uv will speed up installation considerably:
```
uv pip install git+https://github.com/ysharma3501/FastNeuTTS.git
sudo apt install espeak-ng -y
```

Usage for a single batch:
```python
from IPython.display import Audio, display
from NeuTTS.engine import TTSEngine

tts_engine = TTSEngine()
text = "Wow. This place looks even better than I imagined. How did they set all this up so perfectly? The lights, the music, everything feels magical. I can't stop smiling right now."

audio_file = "audio_file"  ## custom reference file, should be 3s or more

codes_str, transcript = tts_engine.encode_audio(audio_file)  ## good idea to cache speaker codes and transcripts so there is no need to encode again
audio = tts_engine.batch_generate([text], [codes_str], [transcript])

display(Audio(audio, rate=24000))
```
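Outside a notebook, you can write the returned array to disk instead of displaying it. A minimal sketch, assuming the `soundfile` package is installed:

```python
import soundfile as sf

# `audio` is the float32 numpy array returned by batch_generate, at 24 kHz.
sf.write("output.wav", audio, 24000)
```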
Usage for larger batch sizes:
```python
text = ["Wow. This place looks even better than I imagined. How did they set all this up so perfectly? The lights, the music, everything feels magical. I can't stop smiling right now.", "You dare challenge me, mortal. How amusing. Your kind always thinks they can win!"]
audio_file = "custom_reference_file"  ## should be 3+ seconds

codes_str, transcript = tts_engine.encode_audio(audio_file)  ## good idea to cache speaker codes and transcripts so there is no need to encode again
audio = tts_engine.batch_generate(text, [codes_str], [transcript])

display(Audio(audio, rate=24000))
```

Usage for **auto-splitting text** into sentences and batching (good for paragraphs):
```python
text = """Paris, often affectionately known as the City of Light or La Ville Lumière, is the historic capital of France, globally celebrated as a center of art, fashion, gastronomy, and romance. Situated on the winding Seine River, which divides the city into the Left Bank and Right Bank, Paris offers a captivating blend of magnificent Haussmann architecture, grand boulevards, and charming, intimate neighborhoods. It is home to world-renowned landmarks like the iconic Eiffel Tower, the colossal Louvre Museum housing the Mona Lisa, and the historic Notre-Dame Cathedral. Millions of visitors flock here annually to soak in the cultural richness, from the bohemian streets of Montmartre to the high-fashion boutiques along the Champs-Élysées, making it a perennial top destination for travelers worldwide."""
text = tts_engine.split_sentences(text)

audio_file = "custom_reference_file"  ## should be 3+ seconds

codes_str, transcript = tts_engine.encode_audio(audio_file)  ## good idea to cache speaker codes and transcripts so there is no need to encode again
audio = tts_engine.batch_generate(text, [codes_str], [transcript])

display(Audio(audio, rate=24000))
```

Newly added: async streaming inference that supports multiple users!
```python
input_text = "Wow. This place looks even better than I imagined. How did they set all this up so perfectly? The lights, the music, everything feels magical. I can't stop smiling right now."
display_audio = True

audio_file = "custom_reference_file"
user_id = tts_engine.add_speaker(audio_file)  ## this creates a unique user for this reference file

async for wav in tts_engine.stream_audio(input_text, user_id, display_audio=display_audio):
    ## you can manipulate wav here or just let it be displayed
    pass
```
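The snippet above assumes an environment with top-level `await` (e.g. a notebook). In a plain script, here is a minimal sketch that drives the stream with `asyncio` and saves the result (the filenames are illustrative, and `soundfile` is assumed to be installed):

```python
import asyncio
import numpy as np
import soundfile as sf

from NeuTTS.engine import TTSEngine

async def main():
    tts_engine = TTSEngine()
    user_id = tts_engine.add_speaker("custom_reference_file")

    input_text = "Wow. This place looks even better than I imagined."
    chunks = []
    async for wav in tts_engine.stream_audio(input_text, user_id, display_audio=False):
        chunks.append(wav)  # each chunk is a float32 numpy array at 24 kHz

    sf.write("streamed.wav", np.concatenate(chunks), 24000)

asyncio.run(main())
```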
It is important to note that larger batch sizes lead to **larger speedups**. At batch size 1 generation is roughly 6x realtime, which is still considerably faster than FastMaya, while larger batch sizes reach 200x realtime or more.

Stars would be greatly appreciated, and I would be happy to implement other features as well.

## Next priorities
- [x] Fast streaming generation; current testing shows latencies as low as 200ms
- [ ] Multilingual models (hi, fr, de, etc.)
- [ ] Efficient multi-speaker generation
- [x] Online inference using async LMdeploy. Rough implementation done; will improve later on.
--------------------------------------------------------------------------------
/NeuTTS/codec.py:
--------------------------------------------------------------------------------
# from FastAudioSR import FASR
# from huggingface_hub import snapshot_download
from transformers import pipeline as transformers_pipeline
from phonemizer.backend.espeak.wrapper import EspeakWrapper
from phonemizer.backend import EspeakBackend
from typing import List
import numpy as np
import torch
from torchaudio import transforms as T
from neucodec import DistillNeuCodec
import librosa
import gc
import re

class TTSCodec:
    def __init__(self, espeak_lib=None):

        # The FlashSR upsampler is optional; uncomment these lines (and the imports above) to enable it.
        # decoder_paths = snapshot_download("YatharthS/FlashSR")
        # self.upsampler = FASR(f'{decoder_paths}/upsampler.pth')
        # self.upsampler.model.half().eval()
        self.transcriber = transformers_pipeline("automatic-speech-recognition", model="openai/whisper-small", device='cuda:0', torch_dtype=torch.bfloat16)
        if espeak_lib:
            EspeakWrapper.set_library(espeak_lib)
        self.phonemizer = EspeakBackend(
            language="en-us", preserve_punctuation=True, with_stress=True
        )
        self.codec_encoder = DistillNeuCodec.from_pretrained("neuphonic/distill-neucodec").to("cuda:0").eval()

    @torch.inference_mode()
    def encode_audio(self, audio, duration=8, add_silence=8000):
        """Encodes an audio file into speech tokens and context tokens."""
        audio, sr = librosa.load(audio, duration=duration, sr=16000)
        if add_silence:
            audio = np.concatenate((audio, np.zeros(add_silence)))
        # Transcribe the audio without the appended silence.
        transcript = self.transcriber(audio[:-add_silence] if add_silence else audio)['text'].lstrip()
        audio = torch.from_numpy(audio)[None, None, ...].float()

        context_codes = self.codec_encoder.encode_code(audio).cpu().numpy()
        codes_str = "".join([f"<|speech_{i}|>" for i in context_codes[0][0]])

        return codes_str, transcript

    def format_prompt(self, text, transcript, codes_str):
        """Builds the LM prompt: reference transcript + target text as phonemes, followed by the reference speech codes."""
        transcript_phones = " ".join(self.phonemizer.phonemize([transcript])[0].split())
        text_phones = " ".join(self.phonemizer.phonemize([text])[0].split())

        prompt = f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{transcript_phones} {text_phones}<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
        return prompt

    @torch.inference_mode()
    def decode_tokens_batched(self, tokens, batch=False, upsample=False):
        """Decodes a list of speech-token strings into audio; optionally upsamples to 48kHz for higher quality output (requires the optional FlashSR upsampler)."""
        my_data = [int(num) for token in tokens for num in re.findall(r"<\|speech_(\d+)\|>", token)]

        # Pad to a multiple of 50 tokens so the codes can be reshaped into fixed-size chunks.
        chunk_size = 50
        total_len = len(my_data)
        pad_len = (chunk_size - (total_len % chunk_size)) % chunk_size
        padded_data = my_data + [0] * pad_len
        codes_1d = np.array(padded_data, dtype=np.int32)

        codes_reshaped = codes_1d.reshape(-1, 1, chunk_size)
        codes = torch.from_numpy(codes_reshaped).to('cuda:0')
        recon = self.codec_encoder.decode_code(codes).cpu()
        if upsample:
            recon = T.Resample(24_000, 16_000)(recon.squeeze(1)).half()
            chunks = recon.split(64)

            processed_chunks = [self.upsampler.run(chunk) for chunk in chunks]
            wav = torch.cat(processed_chunks, dim=0)

            return wav, pad_len
        else:
            return recon, pad_len

    @torch.inference_mode()
    def decode_tokens(self, tokens, batch=False, upsample=False):
        """Decodes a single speech-token string into audio; optionally upsamples to 48kHz for higher quality output (requires the optional FlashSR upsampler)."""
        speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", tokens)]
        codes = np.array(speech_ids, dtype=np.int32)[np.newaxis, np.newaxis, :]
        codes = torch.from_numpy(codes).to('cuda:0')

        recon = self.codec_encoder.decode_code(codes).cpu()
        if upsample:
            # recon is already a tensor, so it can be resampled directly.
            recon = T.Resample(24_000, 16_000)(recon.squeeze(1))
            wav = self.upsampler.run(recon.half())
            return wav, 48000
        else:
            return recon, 24000

    def c_cache(self):
        """Clears the CUDA cache and runs garbage collection; very useful between large runs."""
        gc.collect()
        torch.cuda.empty_cache()

def overlap(frames: List[np.ndarray], overlap: int) -> np.ndarray:
    """Crossfades the start of the newest frame with the tail of the previous one to smooth chunk boundaries."""
    if len(frames) <= 1:
        return frames[0] if frames else np.array([])

    last = frames[-1].squeeze()
    prev = frames[-2].squeeze()

    # Calculate stride (hop size) of the previous frame
    stride = prev.shape[-1] - overlap

    # Generate linear fade windows: fade-out for previous, fade-in for last
    t = np.linspace(0.0, 1.0, overlap, dtype=last.dtype)

    # Weighted sum (overlap-add)
    weighted_sum = (prev[stride:] * (1.0 - t)) + (last[:overlap] * t)

    # Replace the overlapped start of the last frame
    result = last.copy()
    result[:overlap] = weighted_sum
    return result
--------------------------------------------------------------------------------
/NeuTTS/engine.py:
--------------------------------------------------------------------------------
import re
import time
import torch
import random
import numpy as np
from itertools import cycle
from IPython.display import Audio, display
from collections import defaultdict
from NeuTTS.codec import TTSCodec, overlap
from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig

def compile_upsampler_with_triton_check(upsampler):
    """
    Checks for Triton and compiles the upsampler's forward pass if found.

    Args:
        upsampler: The model object containing the upsampler structure.
    """
    try:
        # Importing triton is the standard way to check that it is available.
        import triton

        # If the import succeeds, proceed with compilation
        upsampler.model.dec.resblocks[2].forward = torch.compile(
            upsampler.model.dec.resblocks[2].forward,
            mode="reduce-overhead",
            dynamic=True
        )
    except ImportError:
        # Triton is optional; compilation is simply skipped without it.
        print("Triton not found; install triton/triton_windows for faster speed (optional).")

class TTSEngine:
    """
    Uses LMdeploy to run neutts-air with great speed
    """

    def __init__(self, memory_util = 0.1, tp = 1, enable_prefix_caching = True, quant_policy = 0, model="neuphonic/neutts-air"):
        """
        Initializes the model configuration.

        Args:
            memory_util (float): Target fraction of GPU memory usage (0.0 to 1.0). Default: 0.1
            tp (int): Number of Tensor Parallel (TP) ranks. Use for multiple GPUs. Default: 1
            enable_prefix_caching (bool): If True, cache input prefixes. Use for batching. Default: True
            quant_policy (int): KV cache quant bit-width (0 disables, 4 or 8 quantize). Saves VRAM at a slight quality cost. Default: 0
        """
        self.tts_codec = TTSCodec()
        backend_config = TurbomindEngineConfig(cache_max_entry_count=memory_util, tp=tp, enable_prefix_caching=enable_prefix_caching, dtype='bfloat16', quant_policy=quant_policy)
        self.pipe = pipeline(model, backend_config=backend_config)
        self.gen_config = GenerationConfig(top_p=0.95,
                        top_k=50,
                        temperature=1.0,
                        max_new_tokens=1024,
                        repetition_penalty=1.1,
                        do_sample=True,
                        min_p=0.1,
                        min_new_tokens=40
                        )
        self.stored_dict = defaultdict(dict)
        # compile_upsampler_with_triton_check(self.tts_codec.upsampler)  ## optionally compiles the upsampler with triton for a considerable speed boost
    def encode_audio(self, voice):
        """
        Encodes the voice file. This takes time, so it is a good idea to cache the result for later use.

        Args:
            voice (str): audio file path
        """
        codes_str, transcript = self.tts_codec.encode_audio(voice, add_silence=32000)
        return codes_str, transcript

    def split_sentences(self, text):
        """
        Splits paragraphs into a list of sentences.

        Args:
            text (str): input paragraphs
        """
        sentences = re.split(r'(?<=[.!?])\s+', text)
        return sentences

    def decode_audio(self, tokens, batched=False):
        """
        Decodes audio from neucodec tokens.

        Args:
            tokens (list/str): list or str of tokens to decode
            batched (bool): whether to decode tokens as a list or a single string
        """
        if batched:
            recon, pad_len = self.tts_codec.decode_tokens_batched(tokens)
            audio = recon.squeeze(1).flatten().numpy()
            if pad_len:  # guard against pad_len == 0, where audio[:-0] would drop everything
                audio = audio[:-pad_len * 480]
        else:
            audio = self.tts_codec.decode_tokens(tokens)[0][0][0].numpy()

        return audio

    def generate(self, prompt, codes_str, transcript):
        """
        Generates speech from text, for a single batch.

        Args:
            prompt (str): input text for the tts model
            codes_str (str): reference speech codes from encode_audio
            transcript (str): reference transcript from encode_audio
        """
        formatted_prompt = self.tts_codec.format_prompt(prompt, transcript, codes_str)
        responses = self.pipe([formatted_prompt], gen_config=self.gen_config, do_preprocess=False)
        generated_tokens = responses[0].text
        audio = self.decode_audio(generated_tokens)
        return audio

    def batch_generate(self, prompts, codes_strs, transcripts):
        """
        Generates speech from text, for larger batch sizes.

        Args:
            prompts (list): input texts for the tts model
            codes_strs (list): reference speech codes, cycled to match prompts
            transcripts (list): reference transcripts, cycled to match prompts
        """
        formatted_prompts = []
        for prompt, code_str, transcript in zip(prompts, cycle(codes_strs), cycle(transcripts)):
            formatted_prompt = self.tts_codec.format_prompt(prompt, transcript, code_str)
            formatted_prompts.append(formatted_prompt)

        responses = self.pipe(formatted_prompts, gen_config=self.gen_config, do_preprocess=False)
        generated_tokens = [response.text for response in responses]
        audios = self.decode_audio(generated_tokens, batched=True)
        return audios

    async def stream_audio(self, text, user_id, display_audio=True):
        """
        Fast async generator for streaming audio: as low as 100ms latency (depends on the text length and the reference file).

        Args:
            text (str): input text for the tts model, a single prompt
            user_id (int): unique user id for each separate user stored in stored_dict
            display_audio (bool): whether to display the audio at the end
        """
        all_audios = []
        all_tokens = ""
        num_tokens = 0
        first_audio = True
        fade_samples = 100

        transcript = self.stored_dict[f"{user_id}"]['transcript']
        codes_str = self.stored_dict[f"{user_id}"]['codes_str']

        prompt = self.tts_codec.format_prompt(text, transcript, codes_str)

        t0 = time.time()
        async for response in self.pipe.generate(messages=prompt, gen_config=self.gen_config, session_id=user_id, sequence_start=True, sequence_end=True, do_preprocess=False):
            all_tokens += response.response
            num_tokens += 1
            if num_tokens == 50:

                if first_audio:
                    print(f"Latency is {time.time() - t0} seconds.")
                    first_audio = False

                # Decode the 50-token chunk and crossfade it with the previous chunk.
                wav = self.decode_audio(all_tokens, False).astype(np.float32)
                all_audios.append(wav)
                wav = overlap(all_audios, fade_samples)
                all_audios[-1] = wav

                yield wav

                num_tokens = 0
                all_tokens = ""

        # Flush any remaining tokens as a final chunk.
        if num_tokens > 20:
            wav = self.decode_audio(all_tokens, False).astype(np.float32)

            all_audios.append(wav)
            wav = overlap(all_audios, fade_samples)
            all_audios[-1] = wav

            yield wav

        if display_audio:
            display(Audio(np.concatenate(all_audios), rate=24000))
    def add_speaker(self, audio_file, speaker_id=None):
        """
        Adds a new user with a unique speaker transcript and codes; returns the user id to use with the stream_audio function.

        Args:
            audio_file (str): new audio file to encode and create a unique user id for
            speaker_id: optional preferred user id; a random one is generated if not provided
        """
        codes_str, transcript = self.encode_audio(audio_file)

        user_id = speaker_id or random.randint(100000, 999999)
        self.stored_dict[f"{user_id}"]['transcript'] = transcript
        self.stored_dict[f"{user_id}"]['codes_str'] = codes_str
        return user_id
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------