├── logs
│   └── test_male.wav
├── image
│   └── api_for_tts.png
├── __pycache__
│   └── tts_inference.cpython-38.pyc
├── modules
│   ├── __pycache__
│   │   ├── synthsizer.cpython-38.pyc
│   │   └── model_download.cpython-38.pyc
│   ├── model_download.py
│   ├── tokenizer.py
│   └── synthsizer.py
├── app.py
├── README.md
├── inference.py
└── requirements.txt
/logs/test_male.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saiful9379/Bangla_TTS/HEAD/logs/test_male.wav
--------------------------------------------------------------------------------
/image/api_for_tts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saiful9379/Bangla_TTS/HEAD/image/api_for_tts.png
--------------------------------------------------------------------------------
/__pycache__/tts_inference.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saiful9379/Bangla_TTS/HEAD/__pycache__/tts_inference.cpython-38.pyc
--------------------------------------------------------------------------------
/modules/__pycache__/synthsizer.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saiful9379/Bangla_TTS/HEAD/modules/__pycache__/synthsizer.cpython-38.pyc
--------------------------------------------------------------------------------
/modules/__pycache__/model_download.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saiful9379/Bangla_TTS/HEAD/modules/__pycache__/model_download.cpython-38.pyc
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
import time
from flask import Flask, request, jsonify
from inference import model_loading, bangla_tts
from modules.model_download import download_file

DEBUG = True

# download the pretrained female model if needed, then load it once at startup
model_path, config_path = download_file(output_path="models", gender="female")
tts_model = model_loading(model_path=model_path, config_path=config_path)

app = Flask(__name__)

@app.route('/tts', methods=['POST'])
def process_text():
    st = time.time()
    data = request.get_json()
    sender = data.get('sender', '')
    text = data.get('text', '')
    save_dir = data.get('save_dir', 'logs/unknown.wav')
    print("==============================")
    print("request : ", request)
    print("==============================")
    print(f"sender : {sender}")
    print(f"text : {text}")

    # synthesize the text and write the waveform to `save_dir`
    bangla_tts(
        model=tts_model,
        text=text,
        is_male=False,
        is_e2e_vits=True,
        log_dir=save_dir
    )

    response = {
        "audio_url": save_dir,
        "sender": sender,
        "status": 200,
        "processing_time": time.time() - st
    }
    return jsonify(response)

if __name__ == '__main__':
    # bind to your own machine's IP (or 0.0.0.0) and port
    app.run(debug=DEBUG, host="192.168.0.114", port=8009)
--------------------------------------------------------------------------------
"https://huggingface.co/bangla-speech-processing/bangla_tts_male/resolve/main/config.json" 13 | }, 14 | "female" : { 15 | "model_path" : 'https://huggingface.co/bangla-speech-processing/bangla_tts_female/resolve/main/pytorch_model.pth', 16 | "config" : "https://huggingface.co/bangla-speech-processing/bangla_tts_female/resolve/main/config.json" 17 | } 18 | } 19 | 20 | def download_file(root_dir = "./", output_path="models", gender = "female"): 21 | path_dir = os.path.join(root_dir, output_path, gender) 22 | 23 | model_dir = os.path.join(path_dir, "pytorch_model.pth") 24 | config_dir = os.path.join(path_dir, "config.json") 25 | 26 | 27 | 28 | # print(model_dir) 29 | os.makedirs(path_dir, exist_ok=True) 30 | 31 | if os.path.exists(model_dir) and os.path.exists(config_dir): 32 | print("model and config already exits") 33 | else: 34 | wget.download(model_dict[gender]["config"], out=path_dir) 35 | wget.download(model_dict[gender]["model_path"], out=path_dir) 36 | 37 | return model_dir, config_dir 38 | 39 | 40 | if __name__ == "__main__": 41 | model_dir, config_dir = download_file(root_dir=root_dir) 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Bangla TTS 3 | The Bangla TTS was training mono(male) speakers using Vit TTS model. The paper is ViT-TTS: Visual Text-to-Speech with Scalable Diffusion Transformer, we used the coqui-ai🐸-toolkit for Bangla Text-to-Speech training as well as inference. 4 | 5 | __N.B : This pipeline only for inference as well as end point API testing purposes.__ 6 | 7 | __Please check the faster test into [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ea_BVSinWFy_9W2AH7NI55Ur0XO4Tr-a?usp=sharing) 8 | 9 | # Requiremnts 10 | Create Environments 11 | ``` 12 | conda create -n bn_tts python==3.8 13 | conda activate bn_tts 14 | ``` 15 | Install require modules 16 | 17 | ``` 18 | pip install -r requirements.txt 19 | ``` 20 | # Dataset 21 | 22 | Bangla Speech corpus prepared by the Indic TTS Team of IIT Madras. I've downsampled the dataset down to 22050 and converted the raw iitm annotation format into ljspeech format for training several TTS models for bangla. 23 | in this dataset, i am sharing the final processed dataset for Bangla TTS along with trained best models weight files. please cite this paper: https://aclanthology.org/2020.lrec-1.789.pdf if you are using the dataset in your research works. 24 | 25 | Dataset link: https://www.kaggle.com/datasets/mobassir/comprehensive-bangla-tts 26 | 27 | 28 | 29 | # Training 30 | 31 | Training code [jupyter](train_bangla_vits.ipynb) 32 | 33 | 34 | # Single Test[Inference] 35 | 36 | For the single testing run, 37 | 38 | ``` 39 | python inference.py 40 | ``` 41 | or 42 | 43 | Inference on [jupyter notebook](inference.ipynb) 44 | 45 | [huggingface](https://huggingface.co/bangla-speech-processing/bangla_tts_female) 46 | 47 | 48 | # End Point API 49 | For the API testing, 50 | 51 | ### 1. Run the ```app.py``` script 52 | ``` 53 | python app.py 54 | 55 | ``` 56 | ### 2. 

# End Point API
For API testing,

### 1. Run the ```app.py``` script
```
python app.py
```
### 2. Testing using a Python request
Write a .py script with the code below and run it; the synthesized .wav file will be saved into the `logs` directory.

```
import os
import time

import requests

username = "saiful"
text = "আপনি কেমন আছেন।"
log_dir = "logs"
filename = "audio_file_" + str(time.strftime("%Y%m%d-%H%M%S")) + ".wav"
os.makedirs(log_dir, exist_ok=True)

file_dir = os.path.join(log_dir, filename)
# use the host/IP and port of the machine where the API is running
url = 'http://192.168.1.154:8009/tts'

payload = {
    "text": text,
    "sender": username,
    "save_dir": file_dir
}
headers = {'content-type': 'application/json'}
result = requests.post(url, json=payload, headers=headers)
print(result.json())
```

### 3. If you want to use Postman, skip step 2

![alt text](image/api_for_tts.png)
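
Alternatively, you can hit the endpoint from the command line with curl (adjust the host, port, and `save_dir`; `save_dir` is the path where the server process writes the wav file):

```
curl -X POST http://192.168.1.154:8009/tts \
     -H "Content-Type: application/json" \
     -d '{"text": "আপনি কেমন আছেন।", "sender": "saiful", "save_dir": "logs/test.wav"}'
```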

# Reference

1. https://aclanthology.org/2020.lrec-1.789.pdf
2. https://arxiv.org/pdf/2106.06103.pdf
3. https://arxiv.org/abs/2005.11129
4. https://aclanthology.org/2020.emnlp-main.207.pdf
5. https://github.com/mobassir94
--------------------------------------------------------------------------------
/inference.py:
--------------------------------------------------------------------------------
"""
@Author : Saiful Islam
@Email : saifulbrur79@gmail.com
"""
import re
import os
import torch
import bangla
import soundfile as sf
from bnnumerizer import numerize
from bnunicodenormalizer import Normalizer
from modules.model_download import download_file
from modules.synthsizer import Synthesizer

bnorm = Normalizer()
root_dir = os.getcwd()

# use the GPU only when one is actually available
use_cuda = torch.cuda.is_available()
DEBUG_SAVE = True

# select the pretrained model: female or male
DEBUG_GENDER = ["female", "male"]
GENDER = DEBUG_GENDER[1]

def model_loading(model_path=None, config_path=None):
    tts_bn_model = Synthesizer(
        model_path,
        config_path,
        use_cuda=use_cuda
    )
    return tts_bn_model

def normalize(sen):
    _words = [bnorm(word)['normalized'] for word in sen.split()]
    return " ".join([word for word in _words if word is not None])

def bangla_tts(model: object = None, text="আমি বাংলা শিখেছি", is_male=True, is_e2e_vits=True, log_dir=None):
    '''
    params:
        model : a loaded Synthesizer instance (see `model_loading`).
        text : input Bangla text that needs to be synthesized.
        is_male : currently unused; the voice is determined by the loaded checkpoint.
        is_e2e_vits : currently unused; the released checkpoints are end-to-end VITS models.
        log_dir : optional output .wav path; if set, the audio is also saved there.
    returns:
        numpy array containing the synthesized waveform at 22050 Hz.
    '''
    if text[-1] != '।':
        text += '।'
    # convert English digits to Bangla digits
    res = re.search('[0-9]', text)
    if res is not None:
        text = bangla.convert_english_digit_to_bangla_digit(text)

    # replace ':' in between two Bangla digits with ' এর '
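    # e.g. "১০:৩০" becomes "১০ এর ৩০"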
    pattern = r"[০১২৩৪৫৬৭৮৯]:[০১২৩৪৫৬৭৮৯]"
    matches = re.findall(pattern, text)
    for m in matches:
        r = m.replace(":", " এর ")
        text = text.replace(m, r)
    try:
        text = numerize(text)
    except Exception:
        pass
    text = normalize(text)
    sentence_enders = re.compile('[।!?]')
    sentences = sentence_enders.split(str(text))
    audio_list = []
    for sen in sentences:
        if not sen:
            continue
        # synthesize each sentence separately, re-appending the danda
        audio_list.append(torch.as_tensor(model.tts(sen + '।')))
    audio = torch.cat(audio_list)
    numpy_audio = audio.detach().cpu().numpy()
    if log_dir is not None:
        os.makedirs(os.path.dirname(log_dir) or ".", exist_ok=True)
        sf.write(log_dir, numpy_audio, 22050)
    return numpy_audio

if __name__ == "__main__":

    text = 'রওশন এরশাদের সঙ্গে দেখা করলেন জিএম কাদের।'
    fileName = 'logs/test_male.wav'

    print("Model Downloading : .......")
    model_path, config_path = download_file(
        root_dir=root_dir,
        output_path="models",
        gender=GENDER
    )
    print("Done")

    tts_bn_model = model_loading(
        model_path=model_path,
        config_path=config_path
    )
    audio = bangla_tts(
        model=tts_bn_model,
        text=text,
        is_male=False,
        is_e2e_vits=True
    )

    if DEBUG_SAVE:
        os.makedirs("logs", exist_ok=True)
        sf.write(fileName, audio, 22050)
        # from IPython.display import Audio; Audio(fileName, autoplay=True)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
absl-py==1.4.0
accelerate==0.21.0
aiohttp==3.8.5
aiosignal==1.3.1
anyascii==0.3.2
anyio
appdirs==1.4.4
argon2-cffi
argon2-cffi-bindings
asttokens
async-timeout==4.0.2
attrs==23.1.0
audioread==3.0.0
Babel==2.12.1
backcall
backports.zoneinfo==0.2.1
bangla==0.0.2
beautifulsoup4
bleach
blinker==1.6.2
bnnumerizer==0.0.2
bnunicodenormalizer==0.1.1
boltons==23.0.0
cachetools==5.3.1
certifi==2023.7.22
cffi
charset-normalizer==3.2.0
clean-fid==0.1.35
click==8.1.6
clip-anytorch==2.5.2
comm
contourpy==1.1.0
coqpit==0.0.17
cycler==0.11.0
Cython==0.29.28
dateparser==1.1.8
debugpy
decorator
defusedxml
docker-pycreds==0.4.0
docopt==0.6.2
einops==0.6.1
entrypoints
executing
fastjsonschema
filelock==3.12.2
Flask==2.3.2
fonttools==4.42.0
frozenlist==1.4.0
fsspec==2023.6.0
ftfy==6.1.1
g2pkk==0.1.2
gitdb==4.0.10
GitPython==3.1.32
google-auth==2.22.0
google-auth-oauthlib==1.0.0
grpcio==1.56.2
gruut==2.2.3
gruut-ipa==0.13.0
gruut-lang-de==2.0.0
gruut-lang-en==2.0.0
gruut-lang-es==2.0.0
gruut-lang-fr==2.0.2
huggingface-hub==0.16.4
idna
imageio==2.31.1
importlib-metadata==6.8.0
importlib-resources==6.0.0
inflect==5.6.0
ipykernel
ipython
ipython-genutils
itsdangerous==2.1.2
jamo==0.4.1
jedi
jieba==0.42.1
Jinja2
joblib==1.3.1
jsonlines==1.2.0
jsonmerge==1.9.2
jsonschema==4.18.6
jsonschema-specifications==2023.7.1
jupyter-server
jupyter_client
jupyter_core
jupyterlab-pygments
k-diffusion==0.0.15
kiwisolver==1.4.4
kornia==0.7.0
lazy_loader==0.3
librosa==0.10.0.post2
llvmlite==0.38.1
lxml
Markdown==3.4.4
MarkupSafe==2.1.3
matplotlib==3.7.2
matplotlib-inline
mecab-python3==1.0.5
mistune==0.8.4
msgpack==1.0.5
multidict==6.0.4
nb-conda==2.2.1
nb-conda-kernels
nbclassic
nbclient
nbconvert
nbformat
nest-asyncio
networkx==2.8.8
nltk==3.8.1
notebook
notebook_shim
num2words==0.5.12
numba==0.55.1
numpy==1.21.6
oauthlib==3.2.2
packaging==23.1
pandas==2.0.3
pandocfilters
parso
pathtools==0.1.2
pexpect
pickleshare
Pillow==10.0.0
pkgutil_resolve_name
platformdirs
pooch==1.6.0
prometheus-client
prompt-toolkit
protobuf==3.19.6
psutil==5.9.5
ptyprocess
pure-eval
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser
pydub==0.25.1
Pygments
pynndescent==0.5.10
pyparsing==3.0.9
pypinyin==0.49.0
pyrsistent
python-crfsuite==0.9.9
python-dateutil
pytz==2023.3
PyWavelets==1.4.1
PyYAML==6.0.1
pyzmq
referencing==0.30.2
regex==2023.6.3
requests==2.31.0
requests-oauthlib==1.3.1
resize-right==0.0.2
rpds-py==0.9.2
rsa==4.9
safetensors==0.3.1
scikit-image==0.21.0
scikit-learn==1.3.0
scipy==1.10.1
Send2Trash
sentry-sdk==1.29.2
setproctitle==1.3.2
six
smmap==5.0.0
sniffio
soundfile==0.12.1
soupsieve
soxr==0.3.5
stack-data
tensorboard==2.13.0
tensorboard-data-server==0.7.1
tensorboardX==2.6.2
terminado
threadpoolctl==3.2.0
tifffile==2023.7.10
tinycss2
tokenizers==0.13.3
torch==1.12.1+cu113
torchaudio==0.12.1+cu113
torchdiffeq==0.2.3
torchsde==0.2.5
torchvision==0.13.1+cu113
tornado
tqdm==4.65.0
trainer==0.0.20
traitlets
trampoline==0.1.2
transformers==4.31.0
TTS==0.14.3
typing_extensions
tzdata==2023.3
tzlocal==5.0.1
umap-learn==0.5.1
unidic-lite==1.0.8
urllib3==1.26.16
wandb==0.15.8
wcwidth==0.2.6
webencodings==0.5.1
websocket-client
Werkzeug==2.3.6
wget==3.2
yarl==1.9.2
zipp==3.16.2
--------------------------------------------------------------------------------
/modules/tokenizer.py:
--------------------------------------------------------------------------------

# from https://github.com/coqui-ai/TTS/blob/dev/TTS/tts/utils/text/tokenizer.py

from typing import Callable, Dict, List, Union

from TTS.tts.utils.text import cleaners
from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes
from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
from TTS.utils.generic_utils import get_import_path, import_class


class TTSTokenizer:
    """🐸TTS tokenizer to convert input characters to token IDs and back.
    Token IDs for OOV chars are discarded but those are stored in `self.not_found_characters` for later.
    Args:
        use_phonemes (bool):
            Whether to use phonemes instead of characters. Defaults to False.
        characters (Characters):
            A Characters object to use for character-to-ID and ID-to-character mappings.
        text_cleaner (callable):
            A function to pre-process the text before tokenization and phonemization. Defaults to None.
        phonemizer (Phonemizer):
            A phonemizer object or a dict that maps language codes to phonemizer objects. Defaults to None.
    Example:
        >>> from TTS.tts.utils.text.tokenizer import TTSTokenizer
        >>> tokenizer = TTSTokenizer(use_phonemes=False, characters=Graphemes())
        >>> text = "Hello world!"
        >>> ids = tokenizer.text_to_ids(text)
        >>> text_hat = tokenizer.ids_to_text(ids)
        >>> assert text == text_hat
    """

    def __init__(
        self,
        use_phonemes=False,
        text_cleaner: Callable = None,
        characters: "BaseCharacters" = None,
        phonemizer: Union["Phonemizer", Dict] = None,
        add_blank: bool = False,
        use_eos_bos=False,
    ):
        self.text_cleaner = text_cleaner
        self.use_phonemes = use_phonemes
        self.add_blank = add_blank
        self.use_eos_bos = use_eos_bos
        self.characters = characters
        self.not_found_characters = []
        self.phonemizer = phonemizer

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, new_characters):
        self._characters = new_characters
        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None

    def encode(self, text: str) -> List[int]:
        """Encodes a string of text as a sequence of IDs."""
        token_ids = []
        for char in text:
            try:
                idx = self.characters.char_to_id(char)
                token_ids.append(idx)
            except KeyError:
                # discard but store not found characters
                if char not in self.not_found_characters:
                    self.not_found_characters.append(char)
                    # print(text)
                    # print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
        return token_ids

    def decode(self, token_ids: List[int]) -> str:
        """Decodes a sequence of IDs to a string of text."""
        text = ""
        for token_id in token_ids:
            text += self.characters.id_to_char(token_id)
        return text

    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
        """Converts a string of text to a sequence of token IDs.
        Args:
            text(str):
                The text to convert to token IDs.
            language(str):
                The language code of the text. Defaults to None.
        TODO:
            - Add support for language-specific processing.
        1. Text normalization
        2. Phonemization (if use_phonemes is True)
        3. Add blank char between characters
        4. Add BOS and EOS characters
        5. Text to token IDs
        """
        # TODO: text cleaner should pick the right routine based on the language
        if self.text_cleaner is not None:
            text = self.text_cleaner(text)
        if self.use_phonemes:
            text = self.phonemizer.phonemize(text, separator="")
        if self.add_blank:
            text = self.intersperse_blank_char(text, True)
        if self.use_eos_bos:
            text = self.pad_with_bos_eos(text)
        return self.encode(text)

    def ids_to_text(self, id_sequence: List[int]) -> str:
        """Converts a sequence of token IDs to a string of text."""
        return self.decode(id_sequence)

    def pad_with_bos_eos(self, char_sequence: List[str]):
        """Pads a sequence with the special BOS and EOS characters."""
        return [self.characters.bos] + list(char_sequence) + [self.characters.eos]

    def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
        """Intersperses the blank character between characters in a sequence.
        Use the ```blank``` character if defined else use the ```pad``` character.
        """
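        # e.g. ["a", "b", "c"] with blank "<B>" gives ["<B>", "a", "<B>", "b", "<B>", "c", "<B>"]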
        char_to_use = self.characters.blank if use_blank_char else self.characters.pad
        result = [char_to_use] * (len(char_sequence) * 2 + 1)
        result[1::2] = char_sequence
        return result

    def print_logs(self, level: int = 0):
        indent = "\t" * level
        print(f"{indent}| > add_blank: {self.add_blank}")
        print(f"{indent}| > use_eos_bos: {self.use_eos_bos}")
        print(f"{indent}| > use_phonemes: {self.use_phonemes}")
        if self.use_phonemes:
            print(f"{indent}| > phonemizer:")
            self.phonemizer.print_logs(level + 1)
        if len(self.not_found_characters) > 0:
            print(f"{indent}| > {len(self.not_found_characters)} not found characters:")
            for char in self.not_found_characters:
                print(f"{indent}| > {char}")

    @staticmethod
    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
        """Init Tokenizer object from config
        Args:
            config (Coqpit): Coqpit model config.
            characters (BaseCharacters): Defines the model character set. If not set, use the default options based on
                the config values. Defaults to None.
        """
        # init cleaners
        text_cleaner = None
        if isinstance(config.text_cleaner, (str, list)):
            text_cleaner = getattr(cleaners, config.text_cleaner)

        # init characters
        if characters is None:
            # set characters based on defined characters class
            if config.characters and config.characters.characters_class:
                CharactersClass = import_class(config.characters.characters_class)
                characters, new_config = CharactersClass.init_from_config(config)
            # set characters based on config
            else:
                if config.use_phonemes:
                    # init phoneme set
                    characters, new_config = IPAPhonemes().init_from_config(config)
                else:
                    # init character set
                    characters, new_config = Graphemes().init_from_config(config)

        else:
            characters, new_config = characters.init_from_config(config)

        # set characters class
        new_config.characters.characters_class = get_import_path(characters)

        # init phonemizer
        phonemizer = None
        if config.use_phonemes:
            phonemizer_kwargs = {"language": config.phoneme_language}

            if "phonemizer" in config and config.phonemizer:
                phonemizer = get_phonemizer_by_name(config.phonemizer, **phonemizer_kwargs)
            else:
                try:
                    phonemizer = get_phonemizer_by_name(
                        DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
                    )
                    new_config.phonemizer = phonemizer.name()
                except KeyError as e:
                    raise ValueError(
                        f"""No phonemizer found for language {config.phoneme_language}.
                        You may need to install a third party library for this language."""
                    ) from e

        return (
            TTSTokenizer(
                config.use_phonemes, text_cleaner, characters, phonemizer, config.add_blank, config.enable_eos_bos_chars
            ),
            new_config,
        )
--------------------------------------------------------------------------------
/modules/synthsizer.py:
--------------------------------------------------------------------------------
import time
from typing import List

import numpy as np
import pysbd
import torch

from TTS.config import load_config
from TTS.tts.models import setup_model as setup_tts_model
from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence
from TTS.utils.audio import AudioProcessor
from TTS.vocoder.models import setup_model as setup_vocoder_model
from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input
# adapted from https://github.com/coqui-ai/TTS/blob/dev/TTS/utils/synthesizer.py


class Synthesizer(object):
    def __init__(
        self,
        tts_checkpoint: str,
        tts_config_path: str,
        tts_speakers_file: str = "",
        tts_languages_file: str = "",
        vocoder_checkpoint: str = "",
        vocoder_config: str = "",
        encoder_checkpoint: str = "",
        encoder_config: str = "",
        use_cuda: bool = False,
    ) -> None:
        """General 🐸 TTS interface for inference. It takes a tts and a vocoder
        model and synthesizes speech from the provided text.
        The text is divided into a list of sentences using `pysbd` and speech is
        synthesized for each sentence separately.
        If you have certain special characters in your text, you need to handle
        them before providing the text to Synthesizer.
        TODO: set the segmenter based on the source language
        Args:
            tts_checkpoint (str): path to the tts model file.
            tts_config_path (str): path to the tts config file.
            vocoder_checkpoint (str, optional): path to the vocoder model file. Defaults to None.
            vocoder_config (str, optional): path to the vocoder config file. Defaults to None.
            encoder_checkpoint (str, optional): path to the speaker encoder model file. Defaults to `""`.
            encoder_config (str, optional): path to the speaker encoder config file. Defaults to `""`.
            use_cuda (bool, optional): enable/disable cuda. Defaults to False.
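        Example (illustrative paths; any checkpoint/config pair downloaded by
        `modules/model_download.py` works):
            >>> synth = Synthesizer("models/female/pytorch_model.pth", "models/female/config.json")
            >>> wav = synth.tts("আমি বাংলা শিখেছি।")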
        """
        self.tts_checkpoint = tts_checkpoint
        self.tts_config_path = tts_config_path
        self.tts_speakers_file = tts_speakers_file
        self.tts_languages_file = tts_languages_file
        self.vocoder_checkpoint = vocoder_checkpoint
        self.vocoder_config = vocoder_config
        self.encoder_checkpoint = encoder_checkpoint
        self.encoder_config = encoder_config
        self.use_cuda = use_cuda

        self.tts_model = None
        self.vocoder_model = None
        self.speaker_manager = None
        self.num_speakers = 0
        self.tts_speakers = {}
        self.language_manager = None
        self.num_languages = 0
        self.tts_languages = {}
        self.d_vector_dim = 0
        self.seg = self._get_segmenter("en")
        self.use_cuda = use_cuda

        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self._load_tts(tts_checkpoint, tts_config_path, use_cuda)
        self.output_sample_rate = self.tts_config.audio["sample_rate"]
        if vocoder_checkpoint:
            self._load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda)
            self.output_sample_rate = self.vocoder_config.audio["sample_rate"]

    @staticmethod
    def _get_segmenter(lang: str):
        """Get the sentence segmenter for the given language.
        Args:
            lang (str): target language code.
        Returns:
            [type]: [description]
        """
        return pysbd.Segmenter(language=lang, clean=True)

    def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None:
        """Load the TTS model.
        1. Load the model config.
        2. Init the model from the config.
        3. Load the model weights.
        4. Move the model to the GPU if CUDA is enabled.
        5. Init the speaker manager in the model.
        Args:
            tts_checkpoint (str): path to the model checkpoint.
            tts_config_path (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        # pylint: disable=global-statement
        self.tts_config = load_config(tts_config_path)
        if self.tts_config["use_phonemes"] and self.tts_config["phonemizer"] is None:
            raise ValueError("Phonemizer is not defined in the TTS config.")

        self.tts_model = setup_tts_model(config=self.tts_config)

        if not self.encoder_checkpoint:
            self._set_speaker_encoder_paths_from_tts_config()

        self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True)
        if use_cuda:
            self.tts_model.cuda()

        if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"):
            self.tts_model.speaker_manager.init_encoder(self.encoder_checkpoint, self.encoder_config, use_cuda)

    def _set_speaker_encoder_paths_from_tts_config(self):
        """Set the encoder paths from the tts model config for models with speaker encoders."""
        if hasattr(self.tts_config, "model_args") and hasattr(
            self.tts_config.model_args, "speaker_encoder_config_path"
        ):
            self.encoder_checkpoint = self.tts_config.model_args.speaker_encoder_model_path
            self.encoder_config = self.tts_config.model_args.speaker_encoder_config_path

    def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None:
        """Load the vocoder model.
        1. Load the vocoder config.
        2. Init the AudioProcessor for the vocoder.
        3. Init the vocoder model from the config.
        4. Move the model to the GPU if CUDA is enabled.
        Args:
            model_file (str): path to the model checkpoint.
            model_config (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        self.vocoder_config = load_config(model_config)
        self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config.audio)
        self.vocoder_model = setup_vocoder_model(self.vocoder_config)
        self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True)
        if use_cuda:
            self.vocoder_model.cuda()

    def split_into_sentences(self, text) -> List[str]:
        """Split the given text into sentences.
        Args:
            text (str): input text in string format.
        Returns:
            List[str]: list of sentences.
        """
        return self.seg.segment(text)

    def save_wav(self, wav: List[int], path: str) -> None:
        """Save the waveform as a file.
        Args:
            wav (List[int]): waveform as a list of values.
            path (str): output path to save the waveform.
        """
        wav = np.array(wav)
        self.tts_model.ap.save_wav(wav, path, self.output_sample_rate)

    def tts(
        self,
        text: str = "",
        speaker_name: str = "",
        language_name: str = "",
        speaker_wav=None,
        style_wav=None,
        style_text=None,
        reference_wav=None,
        reference_speaker_name=None,
    ) -> List[int]:
        """🐸 TTS magic. Run all the models and generate speech.
        Args:
            text (str): input text.
            speaker_name (str, optional): speaker id for multi-speaker models. Defaults to "".
            language_name (str, optional): language id for multi-language models. Defaults to "".
            speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None.
            style_wav ([type], optional): style waveform for GST. Defaults to None.
            style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None.
            reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
            reference_speaker_name ([type], optional): speaker id of the reference waveform. Defaults to None.
        Returns:
            List[int]: [description]
        """
        start_time = time.time()
        wavs = []

        if not text and not reference_wav:
            raise ValueError(
                "You need to define either `text` (for synthesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
            )

        if text:
            sens = self.split_into_sentences(text)
            # print(" > Text split into sentences.")
            # print(sens)

        # handle multi-speaker
        speaker_embedding = None
        speaker_id = None
        if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
            if speaker_name and isinstance(speaker_name, str):
                if self.tts_config.use_d_vector_file:
                    # get the average speaker embedding from the saved d_vectors.
                    speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
                        speaker_name, num_samples=None, randomize=False
                    )
                    speaker_embedding = np.array(speaker_embedding)[None, :]  # [1 x embedding_dim]
                else:
                    # get speaker idx from the speaker name
                    speaker_id = self.tts_model.speaker_manager.name_to_id[speaker_name]

            elif not speaker_name and not speaker_wav:
                raise ValueError(
                    " [!] It looks like you are using a multi-speaker model. "
                    "You need to define either a `speaker_name` or a `speaker_wav` to use a multi-speaker model."
                )
            else:
                speaker_embedding = None
        else:
            if speaker_name:
                raise ValueError(
                    f" [!] Missing speakers.json file path for selecting speaker {speaker_name}."
                    "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. "
                )

        # handle multi-lingual
        language_id = None
        if self.tts_languages_file or (
            hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None
        ):
            if language_name and isinstance(language_name, str):
                language_id = self.tts_model.language_manager.name_to_id[language_name]

            elif not language_name:
                raise ValueError(
                    " [!] It looks like you are using a multi-lingual model. "
                    "You need to define either a `language_name` or a `style_wav` to use a multi-lingual model."
                )

            else:
                raise ValueError(
                    f" [!] Missing language_ids.json file path for selecting language {language_name}."
                    "Define path for language_ids.json if it is a multi-lingual model or remove defined language idx. "
                )

        # compute a new d_vector from the given clip.
        if speaker_wav is not None:
            speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav)

        use_gl = self.vocoder_model is None

        if not reference_wav:
            for sen in sens:
                # synthesize voice
                outputs = synthesis(
                    model=self.tts_model,
                    text=sen,
                    CONFIG=self.tts_config,
                    use_cuda=self.use_cuda,
                    speaker_id=speaker_id,
                    style_wav=style_wav,
                    style_text=style_text,
                    use_griffin_lim=use_gl,
                    d_vector=speaker_embedding,
                    language_id=language_id,
                )
                waveform = outputs["wav"]
                mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()
                if not use_gl:
                    # denormalize tts output based on tts audio config
                    mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T
                    device_type = "cuda" if self.use_cuda else "cpu"
                    # renormalize spectrogram based on vocoder config
                    vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
                    # compute scale factor for possible sample rate mismatch
                    scale_factor = [
                        1,
                        self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate,
                    ]
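                    # e.g. a 44100 Hz vocoder over a 22050 Hz TTS model gives scale_factor [1, 2.0]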
                    if scale_factor[1] != 1:
                        print(" > interpolating tts model output.")
                        vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
                    else:
                        vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
                    # run vocoder model
                    # [1, T, C]
                    waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
                if self.use_cuda and not use_gl:
                    waveform = waveform.cpu()
                if not use_gl:
                    waveform = waveform.numpy()
                waveform = waveform.squeeze()

                # trim silence
                if "do_trim_silence" in self.tts_config.audio and self.tts_config.audio["do_trim_silence"]:
                    waveform = trim_silence(waveform, self.tts_model.ap)

                wavs += list(waveform)
                wavs += [0] * 10000
        else:
            # get the speaker embedding or speaker id for the reference wav file
            reference_speaker_embedding = None
            reference_speaker_id = None
            if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
                if reference_speaker_name and isinstance(reference_speaker_name, str):
                    if self.tts_config.use_d_vector_file:
                        # get the speaker embedding from the saved d_vectors.
                        reference_speaker_embedding = self.tts_model.speaker_manager.get_embeddings_by_name(
                            reference_speaker_name
                        )[0]
                        reference_speaker_embedding = np.array(reference_speaker_embedding)[
                            None, :
                        ]  # [1 x embedding_dim]
                    else:
                        # get speaker idx from the speaker name
                        reference_speaker_id = self.tts_model.speaker_manager.name_to_id[reference_speaker_name]
                else:
                    reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(
                        reference_wav
                    )
            outputs = transfer_voice(
                model=self.tts_model,
                CONFIG=self.tts_config,
                use_cuda=self.use_cuda,
                reference_wav=reference_wav,
                speaker_id=speaker_id,
                d_vector=speaker_embedding,
                use_griffin_lim=use_gl,
                reference_speaker_id=reference_speaker_id,
                reference_d_vector=reference_speaker_embedding,
            )
            waveform = outputs
            if not use_gl:
                mel_postnet_spec = outputs[0].detach().cpu().numpy()
                # denormalize tts output based on tts audio config
                mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T
                device_type = "cuda" if self.use_cuda else "cpu"
                # renormalize spectrogram based on vocoder config
                vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
                # compute scale factor for possible sample rate mismatch
                scale_factor = [
                    1,
                    self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate,
                ]
                if scale_factor[1] != 1:
                    print(" > interpolating tts model output.")
                    vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
                else:
                    vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
                # run vocoder model
                # [1, T, C]
                waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
            if self.use_cuda:
                waveform = waveform.cpu()
            if not use_gl:
                waveform = waveform.numpy()
            wavs = waveform.squeeze()

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
        # print(f" > Processing time: {process_time}")
        # print(f" > Real-time factor: {process_time / audio_time}")
        return wavs
--------------------------------------------------------------------------------