├── logs
│   └── test_male.wav
├── image
│   └── api_for_tts.png
├── __pycache__
│   └── tts_inference.cpython-38.pyc
├── modules
│   ├── __pycache__
│   │   ├── synthsizer.cpython-38.pyc
│   │   └── model_download.cpython-38.pyc
│   ├── model_download.py
│   ├── tokenizer.py
│   └── synthsizer.py
├── app.py
├── README.md
├── inference.py
└── requirements.txt
/logs/test_male.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saiful9379/Bangla_TTS/HEAD/logs/test_male.wav
--------------------------------------------------------------------------------
/image/api_for_tts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saiful9379/Bangla_TTS/HEAD/image/api_for_tts.png
--------------------------------------------------------------------------------
/__pycache__/tts_inference.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saiful9379/Bangla_TTS/HEAD/__pycache__/tts_inference.cpython-38.pyc
--------------------------------------------------------------------------------
/modules/__pycache__/synthsizer.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saiful9379/Bangla_TTS/HEAD/modules/__pycache__/synthsizer.cpython-38.pyc
--------------------------------------------------------------------------------
/modules/__pycache__/model_download.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saiful9379/Bangla_TTS/HEAD/modules/__pycache__/model_download.cpython-38.pyc
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
import time
from flask import Flask, request, jsonify
from inference import model_loading, bangla_tts
from modules.model_download import download_file

DEBUG = True

# download the pretrained female model if needed, then load it once at startup
model_path, config_path = download_file(output_path="models", gender="female")
tts_model = model_loading(model_path=model_path, config_path=config_path)

app = Flask(__name__)

@app.route('/tts', methods=['POST'])
def process_text():
    st = time.time()
    data = request.get_json()
    sender = data.get('sender', '')
    text = data.get('text', '')
    save_dir = data.get('save_dir', 'logs/unknown.wav')
    print("==============================")
    print("request : ", request)
    print("==============================")
    print(f"sender : {sender}")
    print(f"text : {text}")

    # synthesize the text and write the waveform to `save_dir`
    bangla_tts(
        model=tts_model,
        text=text,
        is_male=False,
        is_e2e_vits=True,
        log_dir=save_dir
    )

    response = {
        "audio_url": save_dir,
        "sender": sender,
        "status": 200,
        "processing_time": time.time() - st
    }
    return jsonify(response)

if __name__ == '__main__':
    # bind to your own machine's IP (or 0.0.0.0) and port
    app.run(debug=DEBUG, host="192.168.0.114", port=8009)
--------------------------------------------------------------------------------
"https://huggingface.co/bangla-speech-processing/bangla_tts_male/resolve/main/config.json" 13 | }, 14 | "female" : { 15 | "model_path" : 'https://huggingface.co/bangla-speech-processing/bangla_tts_female/resolve/main/pytorch_model.pth', 16 | "config" : "https://huggingface.co/bangla-speech-processing/bangla_tts_female/resolve/main/config.json" 17 | } 18 | } 19 | 20 | def download_file(root_dir = "./", output_path="models", gender = "female"): 21 | path_dir = os.path.join(root_dir, output_path, gender) 22 | 23 | model_dir = os.path.join(path_dir, "pytorch_model.pth") 24 | config_dir = os.path.join(path_dir, "config.json") 25 | 26 | 27 | 28 | # print(model_dir) 29 | os.makedirs(path_dir, exist_ok=True) 30 | 31 | if os.path.exists(model_dir) and os.path.exists(config_dir): 32 | print("model and config already exits") 33 | else: 34 | wget.download(model_dict[gender]["config"], out=path_dir) 35 | wget.download(model_dict[gender]["model_path"], out=path_dir) 36 | 37 | return model_dir, config_dir 38 | 39 | 40 | if __name__ == "__main__": 41 | model_dir, config_dir = download_file(root_dir=root_dir) 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Bangla TTS 3 | The Bangla TTS was training mono(male) speakers using Vit TTS model. The paper is ViT-TTS: Visual Text-to-Speech with Scalable Diffusion Transformer, we used the coqui-ai🐸-toolkit for Bangla Text-to-Speech training as well as inference. 4 | 5 | __N.B : This pipeline only for inference as well as end point API testing purposes.__ 6 | 7 | __Please check the faster test into [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ea_BVSinWFy_9W2AH7NI55Ur0XO4Tr-a?usp=sharing) 8 | 9 | # Requiremnts 10 | Create Environments 11 | ``` 12 | conda create -n bn_tts python==3.8 13 | conda activate bn_tts 14 | ``` 15 | Install require modules 16 | 17 | ``` 18 | pip install -r requirements.txt 19 | ``` 20 | # Dataset 21 | 22 | Bangla Speech corpus prepared by the Indic TTS Team of IIT Madras. I've downsampled the dataset down to 22050 and converted the raw iitm annotation format into ljspeech format for training several TTS models for bangla. 23 | in this dataset, i am sharing the final processed dataset for Bangla TTS along with trained best models weight files. please cite this paper: https://aclanthology.org/2020.lrec-1.789.pdf if you are using the dataset in your research works. 24 | 25 | Dataset link: https://www.kaggle.com/datasets/mobassir/comprehensive-bangla-tts 26 | 27 | 28 | 29 | # Training 30 | 31 | Training code [jupyter](train_bangla_vits.ipynb) 32 | 33 | 34 | # Single Test[Inference] 35 | 36 | For the single testing run, 37 | 38 | ``` 39 | python inference.py 40 | ``` 41 | or 42 | 43 | Inference on [jupyter notebook](inference.ipynb) 44 | 45 | [huggingface](https://huggingface.co/bangla-speech-processing/bangla_tts_female) 46 | 47 | 48 | # End Point API 49 | For the API testing, 50 | 51 | ### 1. Run the ```app.py``` script 52 | ``` 53 | python app.py 54 | 55 | ``` 56 | ### 2. 

# End Point API
For API testing,

### 1. Run the ```app.py``` script
```
python app.py
```
### 2. Testing using a Python request
Write a .py script with the code below and run it; the synthesized .wav file will be saved into the `logs` directory.

```
import os
import time

import requests

username = "saiful"
text = "আপনি কেমন আছেন।"
log_dir = "logs"
filename = "audio_file_" + str(time.strftime("%Y%m%d-%H%M%S")) + ".wav"
os.makedirs(log_dir, exist_ok=True)

file_dir = os.path.join(log_dir, filename)
# use the host/IP and port of the machine where the API is running
url = 'http://192.168.1.154:8009/tts'

payload = {
    "text": text,
    "sender": username,
    "save_dir": file_dir
}
headers = {'content-type': 'application/json'}
result = requests.post(url, json=payload, headers=headers)
print(result.json())
```

### 3. If you want to use Postman, skip step 2

![alt text](image/api_for_tts.png)
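
Alternatively, you can hit the endpoint from the command line with curl (adjust the host, port, and `save_dir`; `save_dir` is the path where the server process writes the wav file):

```
curl -X POST http://192.168.1.154:8009/tts \
     -H "Content-Type: application/json" \
     -d '{"text": "আপনি কেমন আছেন।", "sender": "saiful", "save_dir": "logs/test.wav"}'
```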

# Reference

1. https://aclanthology.org/2020.lrec-1.789.pdf
2. https://arxiv.org/pdf/2106.06103.pdf
3. https://arxiv.org/abs/2005.11129
4. https://aclanthology.org/2020.emnlp-main.207.pdf
5. https://github.com/mobassir94
--------------------------------------------------------------------------------
/inference.py:
--------------------------------------------------------------------------------
"""
@Author : Saiful Islam
@Email : saifulbrur79@gmail.com
"""
import re
import os
import torch
import bangla
import soundfile as sf
from bnnumerizer import numerize
from bnunicodenormalizer import Normalizer
from modules.model_download import download_file
from modules.synthsizer import Synthesizer

bnorm = Normalizer()
root_dir = os.getcwd()

# use the GPU only when one is actually available
use_cuda = torch.cuda.is_available()
DEBUG_SAVE = True

# select the pretrained model: female or male
DEBUG_GENDER = ["female", "male"]
GENDER = DEBUG_GENDER[1]

def model_loading(model_path=None, config_path=None):
    tts_bn_model = Synthesizer(
        model_path,
        config_path,
        use_cuda=use_cuda
    )
    return tts_bn_model

def normalize(sen):
    _words = [bnorm(word)['normalized'] for word in sen.split()]
    return " ".join([word for word in _words if word is not None])

def bangla_tts(model: object = None, text="আমি বাংলা শিখেছি", is_male=True, is_e2e_vits=True, log_dir=None):
    '''
    params:
        model : a loaded Synthesizer instance (see `model_loading`).
        text : input Bangla text that needs to be synthesized.
        is_male : currently unused; the voice is determined by the loaded checkpoint.
        is_e2e_vits : currently unused; the released checkpoints are end-to-end VITS models.
        log_dir : optional output .wav path; if set, the audio is also saved there.
    returns:
        numpy array containing the synthesized waveform at 22050 Hz.
    '''
    if text[-1] != '।':
        text += '।'
    # convert English digits to Bangla digits
    res = re.search('[0-9]', text)
    if res is not None:
        text = bangla.convert_english_digit_to_bangla_digit(text)

    # replace ':' in between two Bangla digits with ' এর '
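    # e.g. "১০:৩০" becomes "১০ এর ৩০"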
    pattern = r"[০১২৩৪৫৬৭৮৯]:[০১২৩৪৫৬৭৮৯]"
    matches = re.findall(pattern, text)
    for m in matches:
        r = m.replace(":", " এর ")
        text = text.replace(m, r)
    try:
        text = numerize(text)
    except Exception:
        pass
    text = normalize(text)
    sentence_enders = re.compile('[।!?]')
    sentences = sentence_enders.split(str(text))
    audio_list = []
    for sen in sentences:
        if not sen:
            continue
        # synthesize each sentence separately, re-appending the danda
        audio_list.append(torch.as_tensor(model.tts(sen + '।')))
    audio = torch.cat(audio_list)
    numpy_audio = audio.detach().cpu().numpy()
    if log_dir is not None:
        os.makedirs(os.path.dirname(log_dir) or ".", exist_ok=True)
        sf.write(log_dir, numpy_audio, 22050)
    return numpy_audio

if __name__ == "__main__":

    text = 'রওশন এরশাদের সঙ্গে দেখা করলেন জিএম কাদের।'
    fileName = 'logs/test_male.wav'

    print("Model Downloading : .......")
    model_path, config_path = download_file(
        root_dir=root_dir,
        output_path="models",
        gender=GENDER
    )
    print("Done")

    tts_bn_model = model_loading(
        model_path=model_path,
        config_path=config_path
    )
    audio = bangla_tts(
        model=tts_bn_model,
        text=text,
        is_male=False,
        is_e2e_vits=True
    )

    if DEBUG_SAVE:
        os.makedirs("logs", exist_ok=True)
        sf.write(fileName, audio, 22050)
        # from IPython.display import Audio; Audio(fileName, autoplay=True)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
absl-py==1.4.0
accelerate==0.21.0
aiohttp==3.8.5
aiosignal==1.3.1
anyascii==0.3.2
anyio
appdirs==1.4.4
argon2-cffi
argon2-cffi-bindings
asttokens
async-timeout==4.0.2
attrs==23.1.0
audioread==3.0.0
Babel==2.12.1
backcall
backports.zoneinfo==0.2.1
bangla==0.0.2
beautifulsoup4
bleach
blinker==1.6.2
bnnumerizer==0.0.2
bnunicodenormalizer==0.1.1
boltons==23.0.0
cachetools==5.3.1
certifi==2023.7.22
cffi
charset-normalizer==3.2.0
clean-fid==0.1.35
click==8.1.6
clip-anytorch==2.5.2
comm
contourpy==1.1.0
coqpit==0.0.17
cycler==0.11.0
Cython==0.29.28
dateparser==1.1.8
debugpy
decorator
defusedxml
docker-pycreds==0.4.0
docopt==0.6.2
einops==0.6.1
entrypoints
executing
fastjsonschema
filelock==3.12.2
Flask==2.3.2
fonttools==4.42.0
frozenlist==1.4.0
fsspec==2023.6.0
ftfy==6.1.1
g2pkk==0.1.2
gitdb==4.0.10
GitPython==3.1.32
google-auth==2.22.0
google-auth-oauthlib==1.0.0
grpcio==1.56.2
gruut==2.2.3
gruut-ipa==0.13.0
gruut-lang-de==2.0.0
gruut-lang-en==2.0.0
gruut-lang-es==2.0.0
gruut-lang-fr==2.0.2
huggingface-hub==0.16.4
idna
imageio==2.31.1
importlib-metadata==6.8.0
importlib-resources==6.0.0
inflect==5.6.0
ipykernel
ipython
ipython-genutils
itsdangerous==2.1.2
jamo==0.4.1
jedi
jieba==0.42.1
Jinja2
joblib==1.3.1
jsonlines==1.2.0
jsonmerge==1.9.2
jsonschema==4.18.6
jsonschema-specifications==2023.7.1
jupyter-server
jupyter_client
jupyter_core
jupyterlab-pygments
k-diffusion==0.0.15
kiwisolver==1.4.4
kornia==0.7.0
lazy_loader==0.3
librosa==0.10.0.post2
llvmlite==0.38.1
lxml
Markdown==3.4.4
MarkupSafe==2.1.3
matplotlib==3.7.2
matplotlib-inline
mecab-python3==1.0.5
mistune==0.8.4
msgpack==1.0.5
multidict==6.0.4
nb-conda==2.2.1
nb-conda-kernels
nbclassic
nbclient
nbconvert
nbformat
nest-asyncio
networkx==2.8.8
nltk==3.8.1
notebook
notebook_shim
num2words==0.5.12
numba==0.55.1
numpy==1.21.6
oauthlib==3.2.2
packaging==23.1
pandas==2.0.3
pandocfilters
parso
pathtools==0.1.2
pexpect
pickleshare
Pillow==10.0.0
pkgutil_resolve_name
platformdirs
pooch==1.6.0
prometheus-client
prompt-toolkit
protobuf==3.19.6
psutil==5.9.5
ptyprocess
pure-eval
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser
pydub==0.25.1
Pygments
pynndescent==0.5.10
pyparsing==3.0.9
pypinyin==0.49.0
pyrsistent
python-crfsuite==0.9.9
python-dateutil
pytz==2023.3
PyWavelets==1.4.1
PyYAML==6.0.1
pyzmq
referencing==0.30.2
regex==2023.6.3
requests==2.31.0
requests-oauthlib==1.3.1
resize-right==0.0.2
rpds-py==0.9.2
rsa==4.9
safetensors==0.3.1
scikit-image==0.21.0
scikit-learn==1.3.0
scipy==1.10.1
Send2Trash
sentry-sdk==1.29.2
setproctitle==1.3.2
six
smmap==5.0.0
sniffio
soundfile==0.12.1
soupsieve
soxr==0.3.5
stack-data
tensorboard==2.13.0
tensorboard-data-server==0.7.1
tensorboardX==2.6.2
terminado
threadpoolctl==3.2.0
tifffile==2023.7.10
tinycss2
tokenizers==0.13.3
torch==1.12.1+cu113
torchaudio==0.12.1+cu113
torchdiffeq==0.2.3
torchsde==0.2.5
torchvision==0.13.1+cu113
tornado
tqdm==4.65.0
trainer==0.0.20
traitlets
trampoline==0.1.2
transformers==4.31.0
TTS==0.14.3
typing_extensions
tzdata==2023.3
tzlocal==5.0.1
umap-learn==0.5.1
unidic-lite==1.0.8
urllib3==1.26.16
wandb==0.15.8
wcwidth==0.2.6
webencodings==0.5.1
websocket-client
Werkzeug==2.3.6
wget==3.2
yarl==1.9.2
zipp==3.16.2
--------------------------------------------------------------------------------
/modules/tokenizer.py:
--------------------------------------------------------------------------------

# from https://github.com/coqui-ai/TTS/blob/dev/TTS/tts/utils/text/tokenizer.py

from typing import Callable, Dict, List, Union

from TTS.tts.utils.text import cleaners
from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes
from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
from TTS.utils.generic_utils import get_import_path, import_class


class TTSTokenizer:
    """🐸TTS tokenizer to convert input characters to token IDs and back.
    Token IDs for OOV chars are discarded but those are stored in `self.not_found_characters` for later.
    Args:
        use_phonemes (bool):
            Whether to use phonemes instead of characters. Defaults to False.
        characters (Characters):
            A Characters object to use for character-to-ID and ID-to-character mappings.
        text_cleaner (callable):
            A function to pre-process the text before tokenization and phonemization. Defaults to None.
        phonemizer (Phonemizer):
            A phonemizer object or a dict that maps language codes to phonemizer objects. Defaults to None.
    Example:
        >>> from TTS.tts.utils.text.tokenizer import TTSTokenizer
        >>> tokenizer = TTSTokenizer(use_phonemes=False, characters=Graphemes())
        >>> text = "Hello world!"
        >>> ids = tokenizer.text_to_ids(text)
        >>> text_hat = tokenizer.ids_to_text(ids)
        >>> assert text == text_hat
    """

    def __init__(
        self,
        use_phonemes=False,
        text_cleaner: Callable = None,
        characters: "BaseCharacters" = None,
        phonemizer: Union["Phonemizer", Dict] = None,
        add_blank: bool = False,
        use_eos_bos=False,
    ):
        self.text_cleaner = text_cleaner
        self.use_phonemes = use_phonemes
        self.add_blank = add_blank
        self.use_eos_bos = use_eos_bos
        self.characters = characters
        self.not_found_characters = []
        self.phonemizer = phonemizer

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, new_characters):
        self._characters = new_characters
        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None

    def encode(self, text: str) -> List[int]:
        """Encodes a string of text as a sequence of IDs."""
        token_ids = []
        for char in text:
            try:
                idx = self.characters.char_to_id(char)
                token_ids.append(idx)
            except KeyError:
                # discard but store not found characters
                if char not in self.not_found_characters:
                    self.not_found_characters.append(char)
                    # print(text)
                    # print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
        return token_ids

    def decode(self, token_ids: List[int]) -> str:
        """Decodes a sequence of IDs to a string of text."""
        text = ""
        for token_id in token_ids:
            text += self.characters.id_to_char(token_id)
        return text

    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
        """Converts a string of text to a sequence of token IDs.
        Args:
            text(str):
                The text to convert to token IDs.
            language(str):
                The language code of the text. Defaults to None.
        TODO:
            - Add support for language-specific processing.
        1. Text normalization
        2. Phonemization (if use_phonemes is True)
        3. Add blank char between characters
        4. Add BOS and EOS characters
        5. Text to token IDs
        """
        # TODO: text cleaner should pick the right routine based on the language
        if self.text_cleaner is not None:
            text = self.text_cleaner(text)
        if self.use_phonemes:
            text = self.phonemizer.phonemize(text, separator="")
        if self.add_blank:
            text = self.intersperse_blank_char(text, True)
        if self.use_eos_bos:
            text = self.pad_with_bos_eos(text)
        return self.encode(text)

    def ids_to_text(self, id_sequence: List[int]) -> str:
        """Converts a sequence of token IDs to a string of text."""
        return self.decode(id_sequence)

    def pad_with_bos_eos(self, char_sequence: List[str]):
        """Pads a sequence with the special BOS and EOS characters."""
        return [self.characters.bos] + list(char_sequence) + [self.characters.eos]

    def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
        """Intersperses the blank character between characters in a sequence.
        Use the ```blank``` character if defined else use the ```pad``` character.
        """
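        # e.g. ["a", "b", "c"] with blank "<B>" gives ["<B>", "a", "<B>", "b", "<B>", "c", "<B>"]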
        char_to_use = self.characters.blank if use_blank_char else self.characters.pad
        result = [char_to_use] * (len(char_sequence) * 2 + 1)
        result[1::2] = char_sequence
        return result

    def print_logs(self, level: int = 0):
        indent = "\t" * level
        print(f"{indent}| > add_blank: {self.add_blank}")
        print(f"{indent}| > use_eos_bos: {self.use_eos_bos}")
        print(f"{indent}| > use_phonemes: {self.use_phonemes}")
        if self.use_phonemes:
            print(f"{indent}| > phonemizer:")
            self.phonemizer.print_logs(level + 1)
        if len(self.not_found_characters) > 0:
            print(f"{indent}| > {len(self.not_found_characters)} not found characters:")
            for char in self.not_found_characters:
                print(f"{indent}| > {char}")

    @staticmethod
    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
        """Init Tokenizer object from config
        Args:
            config (Coqpit): Coqpit model config.
            characters (BaseCharacters): Defines the model character set. If not set, use the default options based on
                the config values. Defaults to None.
        """
        # init cleaners
        text_cleaner = None
        if isinstance(config.text_cleaner, (str, list)):
            text_cleaner = getattr(cleaners, config.text_cleaner)

        # init characters
        if characters is None:
            # set characters based on defined characters class
            if config.characters and config.characters.characters_class:
                CharactersClass = import_class(config.characters.characters_class)
                characters, new_config = CharactersClass.init_from_config(config)
            # set characters based on config
            else:
                if config.use_phonemes:
                    # init phoneme set
                    characters, new_config = IPAPhonemes().init_from_config(config)
                else:
                    # init character set
                    characters, new_config = Graphemes().init_from_config(config)

        else:
            characters, new_config = characters.init_from_config(config)

        # set characters class
        new_config.characters.characters_class = get_import_path(characters)

        # init phonemizer
        phonemizer = None
        if config.use_phonemes:
            phonemizer_kwargs = {"language": config.phoneme_language}

            if "phonemizer" in config and config.phonemizer:
                phonemizer = get_phonemizer_by_name(config.phonemizer, **phonemizer_kwargs)
            else:
                try:
                    phonemizer = get_phonemizer_by_name(
                        DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
                    )
                    new_config.phonemizer = phonemizer.name()
                except KeyError as e:
                    raise ValueError(
                        f"""No phonemizer found for language {config.phoneme_language}.
                        You may need to install a third party library for this language."""
                    ) from e

        return (
            TTSTokenizer(
                config.use_phonemes, text_cleaner, characters, phonemizer, config.add_blank, config.enable_eos_bos_chars
            ),
            new_config,
        )
--------------------------------------------------------------------------------
/modules/synthsizer.py:
--------------------------------------------------------------------------------
import time
from typing import List

import numpy as np
import pysbd
import torch

from TTS.config import load_config
from TTS.tts.models import setup_model as setup_tts_model
from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence
from TTS.utils.audio import AudioProcessor
from TTS.vocoder.models import setup_model as setup_vocoder_model
from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input
# adapted from https://github.com/coqui-ai/TTS/blob/dev/TTS/utils/synthesizer.py


class Synthesizer(object):
    def __init__(
        self,
        tts_checkpoint: str,
        tts_config_path: str,
        tts_speakers_file: str = "",
        tts_languages_file: str = "",
        vocoder_checkpoint: str = "",
        vocoder_config: str = "",
        encoder_checkpoint: str = "",
        encoder_config: str = "",
        use_cuda: bool = False,
    ) -> None:
        """General 🐸 TTS interface for inference. It takes a tts and a vocoder
        model and synthesizes speech from the provided text.
        The text is divided into a list of sentences using `pysbd` and speech is
        synthesized for each sentence separately.
        If you have certain special characters in your text, you need to handle
        them before providing the text to Synthesizer.
        TODO: set the segmenter based on the source language
        Args:
            tts_checkpoint (str): path to the tts model file.
            tts_config_path (str): path to the tts config file.
            vocoder_checkpoint (str, optional): path to the vocoder model file. Defaults to None.
            vocoder_config (str, optional): path to the vocoder config file. Defaults to None.
            encoder_checkpoint (str, optional): path to the speaker encoder model file. Defaults to `""`.
            encoder_config (str, optional): path to the speaker encoder config file. Defaults to `""`.
            use_cuda (bool, optional): enable/disable cuda. Defaults to False.
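        Example (illustrative paths; any checkpoint/config pair downloaded by
        `modules/model_download.py` works):
            >>> synth = Synthesizer("models/female/pytorch_model.pth", "models/female/config.json")
            >>> wav = synth.tts("আমি বাংলা শিখেছি।")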
        """
        self.tts_checkpoint = tts_checkpoint
        self.tts_config_path = tts_config_path
        self.tts_speakers_file = tts_speakers_file
        self.tts_languages_file = tts_languages_file
        self.vocoder_checkpoint = vocoder_checkpoint
        self.vocoder_config = vocoder_config
        self.encoder_checkpoint = encoder_checkpoint
        self.encoder_config = encoder_config
        self.use_cuda = use_cuda

        self.tts_model = None
        self.vocoder_model = None
        self.speaker_manager = None
        self.num_speakers = 0
        self.tts_speakers = {}
        self.language_manager = None
        self.num_languages = 0
        self.tts_languages = {}
        self.d_vector_dim = 0
        self.seg = self._get_segmenter("en")
        self.use_cuda = use_cuda

        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self._load_tts(tts_checkpoint, tts_config_path, use_cuda)
        self.output_sample_rate = self.tts_config.audio["sample_rate"]
        if vocoder_checkpoint:
            self._load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda)
            self.output_sample_rate = self.vocoder_config.audio["sample_rate"]

    @staticmethod
    def _get_segmenter(lang: str):
        """Get the sentence segmenter for the given language.
        Args:
            lang (str): target language code.
        Returns:
            [type]: [description]
        """
        return pysbd.Segmenter(language=lang, clean=True)

    def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None:
        """Load the TTS model.
        1. Load the model config.
        2. Init the model from the config.
        3. Load the model weights.
        4. Move the model to the GPU if CUDA is enabled.
        5. Init the speaker manager in the model.
        Args:
            tts_checkpoint (str): path to the model checkpoint.
            tts_config_path (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        # pylint: disable=global-statement
        self.tts_config = load_config(tts_config_path)
        if self.tts_config["use_phonemes"] and self.tts_config["phonemizer"] is None:
            raise ValueError("Phonemizer is not defined in the TTS config.")

        self.tts_model = setup_tts_model(config=self.tts_config)

        if not self.encoder_checkpoint:
            self._set_speaker_encoder_paths_from_tts_config()

        self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True)
        if use_cuda:
            self.tts_model.cuda()

        if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"):
            self.tts_model.speaker_manager.init_encoder(self.encoder_checkpoint, self.encoder_config, use_cuda)

    def _set_speaker_encoder_paths_from_tts_config(self):
        """Set the encoder paths from the tts model config for models with speaker encoders."""
        if hasattr(self.tts_config, "model_args") and hasattr(
            self.tts_config.model_args, "speaker_encoder_config_path"
        ):
            self.encoder_checkpoint = self.tts_config.model_args.speaker_encoder_model_path
            self.encoder_config = self.tts_config.model_args.speaker_encoder_config_path

    def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None:
        """Load the vocoder model.
        1. Load the vocoder config.
        2. Init the AudioProcessor for the vocoder.
        3. Init the vocoder model from the config.
        4. Move the model to the GPU if CUDA is enabled.
        Args:
            model_file (str): path to the model checkpoint.
            model_config (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        self.vocoder_config = load_config(model_config)
        self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config.audio)
        self.vocoder_model = setup_vocoder_model(self.vocoder_config)
        self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True)
        if use_cuda:
            self.vocoder_model.cuda()

    def split_into_sentences(self, text) -> List[str]:
        """Split the given text into sentences.
        Args:
            text (str): input text in string format.
        Returns:
            List[str]: list of sentences.
        """
        return self.seg.segment(text)

    def save_wav(self, wav: List[int], path: str) -> None:
        """Save the waveform as a file.
        Args:
            wav (List[int]): waveform as a list of values.
            path (str): output path to save the waveform.
        """
        wav = np.array(wav)
        self.tts_model.ap.save_wav(wav, path, self.output_sample_rate)

    def tts(
        self,
        text: str = "",
        speaker_name: str = "",
        language_name: str = "",
        speaker_wav=None,
        style_wav=None,
        style_text=None,
        reference_wav=None,
        reference_speaker_name=None,
    ) -> List[int]:
        """🐸 TTS magic. Run all the models and generate speech.
        Args:
            text (str): input text.
            speaker_name (str, optional): speaker id for multi-speaker models. Defaults to "".
            language_name (str, optional): language id for multi-language models. Defaults to "".
            speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None.
            style_wav ([type], optional): style waveform for GST. Defaults to None.
            style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None.
            reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
            reference_speaker_name ([type], optional): speaker id of the reference waveform. Defaults to None.
        Returns:
            List[int]: [description]
        """
        start_time = time.time()
        wavs = []

        if not text and not reference_wav:
            raise ValueError(
                "You need to define either `text` (for synthesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
            )

        if text:
            sens = self.split_into_sentences(text)
            # print(" > Text split into sentences.")
            # print(sens)

        # handle multi-speaker
        speaker_embedding = None
        speaker_id = None
        if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
            if speaker_name and isinstance(speaker_name, str):
                if self.tts_config.use_d_vector_file:
                    # get the average speaker embedding from the saved d_vectors.
                    speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
                        speaker_name, num_samples=None, randomize=False
                    )
                    speaker_embedding = np.array(speaker_embedding)[None, :]  # [1 x embedding_dim]
                else:
                    # get speaker idx from the speaker name
                    speaker_id = self.tts_model.speaker_manager.name_to_id[speaker_name]

            elif not speaker_name and not speaker_wav:
                raise ValueError(
                    " [!] It looks like you are using a multi-speaker model. "
                    "You need to define either a `speaker_name` or a `speaker_wav` to use a multi-speaker model."
                )
            else:
                speaker_embedding = None
        else:
            if speaker_name:
                raise ValueError(
                    f" [!] Missing speakers.json file path for selecting speaker {speaker_name}."
                    "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. "
                )

        # handle multi-lingual
        language_id = None
        if self.tts_languages_file or (
            hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None
        ):
            if language_name and isinstance(language_name, str):
                language_id = self.tts_model.language_manager.name_to_id[language_name]

            elif not language_name:
                raise ValueError(
                    " [!] It looks like you are using a multi-lingual model. "
                    "You need to define either a `language_name` or a `style_wav` to use a multi-lingual model."
                )

            else:
                raise ValueError(
                    f" [!] Missing language_ids.json file path for selecting language {language_name}."
                    "Define path for language_ids.json if it is a multi-lingual model or remove defined language idx. "
                )

        # compute a new d_vector from the given clip.
        if speaker_wav is not None:
            speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav)

        use_gl = self.vocoder_model is None

        if not reference_wav:
            for sen in sens:
                # synthesize voice
                outputs = synthesis(
                    model=self.tts_model,
                    text=sen,
                    CONFIG=self.tts_config,
                    use_cuda=self.use_cuda,
                    speaker_id=speaker_id,
                    style_wav=style_wav,
                    style_text=style_text,
                    use_griffin_lim=use_gl,
                    d_vector=speaker_embedding,
                    language_id=language_id,
                )
                waveform = outputs["wav"]
                mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()
                if not use_gl:
                    # denormalize tts output based on tts audio config
                    mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T
                    device_type = "cuda" if self.use_cuda else "cpu"
                    # renormalize spectrogram based on vocoder config
                    vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
                    # compute scale factor for possible sample rate mismatch
                    scale_factor = [
                        1,
                        self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate,
                    ]
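                    # e.g. a 44100 Hz vocoder over a 22050 Hz TTS model gives scale_factor [1, 2.0]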
                    if scale_factor[1] != 1:
                        print(" > interpolating tts model output.")
                        vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
                    else:
                        vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
                    # run vocoder model
                    # [1, T, C]
                    waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
                if self.use_cuda and not use_gl:
                    waveform = waveform.cpu()
                if not use_gl:
                    waveform = waveform.numpy()
                waveform = waveform.squeeze()

                # trim silence
                if "do_trim_silence" in self.tts_config.audio and self.tts_config.audio["do_trim_silence"]:
                    waveform = trim_silence(waveform, self.tts_model.ap)

                wavs += list(waveform)
                wavs += [0] * 10000
        else:
            # get the speaker embedding or speaker id for the reference wav file
            reference_speaker_embedding = None
            reference_speaker_id = None
            if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
                if reference_speaker_name and isinstance(reference_speaker_name, str):
                    if self.tts_config.use_d_vector_file:
                        # get the speaker embedding from the saved d_vectors.
                        reference_speaker_embedding = self.tts_model.speaker_manager.get_embeddings_by_name(
                            reference_speaker_name
                        )[0]
                        reference_speaker_embedding = np.array(reference_speaker_embedding)[
                            None, :
                        ]  # [1 x embedding_dim]
                    else:
                        # get speaker idx from the speaker name
                        reference_speaker_id = self.tts_model.speaker_manager.name_to_id[reference_speaker_name]
                else:
                    reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(
                        reference_wav
                    )
            outputs = transfer_voice(
                model=self.tts_model,
                CONFIG=self.tts_config,
                use_cuda=self.use_cuda,
                reference_wav=reference_wav,
                speaker_id=speaker_id,
                d_vector=speaker_embedding,
                use_griffin_lim=use_gl,
                reference_speaker_id=reference_speaker_id,
                reference_d_vector=reference_speaker_embedding,
            )
            waveform = outputs
            if not use_gl:
                mel_postnet_spec = outputs[0].detach().cpu().numpy()
                # denormalize tts output based on tts audio config
                mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T
                device_type = "cuda" if self.use_cuda else "cpu"
                # renormalize spectrogram based on vocoder config
                vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
                # compute scale factor for possible sample rate mismatch
                scale_factor = [
                    1,
                    self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate,
                ]
                if scale_factor[1] != 1:
                    print(" > interpolating tts model output.")
                    vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
                else:
                    vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
                # run vocoder model
                # [1, T, C]
                waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
            if self.use_cuda:
                waveform = waveform.cpu()
            if not use_gl:
                waveform = waveform.numpy()
            wavs = waveform.squeeze()

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
        # print(f" > Processing time: {process_time}")
        # print(f" > Real-time factor: {process_time / audio_time}")
        return wavs
--------------------------------------------------------------------------------