├── .gitignore
├── LICENSE.txt
├── Makefile
├── README.md
├── TeraTTS
│   ├── __init__.py
│   ├── infer_onnx.py
│   └── tokenizer
│       ├── __init__.py
│       └── g2p
│           ├── __init__.py
│           ├── g2p.py
│           └── tokenizer.py
├── example.py
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
*.py[cod]
*.pyc
*.so
*.wav

build/
dist/
RUTTS.egg-info
model/
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
Copyright 2023 TeraSpace

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
git:
	git add .
	git commit -m "update"
	git push -u -f origin main

pypi:
	rm -r ./build
	rm -r ./dist
	rm -r RUTTS.egg-info
	python setup.py sdist bdist_wheel
	twine upload dist/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Russian TTS inference
# Installation
You can install the package with pip:
```
pip install TeraTTS
```
You can also install it from Git:
```
pip install -e git+https://github.com/Tera2Space/RUTTS#egg=TeraTTS
```
# Troubleshooting
1) If you get an **installation error** on Windows, simply **download Visual Studio [here](https://visualstudio.microsoft.com/ru/thank-you-downloading-visual-studio/?sku=Community&channel=Release&version=VS2022&source=VSLandingPage&cid=2030&passive=false)** and tick the **Desktop development with C++** workload during installation.

2) If something **does not work after installation**, **make sure you have the latest version of the module** (uninstall and reinstall) and **that the model names you use exist at** https://huggingface.co/TeraTTS.

3) If nothing helps, **ask for help at https://t.me/teraspace_chat**.
# Usage

```python
text = "Привет, мир!"

from TeraTTS import TTS

# Optional: text preprocessing (improves quality)
from ruaccent import RUAccent
accentizer = RUAccent()

# Load the accentuation models and dictionaries
accentizer.load(omograph_model_size='turbo', use_dictionary=True)

# Process the text, adding stress marks and the letter ё
text = accentizer.process_all(text)
print(f"Text with stress marks and ё: {text}")


# Note: all models, including the GLADOS model, can be found at https://huggingface.co/TeraTTS
tts = TTS("TeraTTS/natasha-g2p-vits", add_time_to_end=1.0, tokenizer_load_dict=True)  # 'add_time_to_end' adjusts the trailing audio duration; 'tokenizer_load_dict' can be disabled if you use RUAccent


# 'length_scale' can be used to slow the audio down for better quality (the code default is 1.2; it is set explicitly here as an example)
audio = tts(text, length_scale=1.1)  # Generate the audio. Stress marks can be added with '+'
tts.play_audio(audio)  # Play the generated audio
tts.save_wav(audio, "./test.wav")  # Save the audio to a file


# Generate the audio and play it immediately
tts(text, play=True, length_scale=1.1)

```
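
The `TTS` constructor in `TeraTTS/infer_onnx.py` also exposes `preprocess_nums` (digits are expanded to words with num2words) and `preprocess_trans` (Latin characters are transliterated into Cyrillic). A minimal sketch without the optional RUAccent preprocessing (the model is downloaded from https://huggingface.co/TeraTTS on first use; the output path here is just an example):

```python
from TeraTTS import TTS

# Without RUAccent, keep tokenizer_load_dict=True so the built-in stress dictionary is used
tts = TTS("TeraTTS/natasha-g2p-vits", add_time_to_end=1.0, tokenizer_load_dict=True)

audio = tts("Привет, мир!", length_scale=1.1)
tts.save_wav(audio, "./demo.wav")  # "./demo.wav" is an illustrative path
```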
--------------------------------------------------------------------------------
/TeraTTS/__init__.py:
--------------------------------------------------------------------------------
from .infer_onnx import TTS
from .tokenizer import TokenizerG2P
--------------------------------------------------------------------------------
/TeraTTS/infer_onnx.py:
--------------------------------------------------------------------------------
import scipy.io.wavfile
import os
import sounddevice as sd
import onnxruntime
import numpy as np
from huggingface_hub import snapshot_download
from num2words import num2words
import re
from transliterate import translit
from .tokenizer import TokenizerG2P

class TTS:
    def __init__(self, model_name: str, save_path: str = "./model", add_time_to_end: float = 1.0, preprocess_nums=True, preprocess_trans=True, tokenizer_load_dict=True) -> None:
        if not os.path.exists(save_path):
            os.mkdir(save_path)

        model_dir = os.path.join(save_path, model_name)

        if not os.path.exists(model_dir):
            snapshot_download(repo_id=model_name,
                              allow_patterns=["*.txt", "*.onnx", "*.json"],
                              local_dir=model_dir,
                              local_dir_use_symlinks=False
                              )

        self.model = onnxruntime.InferenceSession(os.path.join(model_dir, "exported/model.onnx"), providers=['CPUExecutionProvider'])
        self.preprocess_nums = preprocess_nums
        self.preprocess_trans = preprocess_trans

        self.tokenizer = TokenizerG2P(os.path.join(model_dir, "exported"), load_dict=tokenizer_load_dict)

        self.add_time_to_end = add_time_to_end


    def _add_silent(self, audio, silence_duration: float = 1.0, sample_rate: int = 22050):
        num_samples_silence = int(sample_rate * silence_duration)
        silence_array = np.zeros(num_samples_silence, dtype=np.float32)
        audio_with_silence = np.concatenate((audio, silence_array), axis=0)
        return audio_with_silence


    def save_wav(self, audio, path: str):
        '''save audio to wav'''
        scipy.io.wavfile.write(path, 22050, audio)


    def play_audio(self, audio):
        sd.play(audio, 22050, blocking=True)


    def _intersperse(self, lst, item):
        result = [item] * (len(lst) * 2 + 1)
        result[1::2] = lst
        return result
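
    # Note on _intersperse: it inserts a padding token (id 0) between every
    # pair of phoneme ids and at both ends, mirroring the "add_blank"
    # convention of VITS-style models. Illustration with made-up ids:
    # _intersperse([5, 3, 7], 0) -> [0, 5, 0, 3, 0, 7, 0].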

    def _get_seq(self, text):
        phoneme_ids = self.tokenizer._get_seq(text)
        phoneme_ids_inter = self._intersperse(phoneme_ids, 0)
        return phoneme_ids_inter

    def _num2wordsshor(self, match):
        match = match.group()
        ret = num2words(match, lang='ru')
        return ret

    def __call__(self, text: str, play=False, length_scale=1.2):
        if self.preprocess_trans:
            text = translit(text, 'ru')

        if self.preprocess_nums:
            text = re.sub(r'\d+', self._num2wordsshor, text)
        phoneme_ids = self._get_seq(text)
        text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        text_lengths = np.array([text.shape[1]], dtype=np.int64)
        scales = np.array(
            [0.667, length_scale, 0.8],
            dtype=np.float32,
        )
        audio = self.model.run(
            None,
            {
                "input": text,
                "input_lengths": text_lengths,
                "scales": scales,
                "sid": None,
            },
        )[0][0, 0][0]
        audio = self._add_silent(audio, silence_duration=self.add_time_to_end)
        if play:
            self.play_audio(audio)
        return audio
--------------------------------------------------------------------------------
/TeraTTS/tokenizer/__init__.py:
--------------------------------------------------------------------------------
from .g2p import Tokenizer as TokenizerG2P
--------------------------------------------------------------------------------
/TeraTTS/tokenizer/g2p/__init__.py:
--------------------------------------------------------------------------------
from .tokenizer import Tokenizer
--------------------------------------------------------------------------------
/TeraTTS/tokenizer/g2p/g2p.py:
--------------------------------------------------------------------------------

softletters = set(u"яёюиье")
startsyl = set(u"#ъьаяоёуюэеиы-")
others = set(["#", "+", "-", u"ь", u"ъ"])

softhard_cons = {
    u"б": u"b",
    u"в": u"v",
    u"г": u"g",
    u"Г": u"g",
    u"д": u"d",
    u"з": u"z",
    u"к": u"k",
    u"л": u"l",
    u"м": u"m",
    u"н": u"n",
    u"п": u"p",
    u"р": u"r",
    u"с": u"s",
    u"т": u"t",
    u"ф": u"f",
    u"х": u"h"
}

other_cons = {
    u"ж": u"zh",
    u"ц": u"c",
    u"ч": u"ch",
    u"ш": u"sh",
    u"щ": u"sch",
    u"й": u"j"
}

vowels = {
    u"а": u"a",
    u"я": u"a",
    u"у": u"u",
    u"ю": u"u",
    u"о": u"o",
    u"ё": u"o",
    u"э": u"e",
    u"е": u"e",
    u"и": u"i",
    u"ы": u"y",
}

def pallatize(phones):
    for i, phone in enumerate(phones[:-1]):
        if phone[0] in softhard_cons:
            if phones[i + 1][0] in softletters:
                phones[i] = (softhard_cons[phone[0]] + "j", 0)
            else:
                phones[i] = (softhard_cons[phone[0]], 0)
        if phone[0] in other_cons:
            phones[i] = (other_cons[phone[0]], 0)

def convert_vowels(phones):
    new_phones = []
    prev = ""
    for phone in phones:
        if prev in startsyl:
            if phone[0] in set(u"яюеё"):
                new_phones.append("j")
        if phone[0] in vowels:
            new_phones.append(vowels[phone[0]] + str(phone[1]))
        else:
            new_phones.append(phone[0])
        prev = phone[0]

    return new_phones
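
# The convert() function below ties these helpers together: it wraps the
# stressed word in '#' markers, records which letter carried the '+' stress
# mark, palatalizes consonants via pallatize(), rewrites vowels with their
# stress digit via convert_vowels(), and finally drops the helper symbols.
# Hand-traced illustration (not produced by running the code):
# convert("прив+ет") should give "p rj i0 vj e1 t".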

def convert(stressword):
    phones = ("#" + stressword + "#")


    # Assign stress marks
    stress_phones = []
    stress = 0
    for phone in phones:
        if phone == "+":
            stress = 1
        else:
            stress_phones.append((phone, stress))
            stress = 0

    # Palatalize consonants
    pallatize(stress_phones)

    # Convert vowels
    phones = convert_vowels(stress_phones)

    # Filter out helper symbols
    phones = [x for x in phones if x not in others]
    return " ".join(phones)
--------------------------------------------------------------------------------
/TeraTTS/tokenizer/g2p/tokenizer.py:
--------------------------------------------------------------------------------
import re
from .g2p import *  # noqa
import json
import os

class Tokenizer():
    def __init__(self, data_path: str, load_dict=True) -> None:
        '''data_path - path to the data dir; load_dict - whether to load the stress dictionary (not needed if you use an accent model such as ruaccent)'''
        self.dic = {}
        if load_dict:
            for line in open(os.path.join(data_path, "dictionary.txt")):  # noqa
                items = line.split()
                self.dic[items[0]] = " ".join(items[1:])

        self.config = json.load(open(os.path.join(data_path, "config.json")))  # noqa

    def g2p(self, text):
        text = re.sub("—", "-", text)
        text = re.sub("([!'(),-.:;?])", r' \1 ', text)

        phonemes = []
        for word in text.split():
            if re.match("[!'(),-.:;?]", word):
                phonemes.append(word)
                continue

            word = word.lower()
            if len(phonemes) > 0:
                phonemes.append(' ')

            if word in self.dic:
                phonemes.extend(self.dic[word].split())
            else:
                phonemes.extend(convert(word).split())  # noqa

        phoneme_id_map = self.config["phoneme_id_map"]
        phoneme_ids = []
        phoneme_ids.extend(phoneme_id_map["^"])
        phoneme_ids.extend(phoneme_id_map["_"])
        for p in phonemes:
            if p in phoneme_id_map:
                phoneme_ids.extend(phoneme_id_map[p])
                phoneme_ids.extend(phoneme_id_map["_"])
        phoneme_ids.extend(phoneme_id_map["$"])

        return phoneme_ids, phonemes

    def _get_seq(self, text: str) -> list[int]:
        seq = self.g2p(text)[0]
        return seq
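
# Usage sketch (illustrative; assumes a downloaded model directory containing
# dictionary.txt and config.json, e.g. "./model/TeraTTS/natasha-g2p-vits/exported"):
#   tok = Tokenizer("./model/TeraTTS/natasha-g2p-vits/exported")
#   ids, phonemes = tok.g2p("прив+ет")   # ids are framed by the "^" and "$" markers
#   seq = tok._get_seq("прив+ет")        # the flat id list consumed by TTS.__call__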
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
text = "Привет, мир!"

from TeraTTS import TTS

# Optional: text preprocessing (improves quality)
from ruaccent import RUAccent
accentizer = RUAccent()

# Load the accentuation models and dictionaries
accentizer.load(omograph_model_size='turbo', use_dictionary=True)

# Process the text, adding stress marks and the letter ё
text = accentizer.process_all(text)
print(f"Text with stress marks and ё: {text}")


# Note: all models, including the GLADOS model, can be found at https://huggingface.co/TeraTTS
tts = TTS("TeraTTS/natasha-g2p-vits", add_time_to_end=1.0, tokenizer_load_dict=True)  # 'add_time_to_end' adjusts the trailing audio duration; 'tokenizer_load_dict' can be disabled if you use RUAccent


# 'length_scale' can be used to slow the audio down for better quality (the code default is 1.2; it is set explicitly here as an example)
audio = tts(text, length_scale=1.1)  # Generate the audio. Stress marks can be added with '+'
tts.play_audio(audio)  # Play the generated audio
tts.save_wav(audio, "./test.wav")  # Save the audio to a file


# Generate the audio and play it immediately
tts(text, play=True, length_scale=1.1)
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

classifiers = [
    'Development Status :: 5 - Production/Stable',
    'Intended Audience :: Education',
    'Operating System :: Microsoft :: Windows',
    'Operating System :: Unix',
    'Operating System :: MacOS',
    'License :: OSI Approved :: MIT License',
    'Programming Language :: Python :: 3'
]

setup(
    name='TeraTTS',
    version='1.0',
    description='Russian text-to-speech',
    long_description=open("./README.md").read(),
    long_description_content_type='text/markdown',
    url='https://github.com/Tera2Space/TeraTTS',
    author='Tera Space',
    author_email='tera2space@gmail.com',
    license='MIT',
    classifiers=classifiers,
    keywords='tts',
    packages=find_packages(),
    install_requires=['scipy', 'sounddevice', 'onnxruntime', "tok", "transformers", "numpy", "sentencepiece", "ruaccent", "transliterate", "num2words", "huggingface_hub"]
)
--------------------------------------------------------------------------------