├── ChatWaifu_marai.py ├── LICENSE ├── README.md ├── attentions.py ├── commons.py ├── eng-README.md ├── hubert_model.py ├── jieba │ └── dict.txt ├── mel_processing.py ├── models.py ├── modules.py ├── output.wav ├── readme │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 7.png │ ├── 8.png │ ├── cyberbot.png │ └── token.png ├── requirements.txt ├── text │ ├── LICENSE │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── cleaners.cpython-38.pyc │ │ ├── japanese.cpython-38.pyc │ │ └── mandarin.cpython-38.pyc │ ├── cantonese.py │ ├── cleaners.py │ ├── english.py │ ├── japanese.py │ ├── korean.py │ ├── mandarin.py │ ├── ngu_dialect.py │ ├── sanskrit.py │ ├── shanghainese.py │ └── thai.py ├── transforms.py └── utils.py /ChatWaifu_marai.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from scipy.io.wavfile import write 4 | from mel_processing import spectrogram_torch 5 | from text import text_to_sequence, _clean_text 6 | from models import SynthesizerTrn 7 | import utils 8 | import commons 9 | import sys 10 | import re 11 | import os 12 | import miraicle 13 | from torch import no_grad, LongTensor 14 | import logging 15 | from winsound import PlaySound 16 | 17 | 18 | sound_status = False 19 | qq = 0 # the QQ account your bot logs in with (fill in your own) 20 | verify_key = '' # the verifyKey you set in setting.yml 21 | port = 8080 # the port (http) you set in setting.yml 22 | 23 | #################################### 24 | # CHATGPT INITIALIZE 25 | from pyChatGPT import ChatGPT 26 | import json 27 | idmessage = """ID Speaker 28 | 0 綾地寧々 29 | 1 在原七海 30 | 2 小茸 31 | 3 唐乐吟 32 | """ 33 | speakerID = 0 34 | 35 | def get_input(): 36 | # prompt for input 37 | print("You:") 38 | user_input = input() 39 | return user_input 40 | 41 | 42 | if os.path.exists("./token.txt"): 43 | def get_token(): 44 | with open("token.txt", encoding="utf-8") as tok: 45 | token = tok.read() 46 | print("You have a cached token, so your login is simplified!") 47 | return token 48 | else: 49 | def get_token(): 50 | token = input("Copy your token from ChatGPT and press Enter \n") 51 | with open('token.txt', 'w') as tok: 52 | tok.write(token) 53 | return token 54 | 55 | 56 | ################################################ 57 | 58 | 59 | logging.getLogger('numba').setLevel(logging.WARNING) 60 | 61 | 62 | def ex_print(text, escape=False): 63 | if escape: 64 | print(text.encode('unicode_escape').decode()) 65 | else: 66 | print(text) 67 | 68 | 69 | def get_text(text, hps, cleaned=False): 70 | if cleaned: 71 | text_norm = text_to_sequence(text, hps.symbols, []) 72 | else: 73 | text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners) 74 | if hps.data.add_blank: 75 | text_norm = commons.intersperse(text_norm, 0) 76 | text_norm = LongTensor(text_norm) 77 | return text_norm 78 | 79 | 80 | def ask_if_continue(): 81 | while True: 82 | answer = input('Continue? 
(y/n): ') 83 | if answer == 'y': 84 | break 85 | elif answer == 'n': 86 | sys.exit(0) 87 | 88 | 89 | def print_speakers(speakers, escape=False): 90 | if len(speakers) > 100: 91 | return 92 | print('ID\tSpeaker') 93 | for id, name in enumerate(speakers): 94 | ex_print(str(id) + '\t' + name, escape) 95 | 96 | 97 | def get_speaker_id(message): 98 | speaker_id = input(message) 99 | try: 100 | speaker_id = int(speaker_id) 101 | except ValueError: 102 | print(str(speaker_id) + ' is not a valid ID!') 103 | sys.exit(1) 104 | return speaker_id 105 | 106 | 107 | def get_label_value(text, label, default, warning_name='value'): 108 | value = re.search(rf'\[{label}=(.+?)\]', text) 109 | if value: 110 | try: 111 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) 112 | value = float(value.group(1)) 113 | except ValueError: 114 | print(f'Invalid {warning_name}!') 115 | sys.exit(1) 116 | else: 117 | value = default 118 | return value, text 119 | 120 | 121 | def get_label(text, label): 122 | if f'[{label}]' in text: 123 | return True, text.replace(f'[{label}]', '') 124 | else: 125 | return False, text 126 | 127 | 128 | 129 | def generateSound(inputString, language): 130 | if '--escape' in sys.argv: 131 | escape = True 132 | else: 133 | escape = False 134 | 135 | 136 | #model = input('Path of a VITS model: ') 137 | #config = input('Path of a config file: ') 138 | if language == "ch": 139 | model = r".\model\CN\model.pth" 140 | config = r".\model\CN\config.json" 141 | elif language == "jp": 142 | model = r".\model\H_excluded.pth" 143 | config = r".\model\config.json" 144 | elif language == "multi": 145 | model = r".\model\Multi\multi.pth" 146 | config = r".\model\Multi\config.json" 147 | 148 | hps_ms = utils.get_hparams_from_file(config) 149 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 150 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 151 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] 152 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False 153 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 154 | 155 | net_g_ms = SynthesizerTrn( 156 | n_symbols, 157 | hps_ms.data.filter_length // 2 + 1, 158 | hps_ms.train.segment_size // hps_ms.data.hop_length, 159 | n_speakers=n_speakers, 160 | emotion_embedding=emotion_embedding, 161 | **hps_ms.model) 162 | _ = net_g_ms.eval() 163 | utils.load_checkpoint(model, net_g_ms) 164 | 165 | def voice_conversion(): 166 | audio_path = input('Path of an audio file to convert:\n') 167 | print_speakers(speakers) 168 | audio = utils.load_audio_to_torch( 169 | audio_path, hps_ms.data.sampling_rate) 170 | 171 | original_id = get_speaker_id('Original speaker ID: ') 172 | target_id = get_speaker_id('Target speaker ID: ') 173 | out_path = input('Path to save: ') 174 | 175 | y = audio.unsqueeze(0) 176 | 177 | spec = spectrogram_torch(y, hps_ms.data.filter_length, 178 | hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length, 179 | center=False) 180 | spec_lengths = LongTensor([spec.size(-1)]) 181 | sid_src = LongTensor([original_id]) 182 | 183 | with no_grad(): 184 | sid_tgt = LongTensor([target_id]) 185 | audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[ 186 | 0][0, 0].data.cpu().float().numpy() 187 | return audio, out_path 188 | 189 | if n_symbols != 0: 190 | if not emotion_embedding: 191 | #while True: 192 | if(1==1): 193 | #choice = input('TTS or VC? 
(t/v):') 194 | choice = 't' 195 | if choice == 't': 196 | #text = input('Text to read: ') 197 | text = inputString 198 | if text == '[ADVANCED]': 199 | #text = input('Raw text:') 200 | text = "我不会说" 201 | #print('Cleaned text is:') 202 | #ex_print(_clean_text( 203 | # text, hps_ms.data.text_cleaners), escape) 204 | #continue 205 | 206 | length_scale, text = get_label_value( 207 | text, 'LENGTH', 1, 'length scale') 208 | noise_scale, text = get_label_value( 209 | text, 'NOISE', 0.667, 'noise scale') 210 | noise_scale_w, text = get_label_value( 211 | text, 'NOISEW', 0.8, 'deviation of noise') 212 | cleaned, text = get_label(text, 'CLEANED') 213 | 214 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 215 | 216 | #print_speakers(speakers, escape) 217 | #speaker_id = get_speaker_id('Speaker ID: ') 218 | speaker_id = speakerID 219 | #out_path = input('Path to save: ') 220 | out_path = "output.wav" 221 | 222 | with no_grad(): 223 | x_tst = stn_tst.unsqueeze(0) 224 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 225 | sid = LongTensor([speaker_id]) 226 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 227 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 228 | 229 | elif choice == 'v': 230 | audio, out_path = voice_conversion() 231 | 232 | write(out_path, hps_ms.data.sampling_rate, audio) 233 | print('Successfully saved!') 234 | #ask_if_continue() 235 | else: 236 | import os 237 | import librosa 238 | import numpy as np 239 | from torch import FloatTensor 240 | import audonnx 241 | w2v2_folder = input('Path of a w2v2 dimensional emotion model: ') 242 | w2v2_model = audonnx.load(os.path.dirname(w2v2_folder)) 243 | #while True: 244 | if(1==1): 245 | #choice = input('TTS or VC? (t/v):') 246 | choice = 't' 247 | if choice == 't': 248 | #text = input('Text to read: ') 249 | text = inputString 250 | if text == '[ADVANCED]': 251 | #text = input('Raw text:') 252 | text = "我不会说" 253 | #print('Cleaned text is:') 254 | #ex_print(_clean_text( 255 | # text, hps_ms.data.text_cleaners), escape) 256 | #continue 257 | 258 | length_scale, text = get_label_value( 259 | text, 'LENGTH', 1, 'length scale') 260 | noise_scale, text = get_label_value( 261 | text, 'NOISE', 0.667, 'noise scale') 262 | noise_scale_w, text = get_label_value( 263 | text, 'NOISEW', 0.8, 'deviation of noise') 264 | cleaned, text = get_label(text, 'CLEANED') 265 | 266 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 267 | 268 | #print_speakers(speakers, escape) 269 | #speaker_id = get_speaker_id('Speaker ID: ') 270 | speaker_id = speakerID 271 | 272 | emotion_reference = input('Path of an emotion reference: ') 273 | if emotion_reference.endswith('.npy'): 274 | emotion = np.load(emotion_reference) 275 | emotion = FloatTensor(emotion).unsqueeze(0) 276 | else: 277 | audio16000, sampling_rate = librosa.load( 278 | emotion_reference, sr=16000, mono=True) 279 | emotion = w2v2_model(audio16000, sampling_rate)[ 280 | 'hidden_states'] 281 | emotion_reference = re.sub( 282 | r'\..*$', '', emotion_reference) 283 | np.save(emotion_reference, emotion.squeeze(0)) 284 | emotion = FloatTensor(emotion) 285 | 286 | #out_path = input('Path to save: ') 287 | out_path = "output.wav" 288 | 289 | with no_grad(): 290 | x_tst = stn_tst.unsqueeze(0) 291 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 292 | sid = LongTensor([speaker_id]) 293 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, 294 | length_scale=length_scale, 
emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy() 295 | 296 | elif choice == 'v': 297 | audio, out_path = voice_conversion() 298 | 299 | write(out_path, hps_ms.data.sampling_rate, audio) 300 | print('Successfully saved!') 301 | #ask_if_continue() 302 | else: 303 | model = input('Path of a hubert-soft model: ') 304 | from hubert_model import hubert_soft 305 | hubert = hubert_soft(model) 306 | 307 | while True: 308 | audio_path = input('Path of an audio file to convert:\n') 309 | 310 | if audio_path != '[VC]': 311 | import librosa 312 | if use_f0: 313 | audio, sampling_rate = librosa.load( 314 | audio_path, sr=hps_ms.data.sampling_rate, mono=True) 315 | audio16000 = librosa.resample( 316 | audio, orig_sr=sampling_rate, target_sr=16000) 317 | else: 318 | audio16000, sampling_rate = librosa.load( 319 | audio_path, sr=16000, mono=True) 320 | 321 | #print_speakers(speakers, escape) 322 | target_id = get_speaker_id('Target speaker ID: ') 323 | out_path = input('Path to save: ') 324 | length_scale, out_path = get_label_value( 325 | out_path, 'LENGTH', 1, 'length scale') 326 | noise_scale, out_path = get_label_value( 327 | out_path, 'NOISE', 0.1, 'noise scale') 328 | noise_scale_w, out_path = get_label_value( 329 | out_path, 'NOISEW', 0.1, 'deviation of noise') 330 | 331 | from torch import inference_mode, FloatTensor 332 | import numpy as np 333 | with inference_mode(): 334 | units = hubert.units(FloatTensor(audio16000).unsqueeze( 335 | 0).unsqueeze(0)).squeeze(0).numpy() 336 | if use_f0: 337 | f0_scale, out_path = get_label_value( 338 | out_path, 'F0', 1, 'f0 scale') 339 | f0 = librosa.pyin(audio, sr=sampling_rate, 340 | fmin=librosa.note_to_hz('C0'), 341 | fmax=librosa.note_to_hz('C7'), 342 | frame_length=1780)[0] 343 | target_length = len(units[:, 0]) 344 | f0 = np.nan_to_num(np.interp(np.arange(0, len(f0)*target_length, len(f0))/target_length, 345 | np.arange(0, len(f0)), f0)) * f0_scale 346 | units[:, 0] = f0 / 10 347 | 348 | stn_tst = FloatTensor(units) 349 | with no_grad(): 350 | x_tst = stn_tst.unsqueeze(0) 351 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 352 | sid = LongTensor([target_id]) 353 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 354 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy() 355 | 356 | else: 357 | audio, out_path = voice_conversion() 358 | 359 | write(out_path, hps_ms.data.sampling_rate, audio) 360 | print('Successfully saved!') 361 | #ask_if_continue() 362 | 363 | if __name__ == "__main__": 364 | session_token = get_token() 365 | api = ChatGPT(session_token) 366 | bot = miraicle.Mirai(qq=qq, verify_key=verify_key, port=port) 367 | print(idmessage) 368 | speakerID = int(input()) # store the chosen voice ID in the module-level speakerID that generateSound() reads 369 | os_dir = os.getcwd() 370 | 371 | @miraicle.Mirai.receiver('GroupMessage') 372 | def reply_to_group(bot: miraicle.Mirai, msg: miraicle.GroupMessage): 373 | global sound_status 374 | if msg.at_me() and msg.plain.lstrip() != "" and not sound_status: 375 | if msg.plain.lstrip() == "重置对话": 376 | api.reset_conversation() 377 | bot.send_group_msg(group=msg.group, msg="重置对话成功") 378 | else: 379 | return_voice = True 380 | sound_status = True 381 | bot.send_group_msg(group=msg.group, msg="正在处理中请稍后...") 382 | text = msg.plain.strip() 383 | resp = api.send_message(text) 384 | answer = resp["message"].replace('\n', '') 385 | if re.search('[\u4e00-\u9fa5]', answer): # re.search, not re.match: detect Chinese characters anywhere in the reply 386 | generateSound("[ZH]" + answer + "[ZH]", language="ch") 387 | else: 388 | generateSound(answer, language="jp") 389 | #generateSound(answer,language="multi") 
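# The disabled block below ("if False:") sketches sending a real QQ voice message: ffmpeg converts output.wav to raw 24 kHz mono s16le PCM, then silk_v3_encoder.exe packs that PCM into a .silk file, the voice format QQ expects. # With the block switched off, the plain .wav is handed to miraicle.Voice instead. 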
390 | if False: 391 | try: 392 | os.remove('output.pcm') 393 | os.remove('output.silk') 394 | except OSError: 395 | pass 396 | 397 | ffmpeg = os.popen(r'"D:\Program Files (x86)\ffmpeg\bin\ffmpeg.exe" -i "' + os_dir + r'\output.wav" -f s16le -ar 24000 -ac 1 -acodec pcm_s16le "' + os_dir + r'\output.pcm" -loglevel quiet') 398 | #print(ffmpeg) 399 | while return_voice: 400 | if os.path.exists('output.pcm'): 401 | return_voice = False 402 | return_voice = True 403 | 404 | voice_status = os.system(os_dir + r'\silk_v3_encoder.exe ' + os_dir + r'\output.pcm ' + os_dir + r'\output.silk -tencent -quiet') 405 | while return_voice: 406 | if voice_status == 0: 407 | return_voice = False 408 | sound = miraicle.Voice(base64='output.silk') 409 | else: 410 | sound = miraicle.Voice(base64='output.wav') 411 | 412 | 413 | print("ChatGPT:") 414 | print(answer) 415 | bot.send_group_msg(group=msg.group, msg=sound) 416 | bot.send_group_msg(group=msg.group, msg=answer) 417 | #PlaySound(r'.\output.wav', flags=1) 418 | sound_status = False 419 | elif msg.at_me() and msg.plain.lstrip() != "" and sound_status: 420 | bot.send_group_msg(group=msg.group, msg="上条信息还没处理完,请等一下再试吧!") 421 | elif msg.at_me() and msg.plain.lstrip() == "": 422 | bot.send_group_msg(group=msg.group, msg="哎呦,你干嘛~") 423 | else: 424 | pass 425 | 426 | 427 | bot.run() 428 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 CjangCjengh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 | [中文](README.md "中文") [English](eng-README.md "English") [日本語](jp-README.md "日本語") 4 | 5 | 
10 | 11 | # 12 | ### This is a ChatGPT voice-chat program built on TTS + VITS! 13 | 14 | **Based on**: 15 | - [miraicle](https://github.com/Excaive/miraicle) 16 | - [mirai-http-api](https://github.com/project-mirai/mirai-api-http) 17 | - [ChatWaifu](https://github.com/cjyaddone/ChatWaifu) 18 | 19 | Demo on BiliBili: [《青春猪头少年不会梦见赛博女友》](https://www.bilibili.com/video/BV1rv4y1Q7eT "BiliBili") 20 | 21 | **Currently supported features:** 22 | * [x] Chatting with ChatGPT 23 | * [x] Converting replies to speech 24 | * [x] Multiple speaker voices 25 | * [x] Voice-recognition chat (we built a truly human-like smart voice "Q-bot") 26 | * [x] Integration with the Mirai bot 27 | * [x] [Live2D version](https://github.com/cjyaddone/ChatWaifuL2D) 28 | 29 | 30 | # Contents 31 | * [1. Set up the environment:](#1.) 32 | * 1.1 [Enter the project folder with the cd command](#cd) 33 | * 1.2 [Create a Python virtual environment:](#99) 34 | * 1.3 [Activate the virtual environment:](#venv) 35 | * 1.4 [Install the required packages with pip:](#pip) 36 | * [2. Put the models into the model folder in the project root (create it if it doesn't exist):](#.model) 37 | * 2.1 [Double-click to import the model](#cd1) 38 | * [3. Run (go chat with your waifus):](#22) 39 | * 3.1 [Get the mirai verify_key and QQ](#343533) 40 | * 3.2 [Get a ChatGPT token](#333) 41 | * 3.3 [Start chatting with your CyberWaifu](#444) 42 | * [4. Known issues:](#9315) 43 | * [5. Acknowledgements](#915) 44 | ## 1. Set up the environment: 45 | > **Install Anaconda or Python >= 3.7** 46 | > 47 | > **The venv name used in this example is: chatWaifu** 48 | 49 | ### 1.1 Enter the project folder with the cd command 50 | `cd YOUR_PROJECT_PATH` 51 |  52 | ### 1.2 Create a Python virtual environment: 53 | 54 | Conda: `conda create --name chatWaifu python=3.10` 55 | 56 |  57 |  58 | 59 | 60 | Python: `python -m venv chatWaifu` 61 |  62 | 63 | ### 1.3 Activate the virtual environment: 64 | Conda: `conda activate chatWaifu` 65 | 66 |  67 | 68 | Python: `.\chatWaifu\Scripts\activate.bat` 69 | 70 |  71 | 72 | ### 1.4 Install the required packages with pip: 73 | `pip install -r requirements.txt` 74 | 75 |  76 | 77 | ## 2. Put the models into the model folder in the project root: 78 | Google Drive: https://drive.google.com/file/d/1tMCafhnUoL7FbevVQ44VQi-WznDjt23_/view?usp=sharing 79 | 80 | Aliyun Drive: https://www.aliyundrive.com/s/9JEj1mp1ZRv access code: m2y3 81 | 82 | ### 2.1 Move it to the project root and double-click to import the model 83 | 84 | ## 3. Run (go chat with your waifus): 85 | 86 | Chinese-Japanese bilingual version: `python ChatWaifu_marai.py` 87 | 88 | ### 3.1 Get the mirai verify_key and QQ 89 | #### The configuration area shown below is at the top of the script 90 | 91 | ####  92 | 93 | #### Set verify_key to the verifyKey configured in mirai-http-api 94 | 95 | #### Set qq to the QQ account the bot uses 96 | 97 | ### 3.2 Get a ChatGPT token 98 | #### Log in to https://chat.openai.com in your browser 99 | #### Press F12 to open the developer console 100 | #### Find Application -> Cookies -> __Secure-next-auth.session-token 101 | 102 | ####  103 | 104 | #### Copy the value into the terminal and press Enter 105 | 106 | ### 3.3 Start chatting with your CyberWaifu!!! 107 | 108 | @mention your bot in the group! 109 | 110 | Bonus: [a Chinese ChatGPT prompting guide](https://github.com/PlexPt/awesome-chatgpt-prompts-zh) 111 | 112 | ## 4. Known issues: 113 | 114 | ### For now every user's chat goes through a single shared conversation, because of ChatGPT's limitations... 
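One possible direction, sketched below and untested: keep a separate `ChatGPT` session object per QQ user. This assumes each `pyChatGPT.ChatGPT` instance holds its own conversation state; the `sessions` dict and `get_api_for` helper are hypothetical names, not part of this repository.

```python
# Hypothetical, untested sketch: one ChatGPT session per QQ sender, so users
# stop sharing a single conversation. Assumes each pyChatGPT.ChatGPT instance
# keeps independent conversation state.
from pyChatGPT import ChatGPT

sessions = {}  # QQ number -> ChatGPT instance

def get_api_for(sender, session_token):
    if sender not in sessions:
        sessions[sender] = ChatGPT(session_token)
    return sessions[sender]
```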
115 | 116 | ## 5. Acknowledgements: 117 | - [MoeGoe_GUI](https://github.com/CjangCjengh/MoeGoe_GUI) 118 | - [Pretrained models](https://github.com/CjangCjengh/TTSModels) 119 | - [PyChatGPT](https://github.com/terry3041/pyChatGPT) -------------------------------------------------------------------------------- /attentions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | from modules import LayerNorm 8 | 9 | 10 | class Encoder(nn.Module): 11 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): 12 | super().__init__() 13 | self.hidden_channels = hidden_channels 14 | self.filter_channels = filter_channels 15 | self.n_heads = n_heads 16 | self.n_layers = n_layers 17 | self.kernel_size = kernel_size 18 | self.p_dropout = p_dropout 19 | self.window_size = window_size 20 | 21 | self.drop = nn.Dropout(p_dropout) 22 | self.attn_layers = nn.ModuleList() 23 | self.norm_layers_1 = nn.ModuleList() 24 | self.ffn_layers = nn.ModuleList() 25 | self.norm_layers_2 = nn.ModuleList() 26 | for i in range(self.n_layers): 27 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) 28 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 29 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) 30 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 31 | 32 | def forward(self, x, x_mask): 33 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 34 | x = x * x_mask 35 | for i in range(self.n_layers): 36 | y = self.attn_layers[i](x, x, attn_mask) 37 | y = self.drop(y) 38 | x = self.norm_layers_1[i](x + y) 39 | 40 | y = self.ffn_layers[i](x, x_mask) 41 | y = self.drop(y) 42 | x = self.norm_layers_2[i](x + y) 43 | x = x * x_mask 44 | return x 45 | 46 | 47 | class Decoder(nn.Module): 48 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 49 | super().__init__() 50 | self.hidden_channels = hidden_channels 51 | self.filter_channels = filter_channels 52 | self.n_heads = n_heads 53 | self.n_layers = n_layers 54 | self.kernel_size = kernel_size 55 | self.p_dropout = p_dropout 56 | self.proximal_bias = proximal_bias 57 | self.proximal_init = proximal_init 58 | 59 | self.drop = nn.Dropout(p_dropout) 60 | self.self_attn_layers = nn.ModuleList() 61 | self.norm_layers_0 = nn.ModuleList() 62 | self.encdec_attn_layers = nn.ModuleList() 63 | self.norm_layers_1 = nn.ModuleList() 64 | self.ffn_layers = nn.ModuleList() 65 | self.norm_layers_2 = nn.ModuleList() 66 | for i in range(self.n_layers): 67 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 68 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 69 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) 70 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 71 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) 72 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 73 | 74 | def forward(self, x, x_mask, h, h_mask): 75 | """ 76 | x: decoder input 77 
| h: encoder output 78 | """ 79 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 80 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 81 | x = x * x_mask 82 | for i in range(self.n_layers): 83 | y = self.self_attn_layers[i](x, x, self_attn_mask) 84 | y = self.drop(y) 85 | x = self.norm_layers_0[i](x + y) 86 | 87 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 88 | y = self.drop(y) 89 | x = self.norm_layers_1[i](x + y) 90 | 91 | y = self.ffn_layers[i](x, x_mask) 92 | y = self.drop(y) 93 | x = self.norm_layers_2[i](x + y) 94 | x = x * x_mask 95 | return x 96 | 97 | 98 | class MultiHeadAttention(nn.Module): 99 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): 100 | super().__init__() 101 | assert channels % n_heads == 0 102 | 103 | self.channels = channels 104 | self.out_channels = out_channels 105 | self.n_heads = n_heads 106 | self.p_dropout = p_dropout 107 | self.window_size = window_size 108 | self.heads_share = heads_share 109 | self.block_length = block_length 110 | self.proximal_bias = proximal_bias 111 | self.proximal_init = proximal_init 112 | self.attn = None 113 | 114 | self.k_channels = channels // n_heads 115 | self.conv_q = nn.Conv1d(channels, channels, 1) 116 | self.conv_k = nn.Conv1d(channels, channels, 1) 117 | self.conv_v = nn.Conv1d(channels, channels, 1) 118 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 119 | self.drop = nn.Dropout(p_dropout) 120 | 121 | if window_size is not None: 122 | n_heads_rel = 1 if heads_share else n_heads 123 | rel_stddev = self.k_channels**-0.5 124 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 125 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 126 | 127 | nn.init.xavier_uniform_(self.conv_q.weight) 128 | nn.init.xavier_uniform_(self.conv_k.weight) 129 | nn.init.xavier_uniform_(self.conv_v.weight) 130 | if proximal_init: 131 | with torch.no_grad(): 132 | self.conv_k.weight.copy_(self.conv_q.weight) 133 | self.conv_k.bias.copy_(self.conv_q.bias) 134 | 135 | def forward(self, x, c, attn_mask=None): 136 | q = self.conv_q(x) 137 | k = self.conv_k(c) 138 | v = self.conv_v(c) 139 | 140 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 141 | 142 | x = self.conv_o(x) 143 | return x 144 | 145 | def attention(self, query, key, value, mask=None): 146 | # reshape [b, d, t] -> [b, n_h, t, d_k] 147 | b, d, t_s, t_t = (*key.size(), query.size(2)) 148 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 149 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 150 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 151 | 152 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 153 | if self.window_size is not None: 154 | assert t_s == t_t, "Relative attention is only available for self-attention." 155 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 156 | rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) 157 | scores_local = self._relative_position_to_absolute_position(rel_logits) 158 | scores = scores + scores_local 159 | if self.proximal_bias: 160 | assert t_s == t_t, "Proximal bias is only available for self-attention." 
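# note: the proximal bias added on the next line is -log(1 + |i - j|) (see _attention_bias_proximal below), which softly biases each position toward attending to its neighbours 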
161 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) 162 | if mask is not None: 163 | scores = scores.masked_fill(mask == 0, -1e4) 164 | if self.block_length is not None: 165 | assert t_s == t_t, "Local attention is only available for self-attention." 166 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) 167 | scores = scores.masked_fill(block_mask == 0, -1e4) 168 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 169 | p_attn = self.drop(p_attn) 170 | output = torch.matmul(p_attn, value) 171 | if self.window_size is not None: 172 | relative_weights = self._absolute_position_to_relative_position(p_attn) 173 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) 174 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) 175 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] 176 | return output, p_attn 177 | 178 | def _matmul_with_relative_values(self, x, y): 179 | """ 180 | x: [b, h, l, m] 181 | y: [h or 1, m, d] 182 | ret: [b, h, l, d] 183 | """ 184 | ret = torch.matmul(x, y.unsqueeze(0)) 185 | return ret 186 | 187 | def _matmul_with_relative_keys(self, x, y): 188 | """ 189 | x: [b, h, l, d] 190 | y: [h or 1, m, d] 191 | ret: [b, h, l, m] 192 | """ 193 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 194 | return ret 195 | 196 | def _get_relative_embeddings(self, relative_embeddings, length): 197 | max_relative_position = 2 * self.window_size + 1 198 | # Pad first before slice to avoid using cond ops. 199 | pad_length = max(length - (self.window_size + 1), 0) 200 | slice_start_position = max((self.window_size + 1) - length, 0) 201 | slice_end_position = slice_start_position + 2 * length - 1 202 | if pad_length > 0: 203 | padded_relative_embeddings = F.pad( 204 | relative_embeddings, 205 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) 206 | else: 207 | padded_relative_embeddings = relative_embeddings 208 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] 209 | return used_relative_embeddings 210 | 211 | def _relative_position_to_absolute_position(self, x): 212 | """ 213 | x: [b, h, l, 2*l-1] 214 | ret: [b, h, l, l] 215 | """ 216 | batch, heads, length, _ = x.size() 217 | # Concat columns of pad to shift from relative to absolute indexing. 218 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) 219 | 220 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 221 | x_flat = x.view([batch, heads, length * 2 * length]) 222 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) 223 | 224 | # Reshape and slice out the padded elements. 
225 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] 226 | return x_final 227 | 228 | def _absolute_position_to_relative_position(self, x): 229 | """ 230 | x: [b, h, l, l] 231 | ret: [b, h, l, 2*l-1] 232 | """ 233 | batch, heads, length, _ = x.size() 234 | # padd along column 235 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) 236 | x_flat = x.view([batch, heads, length**2 + length*(length -1)]) 237 | # add 0's in the beginning that will skew the elements after reshape 238 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 239 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] 240 | return x_final 241 | 242 | def _attention_bias_proximal(self, length): 243 | """Bias for self-attention to encourage attention to close positions. 244 | Args: 245 | length: an integer scalar. 246 | Returns: 247 | a Tensor with shape [1, 1, length, length] 248 | """ 249 | r = torch.arange(length, dtype=torch.float32) 250 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 251 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 252 | 253 | 254 | class FFN(nn.Module): 255 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): 256 | super().__init__() 257 | self.in_channels = in_channels 258 | self.out_channels = out_channels 259 | self.filter_channels = filter_channels 260 | self.kernel_size = kernel_size 261 | self.p_dropout = p_dropout 262 | self.activation = activation 263 | self.causal = causal 264 | 265 | if causal: 266 | self.padding = self._causal_padding 267 | else: 268 | self.padding = self._same_padding 269 | 270 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 271 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 272 | self.drop = nn.Dropout(p_dropout) 273 | 274 | def forward(self, x, x_mask): 275 | x = self.conv_1(self.padding(x * x_mask)) 276 | if self.activation == "gelu": 277 | x = x * torch.sigmoid(1.702 * x) 278 | else: 279 | x = torch.relu(x) 280 | x = self.drop(x) 281 | x = self.conv_2(self.padding(x * x_mask)) 282 | return x * x_mask 283 | 284 | def _causal_padding(self, x): 285 | if self.kernel_size == 1: 286 | return x 287 | pad_l = self.kernel_size - 1 288 | pad_r = 0 289 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 290 | x = F.pad(x, commons.convert_pad_shape(padding)) 291 | return x 292 | 293 | def _same_padding(self, x): 294 | if self.kernel_size == 1: 295 | return x 296 | pad_l = (self.kernel_size - 1) // 2 297 | pad_r = self.kernel_size // 2 298 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 299 | x = F.pad(x, commons.convert_pad_shape(padding)) 300 | return x 301 | -------------------------------------------------------------------------------- /commons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | import torch.jit 4 | 5 | 6 | def script_method(fn, _rcb=None): 7 | return fn 8 | 9 | 10 | def script(obj, optimize=True, _frames_up=0, _rcb=None): 11 | return obj 12 | 13 | 14 | torch.jit.script_method = script_method 15 | torch.jit.script = script 16 | 17 | 18 | def init_weights(m, mean=0.0, std=0.01): 19 | classname = m.__class__.__name__ 20 | if classname.find("Conv") != -1: 21 | m.weight.data.normal_(mean, std) 22 | 23 | 24 | def get_padding(kernel_size, dilation=1): 25 | return int((kernel_size*dilation - dilation)/2) 26 | 27 | 28 | 
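# example: intersperse([1, 2, 3], 0) -> [0, 1, 0, 2, 0, 3, 0]; get_text() in ChatWaifu_marai.py uses this to put a blank token (id 0) between symbols when hps.data.add_blank is set 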
def intersperse(lst, item): 29 | result = [item] * (len(lst) * 2 + 1) 30 | result[1::2] = lst 31 | return result 32 | 33 | 34 | def slice_segments(x, ids_str, segment_size=4): 35 | ret = torch.zeros_like(x[:, :, :segment_size]) 36 | for i in range(x.size(0)): 37 | idx_str = ids_str[i] 38 | idx_end = idx_str + segment_size 39 | ret[i] = x[i, :, idx_str:idx_end] 40 | return ret 41 | 42 | 43 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 44 | b, d, t = x.size() 45 | if x_lengths is None: 46 | x_lengths = t 47 | ids_str_max = x_lengths - segment_size + 1 48 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 49 | ret = slice_segments(x, ids_str, segment_size) 50 | return ret, ids_str 51 | 52 | 53 | def subsequent_mask(length): 54 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 55 | return mask 56 | 57 | 58 | @torch.jit.script 59 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 60 | n_channels_int = n_channels[0] 61 | in_act = input_a + input_b 62 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 63 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 64 | acts = t_act * s_act 65 | return acts 66 | 67 | 68 | def convert_pad_shape(pad_shape): 69 | l = pad_shape[::-1] 70 | pad_shape = [item for sublist in l for item in sublist] 71 | return pad_shape 72 | 73 | 74 | def sequence_mask(length, max_length=None): 75 | if max_length is None: 76 | max_length = length.max() 77 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 78 | return x.unsqueeze(0) < length.unsqueeze(1) 79 | 80 | 81 | def generate_path(duration, mask): 82 | """ 83 | duration: [b, 1, t_x] 84 | mask: [b, 1, t_y, t_x] 85 | """ 86 | device = duration.device 87 | 88 | b, _, t_y, t_x = mask.shape 89 | cum_duration = torch.cumsum(duration, -1) 90 | 91 | cum_duration_flat = cum_duration.view(b * t_x) 92 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 93 | path = path.view(b, t_x, t_y) 94 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 95 | path = path.unsqueeze(1).transpose(2,3) * mask 96 | return path 97 | -------------------------------------------------------------------------------- /eng-README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 | [中文](README.md "中文") [English](eng-README.md "English") [日本語](jp-README.md "日本語") 4 | 5 | 6 | 11 | 12 | # 13 | > ### This is a chatting waifu program using VITS & ChatGPT! 14 | 15 | **Based on**: 16 | - [miraicle](https://github.com/Excaive/miraicle) 17 | - [mirai-http-api](https://github.com/project-mirai/mirai-api-http) 18 | - [ChatWaifu](https://github.com/cjyaddone/ChatWaifu) 19 | 20 | Demo on BiliBili: [《青春猪头少年不会梦见赛博女友》](https://www.bilibili.com/video/BV1rv4y1Q7eT "BiliBili") 21 | 22 | **Functioning now:** 23 | * [x] Talking with ChatGPT 24 | * [x] Converting the AI's responses to WAV audio 25 | * [x] Multi-character voice synthesis 26 | * [x] Voice recognition 27 | * [x] Integration with the Mirai bot 28 | 29 | **Under construction:** 30 | * [ ] Live2D integration 31 | 32 | 33 | # Catalogue 34 | * [1. Install Python venv:](#1.) 
35 | * 1.1 [Enter the project directory with the cd command](#cd) 36 | * 1.2 [Create a Python venv:](#99) 37 | * 1.3 [Activate the Python venv:](#venv) 38 | * 1.4 [Install the required libraries with pip:](#pip) 39 | * [2. Import pre-trained models into the "model" folder (create one if it doesn't exist):](#.model) 40 | * 2.1 [Double-click model.exe to import the models](#cd1) 41 | * [3. Run (talk to your waifu):](#22) 42 | * 3.1 [Get the mirai verify_key and QQ](#343533) 43 | * 3.2 [Get a ChatGPT token](#333) 44 | * 3.3 [Start chatting with your CyberWaifu](#444) 45 | * [4. Known issues:](#9315) 46 | * [5. Contributions:](#915) 47 | ## 1. Install a Python venv: 48 | > **Install Anaconda or Python >= 3.7** 49 | > 50 | > **This example names the venv: chatWaifu** 51 | 52 | ### 1.1 Enter the project directory with the cd command 53 | `cd YOUR_PROJECT_REPOSITORY` 54 |  55 | ### 1.2 Create a Python venv: 56 | 57 | Conda: `conda create --name chatWaifu python=3.10` 58 |  59 |  60 | 61 | Python: `python -m venv chatWaifu` 62 |  63 | 64 | ### 1.3 Activate the created venv: 65 | Conda: `conda activate chatWaifu` 66 | 67 |  68 | 69 | Python: `.\chatWaifu\Scripts\activate.bat` 70 |  71 | 72 | ### 1.4 Install the required libraries with pip: 73 | `pip install -r requirements.txt` 74 |  75 | 76 | ## 2. Import pre-trained models into the model folder in the project root: 77 | Google Drive: https://drive.google.com/file/d/1tMCafhnUoL7FbevVQ44VQi-WznDjt23_/view?usp=sharing 78 | 79 | Aliyun Drive: https://www.aliyundrive.com/s/9JEj1mp1ZRv access code: m2y3 80 | 81 | ### 2.1 Double-click model.exe to import the models 82 | 83 | ## 3. Run (start chatting with your CyberWaifu): 84 | 85 | Chinese and Japanese version: `python ChatWaifu_marai.py` 86 | 87 | ### 3.1 Get the mirai verify_key and QQ 88 | #### The configuration area shown below is at the beginning of the program 89 | 90 | ####  91 | 92 | #### Set verify_key to the verifyKey configured in mirai-http-api 93 | 94 | #### Set qq to the QQ account the bot uses 95 | 96 | ### 3.2 Get a ChatGPT token 97 | #### Log in to ChatGPT at https://chat.openai.com 98 | #### Press F12 to open the developer console 99 | #### Find Application -> Cookies -> __Secure-next-auth.session-token 100 | #### Copy the value into the terminal and press ENTER 101 | 102 | ### 3.3 Start chatting with your CyberWaifu 103 | 104 | @mention your bot in the group! 105 | 106 | ## 4. Known issues: 107 | 108 | ### Because of ChatGPT's restrictions, conversations from unregistered users are all in one session... 
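A possible workaround, sketched below and untested: keep one `ChatGPT` session object per QQ user. It assumes each `pyChatGPT.ChatGPT` instance holds its own conversation state; `sessions` and `get_api_for` are hypothetical names, not part of this repository.

```python
# Hypothetical, untested sketch: one ChatGPT session per QQ sender, so users
# stop sharing a single conversation. Assumes each pyChatGPT.ChatGPT instance
# keeps independent conversation state.
from pyChatGPT import ChatGPT

sessions = {}  # QQ number -> ChatGPT instance

def get_api_for(sender, session_token):
    if sender not in sessions:
        sessions[sender] = ChatGPT(session_token)
    return sessions[sender]
```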
109 | 110 | ## 5. Contributions: 111 | - [MoeGoe_GUI](https://github.com/CjangCjengh/MoeGoe_GUI) 112 | - [Pretrained models](https://github.com/CjangCjengh/TTSModels) 113 | - [PyChatGPT](https://github.com/terry3041/pyChatGPT) 114 | -------------------------------------------------------------------------------- /hubert_model.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional, Tuple 3 | import random 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present 9 | 10 | class Hubert(nn.Module): 11 | def __init__(self, num_label_embeddings: int = 100, mask: bool = True): 12 | super().__init__() 13 | self._mask = mask 14 | self.feature_extractor = FeatureExtractor() 15 | self.feature_projection = FeatureProjection() 16 | self.positional_embedding = PositionalConvEmbedding() 17 | self.norm = nn.LayerNorm(768) 18 | self.dropout = nn.Dropout(0.1) 19 | self.encoder = TransformerEncoder( 20 | nn.TransformerEncoderLayer( 21 | 768, 12, 3072, activation="gelu", batch_first=True 22 | ), 23 | 12, 24 | ) 25 | self.proj = nn.Linear(768, 256) 26 | 27 | self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_()) 28 | self.label_embedding = nn.Embedding(num_label_embeddings, 256) 29 | 30 | def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 31 | mask = None 32 | if self.training and self._mask: 33 | mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2) 34 | x[mask] = self.masked_spec_embed.to(x.dtype) 35 | return x, mask 36 | 37 | def encode( 38 | self, x: torch.Tensor, layer: Optional[int] = None 39 | ) -> Tuple[torch.Tensor, torch.Tensor]: 40 | x = self.feature_extractor(x) 41 | x = self.feature_projection(x.transpose(1, 2)) 42 | x, mask = self.mask(x) 43 | x = x + self.positional_embedding(x) 44 | x = self.dropout(self.norm(x)) 45 | x = self.encoder(x, output_layer=layer) 46 | return x, mask 47 | 48 | def logits(self, x: torch.Tensor) -> torch.Tensor: 49 | logits = torch.cosine_similarity( 50 | x.unsqueeze(2), 51 | self.label_embedding.weight.unsqueeze(0).unsqueeze(0), 52 | dim=-1, 53 | ) 54 | return logits / 0.1 55 | 56 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 57 | x, mask = self.encode(x) 58 | x = self.proj(x) 59 | logits = self.logits(x) 60 | return logits, mask 61 | 62 | 63 | class HubertSoft(Hubert): 64 | def __init__(self): 65 | super().__init__() 66 | 67 | @torch.inference_mode() 68 | def units(self, wav: torch.Tensor) -> torch.Tensor: 69 | wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) 70 | x, _ = self.encode(wav) 71 | return self.proj(x) 72 | 73 | 74 | class FeatureExtractor(nn.Module): 75 | def __init__(self): 76 | super().__init__() 77 | self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False) 78 | self.norm0 = nn.GroupNorm(512, 512) 79 | self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False) 80 | self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False) 81 | self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False) 82 | self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False) 83 | self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False) 84 | self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False) 85 | 86 | def forward(self, x: torch.Tensor) -> torch.Tensor: 87 | x = F.gelu(self.norm0(self.conv0(x))) 88 | x = F.gelu(self.conv1(x)) 89 | x = F.gelu(self.conv2(x)) 90 | x = F.gelu(self.conv3(x)) 91 | x = F.gelu(self.conv4(x)) 92 | x = F.gelu(self.conv5(x)) 93 | x = 
F.gelu(self.conv6(x)) 94 | return x 95 | 96 | 97 | class FeatureProjection(nn.Module): 98 | def __init__(self): 99 | super().__init__() 100 | self.norm = nn.LayerNorm(512) 101 | self.projection = nn.Linear(512, 768) 102 | self.dropout = nn.Dropout(0.1) 103 | 104 | def forward(self, x: torch.Tensor) -> torch.Tensor: 105 | x = self.norm(x) 106 | x = self.projection(x) 107 | x = self.dropout(x) 108 | return x 109 | 110 | 111 | class PositionalConvEmbedding(nn.Module): 112 | def __init__(self): 113 | super().__init__() 114 | self.conv = nn.Conv1d( 115 | 768, 116 | 768, 117 | kernel_size=128, 118 | padding=128 // 2, 119 | groups=16, 120 | ) 121 | self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) 122 | 123 | def forward(self, x: torch.Tensor) -> torch.Tensor: 124 | x = self.conv(x.transpose(1, 2)) 125 | x = F.gelu(x[:, :, :-1]) 126 | return x.transpose(1, 2) 127 | 128 | 129 | class TransformerEncoder(nn.Module): 130 | def __init__( 131 | self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int 132 | ) -> None: 133 | super(TransformerEncoder, self).__init__() 134 | self.layers = nn.ModuleList( 135 | [copy.deepcopy(encoder_layer) for _ in range(num_layers)] 136 | ) 137 | self.num_layers = num_layers 138 | 139 | def forward( 140 | self, 141 | src: torch.Tensor, 142 | mask: torch.Tensor = None, 143 | src_key_padding_mask: torch.Tensor = None, 144 | output_layer: Optional[int] = None, 145 | ) -> torch.Tensor: 146 | output = src 147 | for layer in self.layers[:output_layer]: 148 | output = layer( 149 | output, src_mask=mask, src_key_padding_mask=src_key_padding_mask 150 | ) 151 | return output 152 | 153 | 154 | def _compute_mask( 155 | shape: Tuple[int, int], 156 | mask_prob: float, 157 | mask_length: int, 158 | device: torch.device, 159 | min_masks: int = 0, 160 | ) -> torch.Tensor: 161 | batch_size, sequence_length = shape 162 | 163 | if mask_length < 1: 164 | raise ValueError("`mask_length` has to be bigger than 0.") 165 | 166 | if mask_length > sequence_length: 167 | raise ValueError( 168 | f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" 169 | ) 170 | 171 | # compute number of masked spans in batch 172 | num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random()) 173 | num_masked_spans = max(num_masked_spans, min_masks) 174 | 175 | # make sure num masked indices <= sequence_length 176 | if num_masked_spans * mask_length > sequence_length: 177 | num_masked_spans = sequence_length // mask_length 178 | 179 | # SpecAugment mask to fill 180 | mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool) 181 | 182 | # uniform distribution to sample from, make sure that offset samples are < sequence_length 183 | uniform_dist = torch.ones( 184 | (batch_size, sequence_length - (mask_length - 1)), device=device 185 | ) 186 | 187 | # get random indices to mask 188 | mask_indices = torch.multinomial(uniform_dist, num_masked_spans) 189 | 190 | # expand masked indices to masked spans 191 | mask_indices = ( 192 | mask_indices.unsqueeze(dim=-1) 193 | .expand((batch_size, num_masked_spans, mask_length)) 194 | .reshape(batch_size, num_masked_spans * mask_length) 195 | ) 196 | offsets = ( 197 | torch.arange(mask_length, device=device)[None, None, :] 198 | .expand((batch_size, num_masked_spans, mask_length)) 199 | .reshape(batch_size, num_masked_spans * mask_length) 200 | ) 201 | mask_idxs = mask_indices + offsets 202 | 203 | # scatter indices to mask 204 | 
mask = mask.scatter(1, mask_idxs, True) 205 | 206 | return mask 207 | 208 | 209 | def hubert_soft( 210 | path: str 211 | ) -> HubertSoft: 212 | r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. 213 | Args: 214 | path (str): path of a pretrained model 215 | """ 216 | hubert = HubertSoft() 217 | checkpoint = torch.load(path) 218 | consume_prefix_in_state_dict_if_present(checkpoint, "module.") 219 | hubert.load_state_dict(checkpoint) 220 | hubert.eval() 221 | return hubert 222 | -------------------------------------------------------------------------------- /mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | from librosa.filters import mel as librosa_mel_fn 4 | 5 | MAX_WAV_VALUE = 32768.0 6 | 7 | 8 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 9 | """ 10 | PARAMS 11 | ------ 12 | C: compression factor 13 | """ 14 | return torch.log(torch.clamp(x, min=clip_val) * C) 15 | 16 | 17 | def dynamic_range_decompression_torch(x, C=1): 18 | """ 19 | PARAMS 20 | ------ 21 | C: compression factor used to compress 22 | """ 23 | return torch.exp(x) / C 24 | 25 | 26 | def spectral_normalize_torch(magnitudes): 27 | output = dynamic_range_compression_torch(magnitudes) 28 | return output 29 | 30 | 31 | def spectral_de_normalize_torch(magnitudes): 32 | output = dynamic_range_decompression_torch(magnitudes) 33 | return output 34 | 35 | 36 | mel_basis = {} 37 | hann_window = {} 38 | 39 | 40 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 41 | if torch.min(y) < -1.: 42 | print('min value is ', torch.min(y)) 43 | if torch.max(y) > 1.: 44 | print('max value is ', torch.max(y)) 45 | 46 | global hann_window 47 | dtype_device = str(y.dtype) + '_' + str(y.device) 48 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 49 | if wnsize_dtype_device not in hann_window: 50 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 51 | 52 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 53 | y = y.squeeze(1) 54 | 55 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 56 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 57 | 58 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 59 | return spec 60 | 61 | 62 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 63 | global mel_basis 64 | dtype_device = str(spec.dtype) + '_' + str(spec.device) 65 | fmax_dtype_device = str(fmax) + '_' + dtype_device 66 | if fmax_dtype_device not in mel_basis: 67 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 68 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) 69 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 70 | spec = spectral_normalize_torch(spec) 71 | return spec 72 | 73 | 74 | def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 75 | if torch.min(y) < -1.: 76 | print('min value is ', torch.min(y)) 77 | if torch.max(y) > 1.: 78 | print('max value is ', torch.max(y)) 79 | 80 | global mel_basis, hann_window 81 | dtype_device = str(y.dtype) + '_' + str(y.device) 82 | fmax_dtype_device = str(fmax) + '_' + dtype_device 83 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 84 | if 
fmax_dtype_device not in mel_basis: 85 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 86 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) 87 | if wnsize_dtype_device not in hann_window: 88 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 89 | 90 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 91 | y = y.squeeze(1) 92 | 93 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 94 | center=center, pad_mode='reflect', normalized=False, onesided=True) 95 | 96 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 97 | 98 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 99 | spec = spectral_normalize_torch(spec) 100 | 101 | return spec 102 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | import modules 8 | import attentions 9 | 10 | from torch.nn import Conv1d, ConvTranspose1d 11 | from torch.nn.utils import weight_norm 12 | from commons import init_weights 13 | 14 | 15 | class StochasticDurationPredictor(nn.Module): 16 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0): 17 | super().__init__() 18 | filter_channels = in_channels # it needs to be removed from future version. 19 | self.in_channels = in_channels 20 | self.filter_channels = filter_channels 21 | self.kernel_size = kernel_size 22 | self.p_dropout = p_dropout 23 | self.n_flows = n_flows 24 | self.gin_channels = gin_channels 25 | 26 | self.log_flow = modules.Log() 27 | self.flows = nn.ModuleList() 28 | self.flows.append(modules.ElementwiseAffine(2)) 29 | for i in range(n_flows): 30 | self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 31 | self.flows.append(modules.Flip()) 32 | 33 | self.post_pre = nn.Conv1d(1, filter_channels, 1) 34 | self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) 35 | self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 36 | self.post_flows = nn.ModuleList() 37 | self.post_flows.append(modules.ElementwiseAffine(2)) 38 | for i in range(4): 39 | self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 40 | self.post_flows.append(modules.Flip()) 41 | 42 | self.pre = nn.Conv1d(in_channels, filter_channels, 1) 43 | self.proj = nn.Conv1d(filter_channels, filter_channels, 1) 44 | self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 45 | if gin_channels != 0: 46 | self.cond = nn.Conv1d(gin_channels, filter_channels, 1) 47 | 48 | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): 49 | x = torch.detach(x) 50 | x = self.pre(x) 51 | if g is not None: 52 | g = torch.detach(g) 53 | x = x + self.cond(g) 54 | x = self.convs(x, x_mask) 55 | x = self.proj(x) * x_mask 56 | 57 | if not reverse: 58 | flows = self.flows 59 | assert w is not None 60 | 61 | logdet_tot_q = 0 62 | h_w = self.post_pre(w) 63 | h_w = self.post_convs(h_w, x_mask) 64 | h_w = self.post_proj(h_w) * x_mask 65 | e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask 66 | z_q = e_q 67 | for flow in self.post_flows: 68 | z_q, 
logdet_q = flow(z_q, x_mask, g=(x + h_w)) 69 | logdet_tot_q += logdet_q 70 | z_u, z1 = torch.split(z_q, [1, 1], 1) 71 | u = torch.sigmoid(z_u) * x_mask 72 | z0 = (w - u) * x_mask 73 | logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2]) 74 | logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q 75 | 76 | logdet_tot = 0 77 | z0, logdet = self.log_flow(z0, x_mask) 78 | logdet_tot += logdet 79 | z = torch.cat([z0, z1], 1) 80 | for flow in flows: 81 | z, logdet = flow(z, x_mask, g=x, reverse=reverse) 82 | logdet_tot = logdet_tot + logdet 83 | nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot 84 | return nll + logq # [b] 85 | else: 86 | flows = list(reversed(self.flows)) 87 | flows = flows[:-2] + [flows[-1]] # remove a useless vflow 88 | z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale 89 | for flow in flows: 90 | z = flow(z, x_mask, g=x, reverse=reverse) 91 | z0, z1 = torch.split(z, [1, 1], 1) 92 | logw = z0 93 | return logw 94 | 95 | 96 | class DurationPredictor(nn.Module): 97 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): 98 | super().__init__() 99 | 100 | self.in_channels = in_channels 101 | self.filter_channels = filter_channels 102 | self.kernel_size = kernel_size 103 | self.p_dropout = p_dropout 104 | self.gin_channels = gin_channels 105 | 106 | self.drop = nn.Dropout(p_dropout) 107 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) 108 | self.norm_1 = modules.LayerNorm(filter_channels) 109 | self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2) 110 | self.norm_2 = modules.LayerNorm(filter_channels) 111 | self.proj = nn.Conv1d(filter_channels, 1, 1) 112 | 113 | if gin_channels != 0: 114 | self.cond = nn.Conv1d(gin_channels, in_channels, 1) 115 | 116 | def forward(self, x, x_mask, g=None): 117 | x = torch.detach(x) 118 | if g is not None: 119 | g = torch.detach(g) 120 | x = x + self.cond(g) 121 | x = self.conv_1(x * x_mask) 122 | x = torch.relu(x) 123 | x = self.norm_1(x) 124 | x = self.drop(x) 125 | x = self.conv_2(x * x_mask) 126 | x = torch.relu(x) 127 | x = self.norm_2(x) 128 | x = self.drop(x) 129 | x = self.proj(x * x_mask) 130 | return x * x_mask 131 | 132 | 133 | class TextEncoder(nn.Module): 134 | def __init__(self, 135 | n_vocab, 136 | out_channels, 137 | hidden_channels, 138 | filter_channels, 139 | n_heads, 140 | n_layers, 141 | kernel_size, 142 | p_dropout, 143 | emotion_embedding): 144 | super().__init__() 145 | self.n_vocab = n_vocab 146 | self.out_channels = out_channels 147 | self.hidden_channels = hidden_channels 148 | self.filter_channels = filter_channels 149 | self.n_heads = n_heads 150 | self.n_layers = n_layers 151 | self.kernel_size = kernel_size 152 | self.p_dropout = p_dropout 153 | self.emotion_embedding = emotion_embedding 154 | 155 | if self.n_vocab!=0: 156 | self.emb = nn.Embedding(n_vocab, hidden_channels) 157 | if emotion_embedding: 158 | self.emo_proj = nn.Linear(1024, hidden_channels) 159 | nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) 160 | 161 | self.encoder = attentions.Encoder( 162 | hidden_channels, 163 | filter_channels, 164 | n_heads, 165 | n_layers, 166 | kernel_size, 167 | p_dropout) 168 | self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1) 169 | 170 | def forward(self, x, x_lengths, emotion_embedding=None): 171 | if self.n_vocab!=0: 172 | x = self.emb(x) * 
math.sqrt(self.hidden_channels) # [b, t, h] 173 | if emotion_embedding is not None: 174 | x = x + self.emo_proj(emotion_embedding.unsqueeze(1)) 175 | x = torch.transpose(x, 1, -1) # [b, h, t] 176 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 177 | 178 | x = self.encoder(x * x_mask, x_mask) 179 | stats = self.proj(x) * x_mask 180 | 181 | m, logs = torch.split(stats, self.out_channels, dim=1) 182 | return x, m, logs, x_mask 183 | 184 | 185 | class ResidualCouplingBlock(nn.Module): 186 | def __init__(self, 187 | channels, 188 | hidden_channels, 189 | kernel_size, 190 | dilation_rate, 191 | n_layers, 192 | n_flows=4, 193 | gin_channels=0): 194 | super().__init__() 195 | self.channels = channels 196 | self.hidden_channels = hidden_channels 197 | self.kernel_size = kernel_size 198 | self.dilation_rate = dilation_rate 199 | self.n_layers = n_layers 200 | self.n_flows = n_flows 201 | self.gin_channels = gin_channels 202 | 203 | self.flows = nn.ModuleList() 204 | for i in range(n_flows): 205 | self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) 206 | self.flows.append(modules.Flip()) 207 | 208 | def forward(self, x, x_mask, g=None, reverse=False): 209 | if not reverse: 210 | for flow in self.flows: 211 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 212 | else: 213 | for flow in reversed(self.flows): 214 | x = flow(x, x_mask, g=g, reverse=reverse) 215 | return x 216 | 217 | 218 | class PosteriorEncoder(nn.Module): 219 | def __init__(self, 220 | in_channels, 221 | out_channels, 222 | hidden_channels, 223 | kernel_size, 224 | dilation_rate, 225 | n_layers, 226 | gin_channels=0): 227 | super().__init__() 228 | self.in_channels = in_channels 229 | self.out_channels = out_channels 230 | self.hidden_channels = hidden_channels 231 | self.kernel_size = kernel_size 232 | self.dilation_rate = dilation_rate 233 | self.n_layers = n_layers 234 | self.gin_channels = gin_channels 235 | 236 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 237 | self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) 238 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 239 | 240 | def forward(self, x, x_lengths, g=None): 241 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 242 | x = self.pre(x) * x_mask 243 | x = self.enc(x, x_mask, g=g) 244 | stats = self.proj(x) * x_mask 245 | m, logs = torch.split(stats, self.out_channels, dim=1) 246 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 247 | return z, m, logs, x_mask 248 | 249 | 250 | class Generator(torch.nn.Module): 251 | def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): 252 | super(Generator, self).__init__() 253 | self.num_kernels = len(resblock_kernel_sizes) 254 | self.num_upsamples = len(upsample_rates) 255 | self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) 256 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 257 | 258 | self.ups = nn.ModuleList() 259 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 260 | self.ups.append(weight_norm( 261 | ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), 262 | k, u, padding=(k-u)//2))) 263 | 264 | self.resblocks = nn.ModuleList() 265 | for i in 
range(len(self.ups)): 266 | ch = upsample_initial_channel//(2**(i+1)) 267 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): 268 | self.resblocks.append(resblock(ch, k, d)) 269 | 270 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 271 | self.ups.apply(init_weights) 272 | 273 | if gin_channels != 0: 274 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 275 | 276 | def forward(self, x, g=None): 277 | x = self.conv_pre(x) 278 | if g is not None: 279 | x = x + self.cond(g) 280 | 281 | for i in range(self.num_upsamples): 282 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 283 | x = self.ups[i](x) 284 | xs = None 285 | for j in range(self.num_kernels): 286 | if xs is None: 287 | xs = self.resblocks[i*self.num_kernels+j](x) 288 | else: 289 | xs += self.resblocks[i*self.num_kernels+j](x) 290 | x = xs / self.num_kernels 291 | x = F.leaky_relu(x) 292 | x = self.conv_post(x) 293 | x = torch.tanh(x) 294 | 295 | return x 296 | 297 | 298 | class SynthesizerTrn(nn.Module): 299 | """ 300 | Synthesizer for Training 301 | """ 302 | 303 | def __init__(self, 304 | n_vocab, 305 | spec_channels, 306 | segment_size, 307 | inter_channels, 308 | hidden_channels, 309 | filter_channels, 310 | n_heads, 311 | n_layers, 312 | kernel_size, 313 | p_dropout, 314 | resblock, 315 | resblock_kernel_sizes, 316 | resblock_dilation_sizes, 317 | upsample_rates, 318 | upsample_initial_channel, 319 | upsample_kernel_sizes, 320 | n_speakers=0, 321 | gin_channels=0, 322 | use_sdp=True, 323 | emotion_embedding=False, 324 | **kwargs): 325 | 326 | super().__init__() 327 | self.n_vocab = n_vocab 328 | self.spec_channels = spec_channels 329 | self.inter_channels = inter_channels 330 | self.hidden_channels = hidden_channels 331 | self.filter_channels = filter_channels 332 | self.n_heads = n_heads 333 | self.n_layers = n_layers 334 | self.kernel_size = kernel_size 335 | self.p_dropout = p_dropout 336 | self.resblock = resblock 337 | self.resblock_kernel_sizes = resblock_kernel_sizes 338 | self.resblock_dilation_sizes = resblock_dilation_sizes 339 | self.upsample_rates = upsample_rates 340 | self.upsample_initial_channel = upsample_initial_channel 341 | self.upsample_kernel_sizes = upsample_kernel_sizes 342 | self.segment_size = segment_size 343 | self.n_speakers = n_speakers 344 | self.gin_channels = gin_channels 345 | 346 | self.use_sdp = use_sdp 347 | 348 | self.enc_p = TextEncoder(n_vocab, 349 | inter_channels, 350 | hidden_channels, 351 | filter_channels, 352 | n_heads, 353 | n_layers, 354 | kernel_size, 355 | p_dropout, 356 | emotion_embedding) 357 | self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) 358 | self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) 359 | self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) 360 | 361 | if use_sdp: 362 | self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels) 363 | else: 364 | self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels) 365 | 366 | if n_speakers > 1: 367 | self.emb_g = nn.Embedding(n_speakers, gin_channels) 368 | 369 | def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None): 370 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, 
emotion_embedding) 371 | if self.n_speakers > 0: 372 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 373 | else: 374 | g = None 375 | 376 | if self.use_sdp: 377 | logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) 378 | else: 379 | logw = self.dp(x, x_mask, g=g) 380 | w = torch.exp(logw) * x_mask * length_scale 381 | w_ceil = torch.ceil(w) 382 | y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() 383 | y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) 384 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 385 | attn = commons.generate_path(w_ceil, attn_mask) 386 | 387 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 388 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 389 | 390 | z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale 391 | z = self.flow(z_p, y_mask, g=g, reverse=True) 392 | o = self.dec((z * y_mask)[:,:,:max_len], g=g) 393 | return o, attn, y_mask, (z, z_p, m_p, logs_p) 394 | 395 | def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): 396 | assert self.n_speakers > 0, "n_speakers has to be larger than 0." 397 | g_src = self.emb_g(sid_src).unsqueeze(-1) 398 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 399 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) 400 | z_p = self.flow(z, y_mask, g=g_src) 401 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) 402 | o_hat = self.dec(z_hat * y_mask, g=g_tgt) 403 | return o_hat, y_mask, (z, z_p, z_hat) 404 | 405 | -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from torch.nn import Conv1d 7 | from torch.nn.utils import weight_norm, remove_weight_norm 8 | 9 | import commons 10 | from commons import init_weights, get_padding 11 | from transforms import piecewise_rational_quadratic_transform 12 | 13 | 14 | LRELU_SLOPE = 0.1 15 | 16 | 17 | class LayerNorm(nn.Module): 18 | def __init__(self, channels, eps=1e-5): 19 | super().__init__() 20 | self.channels = channels 21 | self.eps = eps 22 | 23 | self.gamma = nn.Parameter(torch.ones(channels)) 24 | self.beta = nn.Parameter(torch.zeros(channels)) 25 | 26 | def forward(self, x): 27 | x = x.transpose(1, -1) 28 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 29 | return x.transpose(1, -1) 30 | 31 | 32 | class ConvReluNorm(nn.Module): 33 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 34 | super().__init__() 35 | self.in_channels = in_channels 36 | self.hidden_channels = hidden_channels 37 | self.out_channels = out_channels 38 | self.kernel_size = kernel_size 39 | self.n_layers = n_layers 40 | self.p_dropout = p_dropout 41 | assert n_layers > 1, "Number of layers should be larger than 1."
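    # With n_layers > 1 the stack is one in->hidden conv followed by
    # n_layers-1 hidden->hidden convs; proj is zero-initialized below, so the
    # block behaves as an identity (residual) mapping at initialization.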
42 | 43 | self.conv_layers = nn.ModuleList() 44 | self.norm_layers = nn.ModuleList() 45 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 46 | self.norm_layers.append(LayerNorm(hidden_channels)) 47 | self.relu_drop = nn.Sequential( 48 | nn.ReLU(), 49 | nn.Dropout(p_dropout)) 50 | for _ in range(n_layers-1): 51 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 52 | self.norm_layers.append(LayerNorm(hidden_channels)) 53 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 54 | self.proj.weight.data.zero_() 55 | self.proj.bias.data.zero_() 56 | 57 | def forward(self, x, x_mask): 58 | x_org = x 59 | for i in range(self.n_layers): 60 | x = self.conv_layers[i](x * x_mask) 61 | x = self.norm_layers[i](x) 62 | x = self.relu_drop(x) 63 | x = x_org + self.proj(x) 64 | return x * x_mask 65 | 66 | 67 | class DDSConv(nn.Module): 68 | """ 69 | Dilated and Depth-Separable Convolution 70 | """ 71 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): 72 | super().__init__() 73 | self.channels = channels 74 | self.kernel_size = kernel_size 75 | self.n_layers = n_layers 76 | self.p_dropout = p_dropout 77 | 78 | self.drop = nn.Dropout(p_dropout) 79 | self.convs_sep = nn.ModuleList() 80 | self.convs_1x1 = nn.ModuleList() 81 | self.norms_1 = nn.ModuleList() 82 | self.norms_2 = nn.ModuleList() 83 | for i in range(n_layers): 84 | dilation = kernel_size ** i 85 | padding = (kernel_size * dilation - dilation) // 2 86 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, 87 | groups=channels, dilation=dilation, padding=padding 88 | )) 89 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 90 | self.norms_1.append(LayerNorm(channels)) 91 | self.norms_2.append(LayerNorm(channels)) 92 | 93 | def forward(self, x, x_mask, g=None): 94 | if g is not None: 95 | x = x + g 96 | for i in range(self.n_layers): 97 | y = self.convs_sep[i](x * x_mask) 98 | y = self.norms_1[i](y) 99 | y = F.gelu(y) 100 | y = self.convs_1x1[i](y) 101 | y = self.norms_2[i](y) 102 | y = F.gelu(y) 103 | y = self.drop(y) 104 | x = x + y 105 | return x * x_mask 106 | 107 | 108 | class WN(torch.nn.Module): 109 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 110 | super(WN, self).__init__() 111 | assert kernel_size % 2 == 1 112 | self.hidden_channels = hidden_channels 113 | self.kernel_size = kernel_size 114 | self.dilation_rate = dilation_rate 115 | self.n_layers = n_layers 116 | self.gin_channels = gin_channels 117 | self.p_dropout = p_dropout 118 | 119 | self.in_layers = torch.nn.ModuleList() 120 | self.res_skip_layers = torch.nn.ModuleList() 121 | self.drop = nn.Dropout(p_dropout) 122 | 123 | if gin_channels != 0: 124 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) 125 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 126 | 127 | for i in range(n_layers): 128 | dilation = dilation_rate ** i 129 | padding = int((kernel_size * dilation - dilation) / 2) 130 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 131 | dilation=dilation, padding=padding) 132 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 133 | self.in_layers.append(in_layer) 134 | 135 | # the residual half is only needed between layers; the last layer emits skip channels alone 136 | if i < n_layers - 1: 137 | res_skip_channels = 2 * hidden_channels 138 | else: 139 | res_skip_channels = hidden_channels 140 | 141 | res_skip_layer =
torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 142 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 143 | self.res_skip_layers.append(res_skip_layer) 144 | 145 | def forward(self, x, x_mask, g=None, **kwargs): 146 | output = torch.zeros_like(x) 147 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 148 | 149 | if g is not None: 150 | g = self.cond_layer(g) 151 | 152 | for i in range(self.n_layers): 153 | x_in = self.in_layers[i](x) 154 | if g is not None: 155 | cond_offset = i * 2 * self.hidden_channels 156 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 157 | else: 158 | g_l = torch.zeros_like(x_in) 159 | 160 | acts = commons.fused_add_tanh_sigmoid_multiply( 161 | x_in, 162 | g_l, 163 | n_channels_tensor) 164 | acts = self.drop(acts) 165 | 166 | res_skip_acts = self.res_skip_layers[i](acts) 167 | if i < self.n_layers - 1: 168 | res_acts = res_skip_acts[:,:self.hidden_channels,:] 169 | x = (x + res_acts) * x_mask 170 | output = output + res_skip_acts[:,self.hidden_channels:,:] 171 | else: 172 | output = output + res_skip_acts 173 | return output * x_mask 174 | 175 | def remove_weight_norm(self): 176 | if self.gin_channels != 0: 177 | torch.nn.utils.remove_weight_norm(self.cond_layer) 178 | for l in self.in_layers: 179 | torch.nn.utils.remove_weight_norm(l) 180 | for l in self.res_skip_layers: 181 | torch.nn.utils.remove_weight_norm(l) 182 | 183 | 184 | class ResBlock1(torch.nn.Module): 185 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 186 | super(ResBlock1, self).__init__() 187 | self.convs1 = nn.ModuleList([ 188 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 189 | padding=get_padding(kernel_size, dilation[0]))), 190 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 191 | padding=get_padding(kernel_size, dilation[1]))), 192 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 193 | padding=get_padding(kernel_size, dilation[2]))) 194 | ]) 195 | self.convs1.apply(init_weights) 196 | 197 | self.convs2 = nn.ModuleList([ 198 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 199 | padding=get_padding(kernel_size, 1))), 200 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 201 | padding=get_padding(kernel_size, 1))), 202 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 203 | padding=get_padding(kernel_size, 1))) 204 | ]) 205 | self.convs2.apply(init_weights) 206 | 207 | def forward(self, x, x_mask=None): 208 | for c1, c2 in zip(self.convs1, self.convs2): 209 | xt = F.leaky_relu(x, LRELU_SLOPE) 210 | if x_mask is not None: 211 | xt = xt * x_mask 212 | xt = c1(xt) 213 | xt = F.leaky_relu(xt, LRELU_SLOPE) 214 | if x_mask is not None: 215 | xt = xt * x_mask 216 | xt = c2(xt) 217 | x = xt + x 218 | if x_mask is not None: 219 | x = x * x_mask 220 | return x 221 | 222 | def remove_weight_norm(self): 223 | for l in self.convs1: 224 | remove_weight_norm(l) 225 | for l in self.convs2: 226 | remove_weight_norm(l) 227 | 228 | 229 | class ResBlock2(torch.nn.Module): 230 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 231 | super(ResBlock2, self).__init__() 232 | self.convs = nn.ModuleList([ 233 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 234 | padding=get_padding(kernel_size, dilation[0]))), 235 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 236 | padding=get_padding(kernel_size, dilation[1]))) 237 | ]) 
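        # get_padding(k, d) mirrors the padding arithmetic used in DDSConv above:
        # padding = (k*d - d)//2 keeps stride-1 convolutions length-preserving.
        # Worked example for kernel_size=3, dilation=3: padding = (9-3)//2 = 3,
        # so each output frame sees inputs at t-3, t and t+3.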
238 | self.convs.apply(init_weights) 239 | 240 | def forward(self, x, x_mask=None): 241 | for c in self.convs: 242 | xt = F.leaky_relu(x, LRELU_SLOPE) 243 | if x_mask is not None: 244 | xt = xt * x_mask 245 | xt = c(xt) 246 | x = xt + x 247 | if x_mask is not None: 248 | x = x * x_mask 249 | return x 250 | 251 | def remove_weight_norm(self): 252 | for l in self.convs: 253 | remove_weight_norm(l) 254 | 255 | 256 | class Log(nn.Module): 257 | def forward(self, x, x_mask, reverse=False, **kwargs): 258 | if not reverse: 259 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 260 | logdet = torch.sum(-y, [1, 2]) 261 | return y, logdet 262 | else: 263 | x = torch.exp(x) * x_mask 264 | return x 265 | 266 | 267 | class Flip(nn.Module): 268 | def forward(self, x, *args, reverse=False, **kwargs): 269 | x = torch.flip(x, [1]) 270 | if not reverse: 271 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 272 | return x, logdet 273 | else: 274 | return x 275 | 276 | 277 | class ElementwiseAffine(nn.Module): 278 | def __init__(self, channels): 279 | super().__init__() 280 | self.channels = channels 281 | self.m = nn.Parameter(torch.zeros(channels,1)) 282 | self.logs = nn.Parameter(torch.zeros(channels,1)) 283 | 284 | def forward(self, x, x_mask, reverse=False, **kwargs): 285 | if not reverse: 286 | y = self.m + torch.exp(self.logs) * x 287 | y = y * x_mask 288 | logdet = torch.sum(self.logs * x_mask, [1,2]) 289 | return y, logdet 290 | else: 291 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 292 | return x 293 | 294 | 295 | class ResidualCouplingLayer(nn.Module): 296 | def __init__(self, 297 | channels, 298 | hidden_channels, 299 | kernel_size, 300 | dilation_rate, 301 | n_layers, 302 | p_dropout=0, 303 | gin_channels=0, 304 | mean_only=False): 305 | assert channels % 2 == 0, "channels should be divisible by 2" 306 | super().__init__() 307 | self.channels = channels 308 | self.hidden_channels = hidden_channels 309 | self.kernel_size = kernel_size 310 | self.dilation_rate = dilation_rate 311 | self.n_layers = n_layers 312 | self.half_channels = channels // 2 313 | self.mean_only = mean_only 314 | 315 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 316 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) 317 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 318 | self.post.weight.data.zero_() 319 | self.post.bias.data.zero_() 320 | 321 | def forward(self, x, x_mask, g=None, reverse=False): 322 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 323 | h = self.pre(x0) * x_mask 324 | h = self.enc(h, x_mask, g=g) 325 | stats = self.post(h) * x_mask 326 | if not self.mean_only: 327 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 328 | else: 329 | m = stats 330 | logs = torch.zeros_like(m) 331 | 332 | if not reverse: 333 | x1 = m + x1 * torch.exp(logs) * x_mask 334 | x = torch.cat([x0, x1], 1) 335 | logdet = torch.sum(logs, [1,2]) 336 | return x, logdet 337 | else: 338 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 339 | x = torch.cat([x0, x1], 1) 340 | return x 341 | 342 | 343 | class ConvFlow(nn.Module): 344 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): 345 | super().__init__() 346 | self.in_channels = in_channels 347 | self.filter_channels = filter_channels 348 | self.kernel_size = kernel_size 349 | self.n_layers = n_layers 350 | self.num_bins = num_bins 351 | self.tail_bound = tail_bound 352 | 
self.half_channels = in_channels // 2 353 | 354 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 355 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) 356 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) 357 | self.proj.weight.data.zero_() 358 | self.proj.bias.data.zero_() 359 | 360 | def forward(self, x, x_mask, g=None, reverse=False): 361 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 362 | h = self.pre(x0) 363 | h = self.convs(h, x_mask, g=g) 364 | h = self.proj(h) * x_mask 365 | 366 | b, c, t = x0.shape 367 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 368 | 369 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) 370 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) 371 | unnormalized_derivatives = h[..., 2 * self.num_bins:] 372 | 373 | x1, logabsdet = piecewise_rational_quadratic_transform(x1, 374 | unnormalized_widths, 375 | unnormalized_heights, 376 | unnormalized_derivatives, 377 | inverse=reverse, 378 | tails='linear', 379 | tail_bound=self.tail_bound 380 | ) 381 | 382 | x = torch.cat([x0, x1], 1) * x_mask 383 | logdet = torch.sum(logabsdet * x_mask, [1,2]) 384 | if not reverse: 385 | return x, logdet 386 | else: 387 | return x 388 | -------------------------------------------------------------------------------- /output.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MuBai-He/ChatWaifu-marai/d0b76e350706e1365db42527718379764fac5dc1/output.wav -------------------------------------------------------------------------------- /readme/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MuBai-He/ChatWaifu-marai/d0b76e350706e1365db42527718379764fac5dc1/readme/1.png -------------------------------------------------------------------------------- /readme/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MuBai-He/ChatWaifu-marai/d0b76e350706e1365db42527718379764fac5dc1/readme/2.png -------------------------------------------------------------------------------- /readme/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MuBai-He/ChatWaifu-marai/d0b76e350706e1365db42527718379764fac5dc1/readme/3.png -------------------------------------------------------------------------------- /readme/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MuBai-He/ChatWaifu-marai/d0b76e350706e1365db42527718379764fac5dc1/readme/4.png -------------------------------------------------------------------------------- /readme/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MuBai-He/ChatWaifu-marai/d0b76e350706e1365db42527718379764fac5dc1/readme/5.png -------------------------------------------------------------------------------- /readme/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MuBai-He/ChatWaifu-marai/d0b76e350706e1365db42527718379764fac5dc1/readme/6.png -------------------------------------------------------------------------------- /readme/7.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/MuBai-He/ChatWaifu-marai/d0b76e350706e1365db42527718379764fac5dc1/readme/7.png -------------------------------------------------------------------------------- /readme/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MuBai-He/ChatWaifu-marai/d0b76e350706e1365db42527718379764fac5dc1/readme/8.png -------------------------------------------------------------------------------- /readme/cyberbot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MuBai-He/ChatWaifu-marai/d0b76e350706e1365db42527718379764fac5dc1/readme/cyberbot.png -------------------------------------------------------------------------------- /readme/token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MuBai-He/ChatWaifu-marai/d0b76e350706e1365db42527718379764fac5dc1/readme/token.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numba 2 | librosa 3 | numpy 4 | scipy 5 | torch 6 | unidecode 7 | openjtalk>=0.3.0.dev2 8 | jamo 9 | pypinyin 10 | jieba 11 | protobuf 12 | cn2an 13 | inflect 14 | eng_to_ipa 15 | ko_pron 16 | indic_transliteration 17 | num_thai 18 | opencc 19 | pyChatGPT 20 | vosk 21 | sounddevice 22 | miraicle 23 | -------------------------------------------------------------------------------- /text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | from text import cleaners 3 | 4 | 5 | def text_to_sequence(text, symbols, cleaner_names): 6 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
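
    `symbols` is the list of valid output symbols for the model; characters that
    do not appear in it are silently skipped by the loop below.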
7 | Args: 8 | text: string to convert to a sequence 9 | cleaner_names: names of the cleaner functions to run the text through 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | ''' 13 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 14 | 15 | sequence = [] 16 | 17 | clean_text = _clean_text(text, cleaner_names) 18 | for symbol in clean_text: 19 | if symbol not in _symbol_to_id.keys(): 20 | continue 21 | symbol_id = _symbol_to_id[symbol] 22 | sequence += [symbol_id] 23 | return sequence 24 | 25 | 26 | def _clean_text(text, cleaner_names): 27 | for name in cleaner_names: 28 | cleaner = getattr(cleaners, name) 29 | if not cleaner: 30 | raise Exception('Unknown cleaner: %s' % name) 31 | text = cleaner(text) 32 | return text 33 | -------------------------------------------------------------------------------- /text/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MuBai-He/ChatWaifu-marai/d0b76e350706e1365db42527718379764fac5dc1/text/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /text/__pycache__/cleaners.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MuBai-He/ChatWaifu-marai/d0b76e350706e1365db42527718379764fac5dc1/text/__pycache__/cleaners.cpython-38.pyc -------------------------------------------------------------------------------- /text/__pycache__/japanese.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MuBai-He/ChatWaifu-marai/d0b76e350706e1365db42527718379764fac5dc1/text/__pycache__/japanese.cpython-38.pyc -------------------------------------------------------------------------------- /text/__pycache__/mandarin.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MuBai-He/ChatWaifu-marai/d0b76e350706e1365db42527718379764fac5dc1/text/__pycache__/mandarin.cpython-38.pyc -------------------------------------------------------------------------------- /text/cantonese.py: -------------------------------------------------------------------------------- 1 | import re 2 | import cn2an 3 | import opencc 4 | 5 | 6 | converter = opencc.OpenCC('jyutjyu') 7 | 8 | # List of (Latin alphabet, ipa) pairs: 9 | _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 10 | ('A', 'ei˥'), 11 | ('B', 'biː˥'), 12 | ('C', 'siː˥'), 13 | ('D', 'tiː˥'), 14 | ('E', 'iː˥'), 15 | ('F', 'e˥fuː˨˩'), 16 | ('G', 'tsiː˥'), 17 | ('H', 'ɪk̚˥tsʰyː˨˩'), 18 | ('I', 'ɐi˥'), 19 | ('J', 'tsei˥'), 20 | ('K', 'kʰei˥'), 21 | ('L', 'e˥llou˨˩'), 22 | ('M', 'ɛːm˥'), 23 | ('N', 'ɛːn˥'), 24 | ('O', 'ou˥'), 25 | ('P', 'pʰiː˥'), 26 | ('Q', 'kʰiːu˥'), 27 | ('R', 'aː˥lou˨˩'), 28 | ('S', 'ɛː˥siː˨˩'), 29 | ('T', 'tʰiː˥'), 30 | ('U', 'juː˥'), 31 | ('V', 'wiː˥'), 32 | ('W', 'tʊk̚˥piː˥juː˥'), 33 | ('X', 'ɪk̚˥siː˨˩'), 34 | ('Y', 'waːi˥'), 35 | ('Z', 'iː˨sɛːt̚˥') 36 | ]] 37 | 38 | 39 | def number_to_cantonese(text): 40 | return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text) 41 | 42 | 43 | def latin_to_ipa(text): 44 | for regex, replacement in _latin_to_ipa: 45 | text = re.sub(regex, replacement, text) 46 | return text 47 | 48 | 49 | def cantonese_to_ipa(text): 50 | text = number_to_cantonese(text.upper()) 51 | text = converter.convert(text).replace('-','').replace('$',' ') 52 | 
text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text) 53 | text = re.sub(r'[、;:]', ',', text) 54 | text = re.sub(r'\s*,\s*', ', ', text) 55 | text = re.sub(r'\s*。\s*', '. ', text) 56 | text = re.sub(r'\s*?\s*', '? ', text) 57 | text = re.sub(r'\s*!\s*', '! ', text) 58 | text = re.sub(r'\s*$', '', text) 59 | return text 60 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def japanese_cleaners(text): 5 | from text.japanese import japanese_to_romaji_with_accent 6 | text = japanese_to_romaji_with_accent(text) 7 | text = re.sub(r'([A-Za-z])$', r'\1.', text) 8 | return text 9 | 10 | 11 | def japanese_cleaners2(text): 12 | return japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…') 13 | 14 | 15 | def korean_cleaners(text): 16 | '''Pipeline for Korean text''' 17 | from text.korean import latin_to_hangul, number_to_hangul, divide_hangul 18 | text = latin_to_hangul(text) 19 | text = number_to_hangul(text) 20 | text = divide_hangul(text) 21 | text = re.sub(r'([\u3131-\u3163])$', r'\1.', text) 22 | return text 23 | 24 | 25 | def chinese_cleaners(text): 26 | '''Pipeline for Chinese text''' 27 | from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo 28 | text = number_to_chinese(text) 29 | text = chinese_to_bopomofo(text) 30 | text = latin_to_bopomofo(text) 31 | text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text) 32 | return text 33 | 34 | 35 | def zh_ja_mixture_cleaners(text): 36 | from text.mandarin import chinese_to_romaji 37 | from text.japanese import japanese_to_romaji_with_accent 38 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 39 | lambda x: chinese_to_romaji(x.group(1))+' ', text) 40 | text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent( 41 | x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text) 42 | text = re.sub(r'\s+$', '', text) 43 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 44 | return text 45 | 46 | 47 | def sanskrit_cleaners(text): 48 | text = text.replace('॥', '।').replace('ॐ', 'ओम्') 49 | text = re.sub(r'([^।])$', r'\1।', text) 50 | return text 51 | 52 | 53 | def cjks_cleaners(text): 54 | from text.mandarin import chinese_to_lazy_ipa 55 | from text.japanese import japanese_to_ipa 56 | from text.korean import korean_to_lazy_ipa 57 | from text.sanskrit import devanagari_to_ipa 58 | from text.english import english_to_lazy_ipa 59 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 60 | lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text) 61 | text = re.sub(r'\[JA\](.*?)\[JA\]', 62 | lambda x: japanese_to_ipa(x.group(1))+' ', text) 63 | text = re.sub(r'\[KO\](.*?)\[KO\]', 64 | lambda x: korean_to_lazy_ipa(x.group(1))+' ', text) 65 | text = re.sub(r'\[SA\](.*?)\[SA\]', 66 | lambda x: devanagari_to_ipa(x.group(1))+' ', text) 67 | text = re.sub(r'\[EN\](.*?)\[EN\]', 68 | lambda x: english_to_lazy_ipa(x.group(1))+' ', text) 69 | text = re.sub(r'\s+$', '', text) 70 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 71 | return text 72 | 73 | 74 | def cjke_cleaners(text): 75 | from text.mandarin import chinese_to_lazy_ipa 76 | from text.japanese import japanese_to_ipa 77 | from text.korean import korean_to_ipa 78 | from text.english import english_to_ipa2 79 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace( 80 | 'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text) 81 | text = re.sub(r'\[JA\](.*?)\[JA\]', lambda 
x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace( 82 | 'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text) 83 | text = re.sub(r'\[KO\](.*?)\[KO\]', 84 | lambda x: korean_to_ipa(x.group(1))+' ', text) 85 | text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace( 86 | 'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text) 87 | text = re.sub(r'\s+$', '', text) 88 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 89 | return text 90 | 91 | 92 | def cjke_cleaners2(text): 93 | from text.mandarin import chinese_to_ipa 94 | from text.japanese import japanese_to_ipa2 95 | from text.korean import korean_to_ipa 96 | from text.english import english_to_ipa2 97 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 98 | lambda x: chinese_to_ipa(x.group(1))+' ', text) 99 | text = re.sub(r'\[JA\](.*?)\[JA\]', 100 | lambda x: japanese_to_ipa2(x.group(1))+' ', text) 101 | text = re.sub(r'\[KO\](.*?)\[KO\]', 102 | lambda x: korean_to_ipa(x.group(1))+' ', text) 103 | text = re.sub(r'\[EN\](.*?)\[EN\]', 104 | lambda x: english_to_ipa2(x.group(1))+' ', text) 105 | text = re.sub(r'\s+$', '', text) 106 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 107 | return text 108 | 109 | 110 | def thai_cleaners(text): 111 | from text.thai import num_to_thai, latin_to_thai 112 | text = num_to_thai(text) 113 | text = latin_to_thai(text) 114 | return text 115 | 116 | 117 | def shanghainese_cleaners(text): 118 | from text.shanghainese import shanghainese_to_ipa 119 | text = shanghainese_to_ipa(text) 120 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 121 | return text 122 | 123 | 124 | def chinese_dialect_cleaners(text): 125 | from text.mandarin import chinese_to_ipa2 126 | from text.japanese import japanese_to_ipa3 127 | from text.shanghainese import shanghainese_to_ipa 128 | from text.cantonese import cantonese_to_ipa 129 | from text.english import english_to_lazy_ipa2 130 | from text.ngu_dialect import ngu_dialect_to_ipa 131 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 132 | lambda x: chinese_to_ipa2(x.group(1))+' ', text) 133 | text = re.sub(r'\[JA\](.*?)\[JA\]', 134 | lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text) 135 | text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5', 136 | '˧˧˦').replace('6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e')+' ', text) 137 | text = re.sub(r'\[GD\](.*?)\[GD\]', 138 | lambda x: cantonese_to_ipa(x.group(1))+' ', text) 139 | text = re.sub(r'\[EN\](.*?)\[EN\]', 140 | lambda x: english_to_lazy_ipa2(x.group(1))+' ', text) 141 | text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group( 142 | 1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text) 143 | text = re.sub(r'\s+$', '', text) 144 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 145 | return text 146 | -------------------------------------------------------------------------------- /text/english.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. 
"transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | 16 | # Regular expression matching whitespace: 17 | 18 | 19 | import re 20 | import inflect 21 | from unidecode import unidecode 22 | import eng_to_ipa as ipa 23 | _inflect = inflect.engine() 24 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 25 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 26 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 27 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 28 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 29 | _number_re = re.compile(r'[0-9]+') 30 | 31 | # List of (regular expression, replacement) pairs for abbreviations: 32 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 33 | ('mrs', 'misess'), 34 | ('mr', 'mister'), 35 | ('dr', 'doctor'), 36 | ('st', 'saint'), 37 | ('co', 'company'), 38 | ('jr', 'junior'), 39 | ('maj', 'major'), 40 | ('gen', 'general'), 41 | ('drs', 'doctors'), 42 | ('rev', 'reverend'), 43 | ('lt', 'lieutenant'), 44 | ('hon', 'honorable'), 45 | ('sgt', 'sergeant'), 46 | ('capt', 'captain'), 47 | ('esq', 'esquire'), 48 | ('ltd', 'limited'), 49 | ('col', 'colonel'), 50 | ('ft', 'fort'), 51 | ]] 52 | 53 | 54 | # List of (ipa, lazy ipa) pairs: 55 | _lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 56 | ('r', 'ɹ'), 57 | ('æ', 'e'), 58 | ('ɑ', 'a'), 59 | ('ɔ', 'o'), 60 | ('ð', 'z'), 61 | ('θ', 's'), 62 | ('ɛ', 'e'), 63 | ('ɪ', 'i'), 64 | ('ʊ', 'u'), 65 | ('ʒ', 'ʥ'), 66 | ('ʤ', 'ʥ'), 67 | ('ˈ', '↓'), 68 | ]] 69 | 70 | # List of (ipa, lazy ipa2) pairs: 71 | _lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 72 | ('r', 'ɹ'), 73 | ('ð', 'z'), 74 | ('θ', 's'), 75 | ('ʒ', 'ʑ'), 76 | ('ʤ', 'dʑ'), 77 | ('ˈ', '↓'), 78 | ]] 79 | 80 | # List of (ipa, ipa2) pairs 81 | _ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 82 | ('r', 'ɹ'), 83 | ('ʤ', 'dʒ'), 84 | ('ʧ', 'tʃ') 85 | ]] 86 | 87 | 88 | def expand_abbreviations(text): 89 | for regex, replacement in _abbreviations: 90 | text = re.sub(regex, replacement, text) 91 | return text 92 | 93 | 94 | def collapse_whitespace(text): 95 | return re.sub(r'\s+', ' ', text) 96 | 97 | 98 | def _remove_commas(m): 99 | return m.group(1).replace(',', '') 100 | 101 | 102 | def _expand_decimal_point(m): 103 | return m.group(1).replace('.', ' point ') 104 | 105 | 106 | def _expand_dollars(m): 107 | match = m.group(1) 108 | parts = match.split('.') 109 | if len(parts) > 2: 110 | return match + ' dollars' # Unexpected format 111 | dollars = int(parts[0]) if parts[0] else 0 112 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 113 | if dollars and cents: 114 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 115 | cent_unit = 'cent' if cents == 1 else 'cents' 116 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 117 | elif dollars: 118 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 119 | return '%s %s' % (dollars, dollar_unit) 120 | elif cents: 121 | cent_unit = 'cent' if cents == 1 else 'cents' 122 | return '%s %s' % (cents, cent_unit) 123 | else: 124 | return 'zero dollars' 125 | 126 | 127 | def _expand_ordinal(m): 128 | return _inflect.number_to_words(m.group(0)) 129 | 130 | 131 | def _expand_number(m): 132 | num = int(m.group(0)) 133 | if num > 1000 and num < 3000: 134 | if num 
== 2000: 135 | return 'two thousand' 136 | elif num > 2000 and num < 2010: 137 | return 'two thousand ' + _inflect.number_to_words(num % 100) 138 | elif num % 100 == 0: 139 | return _inflect.number_to_words(num // 100) + ' hundred' 140 | else: 141 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 142 | else: 143 | return _inflect.number_to_words(num, andword='') 144 | 145 | 146 | def normalize_numbers(text): 147 | text = re.sub(_comma_number_re, _remove_commas, text) 148 | text = re.sub(_pounds_re, r'\1 pounds', text) 149 | text = re.sub(_dollars_re, _expand_dollars, text) 150 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 151 | text = re.sub(_ordinal_re, _expand_ordinal, text) 152 | text = re.sub(_number_re, _expand_number, text) 153 | return text 154 | 155 | 156 | def mark_dark_l(text): 157 | return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text) 158 | 159 | 160 | def english_to_ipa(text): 161 | text = unidecode(text).lower() 162 | text = expand_abbreviations(text) 163 | text = normalize_numbers(text) 164 | phonemes = ipa.convert(text) 165 | phonemes = collapse_whitespace(phonemes) 166 | return phonemes 167 | 168 | 169 | def english_to_lazy_ipa(text): 170 | text = english_to_ipa(text) 171 | for regex, replacement in _lazy_ipa: 172 | text = re.sub(regex, replacement, text) 173 | return text 174 | 175 | 176 | def english_to_ipa2(text): 177 | text = english_to_ipa(text) 178 | text = mark_dark_l(text) 179 | for regex, replacement in _ipa_to_ipa2: 180 | text = re.sub(regex, replacement, text) 181 | return text.replace('...', '…') 182 | 183 | 184 | def english_to_lazy_ipa2(text): 185 | text = english_to_ipa(text) 186 | for regex, replacement in _lazy_ipa2: 187 | text = re.sub(regex, replacement, text) 188 | return text 189 | -------------------------------------------------------------------------------- /text/japanese.py: -------------------------------------------------------------------------------- 1 | import re 2 | from unidecode import unidecode 3 | import pyopenjtalk 4 | 5 | 6 | # Regular expression matching Japanese without punctuation marks: 7 | _japanese_characters = re.compile( 8 | r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 9 | 10 | # Regular expression matching non-Japanese characters or punctuation marks: 11 | _japanese_marks = re.compile( 12 | r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 13 | 14 | # List of (symbol, Japanese) pairs for marks: 15 | _symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [ 16 | ('%', 'パーセント') 17 | ]] 18 | 19 | # List of (romaji, ipa) pairs for marks: 20 | _romaji_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 21 | ('ts', 'ʦ'), 22 | ('u', 'ɯ'), 23 | ('j', 'ʥ'), 24 | ('y', 'j'), 25 | ('ni', 'n^i'), 26 | ('nj', 'n^'), 27 | ('hi', 'çi'), 28 | ('hj', 'ç'), 29 | ('f', 'ɸ'), 30 | ('I', 'i*'), 31 | ('U', 'ɯ*'), 32 | ('r', 'ɾ') 33 | ]] 34 | 35 | # List of (romaji, ipa2) pairs for marks: 36 | _romaji_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 37 | ('u', 'ɯ'), 38 | ('ʧ', 'tʃ'), 39 | ('j', 'dʑ'), 40 | ('y', 'j'), 41 | ('ni', 'n^i'), 42 | ('nj', 'n^'), 43 | ('hi', 'çi'), 44 | ('hj', 'ç'), 45 | ('f', 'ɸ'), 46 | ('I', 'i*'), 47 | ('U', 'ɯ*'), 48 | ('r', 'ɾ') 49 | ]] 50 | 51 | # List of (consonant, sokuon) pairs: 52 | _real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [ 53 | (r'Q([↑↓]*[kg])', r'k#\1'), 54 | (r'Q([↑↓]*[tdjʧ])', r't#\1'), 55 | 
(r'Q([↑↓]*[sʃ])', r's\1'), 56 | (r'Q([↑↓]*[pb])', r'p#\1') 57 | ]] 58 | 59 | # List of (consonant, hatsuon) pairs: 60 | _real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [ 61 | (r'N([↑↓]*[pbm])', r'm\1'), 62 | (r'N([↑↓]*[ʧʥj])', r'n^\1'), 63 | (r'N([↑↓]*[tdn])', r'n\1'), 64 | (r'N([↑↓]*[kg])', r'ŋ\1') 65 | ]] 66 | 67 | 68 | def symbols_to_japanese(text): 69 | for regex, replacement in _symbols_to_japanese: 70 | text = re.sub(regex, replacement, text) 71 | return text 72 | 73 | 74 | def japanese_to_romaji_with_accent(text): 75 | '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html''' 76 | text = symbols_to_japanese(text) 77 | sentences = re.split(_japanese_marks, text) 78 | marks = re.findall(_japanese_marks, text) 79 | text = '' 80 | for i, sentence in enumerate(sentences): 81 | if re.match(_japanese_characters, sentence): 82 | if text != '': 83 | text += ' ' 84 | labels = pyopenjtalk.extract_fullcontext(sentence) 85 | for n, label in enumerate(labels): 86 | phoneme = re.search(r'\-([^\+]*)\+', label).group(1) 87 | if phoneme not in ['sil', 'pau']: 88 | text += phoneme.replace('ch', 'ʧ').replace('sh', 89 | 'ʃ').replace('cl', 'Q') 90 | else: 91 | continue 92 | # n_moras = int(re.search(r'/F:(\d+)_', label).group(1)) 93 | a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1)) 94 | a2 = int(re.search(r"\+(\d+)\+", label).group(1)) 95 | a3 = int(re.search(r"\+(\d+)/", label).group(1)) 96 | if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']: 97 | a2_next = -1 98 | else: 99 | a2_next = int( 100 | re.search(r"\+(\d+)\+", labels[n + 1]).group(1)) 101 | # Accent phrase boundary 102 | if a3 == 1 and a2_next == 1: 103 | text += ' ' 104 | # Falling 105 | elif a1 == 0 and a2_next == a2 + 1: 106 | text += '↓' 107 | # Rising 108 | elif a2 == 1 and a2_next == 2: 109 | text += '↑' 110 | if i < len(marks): 111 | text += unidecode(marks[i]).replace(' ', '') 112 | return text 113 | 114 | 115 | def get_real_sokuon(text): 116 | for regex, replacement in _real_sokuon: 117 | text = re.sub(regex, replacement, text) 118 | return text 119 | 120 | 121 | def get_real_hatsuon(text): 122 | for regex, replacement in _real_hatsuon: 123 | text = re.sub(regex, replacement, text) 124 | return text 125 | 126 | 127 | def japanese_to_ipa(text): 128 | text = japanese_to_romaji_with_accent(text).replace('...', '…') 129 | text = re.sub( 130 | r'([aiueo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text) 131 | text = get_real_sokuon(text) 132 | text = get_real_hatsuon(text) 133 | for regex, replacement in _romaji_to_ipa: 134 | text = re.sub(regex, replacement, text) 135 | return text 136 | 137 | 138 | def japanese_to_ipa2(text): 139 | text = japanese_to_romaji_with_accent(text).replace('...', '…') 140 | text = get_real_sokuon(text) 141 | text = get_real_hatsuon(text) 142 | for regex, replacement in _romaji_to_ipa2: 143 | text = re.sub(regex, replacement, text) 144 | return text 145 | 146 | 147 | def japanese_to_ipa3(text): 148 | text = japanese_to_ipa2(text).replace('n^', 'ȵ').replace( 149 | 'ʃ', 'ɕ').replace('*', '\u0325').replace('#', '\u031a') 150 | text = re.sub( 151 | r'([aiɯeo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text) 152 | text = re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', text) 153 | return text 154 | -------------------------------------------------------------------------------- /text/korean.py: -------------------------------------------------------------------------------- 1 | import re 2 | from jamo import h2j, j2hcj 3 | 
import ko_pron 4 | 5 | 6 | # This is a list of Korean classifiers preceded by pure Korean numerals. 7 | _korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통' 8 | 9 | # List of (hangul, hangul divided) pairs: 10 | _hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [ 11 | ('ㄳ', 'ㄱㅅ'), 12 | ('ㄵ', 'ㄴㅈ'), 13 | ('ㄶ', 'ㄴㅎ'), 14 | ('ㄺ', 'ㄹㄱ'), 15 | ('ㄻ', 'ㄹㅁ'), 16 | ('ㄼ', 'ㄹㅂ'), 17 | ('ㄽ', 'ㄹㅅ'), 18 | ('ㄾ', 'ㄹㅌ'), 19 | ('ㄿ', 'ㄹㅍ'), 20 | ('ㅀ', 'ㄹㅎ'), 21 | ('ㅄ', 'ㅂㅅ'), 22 | ('ㅘ', 'ㅗㅏ'), 23 | ('ㅙ', 'ㅗㅐ'), 24 | ('ㅚ', 'ㅗㅣ'), 25 | ('ㅝ', 'ㅜㅓ'), 26 | ('ㅞ', 'ㅜㅔ'), 27 | ('ㅟ', 'ㅜㅣ'), 28 | ('ㅢ', 'ㅡㅣ'), 29 | ('ㅑ', 'ㅣㅏ'), 30 | ('ㅒ', 'ㅣㅐ'), 31 | ('ㅕ', 'ㅣㅓ'), 32 | ('ㅖ', 'ㅣㅔ'), 33 | ('ㅛ', 'ㅣㅗ'), 34 | ('ㅠ', 'ㅣㅜ') 35 | ]] 36 | 37 | # List of (Latin alphabet, hangul) pairs: 38 | _latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 39 | ('a', '에이'), 40 | ('b', '비'), 41 | ('c', '시'), 42 | ('d', '디'), 43 | ('e', '이'), 44 | ('f', '에프'), 45 | ('g', '지'), 46 | ('h', '에이치'), 47 | ('i', '아이'), 48 | ('j', '제이'), 49 | ('k', '케이'), 50 | ('l', '엘'), 51 | ('m', '엠'), 52 | ('n', '엔'), 53 | ('o', '오'), 54 | ('p', '피'), 55 | ('q', '큐'), 56 | ('r', '아르'), 57 | ('s', '에스'), 58 | ('t', '티'), 59 | ('u', '유'), 60 | ('v', '브이'), 61 | ('w', '더블유'), 62 | ('x', '엑스'), 63 | ('y', '와이'), 64 | ('z', '제트') 65 | ]] 66 | 67 | # List of (ipa, lazy ipa) pairs: 68 | _ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 69 | ('t͡ɕ','ʧ'), 70 | ('d͡ʑ','ʥ'), 71 | ('ɲ','n^'), 72 | ('ɕ','ʃ'), 73 | ('ʷ','w'), 74 | ('ɭ','l`'), 75 | ('ʎ','ɾ'), 76 | ('ɣ','ŋ'), 77 | ('ɰ','ɯ'), 78 | ('ʝ','j'), 79 | ('ʌ','ə'), 80 | ('ɡ','g'), 81 | ('\u031a','#'), 82 | ('\u0348','='), 83 | ('\u031e',''), 84 | ('\u0320',''), 85 | ('\u0339','') 86 | ]] 87 | 88 | 89 | def latin_to_hangul(text): 90 | for regex, replacement in _latin_to_hangul: 91 | text = re.sub(regex, replacement, text) 92 | return text 93 | 94 | 95 | def divide_hangul(text): 96 | text = j2hcj(h2j(text)) 97 | for regex, replacement in _hangul_divided: 98 | text = re.sub(regex, replacement, text) 99 | return text 100 | 101 | 102 | def hangul_number(num, sino=True): 103 | '''Reference https://github.com/Kyubyong/g2pK''' 104 | num = re.sub(',', '', num) 105 | 106 | if num == '0': 107 | return '영' 108 | if not sino and num == '20': 109 | return '스무' 110 | 111 | digits = '123456789' 112 | names = '일이삼사오육칠팔구' 113 | digit2name = {d: n for d, n in zip(digits, names)} 114 | 115 | modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉' 116 | decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔' 117 | digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())} 118 | digit2dec = {d: dec for d, dec in zip(digits, decimals.split())} 119 | 120 | spelledout = [] 121 | for i, digit in enumerate(num): 122 | i = len(num) - i - 1 123 | if sino: 124 | if i == 0: 125 | name = digit2name.get(digit, '') 126 | elif i == 1: 127 | name = digit2name.get(digit, '') + '십' 128 | name = name.replace('일십', '십') 129 | else: 130 | if i == 0: 131 | name = digit2mod.get(digit, '') 132 | elif i == 1: 133 | name = digit2dec.get(digit, '') 134 | if digit == '0': 135 | if i % 4 == 0: 136 | last_three = spelledout[-min(3, len(spelledout)):] 137 | if ''.join(last_three) == '': 138 | spelledout.append('') 139 | continue 140 | else: 141 | spelledout.append('') 142 | continue 143 | if i == 2: 144 | name = digit2name.get(digit, '') + '백' 145 | name = name.replace('일백', '백') 146 | elif i == 3: 147 | name = digit2name.get(digit, '') + '천' 148 | name = name.replace('일천', '천') 149 | elif i == 4: 
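            # Sino-Korean numerals group digits by 10^4: positions 4-7 reuse
            # 십/백/천 within the 만 (10^4) block, with 억 marking 10^8 and 조 10^12.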
150 | name = digit2name.get(digit, '') + '만' 151 | name = name.replace('일만', '만') 152 | elif i == 5: 153 | name = digit2name.get(digit, '') + '십' 154 | name = name.replace('일십', '십') 155 | elif i == 6: 156 | name = digit2name.get(digit, '') + '백' 157 | name = name.replace('일백', '백') 158 | elif i == 7: 159 | name = digit2name.get(digit, '') + '천' 160 | name = name.replace('일천', '천') 161 | elif i == 8: 162 | name = digit2name.get(digit, '') + '억' 163 | elif i == 9: 164 | name = digit2name.get(digit, '') + '십' 165 | elif i == 10: 166 | name = digit2name.get(digit, '') + '백' 167 | elif i == 11: 168 | name = digit2name.get(digit, '') + '천' 169 | elif i == 12: 170 | name = digit2name.get(digit, '') + '조' 171 | elif i == 13: 172 | name = digit2name.get(digit, '') + '십' 173 | elif i == 14: 174 | name = digit2name.get(digit, '') + '백' 175 | elif i == 15: 176 | name = digit2name.get(digit, '') + '천' 177 | spelledout.append(name) 178 | return ''.join(elem for elem in spelledout) 179 | 180 | 181 | def number_to_hangul(text): 182 | '''Reference https://github.com/Kyubyong/g2pK''' 183 | tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text)) 184 | for token in tokens: 185 | num, classifier = token 186 | if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers: 187 | spelledout = hangul_number(num, sino=False) 188 | else: 189 | spelledout = hangul_number(num, sino=True) 190 | text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}') 191 | # digit by digit for remaining digits 192 | digits = '0123456789' 193 | names = '영일이삼사오육칠팔구' 194 | for d, n in zip(digits, names): 195 | text = text.replace(d, n) 196 | return text 197 | 198 | 199 | def korean_to_lazy_ipa(text): 200 | text = latin_to_hangul(text) 201 | text = number_to_hangul(text) 202 | text=re.sub('[\uac00-\ud7af]+',lambda x:ko_pron.romanise(x.group(0),'ipa').split('] ~ [')[0],text) 203 | for regex, replacement in _ipa_to_lazy_ipa: 204 | text = re.sub(regex, replacement, text) 205 | return text 206 | 207 | 208 | def korean_to_ipa(text): 209 | text = korean_to_lazy_ipa(text) 210 | return text.replace('ʧ','tʃ').replace('ʥ','dʑ') 211 | -------------------------------------------------------------------------------- /text/mandarin.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import re 4 | from pypinyin import lazy_pinyin, BOPOMOFO 5 | import jieba 6 | import cn2an 7 | import logging 8 | 9 | logging.getLogger('jieba').setLevel(logging.WARNING) 10 | jieba.set_dictionary(r'./jieba/dict.txt') 11 | jieba.initialize() 12 | 13 | 14 | # List of (Latin alphabet, bopomofo) pairs: 15 | _latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 16 | ('a', 'ㄟˉ'), 17 | ('b', 'ㄅㄧˋ'), 18 | ('c', 'ㄙㄧˉ'), 19 | ('d', 'ㄉㄧˋ'), 20 | ('e', 'ㄧˋ'), 21 | ('f', 'ㄝˊㄈㄨˋ'), 22 | ('g', 'ㄐㄧˋ'), 23 | ('h', 'ㄝˇㄑㄩˋ'), 24 | ('i', 'ㄞˋ'), 25 | ('j', 'ㄐㄟˋ'), 26 | ('k', 'ㄎㄟˋ'), 27 | ('l', 'ㄝˊㄛˋ'), 28 | ('m', 'ㄝˊㄇㄨˋ'), 29 | ('n', 'ㄣˉ'), 30 | ('o', 'ㄡˉ'), 31 | ('p', 'ㄆㄧˉ'), 32 | ('q', 'ㄎㄧㄡˉ'), 33 | ('r', 'ㄚˋ'), 34 | ('s', 'ㄝˊㄙˋ'), 35 | ('t', 'ㄊㄧˋ'), 36 | ('u', 'ㄧㄡˉ'), 37 | ('v', 'ㄨㄧˉ'), 38 | ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'), 39 | ('x', 'ㄝˉㄎㄨˋㄙˋ'), 40 | ('y', 'ㄨㄞˋ'), 41 | ('z', 'ㄗㄟˋ') 42 | ]] 43 | 44 | # List of (bopomofo, romaji) pairs: 45 | _bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [ 46 | ('ㄅㄛ', 'p⁼wo'), 47 | ('ㄆㄛ', 'pʰwo'), 48 | ('ㄇㄛ', 'mwo'), 49 | ('ㄈㄛ', 'fwo'), 50 | ('ㄅ', 'p⁼'), 51 | ('ㄆ', 'pʰ'), 52 | ('ㄇ', 'm'), 53 | ('ㄈ', 'f'), 54 | ('ㄉ', 't⁼'), 
55 | ('ㄊ', 'tʰ'), 56 | ('ㄋ', 'n'), 57 | ('ㄌ', 'l'), 58 | ('ㄍ', 'k⁼'), 59 | ('ㄎ', 'kʰ'), 60 | ('ㄏ', 'h'), 61 | ('ㄐ', 'ʧ⁼'), 62 | ('ㄑ', 'ʧʰ'), 63 | ('ㄒ', 'ʃ'), 64 | ('ㄓ', 'ʦ`⁼'), 65 | ('ㄔ', 'ʦ`ʰ'), 66 | ('ㄕ', 's`'), 67 | ('ㄖ', 'ɹ`'), 68 | ('ㄗ', 'ʦ⁼'), 69 | ('ㄘ', 'ʦʰ'), 70 | ('ㄙ', 's'), 71 | ('ㄚ', 'a'), 72 | ('ㄛ', 'o'), 73 | ('ㄜ', 'ə'), 74 | ('ㄝ', 'e'), 75 | ('ㄞ', 'ai'), 76 | ('ㄟ', 'ei'), 77 | ('ㄠ', 'au'), 78 | ('ㄡ', 'ou'), 79 | ('ㄧㄢ', 'yeNN'), 80 | ('ㄢ', 'aNN'), 81 | ('ㄧㄣ', 'iNN'), 82 | ('ㄣ', 'əNN'), 83 | ('ㄤ', 'aNg'), 84 | ('ㄧㄥ', 'iNg'), 85 | ('ㄨㄥ', 'uNg'), 86 | ('ㄩㄥ', 'yuNg'), 87 | ('ㄥ', 'əNg'), 88 | ('ㄦ', 'əɻ'), 89 | ('ㄧ', 'i'), 90 | ('ㄨ', 'u'), 91 | ('ㄩ', 'ɥ'), 92 | ('ˉ', '→'), 93 | ('ˊ', '↑'), 94 | ('ˇ', '↓↑'), 95 | ('ˋ', '↓'), 96 | ('˙', ''), 97 | (',', ','), 98 | ('。', '.'), 99 | ('!', '!'), 100 | ('?', '?'), 101 | ('—', '-') 102 | ]] 103 | 104 | # List of (romaji, ipa) pairs: 105 | _romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 106 | ('ʃy', 'ʃ'), 107 | ('ʧʰy', 'ʧʰ'), 108 | ('ʧ⁼y', 'ʧ⁼'), 109 | ('NN', 'n'), 110 | ('Ng', 'ŋ'), 111 | ('y', 'j'), 112 | ('h', 'x') 113 | ]] 114 | 115 | # List of (bopomofo, ipa) pairs: 116 | _bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 117 | ('ㄅㄛ', 'p⁼wo'), 118 | ('ㄆㄛ', 'pʰwo'), 119 | ('ㄇㄛ', 'mwo'), 120 | ('ㄈㄛ', 'fwo'), 121 | ('ㄅ', 'p⁼'), 122 | ('ㄆ', 'pʰ'), 123 | ('ㄇ', 'm'), 124 | ('ㄈ', 'f'), 125 | ('ㄉ', 't⁼'), 126 | ('ㄊ', 'tʰ'), 127 | ('ㄋ', 'n'), 128 | ('ㄌ', 'l'), 129 | ('ㄍ', 'k⁼'), 130 | ('ㄎ', 'kʰ'), 131 | ('ㄏ', 'x'), 132 | ('ㄐ', 'tʃ⁼'), 133 | ('ㄑ', 'tʃʰ'), 134 | ('ㄒ', 'ʃ'), 135 | ('ㄓ', 'ts`⁼'), 136 | ('ㄔ', 'ts`ʰ'), 137 | ('ㄕ', 's`'), 138 | ('ㄖ', 'ɹ`'), 139 | ('ㄗ', 'ts⁼'), 140 | ('ㄘ', 'tsʰ'), 141 | ('ㄙ', 's'), 142 | ('ㄚ', 'a'), 143 | ('ㄛ', 'o'), 144 | ('ㄜ', 'ə'), 145 | ('ㄝ', 'ɛ'), 146 | ('ㄞ', 'aɪ'), 147 | ('ㄟ', 'eɪ'), 148 | ('ㄠ', 'ɑʊ'), 149 | ('ㄡ', 'oʊ'), 150 | ('ㄧㄢ', 'jɛn'), 151 | ('ㄩㄢ', 'ɥæn'), 152 | ('ㄢ', 'an'), 153 | ('ㄧㄣ', 'in'), 154 | ('ㄩㄣ', 'ɥn'), 155 | ('ㄣ', 'ən'), 156 | ('ㄤ', 'ɑŋ'), 157 | ('ㄧㄥ', 'iŋ'), 158 | ('ㄨㄥ', 'ʊŋ'), 159 | ('ㄩㄥ', 'jʊŋ'), 160 | ('ㄥ', 'əŋ'), 161 | ('ㄦ', 'əɻ'), 162 | ('ㄧ', 'i'), 163 | ('ㄨ', 'u'), 164 | ('ㄩ', 'ɥ'), 165 | ('ˉ', '→'), 166 | ('ˊ', '↑'), 167 | ('ˇ', '↓↑'), 168 | ('ˋ', '↓'), 169 | ('˙', ''), 170 | (',', ','), 171 | ('。', '.'), 172 | ('!', '!'), 173 | ('?', '?'), 174 | ('—', '-') 175 | ]] 176 | 177 | # List of (bopomofo, ipa2) pairs: 178 | _bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 179 | ('ㄅㄛ', 'pwo'), 180 | ('ㄆㄛ', 'pʰwo'), 181 | ('ㄇㄛ', 'mwo'), 182 | ('ㄈㄛ', 'fwo'), 183 | ('ㄅ', 'p'), 184 | ('ㄆ', 'pʰ'), 185 | ('ㄇ', 'm'), 186 | ('ㄈ', 'f'), 187 | ('ㄉ', 't'), 188 | ('ㄊ', 'tʰ'), 189 | ('ㄋ', 'n'), 190 | ('ㄌ', 'l'), 191 | ('ㄍ', 'k'), 192 | ('ㄎ', 'kʰ'), 193 | ('ㄏ', 'h'), 194 | ('ㄐ', 'tɕ'), 195 | ('ㄑ', 'tɕʰ'), 196 | ('ㄒ', 'ɕ'), 197 | ('ㄓ', 'tʂ'), 198 | ('ㄔ', 'tʂʰ'), 199 | ('ㄕ', 'ʂ'), 200 | ('ㄖ', 'ɻ'), 201 | ('ㄗ', 'ts'), 202 | ('ㄘ', 'tsʰ'), 203 | ('ㄙ', 's'), 204 | ('ㄚ', 'a'), 205 | ('ㄛ', 'o'), 206 | ('ㄜ', 'ɤ'), 207 | ('ㄝ', 'ɛ'), 208 | ('ㄞ', 'aɪ'), 209 | ('ㄟ', 'eɪ'), 210 | ('ㄠ', 'ɑʊ'), 211 | ('ㄡ', 'oʊ'), 212 | ('ㄧㄢ', 'jɛn'), 213 | ('ㄩㄢ', 'yæn'), 214 | ('ㄢ', 'an'), 215 | ('ㄧㄣ', 'in'), 216 | ('ㄩㄣ', 'yn'), 217 | ('ㄣ', 'ən'), 218 | ('ㄤ', 'ɑŋ'), 219 | ('ㄧㄥ', 'iŋ'), 220 | ('ㄨㄥ', 'ʊŋ'), 221 | ('ㄩㄥ', 'jʊŋ'), 222 | ('ㄥ', 'ɤŋ'), 223 | ('ㄦ', 'əɻ'), 224 | ('ㄧ', 'i'), 225 | ('ㄨ', 'u'), 226 | ('ㄩ', 'y'), 227 | ('ˉ', '˥'), 228 | ('ˊ', '˧˥'), 229 | ('ˇ', '˨˩˦'), 230 | ('ˋ', '˥˩'), 231 | ('˙', ''), 232 | (',', ','), 233 | ('。', '.'), 234 | ('!', '!'), 235 | ('?', '?'), 236 | 
('—', '-') 237 | ]] 238 | 239 | 240 | def number_to_chinese(text): 241 | numbers = re.findall(r'\d+(?:\.?\d+)?', text) 242 | for number in numbers: 243 | text = text.replace(number, cn2an.an2cn(number), 1) 244 | return text 245 | 246 | 247 | def chinese_to_bopomofo(text): 248 | text = text.replace('、', ',').replace(';', ',').replace(':', ',') 249 | words = jieba.lcut(text, cut_all=False) 250 | text = '' 251 | for word in words: 252 | bopomofos = lazy_pinyin(word, BOPOMOFO) 253 | if not re.search('[\u4e00-\u9fff]', word): 254 | text += word 255 | continue 256 | for i in range(len(bopomofos)): 257 | bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i]) 258 | if text != '': 259 | text += ' ' 260 | text += ''.join(bopomofos) 261 | return text 262 | 263 | 264 | def latin_to_bopomofo(text): 265 | for regex, replacement in _latin_to_bopomofo: 266 | text = re.sub(regex, replacement, text) 267 | return text 268 | 269 | 270 | def bopomofo_to_romaji(text): 271 | for regex, replacement in _bopomofo_to_romaji: 272 | text = re.sub(regex, replacement, text) 273 | return text 274 | 275 | 276 | def bopomofo_to_ipa(text): 277 | for regex, replacement in _bopomofo_to_ipa: 278 | text = re.sub(regex, replacement, text) 279 | return text 280 | 281 | 282 | def bopomofo_to_ipa2(text): 283 | for regex, replacement in _bopomofo_to_ipa2: 284 | text = re.sub(regex, replacement, text) 285 | return text 286 | 287 | 288 | def chinese_to_romaji(text): 289 | text = number_to_chinese(text) 290 | text = chinese_to_bopomofo(text) 291 | text = latin_to_bopomofo(text) 292 | text = bopomofo_to_romaji(text) 293 | text = re.sub('i([aoe])', r'y\1', text) 294 | text = re.sub('u([aoəe])', r'w\1', text) 295 | text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', 296 | r'\1ɹ`\2', text).replace('ɻ', 'ɹ`') 297 | text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text) 298 | return text 299 | 300 | 301 | def chinese_to_lazy_ipa(text): 302 | text = chinese_to_romaji(text) 303 | for regex, replacement in _romaji_to_ipa: 304 | text = re.sub(regex, replacement, text) 305 | return text 306 | 307 | 308 | def chinese_to_ipa(text): 309 | text = number_to_chinese(text) 310 | text = chinese_to_bopomofo(text) 311 | text = latin_to_bopomofo(text) 312 | text = bopomofo_to_ipa(text) 313 | text = re.sub('i([aoe])', r'j\1', text) 314 | text = re.sub('u([aoəe])', r'w\1', text) 315 | text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', 316 | r'\1ɹ`\2', text).replace('ɻ', 'ɹ`') 317 | text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text) 318 | return text 319 | 320 | 321 | def chinese_to_ipa2(text): 322 | text = number_to_chinese(text) 323 | text = chinese_to_bopomofo(text) 324 | text = latin_to_bopomofo(text) 325 | text = bopomofo_to_ipa2(text) 326 | text = re.sub(r'i([aoe])', r'j\1', text) 327 | text = re.sub(r'u([aoəe])', r'w\1', text) 328 | text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text) 329 | text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text) 330 | return text 331 | -------------------------------------------------------------------------------- /text/ngu_dialect.py: -------------------------------------------------------------------------------- 1 | import re 2 | import opencc 3 | 4 | 5 | dialects = {'SZ': 'suzhou', 'WX': 'wuxi', 'CZ': 'changzhou', 'HZ': 'hangzhou', 6 | 'SX': 'shaoxing', 'NB': 'ningbo', 'JJ': 'jingjiang', 'YX': 'yixing', 7 | 'JD': 'jiading', 'ZR': 'zhenru', 'PH': 'pinghu', 'TX': 'tongxiang', 8 | 'JS': 'jiashan', 'HN': 'xiashi', 'LP': 'linping', 'XS': 'xiaoshan', 9 | 'FY': 'fuyang', 'RA': 'ruao', 'CX': 'cixi', 'SM': 'sanmen', 10 | 'TT': 
/text/ngu_dialect.py:
--------------------------------------------------------------------------------
1 | import re
2 | import opencc
3 |
4 |
5 | dialects = {'SZ': 'suzhou', 'WX': 'wuxi', 'CZ': 'changzhou', 'HZ': 'hangzhou',
6 |             'SX': 'shaoxing', 'NB': 'ningbo', 'JJ': 'jingjiang', 'YX': 'yixing',
7 |             'JD': 'jiading', 'ZR': 'zhenru', 'PH': 'pinghu', 'TX': 'tongxiang',
8 |             'JS': 'jiashan', 'HN': 'xiashi', 'LP': 'linping', 'XS': 'xiaoshan',
9 |             'FY': 'fuyang', 'RA': 'ruao', 'CX': 'cixi', 'SM': 'sanmen',
10 |             'TT': 'tiantai', 'WZ': 'wenzhou', 'SC': 'suichang', 'YB': 'youbu'}
11 |
12 | converters = {}
13 |
14 | for dialect in dialects.values():
15 |     try:
16 |         converters[dialect] = opencc.OpenCC(dialect)
17 |     except Exception:  # this OpenCC build may not ship that dialect config
18 |         pass
19 |
20 |
21 | def ngu_dialect_to_ipa(text, dialect):
22 |     dialect = dialects[dialect]
23 |     text = converters[dialect].convert(text).replace('-','').replace('$',' ')
24 |     text = re.sub(r'[、;:]', ',', text)
25 |     text = re.sub(r'\s*,\s*', ', ', text)
26 |     text = re.sub(r'\s*。\s*', '. ', text)
27 |     text = re.sub(r'\s*?\s*', '? ', text)
28 |     text = re.sub(r'\s*!\s*', '! ', text)
29 |     text = re.sub(r'\s*$', '', text)
30 |     return text
31 |
--------------------------------------------------------------------------------
/text/sanskrit.py:
--------------------------------------------------------------------------------
1 | import re
2 | from indic_transliteration import sanscript
3 |
4 |
5 | # List of (iast, ipa) pairs:
6 | _iast_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
7 |     ('a', 'ə'),
8 |     ('ā', 'aː'),
9 |     ('ī', 'iː'),
10 |     ('ū', 'uː'),
11 |     ('ṛ', 'ɹ`'),
12 |     ('ṝ', 'ɹ`ː'),
13 |     ('ḷ', 'l`'),
14 |     ('ḹ', 'l`ː'),
15 |     ('e', 'eː'),
16 |     ('o', 'oː'),
17 |     ('k', 'k⁼'),
18 |     ('k⁼h', 'kʰ'),
19 |     ('g', 'g⁼'),
20 |     ('g⁼h', 'gʰ'),
21 |     ('ṅ', 'ŋ'),
22 |     ('c', 'ʧ⁼'),
23 |     ('ʧ⁼h', 'ʧʰ'),
24 |     ('j', 'ʥ⁼'),
25 |     ('ʥ⁼h', 'ʥʰ'),
26 |     ('ñ', 'n^'),
27 |     ('ṭ', 't`⁼'),
28 |     ('t`⁼h', 't`ʰ'),
29 |     ('ḍ', 'd`⁼'),
30 |     ('d`⁼h', 'd`ʰ'),
31 |     ('ṇ', 'n`'),
32 |     ('t', 't⁼'),
33 |     ('t⁼h', 'tʰ'),
34 |     ('d', 'd⁼'),
35 |     ('d⁼h', 'dʰ'),
36 |     ('p', 'p⁼'),
37 |     ('p⁼h', 'pʰ'),
38 |     ('b', 'b⁼'),
39 |     ('b⁼h', 'bʰ'),
40 |     ('y', 'j'),
41 |     ('ś', 'ʃ'),
42 |     ('ṣ', 's`'),
43 |     ('r', 'ɾ'),
44 |     ('l̤', 'l`'),
45 |     ('h', 'ɦ'),
46 |     ("'", ''),
47 |     ('~', '^'),
48 |     ('ṃ', '^')
49 | ]]
50 |
51 |
52 | def devanagari_to_ipa(text):
53 |     text = text.replace('ॐ', 'ओम्')
54 |     text = re.sub(r'\s*।\s*$', '.', text)
55 |     text = re.sub(r'\s*।\s*', ', ', text)
56 |     text = re.sub(r'\s*॥', '.', text)
57 |     text = sanscript.transliterate(text, sanscript.DEVANAGARI, sanscript.IAST)
58 |     for regex, replacement in _iast_to_ipa:
59 |         text = re.sub(regex, replacement, text)
60 |     text = re.sub('(.)[`ː]*ḥ', lambda x: x.group(0)
61 |                   [:-1]+'h'+x.group(1)+'*', text)
62 |     return text
63 |
--------------------------------------------------------------------------------
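Note: both converters above are thin wrappers. ngu_dialect_to_ipa resolves a two-letter code through the `dialects` table and runs the text through a per-dialect OpenCC converter (it raises KeyError if that config failed to load at import time), while devanagari_to_ipa transliterates to IAST before applying `_iast_to_ipa`. A minimal sketch under the assumption that an opencc build shipping the custom Wu-dialect configs ('suzhou', 'wuxi', ...) and the indic_transliteration package are available; both inputs are illustrative:

# Hedged usage sketch, not part of the repository.
from text.ngu_dialect import ngu_dialect_to_ipa
from text.sanskrit import devanagari_to_ipa

print(ngu_dialect_to_ipa('谢谢侬', 'SZ'))  # 'SZ' -> the 'suzhou' OpenCC config
print(devanagari_to_ipa('संस्कृतम्'))      # Devanagari -> IAST -> IPA replacement rules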
/text/shanghainese.py:
--------------------------------------------------------------------------------
1 | import re
2 | import cn2an
3 | import opencc
4 |
5 |
6 | converter = opencc.OpenCC('zaonhe')
7 |
8 | # List of (Latin alphabet, ipa) pairs:
9 | _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
10 |     ('A', 'ᴇ'),
11 |     ('B', 'bi'),
12 |     ('C', 'si'),
13 |     ('D', 'di'),
14 |     ('E', 'i'),
15 |     ('F', 'ᴇf'),
16 |     ('G', 'dʑi'),
17 |     ('H', 'ᴇtɕʰ'),
18 |     ('I', 'ᴀi'),
19 |     ('J', 'dʑᴇ'),
20 |     ('K', 'kʰᴇ'),
21 |     ('L', 'ᴇl'),
22 |     ('M', 'ᴇm'),
23 |     ('N', 'ᴇn'),
24 |     ('O', 'o'),
25 |     ('P', 'pʰi'),
26 |     ('Q', 'kʰiu'),
27 |     ('R', 'ᴀl'),
28 |     ('S', 'ᴇs'),
29 |     ('T', 'tʰi'),
30 |     ('U', 'ɦiu'),
31 |     ('V', 'vi'),
32 |     ('W', 'dᴀbɤliu'),
33 |     ('X', 'ᴇks'),
34 |     ('Y', 'uᴀi'),
35 |     ('Z', 'zᴇ')
36 | ]]
37 |
38 |
39 | def _number_to_shanghainese(num):
40 |     num = cn2an.an2cn(num).replace('一十','十').replace('二十', '廿').replace('二', '两')
41 |     return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num)
42 |
43 |
44 | def number_to_shanghainese(text):
45 |     return re.sub(r'\d+(?:\.?\d+)?', lambda x: _number_to_shanghainese(x.group()), text)
46 |
47 |
48 | def latin_to_ipa(text):
49 |     for regex, replacement in _latin_to_ipa:
50 |         text = re.sub(regex, replacement, text)
51 |     return text
52 |
53 |
54 | def shanghainese_to_ipa(text):
55 |     text = number_to_shanghainese(text.upper())
56 |     text = converter.convert(text).replace('-','').replace('$',' ')
57 |     text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text)
58 |     text = re.sub(r'[、;:]', ',', text)
59 |     text = re.sub(r'\s*,\s*', ', ', text)
60 |     text = re.sub(r'\s*。\s*', '. ', text)
61 |     text = re.sub(r'\s*?\s*', '? ', text)
62 |     text = re.sub(r'\s*!\s*', '! ', text)
63 |     text = re.sub(r'\s*$', '', text)
64 |     return text
65 |
--------------------------------------------------------------------------------
/text/thai.py:
--------------------------------------------------------------------------------
1 | import re
2 | from num_thai.thainumbers import NumThai
3 |
4 |
5 | num = NumThai()
6 |
7 | # List of (Latin alphabet, Thai) pairs:
8 | _latin_to_thai = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
9 |     ('a', 'เอ'),
10 |     ('b','บี'),
11 |     ('c','ซี'),
12 |     ('d','ดี'),
13 |     ('e','อี'),
14 |     ('f','เอฟ'),
15 |     ('g','จี'),
16 |     ('h','เอช'),
17 |     ('i','ไอ'),
18 |     ('j','เจ'),
19 |     ('k','เค'),
20 |     ('l','แอล'),
21 |     ('m','เอ็ม'),
22 |     ('n','เอ็น'),
23 |     ('o','โอ'),
24 |     ('p','พี'),
25 |     ('q','คิว'),
26 |     ('r','แอร์'),
27 |     ('s','เอส'),
28 |     ('t','ที'),
29 |     ('u','ยู'),
30 |     ('v','วี'),
31 |     ('w','ดับเบิลยู'),
32 |     ('x','เอ็กซ์'),
33 |     ('y','วาย'),
34 |     ('z','แซด')  # was 'ซี', which duplicated the entry for 'c'
35 | ]]
36 |
37 |
38 | def num_to_thai(text):
39 |     return re.sub(r'(?:\d+(?:,?\d+)?)+(?:\.\d+(?:,?\d+)?)?', lambda x: ''.join(num.NumberToTextThai(float(x.group(0).replace(',', '')))), text)
40 |
41 | def latin_to_thai(text):
42 |     for regex, replacement in _latin_to_thai:
43 |         text = re.sub(regex, replacement, text)
44 |     return text
45 |
--------------------------------------------------------------------------------
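Note: number handling differs between the two modules above. _number_to_shanghainese post-processes cn2an output ('二十'→'廿', '二'→'两', then a regex restores '二' after tens), while num_to_thai strips thousands separators and hands the float to num_thai. A minimal sketch with illustrative inputs, assuming the custom 'zaonhe' OpenCC config and the num_thai package are installed:

# Hedged usage sketch, not part of the repository.
from text.shanghainese import number_to_shanghainese, shanghainese_to_ipa
from text.thai import num_to_thai, latin_to_thai

print(number_to_shanghainese('22'))  # '廿二' -- note the 廿/两 rewriting rules
print(shanghainese_to_ipa('侬好'))    # runs through the 'zaonhe' converter built above
print(num_to_thai('1,250.5'))        # comma removed, then NumberToTextThai
print(latin_to_thai('ok'))           # case-insensitive letter-name mapping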
/transforms.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 |
4 | import numpy as np
5 |
6 |
7 | DEFAULT_MIN_BIN_WIDTH = 1e-3
8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3
9 | DEFAULT_MIN_DERIVATIVE = 1e-3
10 |
11 |
12 | def piecewise_rational_quadratic_transform(inputs,
13 |                                            unnormalized_widths,
14 |                                            unnormalized_heights,
15 |                                            unnormalized_derivatives,
16 |                                            inverse=False,
17 |                                            tails=None,
18 |                                            tail_bound=1.,
19 |                                            min_bin_width=DEFAULT_MIN_BIN_WIDTH,
20 |                                            min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
21 |                                            min_derivative=DEFAULT_MIN_DERIVATIVE):
22 |
23 |     if tails is None:
24 |         spline_fn = rational_quadratic_spline
25 |         spline_kwargs = {}
26 |     else:
27 |         spline_fn = unconstrained_rational_quadratic_spline
28 |         spline_kwargs = {
29 |             'tails': tails,
30 |             'tail_bound': tail_bound
31 |         }
32 |
33 |     outputs, logabsdet = spline_fn(
34 |         inputs=inputs,
35 |         unnormalized_widths=unnormalized_widths,
36 |         unnormalized_heights=unnormalized_heights,
37 |         unnormalized_derivatives=unnormalized_derivatives,
38 |         inverse=inverse,
39 |         min_bin_width=min_bin_width,
40 |         min_bin_height=min_bin_height,
41 |         min_derivative=min_derivative,
42 |         **spline_kwargs
43 |     )
44 |     return outputs, logabsdet
45 |
46 |
47 | def searchsorted(bin_locations, inputs, eps=1e-6):
48 |     bin_locations[..., -1] += eps
49 |     return torch.sum(
50 |         inputs[..., None] >= bin_locations,
51 |         dim=-1
52 |     ) - 1
53 |
54 |
55 | def unconstrained_rational_quadratic_spline(inputs,
56 |                                             unnormalized_widths,
57 |                                             unnormalized_heights,
58 |                                             unnormalized_derivatives,
59 |                                             inverse=False,
60 |                                             tails='linear',
61 |                                             tail_bound=1.,
62 |                                             min_bin_width=DEFAULT_MIN_BIN_WIDTH,
63 |                                             min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
64 |                                             min_derivative=DEFAULT_MIN_DERIVATIVE):
65 |     inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
66 |     outside_interval_mask = ~inside_interval_mask
67 |
68 |     outputs = torch.zeros_like(inputs)
69 |     logabsdet = torch.zeros_like(inputs)
70 |
71 |     if tails == 'linear':
72 |         unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
73 |         constant = np.log(np.exp(1 - min_derivative) - 1)
74 |         unnormalized_derivatives[..., 0] = constant
75 |         unnormalized_derivatives[..., -1] = constant
76 |
77 |         outputs[outside_interval_mask] = inputs[outside_interval_mask]
78 |         logabsdet[outside_interval_mask] = 0
79 |     else:
80 |         raise RuntimeError('{} tails are not implemented.'.format(tails))
81 |
82 |     outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
83 |         inputs=inputs[inside_interval_mask],
84 |         unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
85 |         unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
86 |         unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
87 |         inverse=inverse,
88 |         left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
89 |         min_bin_width=min_bin_width,
90 |         min_bin_height=min_bin_height,
91 |         min_derivative=min_derivative
92 |     )
93 |
94 |     return outputs, logabsdet
95 |
96 | def rational_quadratic_spline(inputs,
97 |                               unnormalized_widths,
98 |                               unnormalized_heights,
99 |                               unnormalized_derivatives,
100 |                               inverse=False,
101 |                               left=0., right=1., bottom=0., top=1.,
102 |                               min_bin_width=DEFAULT_MIN_BIN_WIDTH,
103 |                               min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
104 |                               min_derivative=DEFAULT_MIN_DERIVATIVE):
105 |     if torch.min(inputs) < left or torch.max(inputs) > right:
106 |         raise ValueError('Input to a transform is not within its domain')
107 |
108 |     num_bins = unnormalized_widths.shape[-1]
109 |
110 |     if min_bin_width * num_bins > 1.0:
111 |         raise ValueError('Minimal bin width too large for the number of bins')
112 |     if min_bin_height * num_bins > 1.0:
113 |         raise ValueError('Minimal bin height too large for the number of bins')
114 |
115 |     widths = F.softmax(unnormalized_widths, dim=-1)
116 |     widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
117 |     cumwidths = torch.cumsum(widths, dim=-1)
118 |     cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
119 |     cumwidths = (right - left) * cumwidths + left
120 |     cumwidths[..., 0] = left
121 |     cumwidths[..., -1] = right
122 |     widths = cumwidths[..., 1:] - cumwidths[..., :-1]
123 |
124 |     derivatives = min_derivative + F.softplus(unnormalized_derivatives)
125 |
126 |     heights = F.softmax(unnormalized_heights, dim=-1)
127 |     heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
128 |     cumheights = torch.cumsum(heights, dim=-1)
129 |     cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
130 |     cumheights = (top - bottom) * cumheights + bottom
131 |     cumheights[..., 0] = bottom
132 |     cumheights[..., -1] = top
133 |     heights = cumheights[..., 1:] - cumheights[..., :-1]
134 |
135 |     if inverse:
136 |         bin_idx = searchsorted(cumheights, inputs)[..., None]
137 |     else:
138 |         bin_idx = searchsorted(cumwidths, inputs)[..., None]
139 |
140 |     input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
141 |     input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
142 |
143 |     input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
144 |     delta = heights / widths
145 |     input_delta = delta.gather(-1, bin_idx)[..., 0]
146 |
147 |     input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
148 |     input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
149 |
150 |     input_heights = heights.gather(-1, bin_idx)[..., 0]
151 |
152 |     if inverse:
153 |         a = (((inputs - input_cumheights) * (input_derivatives
154 |                                              + input_derivatives_plus_one
155 |                                              - 2 * input_delta)
156 |               + input_heights * (input_delta - input_derivatives)))
157 |         b = (input_heights * input_derivatives
158 |              - (inputs - input_cumheights) * (input_derivatives
159 |                                               + input_derivatives_plus_one
160 |                                               - 2 * input_delta))
161 |         c = - input_delta * (inputs - input_cumheights)
162 |
163 |         discriminant = b.pow(2) - 4 * a * c
164 |         assert (discriminant >= 0).all()
165 |
166 |         root = (2 * c) / (-b - torch.sqrt(discriminant))
167 |         outputs = root * input_bin_widths + input_cumwidths
168 |
169 |         theta_one_minus_theta = root * (1 - root)
170 |         denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
171 |                                      * theta_one_minus_theta)
172 |         derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
173 |                                                      + 2 * input_delta * theta_one_minus_theta
174 |                                                      + input_derivatives * (1 - root).pow(2))
175 |         logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
176 |
177 |         return outputs, -logabsdet
178 |     else:
179 |         theta = (inputs - input_cumwidths) / input_bin_widths
180 |         theta_one_minus_theta = theta * (1 - theta)
181 |
182 |         numerator = input_heights * (input_delta * theta.pow(2)
183 |                                      + input_derivatives * theta_one_minus_theta)
184 |         denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
185 |                                      * theta_one_minus_theta)
186 |         outputs = input_cumheights + numerator / denominator
187 |
188 |         derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
189 |                                                      + 2 * input_delta * theta_one_minus_theta
190 |                                                      + input_derivatives * (1 - theta).pow(2))
191 |         logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
192 |
193 |         return outputs, logabsdet
194 |
--------------------------------------------------------------------------------
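Note: transforms.py above implements the monotonic piecewise rational-quadratic spline (in the style of neural spline flows), which serves as an invertible coupling in the VITS flow layers. A quick way to make the contract concrete is a round trip: running the transform forward and then with inverse=True should recover the input, and the two log-determinants should cancel. A minimal sketch with random parameters; the shapes, num_bins, and tolerance are arbitrary illustration choices. With tails='linear', the derivative tensor carries only num_bins - 1 interior knots, because F.pad supplies the two boundary values:

# Hedged round-trip check, not part of the repository.
import torch
from transforms import piecewise_rational_quadratic_transform

torch.manual_seed(0)
num_bins = 10
x = torch.rand(4, 8) * 2 - 1            # samples inside the [-1, 1] tail bound
w = torch.randn(4, 8, num_bins)         # unnormalized bin widths
h = torch.randn(4, 8, num_bins)         # unnormalized bin heights
d = torch.randn(4, 8, num_bins - 1)     # interior knot derivatives only

y, ldj = piecewise_rational_quadratic_transform(
    x, w, h, d, inverse=False, tails='linear', tail_bound=1.)
x_back, ldj_inv = piecewise_rational_quadratic_transform(
    y, w, h, d, inverse=True, tails='linear', tail_bound=1.)

print(torch.allclose(x, x_back, atol=1e-4))                  # inverse undoes forward
print(torch.allclose(ldj, -ldj_inv, atol=1e-4))              # log-determinants cancel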
/utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from json import loads
3 | from torch import load, FloatTensor
4 | from numpy import float32
5 | import librosa
6 |
7 |
8 | class HParams():
9 |     def __init__(self, **kwargs):
10 |         for k, v in kwargs.items():
11 |             if type(v) == dict:
12 |                 v = HParams(**v)
13 |             self[k] = v
14 |
15 |     def keys(self):
16 |         return self.__dict__.keys()
17 |
18 |     def items(self):
19 |         return self.__dict__.items()
20 |
21 |     def values(self):
22 |         return self.__dict__.values()
23 |
24 |     def __len__(self):
25 |         return len(self.__dict__)
26 |
27 |     def __getitem__(self, key):
28 |         return getattr(self, key)
29 |
30 |     def __setitem__(self, key, value):
31 |         return setattr(self, key, value)
32 |
33 |     def __contains__(self, key):
34 |         return key in self.__dict__
35 |
36 |     def __repr__(self):
37 |         return self.__dict__.__repr__()
38 |
39 |
40 | def load_checkpoint(checkpoint_path, model):
41 |     checkpoint_dict = load(checkpoint_path, map_location='cpu')
42 |     iteration = checkpoint_dict['iteration']
43 |     saved_state_dict = checkpoint_dict['model']
44 |     if hasattr(model, 'module'):
45 |         state_dict = model.module.state_dict()
46 |     else:
47 |         state_dict = model.state_dict()
48 |     new_state_dict = {}
49 |     for k, v in state_dict.items():
50 |         try:
51 |             new_state_dict[k] = saved_state_dict[k]
52 |         except KeyError:
53 |             logging.info("%s is not in the checkpoint" % k)
54 |             new_state_dict[k] = v
55 |     if hasattr(model, 'module'):
56 |         model.module.load_state_dict(new_state_dict)
57 |     else:
58 |         model.load_state_dict(new_state_dict)
59 |     logging.info("Loaded checkpoint '{}' (iteration {})".format(
60 |         checkpoint_path, iteration))
61 |     return
62 |
63 |
64 | def get_hparams_from_file(config_path):
65 |     with open(config_path, "r") as f:
66 |         data = f.read()
67 |     config = loads(data)
68 |
69 |     hparams = HParams(**config)
70 |     return hparams
71 |
72 |
73 | def load_audio_to_torch(full_path, target_sampling_rate):
74 |     audio, sampling_rate = librosa.load(full_path, sr=target_sampling_rate, mono=True)
75 |     return FloatTensor(audio.astype(float32))
76 |
--------------------------------------------------------------------------------
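Note: tying the helpers above together: get_hparams_from_file parses a VITS config.json into a nested HParams tree (so both attribute and dict-style access work), load_checkpoint copies matching keys into the model and logs, rather than failing on, any key missing from the checkpoint, and load_audio_to_torch resamples to the model's rate and returns a mono FloatTensor. A minimal sketch; the config path is a placeholder, not a file shipped with the repo:

# Hedged usage sketch, not part of the repository.
import utils

hps = utils.get_hparams_from_file('./model/config.json')     # placeholder path
print(hps.data.sampling_rate, 'data' in hps)                 # attribute and dict-style access via HParams
audio = utils.load_audio_to_torch('./output.wav', hps.data.sampling_rate)
print(audio.shape, audio.dtype)                              # 1-D torch.float32 waveform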