├── README.md ├── app.py ├── damo ├── speech_frcrn_ans_cirm_16k │ └── readme.txt └── speech_personal_sambert-hifigan_nsf_tts_zh-cn_pretrain_16k │ └── readme.txt ├── gitattributes ├── listen ├── output_training_data └── __init__.py ├── packages.txt ├── pretrain_work_dir └── __init__.py ├── requirements.txt └── trained_model └── __init__.py /README.md: -------------------------------------------------------------------------------- 1 | # SambertTTS-WebUI/SambertHifigan中文语音克隆UI 2 | 3 | **UI界面/User Interface** 4 | 5 |  6 | 7 | **体验地址:https://huggingface.co/spaces/TUHs/Genshin_impactTTS** 8 | 9 | **一个gradio WebUI运行阿里的SambertHifigan中文语音克隆模型** 10 | 11 | A gradio web UI for running SambertHifigan 12 | 13 |  14 | 15 | 16 | 17 | **训练五分钟,推理10分钟,效果90+的语音克隆模型** 18 | 19 | A voice cloning model that trains for five minutes, infers for ten minutes, and has an accuracy of over 90% 20 | 21 |  22 | 23 | 24 | 支持 **[一键训练]**、**[声音合成]**、**[模型修改]**、**[AI降噪]**、**[缓存清理]** 25 | 26 | This has [training], [inference], [model modification], [noise reduction], and [cache cleaning] functions. 27 | 28 |  29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | # 使用方法/Usage 40 | 41 | 42 | **1.[一键训练] : 上传或录制音频,程序会自动标注音频,一键训练模型,支持训练后推理试听,支持模型保存** 43 | 44 | [Training]: Upload or record audio; the program labels the audio automatically and trains the model with one click. You can listen to a test inference after training and save the model. 45 | 46 |  47 | 48 |  49 | 50 | **2.[声音合成] : 在这里可以选择已保存的模型进行推理,自带可调机械降噪,可以任意选择已训练的音频进行推理** 51 | 52 | [Inference]: Here you can choose a saved model for inference, which comes with adjustable mechanical noise reduction. You can choose any trained audio for inference. 53 | 54 |  55 | 56 |  57 | 58 | **3.[模型修改] : 在这里可以选择已保存的模型进行重命名,方便日后推理使用** 59 | 60 | [Model modification]: Here you can choose a saved model for renaming, which is convenient for future inference use. 
"""Gradio web UI for training and running the SambertHifigan voice-cloning model.

Usage notes carried over from the project README:

* [Model modification]: choose a saved model and rename it, which makes it
  easier to pick for later inference.
* [Noise reduction]: upload audio for AI noise reduction; noise and
  miscellaneous sounds are removed with one click.
* [Cache cleaning]: if an error occurs during training, try cleaning the
  cache. The cache is cleaned automatically every time a model is saved; if
  you restart training without saving, clean the cache first.

Warning: do not generate content that will harm individuals or organizations.
"""
import subprocess
import random
import os
import glob
import shutil
import datetime
from pathlib import Path
import librosa
from scipy.io import wavfile
import numpy as np
import torch
import csv
import whisper
import gradio as gr
import soundfile as sf

# Runtime installs; they run before `sox` and the modelscope modules are imported.
os.system("pip install --upgrade Cython==0.29.35")
os.system("pip install pysptk --no-build-isolation")
os.system("pip install kantts -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html")
os.system("pip install tts-autolabel -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html")

import sox


def split_long_audio(model, filepaths, save_dir="data_dir", out_sr=44100):
    """Cut audio files into per-segment 16-bit WAV clips using Whisper timestamps.

    Args:
        model: Object exposing ``transcribe(path, ...)`` that returns a dict
            with a ``'segments'`` list of ``{'start', 'end'}`` entries (seconds).
        filepaths: A single path or an iterable of paths to audio files.
        save_dir: Directory receiving the clips (created if missing).
        out_sr: Sample rate of the written clips.

    Clips are written as ``{file_idx}_{segment_idx}.wav`` inside ``save_dir``.
    """
    if isinstance(filepaths, str):
        filepaths = [filepaths]

    # Loop-invariant: create the output directory once.
    save_path = Path(save_dir)
    save_path.mkdir(exist_ok=True, parents=True)
    int16_max = np.iinfo(np.int16).max

    for file_idx, filepath in enumerate(filepaths):
        print(f"Transcribing file {file_idx}: '{filepath}' to segments...")
        result = model.transcribe(filepath, word_timestamps=True, task="transcribe", beam_size=5, best_of=5)
        segments = result['segments']

        wav, sr = librosa.load(filepath, sr=None, offset=0, duration=None, mono=True)
        wav, trim_index = librosa.effects.trim(wav, top_db=20)
        if wav.size == 0:
            print(f"Skipping file {file_idx}: no audio left after trimming silence")
            continue
        # The transcription ran on the untrimmed file, so its timestamps must be
        # shifted by the amount of leading silence that trim() removed.
        trim_offset = trim_index[0] / sr

        peak = np.abs(wav).max()
        if peak > 1.0:
            wav = 0.98 * wav / peak
        wav2 = librosa.resample(wav, orig_sr=sr, target_sr=out_sr)
        resampled_peak = np.abs(wav2).max()
        if resampled_peak > 0:  # avoid dividing by zero on digital silence
            wav2 /= resampled_peak

        for i, seg in enumerate(segments):
            start = max(0, int((seg['start'] - trim_offset) * out_sr))
            end = max(0, int((seg['end'] - trim_offset) * out_sr))
            wav_seg = wav2[start:end]
            if wav_seg.size == 0:
                # Segment lies entirely inside the trimmed-away silence.
                continue
            out_fpath = save_path / f"{file_idx}_{i}.wav"
            wavfile.write(out_fpath, rate=out_sr, data=(wav_seg * int16_max).astype(np.int16))


device = 'cuda' if torch.cuda.is_available() else 'cpu'
whisper_size = "medium"
whisper_model = whisper.load_model(whisper_size).to(device)

from modelscope.tools import run_auto_label

from modelscope.models.audio.tts import SambertHifigan
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from modelscope.utils.audio.audio_utils import TtsTrainType

pretrained_model_id = 'damo/speech_personal_sambert-hifigan_nsf_tts_zh-cn_pretrain_16k'

dataset_id = "/home/user/app/output_training_data/"
pretrain_work_dir = "/home/user/app/pretrain_work_dir/"


def auto_label(Voicetoclone, VoiceMicrophone):
    """Split the supplied recording into clips and run automatic labelling.

    The microphone recording takes precedence over the uploaded file.

    Returns:
        "标注成功" on success, otherwise a message starting with "标注失败"
        (previously success was reported even when labelling raised).
    """
    audio = VoiceMicrophone if VoiceMicrophone is not None else Voicetoclone
    if audio is None:
        return "标注失败: 请先上传或录制音频"

    try:
        split_long_audio(whisper_model, audio, "/home/user/app/test_wavs/")
        input_wav = "/home/user/app/test_wavs/"
        output_data = "/home/user/app/output_training_data/"
        run_auto_label(input_wav=input_wav, work_dir=output_data, resource_revision="v1.0.7")
    except Exception as e:
        print(f"auto_label failed: {e!r}")
        return f"标注失败: {e}"

    return "标注成功"


def train(train_step):
    """Fine-tune the Sambert acoustic model on the auto-labelled dataset.

    Args:
        train_step: Requested number of steps; rounded down to a multiple of 20.

    Returns:
        "训练完成" on success, otherwise a message starting with "训练失败"
        (previously completion was reported even when training raised).
    """
    try:
        # Rounded once and clamped to 20: the original produced 0 for
        # train_step < 20, i.e. a zero save/log interval.
        steps = max(20, int(train_step / 20) * 20)

        train_info = {
            TtsTrainType.TRAIN_TYPE_SAMBERT: {  # configure training of the AM (sambert) model
                'train_steps': steps + 2,  # number of steps to train
                'save_interval_steps': steps,  # save a checkpoint every this many steps
                'log_interval': steps  # print a training log every this many steps
            }
        }

        kwargs = dict(
            model=pretrained_model_id,  # model to fine-tune
            model_revision="v1.0.6",
            work_dir=pretrain_work_dir,  # temporary working directory
            train_dataset=dataset_id,  # dataset id
            train_type=train_info  # what to train and with which parameters
        )

        trainer = build_trainer(Trainers.speech_kantts_trainer,
                                default_args=kwargs)

        trainer.train()

    except Exception as e:
        print(f"train failed: {e!r}")
        return f"训练失败: {e}"

    return "训练完成"


# Saving the model

# Working directories that are wiped and recreated when the cache is reset.
_CACHE_DIRS = (
    "/home/user/app/output_training_data",
    "/home/user/app/pretrain_work_dir",
    "/home/user/app/test_wavs",
)

# Paths (relative to a saved model folder) removed after copying; entries may
# contain glob wildcards.
_SAVED_MODEL_PRUNE = (
    "tmp_voc",
    "tmp_am/ckpt/checkpoint_2400000.pth",
    "orig_model/description",
    "orig_model/.mdl",
    "orig_model/.msc",
    "orig_model/README.md",
    "orig_model/resource",
    "orig_model/basemodel_16k/sambert",
    "orig_model/basemodel_16k/speaker_embedding",
    "data/duration",
    "data/energy",
    "data/f0",
    "data/frame_energy",
    "data/frame_f0",
    "data/frame_uv",
    "data/mel",
    "data/raw_duration",
    "data/wav",
    "data/am_train.lst",
    "data/am_valid.lst",
    "data/badlist.txt",
    "data/raw_metafile.txt",
    "data/Script.xml",
    "data/train.lst",
    "data/valid.lst",
    "data/se/0_*",
)


def _reset_training_cache():
    """Delete and recreate every working directory, tolerating missing ones."""
    for cache_dir in _CACHE_DIRS:
        shutil.rmtree(cache_dir, ignore_errors=True)
    for cache_dir in _CACHE_DIRS:
        os.makedirs(cache_dir, exist_ok=True)


def save_model(worked_dir, dest_dir):
    """Copy the training work dir to a timestamped folder, prune it, reset caches.

    Args:
        worked_dir: Ignored; kept for interface compatibility.
        dest_dir: Ignored; kept for interface compatibility.

    Returns:
        A status message naming the saved model, or a failure message when the
        work directory is missing or empty.
    """
    # NOTE: both arguments are overridden with fixed paths, as in the original.
    worked_dir = "/home/user/app/pretrain_work_dir"
    dest_dir = "/home/user/app/trained_model"

    if not (os.path.isdir(worked_dir) and os.listdir(worked_dir)):
        return "保存失败,模型已保存或已被清除"

    date_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    dest_folder = os.path.join(dest_dir, date_str)
    shutil.copytree(worked_dir, dest_folder)

    for pattern in _SAVED_MODEL_PRUNE:
        # glob() expands wildcard entries such as "data/se/0_*"; the original
        # tested the literal pattern with os.path.exists, which never matched.
        for item_path in glob.glob(os.path.join(dest_folder, pattern)):
            if os.path.isdir(item_path):
                shutil.rmtree(item_path)
            else:
                os.remove(item_path)

    _reset_training_cache()

    return f"模型已成功保存为 {date_str}"


def _build_tts_pipeline(model_dir):
    """Build a text-to-speech pipeline from the checkpoints under ``model_dir``."""
    hifigan_dir = os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan')
    custom_ckpt = {
        'voice_name': 'F7',
        'am_ckpt': os.path.join(model_dir, 'tmp_am', 'ckpt'),
        'am_config': os.path.join(model_dir, 'tmp_am', 'config.yaml'),
        'voc_ckpt': os.path.join(hifigan_dir, 'ckpt'),
        'voc_config': os.path.join(hifigan_dir, 'config.yaml'),
        'audio_config': os.path.join(model_dir, 'data', 'audio_config.yaml'),
        'se_file': os.path.join(model_dir, 'data', 'se', 'se.npy'),
    }
    model = SambertHifigan(os.path.join(model_dir, "orig_model"), custom_ckpt=custom_ckpt)
    return pipeline(task=Tasks.text_to_speech, model=model)


def _unique_stem():
    """Return a file-name stem made of a timestamp plus four random digits."""
    date_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    return date_str + str(random.randint(1000, 9999))


def _spectral_gate(y, noise_level):
    """Zero STFT bins whose magnitude is below ``mean + noise_level * std``.

    The noise statistics come from the bins under the 95th magnitude
    percentile. Everything is computed on magnitudes: the original fed the
    complex STFT to ``np.percentile``/``mean``/``std``, which recent NumPy
    versions reject for percentile and which does not measure a noise floor.
    """
    spectrum = librosa.stft(y)
    magnitude = np.abs(spectrum)
    noise = magnitude[magnitude < np.percentile(magnitude, 95)]
    if noise.size:  # empty when every bin has the same magnitude
        threshold = noise.mean() + noise_level * noise.std()
        spectrum[magnitude < threshold] = 0
    return librosa.istft(spectrum)


def _synthesize(model_dir, text, noise_level, raw_suffix, out_suffix):
    """Synthesize ``text``, denoise it and return the path of the written WAV."""
    output = _build_tts_pipeline(model_dir)(input=text)

    stem = _unique_stem()
    raw_path = stem + raw_suffix
    out_path = stem + out_suffix

    # 'x' mode refuses to overwrite an existing file with the same stem.
    with open(raw_path, mode='bx') as f:
        f.write(output["output_wav"])

    try:
        y, sr = librosa.load(raw_path)
        sf.write(out_path, _spectral_gate(y, noise_level), sr)
    finally:
        # The raw file is only an intermediate; remove it even on failure.
        os.remove(raw_path)

    return out_path


def infer(text):
    """Synthesize ``text`` with the model currently in the training work dir.

    Returns:
        Path of the denoised WAV file (``<stem>testfile.wav``).
    """
    return _synthesize("/home/user/app/pretrain_work_dir/", text, 2, "0.wav", "testfile.wav")


def infer_custom(model_name, text, noise_level):
    """Synthesize ``text`` with a saved model.

    Args:
        model_name: Folder name of the model under the trained-model directory.
        text: Text to synthesize.
        noise_level: Multiplier of the noise standard deviation used as the
            gating threshold; larger values remove more.

    Returns:
        Path of the denoised WAV file (``<stem>customfile.wav``).
    """
    custom_model_dir = os.path.join("/home/user/app/trained_model/", model_name)
    return _synthesize(custom_model_dir, text, noise_level, ".wav", "customfile.wav")


trained_model = "/home/user/app/trained_model/"


def update_model_dropdown(inp3):
    """Return a dropdown listing the saved models, keeping ``inp3`` selected.

    Only directories are listed (sorted), so stray files in the model folder
    such as ``__init__.py`` are not offered as models.
    """
    model_list = sorted(
        name for name in os.listdir(trained_model)
        if os.path.isdir(os.path.join(trained_model, name))
    )
    return gr.Dropdown(choices=model_list, value=inp3)


def _is_plain_name(name):
    """Return True if ``name`` is a non-empty single path component."""
    return bool(name) and name not in (".", "..") and os.path.basename(name) == name


def rename_model(old_name, new_name):
    """Rename a saved model folder and return a status message.

    Both names must be plain folder names: an empty ``old_name`` would
    otherwise resolve to the model root itself, and a ``new_name`` containing
    path separators could move the model outside the model directory.
    """
    if not _is_plain_name(old_name) or not os.path.isdir(os.path.join(trained_model, old_name)):
        return "模型名称不存在,请重新输入!"

    new_name = (new_name or "").strip()
    if not _is_plain_name(new_name):
        return "新名称无效,请重新输入!"

    target = os.path.join(trained_model, new_name)
    # Checked explicitly: on POSIX os.rename silently replaces an empty directory.
    if os.path.exists(target):
        return "新名称已经存在,请重新输入!"

    try:
        os.rename(os.path.join(trained_model, old_name), target)
    except OSError:
        return "新名称已经存在,请重新输入!"
    return "模型重命名成功!"
337 | 338 | 339 | # 清除训练缓存 340 | def clear_cache(a): 341 | shutil.rmtree("/home/user/app/output_training_data") 342 | shutil.rmtree("/home/user/app/pretrain_work_dir") 343 | shutil.rmtree("/home/user/app/test_wavs") 344 | 345 | os.mkdir("/home/user/app/output_training_data") 346 | os.mkdir("/home/user/app/pretrain_work_dir") 347 | os.mkdir("/home/user/app/test_wavs") 348 | return "已清除缓存,请返回训练页面重新训练" 349 | 350 | 351 | from textwrap import dedent 352 | 353 | 354 | 355 | def FRCRN_De_Noise(noise_wav, noisemic_wav): 356 | 357 | if noisemic_wav is not None: 358 | noise_audio = noisemic_wav 359 | else: 360 | noise_audio = noise_wav 361 | 362 | ans = pipeline( 363 | Tasks.acoustic_noise_suppression, 364 | model='/home/yiho/Personal-TTS-v3/damo/speech_frcrn_ans_cirm_16k') 365 | 366 | now = datetime.datetime.now() 367 | date_str = now.strftime("%Y%m%d%H%M%S") 368 | rand_num = random.randint(1000, 9999) 369 | filename = date_str + str(rand_num) 370 | 371 | result = ans( 372 | noise_audio, 373 | output_path= filename + "AIdenoise.wav" ) 374 | 375 | return filename + "AIdenoise.wav" 376 | 377 | def Normal_De_Noise(noise_wav, noisemic_wav, noise_level): 378 | if noisemic_wav is not None: 379 | noise_audio = noisemic_wav 380 | else: 381 | noise_audio = noise_wav 382 | 383 | now = datetime.datetime.now() 384 | date_str = now.strftime("%Y%m%d%H%M%S") 385 | rand_num = random.randint(1000, 9999) 386 | filename = date_str + str(rand_num) 387 | 388 | 389 | y, sr = librosa.load(noise_audio) 390 | 391 | S = librosa.stft(y) 392 | 393 | noise = S[np.abs(S) < np.percentile(S, 95)] 394 | noise_mean, noise_std = np.mean(noise), np.std(noise) 395 | 396 | filter_ = np.ones_like(S) 397 | filter_[np.abs(S) < noise_mean + noise_level * noise_std] = 0 398 | 399 | filtered_S = filter_ * S 400 | 401 | filtered_y = librosa.istft(filtered_S) 402 | 403 | sf.write(filename + "denoise.wav", filtered_y, sr) 404 | 405 | return filename + "denoise.wav" 406 | 407 | 408 | app = gr.Blocks() 409 | 410 | with 
app: 411 | gr.Markdown("#