├── version.txt ├── python ├── mmvc.ico ├── requirements.txt ├── makeexe.ps1 ├── install_pipenv.ps1 ├── Pipfile ├── commons.py ├── compile.md ├── symbols.py ├── output_audio_device_list.py ├── snake.py ├── onnx_bench.py ├── index.py ├── setup_check.py ├── rec_environmental_noise.py ├── features.py ├── modules.py ├── residual_block.py ├── generator.py ├── models.py └── mmvc_client.py ├── conf ├── myprofile_CUDA_sample.conf ├── myprofile_ONNX_sample.conf ├── myprofile.conf └── myprofile_ONNX_output_sample.conf ├── .gitignore ├── LICENSE └── README.md /version.txt: -------------------------------------------------------------------------------- 1 | v0.5.0.0 2 | -------------------------------------------------------------------------------- /python/mmvc.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/isletennos/MMVC_Client/HEAD/python/mmvc.ico -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | noisereduce==2.0.0 2 | numpy==1.23.5 3 | protobuf==3.20.3 4 | pyworld==0.3.2 5 | #PyAudio==0.2.11 6 | sounddevice==0.4.4 7 | SoundFile==0.10.3.post1 8 | -f https://download.pytorch.org/whl/torch_stable.html 9 | torch==1.10.1+cu111 10 | onnxruntime-directml==1.13.1 11 | -------------------------------------------------------------------------------- /python/makeexe.ps1: -------------------------------------------------------------------------------- 1 | pipenv run pyinstaller mmvc_client.py --add-binary "./.venv/Lib/site-packages/onnxruntime/capi/onnxruntime_providers_shared.dll;./onnxruntime/capi/" --add-binary "./.venv/Lib/site-packages/onnxruntime/capi/DirectML.dll;./onnxruntime/capi/" --collect-data librosa --onedir --icon=mmvc.ico --clean -y 2 | pipenv run pyinstaller output_audio_device_list.py --onefile 3 | pipenv run pyinstaller rec_environmental_noise.py --onefile 4 | pipenv run pyinstaller setup_check.py --onefile 5 | -------------------------------------------------------------------------------- /python/install_pipenv.ps1: -------------------------------------------------------------------------------- 1 | pip install --upgrade pip 2 | pip install pipenv 3 | $pythonUserPath = python -m site --user-site 4 | $pythonUserPath = $pythonUserPath.Replace('site-packages', 'Scripts') 5 | $ENV:Path += ";" + $pythonUserPath 6 | $userPath = [System.Environment]::GetEnvironmentVariable("Path", "User") 7 | $userPath += ";" + $pythonUserPath 8 | [System.Environment]::SetEnvironmentVariable("Path", $userPath, "User") 9 | $ENV:PIPENV_VENV_IN_PROJECT = '.venv' 10 | [System.Environment]::SetEnvironmentVariable("PIPENV_VENV_IN_PROJECT", ".venv", "User") 11 | -------------------------------------------------------------------------------- /python/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [[source]] 7 | url = "https://download.pytorch.org/whl/cu111/" 8 | verify_ssl = true 9 | name = "downloadpytorch" 10 | 11 | [packages] 12 | torch = {version = "==1.10.1+cu111", index = "downloadpytorch"} 13 | noisereduce = "==2.0.0" 14 | scikit-learn = "==1.0.2" 15 | sounddevice = "==0.4.4" 16 | SoundFile = "==0.10.3.post1" 17 | numpy = "~=1.23" 18 | protobuf = "~=3.20" 19 | pyworld = "==0.3.2" 20 | onnxruntime-directml = "==1.13.1" 21 | pyinstaller = "*" 22 | 23 | 
[dev-packages] 24 | PyAudio = "~=0.2" 25 | py-cpuinfo = "~=9.0" 26 | psutil = "~=5.9" 27 | nvgpu = "~=0.9" 28 | 29 | [requires] 30 | python_version = "3.9" 31 | -------------------------------------------------------------------------------- /python/commons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def init_weights(m, mean=0.0, std=0.01): 5 | classname = m.__class__.__name__ 6 | if classname.find("Conv") != -1: 7 | m.weight.data.normal_(mean, std) 8 | 9 | 10 | def get_padding(kernel_size, dilation=1): 11 | return int((kernel_size*dilation - dilation)/2) 12 | 13 | 14 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 15 | n_channels_int = n_channels[0] 16 | in_act = input_a + input_b 17 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 18 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 19 | acts = t_act * s_act 20 | return acts 21 | 22 | 23 | def sequence_mask(length, max_length=None): 24 | if max_length is None: 25 | max_length = length.max() 26 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 27 | return x.unsqueeze(0) < length.unsqueeze(1) 28 | -------------------------------------------------------------------------------- /python/compile.md: -------------------------------------------------------------------------------- 1 | nuitka --standalone --mingw64 --follow-imports --windows-icon-from-ico=D:\GitRepository\RT-MMVC_Client\use_exe.ico --enable-plugin=torch --enable-plugin=anti-bloat --enable-plugin=numpy --enable-plugin=multiprocessing --enable-plugin=tk-inter --assume-yes-for-downloads --user-plugin=D:\GitRepository\MMVC_Client\brunch\MMVC_Client\python\FixBuildPlugin_pytorch.py --include-plugin-directory=D:\GitRepository\MMVC_Client\brunch\MMVC_Client\python --nofollow-import-to=torchvision --no-prefer-source-code D:\GitRepository\MMVC_Client\brunch\MMVC_Client\python\mmvc_client_GPU.py 2 | 3 | 1)_soundfile_data\... がないといわれるので、pythonの環境から_soundfile_dataディレクトリを直接持ってくる 4 | 2)llvmlite.dll がないといわれるので、pythonの環境からllvmliteディレクトリを直接持ってくる 5 | 3)librosa\... 
がないといわれるので、 6 | 4)cannot load filter definition for kaiser best と言われるので、python環境から、resampyを持ってくる 7 | 5)_sounddevice_dataも持ってくる 8 | 9 | -------------------------------------------------------------------------------- /conf/myprofile_CUDA_sample.conf: -------------------------------------------------------------------------------- 1 | { 2 | "device": { 3 | "input_device1": "マイク (Realtek(R) Audio), MME", 4 | "input_device2": false, 5 | "output_device": "スピーカー (Realtek(R) Audio), MME", 6 | "gpu_id": 0 7 | }, 8 | "vc_conf": { 9 | "frame_length": 8192, 10 | "delay_flames": 4096, 11 | "overlap": 1024, 12 | "dispose_stft_specs": 2, 13 | "dispose_conv1d_specs": 10, 14 | "source_id": 0, 15 | "target_id": 101, 16 | "f0_scale": 2.30, 17 | "onnx": { 18 | "use_onnx": false, 19 | "onnx_providers": ["DmlExecutionProvider", "CPUExecutionProvider"] 20 | } 21 | }, 22 | "path": { 23 | "json": ".\\logs\\20220306_24000\\config.json", 24 | "model": ".\\logs\\20220306_24000\\G_latest_99999999.pth", 25 | "noise": ".\\noise.wav" 26 | }, 27 | "others": { 28 | "use_nr": false, 29 | "voice_selector": false, 30 | "voice_list": [101, 108, 6, 30], 31 | "voice_label": ["ずんだもん", "目標話者", "女性の声", "男性の低い声"], 32 | "voice_f0": [2.30, 2.00, 2.10, 1.20] 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /conf/myprofile_ONNX_sample.conf: -------------------------------------------------------------------------------- 1 | { 2 | "device": { 3 | "input_device1": "マイク (Realtek(R) Audio), MME", 4 | "input_device2": false, 5 | "output_device": "スピーカー (Realtek(R) Audio), MME", 6 | "gpu_id": 0 7 | }, 8 | "vc_conf": { 9 | "frame_length": 8192, 10 | "delay_flames": 4096, 11 | "overlap": 1024, 12 | "dispose_stft_specs": 2, 13 | "dispose_conv1d_specs": 10, 14 | "source_id": 0, 15 | "target_id": 101, 16 | "f0_scale": 2.30, 17 | "onnx": { 18 | "use_onnx": true, 19 | "onnx_providers": ["DmlExecutionProvider", "CPUExecutionProvider"] 20 | } 21 | }, 22 | "path": { 23 | "json": ".\\logs\\20220306_24000\\config.json", 24 | "model": ".\\logs\\20220306_24000\\G_latest_99999999.onnx", 25 | "noise": ".\\noise.wav" 26 | }, 27 | "others": { 28 | "use_nr": false, 29 | "voice_selector": false, 30 | "voice_list": [101, 108, 6, 30], 31 | "voice_label": ["ずんだもん", "目標話者", "女性の声", "男性の低い声"], 32 | "voice_f0": [2.30, 2.00, 2.10, 1.20] 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /conf/myprofile.conf: -------------------------------------------------------------------------------- 1 | { 2 | "device": { 3 | "input_device1": "マイク (Realtek(R) Audio), MME", 4 | "input_device2": false, 5 | "output_device": "スピーカー (Realtek(R) Audio), MME", 6 | "gpu_id": 0 7 | }, 8 | "vc_conf": { 9 | "frame_length": 8192, 10 | "delay_flames": 4096, 11 | "overlap": 1024, 12 | "dispose_stft_specs": 2, 13 | "dispose_conv1d_specs": 10, 14 | "source_id": 0, 15 | "target_id": 101, 16 | "f0_scale": 1.0, 17 | "mic_scale": 1.0, 18 | "onnx": { 19 | "use_onnx": true, 20 | "onnx_providers": ["DmlExecutionProvider", "CPUExecutionProvider"] 21 | } 22 | }, 23 | "path": { 24 | "json": ".\\logs\\20220306_24000\\config.json", 25 | "model": ".\\logs\\20220306_24000\\G_latest_99999999.onnx", 26 | "correspondence":".\\logs\\20220306_24000\\train_config_Correspondence.txt", 27 | "noise": ".\\noise.wav" 28 | }, 29 | "others": { 30 | "use_nr": false, 31 | "voice_selector": false, 32 | "voice_list": [101, 108, 6, 30], 33 | "voice_label": ["ずんだもん", "目標話者", "女性の声", "男性の低い声"], 34 | "voice_f0": [1.0, 1.0, 1.0, 1.0] 
35 | } 36 | } 37 | -------------------------------------------------------------------------------- /conf/myprofile_ONNX_output_sample.conf: -------------------------------------------------------------------------------- 1 | { 2 | "device": { 3 | "input_device1": "マイク (Realtek(R) Audio), MME", 4 | "input_device2": false, 5 | "output_device": "スピーカー (Realtek(R) Audio), MME", 6 | "gpu_id": 0 7 | }, 8 | "vc_conf": { 9 | "frame_length": 8192, 10 | "delay_flames": 4096, 11 | "overlap": 1024, 12 | "dispose_stft_specs": 2, 13 | "dispose_conv1d_specs": 10, 14 | "source_id": 0, 15 | "target_id": 101, 16 | "f0_scale": 2.30, 17 | "onnx": { 18 | "use_onnx": true, 19 | "onnx_providers": ["DmlExecutionProvider", "CPUExecutionProvider"] 20 | } 21 | }, 22 | "path": { 23 | "json": ".\\logs\\20220306_24000\\config.json", 24 | "model": ".\\logs\\20220306_24000\\G_latest_99999999.onnx", 25 | "noise": ".\\noise.wav" 26 | }, 27 | "others": { 28 | "use_nr": false, 29 | "voice_selector": false, 30 | "voice_list": [101, 108, 6, 30], 31 | "voice_label": ["ずんだもん", "目標話者", "女性の声", "男性の低い声"], 32 | "voice_f0": [2.30, 2.00, 2.10, 1.20], 33 | "input_filename": ".\\emotion059.wav", 34 | "output_filename": ".\\trans_emotion059.wav" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /python/symbols.py: -------------------------------------------------------------------------------- 1 | """ The following information was added with reference to https://github.com/jaywalnut310/vits/tree/1eef52ed50743f77fca9ff6773ba673497f6bf9d """ 2 | """ from https://github.com/keithito/tacotron """ 3 | 4 | ''' 5 | Defines the set of symbols used in text input to the model. 6 | ''' 7 | _pad = '_' 8 | _punctuation = ';:,.!?¡¿—…"«»“” ' 9 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' 10 | _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 11 | 12 | 13 | # Export all symbols: 14 | symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) 15 | 16 | # Special symbol ids 17 | SPACE_ID = symbols.index(" ") 18 | 19 | symbols = [ 20 | "A", 21 | "E", 22 | "I", 23 | "N", 24 | "O", 25 | "U", 26 | "a", 27 | "b", 28 | "by", 29 | "ch", 30 | "cl", 31 | "d", 32 | "dy", 33 | "e", 34 | "f", 35 | "g", 36 | "gy", 37 | "h", 38 | "hy", 39 | "i", 40 | "j", 41 | "k", 42 | "ky", 43 | "m", 44 | "my", 45 | "n", 46 | "ny", 47 | "o", 48 | "p", 49 | "py", 50 | "r", 51 | "ry", 52 | "s", 53 | "sh", 54 | "t", 55 | "ts", 56 | "ty", 57 | "u", 58 | "v", 59 | "w", 60 | "y", 61 | "z", 62 | "pau", 63 | "sil", 64 | ] 65 | -------------------------------------------------------------------------------- /python/output_audio_device_list.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | import pyaudio 3 | from os import linesep 4 | 5 | def main(): 6 | audio = pyaudio.PyAudio() 7 | audio_devices = list() 8 | host_apis = list() 9 | 10 | for api_index in range(audio.get_host_api_count()): 11 | host_apis.append(audio.get_host_api_info_by_index(api_index)['name']) 12 | 13 | # 音声デバイス毎のインデックス番号を一覧表示 14 | for x in range(0, audio.get_device_count()): 15 | devices = audio.get_device_info_by_index(x) 16 | try: 17 | device_name = devices['name'].encode('shift-jis').decode('utf-8') 18 | except (UnicodeDecodeError, UnicodeEncodeError): 19 | device_name = devices['name'] 20 | 21 | device_name = device_name.replace(linesep, '') + ", " + host_apis[devices['hostApi']] 22 | 23 | 
isInOut = "" 24 | if devices['maxInputChannels'] > 0: 25 | isInOut += "入" 26 | if devices['maxOutputChannels'] > 0: 27 | isInOut += "出" 28 | 29 | audio_devices.append(f"{isInOut}力: Index:{devices['index']} デバイス名:\"{device_name}\"\n") 30 | 31 | with open('audio_device_list.txt', 'w', encoding='utf-8') as f: 32 | f.writelines(audio_devices) 33 | 34 | print(" 使用可能なデバイス一覧の取得が完了しました。\n audio_device_list.txt を参照してください。\n このウィンドウは閉じて問題ありません。") 35 | 36 | if __name__ == '__main__': 37 | main() 38 | -------------------------------------------------------------------------------- /python/snake.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Snake Activation Function Module. 7 | 8 | References: 9 | - Neural Networks Fail to Learn Periodic Functions and How to Fix It 10 | https://arxiv.org/pdf/2006.08195.pdf 11 | - BigVGAN: A Universal Neural Vocoder with Large-Scale Training 12 | https://arxiv.org/pdf/2206.04658.pdf 13 | 14 | """ 15 | 16 | import torch 17 | import torch.nn as nn 18 | 19 | 20 | class Snake(nn.Module): 21 | """Snake activation function module.""" 22 | 23 | def __init__(self, channels, init=50): 24 | """Initialize Snake module. 25 | 26 | Args: 27 | channels (int): Number of feature channels. 28 | init (float): Initial value of the learnable parameter alpha. 29 | According to the original paper, 5 ~ 50 would be 30 | suitable for periodic data (i.e. voices). 31 | 32 | """ 33 | super(Snake, self).__init__() 34 | alpha = init * torch.ones(1, channels, 1) 35 | self.alpha = nn.Parameter(alpha) 36 | 37 | def forward(self, x): 38 | """Calculate forward propagation. 39 | 40 | Args: 41 | x (Tensor): Input noise signal (B, channels, T). 42 | 43 | Returns: 44 | Tensor: Output tensor (B, channels, T). 
45 | 46 | """ 47 | return x + torch.sin(self.alpha * x) ** 2 / self.alpha 48 | -------------------------------------------------------------------------------- /python/onnx_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | import onnxruntime as ort 4 | import torch 5 | 6 | 7 | def get_args(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--input_onnx", required=True) 10 | return parser.parse_args() 11 | 12 | 13 | def inspect_onnx(session): 14 | print("inputs") 15 | for i in session.get_inputs(): 16 | print("name:{}\tshape:{}\tdtype:{}".format(i.name, i.shape, i.type)) 17 | print("outputs") 18 | for i in session.get_outputs(): 19 | print("name:{}\tshape:{}\tdtype:{}".format(i.name, i.shape, i.type)) 20 | 21 | 22 | def benchmark(session): 23 | dummy_specs = torch.rand(1, 257, 60) 24 | dummy_lengths = torch.LongTensor([60]) 25 | dummy_sid_src = torch.LongTensor([0]) 26 | dummy_sid_tgt = torch.LongTensor([1]) 27 | 28 | use_time_list = [] 29 | for i in range(30): 30 | start = time.time() 31 | output = session.run( 32 | ["audio"], 33 | { 34 | "specs": dummy_specs.numpy(), 35 | "lengths": dummy_lengths.numpy(), 36 | "sid_src": dummy_sid_src.numpy(), 37 | "sid_tgt": dummy_sid_tgt.numpy() 38 | } 39 | ) 40 | use_time = time.time() - start 41 | use_time_list.append(use_time) 42 | #print("use time:{}".format(use_time)) 43 | use_time_list = use_time_list[5:] 44 | mean_use_time = sum(use_time_list) / len(use_time_list) 45 | print(f"mean_use_time:{mean_use_time}") 46 | 47 | 48 | def main(args): 49 | ort_session_cpu = ort.InferenceSession( 50 | args.input_onnx, 51 | providers=["CPUExecutionProvider"]) 52 | 53 | ort_session_cuda = ort.InferenceSession( 54 | args.input_onnx, 55 | providers=["CUDAExecutionProvider"]) 56 | 57 | # DirectMLで動かすための設定 58 | ort_options = ort.SessionOptions() 59 | ort_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL 60 | ort_options.enable_mem_pattern = False 61 | ort_session_dml = ort.InferenceSession( 62 | args.input_onnx, 63 | sess_options=ort_options, 64 | providers=["DmlExecutionProvider"]) 65 | 66 | print("vits onnx benchmark") 67 | inspect_onnx(ort_session_cpu) 68 | print("ONNX CPU") 69 | benchmark(ort_session_cpu) 70 | print("ONNX CUDA") 71 | benchmark(ort_session_cuda) 72 | print("ONNX DirectML") 73 | benchmark(ort_session_dml) 74 | 75 | if __name__ == '__main__': 76 | args = get_args() 77 | print(args) 78 | main(args) 79 | -------------------------------------------------------------------------------- /python/index.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Yi-Chiao Wu (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Indexing-related functions.""" 7 | 8 | import torch 9 | from torch.nn import ConstantPad1d as pad1d 10 | 11 | 12 | def pd_indexing(x, d, dilation, batch_index, ch_index): 13 | """Pitch-dependent indexing of past and future samples. 14 | 15 | Args: 16 | x (Tensor): Input feature map (B, C, T). 17 | d (Tensor): Input pitch-dependent dilated factors (B, 1, T). 18 | dilation (Int): Dilation size. 
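The effective per-sample offset used when gathering past/future samples is d * dilation, rounded to the nearest integer.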
19 | batch_index (Tensor): Batch index 20 | ch_index (Tensor): Channel index 21 | 22 | Returns: 23 | Tensor: Past output tensor (B, out_channels, T) 24 | Tensor: Future output tensor (B, out_channels, T) 25 | 26 | """ 27 | (_, _, batch_length) = d.size() 28 | dilations = d * dilation 29 | 30 | # get past index 31 | idxP = torch.arange(-batch_length, 0).float() 32 | idxP = idxP.to(x.device) 33 | idxP = torch.add(-dilations, idxP) 34 | idxP = idxP.round().long() 35 | maxP = -((torch.min(idxP) + batch_length)) 36 | assert maxP >= 0 37 | idxP = (batch_index, ch_index, idxP) 38 | # padding past tensor 39 | xP = pad1d((maxP, 0), 0)(x) 40 | 41 | # get future index 42 | idxF = torch.arange(0, batch_length).float() 43 | idxF = idxF.to(x.device) 44 | idxF = torch.add(dilations, idxF) 45 | idxF = idxF.round().long() 46 | maxF = torch.max(idxF) - (batch_length - 1) 47 | assert maxF >= 0 48 | idxF = (batch_index, ch_index, idxF) 49 | # padding future tensor 50 | xF = pad1d((0, maxF), 0)(x) 51 | 52 | return xP[idxP], xF[idxF] 53 | 54 | 55 | def index_initial(n_batch, n_ch, tensor=True): 56 | """Tensor batch and channel index initialization. 57 | 58 | Args: 59 | n_batch (Int): Number of batch. 60 | n_ch (Int): Number of channel. 61 | tensor (bool): Return tensor or numpy array 62 | 63 | Returns: 64 | Tensor: Batch index 65 | Tensor: Channel index 66 | 67 | """ 68 | batch_index = [] 69 | for i in range(n_batch): 70 | batch_index.append([[i]] * n_ch) 71 | ch_index = [] 72 | for i in range(n_ch): 73 | ch_index += [[i]] 74 | ch_index = [ch_index] * n_batch 75 | 76 | if tensor: 77 | batch_index = torch.tensor(batch_index) 78 | ch_index = torch.tensor(ch_index) 79 | if torch.cuda.is_available(): 80 | batch_index = batch_index.cuda() 81 | ch_index = ch_index.cuda() 82 | return batch_index, ch_index 83 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | 8 | # Distribution / packaging 9 | .Python 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | pip-wheel-metadata/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 90 | # install all needed dependencies. 91 | #Pipfile.lock 92 | 93 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 94 | __pypackages__/ 95 | 96 | # Celery stuff 97 | celerybeat-schedule 98 | celerybeat.pid 99 | 100 | # SageMath parsed files 101 | *.sage.py 102 | 103 | # Environments 104 | .env 105 | .venv 106 | env/ 107 | venv/ 108 | ENV/ 109 | env.bak/ 110 | venv.bak/ 111 | 112 | # Spyder project settings 113 | .spyderproject 114 | .spyproject 115 | 116 | # Rope project settings 117 | .ropeproject 118 | 119 | # mkdocs documentation 120 | /site 121 | 122 | # mypy 123 | .mypy_cache/ 124 | .dmypy.json 125 | dmypy.json 126 | 127 | # Pyre type checker 128 | .pyre/ 129 | 130 | old/ 131 | .history/ 132 | rt-mmvc_client_CPU/rt-mmvc_client_CPU/* 133 | rt-mmvc_client_GPU/rt-mmvc_client_GPU/* 134 | audio_device_list.txt 135 | device_check.txt 136 | *.rar 137 | *.exe 138 | *.build/ 139 | *.dist/ 140 | temp/ 141 | isle/ 142 | noise.wav 143 | use_exe.ico 144 | myprofile copy.json 145 | python/mmvc_client_GPU_v0.2.0.0.zip 146 | python/mmvc_client_GPU_v0.2.0.0/* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Isle Tennos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | The MIT License (MIT) 24 | Copyright (c) 2019, Tim Sainburg 25 | 26 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 27 | 28 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 29 | 30 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 31 | 32 | MIT License 33 | 34 | Copyright (c) 2021 Jaehyeon Kim 35 | 36 | Permission is hereby granted, free of charge, to any person obtaining a copy 37 | of this software and associated documentation files (the "Software"), to deal 38 | in the Software without restriction, including without limitation the rights 39 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 40 | copies of the Software, and to permit persons to whom the Software is 41 | furnished to do so, subject to the following conditions: 42 | 43 | The above copyright notice and this permission notice shall be included in all 44 | copies or substantial portions of the Software. 45 | 46 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 47 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 48 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 49 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 50 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 51 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 52 | SOFTWARE. 53 | -------------------------------------------------------------------------------- /python/setup_check.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import platform 3 | from multiprocessing import freeze_support 4 | 5 | # 以下必要な外部ライブラリ 6 | # pip install --upgrade py-cpuinfo 7 | # pip install --upgrade psutil 8 | # pip install --upgrade nvgpu 9 | 10 | # Pipfileとpipenvを使用する場合 11 | # cd pyhon 12 | # pipenv install --dev 13 | # pipenv run python setup_check.py 14 | 15 | 16 | # cpuinfo.get_cpu_info()とpyinstallerを組み合わせる場合に必要 17 | freeze_support() 18 | 19 | 20 | 21 | # 定数 22 | MMVC_INFO: str = "MMVC_Client" 23 | OUTPUT_FILE_NAME: str = "device_check.txt" 24 | 25 | LOWER_LIMIT_MEMORY: int = 4 * 1024**3 # 4 GiB 26 | LOWER_LIMIT_GPU_MEMORY: int = 1 * 1024**3 # 1 GiB 27 | 28 | ONNX_TEXT: str = "このPCでは、onnxモデルを出力することで動作する可能性があります" 29 | 30 | 31 | 32 | # ログファイル出力の設定 33 | logging.basicConfig( 34 | filename = OUTPUT_FILE_NAME, 35 | filemode = "w", 36 | encoding = "utf-8", 37 | level = logging.INFO, 38 | format = "%(levelname)s%(message)s") 39 | logging.addLevelName(logging.INFO, "") 40 | logging.addLevelName(logging.WARNING, "\n[警告]\n") 41 | logging.addLevelName(logging.ERROR, "\n[エラー]\n") 42 | 43 | 44 | 45 | # 基本的な情報 46 | logging.info(f"バージョン: {MMVC_INFO}") 47 | logging.info(f"Python: {platform.python_version()}") 48 | logging.info(f"アーキテクチャ: {platform.machine()}") 49 | logging.info(f"OS: {platform.system()}") 50 | 51 | 52 | 53 | # CPU関連 54 | try: 55 | from cpuinfo import get_cpu_info 56 | 57 | cpu_info = get_cpu_info() 58 | logging.info(f"CPU: {cpu_info['brand_raw']}") 59 | 60 | except ModuleNotFoundError: 61 | logging.info(f"CPU: {platform.processor()}") 62 | logging.warning("py-cpuinfoライブラリがインストールされていません\n" + 63 | "以下のコマンドを実行して、py-cpuinfoをインストールするとより詳細な情報を得られます\n" + 64 | "pip install --upgrade py-cpuinfo\n") 65 | 66 | 67 | # メモリ 68 | try: 69 | from psutil import virtual_memory 70 | memory = virtual_memory().total 71 | logging.info(f"メモリ: {round(memory / 1024**3, 0)} GiB") 72 | 73 | if memory < LOWER_LIMIT_MEMORY: 74 | 
logging.error("メモリが不足しています\n" + 75 | "メモリを増設することで動作不良が改善される場合があります\n") 76 | except ModuleNotFoundError: 77 | logging.error("psutilライブラリがインストールされていません\n" + 78 | "以下のコマンドを実行して、psutilをインストールする必要があります\n" + 79 | "pip install --upgrade psutil\n") 80 | 81 | 82 | # GPU 83 | try: 84 | import nvgpu 85 | 86 | gpu_infos = nvgpu.gpu_info() 87 | gpu_memory = 0 88 | 89 | for gpu_info in gpu_infos: 90 | temp_gpt_memory = gpu_info["mem_total"] * 1024**2 91 | logging.info(f"GPU {gpu_info['index']} 名称: {gpu_info['type']}") 92 | logging.info(f"GPU {gpu_info['index']} メモリ: {round(temp_gpt_memory / 1024**3, 1)} GiB") 93 | gpu_memory = max(gpu_info["mem_total"] * 1024**2, gpu_memory) 94 | 95 | if len(gpu_infos) == 0: 96 | logging.warning(f"NvidiaのGPUが存在しません\n{ONNX_TEXT}\n") 97 | 98 | elif gpu_memory < LOWER_LIMIT_GPU_MEMORY: 99 | logging.warning(f"GPUのメモリ量が不足しています\n{ONNX_TEXT}\n") 100 | 101 | except ModuleNotFoundError: 102 | logging.error("nvgpuライブラリがインストールされていません\n" + 103 | "以下のコマンドを実行して、nvgpuをインストールする必要があります\n" + 104 | "pip install --upgrade nvgpu\n") 105 | 106 | except FileNotFoundError: 107 | # nvidia-smiパッケージが見つからない場合 108 | logging.warning(f"NvidiaのGPUもしくはGPUドライバーが存在しません\n{ONNX_TEXT}\n") 109 | 110 | 111 | 112 | logging.info("デバイス情報取得完了") 113 | print(f"デバイス情報の取得が完了しました。\n{OUTPUT_FILE_NAME} を確認してください。\nこのウィンドウは閉じて問題ありません。") -------------------------------------------------------------------------------- /python/rec_environmental_noise.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | import pyaudio 3 | import sounddevice as sd 4 | import wave 5 | import numpy as np 6 | import time 7 | import json 8 | import os 9 | 10 | #ファイルダイアログ関連 11 | import tkinter as tk #add 12 | from tkinter import filedialog #add 13 | 14 | class VCPrifile(): 15 | def __init__(self, **kwargs): 16 | for k, v in kwargs.items(): 17 | if type(v) == dict: 18 | v = VCPrifile(**v) 19 | self[k] = v 20 | 21 | def keys(self): 22 | return self.__dict__.keys() 23 | 24 | def items(self): 25 | return self.__dict__.items() 26 | 27 | def values(self): 28 | return self.__dict__.values() 29 | 30 | def __len__(self): 31 | return len(self.__dict__) 32 | 33 | def __getitem__(self, key): 34 | return getattr(self, key) 35 | 36 | def __setitem__(self, key, value): 37 | return setattr(self, key, value) 38 | 39 | def __contains__(self, key): 40 | return key in self.__dict__ 41 | 42 | def __repr__(self): 43 | return self.__dict__.__repr__() 44 | 45 | def config_get(conf): 46 | config_path = conf 47 | with open(config_path, "r", encoding="utf-8") as f: 48 | data = f.read() 49 | config = json.loads(data) 50 | hparams = VCPrifile(**config) 51 | return hparams 52 | 53 | def MakeWavFile(profile_path): 54 | chunk = 1024 55 | 56 | params = config_get(profile_path) 57 | print(params.device.input_device1) 58 | if type(params.device.input_device1) == str: 59 | device_index = sd.query_devices().index(sd.query_devices(params.device.input_device1, 'input')) 60 | else: 61 | device_index = params.device.input_device1 62 | 63 | p = pyaudio.PyAudio() 64 | stream = p.open(format = pyaudio.paInt16, 65 | channels = 1, 66 | rate = sr, 67 | input = True, 68 | input_device_index = device_index, 69 | frames_per_buffer = chunk) 70 | #レコード開始 71 | print("あなたの環境ノイズを録音します。マイクの電源を入れて、何もせずに待機していてください。") 72 | print("5秒後に録音を開始します。5秒間ノイズを録音します。完了するまで待機していてください。") 73 | Record_Seconds = 5 74 | MAX_Value = 32768.0 75 | all = [] 76 | time.sleep(5) 77 | print("録音を開始しました。") 78 | for i in range(0, int(sr / chunk * Record_Seconds)): 79 | 
data = stream.read(chunk) #音声を読み取って、 80 | data = np.frombuffer(data, dtype='int16') 81 | audio1 = data * MAX_Value 82 | audio1 = audio1.astype(np.int16).tobytes() 83 | all.append(data) #データを追加 84 | #レコード終了 85 | print("録音が完了しました。") 86 | print("ファイルに書き込みを行っています。") 87 | stream.close() 88 | p.terminate() 89 | wavFile = wave.open("noise.wav", 'wb') 90 | wavFile.setnchannels(1) 91 | wavFile.setsampwidth(p.get_sample_size(pyaudio.paInt16)) 92 | wavFile.setframerate(sr) 93 | #wavFile.writeframes(b''.join(all)) #Python2 用 94 | wavFile.writeframes(b"".join(all)) #Python3用 95 | wavFile.close() 96 | print("ファイルの書き込み完了しました。") 97 | print("このウィンドウは閉じて問題ありません。") 98 | input() 99 | 100 | if __name__ == '__main__': 101 | try: #add 102 | #サンプリングレートの指定 103 | while True: # 無限ループ 104 | print('学習済みモデルのサンプリングレートを指定してください。') 105 | try: 106 | sr = int(input('>> ')) 107 | except ValueError: 108 | # ValueError例外を処理するコード 109 | print('数字以外が入力されました。数字のみを入力してください') 110 | continue 111 | break 112 | 113 | end_counter = 0 114 | while True: # 無限ループ 115 | tkroot = tk.Tk() 116 | tkroot.withdraw() 117 | print('myprofile.conf を選択して下さい') 118 | typ = [('confファイル','*.conf')] 119 | dir = './' 120 | profile_path = filedialog.askopenfilename(filetypes = typ, initialdir = dir) 121 | tkroot.destroy() 122 | try: 123 | if profile_path: 124 | MakeWavFile(profile_path) 125 | break 126 | else: 127 | print('ファイルが存在しません') 128 | end_counter = end_counter + 1 129 | print(end_counter) 130 | if end_counter > 3: 131 | break 132 | continue 133 | 134 | except Exception as e: 135 | # ValueError例外を処理するコード 136 | print(profile_path) 137 | print(e) 138 | print('パスを入力してください・') 139 | continue 140 | 141 | except Exception as e: 142 | print('エラーが発生しました。') 143 | print(e) 144 | os.system('PAUSE') -------------------------------------------------------------------------------- /python/features.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Feature-related functions. 
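Provides length validation, pitch-dependent dilation factors, and a
SignalGenerator that builds sine / noise / V-UV input signals from F0.
A minimal illustrative call (default signal types "sine" + "noise", shapes only):

    >>> gen = SignalGenerator(sample_rate=24000, hop_size=120)
    >>> f0 = torch.zeros(1, 1, 50)   # 50 unvoiced frames
    >>> gen(f0).shape
    torch.Size([1, 2, 6000])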
7 | 8 | References: 9 | - https://github.com/bigpon/QPPWG 10 | - https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts 11 | 12 | """ 13 | 14 | import sys 15 | from logging import getLogger 16 | 17 | import numpy as np 18 | import torch 19 | from torch.nn.functional import interpolate 20 | 21 | # A logger for this file 22 | logger = getLogger(__name__) 23 | 24 | 25 | def validate_length(xs, ys=None, hop_size=None): 26 | """Validate length 27 | 28 | Args: 29 | xs (ndarray): numpy array of features 30 | ys (ndarray): numpy array of audios 31 | hop_size (int): upsampling factor 32 | 33 | Returns: 34 | (ndarray): length adjusted features 35 | 36 | """ 37 | min_len_x = min([x.shape[0] for x in xs]) 38 | if ys is not None: 39 | min_len_y = min([y.shape[0] for y in ys]) 40 | if min_len_y < min_len_x * hop_size: 41 | min_len_x = min_len_y // hop_size 42 | if min_len_y > min_len_x * hop_size: 43 | min_len_y = min_len_x * hop_size 44 | ys = [y[:min_len_y] for y in ys] 45 | xs = [x[:min_len_x] for x in xs] 46 | 47 | return xs + ys if ys is not None else xs 48 | 49 | 50 | def dilated_factor(batch_f0, fs, dense_factor): 51 | """Pitch-dependent dilated factor 52 | 53 | Args: 54 | batch_f0 (ndarray): the f0 sequence (T) 55 | fs (int): sampling rate 56 | dense_factor (int): the number of taps in one cycle 57 | 58 | Return: 59 | dilated_factors(np array): 60 | float array of the pitch-dependent dilated factors (T) 61 | 62 | """ 63 | batch_f0[batch_f0 == 0] = fs / dense_factor 64 | dilated_factors = torch.ones_like(batch_f0) * fs / dense_factor / batch_f0 65 | #assert np.all(dilated_factors > 0) 66 | return dilated_factors 67 | 68 | 69 | class SignalGenerator: 70 | """Input signal generator module.""" 71 | 72 | def __init__( 73 | self, 74 | sample_rate=24000, 75 | hop_size=120, 76 | sine_amp=0.1, 77 | noise_amp=0.003, 78 | signal_types=["sine", "noise"], 79 | ): 80 | """Initialize WaveNetResidualBlock module. 81 | 82 | Args: 83 | sample_rate (int): Sampling rate. 84 | hop_size (int): Hop size of input F0. 85 | sine_amp (float): Sine amplitude for NSF-based sine generation. 86 | noise_amp (float): Noise amplitude for NSF-based sine generation. 87 | signal_types (list): List of input signal types for generator. 88 | 89 | """ 90 | self.sample_rate = sample_rate 91 | self.hop_size = hop_size 92 | self.signal_types = signal_types 93 | self.sine_amp = sine_amp 94 | self.noise_amp = noise_amp 95 | 96 | for signal_type in signal_types: 97 | if not signal_type in ["noise", "sine", "sines", "uv"]: 98 | logger.info(f"{signal_type} is not supported type for generator input.") 99 | sys.exit(0) 100 | #logger.info(f"Use {signal_types} for generator input signals.") 101 | 102 | @torch.no_grad() 103 | def __call__(self, f0, f0_scale = 1.0): 104 | signals = [] 105 | for typ in self.signal_types: 106 | if "noise" == typ: 107 | signals.append(self.random_noise(f0)) 108 | if "sine" == typ: 109 | signals.append(self.sinusoid(f0)) 110 | if "sines" == typ: 111 | signals.append(self.sinusoids(f0)) 112 | if "uv" == typ: 113 | signals.append(self.vuv_binary(f0)) 114 | 115 | input_batch = signals[0] 116 | for signal in signals[1:]: 117 | input_batch = torch.cat([input_batch, signal], axis=1) 118 | 119 | return input_batch * f0_scale 120 | 121 | @torch.no_grad() 122 | def random_noise(self, f0): 123 | """Calculate noise signals. 124 | 125 | Args: 126 | f0 (Tensor): F0 tensor (B, 1, T // hop_size). 127 | 128 | Returns: 129 | Tensor: Gaussian noise signals (B, 1, T). 
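Note: the noise is drawn from a standard normal directly at waveform resolution (T * hop_size samples).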
130 | 131 | """ 132 | B, _, T = f0.size() 133 | noise = torch.randn((B, 1, T * self.hop_size), device=f0.device) 134 | 135 | return noise 136 | 137 | @torch.no_grad() 138 | def sinusoid(self, f0): 139 | """Calculate sine signals. 140 | 141 | Args: 142 | f0 (Tensor): F0 tensor (B, 1, T // hop_size). 143 | 144 | Returns: 145 | Tensor: Sines generated following NSF (B, 1, T). 146 | 147 | """ 148 | B, _, T = f0.size() 149 | vuv = interpolate((f0 > 0) * torch.ones_like(f0), T * self.hop_size) 150 | radious = (interpolate(f0, T * self.hop_size) / self.sample_rate) % 1 151 | sine = vuv * torch.sin(torch.cumsum(radious, dim=2) * 2 * np.pi) * self.sine_amp 152 | if self.noise_amp > 0: 153 | noise_amp = vuv * self.noise_amp + (1.0 - vuv) * self.noise_amp / 3.0 154 | noise = torch.randn((B, 1, T * self.hop_size), device=f0.device) * noise_amp 155 | sine = sine + noise 156 | 157 | return sine 158 | 159 | @torch.no_grad() 160 | def sinusoids(self, f0): 161 | """Calculate sines. 162 | 163 | Args: 164 | f0 (Tensor): F0 tensor (B, 1, T // hop_size). 165 | 166 | Returns: 167 | Tensor: Sines generated following NSF (B, 1, T). 168 | 169 | """ 170 | B, _, T = f0.size() 171 | vuv = interpolate((f0 > 0) * torch.ones_like(f0), T * self.hop_size) 172 | f0 = interpolate(f0, T * self.hop_size) 173 | sines = torch.zeros_like(f0, device=f0.device) 174 | harmonics = 5 # currently only fixed number of harmonics is supported 175 | for i in range(harmonics): 176 | radious = (f0 * (i + 1) / self.sample_rate) % 1 177 | sines += torch.sin(torch.cumsum(radious, dim=2) * 2 * np.pi) 178 | sines = self.sine_amp * sines * vuv / harmonics 179 | if self.noise_amp > 0: 180 | noise_amp = vuv * self.noise_amp + (1.0 - vuv) * self.noise_amp / 3.0 181 | noise = torch.randn((B, 1, T * self.hop_size), device=f0.device) * noise_amp 182 | sines = sines + noise 183 | 184 | return sines 185 | 186 | @torch.no_grad() 187 | def vuv_binary(self, f0): 188 | """Calculate V/UV binary sequences. 189 | 190 | Args: 191 | f0 (Tensor): F0 tensor (B, 1, T // hop_size). 192 | 193 | Returns: 194 | Tensor: V/UV binary sequences (B, 1, T). 
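Note: obtained by interpolating the frame-level voicing decision (f0 > 0) up to T * hop_size samples.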
195 | 196 | """ 197 | _, _, T = f0.size() 198 | uv = interpolate((f0 > 0) * torch.ones_like(f0), T * self.hop_size) 199 | 200 | return uv 201 | -------------------------------------------------------------------------------- /python/modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | from torch.nn import Conv1d 6 | from torch.nn.utils import weight_norm, remove_weight_norm 7 | 8 | from commons import init_weights, get_padding, fused_add_tanh_sigmoid_multiply 9 | 10 | 11 | LRELU_SLOPE = 0.1 12 | 13 | 14 | class WN(torch.nn.Module): 15 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 16 | super(WN, self).__init__() 17 | assert(kernel_size % 2 == 1) 18 | self.hidden_channels =hidden_channels 19 | self.kernel_size = kernel_size, 20 | self.dilation_rate = dilation_rate 21 | self.n_layers = n_layers 22 | self.gin_channels = gin_channels 23 | self.p_dropout = p_dropout 24 | 25 | self.in_layers = torch.nn.ModuleList() 26 | self.res_skip_layers = torch.nn.ModuleList() 27 | self.drop = nn.Dropout(p_dropout) 28 | 29 | if gin_channels != 0: 30 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) 31 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 32 | 33 | for i in range(n_layers): 34 | dilation = dilation_rate ** i 35 | padding = int((kernel_size * dilation - dilation) / 2) 36 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 37 | dilation=dilation, padding=padding) 38 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 39 | self.in_layers.append(in_layer) 40 | 41 | # last one is not necessary 42 | if i < n_layers - 1: 43 | res_skip_channels = 2 * hidden_channels 44 | else: 45 | res_skip_channels = hidden_channels 46 | 47 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 48 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 49 | self.res_skip_layers.append(res_skip_layer) 50 | 51 | def forward(self, x, x_mask, g=None, **kwargs): 52 | output = torch.zeros_like(x) 53 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 54 | 55 | if g is not None: 56 | g = self.cond_layer(g) 57 | 58 | for i in range(self.n_layers): 59 | x_in = self.in_layers[i](x) 60 | if g is not None: 61 | cond_offset = i * 2 * self.hidden_channels 62 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 63 | else: 64 | g_l = torch.zeros_like(x_in) 65 | 66 | acts = fused_add_tanh_sigmoid_multiply( 67 | x_in, 68 | g_l, 69 | n_channels_tensor) 70 | acts = self.drop(acts) 71 | 72 | res_skip_acts = self.res_skip_layers[i](acts) 73 | if i < self.n_layers - 1: 74 | res_acts = res_skip_acts[:,:self.hidden_channels,:] 75 | x = (x + res_acts) * x_mask 76 | output = output + res_skip_acts[:,self.hidden_channels:,:] 77 | else: 78 | output = output + res_skip_acts 79 | return output * x_mask 80 | 81 | def remove_weight_norm(self): 82 | if self.gin_channels != 0: 83 | torch.nn.utils.remove_weight_norm(self.cond_layer) 84 | for l in self.in_layers: 85 | torch.nn.utils.remove_weight_norm(l) 86 | for l in self.res_skip_layers: 87 | torch.nn.utils.remove_weight_norm(l) 88 | 89 | 90 | class ResBlock1(torch.nn.Module): 91 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 92 | super(ResBlock1, self).__init__() 93 | self.convs1 = nn.ModuleList([ 94 | weight_norm(Conv1d(channels, channels, 
kernel_size, 1, dilation=dilation[0], 95 | padding=get_padding(kernel_size, dilation[0]))), 96 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 97 | padding=get_padding(kernel_size, dilation[1]))), 98 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 99 | padding=get_padding(kernel_size, dilation[2]))) 100 | ]) 101 | self.convs1.apply(init_weights) 102 | 103 | self.convs2 = nn.ModuleList([ 104 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 105 | padding=get_padding(kernel_size, 1))), 106 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 107 | padding=get_padding(kernel_size, 1))), 108 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 109 | padding=get_padding(kernel_size, 1))) 110 | ]) 111 | self.convs2.apply(init_weights) 112 | 113 | def forward(self, x, x_mask=None): 114 | for c1, c2 in zip(self.convs1, self.convs2): 115 | xt = F.leaky_relu(x, LRELU_SLOPE) 116 | if x_mask is not None: 117 | xt = xt * x_mask 118 | xt = c1(xt) 119 | xt = F.leaky_relu(xt, LRELU_SLOPE) 120 | if x_mask is not None: 121 | xt = xt * x_mask 122 | xt = c2(xt) 123 | x = xt + x 124 | if x_mask is not None: 125 | x = x * x_mask 126 | return x 127 | 128 | def remove_weight_norm(self): 129 | for l in self.convs1: 130 | remove_weight_norm(l) 131 | for l in self.convs2: 132 | remove_weight_norm(l) 133 | 134 | 135 | class ResBlock2(torch.nn.Module): 136 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 137 | super(ResBlock2, self).__init__() 138 | self.convs = nn.ModuleList([ 139 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 140 | padding=get_padding(kernel_size, dilation[0]))), 141 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 142 | padding=get_padding(kernel_size, dilation[1]))) 143 | ]) 144 | self.convs.apply(init_weights) 145 | 146 | def forward(self, x, x_mask=None): 147 | for c in self.convs: 148 | xt = F.leaky_relu(x, LRELU_SLOPE) 149 | if x_mask is not None: 150 | xt = xt * x_mask 151 | xt = c(xt) 152 | x = xt + x 153 | if x_mask is not None: 154 | x = x * x_mask 155 | return x 156 | 157 | def remove_weight_norm(self): 158 | for l in self.convs: 159 | remove_weight_norm(l) 160 | 161 | 162 | class Flip(nn.Module): 163 | def forward(self, x, *args, reverse=False, **kwargs): 164 | x = torch.flip(x, [1]) 165 | if not reverse: 166 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 167 | return x, logdet 168 | else: 169 | return x 170 | 171 | 172 | class ResidualCouplingLayer(nn.Module): 173 | def __init__(self, 174 | channels, 175 | hidden_channels, 176 | kernel_size, 177 | dilation_rate, 178 | n_layers, 179 | p_dropout=0, 180 | gin_channels=0, 181 | mean_only=False): 182 | assert channels % 2 == 0, "channels should be divisible by 2" 183 | super().__init__() 184 | self.channels = channels 185 | self.hidden_channels = hidden_channels 186 | self.kernel_size = kernel_size 187 | self.dilation_rate = dilation_rate 188 | self.n_layers = n_layers 189 | self.half_channels = channels // 2 190 | self.mean_only = mean_only 191 | 192 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 193 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) 194 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 195 | self.post.weight.data.zero_() 196 | self.post.bias.data.zero_() 197 | 198 | def forward(self, x, x_mask, 
g=None, reverse=False): 199 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 200 | h = self.pre(x0) * x_mask 201 | h = self.enc(h, x_mask, g=g) 202 | stats = self.post(h) * x_mask 203 | if not self.mean_only: 204 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 205 | else: 206 | m = stats 207 | logs = torch.zeros_like(m) 208 | 209 | if not reverse: 210 | x1 = m + x1 * torch.exp(logs) * x_mask 211 | x = torch.cat([x0, x1], 1) 212 | logdet = torch.sum(logs, [1,2]) 213 | return x, logdet 214 | else: 215 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 216 | x = torch.cat([x0, x1], 1) 217 | return x 218 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | MMVC_Client 2 | ==== 3 | 4 | AIを使ったリアルタイムボイスチェンジャー 5 | 6 | ## Description 7 | AIを使ったリアルタイムボイスチェンジャー「MMVC(RealTime-Many to Many Voice Conversion)」 8 | の本体です。 9 | MMVC_Trainerで学習したモデルを使ってリアルタイムでVCを行います。 10 | ## MMVC_Trainer 11 | https://github.com/isletennos/MMVC_Trainer 12 | ## concept 13 | 「簡単」「だれでも」「好きな声に」「リアルタイムで」 14 | ## Requirement 15 | ・MMVC_Trainerで学習したモデルとそのコンフィグ 16 | ## Install 17 | ### windows かつ 実行ファイルを利用する方 18 | 下記URLからダウンロードして、自己解凍形式ファイルを実行して展開してください。(ファイルサイズが非常に大きいので注意) 19 | [MMVC_client v0.3.1.0](https://github.com/isletennos/MMVC_Client/releases/tag/v0.3.1.0) 20 | 21 | ### 旧ver 22 | [MMVC_client v0.3.0.0(GPU ver)](https://drive.google.com/file/d/1QXJQAnTOr8vE5nwxInUROtj-fiHeJsXH/view?usp=sharing) 23 | ファイルサイズが大きすぎてDLできない人向けの分割版 24 | [MMVC_client v0.3.0.0(GPU ver)](https://drive.google.com/drive/folders/1eoDBw37WT7wJsAXh-RIXvXLvbSwnDtt9?usp=sharing) 25 | [MMVC_client v0.2.0.1(GPU ver)](https://drive.google.com/file/d/1JEvYw4vjiBwhsZq79Pb0Doh7Fy16dK76/view?usp=sharing) 26 | [MMVC_client 無印(CPU_ver) (現在非推奨)](https://drive.google.com/file/d/1KLqo_q-qbahPRzNo2kUhCqHqnb8lTjMJ/view?usp=sharing) 27 | [MMVC_client 無印(GPU ver)](https://drive.google.com/file/d/1XNdfT3BFGKlxDm43hEbYvnoJSecjLedt/view?usp=sharing) 28 | 29 | #### TrainerとClientの対応表 30 | | MMVC Trainer ver | v1.2.x.x | v1.3.0.x | 1.3.2.x | 1.3.2.x(ONNX) | 31 | | ------------------------- | -------- | -------- | ------- | ------------- | 32 | | MMVC Client 無印(CPU/GPU) | 〇 | × | × | × | 33 | | MMVC Client v0.2.0.x(GPU) | 〇 | × | × | × | 34 | | MMVC Client v0.3.0.x(GPU) | × | 〇 | 〇 | × | 35 | | MMVC Client v0.3.1.x | × | 〇 | 〇 | 〇 | 36 | 37 | ## Install(python) 38 | このリポジトリをダウンロードして、展開してください。 39 | また、下記.exeの実行を.pyの実行に置き換えて実行してください。 40 | 41 | ## Usage 42 | ### 1. 使用可能なオーディオデバイス一覧の取得 43 | 「output_audio_device_list.exe」を実行します。 44 | 「audio_device_list.txt」が実行ファイルと同じディレクトリに出力されます。 45 | こちらに入出力のオーディオデバイス名およびIDが出力されており、下記セクション以降で利用します。 46 | ### 2. 
myprofile.confの書き換え 47 | myprofile.confの下記項目を環境に合わせて変更します。 48 | ``` 49 | "device": { 50 | "input_device1": "マイク (Realtek(R) Audio), MME", 51 | "input_device2": false, 52 | "output_device": "スピーカー (Realtek(R) Audio), MME", 53 | "gpu_id":0 54 | }, 55 | ``` 56 | 57 | ``` 58 | "vc_conf": { 59 | "frame_length": 8192, 60 | "delay_flames": 4096, 61 | "overlap": 1024, 62 | "dispose_stft_specs": 2, 63 | "dispose_conv1d_specs": 10, 64 | "source_id": 0, 65 | "target_id": 101, 66 | "onnx": { 67 | "use_onnx": true, 68 | "onnx_providers": ["DmlExecutionProvider", "CPUExecutionProvider"] 69 | } 70 | }, 71 | ``` 72 | 73 | ``` 74 | "path": { 75 | "json": ".\\logs\\20220306_24000\\config.json", 76 | "model": ".\\logs\\20220306_24000\\G_latest_99999999.onnx", 77 | "noise": ".\\noise.wav" 78 | }, 79 | ``` 80 | 81 | ``` 82 | "others": { 83 | "use_nr":false, 84 | "voice_selector":false, 85 | "voice_list": [101, 108, 6, 30], 86 | "voice_label": ["ずんだもん", "目標話者", "女性の声", "男性の低い声"] 87 | } 88 | ``` 89 | ### 2.1 myprofile.confの書き換え(device) 90 | このセクションでは、下記項目の変更方法について記載します。 91 | ``` 92 | "device": { 93 | "input_device1": "マイク (Realtek(R) Audio), MME", 94 | "input_device2": false, 95 | "output_device": "スピーカー (Realtek(R) Audio), MME", 96 | "gpu_id":0 97 | }, 98 | ``` 99 | 各要素はそれぞれ 100 | **input_device1 : マイク入力のデバイスID or デバイス名を指定します。** 101 | 102 | 103 | **input_device2 : 背景音声の入力のデバイスID or デバイス名を指定します。** 104 | 主にカラオケ等背景のBGMと自分の変換後の音声のラグを0にしたいときに使います。 105 | 106 | 107 | **output_device : 変換した音声の出力先のデバイスID or デバイス名を指定します。** 108 | 109 | 110 | **gpu_id : 複数GPUをPCに搭載している場合、数字で指定できます。** 111 | 使い分けが不要な場合は0のまま変更は不要です。 112 | 113 | ### 2.2 myprofile.confの書き換え(vc_conf) 114 | このセクションでは、下記項目の変更方法について記載します。 115 | ``` 116 | "vc_conf": { 117 | "frame_length": 8192, 118 | "delay_flames": 4096, 119 | "overlap": 1024, 120 | "dispose_stft_specs": 2, 121 | "dispose_conv1d_specs": 10, 122 | "source_id": 0, 123 | "target_id": 101, 124 | "onnx": { 125 | "use_onnx": true, 126 | "onnx_providers": ["DmlExecutionProvider", "CPUExecutionProvider"] 127 | } 128 | }, 129 | ``` 130 | この項目では、下記4項目のみ変更します。それ以外の項目については割愛します。 131 | **source_id : 変換元の音声の話者IDになります。** 132 | Trainerで特に弄っていなければ、107のままで問題ありません。 133 | 134 | **target_id : 変換先の音声の話者IDになります。** 135 | 学習時に生成した「./filelists/train_config_Correspondence.txt」を参考に話者IDを指定してください。 136 | チュートリアルもんであれば101のままで問題ありません。 137 | 138 | **onnx.use_onnx : 変換にONNXを使うか指定します。** 139 | ONNXを使って変換する場合trueにします。 140 | 従来のtorchを使って変換する場合はfalseにします。この場合、onnx_providersの設定は無視されます。 141 | ONNXを利用する場合、学習したモデルは「~.onnx」形式のファイルを指定します。 142 | 従来のtorchを利用する場合、「~.pth」形式のファイルを指定します。 143 | 144 | **onnx.onnx_providers : ONNXが使う実行エンジンと優先順位を指定します。** 145 | 記述されている順番に優先して利用されます。 146 | - DmlExecutionProvider 147 | DirectMLを利用します。GPUを利用したい場合はこの項目を先に入れてください。 148 | - CPUExecutionProvider 149 | CPUを利用します。ONNXでCPUでの変換をしたい場合は、この項目だけを入れます。 150 | 151 | ### 2.3 myprofile.confの書き換え(path) 152 | このセクションでは、下記項目の変更方法について記載します。 153 | ``` 154 | "path": { 155 | "json": ".\\logs\\20220306_24000\\config.json", 156 | "model": ".\\logs\\20220306_24000\\G_latest_99999999.onnx", 157 | "noise": ".\\noise.wav" 158 | }, 159 | ``` 160 | **※ここで指定するパスは必ず「\」ではなく「\\\\」で区切ってください。** 161 | 162 | 学習済みフォルダ内に config.json, G_latest_99999999.pth, G_latest_99999999.onnx 等のファイルがあります。 163 | これらのファイルをコピーして、この例の場合、logsフォルダ内に「20220306_24000」フォルダを作って、その中にファイルを置きます。 164 | 165 | **json : 学習時に生成したconfigファイルのパスを指定します。** 166 | 学習時の設定ファイル ./logs/xxxx/config.json を指定します。 167 | 168 | **model : 学習したモデルのパスを指定します。** 169 | 学習済みモデルファイル ./logs/xxxx/G_xxxx.pth といった感じのファイルを指定します。 170 | ONNXを使って変換する場合は 
./logs/xxxx/G_xxxx.onnx といったONNX形式ファイルを指定します。 171 | 172 | **noise : 現在非推奨で使わないのでそのままでいいです。** 173 | 使いたい方は下記おまけセクションを参考ください。 174 | 175 | ### 2.4 myprofile.confの書き換え(others) 176 | このセクションでは、下記項目の変更方法について記載します。 177 | ``` 178 | "others": { 179 | "use_nr":false, 180 | "voice_selector":false, 181 | "voice_list": [101, 108, 6, 30], 182 | "voice_label": ["ずんだもん", "目標話者", "女性の声", "男性の低い声"] 183 | } 184 | ``` 185 | 各要素はそれぞれ 186 | **use_nr : ノイズリダクションを有効化するかしないか指定します。** 187 | 現状は品質が下がるため、自前で用意することを推奨します。 188 | この機能を使う場合、 true に書き換えてください。 189 | 190 | 191 | **voice_selector : MMVC起動中にターゲット話者をリアルタイムで変更する機能を有効化するかしないか指定します。** 192 | この機能を十全に使うには、複数話者の同時学習を行う必要があります。 193 | 複数話者の同時学習を行っていない場合はfalseのままにしておいてください。 194 | 195 | 196 | **voice_list : voice_selectorを有効化したときに利用する項目です。学習した話者IDを記載します。** 197 | 198 | 199 | **voice_label : voice_selectorを有効化したときに利用する項目です。話者IDのラベルになります。** 200 | 201 | 202 | **input_filename : .wavファイルに対して音声変換したいときに利用する項目です。** 203 | デフォルトでは.confファイルに記入されていません。 204 | "input_filename": ".\\emotion059.wav", 205 | のように入力する.wavファイルのパスを指定します。 206 | **output_filename : .wavファイルに対して音声変換したいときに利用する項目です。** 207 | デフォルトでは.confファイルに記入されていません。 208 | "output_filename": ".\\trans_emotion059.wav" 209 | のように、変換結果の保存先とファイル名となる.wavファイルのパスを指定します。 210 | 211 | 212 | ### 3. ソフトウェアの起動 213 | パターン1 214 | 「mmvc_client_GPU.bat」を実行 215 | 正しく「myprofile.conf」が設定されていればそのまま起動します。 216 | 217 | パターン2 218 | 「mmvc_client_GPU.exe」を実行してください。 219 | 起動に少しだけ時間がかかります。 220 | 起動すると「myprofile.conf」のパスを聞かれるので、パスを指定して下さい。 221 | 222 | ### おまけ:ノイズリダクションの有効化 223 | #### 1. ノイズ音取得の実行 224 | 「rec_environmental_noise.exe」を実行します。 225 | 実行したら、モデルを学習したときに設定したサンプリングレートを入力してください。 226 | (MMVC_Trainerの設定を変えていなければ24000です) 227 | 次にmyprofile.confのパスを聞かれるため、編集したmyprofile.confのパスを入力してください。 228 | 以下の入力パスの例のように、.confファイルまで含めて入力して下さい。 229 | ``` 230 | D:\mmvc_client_GPU\myprofile.conf 231 | ``` 232 | ※注意として、入力パスの両端に”(ダブルクォーテーション)は付けないでください。 233 | パスの入力とmyprofile.confに問題が無ければ、ノイズの録音が開始されます。 234 | ノイズの録音が完了するまで、マイクに話しかけたり等しないで、待ちます。 235 | 「noise.wav」が実行ファイルと同じディレクトリに出力されます。 236 | 237 | #### 2. myprofile.confの書き換え 238 | ``` 239 | "path": { 240 | "json": ".\\logs\\20220306_24000\\config.json", 241 | "model": ".\\logs\\20220306_24000\\G_latest_99999999.onnx", 242 | "noise": ".\\noise.wav" 243 | } 244 | ``` 245 | 上記項目の"noise"に 1. ノイズ音取得の実行 で作成した「noise.wav」のパスを入力します。 246 | ``` 247 | "others": { 248 | "use_nr":false, 249 | "voice_selector":false, 250 | "voice_list": [101, 108, 6, 30], 251 | "voice_label": ["ずんだもん", "目標話者", "女性の声", "男性の低い声"] 252 | } 253 | ``` 254 | 上記項目の"use_nr"をtrueに変えます。 255 | 256 | ## Reference 257 | https://arxiv.org/abs/2106.06103 258 | https://github.com/jaywalnut310/vits 259 | https://github.com/timsainb/noisereduce 260 | ## Author 261 | Isle Tennos 262 | Twitter : https://twitter.com/IsleTennos 263 | 264 | -------------------------------------------------------------------------------- /python/residual_block.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Residual block modules. 
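Defines the HiFiGAN-style ResidualBlock and the pitch-adaptive AdaptiveResidualBlock consumed by the SiFiGAN generator in generator.py.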
7 | 8 | References: 9 | - https://github.com/kan-bayashi/ParallelWaveGAN 10 | - https://github.com/bigpon/QPPWG 11 | - https://github.com/r9y9/wavenet_vocoder 12 | 13 | """ 14 | 15 | from logging import getLogger 16 | 17 | import torch 18 | import torch.nn as nn 19 | from snake import Snake 20 | from index import index_initial, pd_indexing 21 | 22 | # A logger for this file 23 | logger = getLogger(__name__) 24 | 25 | 26 | class Conv1d(nn.Conv1d): 27 | """Conv1d module with customized initialization.""" 28 | 29 | def __init__(self, *args, **kwargs): 30 | """Initialize Conv1d module.""" 31 | super(Conv1d, self).__init__(*args, **kwargs) 32 | 33 | def reset_parameters(self): 34 | """Reset parameters.""" 35 | nn.init.kaiming_normal_(self.weight, nonlinearity="relu") 36 | if self.bias is not None: 37 | nn.init.constant_(self.bias, 0.0) 38 | 39 | 40 | class Conv1d1x1(Conv1d): 41 | """1x1 Conv1d with customized initialization.""" 42 | 43 | def __init__(self, in_channels, out_channels, bias=True): 44 | """Initialize 1x1 Conv1d module.""" 45 | super(Conv1d1x1, self).__init__( 46 | in_channels, out_channels, kernel_size=1, padding=0, dilation=1, bias=bias 47 | ) 48 | 49 | 50 | class Conv2d(nn.Conv2d): 51 | """Conv2d module with customized initialization.""" 52 | 53 | def __init__(self, *args, **kwargs): 54 | """Initialize Conv2d module.""" 55 | super(Conv2d, self).__init__(*args, **kwargs) 56 | 57 | def reset_parameters(self): 58 | """Reset parameters.""" 59 | nn.init.kaiming_normal_(self.weight, mode="fan_out", nonlinearity="relu") 60 | if self.bias is not None: 61 | nn.init.constant_(self.bias, 0.0) 62 | 63 | 64 | class Conv2d1x1(Conv2d): 65 | """1x1 Conv2d with customized initialization.""" 66 | 67 | def __init__(self, in_channels, out_channels, bias=True): 68 | """Initialize 1x1 Conv2d module.""" 69 | super(Conv2d1x1, self).__init__( 70 | in_channels, out_channels, kernel_size=1, padding=0, dilation=1, bias=bias 71 | ) 72 | 73 | 74 | class ResidualBlock(nn.Module): 75 | """Residual block module in HiFiGAN.""" 76 | 77 | def __init__( 78 | self, 79 | kernel_size=3, 80 | channels=512, 81 | dilations=(1, 3, 5), 82 | bias=True, 83 | use_additional_convs=True, 84 | nonlinear_activation="LeakyReLU", 85 | nonlinear_activation_params={"negative_slope": 0.1}, 86 | ): 87 | """Initialize ResidualBlock module. 88 | 89 | Args: 90 | kernel_size (int): Kernel size of dilation convolution layer. 91 | channels (int): Number of channels for convolution layer. 92 | dilations (List[int]): List of dilation factors. 93 | use_additional_convs (bool): Whether to use additional convolution layers. 94 | bias (bool): Whether to add bias parameter in convolution layers. 95 | nonlinear_activation (str): Activation function module name. 96 | nonlinear_activation_params (dict): Hyperparameters for activation function. 97 | 98 | """ 99 | super().__init__() 100 | self.use_additional_convs = use_additional_convs 101 | self.convs1 = nn.ModuleList() 102 | if use_additional_convs: 103 | self.convs2 = nn.ModuleList() 104 | assert kernel_size % 2 == 1, "Kernel size must be odd number." 
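# For each dilation, convs1 gets an (activation -> dilated conv) pair; when use_additional_convs is set, convs2 gets a matching (activation -> dilation-1 conv) applied after it.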
105 | for dilation in dilations: 106 | if nonlinear_activation == "Snake": 107 | nonlinear = Snake(channels, **nonlinear_activation_params) 108 | else: 109 | nonlinear = getattr(nn, nonlinear_activation)(**nonlinear_activation_params) 110 | self.convs1 += [ 111 | nn.Sequential( 112 | nonlinear, 113 | nn.Conv1d( 114 | channels, 115 | channels, 116 | kernel_size, 117 | dilation=dilation, 118 | bias=bias, 119 | padding=(kernel_size - 1) // 2 * dilation, 120 | ), 121 | ) 122 | ] 123 | if use_additional_convs: 124 | if nonlinear_activation == "Snake": 125 | nonlinear = Snake(channels, **nonlinear_activation_params) 126 | else: 127 | nonlinear = getattr(nn, nonlinear_activation)(**nonlinear_activation_params) 128 | self.convs2 += [ 129 | nn.Sequential( 130 | nonlinear, 131 | nn.Conv1d( 132 | channels, 133 | channels, 134 | kernel_size, 135 | dilation=1, 136 | bias=bias, 137 | padding=(kernel_size - 1) // 2, 138 | ), 139 | ) 140 | ] 141 | 142 | def forward(self, x): 143 | """Calculate forward propagation. 144 | 145 | Args: 146 | x (Tensor): Input tensor (B, channels, T). 147 | 148 | Returns: 149 | Tensor: Output tensor (B, channels, T). 150 | 151 | """ 152 | for idx in range(len(self.convs1)): 153 | xt = self.convs1[idx](x) 154 | if self.use_additional_convs: 155 | xt = self.convs2[idx](xt) 156 | x = xt + x 157 | return x 158 | 159 | 160 | class AdaptiveResidualBlock(nn.Module): 161 | """Residual block module in HiFiGAN.""" 162 | 163 | def __init__( 164 | self, 165 | kernel_size=3, 166 | channels=512, 167 | dilations=(1, 2, 4), 168 | bias=True, 169 | use_additional_convs=True, 170 | nonlinear_activation="LeakyReLU", 171 | nonlinear_activation_params={"negative_slope": 0.1}, 172 | ): 173 | """Initialize ResidualBlock module. 174 | 175 | Args: 176 | kernel_size (int): Kernel size of dilation convolution layer. 177 | channels (int): Number of channels for convolution layer. 178 | bias (bool): Whether to add bias parameter in convolution layers. 179 | nonlinear_activation (str): Activation function module name. 180 | nonlinear_activation_params (dict): Hyperparameters for activation function. 181 | 182 | """ 183 | super().__init__() 184 | self.use_additional_convs = use_additional_convs 185 | assert kernel_size == 3, "Currently only kernel_size = 3 is supported." 
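        # Note: convsC / convsP / convsF defined below are 1x1 convolutions applied to the
        # current frame and to the past / future samples selected by pitch-dependent indexing
        # (index_initial / pd_indexing in index.py); forward() sums their outputs to realize
        # the adaptive, F0-dependent dilation of this block.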
186 | self.channels = channels 187 | self.dilations = dilations 188 | self.nonlinears = nn.ModuleList() 189 | self.convsC = nn.ModuleList() 190 | self.convsP = nn.ModuleList() 191 | self.convsF = nn.ModuleList() 192 | if use_additional_convs: 193 | self.convsA = nn.ModuleList() 194 | for _ in dilations: 195 | if nonlinear_activation == "Snake": 196 | self.nonlinears += [Snake(channels, **nonlinear_activation_params)] 197 | else: 198 | self.nonlinears += [getattr(nn, nonlinear_activation)(**nonlinear_activation_params)] 199 | self.convsC += [ 200 | Conv1d1x1( 201 | channels, 202 | channels, 203 | bias=bias, 204 | ), 205 | ] 206 | self.convsP += [ 207 | Conv1d1x1( 208 | channels, 209 | channels, 210 | bias=bias, 211 | ), 212 | ] 213 | self.convsF += [ 214 | Conv1d1x1( 215 | channels, 216 | channels, 217 | bias=bias, 218 | ), 219 | ] 220 | if use_additional_convs: 221 | if nonlinear_activation == "Snake": 222 | nonlinear = Snake(channels, **nonlinear_activation_params) 223 | else: 224 | nonlinear = getattr(nn, nonlinear_activation)(**nonlinear_activation_params) 225 | self.convsA += [ 226 | nn.Sequential( 227 | nonlinear, 228 | nn.Conv1d( 229 | channels, 230 | channels, 231 | kernel_size, 232 | dilation=1, 233 | bias=bias, 234 | padding=(kernel_size - 1) // 2, 235 | ), 236 | ) 237 | ] 238 | 239 | def forward(self, x, d): 240 | """Calculate forward propagation. 241 | 242 | Args: 243 | x (Tensor): Input tensor (B, channels, T). 244 | d (Tensor): Input pitch-dependent dilated factors (B, 1, T). 245 | 246 | Returns: 247 | Tensor: Output tensor (B, channels, T). 248 | 249 | """ 250 | batch_index, ch_index = index_initial(x.size(0), self.channels, tensor=False) 251 | batch_index = torch.tensor(batch_index).to(x.device) 252 | ch_index = torch.tensor(ch_index).to(x.device) 253 | 254 | for i, dilation in enumerate(self.dilations): 255 | xt = self.nonlinears[i](x) 256 | xP, xF = pd_indexing(xt, d, dilation, batch_index, ch_index) 257 | xt = self.convsC[i](xt) + self.convsP[i](xP) + self.convsF[i](xF) 258 | if self.use_additional_convs: 259 | xt = self.convsA[i](xt) 260 | x = xt + x 261 | return x 262 | -------------------------------------------------------------------------------- /python/generator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """HiFiGAN and SiFiGAN Generator modules. 7 | 8 | References: 9 | - https://github.com/kan-bayashi/ParallelWaveGAN 10 | - https://github.com/bigpon/QPPWG 11 | - https://github.com/jik876/hifi-gan 12 | 13 | """ 14 | 15 | from logging import getLogger 16 | 17 | import torch 18 | import torch.nn as nn 19 | from residual_block import AdaptiveResidualBlock, Conv1d, ResidualBlock 20 | 21 | # A logger for this file 22 | logger = getLogger(__name__) 23 | 24 | 25 | 26 | class SiFiGANGenerator(nn.Module): 27 | """SiFiGAN generator module.""" 28 | 29 | def __init__( 30 | self, 31 | in_channels, 32 | out_channels=1, 33 | channels=512, 34 | kernel_size=7, 35 | upsample_scales=(5, 4, 3, 2), 36 | upsample_kernel_sizes=(10, 8, 6, 4), 37 | source_network_params={ 38 | "resblock_kernel_size": 3, # currently only 3 is supported. 
39 | "resblock_dilations": [(1,), (1, 2), (1, 2, 4), (1, 2, 4, 8)], 40 | "use_additional_convs": True, 41 | }, 42 | filter_network_params={ 43 | "resblock_kernel_sizes": (3, 5, 7), 44 | "resblock_dilations": [(1, 3, 5), (1, 3, 5), (1, 3, 5)], 45 | "use_additional_convs": False, 46 | }, 47 | share_upsamples=False, 48 | share_downsamples=False, 49 | bias=True, 50 | nonlinear_activation="LeakyReLU", 51 | nonlinear_activation_params={"negative_slope": 0.1}, 52 | use_weight_norm=True, 53 | requires_grad=True 54 | ): 55 | """Initialize SiFiGANGenerator module. 56 | 57 | Args: 58 | in_channels (int): Number of input channels. 59 | out_channels (int): Number of output channels. 60 | channels (int): Number of hidden representation channels. 61 | kernel_size (int): Kernel size of initial and final conv layer. 62 | upsample_scales (list): List of upsampling scales. 63 | upsample_kernel_sizes (list): List of kernel sizes for upsampling layers. 64 | source_network_params (dict): Parameters for source-network. 65 | filter_network_params (dict): Parameters for filter-network. 66 | share_upsamples (bool): Whether to share up-sampling transposed CNNs. 67 | share_downsamples (bool): Whether to share down-sampling CNNs. 68 | bias (bool): Whether to add bias parameter in convolution layers. 69 | nonlinear_activation (str): Activation function module name. 70 | nonlinear_activation_params (dict): Hyperparameters for activation function. 71 | use_weight_norm (bool): Whether to use weight norm. 72 | If set to true, it will be applied to all of the conv layers. 73 | 74 | """ 75 | super().__init__() 76 | # check hyperparameters are valid 77 | assert kernel_size % 2 == 1, "Kernel size must be odd number." 78 | assert len(upsample_scales) == len(upsample_kernel_sizes) 79 | 80 | # define modules 81 | self.num_upsamples = len(upsample_kernel_sizes) 82 | self.source_network_params = source_network_params 83 | self.filter_network_params = filter_network_params 84 | self.share_upsamples = share_upsamples 85 | self.share_downsamples = share_downsamples 86 | self.sn = nn.ModuleDict() 87 | self.fn = nn.ModuleDict() 88 | self.input_conv = Conv1d( 89 | in_channels, 90 | channels, 91 | kernel_size, 92 | bias=bias, 93 | padding=(kernel_size - 1) // 2, 94 | ) 95 | self.sn["upsamples"] = nn.ModuleList() 96 | self.fn["upsamples"] = nn.ModuleList() 97 | self.sn["blocks"] = nn.ModuleList() 98 | self.fn["blocks"] = nn.ModuleList() 99 | for i in range(len(upsample_kernel_sizes)): 100 | assert upsample_kernel_sizes[i] == 2 * upsample_scales[i] 101 | self.sn["upsamples"] += [ 102 | nn.Sequential( 103 | getattr(nn, nonlinear_activation)(**nonlinear_activation_params), 104 | nn.ConvTranspose1d( 105 | channels // (2 ** i), 106 | channels // (2 ** (i + 1)), 107 | upsample_kernel_sizes[i], 108 | upsample_scales[i], 109 | padding=upsample_scales[i] // 2 + upsample_scales[i] % 2, 110 | output_padding=upsample_scales[i] % 2, 111 | bias=bias, 112 | ), 113 | ) 114 | ] 115 | if not share_upsamples: 116 | self.fn["upsamples"] += [ 117 | nn.Sequential( 118 | getattr(nn, nonlinear_activation)(**nonlinear_activation_params), 119 | nn.ConvTranspose1d( 120 | channels // (2 ** i), 121 | channels // (2 ** (i + 1)), 122 | upsample_kernel_sizes[i], 123 | upsample_scales[i], 124 | padding=upsample_scales[i] // 2 + upsample_scales[i] % 2, 125 | output_padding=upsample_scales[i] % 2, 126 | bias=bias, 127 | ), 128 | ) 129 | ] 130 | self.sn["blocks"] += [ 131 | AdaptiveResidualBlock( 132 | kernel_size=source_network_params["resblock_kernel_size"], 133 | 
channels=channels // (2 ** (i + 1)), 134 | dilations=source_network_params["resblock_dilations"][i], 135 | bias=bias, 136 | use_additional_convs=source_network_params["use_additional_convs"], 137 | nonlinear_activation=nonlinear_activation, 138 | nonlinear_activation_params=nonlinear_activation_params, 139 | ) 140 | ] 141 | for j in range(len(filter_network_params["resblock_kernel_sizes"])): 142 | self.fn["blocks"] += [ 143 | ResidualBlock( 144 | kernel_size=filter_network_params["resblock_kernel_sizes"][j], 145 | channels=channels // (2 ** (i + 1)), 146 | dilations=filter_network_params["resblock_dilations"][j], 147 | bias=bias, 148 | use_additional_convs=filter_network_params["use_additional_convs"], 149 | nonlinear_activation=nonlinear_activation, 150 | nonlinear_activation_params=nonlinear_activation_params, 151 | ) 152 | ] 153 | self.sn["output_conv"] = nn.Sequential( 154 | nn.LeakyReLU(), 155 | nn.Conv1d( 156 | channels // (2 ** (i + 1)), 157 | out_channels, 158 | kernel_size, 159 | bias=bias, 160 | padding=(kernel_size - 1) // 2, 161 | ), 162 | ) 163 | self.fn["output_conv"] = nn.Sequential( 164 | nn.LeakyReLU(), 165 | nn.Conv1d( 166 | channels // (2 ** (i + 1)), 167 | out_channels, 168 | kernel_size, 169 | bias=bias, 170 | padding=(kernel_size - 1) // 2, 171 | ), 172 | nn.Tanh(), 173 | ) 174 | 175 | # sine embedding layers 176 | self.sn["emb"] = Conv1d( 177 | 1, 178 | channels // (2 ** len(upsample_kernel_sizes)), 179 | kernel_size, 180 | bias=bias, 181 | padding=(kernel_size - 1) // 2, 182 | ) 183 | # down-sampling CNNs 184 | self.sn["downsamples"] = nn.ModuleList() 185 | for i in reversed(range(1,len(upsample_kernel_sizes))): 186 | self.sn["downsamples"] += [ 187 | nn.Sequential( 188 | nn.Conv1d( 189 | channels // (2 ** (i + 1)), 190 | channels // (2 ** i), 191 | upsample_kernel_sizes[i], 192 | upsample_scales[i], 193 | padding=upsample_scales[i] - (upsample_kernel_sizes[i] % 2 == 0), 194 | bias=bias, 195 | ), 196 | getattr(nn, nonlinear_activation)(**nonlinear_activation_params), 197 | ) 198 | ] 199 | if not share_downsamples: 200 | self.fn["downsamples"] = nn.ModuleList() 201 | for i in reversed(range(1,len(upsample_kernel_sizes))): 202 | self.fn["downsamples"] += [ 203 | nn.Sequential( 204 | nn.Conv1d( 205 | channels // (2 ** (i + 1)), 206 | channels // (2 ** i), 207 | upsample_kernel_sizes[i], 208 | upsample_scales[i], 209 | padding=upsample_scales[i] - (upsample_kernel_sizes[i] % 2 == 0), 210 | bias=bias, 211 | ), 212 | getattr(nn, nonlinear_activation)(**nonlinear_activation_params), 213 | ) 214 | ] 215 | 216 | # apply weight norm 217 | if use_weight_norm: 218 | self.apply_weight_norm() 219 | 220 | # reset parameters 221 | self.reset_parameters() 222 | 223 | if requires_grad == False: 224 | for param in self.parameters(): 225 | param.requires_grad = False 226 | 227 | def forward(self, x, c, d, sid): 228 | """Calculate forward propagation. 229 | 230 | Args: 231 | x (Tensor): Input sine signal (B, 1, T). 232 | c (Tensor): Input tensor (B, in_channels, T). 233 | d (List): F0-dependent dilation factors [(B, 1, T) x num_upsamples]. 234 | 235 | Returns: 236 | Tensor: Output tensor (B, out_channels, T). 
237 | 238 | """ 239 | 240 | # currently, same input feature is input to each network 241 | c = self.input_conv(c) 242 | e = c 243 | 244 | # source-network forward 245 | x = self.sn["emb"](x) 246 | embs = [x] 247 | for i in range(self.num_upsamples - 1): 248 | x = self.sn["downsamples"][i](x) 249 | embs += [x] 250 | for i in range(self.num_upsamples): 251 | # excitation generation network 252 | e = self.sn["upsamples"][i](e) + embs[-i - 1] 253 | e = self.sn["blocks"][i](e, d[i]) 254 | e_ = self.sn["output_conv"](e) 255 | 256 | # filter-network forward 257 | embs = [e] 258 | for i in range(self.num_upsamples - 1): 259 | if self.share_downsamples: 260 | e = self.sn["downsamples"][i](e) 261 | else: 262 | e = self.fn["downsamples"][i](e) 263 | embs += [e] 264 | num_blocks = len(self.filter_network_params["resblock_kernel_sizes"]) 265 | for i in range(self.num_upsamples): 266 | # resonance filtering network 267 | if self.share_upsamples: 268 | c = self.sn["upsamples"][i](c) + embs[-i - 1] 269 | else: 270 | c = self.fn["upsamples"][i](c) + embs[-i - 1] 271 | cs = 0.0 # initialize 272 | for j in range(num_blocks): 273 | cs += self.fn["blocks"][i * num_blocks + j](c) 274 | c = cs / num_blocks 275 | c = self.fn["output_conv"](c) 276 | 277 | return c, e_ 278 | 279 | def reset_parameters(self): 280 | """Reset parameters. 281 | 282 | This initialization follows the official implementation manner. 283 | https://github.com/jik876/hifi-gan/blob/master/models.py 284 | 285 | """ 286 | 287 | def _reset_parameters(m): 288 | if isinstance(m, (nn.Conv1d, nn.ConvTranspose1d)): 289 | m.weight.data.normal_(0.0, 0.01) 290 | logger.debug(f"Reset parameters in {m}.") 291 | 292 | self.apply(_reset_parameters) 293 | 294 | def remove_weight_norm(self): 295 | """Remove weight normalization module from all of the layers.""" 296 | 297 | def _remove_weight_norm(m): 298 | try: 299 | logger.debug(f"Weight norm is removed from {m}.") 300 | nn.utils.remove_weight_norm(m) 301 | except ValueError: # this module didn't have weight norm 302 | return 303 | 304 | self.apply(_remove_weight_norm) 305 | 306 | def apply_weight_norm(self): 307 | """Apply weight normalization module from all of the layers.""" 308 | 309 | def _apply_weight_norm(m): 310 | if isinstance(m, nn.Conv1d) or isinstance(m, nn.ConvTranspose1d): 311 | nn.utils.weight_norm(m) 312 | logger.debug(f"Weight norm is applied to {m}.") 313 | 314 | self.apply(_apply_weight_norm) -------------------------------------------------------------------------------- /python/models.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | import commons 9 | import modules 10 | 11 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 12 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 13 | from commons import init_weights, get_padding 14 | from generator import SiFiGANGenerator 15 | from features import SignalGenerator, dilated_factor 16 | 17 | class TextEncoder(nn.Module): 18 | def __init__(self, 19 | out_channels, 20 | hidden_channels, 21 | requires_grad=True): 22 | super().__init__() 23 | self.out_channels = out_channels 24 | self.hidden_channels = hidden_channels 25 | self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1) 26 | #パラメータを学習しない 27 | if requires_grad == False: 28 | for param in self.parameters(): 29 | param.requires_grad = False 30 | 31 | def forward(self, x, 
x_lengths): 32 | x = torch.transpose(x.half(), 1, -1) # [b, h, t] 33 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 34 | stats = self.proj(x) * x_mask 35 | m, logs = torch.split(stats, self.out_channels, dim=1) 36 | return x, m, logs, x_mask 37 | 38 | class ResidualCouplingBlock(nn.Module): 39 | def __init__(self, 40 | channels, 41 | hidden_channels, 42 | kernel_size, 43 | dilation_rate, 44 | n_layers, 45 | n_flows=4, 46 | gin_channels=0, 47 | requires_grad=True): 48 | super().__init__() 49 | self.channels = channels 50 | self.hidden_channels = hidden_channels 51 | self.kernel_size = kernel_size 52 | self.dilation_rate = dilation_rate 53 | self.n_layers = n_layers 54 | self.n_flows = n_flows 55 | self.gin_channels = gin_channels 56 | 57 | self.flows = nn.ModuleList() 58 | for i in range(n_flows): 59 | self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) 60 | self.flows.append(modules.Flip()) 61 | 62 | #パラメータを学習しない 63 | if requires_grad == False: 64 | for param in self.parameters(): 65 | param.requires_grad = False 66 | 67 | def forward(self, x, x_mask, g=None, reverse=False): 68 | if not reverse: 69 | for flow in self.flows: 70 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 71 | else: 72 | for flow in reversed(self.flows): 73 | x = flow(x, x_mask, g=g, reverse=reverse) 74 | return x 75 | 76 | 77 | class PosteriorEncoder(nn.Module): 78 | def __init__(self, 79 | in_channels, 80 | out_channels, 81 | hidden_channels, 82 | kernel_size, 83 | dilation_rate, 84 | n_layers, 85 | gin_channels=0, 86 | requires_grad=True): 87 | super().__init__() 88 | self.in_channels = in_channels 89 | self.out_channels = out_channels 90 | self.hidden_channels = hidden_channels 91 | self.kernel_size = kernel_size 92 | self.dilation_rate = dilation_rate 93 | self.n_layers = n_layers 94 | self.gin_channels = gin_channels 95 | 96 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 97 | self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) 98 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 99 | 100 | #パラメータを学習しない 101 | if requires_grad == False: 102 | for param in self.parameters(): 103 | param.requires_grad = False 104 | 105 | 106 | def forward(self, x, x_lengths, g=None): 107 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 108 | x = self.pre(x) * x_mask 109 | x = self.enc(x, x_mask, g=g) 110 | stats = self.proj(x) * x_mask 111 | m, logs = torch.split(stats, self.out_channels, dim=1) 112 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 113 | return z, m, logs, x_mask 114 | 115 | 116 | class Generator(torch.nn.Module): 117 | def __init__(self, 118 | initial_channel, 119 | resblock, 120 | resblock_kernel_sizes, 121 | resblock_dilation_sizes, 122 | upsample_rates, 123 | upsample_initial_channel, 124 | upsample_kernel_sizes, 125 | requires_grad=True): 126 | super(Generator, self).__init__() 127 | self.num_kernels = len(resblock_kernel_sizes) 128 | self.num_upsamples = len(upsample_rates) 129 | self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) 130 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 131 | 132 | self.ups = nn.ModuleList() 133 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 134 | self.ups.append(weight_norm( 135 | ConvTranspose1d(upsample_initial_channel//(2**i), 
upsample_initial_channel//(2**(i+1)), 136 | k, u, padding=(k-u)//2))) 137 | 138 | self.resblocks = nn.ModuleList() 139 | for i in range(len(self.ups)): 140 | ch = upsample_initial_channel//(2**(i+1)) 141 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): 142 | self.resblocks.append(resblock(ch, k, d)) 143 | 144 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 145 | self.ups.apply(init_weights) 146 | 147 | if requires_grad == False: 148 | for param in self.parameters(): 149 | param.requires_grad = False 150 | 151 | 152 | def forward(self, x, g=None): 153 | x = self.conv_pre(x) 154 | 155 | for i in range(self.num_upsamples): 156 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 157 | x = self.ups[i](x) 158 | xs = None 159 | for j in range(self.num_kernels): 160 | if xs is None: 161 | xs = self.resblocks[i*self.num_kernels+j](x) 162 | else: 163 | xs += self.resblocks[i*self.num_kernels+j](x) 164 | x = xs / self.num_kernels 165 | x = F.leaky_relu(x) 166 | x = self.conv_post(x) 167 | x = torch.tanh(x) 168 | 169 | return x 170 | 171 | def remove_weight_norm(self): 172 | print('Removing weight norm...') 173 | for l in self.ups: 174 | remove_weight_norm(l) 175 | for l in self.resblocks: 176 | l.remove_weight_norm() 177 | 178 | 179 | class DiscriminatorP(torch.nn.Module): 180 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 181 | super(DiscriminatorP, self).__init__() 182 | self.period = period 183 | self.use_spectral_norm = use_spectral_norm 184 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 185 | self.convs = nn.ModuleList([ 186 | norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 187 | norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 188 | norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 189 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 190 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), 191 | ]) 192 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 193 | 194 | def forward(self, x): 195 | fmap = [] 196 | 197 | # 1d to 2d 198 | b, c, t = x.shape 199 | if t % self.period != 0: # pad first 200 | n_pad = self.period - (t % self.period) 201 | x = F.pad(x, (0, n_pad), "reflect") 202 | t = t + n_pad 203 | x = x.view(b, c, t // self.period, self.period) 204 | 205 | for l in self.convs: 206 | x = l(x) 207 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 208 | fmap.append(x) 209 | x = self.conv_post(x) 210 | fmap.append(x) 211 | x = torch.flatten(x, 1, -1) 212 | 213 | return x, fmap 214 | 215 | 216 | class DiscriminatorS(torch.nn.Module): 217 | def __init__(self, use_spectral_norm=False): 218 | super(DiscriminatorS, self).__init__() 219 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 220 | self.convs = nn.ModuleList([ 221 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 222 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 223 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 224 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 225 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 226 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 227 | ]) 228 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 229 | 230 | def forward(self, x): 231 | fmap = [] 232 | 233 | for l in self.convs: 234 | x = l(x) 235 | x = 
F.leaky_relu(x, modules.LRELU_SLOPE) 236 | fmap.append(x) 237 | x = self.conv_post(x) 238 | fmap.append(x) 239 | x = torch.flatten(x, 1, -1) 240 | 241 | return x, fmap 242 | 243 | 244 | class MultiPeriodDiscriminator(torch.nn.Module): 245 | def __init__(self, use_spectral_norm=False): 246 | super(MultiPeriodDiscriminator, self).__init__() 247 | #periods = [2,3,5,7,11] 248 | periods = [3,5,7,11,13] 249 | 250 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 251 | discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] 252 | self.discriminators = nn.ModuleList(discs) 253 | 254 | def forward(self, y, y_hat, flag = True): 255 | if flag: 256 | y_d_rs = [] 257 | y_d_gs = [] 258 | fmap_rs = [] 259 | fmap_gs = [] 260 | for i, d in enumerate(self.discriminators): 261 | y_d_r, fmap_r = d(y) 262 | y_d_g, fmap_g = d(y_hat) 263 | y_d_rs.append(y_d_r) 264 | y_d_gs.append(y_d_g) 265 | fmap_rs.append(fmap_r) 266 | fmap_gs.append(fmap_g) 267 | 268 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 269 | else: 270 | y_d_gs = [] 271 | with torch.no_grad(): 272 | for i, d in enumerate(self.discriminators): 273 | y_d_g, _ = d(y_hat) 274 | y_d_gs.append(y_d_g) 275 | 276 | return y_d_gs 277 | 278 | 279 | class SynthesizerTrn(nn.Module): 280 | """ 281 | Synthesizer for Training 282 | """ 283 | 284 | def __init__(self, 285 | spec_channels, 286 | segment_size, 287 | inter_channels, 288 | hidden_channels, 289 | upsample_rates, 290 | upsample_initial_channel, 291 | upsample_kernel_sizes, 292 | n_flow, 293 | dec_out_channels=1, 294 | dec_kernel_size=7, 295 | n_speakers=0, 296 | gin_channels=0, 297 | requires_grad_pe=True, 298 | requires_grad_flow=True, 299 | requires_grad_text_enc=True, 300 | requires_grad_dec=True, 301 | requires_grad_emb_g=True, 302 | sample_rate=24000, 303 | hop_size=128, 304 | sine_amp=0.1, 305 | noise_amp=0.003, 306 | signal_types=["sine"], 307 | dense_factors=[0.5, 1, 4, 8], 308 | upsample_scales=[8, 4, 2, 2], 309 | ): 310 | 311 | super().__init__() 312 | self.spec_channels = spec_channels 313 | self.hidden_channels = hidden_channels 314 | self.upsample_rates = upsample_rates 315 | self.upsample_initial_channel = upsample_initial_channel 316 | self.upsample_kernel_sizes = upsample_kernel_sizes 317 | self.segment_size = segment_size 318 | self.dec_out_channels = dec_out_channels 319 | self.dec_kernel_size = dec_kernel_size 320 | self.n_speakers = n_speakers 321 | self.gin_channels = gin_channels 322 | self.requires_grad_pe = requires_grad_pe 323 | self.requires_grad_flow = requires_grad_flow 324 | self.requires_grad_text_enc = requires_grad_text_enc 325 | self.requires_grad_dec = requires_grad_dec 326 | self.requires_grad_emb_g = requires_grad_emb_g 327 | self.sample_rate = sample_rate 328 | self.hop_size = hop_size 329 | self.sine_amp = sine_amp 330 | self.noise_amp = noise_amp 331 | self.signal_types = signal_types 332 | self.dense_factors = dense_factors 333 | self.upsample_scales = upsample_scales 334 | 335 | self.enc_q = PosteriorEncoder( 336 | spec_channels, 337 | inter_channels, 338 | hidden_channels, 339 | 5, 340 | 1, 341 | 16, 342 | gin_channels=gin_channels, 343 | requires_grad=requires_grad_pe) 344 | self.enc_p = TextEncoder( 345 | inter_channels, 346 | hidden_channels, 347 | requires_grad=requires_grad_text_enc) 348 | self.dec = SiFiGANGenerator( 349 | in_channels=inter_channels, 350 | out_channels=dec_out_channels, 351 | channels=upsample_initial_channel, 352 | kernel_size=dec_kernel_size, 353 | upsample_scales=upsample_rates, 354 | 
upsample_kernel_sizes=upsample_kernel_sizes, 355 | requires_grad=requires_grad_dec) 356 | self.flow = ResidualCouplingBlock( 357 | inter_channels, 358 | hidden_channels, 359 | 5, 360 | 1, 361 | 4, 362 | n_flows=n_flow, 363 | gin_channels=gin_channels, 364 | requires_grad=requires_grad_flow) 365 | self.signal_generator = SignalGenerator( 366 | sample_rate=sample_rate, 367 | hop_size=hop_size, 368 | noise_amp=noise_amp, 369 | signal_types=signal_types 370 | ) 371 | 372 | if n_speakers > 1: 373 | self.emb_g = nn.Embedding(n_speakers, gin_channels) 374 | self.emb_g.requires_grad = requires_grad_emb_g 375 | 376 | def forward(self, x, x_lengths, y, y_lengths, f0, slice_id, sid=None, target_ids=None): 377 | sin, d = self.make_sin_d(f0) 378 | 379 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths) 380 | #target sid 作成 381 | target_sids = self.make_random_target_sids(target_ids, sid) 382 | 383 | if self.n_speakers > 0: 384 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 385 | tgt_g = self.emb_g(target_sids).unsqueeze(-1) # [b, h, 1] 386 | else: 387 | g = None 388 | 389 | #PE 390 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) 391 | #Flow 392 | z_p = self.flow(z, y_mask, g=g) 393 | #VC 394 | tgt_z = self.flow(z_p, y_mask, g=tgt_g, reverse=True) 395 | #アライメントの作成 396 | liner_alignment = F.one_hot(torch.arange(0, x.shape[2]+2)).cuda() 397 | liner_alignment = torch.stack([liner_alignment for _ in range(x.shape[0])], axis=0) 398 | liner_alignment = F.interpolate(liner_alignment.float(), size=(z.shape[2]), mode='linear', align_corners=True) 399 | liner_alignment = liner_alignment[:,1:-1,:] 400 | #TextEncとPEのshape合わせ 401 | m_p = torch.matmul(m_p, liner_alignment) 402 | logs_p = torch.matmul(logs_p, liner_alignment) 403 | 404 | #slice 405 | z_slice = commons.slice_segments(z, slice_id, self.segment_size) 406 | #targetのslice 407 | tgt_z_slice = commons.slice_segments(tgt_z, slice_id, self.segment_size) 408 | #Dec 409 | o = self.dec(sin, z_slice, d, sid=g) 410 | tgt_o = self.dec(sin, tgt_z_slice, d, sid=tgt_g) 411 | 412 | return (o, tgt_o), slice_id, x_mask, y_mask, ((z, z_p, m_p), logs_p, m_q, logs_q) 413 | 414 | def make_sin_d(self, f0): 415 | # f0 から sin と d を作成 416 | # f0 : [b, 1, t] 417 | # sin : [b, 1, t] 418 | # d : [4][b, 1, t] 419 | prod_upsample_scales = np.cumprod(self.upsample_scales) 420 | dfs_batch = [] 421 | for df, us in zip(self.dense_factors, prod_upsample_scales): 422 | dilated_tensor = dilated_factor(f0, self.sample_rate, df) 423 | #result += [torch.repeat_interleave(dilated_tensor, us, dim=1)] 424 | result = [torch.stack([dilated_tensor for _ in range(us)], -1).reshape(dilated_tensor.shape[0], -1)] 425 | dfs_batch.append(torch.cat(result, dim=0).unsqueeze(1)) 426 | in_batch = self.signal_generator(f0) 427 | 428 | return in_batch, dfs_batch 429 | 430 | def make_random_target_sids(self, target_ids, sid): 431 | # target_sids は target_ids をランダムで埋める 432 | target_sids = torch.zeros_like(sid) 433 | for i in range(len(target_sids)): 434 | source_id = sid[i] 435 | deleted_target_ids = target_ids[target_ids != source_id] # source_id と target_id が同じにならないよう sid と同じものを削除 436 | if len(deleted_target_ids) >= 1: 437 | target_sids[i] = deleted_target_ids[torch.randint(len(deleted_target_ids), (1,))] 438 | else: 439 | # target_id 候補が無いときは仕方ないので sid を使う 440 | target_sids[i] = source_id 441 | return target_sids 442 | 443 | def voice_conversion(self, y, y_lengths, f0, sid_src, sid_tgt): 444 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 
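        # Conversion path: build the sine excitation and pitch-dependent dilation factors from f0,
        # posterior-encode the source spectrogram conditioned on the source speaker embedding,
        # map it through the flow, invert the flow with the target speaker embedding, and
        # synthesize with the SiFiGAN decoder (o_hat[0] is the filter-network waveform output).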
445 | sin, d = self.make_sin_d(f0) 446 | g_src = self.emb_g(sid_src).unsqueeze(-1) 447 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 448 | z, _, _, y_mask = self.enc_q(y, y_lengths, g=g_src) 449 | z_p = self.flow(z, y_mask, g=g_src) 450 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) 451 | o_hat = self.dec(sin, z_hat * y_mask, d, sid=g_tgt) 452 | return o_hat[0] 453 | 454 | def voice_ra_pa_db(self, y, y_lengths, sid_src, sid_tgt): 455 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 456 | g_src = self.emb_g(sid_src).unsqueeze(-1) 457 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 458 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) 459 | o_hat = self.dec(z * y_mask, g=g_tgt) 460 | return o_hat, y_mask, (z) 461 | 462 | def voice_ra_pa_da(self, y, y_lengths, sid_src, sid_tgt): 463 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 464 | g_src = self.emb_g(sid_src).unsqueeze(-1) 465 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 466 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) 467 | o_hat = self.dec(z * y_mask, g=g_src) 468 | return o_hat, y_mask, (z) 469 | 470 | def voice_conversion_cycle(self, y, y_lengths, sid_src, sid_tgt): 471 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 472 | g_src = self.emb_g(sid_src).unsqueeze(-1) 473 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 474 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) 475 | z_p = self.flow(z, y_mask, g=g_src) 476 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) 477 | z_p_hat = self.flow(z_hat, y_mask, g=g_tgt) 478 | z_hat_hat = self.flow(z_p_hat, y_mask, g=g_src, reverse=True) 479 | o_hat = self.dec(z_hat_hat * y_mask, g=g_tgt) 480 | return o_hat, y_mask, (z, z_p, z_hat) 481 | 482 | def save_synthesizer(self, path): 483 | enc_q = self.enc_q.state_dict() 484 | dec = self.dec.state_dict() 485 | emb_g = self.emb_g.state_dict() 486 | torch.save({'enc_q': enc_q,'dec': dec, 'emb_g': emb_g}, path) 487 | 488 | def load_synthesizer(self, path): 489 | dict = torch.load(path, map_location='cpu') 490 | enc_q = dict['enc_q'] 491 | dec = dict['dec'] 492 | emb_g = dict['emb_g'] 493 | self.enc_q.load_state_dict(enc_q) 494 | self.dec.load_state_dict(dec) 495 | self.emb_g.load_state_dict(emb_g) 496 | 497 | -------------------------------------------------------------------------------- /python/mmvc_client.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | #use thread limit 3 | import os 4 | os.environ["OMP_NUM_THREADS"] = "1" 5 | import sys 6 | import json 7 | import csv 8 | import numpy as np 9 | import torch 10 | import onnxruntime as ort 11 | import pyaudio 12 | import sounddevice as sd 13 | import soundfile as sf 14 | import wave 15 | #noice reduce 16 | import noisereduce as nr 17 | #ファイルダイアログ関連 18 | import tkinter as tk #add 19 | from tkinter import filedialog #add 20 | 21 | #user lib 22 | from models import SynthesizerTrn 23 | 24 | #remove F0_SCALE 25 | 26 | import time 27 | import pyworld as pw 28 | from scipy.interpolate import interp1d 29 | from features import SignalGenerator, dilated_factor 30 | 31 | 32 | def load_checkpoint(checkpoint_path, model, optimizer=None): 33 | assert os.path.isfile(checkpoint_path), f"No such file or directory: {checkpoint_path}" 34 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') 35 | iteration = checkpoint_dict['iteration'] 36 | learning_rate = checkpoint_dict['learning_rate'] 37 | if optimizer is not None: 38 | 
optimizer.load_state_dict(checkpoint_dict['optimizer']) 39 | saved_state_dict = { 40 | **checkpoint_dict['pe'], 41 | **checkpoint_dict['flow'], 42 | **checkpoint_dict['text_enc'], 43 | **checkpoint_dict['dec'], 44 | **checkpoint_dict['emb_g'] 45 | } 46 | if hasattr(model, 'module'): 47 | state_dict = model.module.state_dict() 48 | else: 49 | state_dict = model.state_dict() 50 | new_state_dict= {} 51 | for k, v in state_dict.items(): 52 | try: 53 | new_state_dict[k] = saved_state_dict[k] 54 | except: 55 | new_state_dict[k] = v 56 | if hasattr(model, 'module'): 57 | model.module.load_state_dict(new_state_dict) 58 | else: 59 | model.load_state_dict(new_state_dict) 60 | return model, optimizer, learning_rate, iteration 61 | 62 | 63 | def get_hparams_from_file(config_path): 64 | with open(config_path, "r", encoding="utf-8") as f: 65 | data = f.read() 66 | config = json.loads(data) 67 | 68 | hparams =HParams(**config) 69 | return hparams 70 | 71 | 72 | def read_correspondence_file(filename, delimiter='|', newline='\n'): 73 | data = {} 74 | with open(filename, "r", encoding="utf-8", newline=newline) as f: 75 | csv_reader = csv.reader(f, delimiter=delimiter) 76 | for row in csv_reader: 77 | sid = int(row[0]) 78 | f0 = float(row[1]) 79 | data[sid] = f0 80 | return data 81 | 82 | class HParams(): 83 | def __init__(self, **kwargs): 84 | for k, v in kwargs.items(): 85 | if type(v) == dict: 86 | v = HParams(**v) 87 | self[k] = v 88 | 89 | def keys(self): 90 | return self.__dict__.keys() 91 | 92 | def items(self): 93 | return self.__dict__.items() 94 | 95 | def values(self): 96 | return self.__dict__.values() 97 | 98 | def __len__(self): 99 | return len(self.__dict__) 100 | 101 | def __getitem__(self, key): 102 | return getattr(self, key) 103 | 104 | def __setitem__(self, key, value): 105 | return setattr(self, key, value) 106 | 107 | def __contains__(self, key): 108 | return key in self.__dict__ 109 | 110 | def __repr__(self): 111 | return self.__dict__.__repr__() 112 | 113 | 114 | class Hyperparameters(): 115 | CHANNELS = 1 #モノラル 116 | FORMAT = pyaudio.paInt16 117 | INPUT_DEVICE_1 = None 118 | INPUT_DEVICE_2 = None 119 | OUTPUT_DEVICE_1 = None 120 | CONFIG_JSON_PATH = None 121 | MODEL_PATH = None 122 | NOISE_FILE = None 123 | CORRESPONDENCE_PATH = None 124 | FLAME_LENGTH = None 125 | SOURCE_ID = None 126 | TARGET_ID = None 127 | F0_SCALE = None 128 | MIC_SCALE = None 129 | USE_NR = None 130 | VOICE_LIST = None 131 | VOICE_LABEL = None 132 | VOICE_F0 = None 133 | #jsonから取得 134 | SAMPLE_RATE = None 135 | MAX_WAV_VALUE = None 136 | FILTER_LENGTH = None 137 | HOP_LENGTH = None 138 | SEGMENT_SIZE = None 139 | N_SPEAKERS = None 140 | CONFIG_JSON_Body = None 141 | DELAY_FLAMES = None 142 | #thread share var 143 | REC_NOISE_END_FLAG = False 144 | VC_END_FLAG = False 145 | OVERLAP = None 146 | DISPOSE_STFT_SPECS = 0 147 | DISPOSE_CONV1D_SPECS = 0 148 | INPUT_FILENAME = None 149 | OUTPUT_FILENAME = None 150 | GPU_ID = 0 151 | Voice_Selector_Flag = None 152 | USE_ONNX = None 153 | ONNX_PROVIDERS = None 154 | ORT_ENABLE_BASIC = None 155 | hps = None 156 | 157 | def set_input_device_1(self, value): 158 | Hyperparameters.INPUT_DEVICE_1 = value 159 | 160 | def set_input_device_2(self, value): 161 | Hyperparameters.INPUT_DEVICE_2 = value 162 | 163 | def set_output_device_1(self, value): 164 | Hyperparameters.OUTPUT_DEVICE_1 = value 165 | 166 | def set_config_path(self, value): 167 | Hyperparameters.CONFIG_JSON_PATH = value 168 | self.hps = get_hparams_from_file(Hyperparameters.CONFIG_JSON_PATH) 169 | 
Hyperparameters.CONFIG_JSON_Body = self.hps 170 | Hyperparameters.SAMPLE_RATE = self.hps.data.sampling_rate 171 | Hyperparameters.MAX_WAV_VALUE = self.hps.data.max_wav_value 172 | Hyperparameters.FILTER_LENGTH = self.hps.data.filter_length 173 | Hyperparameters.HOP_LENGTH = self.hps.data.hop_length 174 | Hyperparameters.SEGMENT_SIZE = self.hps.train.segment_size 175 | Hyperparameters.N_SPEAKERS = self.hps.data.n_speakers 176 | if not hasattr(self.hps.model, "use_mel_train"): 177 | self.hps.model.use_mel_train = False 178 | 179 | def set_model_path(self, value): 180 | Hyperparameters.MODEL_PATH = value 181 | 182 | def set_NOISE_FILE(self, value): 183 | Hyperparameters.NOISE_FILE = value 184 | 185 | def set_CORRESPONDENCE_PATH(self, value): 186 | Hyperparameters.CORRESPONDENCE_PATH = value 187 | 188 | def set_FLAME_LENGTH(self, value): 189 | Hyperparameters.FLAME_LENGTH = value 190 | 191 | def set_SOURCE_ID(self, value): 192 | Hyperparameters.SOURCE_ID = value 193 | 194 | def set_TARGET_ID(self, value): 195 | Hyperparameters.TARGET_ID = value 196 | 197 | def set_F0_SCALE(self, value): 198 | Hyperparameters.F0_SCALE = value 199 | 200 | def set_MIC_SCALE(self, value): 201 | Hyperparameters.MIC_SCALE = value 202 | 203 | def set_OVERLAP(self, value): 204 | Hyperparameters.OVERLAP = value 205 | 206 | def set_USE_NR(self, value): 207 | Hyperparameters.USE_NR = value 208 | 209 | def set_VOICE_LIST(self, value): 210 | Hyperparameters.VOICE_LIST = value 211 | 212 | def set_VOICE_LABEL(self, value): 213 | Hyperparameters.VOICE_LABEL = value 214 | 215 | def set_VOICE_F0(self, value): 216 | Hyperparameters.VOICE_F0 = value 217 | 218 | def set_DELAY_FLAMES(self, value): 219 | Hyperparameters.DELAY_FLAMES = value 220 | 221 | def set_DISPOSE_STFT_SPECS(self, value): 222 | Hyperparameters.DISPOSE_STFT_SPECS = value 223 | 224 | def set_DISPOSE_CONV1D_SPECS(self, value): 225 | Hyperparameters.DISPOSE_CONV1D_SPECS = value 226 | 227 | def set_INPUT_FILENAME(self, value): 228 | Hyperparameters.INPUT_FILENAME = value 229 | 230 | def set_OUTPUT_FILENAME(self, value): 231 | Hyperparameters.OUTPUT_FILENAME = value 232 | 233 | def set_GPU_ID(self, value): 234 | Hyperparameters.GPU_ID = value 235 | 236 | def set_Voice_Selector(self, value): 237 | Hyperparameters.Voice_Selector_Flag = value 238 | 239 | def set_USE_ONNX(self, value): 240 | Hyperparameters.USE_ONNX = value 241 | 242 | def set_ONNX_PROVIDERS(self, value): 243 | Hyperparameters.ONNX_PROVIDERS = value 244 | 245 | def set_ONNX_ORT_ENABLE_BASIC(self, value): 246 | Hyperparameters.ORT_ENABLE_BASIC = value 247 | 248 | def set_profile(self, profile): 249 | sound_devices = sd.query_devices() 250 | if type(profile.device.input_device1) == str: 251 | self.set_input_device_1(sound_devices.index(sd.query_devices(profile.device.input_device1, 'input'))) 252 | else: 253 | self.set_input_device_1(profile.device.input_device1) 254 | 255 | if type(profile.device.input_device2) == str: 256 | self.set_input_device_2(sound_devices.index(sd.query_devices(profile.device.input_device2, 'input'))) 257 | else: 258 | self.set_input_device_2(profile.device.input_device2) 259 | 260 | if type(profile.device.output_device) == str: 261 | self.set_output_device_1(sound_devices.index(sd.query_devices(profile.device.output_device, 'output'))) 262 | else: 263 | self.set_output_device_1(profile.device.output_device) 264 | 265 | self.set_config_path(profile.path.json) 266 | self.set_model_path(profile.path.model) 267 | self.set_NOISE_FILE(profile.path.noise) 268 | 
self.set_CORRESPONDENCE_PATH(profile.path.correspondence) 269 | self.set_FLAME_LENGTH(profile.vc_conf.frame_length) 270 | self.set_SOURCE_ID(profile.vc_conf.source_id) 271 | self.set_TARGET_ID(profile.vc_conf.target_id) 272 | self.set_F0_SCALE(profile.vc_conf.f0_scale) 273 | self.set_MIC_SCALE(profile.vc_conf.mic_scale) 274 | self.set_OVERLAP(profile.vc_conf.overlap) 275 | self.set_USE_NR(profile.others.use_nr) 276 | self.set_VOICE_LIST(profile.others.voice_list) 277 | self.set_VOICE_LABEL(profile.others.voice_label) 278 | self.set_VOICE_F0(profile.others.voice_f0) 279 | self.set_DELAY_FLAMES(profile.vc_conf.delay_flames) 280 | self.set_DISPOSE_STFT_SPECS(profile.vc_conf.dispose_stft_specs) 281 | self.set_DISPOSE_CONV1D_SPECS(profile.vc_conf.dispose_conv1d_specs) 282 | if hasattr(profile.others, "input_filename"): 283 | self.set_INPUT_FILENAME(profile.others.input_filename) 284 | if hasattr(profile.others, "output_filename"): 285 | self.set_OUTPUT_FILENAME(profile.others.output_filename) 286 | self.set_GPU_ID(profile.device.gpu_id) 287 | self.set_Voice_Selector(profile.others.voice_selector) 288 | if hasattr(profile.vc_conf, "onnx"): 289 | self.set_USE_ONNX(profile.vc_conf.onnx.use_onnx) 290 | self.set_ONNX_PROVIDERS(profile.vc_conf.onnx.onnx_providers) 291 | if hasattr(profile.vc_conf.onnx, "ort_enable_basic"): 292 | self.set_ONNX_ORT_ENABLE_BASIC(profile.vc_conf.onnx.ort_enable_basic) 293 | else: 294 | self.set_ONNX_ORT_ENABLE_BASIC(False) 295 | 296 | def launch_model(self): 297 | if self.hps.model.use_mel_train: 298 | channels = self.hps.data.n_mel_channels 299 | else: 300 | channels = self.hps.data.filter_length // 2 + 1 301 | 302 | net_g = SynthesizerTrn( 303 | spec_channels = channels, 304 | segment_size = self.hps.train.segment_size // self.hps.data.hop_length, 305 | inter_channels = self.hps.model.inter_channels, 306 | hidden_channels = self.hps.model.hidden_channels, 307 | upsample_rates = self.hps.model.upsample_rates, 308 | upsample_initial_channel = self.hps.model.upsample_initial_channel, 309 | upsample_kernel_sizes = self.hps.model.upsample_kernel_sizes, 310 | n_flow = self.hps.model.n_flow, 311 | dec_out_channels=1, 312 | dec_kernel_size=7, 313 | n_speakers = self.hps.data.n_speakers, 314 | gin_channels = self.hps.model.gin_channels, 315 | requires_grad_pe = self.hps.requires_grad.pe, 316 | requires_grad_flow = self.hps.requires_grad.flow, 317 | requires_grad_text_enc = self.hps.requires_grad.text_enc, 318 | requires_grad_dec = self.hps.requires_grad.dec, 319 | requires_grad_emb_g = self.hps.requires_grad.emb_g, 320 | sample_rate = self.hps.data.sampling_rate, 321 | hop_size = self.hps.data.hop_length, 322 | sine_amp = self.hps.data.sine_amp, 323 | noise_amp = self.hps.data.noise_amp, 324 | signal_types = self.hps.data.signal_types, 325 | dense_factors = self.hps.data.dense_factors, 326 | upsample_scales = self.hps.model.upsample_rates, 327 | ) 328 | _ = net_g.eval() 329 | 330 | return net_g 331 | 332 | #f0からcf0を推定する 333 | def convert_continuos_f0(self, f0, f0_size): 334 | """Convert F0 to continuous F0 335 | 336 | Args: 337 | f0 (ndarray): original f0 sequence with the shape (T) 338 | 339 | Return: 340 | (ndarray): continuous f0 with the shape (T) 341 | 342 | """ 343 | # get start and end of f0 344 | if (f0 == 0).all(): 345 | return np.zeros((f0_size,)) 346 | start_f0 = f0[f0 != 0][0] 347 | end_f0 = f0[f0 != 0][-1] 348 | # padding start and end of f0 sequence 349 | cf0 = f0 350 | start_idx = np.where(cf0 == start_f0)[0][0] 351 | end_idx = np.where(cf0 == end_f0)[0][-1] 352 
| cf0[:start_idx] = start_f0 353 | cf0[end_idx:] = end_f0 354 | # get non-zero frame index 355 | nz_frames = np.where(cf0 != 0)[0] 356 | # perform linear interpolation 357 | f = interp1d(nz_frames, cf0[nz_frames], bounds_error=False, fill_value=0.0) 358 | return f(np.arange(0, f0_size)) 359 | 360 | def audio_trans(self, tdbm, input, net_g, noise_data, target_id, f0_scale, dispose_stft_specs, dispose_conv1d_specs, ort_session=None): 361 | gpu_id = Hyperparameters.GPU_ID 362 | mic_scale = Hyperparameters.MIC_SCALE 363 | hop_length = Hyperparameters.HOP_LENGTH 364 | delay_frames = Hyperparameters.DELAY_FLAMES 365 | overlap_length = Hyperparameters.OVERLAP 366 | dispose_conv1d_length = dispose_conv1d_specs * hop_length 367 | dispose_specs = dispose_stft_specs * 2 + dispose_conv1d_specs * 2 368 | dispose_length = dispose_specs * hop_length 369 | fixed_length = (delay_frames + dispose_length + overlap_length) // hop_length - dispose_stft_specs * 2 370 | 371 | # byte => torch 372 | signal = np.frombuffer(input, dtype='int16') 373 | #signal = torch.frombuffer(input, dtype=torch.float32) 374 | signal = signal * mic_scale / Hyperparameters.MAX_WAV_VALUE 375 | #F0推定テスト 5.5が奇跡的にぴったり 376 | _f0, _time = pw.dio(signal, Hyperparameters.SAMPLE_RATE,frame_period = 5.5) # 基本周波数の抽出 377 | f0 = pw.stonemask(signal, _f0, _time, Hyperparameters.SAMPLE_RATE) # 基本周波数の修正 378 | f0 = self.convert_continuos_f0(f0, int(signal.shape[0] / hop_length)) 379 | f0 = torch.from_numpy(f0.astype(np.float32)) 380 | 381 | if Hyperparameters.USE_NR: 382 | signal = nr.reduce_noise(y=signal, sr=Hyperparameters.SAMPLE_RATE, y_noise = noise_data, n_std_thresh_stationary=2.5,stationary=True) 383 | # any to many への取り組み(失敗) 384 | # f0を変えるだけでは枯れた声は直らなかった 385 | #f0trans = Shifter(Hyperparameters.SAMPLE_RATE, 1.75, frame_ms=20, shift_ms=10) 386 | #transformed = f0trans.transform(signal) 387 | signal = torch.from_numpy(signal.astype(np.float32)).clone() 388 | 389 | #voice conversion 390 | with torch.no_grad(): 391 | #SID 392 | trans_length = signal.size()[0] 393 | spec, sid = tdbm.get_audio_text_speaker_pair(signal.view(1, trans_length), Hyperparameters.SOURCE_ID) 394 | if dispose_stft_specs != 0: 395 | # specの頭と終がstft paddingの影響受けるので2コマを削る 396 | # wavもspecで削るぶんと同じだけ頭256と終256を削る 397 | spec = spec[:, dispose_stft_specs:-dispose_stft_specs] 398 | f0 = f0[dispose_stft_specs:-dispose_stft_specs] 399 | sid_src = sid 400 | sid_target = torch.LongTensor([target_id]) # 話者IDはJVSの番号を100で割った余りです 401 | spec = spec.unsqueeze(0) 402 | spec_lengths = torch.tensor([spec.size(2)]) 403 | f0 = (f0 * f0_scale).unsqueeze(0).unsqueeze(0) 404 | if Hyperparameters.USE_ONNX: 405 | if spec_lengths.numpy() != fixed_length: # 固定長に足りない場合は0パディング 406 | spec_padding_size = (1, spec.size(1), fixed_length - spec.size(2)) 407 | spec_zero_padding = torch.zeros(spec_padding_size) 408 | spec = torch.cat([spec, spec_zero_padding], dim=2) 409 | f0_padding_size = (1, 1, fixed_length - f0.size(2)) 410 | f0_zero_padding = torch.zeros(f0_padding_size) 411 | f0 = torch.cat([f0, f0_zero_padding], dim=2) 412 | spec_lengths = torch.tensor([spec.size(2)]) 413 | sin, d = net_g.make_sin_d(f0) 414 | (d0, d1, d2, d3) = d 415 | audio = ort_session.run( 416 | ["audio"], 417 | { 418 | "specs": spec.numpy(), 419 | "lengths": spec_lengths.numpy(), 420 | "sin": sin.numpy(), 421 | "d0": d0.numpy(), 422 | "d1": d1.numpy(), 423 | "d2": d2.numpy(), 424 | "d3": d3.numpy(), 425 | "sid_src": sid_src.numpy(), 426 | "sid_tgt": sid_target.numpy() 427 | })[0][0,0] 428 | else: 429 | if gpu_id >= 0: 430 | #spec, 
spec_lengths, sid_src, sin, d = [x.cuda(gpu_id) for x in data] 431 | spec = spec.cuda(gpu_id) 432 | spec_lengths = spec_lengths.cuda(gpu_id) 433 | sid_src = sid_src.cuda(gpu_id) 434 | sid_target = sid_target.cuda(gpu_id) # 話者IDはJVSの番号を100で割った余りです 435 | f0 = f0.cuda(gpu_id) 436 | audio = net_g.cuda(gpu_id).voice_conversion(spec, spec_lengths, f0, sid_src, sid_target)[0,0].data.cpu().float().numpy() 437 | else: 438 | audio = net_g.voice_conversion(spec, spec_lengths, f0, sid_src, sid_target)[0,0].data.cpu().float().numpy() 439 | 440 | if dispose_conv1d_specs != 0: 441 | # 出力されたwavでconv1d paddingの影響受けるところを削る 442 | audio = audio[dispose_conv1d_length:-dispose_conv1d_length] 443 | audio = audio * Hyperparameters.MAX_WAV_VALUE 444 | audio = audio.astype(np.int16).tobytes() 445 | 446 | return audio 447 | 448 | def overlap_merge(self, now_wav, prev_wav, overlap_length): 449 | """ 450 | 生成したwavデータを前回生成したwavデータとoverlap_lengthだけ重ねてグラデーション的にマージします 451 | 終端のoverlap_lengthぶんは次回マージしてから再生するので削除します 452 | 453 | Parameters 454 | ---------- 455 | now_wav: 今回生成した音声wavデータ 456 | prev_wav: 前回生成した音声wavデータ 457 | overlap_length: 重ねる長さ 458 | """ 459 | if overlap_length == 0: 460 | return now_wav 461 | gradation = np.arange(overlap_length) / overlap_length 462 | now = np.frombuffer(now_wav, dtype='int16') 463 | prev = np.frombuffer(prev_wav, dtype='int16') 464 | now_head = now[:overlap_length] 465 | prev_tail = prev[-overlap_length:] 466 | merged = prev_tail * (np.cos(gradation * np.pi * 0.5) ** 2) + now_head * (np.cos((1-gradation) * np.pi * 0.5) ** 2) 467 | #merged = prev_tail * (1 - gradation) + now_head * gradation 468 | overlapped = np.append(merged, now[overlap_length:-overlap_length]) 469 | signal = np.round(overlapped, decimals=0) 470 | signal = signal.astype(np.int16).tobytes() 471 | return signal 472 | 473 | def vc_run(self): 474 | audio = pyaudio.PyAudio() 475 | print("モデルを読み込んでいます。少々お待ちください。") 476 | net_g = self.launch_model() 477 | ort_session = None 478 | if Hyperparameters.USE_ONNX : 479 | # DirectMLで動かすための設定 480 | ort_options = ort.SessionOptions() 481 | ort_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL 482 | ort_options.enable_mem_pattern = False 483 | if Hyperparameters.ORT_ENABLE_BASIC: 484 | ort_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC # https://kazuhito00.hatenablog.com/entry/2022/10/13/133248 485 | #ort_options.enable_profiling = True 486 | ort_session = ort.InferenceSession( 487 | Hyperparameters.MODEL_PATH, 488 | sess_options=ort_options, 489 | providers=Hyperparameters.ONNX_PROVIDERS) 490 | else: 491 | _ = load_checkpoint(Hyperparameters.MODEL_PATH, net_g, None) 492 | 493 | print("モデルの読み込みが完了しました。音声の入出力の準備を行います。少々お待ちください。") 494 | tdbm = Transform_Data_By_Model() 495 | 496 | if Hyperparameters.USE_NR: 497 | noise_data, noise_rate = sf.read(Hyperparameters.NOISE_FILE) 498 | else: 499 | noise_data = 0 500 | 501 | # audio stream voice 502 | #マイク 503 | audio_input_stream = audio.open(format=Hyperparameters.FORMAT, 504 | channels=1, 505 | rate=Hyperparameters.SAMPLE_RATE, 506 | frames_per_buffer=Hyperparameters.DELAY_FLAMES, 507 | input_device_index=Hyperparameters.INPUT_DEVICE_1, 508 | input=True) 509 | 510 | #Realtek Digital Output 511 | audio_output_stream = audio.open(format=Hyperparameters.FORMAT, 512 | channels=1, 513 | rate=Hyperparameters.SAMPLE_RATE, 514 | frames_per_buffer=Hyperparameters.DELAY_FLAMES, 515 | output_device_index = Hyperparameters.OUTPUT_DEVICE_1, 516 | output=True) 517 | 518 | # テストファイル入出力のモックアップ 519 | mock_stream = 
MockStream(Hyperparameters.SAMPLE_RATE) 520 | if Hyperparameters.INPUT_FILENAME != None: 521 | mock_stream.open_inputfile(Hyperparameters.INPUT_FILENAME) 522 | audio_input_stream = mock_stream 523 | if Hyperparameters.OUTPUT_FILENAME != None: 524 | mock_stream.open_outputfile(Hyperparameters.OUTPUT_FILENAME) 525 | audio_output_stream = mock_stream 526 | 527 | #CABLE Output 528 | if Hyperparameters.INPUT_DEVICE_2 != False: 529 | back_audio_input_stream = audio.open(format=Hyperparameters.FORMAT, 530 | channels=1, 531 | rate=Hyperparameters.SAMPLE_RATE, 532 | frames_per_buffer=Hyperparameters.DELAY_FLAMES, 533 | input_device_index=Hyperparameters.INPUT_DEVICE_2, 534 | input=True) 535 | else: 536 | back_audio_input_stream = audio.open(format=Hyperparameters.FORMAT, 537 | channels=1, 538 | rate=Hyperparameters.SAMPLE_RATE, 539 | frames_per_buffer=Hyperparameters.DELAY_FLAMES, 540 | input_device_index=Hyperparameters.INPUT_DEVICE_1, 541 | input=True) 542 | 543 | #Realtek Digital Output 544 | back_audio_output_stream = audio.open(format=Hyperparameters.FORMAT, 545 | channels=1, 546 | rate=Hyperparameters.SAMPLE_RATE, 547 | frames_per_buffer=Hyperparameters.DELAY_FLAMES, 548 | output_device_index = Hyperparameters.OUTPUT_DEVICE_1, 549 | output=True) 550 | 551 | with_bgm = (Hyperparameters.INPUT_DEVICE_2 != False) 552 | with_voice_selector = (Hyperparameters.INPUT_FILENAME == None) # 入力ファイルがない場合は音声選択ウィンドウあり 553 | voice_selector_flag = Hyperparameters.Voice_Selector_Flag # 音声選択ウィンドウの有無 554 | delay_frames = Hyperparameters.DELAY_FLAMES 555 | overlap_length = Hyperparameters.OVERLAP 556 | source_id = Hyperparameters.SOURCE_ID 557 | target_id = Hyperparameters.TARGET_ID 558 | target_f0_scale = 1.0 559 | f0_scale = Hyperparameters.F0_SCALE 560 | wav_bytes = 2 # 1音声データあたりのデータサイズ(2bytes) (math.log2(max_wav_value)+1)/8 で算出してもよいけど 561 | hop_length = Hyperparameters.HOP_LENGTH 562 | dispose_stft_specs = Hyperparameters.DISPOSE_STFT_SPECS 563 | dispose_conv1d_specs = Hyperparameters.DISPOSE_CONV1D_SPECS 564 | dispose_specs = dispose_stft_specs * 2 + dispose_conv1d_specs * 2 565 | dispose_length = dispose_specs * hop_length 566 | assert delay_frames >= dispose_length + overlap_length, "delay_frames have to be larger than dispose_length + overlap_length" 567 | 568 | #第一節を取得する 569 | try: 570 | print("準備が完了しました。VC開始します。") 571 | if with_voice_selector and voice_selector_flag: 572 | voice_selector = VoiceSelector() 573 | voice_selector.open_window() 574 | 575 | # in_wav: delay_frames * wav_bytes = 4096 * 2 = 8192 576 | # prev_wav_tail: (dispose_length + overlap_length) * wav_bytes = (1536 + 128) * 2 = 3328 577 | # prev_trans_wav: (delay_frames + overlap_length) * wav_bytes = (4096 + 128) * 2 = 8448 578 | prev_wav_tail = bytes((dispose_length + overlap_length) * wav_bytes) 579 | prev_trans_wav = bytes((delay_frames + overlap_length) * wav_bytes) 580 | #prev_wav_tail = bytes(0) 581 | #in_wav = prev_wav_tail + audio_input_stream.read(delay_frames, exception_on_overflow=False) 582 | #trans_wav = self.audio_trans(tdbm, in_wav, net_g, noise_data, target_id, 0, 0, ort_session=ort_session) # 遅延減らすため初回だけpadding対策使わない 583 | #overlapped_wav = trans_wav 584 | #prev_trans_wav = trans_wav 585 | #if dispose_length + overlap_length != 0: 586 | # prev_wav_tail = in_wav[-((dispose_length + overlap_length) * wav_bytes):] # 次回の頭のデータとして終端データを保持する 587 | #if with_bgm: 588 | # back_in_raw = back_audio_input_stream.read(delay_frames, exception_on_overflow = False) # 背景BGMを取得 589 | while True: 590 | f0_factor = tdbm.get_f0_scale(source_id, 
target_id) * f0_scale * target_f0_scale 591 | in_wav = prev_wav_tail + audio_input_stream.read(delay_frames, exception_on_overflow=False) 592 | trans_wav = self.audio_trans(tdbm, in_wav, net_g, noise_data, target_id, f0_factor, dispose_stft_specs, dispose_conv1d_specs, ort_session=ort_session) 593 | overlapped_wav = self.overlap_merge(trans_wav, prev_trans_wav, overlap_length) 594 | audio_output_stream.write(overlapped_wav) 595 | prev_trans_wav = trans_wav 596 | if dispose_length + overlap_length != 0: 597 | prev_wav_tail = in_wav[-((dispose_length + overlap_length) * wav_bytes):] # 今回の終端の捨てデータぶんだけ次回の頭のデータとして保持する 598 | if with_bgm: 599 | back_in_raw = back_audio_input_stream.read(delay_frames, exception_on_overflow=False) # 背景BGMを取得 600 | back_audio_output_stream.write(back_in_raw) 601 | 602 | if with_voice_selector and voice_selector_flag: 603 | target_id = voice_selector.voice_select_id 604 | target_f0_scale = voice_selector.voice_select_f0 605 | voice_selector.update_window() 606 | 607 | if Hyperparameters.VC_END_FLAG: #エスケープ 608 | print("vc_finish") 609 | break 610 | 611 | except KeyboardInterrupt: 612 | audio_input_stream.stop_stream() 613 | audio_input_stream.close() 614 | audio_output_stream.stop_stream() 615 | audio_output_stream.close() 616 | back_audio_input_stream.stop_stream() 617 | back_audio_input_stream.close() 618 | back_audio_output_stream.stop_stream() 619 | back_audio_output_stream.close() 620 | audio.terminate() 621 | #prof_file = ort_session.end_profiling() 622 | #print(prof_file) 623 | print("Stop Streaming") 624 | 625 | if with_voice_selector and voice_selector_flag: 626 | voice_selector.close_window() 627 | 628 | class Transform_Data_By_Model(): 629 | hann_window = {} 630 | FILTER_LENGTH = 0 631 | HOP_LENGTH = 0 632 | SAMPLE_RATE = 0 633 | HPS = None 634 | CONFIG = None 635 | correspondence_dict = None 636 | 637 | def __init__(self): 638 | self.G_HP = Hyperparameters() 639 | self.HPS = get_hparams_from_file(self.G_HP.CONFIG_JSON_PATH) 640 | self.correspondence_dict = read_correspondence_file(self.G_HP.CORRESPONDENCE_PATH) 641 | #define samplerate 642 | self.SAMPLE_RATE =self.HPS.data.sampling_rate 643 | #define filter size 644 | self.FILTER_LENGTH = self.HPS.data.filter_length 645 | self.HOP_LENGTH = self.HPS.data.hop_length 646 | 647 | def spectrogram_torch(self, y, n_fft, sampling_rate, hop_size, win_size, center=False): 648 | if torch.min(y) < -1.: 649 | print('min value is ', torch.min(y)) 650 | if torch.max(y) > 1.: 651 | print('max value is ', torch.max(y)) 652 | 653 | dtype_device = str(y.dtype) + '_' + str(y.device) 654 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 655 | if wnsize_dtype_device not in self.hann_window: 656 | self.hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 657 | 658 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 659 | y = y.squeeze(1) 660 | 661 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=self.hann_window[wnsize_dtype_device], 662 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) 663 | spec = torch.view_as_real(spec) 664 | 665 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 666 | return spec 667 | 668 | def get_audio_text_speaker_pair(self, wav, sid): 669 | spec = self.get_spec(wav) 670 | sid = self.get_sid(sid) 671 | return (spec, sid) 672 | 673 | def get_spec(self, audio_norm): 674 | filter_length = self.FILTER_LENGTH 675 | 
676 |         hop_length = self.HOP_LENGTH
677 |         win_length = self.FILTER_LENGTH
678 |         spec = self.spectrogram_torch(audio_norm, filter_length,
679 |                                       sampling_rate, hop_length, win_length,
680 |                                       center=False)
681 |         spec = torch.squeeze(spec, 0)
682 |         return spec
683 |
684 |     def get_text(self, text):
685 |         return text
686 |
687 |     def get_sid(self, sid):
688 |         sid = torch.LongTensor([int(sid)])
689 |         return sid
690 |
691 |     def get_f0_scale(self, sid_src, sid_target):
692 |         src_f0 = self.correspondence_dict[int(sid_src)]
693 |         target_f0 = self.correspondence_dict[int(sid_target)]
694 |         f0_scale = target_f0 / src_f0
695 |         return torch.FloatTensor([f0_scale])
696 |
697 | class MockStream:
698 |     """
699 |     Mock that replaces the streaming audio input/output with plain file input/output.
700 |     """
701 |     def __init__(self, sampling_rate):
702 |         self.sampling_rate = sampling_rate
703 |         self.start_count = 2
704 |         self.end_count = 2
705 |         self.fr = None
706 |         self.fw = None
707 |
708 |     def open_inputfile(self, input_filename):
709 |         self.fr = wave.open(input_filename, 'rb')
710 |
711 |     def open_outputfile(self, output_filename):
712 |         self.fw = wave.open(output_filename, 'wb')
713 |         self.fw.setnchannels(1)
714 |         self.fw.setsampwidth(2)
715 |         self.fw.setframerate(self.sampling_rate)
716 |
717 |     def read(self, length, exception_on_overflow=False):
718 |         if self.start_count > 0:
719 |             wav = bytes(length * 2)
720 |             self.start_count -= 1 # send dummy empty data for the first two reads
721 |         else:
722 |             wav = self.fr.readframes(length)
723 |             if len(wav) <= 0: # once the file is exhausted, send dummy empty data for the last two reads
724 |                 wav = bytes(length * 2)
725 |                 self.end_count -= 1
726 |                 if self.end_count < 0:
727 |                     Hyperparameters.VC_END_FLAG = True
728 |         return wav
729 |
730 |     def write(self, wav):
731 |         self.fw.writeframes(wav)
732 |
733 |     def stop_stream(self):
734 |         pass
735 |
736 |     def close(self):
737 |         if self.fr != None:
738 |             self.fr.close()
739 |             self.fr = None
740 |         if self.fw != None:
741 |             self.fw.close()
742 |             self.fw = None
743 |
744 | class VoiceSelector():
745 |     def get_closure(self, button, id, f0):
746 |
747 |         def on_click(event):
748 |             button.config(fg="red")
749 |             self.selected_button.config(fg="black")
750 |             self.selected_button = button
751 |             self.voice_select_id = id
752 |             self.voice_select_f0 = f0
753 |             #print(f"voice select id: {id}")
754 |
755 |         return on_click
756 |
757 |     def open_window(self):
758 |         self.voice_ids = Hyperparameters.VOICE_LIST
759 |         self.voice_labels = Hyperparameters.VOICE_LABEL
760 |         self.voice_f0s = Hyperparameters.VOICE_F0
761 |
762 |         self.root_win = tk.Tk()
763 |         height = int(len(self.voice_ids) * 30)
764 |         self.root_win.geometry(f"200x{height}")
765 |         self.root_win.title("MMVC Client")
766 |         self.root_win.protocol("WM_DELETE_WINDOW", self.close_window)
767 |
768 |         self.button_list = []
769 |         self.selected_button = None
770 |         self.voice_select_id = self.voice_ids[0]
771 |         self.voice_select_f0 = self.voice_f0s[0]
772 |
773 |         for voice_id, voice_label, voice_f0 in zip(self.voice_ids, self.voice_labels, self.voice_f0s):
774 |             button = tk.Button(self.root_win, text=f"{voice_label}")
775 |             if voice_id == self.voice_select_id:
776 |                 button.config(fg="red")
777 |                 self.selected_button = button
778 |             button_on_click = self.get_closure(button, voice_id, voice_f0)
779 |             button.bind("<Button-1>", button_on_click) # bind left mouse click
780 |             button.pack()
781 |             self.button_list.append(button)
782 |
783 |     def update_window(self):
784 |         self.root_win.update()
785 |
786 |     def close_window(self):
787 |         if self.root_win != None:
788 |             self.root_win.destroy()
789 |             self.root_win = None
790 |         Hyperparameters.VC_END_FLAG = True
791 |
792 | class VCPrifile():
793 |     def __init__(self, **kwargs):
794 |         for k, v in kwargs.items():
795 |             if type(v) == dict:
796 |                 v = VCPrifile(**v)
797 |             self[k] = v
798 |
799 |     def keys(self):
800 |         return self.__dict__.keys()
801 |
802 |     def items(self):
803 |         return self.__dict__.items()
804 |
805 |     def values(self):
806 |         return self.__dict__.values()
807 |
808 |     def __len__(self):
809 |         return len(self.__dict__)
810 |
811 |     def __getitem__(self, key):
812 |         return getattr(self, key)
813 |
814 |     def __setitem__(self, key, value):
815 |         return setattr(self, key, value)
816 |
817 |     def __contains__(self, key):
818 |         return key in self.__dict__
819 |
820 |     def __repr__(self):
821 |         return self.__dict__.__repr__()
822 |
823 | def config_get(conf):
824 |     config_path = conf
825 |     with open(config_path, "r", encoding="utf-8") as f:
826 |         data = f.read()
827 |     config = json.loads(data)
828 |     hparams = VCPrifile(**config)
829 |     return hparams
830 |
831 | if __name__ == '__main__':
832 |     try: #add
833 |         args = sys.argv
834 |         if len(args) < 2:
835 |             end_counter = 0
836 |             while True: # infinite loop
837 |                 tkroot = tk.Tk()
838 |                 tkroot.withdraw()
839 |                 print('Please select myprofile.conf')
840 |                 typ = [('json file', '*.conf')]
841 |                 dir = './'
842 |                 profile_path = filedialog.askopenfilename(filetypes = typ, initialdir = dir)
843 |                 tkroot.destroy()
844 |                 try:
845 |                     if profile_path:
846 |                         break
847 |                     else:
848 |                         print('The file does not exist')
849 |                         end_counter = end_counter + 1
850 |                         print(end_counter)
851 |                         if end_counter > 3:
852 |                             break
853 |                         continue
854 |
855 |                 except ValueError:
856 |                     # handle the ValueError exception
857 |                     print('Please enter a path.')
858 |                     continue
859 |         else:
860 |             profile_path = args[1]
861 |             print("The path to myprofile.conf was specified at startup.")
862 |             print(profile_path)
863 |
864 |         params = config_get(profile_path)
865 |         vc_main = Hyperparameters()
866 |
867 |         print(params.path.json)
868 |         vc_main.set_profile(params)
869 |         vc_main.vc_run()
870 |
871 |     except Exception as e:
872 |         print('An error occurred.')
873 |         print(e)
874 |         os.system('PAUSE')
875 |
--------------------------------------------------------------------------------
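The streaming loop in vc_run above converts audio in fixed chunks of delay_frames samples, discards dispose_length samples of padding artifacts at each chunk edge, and blends overlap_length samples between consecutive converted chunks before writing them out. The repository's own overlap_merge is defined earlier in mmvc_client.py and is not reproduced in this excerpt; the snippet below is only a minimal sketch of that overlap-add step, assuming 16-bit mono PCM and a linear cross-fade (the name overlap_merge_sketch and the fade shape are illustrative assumptions, not the project's code).

import numpy as np

def overlap_merge_sketch(now_wav: bytes, prev_wav: bytes, overlap_length: int) -> bytes:
    # Cross-fade the first overlap_length samples of the current chunk
    # with the last overlap_length samples of the previous chunk.
    if overlap_length == 0:
        return now_wav
    now = np.frombuffer(now_wav, dtype=np.int16).astype(np.float32)
    prev = np.frombuffer(prev_wav, dtype=np.int16).astype(np.float32)
    fade_in = np.linspace(0.0, 1.0, overlap_length, dtype=np.float32)
    head = now[:overlap_length] * fade_in + prev[-overlap_length:] * (1.0 - fade_in)
    merged = np.concatenate([head, now[overlap_length:]])
    return np.clip(merged, -32768, 32767).astype(np.int16).tobytes()

With the sizes noted in the loop's comments (delay_frames = 4096, overlap_length = 128, dispose_length = 1536, 2 bytes per sample), each converted chunk is 4224 samples (8448 bytes), of which the first 128 samples are blended with the tail of the previous chunk before the result is written to the output stream.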