├── version.txt ├── python ├── mmvc.ico ├── requirements.txt ├── makeexe.ps1 ├── install_pipenv.ps1 ├── Pipfile ├── commons.py ├── compile.md ├── symbols.py ├── output_audio_device_list.py ├── snake.py ├── onnx_bench.py ├── index.py ├── setup_check.py ├── rec_environmental_noise.py ├── features.py ├── modules.py ├── residual_block.py ├── generator.py ├── models.py └── mmvc_client.py ├── conf ├── myprofile_CUDA_sample.conf ├── myprofile_ONNX_sample.conf ├── myprofile.conf └── myprofile_ONNX_output_sample.conf ├── .gitignore ├── LICENSE └── README.md /version.txt: -------------------------------------------------------------------------------- 1 | v0.5.0.0 2 | -------------------------------------------------------------------------------- /python/mmvc.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/isletennos/MMVC_Client/HEAD/python/mmvc.ico -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | noisereduce==2.0.0 2 | numpy==1.23.5 3 | protobuf==3.20.3 4 | pyworld==0.3.2 5 | #PyAudio==0.2.11 6 | sounddevice==0.4.4 7 | SoundFile==0.10.3.post1 8 | -f https://download.pytorch.org/whl/torch_stable.html 9 | torch==1.10.1+cu111 10 | onnxruntime-directml==1.13.1 11 | -------------------------------------------------------------------------------- /python/makeexe.ps1: -------------------------------------------------------------------------------- 1 | pipenv run pyinstaller mmvc_client.py --add-binary "./.venv/Lib/site-packages/onnxruntime/capi/onnxruntime_providers_shared.dll;./onnxruntime/capi/" --add-binary "./.venv/Lib/site-packages/onnxruntime/capi/DirectML.dll;./onnxruntime/capi/" --collect-data librosa --onedir --icon=mmvc.ico --clean -y 2 | pipenv run pyinstaller output_audio_device_list.py --onefile 3 | pipenv run pyinstaller rec_environmental_noise.py --onefile 4 | pipenv run pyinstaller setup_check.py --onefile 5 | -------------------------------------------------------------------------------- /python/install_pipenv.ps1: -------------------------------------------------------------------------------- 1 | pip install --upgrade pip 2 | pip install pipenv 3 | $pythonUserPath = python -m site --user-site 4 | $pythonUserPath = $pythonUserPath.Replace('site-packages', 'Scripts') 5 | $ENV:Path += ";" + $pythonUserPath 6 | $userPath = [System.Environment]::GetEnvironmentVariable("Path", "User") 7 | $userPath += ";" + $pythonUserPath 8 | [System.Environment]::SetEnvironmentVariable("Path", $userPath, "User") 9 | $ENV:PIPENV_VENV_IN_PROJECT = '.venv' 10 | [System.Environment]::SetEnvironmentVariable("PIPENV_VENV_IN_PROJECT", ".venv", "User") 11 | -------------------------------------------------------------------------------- /python/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [[source]] 7 | url = "https://download.pytorch.org/whl/cu111/" 8 | verify_ssl = true 9 | name = "downloadpytorch" 10 | 11 | [packages] 12 | torch = {version = "==1.10.1+cu111", index = "downloadpytorch"} 13 | noisereduce = "==2.0.0" 14 | scikit-learn = "==1.0.2" 15 | sounddevice = "==0.4.4" 16 | SoundFile = "==0.10.3.post1" 17 | numpy = "~=1.23" 18 | protobuf = "~=3.20" 19 | pyworld = "==0.3.2" 20 | onnxruntime-directml = "==1.13.1" 21 | pyinstaller = "*" 22 | 23 | 
[dev-packages] 24 | PyAudio = "~=0.2" 25 | py-cpuinfo = "~=9.0" 26 | psutil = "~=5.9" 27 | nvgpu = "~=0.9" 28 | 29 | [requires] 30 | python_version = "3.9" 31 | -------------------------------------------------------------------------------- /python/commons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def init_weights(m, mean=0.0, std=0.01): 5 | classname = m.__class__.__name__ 6 | if classname.find("Conv") != -1: 7 | m.weight.data.normal_(mean, std) 8 | 9 | 10 | def get_padding(kernel_size, dilation=1): 11 | return int((kernel_size*dilation - dilation)/2) 12 | 13 | 14 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 15 | n_channels_int = n_channels[0] 16 | in_act = input_a + input_b 17 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 18 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 19 | acts = t_act * s_act 20 | return acts 21 | 22 | 23 | def sequence_mask(length, max_length=None): 24 | if max_length is None: 25 | max_length = length.max() 26 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 27 | return x.unsqueeze(0) < length.unsqueeze(1) 28 | -------------------------------------------------------------------------------- /python/compile.md: -------------------------------------------------------------------------------- 1 | nuitka --standalone --mingw64 --follow-imports --windows-icon-from-ico=D:\GitRepository\RT-MMVC_Client\use_exe.ico --enable-plugin=torch --enable-plugin=anti-bloat --enable-plugin=numpy --enable-plugin=multiprocessing --enable-plugin=tk-inter --assume-yes-for-downloads --user-plugin=D:\GitRepository\MMVC_Client\brunch\MMVC_Client\python\FixBuildPlugin_pytorch.py --include-plugin-directory=D:\GitRepository\MMVC_Client\brunch\MMVC_Client\python --nofollow-import-to=torchvision --no-prefer-source-code D:\GitRepository\MMVC_Client\brunch\MMVC_Client\python\mmvc_client_GPU.py 2 | 3 | 1)_soundfile_data\... がないといわれるので、pythonの環境から_soundfile_dataディレクトリを直接持ってくる 4 | 2)llvmlite.dll がないといわれるので、pythonの環境からllvmliteディレクトリを直接持ってくる 5 | 3)librosa\... 
がないといわれるので、 6 | 4)cannot load filter definition for kaiser best と言われるので、python環境から、resampyを持ってくる 7 | 5)_sounddevice_dataも持ってくる 8 | 9 | -------------------------------------------------------------------------------- /conf/myprofile_CUDA_sample.conf: -------------------------------------------------------------------------------- 1 | { 2 | "device": { 3 | "input_device1": "マイク (Realtek(R) Audio), MME", 4 | "input_device2": false, 5 | "output_device": "スピーカー (Realtek(R) Audio), MME", 6 | "gpu_id": 0 7 | }, 8 | "vc_conf": { 9 | "frame_length": 8192, 10 | "delay_flames": 4096, 11 | "overlap": 1024, 12 | "dispose_stft_specs": 2, 13 | "dispose_conv1d_specs": 10, 14 | "source_id": 0, 15 | "target_id": 101, 16 | "f0_scale": 2.30, 17 | "onnx": { 18 | "use_onnx": false, 19 | "onnx_providers": ["DmlExecutionProvider", "CPUExecutionProvider"] 20 | } 21 | }, 22 | "path": { 23 | "json": ".\\logs\\20220306_24000\\config.json", 24 | "model": ".\\logs\\20220306_24000\\G_latest_99999999.pth", 25 | "noise": ".\\noise.wav" 26 | }, 27 | "others": { 28 | "use_nr": false, 29 | "voice_selector": false, 30 | "voice_list": [101, 108, 6, 30], 31 | "voice_label": ["ずんだもん", "目標話者", "女性の声", "男性の低い声"], 32 | "voice_f0": [2.30, 2.00, 2.10, 1.20] 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /conf/myprofile_ONNX_sample.conf: -------------------------------------------------------------------------------- 1 | { 2 | "device": { 3 | "input_device1": "マイク (Realtek(R) Audio), MME", 4 | "input_device2": false, 5 | "output_device": "スピーカー (Realtek(R) Audio), MME", 6 | "gpu_id": 0 7 | }, 8 | "vc_conf": { 9 | "frame_length": 8192, 10 | "delay_flames": 4096, 11 | "overlap": 1024, 12 | "dispose_stft_specs": 2, 13 | "dispose_conv1d_specs": 10, 14 | "source_id": 0, 15 | "target_id": 101, 16 | "f0_scale": 2.30, 17 | "onnx": { 18 | "use_onnx": true, 19 | "onnx_providers": ["DmlExecutionProvider", "CPUExecutionProvider"] 20 | } 21 | }, 22 | "path": { 23 | "json": ".\\logs\\20220306_24000\\config.json", 24 | "model": ".\\logs\\20220306_24000\\G_latest_99999999.onnx", 25 | "noise": ".\\noise.wav" 26 | }, 27 | "others": { 28 | "use_nr": false, 29 | "voice_selector": false, 30 | "voice_list": [101, 108, 6, 30], 31 | "voice_label": ["ずんだもん", "目標話者", "女性の声", "男性の低い声"], 32 | "voice_f0": [2.30, 2.00, 2.10, 1.20] 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /conf/myprofile.conf: -------------------------------------------------------------------------------- 1 | { 2 | "device": { 3 | "input_device1": "マイク (Realtek(R) Audio), MME", 4 | "input_device2": false, 5 | "output_device": "スピーカー (Realtek(R) Audio), MME", 6 | "gpu_id": 0 7 | }, 8 | "vc_conf": { 9 | "frame_length": 8192, 10 | "delay_flames": 4096, 11 | "overlap": 1024, 12 | "dispose_stft_specs": 2, 13 | "dispose_conv1d_specs": 10, 14 | "source_id": 0, 15 | "target_id": 101, 16 | "f0_scale": 1.0, 17 | "mic_scale": 1.0, 18 | "onnx": { 19 | "use_onnx": true, 20 | "onnx_providers": ["DmlExecutionProvider", "CPUExecutionProvider"] 21 | } 22 | }, 23 | "path": { 24 | "json": ".\\logs\\20220306_24000\\config.json", 25 | "model": ".\\logs\\20220306_24000\\G_latest_99999999.onnx", 26 | "correspondence":".\\logs\\20220306_24000\\train_config_Correspondence.txt", 27 | "noise": ".\\noise.wav" 28 | }, 29 | "others": { 30 | "use_nr": false, 31 | "voice_selector": false, 32 | "voice_list": [101, 108, 6, 30], 33 | "voice_label": ["ずんだもん", "目標話者", "女性の声", "男性の低い声"], 34 | "voice_f0": [1.0, 1.0, 1.0, 1.0] 
35 | } 36 | } 37 | -------------------------------------------------------------------------------- /conf/myprofile_ONNX_output_sample.conf: -------------------------------------------------------------------------------- 1 | { 2 | "device": { 3 | "input_device1": "マイク (Realtek(R) Audio), MME", 4 | "input_device2": false, 5 | "output_device": "スピーカー (Realtek(R) Audio), MME", 6 | "gpu_id": 0 7 | }, 8 | "vc_conf": { 9 | "frame_length": 8192, 10 | "delay_flames": 4096, 11 | "overlap": 1024, 12 | "dispose_stft_specs": 2, 13 | "dispose_conv1d_specs": 10, 14 | "source_id": 0, 15 | "target_id": 101, 16 | "f0_scale": 2.30, 17 | "onnx": { 18 | "use_onnx": true, 19 | "onnx_providers": ["DmlExecutionProvider", "CPUExecutionProvider"] 20 | } 21 | }, 22 | "path": { 23 | "json": ".\\logs\\20220306_24000\\config.json", 24 | "model": ".\\logs\\20220306_24000\\G_latest_99999999.onnx", 25 | "noise": ".\\noise.wav" 26 | }, 27 | "others": { 28 | "use_nr": false, 29 | "voice_selector": false, 30 | "voice_list": [101, 108, 6, 30], 31 | "voice_label": ["ずんだもん", "目標話者", "女性の声", "男性の低い声"], 32 | "voice_f0": [2.30, 2.00, 2.10, 1.20], 33 | "input_filename": ".\\emotion059.wav", 34 | "output_filename": ".\\trans_emotion059.wav" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /python/symbols.py: -------------------------------------------------------------------------------- 1 | """ The following information was added with reference to https://github.com/jaywalnut310/vits/tree/1eef52ed50743f77fca9ff6773ba673497f6bf9d """ 2 | """ from https://github.com/keithito/tacotron """ 3 | 4 | ''' 5 | Defines the set of symbols used in text input to the model. 6 | ''' 7 | _pad = '_' 8 | _punctuation = ';:,.!?¡¿—…"«»“” ' 9 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' 10 | _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 11 | 12 | 13 | # Export all symbols: 14 | symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) 15 | 16 | # Special symbol ids 17 | SPACE_ID = symbols.index(" ") 18 | 19 | symbols = [ 20 | "A", 21 | "E", 22 | "I", 23 | "N", 24 | "O", 25 | "U", 26 | "a", 27 | "b", 28 | "by", 29 | "ch", 30 | "cl", 31 | "d", 32 | "dy", 33 | "e", 34 | "f", 35 | "g", 36 | "gy", 37 | "h", 38 | "hy", 39 | "i", 40 | "j", 41 | "k", 42 | "ky", 43 | "m", 44 | "my", 45 | "n", 46 | "ny", 47 | "o", 48 | "p", 49 | "py", 50 | "r", 51 | "ry", 52 | "s", 53 | "sh", 54 | "t", 55 | "ts", 56 | "ty", 57 | "u", 58 | "v", 59 | "w", 60 | "y", 61 | "z", 62 | "pau", 63 | "sil", 64 | ] 65 | -------------------------------------------------------------------------------- /python/output_audio_device_list.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | import pyaudio 3 | from os import linesep 4 | 5 | def main(): 6 | audio = pyaudio.PyAudio() 7 | audio_devices = list() 8 | host_apis = list() 9 | 10 | for api_index in range(audio.get_host_api_count()): 11 | host_apis.append(audio.get_host_api_info_by_index(api_index)['name']) 12 | 13 | # 音声デバイス毎のインデックス番号を一覧表示 14 | for x in range(0, audio.get_device_count()): 15 | devices = audio.get_device_info_by_index(x) 16 | try: 17 | device_name = devices['name'].encode('shift-jis').decode('utf-8') 18 | except (UnicodeDecodeError, UnicodeEncodeError): 19 | device_name = devices['name'] 20 | 21 | device_name = device_name.replace(linesep, '') + ", " + host_apis[devices['hostApi']] 22 | 23 | 
isInOut = "" 24 | if devices['maxInputChannels'] > 0: 25 | isInOut += "入" 26 | if devices['maxOutputChannels'] > 0: 27 | isInOut += "出" 28 | 29 | audio_devices.append(f"{isInOut}力: Index:{devices['index']} デバイス名:\"{device_name}\"\n") 30 | 31 | with open('audio_device_list.txt', 'w', encoding='utf-8') as f: 32 | f.writelines(audio_devices) 33 | 34 | print(" 使用可能なデバイス一覧の取得が完了しました。\n audio_device_list.txt を参照してください。\n このウィンドウは閉じて問題ありません。") 35 | 36 | if __name__ == '__main__': 37 | main() 38 | -------------------------------------------------------------------------------- /python/snake.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Snake Activation Function Module. 7 | 8 | References: 9 | - Neural Networks Fail to Learn Periodic Functions and How to Fix It 10 | https://arxiv.org/pdf/2006.08195.pdf 11 | - BigVGAN: A Universal Neural Vocoder with Large-Scale Training 12 | https://arxiv.org/pdf/2206.04658.pdf 13 | 14 | """ 15 | 16 | import torch 17 | import torch.nn as nn 18 | 19 | 20 | class Snake(nn.Module): 21 | """Snake activation function module.""" 22 | 23 | def __init__(self, channels, init=50): 24 | """Initialize Snake module. 25 | 26 | Args: 27 | channels (int): Number of feature channels. 28 | init (float): Initial value of the learnable parameter alpha. 29 | According to the original paper, 5 ~ 50 would be 30 | suitable for periodic data (i.e. voices). 31 | 32 | """ 33 | super(Snake, self).__init__() 34 | alpha = init * torch.ones(1, channels, 1) 35 | self.alpha = nn.Parameter(alpha) 36 | 37 | def forward(self, x): 38 | """Calculate forward propagation. 39 | 40 | Args: 41 | x (Tensor): Input noise signal (B, channels, T). 42 | 43 | Returns: 44 | Tensor: Output tensor (B, channels, T). 
45 | 46 | """ 47 | return x + torch.sin(self.alpha * x) ** 2 / self.alpha 48 | -------------------------------------------------------------------------------- /python/onnx_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | import onnxruntime as ort 4 | import torch 5 | 6 | 7 | def get_args(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--input_onnx", required=True) 10 | return parser.parse_args() 11 | 12 | 13 | def inspect_onnx(session): 14 | print("inputs") 15 | for i in session.get_inputs(): 16 | print("name:{}\tshape:{}\tdtype:{}".format(i.name, i.shape, i.type)) 17 | print("outputs") 18 | for i in session.get_outputs(): 19 | print("name:{}\tshape:{}\tdtype:{}".format(i.name, i.shape, i.type)) 20 | 21 | 22 | def benchmark(session): 23 | dummy_specs = torch.rand(1, 257, 60) 24 | dummy_lengths = torch.LongTensor([60]) 25 | dummy_sid_src = torch.LongTensor([0]) 26 | dummy_sid_tgt = torch.LongTensor([1]) 27 | 28 | use_time_list = [] 29 | for i in range(30): 30 | start = time.time() 31 | output = session.run( 32 | ["audio"], 33 | { 34 | "specs": dummy_specs.numpy(), 35 | "lengths": dummy_lengths.numpy(), 36 | "sid_src": dummy_sid_src.numpy(), 37 | "sid_tgt": dummy_sid_tgt.numpy() 38 | } 39 | ) 40 | use_time = time.time() - start 41 | use_time_list.append(use_time) 42 | #print("use time:{}".format(use_time)) 43 | use_time_list = use_time_list[5:] 44 | mean_use_time = sum(use_time_list) / len(use_time_list) 45 | print(f"mean_use_time:{mean_use_time}") 46 | 47 | 48 | def main(args): 49 | ort_session_cpu = ort.InferenceSession( 50 | args.input_onnx, 51 | providers=["CPUExecutionProvider"]) 52 | 53 | ort_session_cuda = ort.InferenceSession( 54 | args.input_onnx, 55 | providers=["CUDAExecutionProvider"]) 56 | 57 | # DirectMLで動かすための設定 58 | ort_options = ort.SessionOptions() 59 | ort_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL 60 | ort_options.enable_mem_pattern = False 61 | ort_session_dml = ort.InferenceSession( 62 | args.input_onnx, 63 | sess_options=ort_options, 64 | providers=["DmlExecutionProvider"]) 65 | 66 | print("vits onnx benchmark") 67 | inspect_onnx(ort_session_cpu) 68 | print("ONNX CPU") 69 | benchmark(ort_session_cpu) 70 | print("ONNX CUDA") 71 | benchmark(ort_session_cuda) 72 | print("ONNX DirectML") 73 | benchmark(ort_session_dml) 74 | 75 | if __name__ == '__main__': 76 | args = get_args() 77 | print(args) 78 | main(args) 79 | -------------------------------------------------------------------------------- /python/index.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Yi-Chiao Wu (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Indexing-related functions.""" 7 | 8 | import torch 9 | from torch.nn import ConstantPad1d as pad1d 10 | 11 | 12 | def pd_indexing(x, d, dilation, batch_index, ch_index): 13 | """Pitch-dependent indexing of past and future samples. 14 | 15 | Args: 16 | x (Tensor): Input feature map (B, C, T). 17 | d (Tensor): Input pitch-dependent dilated factors (B, 1, T). 18 | dilation (Int): Dilation size. 
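The effective per-sample offset used when gathering past/future samples is d * dilation, rounded to the nearest integer.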
19 | batch_index (Tensor): Batch index 20 | ch_index (Tensor): Channel index 21 | 22 | Returns: 23 | Tensor: Past output tensor (B, out_channels, T) 24 | Tensor: Future output tensor (B, out_channels, T) 25 | 26 | """ 27 | (_, _, batch_length) = d.size() 28 | dilations = d * dilation 29 | 30 | # get past index 31 | idxP = torch.arange(-batch_length, 0).float() 32 | idxP = idxP.to(x.device) 33 | idxP = torch.add(-dilations, idxP) 34 | idxP = idxP.round().long() 35 | maxP = -((torch.min(idxP) + batch_length)) 36 | assert maxP >= 0 37 | idxP = (batch_index, ch_index, idxP) 38 | # padding past tensor 39 | xP = pad1d((maxP, 0), 0)(x) 40 | 41 | # get future index 42 | idxF = torch.arange(0, batch_length).float() 43 | idxF = idxF.to(x.device) 44 | idxF = torch.add(dilations, idxF) 45 | idxF = idxF.round().long() 46 | maxF = torch.max(idxF) - (batch_length - 1) 47 | assert maxF >= 0 48 | idxF = (batch_index, ch_index, idxF) 49 | # padding future tensor 50 | xF = pad1d((0, maxF), 0)(x) 51 | 52 | return xP[idxP], xF[idxF] 53 | 54 | 55 | def index_initial(n_batch, n_ch, tensor=True): 56 | """Tensor batch and channel index initialization. 57 | 58 | Args: 59 | n_batch (Int): Number of batch. 60 | n_ch (Int): Number of channel. 61 | tensor (bool): Return tensor or numpy array 62 | 63 | Returns: 64 | Tensor: Batch index 65 | Tensor: Channel index 66 | 67 | """ 68 | batch_index = [] 69 | for i in range(n_batch): 70 | batch_index.append([[i]] * n_ch) 71 | ch_index = [] 72 | for i in range(n_ch): 73 | ch_index += [[i]] 74 | ch_index = [ch_index] * n_batch 75 | 76 | if tensor: 77 | batch_index = torch.tensor(batch_index) 78 | ch_index = torch.tensor(ch_index) 79 | if torch.cuda.is_available(): 80 | batch_index = batch_index.cuda() 81 | ch_index = ch_index.cuda() 82 | return batch_index, ch_index 83 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | 8 | # Distribution / packaging 9 | .Python 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | pip-wheel-metadata/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 90 | # install all needed dependencies. 91 | #Pipfile.lock 92 | 93 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 94 | __pypackages__/ 95 | 96 | # Celery stuff 97 | celerybeat-schedule 98 | celerybeat.pid 99 | 100 | # SageMath parsed files 101 | *.sage.py 102 | 103 | # Environments 104 | .env 105 | .venv 106 | env/ 107 | venv/ 108 | ENV/ 109 | env.bak/ 110 | venv.bak/ 111 | 112 | # Spyder project settings 113 | .spyderproject 114 | .spyproject 115 | 116 | # Rope project settings 117 | .ropeproject 118 | 119 | # mkdocs documentation 120 | /site 121 | 122 | # mypy 123 | .mypy_cache/ 124 | .dmypy.json 125 | dmypy.json 126 | 127 | # Pyre type checker 128 | .pyre/ 129 | 130 | old/ 131 | .history/ 132 | rt-mmvc_client_CPU/rt-mmvc_client_CPU/* 133 | rt-mmvc_client_GPU/rt-mmvc_client_GPU/* 134 | audio_device_list.txt 135 | device_check.txt 136 | *.rar 137 | *.exe 138 | *.build/ 139 | *.dist/ 140 | temp/ 141 | isle/ 142 | noise.wav 143 | use_exe.ico 144 | myprofile copy.json 145 | python/mmvc_client_GPU_v0.2.0.0.zip 146 | python/mmvc_client_GPU_v0.2.0.0/* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Isle Tennos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | The MIT License (MIT) 24 | Copyright (c) 2019, Tim Sainburg 25 | 26 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 27 | 28 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 29 | 30 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 31 | 32 | MIT License 33 | 34 | Copyright (c) 2021 Jaehyeon Kim 35 | 36 | Permission is hereby granted, free of charge, to any person obtaining a copy 37 | of this software and associated documentation files (the "Software"), to deal 38 | in the Software without restriction, including without limitation the rights 39 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 40 | copies of the Software, and to permit persons to whom the Software is 41 | furnished to do so, subject to the following conditions: 42 | 43 | The above copyright notice and this permission notice shall be included in all 44 | copies or substantial portions of the Software. 45 | 46 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 47 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 48 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 49 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 50 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 51 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 52 | SOFTWARE. 53 | -------------------------------------------------------------------------------- /python/setup_check.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import platform 3 | from multiprocessing import freeze_support 4 | 5 | # 以下必要な外部ライブラリ 6 | # pip install --upgrade py-cpuinfo 7 | # pip install --upgrade psutil 8 | # pip install --upgrade nvgpu 9 | 10 | # Pipfileとpipenvを使用する場合 11 | # cd pyhon 12 | # pipenv install --dev 13 | # pipenv run python setup_check.py 14 | 15 | 16 | # cpuinfo.get_cpu_info()とpyinstallerを組み合わせる場合に必要 17 | freeze_support() 18 | 19 | 20 | 21 | # 定数 22 | MMVC_INFO: str = "MMVC_Client" 23 | OUTPUT_FILE_NAME: str = "device_check.txt" 24 | 25 | LOWER_LIMIT_MEMORY: int = 4 * 1024**3 # 4 GiB 26 | LOWER_LIMIT_GPU_MEMORY: int = 1 * 1024**3 # 1 GiB 27 | 28 | ONNX_TEXT: str = "このPCでは、onnxモデルを出力することで動作する可能性があります" 29 | 30 | 31 | 32 | # ログファイル出力の設定 33 | logging.basicConfig( 34 | filename = OUTPUT_FILE_NAME, 35 | filemode = "w", 36 | encoding = "utf-8", 37 | level = logging.INFO, 38 | format = "%(levelname)s%(message)s") 39 | logging.addLevelName(logging.INFO, "") 40 | logging.addLevelName(logging.WARNING, "\n[警告]\n") 41 | logging.addLevelName(logging.ERROR, "\n[エラー]\n") 42 | 43 | 44 | 45 | # 基本的な情報 46 | logging.info(f"バージョン: {MMVC_INFO}") 47 | logging.info(f"Python: {platform.python_version()}") 48 | logging.info(f"アーキテクチャ: {platform.machine()}") 49 | logging.info(f"OS: {platform.system()}") 50 | 51 | 52 | 53 | # CPU関連 54 | try: 55 | from cpuinfo import get_cpu_info 56 | 57 | cpu_info = get_cpu_info() 58 | logging.info(f"CPU: {cpu_info['brand_raw']}") 59 | 60 | except ModuleNotFoundError: 61 | logging.info(f"CPU: {platform.processor()}") 62 | logging.warning("py-cpuinfoライブラリがインストールされていません\n" + 63 | "以下のコマンドを実行して、py-cpuinfoをインストールするとより詳細な情報を得られます\n" + 64 | "pip install --upgrade py-cpuinfo\n") 65 | 66 | 67 | # メモリ 68 | try: 69 | from psutil import virtual_memory 70 | memory = virtual_memory().total 71 | logging.info(f"メモリ: {round(memory / 1024**3, 0)} GiB") 72 | 73 | if memory < LOWER_LIMIT_MEMORY: 74 | 
logging.error("メモリが不足しています\n" + 75 | "メモリを増設することで動作不良が改善される場合があります\n") 76 | except ModuleNotFoundError: 77 | logging.error("psutilライブラリがインストールされていません\n" + 78 | "以下のコマンドを実行して、psutilをインストールする必要があります\n" + 79 | "pip install --upgrade psutil\n") 80 | 81 | 82 | # GPU 83 | try: 84 | import nvgpu 85 | 86 | gpu_infos = nvgpu.gpu_info() 87 | gpu_memory = 0 88 | 89 | for gpu_info in gpu_infos: 90 | temp_gpt_memory = gpu_info["mem_total"] * 1024**2 91 | logging.info(f"GPU {gpu_info['index']} 名称: {gpu_info['type']}") 92 | logging.info(f"GPU {gpu_info['index']} メモリ: {round(temp_gpt_memory / 1024**3, 1)} GiB") 93 | gpu_memory = max(gpu_info["mem_total"] * 1024**2, gpu_memory) 94 | 95 | if len(gpu_infos) == 0: 96 | logging.warning(f"NvidiaのGPUが存在しません\n{ONNX_TEXT}\n") 97 | 98 | elif gpu_memory < LOWER_LIMIT_GPU_MEMORY: 99 | logging.warning(f"GPUのメモリ量が不足しています\n{ONNX_TEXT}\n") 100 | 101 | except ModuleNotFoundError: 102 | logging.error("nvgpuライブラリがインストールされていません\n" + 103 | "以下のコマンドを実行して、nvgpuをインストールする必要があります\n" + 104 | "pip install --upgrade nvgpu\n") 105 | 106 | except FileNotFoundError: 107 | # nvidia-smiパッケージが見つからない場合 108 | logging.warning(f"NvidiaのGPUもしくはGPUドライバーが存在しません\n{ONNX_TEXT}\n") 109 | 110 | 111 | 112 | logging.info("デバイス情報取得完了") 113 | print(f"デバイス情報の取得が完了しました。\n{OUTPUT_FILE_NAME} を確認してください。\nこのウィンドウは閉じて問題ありません。") -------------------------------------------------------------------------------- /python/rec_environmental_noise.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | import pyaudio 3 | import sounddevice as sd 4 | import wave 5 | import numpy as np 6 | import time 7 | import json 8 | import os 9 | 10 | #ファイルダイアログ関連 11 | import tkinter as tk #add 12 | from tkinter import filedialog #add 13 | 14 | class VCPrifile(): 15 | def __init__(self, **kwargs): 16 | for k, v in kwargs.items(): 17 | if type(v) == dict: 18 | v = VCPrifile(**v) 19 | self[k] = v 20 | 21 | def keys(self): 22 | return self.__dict__.keys() 23 | 24 | def items(self): 25 | return self.__dict__.items() 26 | 27 | def values(self): 28 | return self.__dict__.values() 29 | 30 | def __len__(self): 31 | return len(self.__dict__) 32 | 33 | def __getitem__(self, key): 34 | return getattr(self, key) 35 | 36 | def __setitem__(self, key, value): 37 | return setattr(self, key, value) 38 | 39 | def __contains__(self, key): 40 | return key in self.__dict__ 41 | 42 | def __repr__(self): 43 | return self.__dict__.__repr__() 44 | 45 | def config_get(conf): 46 | config_path = conf 47 | with open(config_path, "r", encoding="utf-8") as f: 48 | data = f.read() 49 | config = json.loads(data) 50 | hparams = VCPrifile(**config) 51 | return hparams 52 | 53 | def MakeWavFile(profile_path): 54 | chunk = 1024 55 | 56 | params = config_get(profile_path) 57 | print(params.device.input_device1) 58 | if type(params.device.input_device1) == str: 59 | device_index = sd.query_devices().index(sd.query_devices(params.device.input_device1, 'input')) 60 | else: 61 | device_index = params.device.input_device1 62 | 63 | p = pyaudio.PyAudio() 64 | stream = p.open(format = pyaudio.paInt16, 65 | channels = 1, 66 | rate = sr, 67 | input = True, 68 | input_device_index = device_index, 69 | frames_per_buffer = chunk) 70 | #レコード開始 71 | print("あなたの環境ノイズを録音します。マイクの電源を入れて、何もせずに待機していてください。") 72 | print("5秒後に録音を開始します。5秒間ノイズを録音します。完了するまで待機していてください。") 73 | Record_Seconds = 5 74 | MAX_Value = 32768.0 75 | all = [] 76 | time.sleep(5) 77 | print("録音を開始しました。") 78 | for i in range(0, int(sr / chunk * Record_Seconds)): 79 | 
data = stream.read(chunk) #音声を読み取って、 80 | data = np.frombuffer(data, dtype='int16') 81 | audio1 = data * MAX_Value 82 | audio1 = audio1.astype(np.int16).tobytes() 83 | all.append(data) #データを追加 84 | #レコード終了 85 | print("録音が完了しました。") 86 | print("ファイルに書き込みを行っています。") 87 | stream.close() 88 | p.terminate() 89 | wavFile = wave.open("noise.wav", 'wb') 90 | wavFile.setnchannels(1) 91 | wavFile.setsampwidth(p.get_sample_size(pyaudio.paInt16)) 92 | wavFile.setframerate(sr) 93 | #wavFile.writeframes(b''.join(all)) #Python2 用 94 | wavFile.writeframes(b"".join(all)) #Python3用 95 | wavFile.close() 96 | print("ファイルの書き込み完了しました。") 97 | print("このウィンドウは閉じて問題ありません。") 98 | input() 99 | 100 | if __name__ == '__main__': 101 | try: #add 102 | #サンプリングレートの指定 103 | while True: # 無限ループ 104 | print('学習済みモデルのサンプリングレートを指定してください。') 105 | try: 106 | sr = int(input('>> ')) 107 | except ValueError: 108 | # ValueError例外を処理するコード 109 | print('数字以外が入力されました。数字のみを入力してください') 110 | continue 111 | break 112 | 113 | end_counter = 0 114 | while True: # 無限ループ 115 | tkroot = tk.Tk() 116 | tkroot.withdraw() 117 | print('myprofile.conf を選択して下さい') 118 | typ = [('confファイル','*.conf')] 119 | dir = './' 120 | profile_path = filedialog.askopenfilename(filetypes = typ, initialdir = dir) 121 | tkroot.destroy() 122 | try: 123 | if profile_path: 124 | MakeWavFile(profile_path) 125 | break 126 | else: 127 | print('ファイルが存在しません') 128 | end_counter = end_counter + 1 129 | print(end_counter) 130 | if end_counter > 3: 131 | break 132 | continue 133 | 134 | except Exception as e: 135 | # ValueError例外を処理するコード 136 | print(profile_path) 137 | print(e) 138 | print('パスを入力してください・') 139 | continue 140 | 141 | except Exception as e: 142 | print('エラーが発生しました。') 143 | print(e) 144 | os.system('PAUSE') -------------------------------------------------------------------------------- /python/features.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Feature-related functions. 
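Provides length validation, pitch-dependent dilation factors, and a
SignalGenerator that builds sine / noise / V-UV input signals from F0.
A minimal illustrative call (default signal types "sine" + "noise", shapes only):

    >>> gen = SignalGenerator(sample_rate=24000, hop_size=120)
    >>> f0 = torch.zeros(1, 1, 50)   # 50 unvoiced frames
    >>> gen(f0).shape
    torch.Size([1, 2, 6000])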
7 | 8 | References: 9 | - https://github.com/bigpon/QPPWG 10 | - https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts 11 | 12 | """ 13 | 14 | import sys 15 | from logging import getLogger 16 | 17 | import numpy as np 18 | import torch 19 | from torch.nn.functional import interpolate 20 | 21 | # A logger for this file 22 | logger = getLogger(__name__) 23 | 24 | 25 | def validate_length(xs, ys=None, hop_size=None): 26 | """Validate length 27 | 28 | Args: 29 | xs (ndarray): numpy array of features 30 | ys (ndarray): numpy array of audios 31 | hop_size (int): upsampling factor 32 | 33 | Returns: 34 | (ndarray): length adjusted features 35 | 36 | """ 37 | min_len_x = min([x.shape[0] for x in xs]) 38 | if ys is not None: 39 | min_len_y = min([y.shape[0] for y in ys]) 40 | if min_len_y < min_len_x * hop_size: 41 | min_len_x = min_len_y // hop_size 42 | if min_len_y > min_len_x * hop_size: 43 | min_len_y = min_len_x * hop_size 44 | ys = [y[:min_len_y] for y in ys] 45 | xs = [x[:min_len_x] for x in xs] 46 | 47 | return xs + ys if ys is not None else xs 48 | 49 | 50 | def dilated_factor(batch_f0, fs, dense_factor): 51 | """Pitch-dependent dilated factor 52 | 53 | Args: 54 | batch_f0 (ndarray): the f0 sequence (T) 55 | fs (int): sampling rate 56 | dense_factor (int): the number of taps in one cycle 57 | 58 | Return: 59 | dilated_factors(np array): 60 | float array of the pitch-dependent dilated factors (T) 61 | 62 | """ 63 | batch_f0[batch_f0 == 0] = fs / dense_factor 64 | dilated_factors = torch.ones_like(batch_f0) * fs / dense_factor / batch_f0 65 | #assert np.all(dilated_factors > 0) 66 | return dilated_factors 67 | 68 | 69 | class SignalGenerator: 70 | """Input signal generator module.""" 71 | 72 | def __init__( 73 | self, 74 | sample_rate=24000, 75 | hop_size=120, 76 | sine_amp=0.1, 77 | noise_amp=0.003, 78 | signal_types=["sine", "noise"], 79 | ): 80 | """Initialize WaveNetResidualBlock module. 81 | 82 | Args: 83 | sample_rate (int): Sampling rate. 84 | hop_size (int): Hop size of input F0. 85 | sine_amp (float): Sine amplitude for NSF-based sine generation. 86 | noise_amp (float): Noise amplitude for NSF-based sine generation. 87 | signal_types (list): List of input signal types for generator. 88 | 89 | """ 90 | self.sample_rate = sample_rate 91 | self.hop_size = hop_size 92 | self.signal_types = signal_types 93 | self.sine_amp = sine_amp 94 | self.noise_amp = noise_amp 95 | 96 | for signal_type in signal_types: 97 | if not signal_type in ["noise", "sine", "sines", "uv"]: 98 | logger.info(f"{signal_type} is not supported type for generator input.") 99 | sys.exit(0) 100 | #logger.info(f"Use {signal_types} for generator input signals.") 101 | 102 | @torch.no_grad() 103 | def __call__(self, f0, f0_scale = 1.0): 104 | signals = [] 105 | for typ in self.signal_types: 106 | if "noise" == typ: 107 | signals.append(self.random_noise(f0)) 108 | if "sine" == typ: 109 | signals.append(self.sinusoid(f0)) 110 | if "sines" == typ: 111 | signals.append(self.sinusoids(f0)) 112 | if "uv" == typ: 113 | signals.append(self.vuv_binary(f0)) 114 | 115 | input_batch = signals[0] 116 | for signal in signals[1:]: 117 | input_batch = torch.cat([input_batch, signal], axis=1) 118 | 119 | return input_batch * f0_scale 120 | 121 | @torch.no_grad() 122 | def random_noise(self, f0): 123 | """Calculate noise signals. 124 | 125 | Args: 126 | f0 (Tensor): F0 tensor (B, 1, T // hop_size). 127 | 128 | Returns: 129 | Tensor: Gaussian noise signals (B, 1, T). 
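Note: the noise is drawn from a standard normal directly at waveform resolution (T * hop_size samples).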
130 | 131 | """ 132 | B, _, T = f0.size() 133 | noise = torch.randn((B, 1, T * self.hop_size), device=f0.device) 134 | 135 | return noise 136 | 137 | @torch.no_grad() 138 | def sinusoid(self, f0): 139 | """Calculate sine signals. 140 | 141 | Args: 142 | f0 (Tensor): F0 tensor (B, 1, T // hop_size). 143 | 144 | Returns: 145 | Tensor: Sines generated following NSF (B, 1, T). 146 | 147 | """ 148 | B, _, T = f0.size() 149 | vuv = interpolate((f0 > 0) * torch.ones_like(f0), T * self.hop_size) 150 | radious = (interpolate(f0, T * self.hop_size) / self.sample_rate) % 1 151 | sine = vuv * torch.sin(torch.cumsum(radious, dim=2) * 2 * np.pi) * self.sine_amp 152 | if self.noise_amp > 0: 153 | noise_amp = vuv * self.noise_amp + (1.0 - vuv) * self.noise_amp / 3.0 154 | noise = torch.randn((B, 1, T * self.hop_size), device=f0.device) * noise_amp 155 | sine = sine + noise 156 | 157 | return sine 158 | 159 | @torch.no_grad() 160 | def sinusoids(self, f0): 161 | """Calculate sines. 162 | 163 | Args: 164 | f0 (Tensor): F0 tensor (B, 1, T // hop_size). 165 | 166 | Returns: 167 | Tensor: Sines generated following NSF (B, 1, T). 168 | 169 | """ 170 | B, _, T = f0.size() 171 | vuv = interpolate((f0 > 0) * torch.ones_like(f0), T * self.hop_size) 172 | f0 = interpolate(f0, T * self.hop_size) 173 | sines = torch.zeros_like(f0, device=f0.device) 174 | harmonics = 5 # currently only fixed number of harmonics is supported 175 | for i in range(harmonics): 176 | radious = (f0 * (i + 1) / self.sample_rate) % 1 177 | sines += torch.sin(torch.cumsum(radious, dim=2) * 2 * np.pi) 178 | sines = self.sine_amp * sines * vuv / harmonics 179 | if self.noise_amp > 0: 180 | noise_amp = vuv * self.noise_amp + (1.0 - vuv) * self.noise_amp / 3.0 181 | noise = torch.randn((B, 1, T * self.hop_size), device=f0.device) * noise_amp 182 | sines = sines + noise 183 | 184 | return sines 185 | 186 | @torch.no_grad() 187 | def vuv_binary(self, f0): 188 | """Calculate V/UV binary sequences. 189 | 190 | Args: 191 | f0 (Tensor): F0 tensor (B, 1, T // hop_size). 192 | 193 | Returns: 194 | Tensor: V/UV binary sequences (B, 1, T). 
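Note: obtained by interpolating the frame-level voicing decision (f0 > 0) up to T * hop_size samples.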
195 | 196 | """ 197 | _, _, T = f0.size() 198 | uv = interpolate((f0 > 0) * torch.ones_like(f0), T * self.hop_size) 199 | 200 | return uv 201 | -------------------------------------------------------------------------------- /python/modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | from torch.nn import Conv1d 6 | from torch.nn.utils import weight_norm, remove_weight_norm 7 | 8 | from commons import init_weights, get_padding, fused_add_tanh_sigmoid_multiply 9 | 10 | 11 | LRELU_SLOPE = 0.1 12 | 13 | 14 | class WN(torch.nn.Module): 15 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 16 | super(WN, self).__init__() 17 | assert(kernel_size % 2 == 1) 18 | self.hidden_channels =hidden_channels 19 | self.kernel_size = kernel_size, 20 | self.dilation_rate = dilation_rate 21 | self.n_layers = n_layers 22 | self.gin_channels = gin_channels 23 | self.p_dropout = p_dropout 24 | 25 | self.in_layers = torch.nn.ModuleList() 26 | self.res_skip_layers = torch.nn.ModuleList() 27 | self.drop = nn.Dropout(p_dropout) 28 | 29 | if gin_channels != 0: 30 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) 31 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 32 | 33 | for i in range(n_layers): 34 | dilation = dilation_rate ** i 35 | padding = int((kernel_size * dilation - dilation) / 2) 36 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 37 | dilation=dilation, padding=padding) 38 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 39 | self.in_layers.append(in_layer) 40 | 41 | # last one is not necessary 42 | if i < n_layers - 1: 43 | res_skip_channels = 2 * hidden_channels 44 | else: 45 | res_skip_channels = hidden_channels 46 | 47 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 48 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 49 | self.res_skip_layers.append(res_skip_layer) 50 | 51 | def forward(self, x, x_mask, g=None, **kwargs): 52 | output = torch.zeros_like(x) 53 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 54 | 55 | if g is not None: 56 | g = self.cond_layer(g) 57 | 58 | for i in range(self.n_layers): 59 | x_in = self.in_layers[i](x) 60 | if g is not None: 61 | cond_offset = i * 2 * self.hidden_channels 62 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 63 | else: 64 | g_l = torch.zeros_like(x_in) 65 | 66 | acts = fused_add_tanh_sigmoid_multiply( 67 | x_in, 68 | g_l, 69 | n_channels_tensor) 70 | acts = self.drop(acts) 71 | 72 | res_skip_acts = self.res_skip_layers[i](acts) 73 | if i < self.n_layers - 1: 74 | res_acts = res_skip_acts[:,:self.hidden_channels,:] 75 | x = (x + res_acts) * x_mask 76 | output = output + res_skip_acts[:,self.hidden_channels:,:] 77 | else: 78 | output = output + res_skip_acts 79 | return output * x_mask 80 | 81 | def remove_weight_norm(self): 82 | if self.gin_channels != 0: 83 | torch.nn.utils.remove_weight_norm(self.cond_layer) 84 | for l in self.in_layers: 85 | torch.nn.utils.remove_weight_norm(l) 86 | for l in self.res_skip_layers: 87 | torch.nn.utils.remove_weight_norm(l) 88 | 89 | 90 | class ResBlock1(torch.nn.Module): 91 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 92 | super(ResBlock1, self).__init__() 93 | self.convs1 = nn.ModuleList([ 94 | weight_norm(Conv1d(channels, channels, 
kernel_size, 1, dilation=dilation[0], 95 | padding=get_padding(kernel_size, dilation[0]))), 96 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 97 | padding=get_padding(kernel_size, dilation[1]))), 98 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 99 | padding=get_padding(kernel_size, dilation[2]))) 100 | ]) 101 | self.convs1.apply(init_weights) 102 | 103 | self.convs2 = nn.ModuleList([ 104 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 105 | padding=get_padding(kernel_size, 1))), 106 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 107 | padding=get_padding(kernel_size, 1))), 108 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 109 | padding=get_padding(kernel_size, 1))) 110 | ]) 111 | self.convs2.apply(init_weights) 112 | 113 | def forward(self, x, x_mask=None): 114 | for c1, c2 in zip(self.convs1, self.convs2): 115 | xt = F.leaky_relu(x, LRELU_SLOPE) 116 | if x_mask is not None: 117 | xt = xt * x_mask 118 | xt = c1(xt) 119 | xt = F.leaky_relu(xt, LRELU_SLOPE) 120 | if x_mask is not None: 121 | xt = xt * x_mask 122 | xt = c2(xt) 123 | x = xt + x 124 | if x_mask is not None: 125 | x = x * x_mask 126 | return x 127 | 128 | def remove_weight_norm(self): 129 | for l in self.convs1: 130 | remove_weight_norm(l) 131 | for l in self.convs2: 132 | remove_weight_norm(l) 133 | 134 | 135 | class ResBlock2(torch.nn.Module): 136 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 137 | super(ResBlock2, self).__init__() 138 | self.convs = nn.ModuleList([ 139 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 140 | padding=get_padding(kernel_size, dilation[0]))), 141 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 142 | padding=get_padding(kernel_size, dilation[1]))) 143 | ]) 144 | self.convs.apply(init_weights) 145 | 146 | def forward(self, x, x_mask=None): 147 | for c in self.convs: 148 | xt = F.leaky_relu(x, LRELU_SLOPE) 149 | if x_mask is not None: 150 | xt = xt * x_mask 151 | xt = c(xt) 152 | x = xt + x 153 | if x_mask is not None: 154 | x = x * x_mask 155 | return x 156 | 157 | def remove_weight_norm(self): 158 | for l in self.convs: 159 | remove_weight_norm(l) 160 | 161 | 162 | class Flip(nn.Module): 163 | def forward(self, x, *args, reverse=False, **kwargs): 164 | x = torch.flip(x, [1]) 165 | if not reverse: 166 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 167 | return x, logdet 168 | else: 169 | return x 170 | 171 | 172 | class ResidualCouplingLayer(nn.Module): 173 | def __init__(self, 174 | channels, 175 | hidden_channels, 176 | kernel_size, 177 | dilation_rate, 178 | n_layers, 179 | p_dropout=0, 180 | gin_channels=0, 181 | mean_only=False): 182 | assert channels % 2 == 0, "channels should be divisible by 2" 183 | super().__init__() 184 | self.channels = channels 185 | self.hidden_channels = hidden_channels 186 | self.kernel_size = kernel_size 187 | self.dilation_rate = dilation_rate 188 | self.n_layers = n_layers 189 | self.half_channels = channels // 2 190 | self.mean_only = mean_only 191 | 192 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 193 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) 194 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 195 | self.post.weight.data.zero_() 196 | self.post.bias.data.zero_() 197 | 198 | def forward(self, x, x_mask, 
g=None, reverse=False): 199 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 200 | h = self.pre(x0) * x_mask 201 | h = self.enc(h, x_mask, g=g) 202 | stats = self.post(h) * x_mask 203 | if not self.mean_only: 204 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 205 | else: 206 | m = stats 207 | logs = torch.zeros_like(m) 208 | 209 | if not reverse: 210 | x1 = m + x1 * torch.exp(logs) * x_mask 211 | x = torch.cat([x0, x1], 1) 212 | logdet = torch.sum(logs, [1,2]) 213 | return x, logdet 214 | else: 215 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 216 | x = torch.cat([x0, x1], 1) 217 | return x 218 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | MMVC_Client 2 | ==== 3 | 4 | AIを使ったリアルタイムボイスチェンジャー 5 | 6 | ## Description 7 | AIを使ったリアルタイムボイスチェンジャー「MMVC(RealTime-Many to Many Voice Conversion)」 8 | の本体です。 9 | MMVC_Trainerで学習したモデルを使ってリアルタイムでVCを行います。 10 | ## MMVC_Trainer 11 | https://github.com/isletennos/MMVC_Trainer 12 | ## concept 13 | 「簡単」「だれでも」「好きな声に」「リアルタイムで」 14 | ## Requirement 15 | ・MMVC_Trainerで学習したモデルとそのコンフィグ 16 | ## Install 17 | ### windows かつ 実行ファイルを利用する方 18 | 下記URLからダウンロードして、自己解凍形式ファイルを実行して展開してください。(ファイルサイズが非常に大きいので注意) 19 | [MMVC_client v0.3.1.0](https://github.com/isletennos/MMVC_Client/releases/tag/v0.3.1.0) 20 | 21 | ### 旧ver 22 | [MMVC_client v0.3.0.0(GPU ver)](https://drive.google.com/file/d/1QXJQAnTOr8vE5nwxInUROtj-fiHeJsXH/view?usp=sharing) 23 | ファイルサイズが大きすぎてDLできない人向けの分割版 24 | [MMVC_client v0.3.0.0(GPU ver)](https://drive.google.com/drive/folders/1eoDBw37WT7wJsAXh-RIXvXLvbSwnDtt9?usp=sharing) 25 | [MMVC_client v0.2.0.1(GPU ver)](https://drive.google.com/file/d/1JEvYw4vjiBwhsZq79Pb0Doh7Fy16dK76/view?usp=sharing) 26 | [MMVC_client 無印(CPU_ver) (現在非推奨)](https://drive.google.com/file/d/1KLqo_q-qbahPRzNo2kUhCqHqnb8lTjMJ/view?usp=sharing) 27 | [MMVC_client 無印(GPU ver)](https://drive.google.com/file/d/1XNdfT3BFGKlxDm43hEbYvnoJSecjLedt/view?usp=sharing) 28 | 29 | #### TrainerとClientの対応表 30 | | MMVC Trainer ver | v1.2.x.x | v1.3.0.x | 1.3.2.x | 1.3.2.x(ONNX) | 31 | | ------------------------- | -------- | -------- | ------- | ------------- | 32 | | MMVC Client 無印(CPU/GPU) | 〇 | × | × | × | 33 | | MMVC Client v0.2.0.x(GPU) | 〇 | × | × | × | 34 | | MMVC Client v0.3.0.x(GPU) | × | 〇 | 〇 | × | 35 | | MMVC Client v0.3.1.x | × | 〇 | 〇 | 〇 | 36 | 37 | ## Install(python) 38 | このリポジトリをダウンロードして、展開してください。 39 | また、下記.exeの実行を.pyの実行に置き換えて実行してください。 40 | 41 | ## Usage 42 | ### 1. 使用可能なオーディオデバイス一覧の取得 43 | 「output_audio_device_list.exe」を実行します。 44 | 「audio_device_list.txt」が実行ファイルと同じディレクトリに出力されます。 45 | こちらに入出力のオーディオデバイス名およびIDが出力されており、下記セクション以降で利用します。 46 | ### 2. 
myprofile.confの書き換え 47 | myprofile.confの下記項目を環境に合わせて変更します。 48 | ``` 49 | "device": { 50 | "input_device1": "マイク (Realtek(R) Audio), MME", 51 | "input_device2": false, 52 | "output_device": "スピーカー (Realtek(R) Audio), MME", 53 | "gpu_id":0 54 | }, 55 | ``` 56 | 57 | ``` 58 | "vc_conf": { 59 | "frame_length": 8192, 60 | "delay_flames": 4096, 61 | "overlap": 1024, 62 | "dispose_stft_specs": 2, 63 | "dispose_conv1d_specs": 10, 64 | "source_id": 0, 65 | "target_id": 101, 66 | "onnx": { 67 | "use_onnx": true, 68 | "onnx_providers": ["DmlExecutionProvider", "CPUExecutionProvider"] 69 | } 70 | }, 71 | ``` 72 | 73 | ``` 74 | "path": { 75 | "json": ".\\logs\\20220306_24000\\config.json", 76 | "model": ".\\logs\\20220306_24000\\G_latest_99999999.onnx", 77 | "noise": ".\\noise.wav" 78 | }, 79 | ``` 80 | 81 | ``` 82 | "others": { 83 | "use_nr":false, 84 | "voice_selector":false, 85 | "voice_list": [101, 108, 6, 30], 86 | "voice_label": ["ずんだもん", "目標話者", "女性の声", "男性の低い声"] 87 | } 88 | ``` 89 | ### 2.1 myprofile.confの書き換え(device) 90 | このセクションでは、下記項目の変更方法について記載します。 91 | ``` 92 | "device": { 93 | "input_device1": "マイク (Realtek(R) Audio), MME", 94 | "input_device2": false, 95 | "output_device": "スピーカー (Realtek(R) Audio), MME", 96 | "gpu_id":0 97 | }, 98 | ``` 99 | 各要素はそれぞれ 100 | **input_device1 : マイク入力のデバイスID or デバイス名を指定します。** 101 | 102 | 103 | **input_device2 : 背景音声の入力のデバイスID or デバイス名を指定します。** 104 | 主にカラオケ等背景のBGMと自分の変換後の音声のラグを0にしたいときに使います。 105 | 106 | 107 | **output_device : 変換した音声の出力先のデバイスID or デバイス名を指定します。** 108 | 109 | 110 | **gpu_id : 複数GPUをPCに搭載している場合、数字で指定できます。** 111 | 使い分けが不要な場合は0のまま変更は不要です。 112 | 113 | ### 2.2 myprofile.confの書き換え(vc_conf) 114 | このセクションでは、下記項目の変更方法について記載します。 115 | ``` 116 | "vc_conf": { 117 | "frame_length": 8192, 118 | "delay_flames": 4096, 119 | "overlap": 1024, 120 | "dispose_stft_specs": 2, 121 | "dispose_conv1d_specs": 10, 122 | "source_id": 0, 123 | "target_id": 101, 124 | "onnx": { 125 | "use_onnx": true, 126 | "onnx_providers": ["DmlExecutionProvider", "CPUExecutionProvider"] 127 | } 128 | }, 129 | ``` 130 | この項目では、下記4項目のみ変更します。それ以外の項目については割愛します。 131 | **source_id : 変換元の音声の話者IDになります。** 132 | Trainerで特に弄っていなければ、107のままで問題ありません。 133 | 134 | **target_id : 変換先の音声の話者IDになります。** 135 | 学習時に生成した「./filelists/train_config_Correspondence.txt」を参考に話者IDを指定してください。 136 | チュートリアルもんであれば101のままで問題ありません。 137 | 138 | **onnx.use_onnx : 変換にONNXを使うか指定します。** 139 | ONNXを使って変換する場合trueにします。 140 | 従来のtorchを使って変換する場合はfalseにします。この場合、onnx_providersの設定は無視されます。 141 | ONNXを利用する場合、学習したモデルは「~.onnx」形式のファイルを指定します。 142 | 従来のtorchを利用する場合、「~.pth」形式のファイルを指定します。 143 | 144 | **onnx.onnx_providers : ONNXが使う実行エンジンと優先順位を指定します。** 145 | 記述されている順番に優先して利用されます。 146 | - DmlExecutionProvider 147 | DirectMLを利用します。GPUを利用したい場合はこの項目を先に入れてください。 148 | - CPUExecutionProvider 149 | CPUを利用します。ONNXでCPUでの変換をしたい場合は、この項目だけを入れます。 150 | 151 | ### 2.3 myprofile.confの書き換え(path) 152 | このセクションでは、下記項目の変更方法について記載します。 153 | ``` 154 | "path": { 155 | "json": ".\\logs\\20220306_24000\\config.json", 156 | "model": ".\\logs\\20220306_24000\\G_latest_99999999.onnx", 157 | "noise": ".\\noise.wav" 158 | }, 159 | ``` 160 | **※ここで指定するパスは必ず「\」ではなく「\\\\」で区切ってください。** 161 | 162 | 学習済みフォルダ内に config.json, G_latest_99999999.pth, G_latest_99999999.onnx 等のファイルがあります。 163 | これらのファイルをコピーして、この例の場合、logsフォルダ内に「20220306_24000」フォルダを作って、その中にファイルを置きます。 164 | 165 | **json : 学習時に生成したconfigファイルのパスを指定します。** 166 | 学習時の設定ファイル ./logs/xxxx/config.json を指定します。 167 | 168 | **model : 学習したモデルのパスを指定します。** 169 | 学習済みモデルファイル ./logs/xxxx/G_xxxx.pth といった感じのファイルを指定します。 170 | ONNXを使って変換する場合は 
./logs/xxxx/G_xxxx.onnx といったONNX形式ファイルを指定します。 171 | 172 | **noise : 現在非推奨で使わないのでそのままでいいです。** 173 | 使いたい方は下記おまけセクションを参考ください。 174 | 175 | ### 2.4 myprofile.confの書き換え(others) 176 | このセクションでは、下記項目の変更方法について記載します。 177 | ``` 178 | "others": { 179 | "use_nr":false, 180 | "voice_selector":false, 181 | "voice_list": [101, 108, 6, 30], 182 | "voice_label": ["ずんだもん", "目標話者", "女性の声", "男性の低い声"] 183 | } 184 | ``` 185 | 各要素はそれぞれ 186 | **use_nr : ノイズリダクションを有効化するかしないか指定します。** 187 | 現状は品質が下がるため、自前で用意することを推奨します。 188 | この機能を使う場合、 true に書き換えてください。 189 | 190 | 191 | **voice_selector : MMVC起動中にターゲット話者をリアルタイムで変更する機能を有効化するかしないか指定します。** 192 | この機能を十全に使うには、複数話者の同時学習を行う必要があります。 193 | 複数話者の同時学習を行っていない場合はfalseのままにしておいてください。 194 | 195 | 196 | **voice_list : voice_selectorを有効化したときに利用する項目です。学習した話者IDを記載します。** 197 | 198 | 199 | **voice_label : voice_selectorを有効化したときに利用する項目です。話者IDのラベルになります。** 200 | 201 | 202 | **input_filename : .wavファイルに対して音声変換したいときに利用する項目です。** 203 | デフォルトでは.confファイルに記入されていません。 204 | "input_filename": ".\\emotion059.wav", 205 | のように入力する.wavファイルのパスを指定します。 206 | **output_filename : .wavファイルに対して音声変換したいときに利用する項目です。** 207 | デフォルトでは.confファイルに記入されていません。 208 | "output_filename": ".\\trans_emotion059.wav" 209 | のように、変換結果の保存先とファイル名となる.wavファイルのパスを指定します。 210 | 211 | 212 | ### 3. ソフトウェアの起動 213 | パターン1 214 | 「mmvc_client_GPU.bat」を実行 215 | 正しく「myprofile.conf」が設定されていればそのまま起動します。 216 | 217 | パターン2 218 | 「mmvc_client_GPU.exe」を実行してください。 219 | 起動に少しだけ時間がかかります。 220 | 起動すると「myprofile.conf」のパスを聞かれるので、パスを指定して下さい。 221 | 222 | ### おまけ:ノイズリダクションの有効化 223 | #### 1. ノイズ音取得の実行 224 | 「rec_environmental_noise.exe」を実行します。 225 | 実行したら、モデルを学習したときに設定したサンプリングレートを入力してください。 226 | (MMVC_Trainerの設定を変えていなければ24000です) 227 | 次にmyprofile.confのパスを聞かれるため、編集したmyprofile.confのパスを入力してください。 228 | 以下の入力パスの例のように、.confファイルまで含めて入力して下さい。 229 | ``` 230 | D:\mmvc_client_GPU\myprofile.conf 231 | ``` 232 | ※注意として、入力パスの両端に”(ダブルクォーテーション)は付けないでください。 233 | パスの入力とmyprofile.confに問題が無ければ、ノイズの録音が開始されます。 234 | ノイズの録音が完了するまで、マイクに話しかけたり等しないで、待ちます。 235 | 「noise.wav」が実行ファイルと同じディレクトリに出力されます。 236 | 237 | #### 2. myprofile.confの書き換え 238 | ``` 239 | "path": { 240 | "json": ".\\logs\\20220306_24000\\config.json", 241 | "model": ".\\logs\\20220306_24000\\G_latest_99999999.onnx", 242 | "noise": ".\\noise.wav" 243 | } 244 | ``` 245 | 上記項目の"noise"に 1. ノイズ音取得の実行 で作成した「noise.wav」のパスを入力します。 246 | ``` 247 | "others": { 248 | "use_nr":false, 249 | "voice_selector":false, 250 | "voice_list": [101, 108, 6, 30], 251 | "voice_label": ["ずんだもん", "目標話者", "女性の声", "男性の低い声"] 252 | } 253 | ``` 254 | 上記項目の"use_nr"をtrueに変えます。 255 | 256 | ## Reference 257 | https://arxiv.org/abs/2106.06103 258 | https://github.com/jaywalnut310/vits 259 | https://github.com/timsainb/noisereduce 260 | ## Author 261 | Isle Tennos 262 | Twitter : https://twitter.com/IsleTennos 263 | 264 | -------------------------------------------------------------------------------- /python/residual_block.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Residual block modules. 
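Defines the HiFiGAN-style ResidualBlock and the pitch-adaptive AdaptiveResidualBlock consumed by the SiFiGAN generator in generator.py.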
7 | 8 | References: 9 | - https://github.com/kan-bayashi/ParallelWaveGAN 10 | - https://github.com/bigpon/QPPWG 11 | - https://github.com/r9y9/wavenet_vocoder 12 | 13 | """ 14 | 15 | from logging import getLogger 16 | 17 | import torch 18 | import torch.nn as nn 19 | from snake import Snake 20 | from index import index_initial, pd_indexing 21 | 22 | # A logger for this file 23 | logger = getLogger(__name__) 24 | 25 | 26 | class Conv1d(nn.Conv1d): 27 | """Conv1d module with customized initialization.""" 28 | 29 | def __init__(self, *args, **kwargs): 30 | """Initialize Conv1d module.""" 31 | super(Conv1d, self).__init__(*args, **kwargs) 32 | 33 | def reset_parameters(self): 34 | """Reset parameters.""" 35 | nn.init.kaiming_normal_(self.weight, nonlinearity="relu") 36 | if self.bias is not None: 37 | nn.init.constant_(self.bias, 0.0) 38 | 39 | 40 | class Conv1d1x1(Conv1d): 41 | """1x1 Conv1d with customized initialization.""" 42 | 43 | def __init__(self, in_channels, out_channels, bias=True): 44 | """Initialize 1x1 Conv1d module.""" 45 | super(Conv1d1x1, self).__init__( 46 | in_channels, out_channels, kernel_size=1, padding=0, dilation=1, bias=bias 47 | ) 48 | 49 | 50 | class Conv2d(nn.Conv2d): 51 | """Conv2d module with customized initialization.""" 52 | 53 | def __init__(self, *args, **kwargs): 54 | """Initialize Conv2d module.""" 55 | super(Conv2d, self).__init__(*args, **kwargs) 56 | 57 | def reset_parameters(self): 58 | """Reset parameters.""" 59 | nn.init.kaiming_normal_(self.weight, mode="fan_out", nonlinearity="relu") 60 | if self.bias is not None: 61 | nn.init.constant_(self.bias, 0.0) 62 | 63 | 64 | class Conv2d1x1(Conv2d): 65 | """1x1 Conv2d with customized initialization.""" 66 | 67 | def __init__(self, in_channels, out_channels, bias=True): 68 | """Initialize 1x1 Conv2d module.""" 69 | super(Conv2d1x1, self).__init__( 70 | in_channels, out_channels, kernel_size=1, padding=0, dilation=1, bias=bias 71 | ) 72 | 73 | 74 | class ResidualBlock(nn.Module): 75 | """Residual block module in HiFiGAN.""" 76 | 77 | def __init__( 78 | self, 79 | kernel_size=3, 80 | channels=512, 81 | dilations=(1, 3, 5), 82 | bias=True, 83 | use_additional_convs=True, 84 | nonlinear_activation="LeakyReLU", 85 | nonlinear_activation_params={"negative_slope": 0.1}, 86 | ): 87 | """Initialize ResidualBlock module. 88 | 89 | Args: 90 | kernel_size (int): Kernel size of dilation convolution layer. 91 | channels (int): Number of channels for convolution layer. 92 | dilations (List[int]): List of dilation factors. 93 | use_additional_convs (bool): Whether to use additional convolution layers. 94 | bias (bool): Whether to add bias parameter in convolution layers. 95 | nonlinear_activation (str): Activation function module name. 96 | nonlinear_activation_params (dict): Hyperparameters for activation function. 97 | 98 | """ 99 | super().__init__() 100 | self.use_additional_convs = use_additional_convs 101 | self.convs1 = nn.ModuleList() 102 | if use_additional_convs: 103 | self.convs2 = nn.ModuleList() 104 | assert kernel_size % 2 == 1, "Kernel size must be odd number." 
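# For each dilation, convs1 gets an (activation -> dilated conv) pair; when use_additional_convs is set, convs2 gets a matching (activation -> dilation-1 conv) applied after it.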
105 | for dilation in dilations: 106 | if nonlinear_activation == "Snake": 107 | nonlinear = Snake(channels, **nonlinear_activation_params) 108 | else: 109 | nonlinear = getattr(nn, nonlinear_activation)(**nonlinear_activation_params) 110 | self.convs1 += [ 111 | nn.Sequential( 112 | nonlinear, 113 | nn.Conv1d( 114 | channels, 115 | channels, 116 | kernel_size, 117 | dilation=dilation, 118 | bias=bias, 119 | padding=(kernel_size - 1) // 2 * dilation, 120 | ), 121 | ) 122 | ] 123 | if use_additional_convs: 124 | if nonlinear_activation == "Snake": 125 | nonlinear = Snake(channels, **nonlinear_activation_params) 126 | else: 127 | nonlinear = getattr(nn, nonlinear_activation)(**nonlinear_activation_params) 128 | self.convs2 += [ 129 | nn.Sequential( 130 | nonlinear, 131 | nn.Conv1d( 132 | channels, 133 | channels, 134 | kernel_size, 135 | dilation=1, 136 | bias=bias, 137 | padding=(kernel_size - 1) // 2, 138 | ), 139 | ) 140 | ] 141 | 142 | def forward(self, x): 143 | """Calculate forward propagation. 144 | 145 | Args: 146 | x (Tensor): Input tensor (B, channels, T). 147 | 148 | Returns: 149 | Tensor: Output tensor (B, channels, T). 150 | 151 | """ 152 | for idx in range(len(self.convs1)): 153 | xt = self.convs1[idx](x) 154 | if self.use_additional_convs: 155 | xt = self.convs2[idx](xt) 156 | x = xt + x 157 | return x 158 | 159 | 160 | class AdaptiveResidualBlock(nn.Module): 161 | """Residual block module in HiFiGAN.""" 162 | 163 | def __init__( 164 | self, 165 | kernel_size=3, 166 | channels=512, 167 | dilations=(1, 2, 4), 168 | bias=True, 169 | use_additional_convs=True, 170 | nonlinear_activation="LeakyReLU", 171 | nonlinear_activation_params={"negative_slope": 0.1}, 172 | ): 173 | """Initialize ResidualBlock module. 174 | 175 | Args: 176 | kernel_size (int): Kernel size of dilation convolution layer. 177 | channels (int): Number of channels for convolution layer. 178 | bias (bool): Whether to add bias parameter in convolution layers. 179 | nonlinear_activation (str): Activation function module name. 180 | nonlinear_activation_params (dict): Hyperparameters for activation function. 181 | 182 | """ 183 | super().__init__() 184 | self.use_additional_convs = use_additional_convs 185 | assert kernel_size == 3, "Currently only kernel_size = 3 is supported." 
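        # Note: convsC / convsP / convsF defined below are 1x1 convolutions applied to the
        # current frame and to the past / future samples selected by pitch-dependent indexing
        # (index_initial / pd_indexing in index.py); forward() sums their outputs to realize
        # the adaptive, F0-dependent dilation of this block.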
186 | self.channels = channels 187 | self.dilations = dilations 188 | self.nonlinears = nn.ModuleList() 189 | self.convsC = nn.ModuleList() 190 | self.convsP = nn.ModuleList() 191 | self.convsF = nn.ModuleList() 192 | if use_additional_convs: 193 | self.convsA = nn.ModuleList() 194 | for _ in dilations: 195 | if nonlinear_activation == "Snake": 196 | self.nonlinears += [Snake(channels, **nonlinear_activation_params)] 197 | else: 198 | self.nonlinears += [getattr(nn, nonlinear_activation)(**nonlinear_activation_params)] 199 | self.convsC += [ 200 | Conv1d1x1( 201 | channels, 202 | channels, 203 | bias=bias, 204 | ), 205 | ] 206 | self.convsP += [ 207 | Conv1d1x1( 208 | channels, 209 | channels, 210 | bias=bias, 211 | ), 212 | ] 213 | self.convsF += [ 214 | Conv1d1x1( 215 | channels, 216 | channels, 217 | bias=bias, 218 | ), 219 | ] 220 | if use_additional_convs: 221 | if nonlinear_activation == "Snake": 222 | nonlinear = Snake(channels, **nonlinear_activation_params) 223 | else: 224 | nonlinear = getattr(nn, nonlinear_activation)(**nonlinear_activation_params) 225 | self.convsA += [ 226 | nn.Sequential( 227 | nonlinear, 228 | nn.Conv1d( 229 | channels, 230 | channels, 231 | kernel_size, 232 | dilation=1, 233 | bias=bias, 234 | padding=(kernel_size - 1) // 2, 235 | ), 236 | ) 237 | ] 238 | 239 | def forward(self, x, d): 240 | """Calculate forward propagation. 241 | 242 | Args: 243 | x (Tensor): Input tensor (B, channels, T). 244 | d (Tensor): Input pitch-dependent dilated factors (B, 1, T). 245 | 246 | Returns: 247 | Tensor: Output tensor (B, channels, T). 248 | 249 | """ 250 | batch_index, ch_index = index_initial(x.size(0), self.channels, tensor=False) 251 | batch_index = torch.tensor(batch_index).to(x.device) 252 | ch_index = torch.tensor(ch_index).to(x.device) 253 | 254 | for i, dilation in enumerate(self.dilations): 255 | xt = self.nonlinears[i](x) 256 | xP, xF = pd_indexing(xt, d, dilation, batch_index, ch_index) 257 | xt = self.convsC[i](xt) + self.convsP[i](xP) + self.convsF[i](xF) 258 | if self.use_additional_convs: 259 | xt = self.convsA[i](xt) 260 | x = xt + x 261 | return x 262 | -------------------------------------------------------------------------------- /python/generator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """HiFiGAN and SiFiGAN Generator modules. 7 | 8 | References: 9 | - https://github.com/kan-bayashi/ParallelWaveGAN 10 | - https://github.com/bigpon/QPPWG 11 | - https://github.com/jik876/hifi-gan 12 | 13 | """ 14 | 15 | from logging import getLogger 16 | 17 | import torch 18 | import torch.nn as nn 19 | from residual_block import AdaptiveResidualBlock, Conv1d, ResidualBlock 20 | 21 | # A logger for this file 22 | logger = getLogger(__name__) 23 | 24 | 25 | 26 | class SiFiGANGenerator(nn.Module): 27 | """SiFiGAN generator module.""" 28 | 29 | def __init__( 30 | self, 31 | in_channels, 32 | out_channels=1, 33 | channels=512, 34 | kernel_size=7, 35 | upsample_scales=(5, 4, 3, 2), 36 | upsample_kernel_sizes=(10, 8, 6, 4), 37 | source_network_params={ 38 | "resblock_kernel_size": 3, # currently only 3 is supported. 
39 | "resblock_dilations": [(1,), (1, 2), (1, 2, 4), (1, 2, 4, 8)], 40 | "use_additional_convs": True, 41 | }, 42 | filter_network_params={ 43 | "resblock_kernel_sizes": (3, 5, 7), 44 | "resblock_dilations": [(1, 3, 5), (1, 3, 5), (1, 3, 5)], 45 | "use_additional_convs": False, 46 | }, 47 | share_upsamples=False, 48 | share_downsamples=False, 49 | bias=True, 50 | nonlinear_activation="LeakyReLU", 51 | nonlinear_activation_params={"negative_slope": 0.1}, 52 | use_weight_norm=True, 53 | requires_grad=True 54 | ): 55 | """Initialize SiFiGANGenerator module. 56 | 57 | Args: 58 | in_channels (int): Number of input channels. 59 | out_channels (int): Number of output channels. 60 | channels (int): Number of hidden representation channels. 61 | kernel_size (int): Kernel size of initial and final conv layer. 62 | upsample_scales (list): List of upsampling scales. 63 | upsample_kernel_sizes (list): List of kernel sizes for upsampling layers. 64 | source_network_params (dict): Parameters for source-network. 65 | filter_network_params (dict): Parameters for filter-network. 66 | share_upsamples (bool): Whether to share up-sampling transposed CNNs. 67 | share_downsamples (bool): Whether to share down-sampling CNNs. 68 | bias (bool): Whether to add bias parameter in convolution layers. 69 | nonlinear_activation (str): Activation function module name. 70 | nonlinear_activation_params (dict): Hyperparameters for activation function. 71 | use_weight_norm (bool): Whether to use weight norm. 72 | If set to true, it will be applied to all of the conv layers. 73 | 74 | """ 75 | super().__init__() 76 | # check hyperparameters are valid 77 | assert kernel_size % 2 == 1, "Kernel size must be odd number." 78 | assert len(upsample_scales) == len(upsample_kernel_sizes) 79 | 80 | # define modules 81 | self.num_upsamples = len(upsample_kernel_sizes) 82 | self.source_network_params = source_network_params 83 | self.filter_network_params = filter_network_params 84 | self.share_upsamples = share_upsamples 85 | self.share_downsamples = share_downsamples 86 | self.sn = nn.ModuleDict() 87 | self.fn = nn.ModuleDict() 88 | self.input_conv = Conv1d( 89 | in_channels, 90 | channels, 91 | kernel_size, 92 | bias=bias, 93 | padding=(kernel_size - 1) // 2, 94 | ) 95 | self.sn["upsamples"] = nn.ModuleList() 96 | self.fn["upsamples"] = nn.ModuleList() 97 | self.sn["blocks"] = nn.ModuleList() 98 | self.fn["blocks"] = nn.ModuleList() 99 | for i in range(len(upsample_kernel_sizes)): 100 | assert upsample_kernel_sizes[i] == 2 * upsample_scales[i] 101 | self.sn["upsamples"] += [ 102 | nn.Sequential( 103 | getattr(nn, nonlinear_activation)(**nonlinear_activation_params), 104 | nn.ConvTranspose1d( 105 | channels // (2 ** i), 106 | channels // (2 ** (i + 1)), 107 | upsample_kernel_sizes[i], 108 | upsample_scales[i], 109 | padding=upsample_scales[i] // 2 + upsample_scales[i] % 2, 110 | output_padding=upsample_scales[i] % 2, 111 | bias=bias, 112 | ), 113 | ) 114 | ] 115 | if not share_upsamples: 116 | self.fn["upsamples"] += [ 117 | nn.Sequential( 118 | getattr(nn, nonlinear_activation)(**nonlinear_activation_params), 119 | nn.ConvTranspose1d( 120 | channels // (2 ** i), 121 | channels // (2 ** (i + 1)), 122 | upsample_kernel_sizes[i], 123 | upsample_scales[i], 124 | padding=upsample_scales[i] // 2 + upsample_scales[i] % 2, 125 | output_padding=upsample_scales[i] % 2, 126 | bias=bias, 127 | ), 128 | ) 129 | ] 130 | self.sn["blocks"] += [ 131 | AdaptiveResidualBlock( 132 | kernel_size=source_network_params["resblock_kernel_size"], 133 | 
channels=channels // (2 ** (i + 1)), 134 | dilations=source_network_params["resblock_dilations"][i], 135 | bias=bias, 136 | use_additional_convs=source_network_params["use_additional_convs"], 137 | nonlinear_activation=nonlinear_activation, 138 | nonlinear_activation_params=nonlinear_activation_params, 139 | ) 140 | ] 141 | for j in range(len(filter_network_params["resblock_kernel_sizes"])): 142 | self.fn["blocks"] += [ 143 | ResidualBlock( 144 | kernel_size=filter_network_params["resblock_kernel_sizes"][j], 145 | channels=channels // (2 ** (i + 1)), 146 | dilations=filter_network_params["resblock_dilations"][j], 147 | bias=bias, 148 | use_additional_convs=filter_network_params["use_additional_convs"], 149 | nonlinear_activation=nonlinear_activation, 150 | nonlinear_activation_params=nonlinear_activation_params, 151 | ) 152 | ] 153 | self.sn["output_conv"] = nn.Sequential( 154 | nn.LeakyReLU(), 155 | nn.Conv1d( 156 | channels // (2 ** (i + 1)), 157 | out_channels, 158 | kernel_size, 159 | bias=bias, 160 | padding=(kernel_size - 1) // 2, 161 | ), 162 | ) 163 | self.fn["output_conv"] = nn.Sequential( 164 | nn.LeakyReLU(), 165 | nn.Conv1d( 166 | channels // (2 ** (i + 1)), 167 | out_channels, 168 | kernel_size, 169 | bias=bias, 170 | padding=(kernel_size - 1) // 2, 171 | ), 172 | nn.Tanh(), 173 | ) 174 | 175 | # sine embedding layers 176 | self.sn["emb"] = Conv1d( 177 | 1, 178 | channels // (2 ** len(upsample_kernel_sizes)), 179 | kernel_size, 180 | bias=bias, 181 | padding=(kernel_size - 1) // 2, 182 | ) 183 | # down-sampling CNNs 184 | self.sn["downsamples"] = nn.ModuleList() 185 | for i in reversed(range(1,len(upsample_kernel_sizes))): 186 | self.sn["downsamples"] += [ 187 | nn.Sequential( 188 | nn.Conv1d( 189 | channels // (2 ** (i + 1)), 190 | channels // (2 ** i), 191 | upsample_kernel_sizes[i], 192 | upsample_scales[i], 193 | padding=upsample_scales[i] - (upsample_kernel_sizes[i] % 2 == 0), 194 | bias=bias, 195 | ), 196 | getattr(nn, nonlinear_activation)(**nonlinear_activation_params), 197 | ) 198 | ] 199 | if not share_downsamples: 200 | self.fn["downsamples"] = nn.ModuleList() 201 | for i in reversed(range(1,len(upsample_kernel_sizes))): 202 | self.fn["downsamples"] += [ 203 | nn.Sequential( 204 | nn.Conv1d( 205 | channels // (2 ** (i + 1)), 206 | channels // (2 ** i), 207 | upsample_kernel_sizes[i], 208 | upsample_scales[i], 209 | padding=upsample_scales[i] - (upsample_kernel_sizes[i] % 2 == 0), 210 | bias=bias, 211 | ), 212 | getattr(nn, nonlinear_activation)(**nonlinear_activation_params), 213 | ) 214 | ] 215 | 216 | # apply weight norm 217 | if use_weight_norm: 218 | self.apply_weight_norm() 219 | 220 | # reset parameters 221 | self.reset_parameters() 222 | 223 | if requires_grad == False: 224 | for param in self.parameters(): 225 | param.requires_grad = False 226 | 227 | def forward(self, x, c, d, sid): 228 | """Calculate forward propagation. 229 | 230 | Args: 231 | x (Tensor): Input sine signal (B, 1, T). 232 | c (Tensor): Input tensor (B, in_channels, T). 233 | d (List): F0-dependent dilation factors [(B, 1, T) x num_upsamples]. 234 | 235 | Returns: 236 | Tensor: Output tensor (B, out_channels, T). 
237 | 238 | """ 239 | 240 | # currently, same input feature is input to each network 241 | c = self.input_conv(c) 242 | e = c 243 | 244 | # source-network forward 245 | x = self.sn["emb"](x) 246 | embs = [x] 247 | for i in range(self.num_upsamples - 1): 248 | x = self.sn["downsamples"][i](x) 249 | embs += [x] 250 | for i in range(self.num_upsamples): 251 | # excitation generation network 252 | e = self.sn["upsamples"][i](e) + embs[-i - 1] 253 | e = self.sn["blocks"][i](e, d[i]) 254 | e_ = self.sn["output_conv"](e) 255 | 256 | # filter-network forward 257 | embs = [e] 258 | for i in range(self.num_upsamples - 1): 259 | if self.share_downsamples: 260 | e = self.sn["downsamples"][i](e) 261 | else: 262 | e = self.fn["downsamples"][i](e) 263 | embs += [e] 264 | num_blocks = len(self.filter_network_params["resblock_kernel_sizes"]) 265 | for i in range(self.num_upsamples): 266 | # resonance filtering network 267 | if self.share_upsamples: 268 | c = self.sn["upsamples"][i](c) + embs[-i - 1] 269 | else: 270 | c = self.fn["upsamples"][i](c) + embs[-i - 1] 271 | cs = 0.0 # initialize 272 | for j in range(num_blocks): 273 | cs += self.fn["blocks"][i * num_blocks + j](c) 274 | c = cs / num_blocks 275 | c = self.fn["output_conv"](c) 276 | 277 | return c, e_ 278 | 279 | def reset_parameters(self): 280 | """Reset parameters. 281 | 282 | This initialization follows the official implementation manner. 283 | https://github.com/jik876/hifi-gan/blob/master/models.py 284 | 285 | """ 286 | 287 | def _reset_parameters(m): 288 | if isinstance(m, (nn.Conv1d, nn.ConvTranspose1d)): 289 | m.weight.data.normal_(0.0, 0.01) 290 | logger.debug(f"Reset parameters in {m}.") 291 | 292 | self.apply(_reset_parameters) 293 | 294 | def remove_weight_norm(self): 295 | """Remove weight normalization module from all of the layers.""" 296 | 297 | def _remove_weight_norm(m): 298 | try: 299 | logger.debug(f"Weight norm is removed from {m}.") 300 | nn.utils.remove_weight_norm(m) 301 | except ValueError: # this module didn't have weight norm 302 | return 303 | 304 | self.apply(_remove_weight_norm) 305 | 306 | def apply_weight_norm(self): 307 | """Apply weight normalization module from all of the layers.""" 308 | 309 | def _apply_weight_norm(m): 310 | if isinstance(m, nn.Conv1d) or isinstance(m, nn.ConvTranspose1d): 311 | nn.utils.weight_norm(m) 312 | logger.debug(f"Weight norm is applied to {m}.") 313 | 314 | self.apply(_apply_weight_norm) -------------------------------------------------------------------------------- /python/models.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | import commons 9 | import modules 10 | 11 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 12 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 13 | from commons import init_weights, get_padding 14 | from generator import SiFiGANGenerator 15 | from features import SignalGenerator, dilated_factor 16 | 17 | class TextEncoder(nn.Module): 18 | def __init__(self, 19 | out_channels, 20 | hidden_channels, 21 | requires_grad=True): 22 | super().__init__() 23 | self.out_channels = out_channels 24 | self.hidden_channels = hidden_channels 25 | self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1) 26 | #パラメータを学習しない 27 | if requires_grad == False: 28 | for param in self.parameters(): 29 | param.requires_grad = False 30 | 31 | def forward(self, x, 
x_lengths): 32 | x = torch.transpose(x.half(), 1, -1) # [b, h, t] 33 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 34 | stats = self.proj(x) * x_mask 35 | m, logs = torch.split(stats, self.out_channels, dim=1) 36 | return x, m, logs, x_mask 37 | 38 | class ResidualCouplingBlock(nn.Module): 39 | def __init__(self, 40 | channels, 41 | hidden_channels, 42 | kernel_size, 43 | dilation_rate, 44 | n_layers, 45 | n_flows=4, 46 | gin_channels=0, 47 | requires_grad=True): 48 | super().__init__() 49 | self.channels = channels 50 | self.hidden_channels = hidden_channels 51 | self.kernel_size = kernel_size 52 | self.dilation_rate = dilation_rate 53 | self.n_layers = n_layers 54 | self.n_flows = n_flows 55 | self.gin_channels = gin_channels 56 | 57 | self.flows = nn.ModuleList() 58 | for i in range(n_flows): 59 | self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) 60 | self.flows.append(modules.Flip()) 61 | 62 | #パラメータを学習しない 63 | if requires_grad == False: 64 | for param in self.parameters(): 65 | param.requires_grad = False 66 | 67 | def forward(self, x, x_mask, g=None, reverse=False): 68 | if not reverse: 69 | for flow in self.flows: 70 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 71 | else: 72 | for flow in reversed(self.flows): 73 | x = flow(x, x_mask, g=g, reverse=reverse) 74 | return x 75 | 76 | 77 | class PosteriorEncoder(nn.Module): 78 | def __init__(self, 79 | in_channels, 80 | out_channels, 81 | hidden_channels, 82 | kernel_size, 83 | dilation_rate, 84 | n_layers, 85 | gin_channels=0, 86 | requires_grad=True): 87 | super().__init__() 88 | self.in_channels = in_channels 89 | self.out_channels = out_channels 90 | self.hidden_channels = hidden_channels 91 | self.kernel_size = kernel_size 92 | self.dilation_rate = dilation_rate 93 | self.n_layers = n_layers 94 | self.gin_channels = gin_channels 95 | 96 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 97 | self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) 98 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 99 | 100 | #パラメータを学習しない 101 | if requires_grad == False: 102 | for param in self.parameters(): 103 | param.requires_grad = False 104 | 105 | 106 | def forward(self, x, x_lengths, g=None): 107 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 108 | x = self.pre(x) * x_mask 109 | x = self.enc(x, x_mask, g=g) 110 | stats = self.proj(x) * x_mask 111 | m, logs = torch.split(stats, self.out_channels, dim=1) 112 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 113 | return z, m, logs, x_mask 114 | 115 | 116 | class Generator(torch.nn.Module): 117 | def __init__(self, 118 | initial_channel, 119 | resblock, 120 | resblock_kernel_sizes, 121 | resblock_dilation_sizes, 122 | upsample_rates, 123 | upsample_initial_channel, 124 | upsample_kernel_sizes, 125 | requires_grad=True): 126 | super(Generator, self).__init__() 127 | self.num_kernels = len(resblock_kernel_sizes) 128 | self.num_upsamples = len(upsample_rates) 129 | self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) 130 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 131 | 132 | self.ups = nn.ModuleList() 133 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 134 | self.ups.append(weight_norm( 135 | ConvTranspose1d(upsample_initial_channel//(2**i), 
upsample_initial_channel//(2**(i+1)), 136 | k, u, padding=(k-u)//2))) 137 | 138 | self.resblocks = nn.ModuleList() 139 | for i in range(len(self.ups)): 140 | ch = upsample_initial_channel//(2**(i+1)) 141 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): 142 | self.resblocks.append(resblock(ch, k, d)) 143 | 144 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 145 | self.ups.apply(init_weights) 146 | 147 | if requires_grad == False: 148 | for param in self.parameters(): 149 | param.requires_grad = False 150 | 151 | 152 | def forward(self, x, g=None): 153 | x = self.conv_pre(x) 154 | 155 | for i in range(self.num_upsamples): 156 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 157 | x = self.ups[i](x) 158 | xs = None 159 | for j in range(self.num_kernels): 160 | if xs is None: 161 | xs = self.resblocks[i*self.num_kernels+j](x) 162 | else: 163 | xs += self.resblocks[i*self.num_kernels+j](x) 164 | x = xs / self.num_kernels 165 | x = F.leaky_relu(x) 166 | x = self.conv_post(x) 167 | x = torch.tanh(x) 168 | 169 | return x 170 | 171 | def remove_weight_norm(self): 172 | print('Removing weight norm...') 173 | for l in self.ups: 174 | remove_weight_norm(l) 175 | for l in self.resblocks: 176 | l.remove_weight_norm() 177 | 178 | 179 | class DiscriminatorP(torch.nn.Module): 180 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 181 | super(DiscriminatorP, self).__init__() 182 | self.period = period 183 | self.use_spectral_norm = use_spectral_norm 184 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 185 | self.convs = nn.ModuleList([ 186 | norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 187 | norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 188 | norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 189 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 190 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), 191 | ]) 192 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 193 | 194 | def forward(self, x): 195 | fmap = [] 196 | 197 | # 1d to 2d 198 | b, c, t = x.shape 199 | if t % self.period != 0: # pad first 200 | n_pad = self.period - (t % self.period) 201 | x = F.pad(x, (0, n_pad), "reflect") 202 | t = t + n_pad 203 | x = x.view(b, c, t // self.period, self.period) 204 | 205 | for l in self.convs: 206 | x = l(x) 207 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 208 | fmap.append(x) 209 | x = self.conv_post(x) 210 | fmap.append(x) 211 | x = torch.flatten(x, 1, -1) 212 | 213 | return x, fmap 214 | 215 | 216 | class DiscriminatorS(torch.nn.Module): 217 | def __init__(self, use_spectral_norm=False): 218 | super(DiscriminatorS, self).__init__() 219 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 220 | self.convs = nn.ModuleList([ 221 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 222 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 223 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 224 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 225 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 226 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 227 | ]) 228 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 229 | 230 | def forward(self, x): 231 | fmap = [] 232 | 233 | for l in self.convs: 234 | x = l(x) 235 | x = 
F.leaky_relu(x, modules.LRELU_SLOPE) 236 | fmap.append(x) 237 | x = self.conv_post(x) 238 | fmap.append(x) 239 | x = torch.flatten(x, 1, -1) 240 | 241 | return x, fmap 242 | 243 | 244 | class MultiPeriodDiscriminator(torch.nn.Module): 245 | def __init__(self, use_spectral_norm=False): 246 | super(MultiPeriodDiscriminator, self).__init__() 247 | #periods = [2,3,5,7,11] 248 | periods = [3,5,7,11,13] 249 | 250 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 251 | discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] 252 | self.discriminators = nn.ModuleList(discs) 253 | 254 | def forward(self, y, y_hat, flag = True): 255 | if flag: 256 | y_d_rs = [] 257 | y_d_gs = [] 258 | fmap_rs = [] 259 | fmap_gs = [] 260 | for i, d in enumerate(self.discriminators): 261 | y_d_r, fmap_r = d(y) 262 | y_d_g, fmap_g = d(y_hat) 263 | y_d_rs.append(y_d_r) 264 | y_d_gs.append(y_d_g) 265 | fmap_rs.append(fmap_r) 266 | fmap_gs.append(fmap_g) 267 | 268 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 269 | else: 270 | y_d_gs = [] 271 | with torch.no_grad(): 272 | for i, d in enumerate(self.discriminators): 273 | y_d_g, _ = d(y_hat) 274 | y_d_gs.append(y_d_g) 275 | 276 | return y_d_gs 277 | 278 | 279 | class SynthesizerTrn(nn.Module): 280 | """ 281 | Synthesizer for Training 282 | """ 283 | 284 | def __init__(self, 285 | spec_channels, 286 | segment_size, 287 | inter_channels, 288 | hidden_channels, 289 | upsample_rates, 290 | upsample_initial_channel, 291 | upsample_kernel_sizes, 292 | n_flow, 293 | dec_out_channels=1, 294 | dec_kernel_size=7, 295 | n_speakers=0, 296 | gin_channels=0, 297 | requires_grad_pe=True, 298 | requires_grad_flow=True, 299 | requires_grad_text_enc=True, 300 | requires_grad_dec=True, 301 | requires_grad_emb_g=True, 302 | sample_rate=24000, 303 | hop_size=128, 304 | sine_amp=0.1, 305 | noise_amp=0.003, 306 | signal_types=["sine"], 307 | dense_factors=[0.5, 1, 4, 8], 308 | upsample_scales=[8, 4, 2, 2], 309 | ): 310 | 311 | super().__init__() 312 | self.spec_channels = spec_channels 313 | self.hidden_channels = hidden_channels 314 | self.upsample_rates = upsample_rates 315 | self.upsample_initial_channel = upsample_initial_channel 316 | self.upsample_kernel_sizes = upsample_kernel_sizes 317 | self.segment_size = segment_size 318 | self.dec_out_channels = dec_out_channels 319 | self.dec_kernel_size = dec_kernel_size 320 | self.n_speakers = n_speakers 321 | self.gin_channels = gin_channels 322 | self.requires_grad_pe = requires_grad_pe 323 | self.requires_grad_flow = requires_grad_flow 324 | self.requires_grad_text_enc = requires_grad_text_enc 325 | self.requires_grad_dec = requires_grad_dec 326 | self.requires_grad_emb_g = requires_grad_emb_g 327 | self.sample_rate = sample_rate 328 | self.hop_size = hop_size 329 | self.sine_amp = sine_amp 330 | self.noise_amp = noise_amp 331 | self.signal_types = signal_types 332 | self.dense_factors = dense_factors 333 | self.upsample_scales = upsample_scales 334 | 335 | self.enc_q = PosteriorEncoder( 336 | spec_channels, 337 | inter_channels, 338 | hidden_channels, 339 | 5, 340 | 1, 341 | 16, 342 | gin_channels=gin_channels, 343 | requires_grad=requires_grad_pe) 344 | self.enc_p = TextEncoder( 345 | inter_channels, 346 | hidden_channels, 347 | requires_grad=requires_grad_text_enc) 348 | self.dec = SiFiGANGenerator( 349 | in_channels=inter_channels, 350 | out_channels=dec_out_channels, 351 | channels=upsample_initial_channel, 352 | kernel_size=dec_kernel_size, 353 | upsample_scales=upsample_rates, 354 | 
upsample_kernel_sizes=upsample_kernel_sizes, 355 | requires_grad=requires_grad_dec) 356 | self.flow = ResidualCouplingBlock( 357 | inter_channels, 358 | hidden_channels, 359 | 5, 360 | 1, 361 | 4, 362 | n_flows=n_flow, 363 | gin_channels=gin_channels, 364 | requires_grad=requires_grad_flow) 365 | self.signal_generator = SignalGenerator( 366 | sample_rate=sample_rate, 367 | hop_size=hop_size, 368 | noise_amp=noise_amp, 369 | signal_types=signal_types 370 | ) 371 | 372 | if n_speakers > 1: 373 | self.emb_g = nn.Embedding(n_speakers, gin_channels) 374 | self.emb_g.requires_grad = requires_grad_emb_g 375 | 376 | def forward(self, x, x_lengths, y, y_lengths, f0, slice_id, sid=None, target_ids=None): 377 | sin, d = self.make_sin_d(f0) 378 | 379 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths) 380 | #target sid 作成 381 | target_sids = self.make_random_target_sids(target_ids, sid) 382 | 383 | if self.n_speakers > 0: 384 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 385 | tgt_g = self.emb_g(target_sids).unsqueeze(-1) # [b, h, 1] 386 | else: 387 | g = None 388 | 389 | #PE 390 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) 391 | #Flow 392 | z_p = self.flow(z, y_mask, g=g) 393 | #VC 394 | tgt_z = self.flow(z_p, y_mask, g=tgt_g, reverse=True) 395 | #アライメントの作成 396 | liner_alignment = F.one_hot(torch.arange(0, x.shape[2]+2)).cuda() 397 | liner_alignment = torch.stack([liner_alignment for _ in range(x.shape[0])], axis=0) 398 | liner_alignment = F.interpolate(liner_alignment.float(), size=(z.shape[2]), mode='linear', align_corners=True) 399 | liner_alignment = liner_alignment[:,1:-1,:] 400 | #TextEncとPEのshape合わせ 401 | m_p = torch.matmul(m_p, liner_alignment) 402 | logs_p = torch.matmul(logs_p, liner_alignment) 403 | 404 | #slice 405 | z_slice = commons.slice_segments(z, slice_id, self.segment_size) 406 | #targetのslice 407 | tgt_z_slice = commons.slice_segments(tgt_z, slice_id, self.segment_size) 408 | #Dec 409 | o = self.dec(sin, z_slice, d, sid=g) 410 | tgt_o = self.dec(sin, tgt_z_slice, d, sid=tgt_g) 411 | 412 | return (o, tgt_o), slice_id, x_mask, y_mask, ((z, z_p, m_p), logs_p, m_q, logs_q) 413 | 414 | def make_sin_d(self, f0): 415 | # f0 から sin と d を作成 416 | # f0 : [b, 1, t] 417 | # sin : [b, 1, t] 418 | # d : [4][b, 1, t] 419 | prod_upsample_scales = np.cumprod(self.upsample_scales) 420 | dfs_batch = [] 421 | for df, us in zip(self.dense_factors, prod_upsample_scales): 422 | dilated_tensor = dilated_factor(f0, self.sample_rate, df) 423 | #result += [torch.repeat_interleave(dilated_tensor, us, dim=1)] 424 | result = [torch.stack([dilated_tensor for _ in range(us)], -1).reshape(dilated_tensor.shape[0], -1)] 425 | dfs_batch.append(torch.cat(result, dim=0).unsqueeze(1)) 426 | in_batch = self.signal_generator(f0) 427 | 428 | return in_batch, dfs_batch 429 | 430 | def make_random_target_sids(self, target_ids, sid): 431 | # target_sids は target_ids をランダムで埋める 432 | target_sids = torch.zeros_like(sid) 433 | for i in range(len(target_sids)): 434 | source_id = sid[i] 435 | deleted_target_ids = target_ids[target_ids != source_id] # source_id と target_id が同じにならないよう sid と同じものを削除 436 | if len(deleted_target_ids) >= 1: 437 | target_sids[i] = deleted_target_ids[torch.randint(len(deleted_target_ids), (1,))] 438 | else: 439 | # target_id 候補が無いときは仕方ないので sid を使う 440 | target_sids[i] = source_id 441 | return target_sids 442 | 443 | def voice_conversion(self, y, y_lengths, f0, sid_src, sid_tgt): 444 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 
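        # Conversion path: build the sine excitation and pitch-dependent dilation factors from f0,
        # posterior-encode the source spectrogram conditioned on the source speaker embedding,
        # map it through the flow, invert the flow with the target speaker embedding, and
        # synthesize with the SiFiGAN decoder (o_hat[0] is the filter-network waveform output).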
445 | sin, d = self.make_sin_d(f0) 446 | g_src = self.emb_g(sid_src).unsqueeze(-1) 447 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 448 | z, _, _, y_mask = self.enc_q(y, y_lengths, g=g_src) 449 | z_p = self.flow(z, y_mask, g=g_src) 450 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) 451 | o_hat = self.dec(sin, z_hat * y_mask, d, sid=g_tgt) 452 | return o_hat[0] 453 | 454 | def voice_ra_pa_db(self, y, y_lengths, sid_src, sid_tgt): 455 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 456 | g_src = self.emb_g(sid_src).unsqueeze(-1) 457 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 458 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) 459 | o_hat = self.dec(z * y_mask, g=g_tgt) 460 | return o_hat, y_mask, (z) 461 | 462 | def voice_ra_pa_da(self, y, y_lengths, sid_src, sid_tgt): 463 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 464 | g_src = self.emb_g(sid_src).unsqueeze(-1) 465 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 466 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) 467 | o_hat = self.dec(z * y_mask, g=g_src) 468 | return o_hat, y_mask, (z) 469 | 470 | def voice_conversion_cycle(self, y, y_lengths, sid_src, sid_tgt): 471 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 472 | g_src = self.emb_g(sid_src).unsqueeze(-1) 473 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 474 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) 475 | z_p = self.flow(z, y_mask, g=g_src) 476 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) 477 | z_p_hat = self.flow(z_hat, y_mask, g=g_tgt) 478 | z_hat_hat = self.flow(z_p_hat, y_mask, g=g_src, reverse=True) 479 | o_hat = self.dec(z_hat_hat * y_mask, g=g_tgt) 480 | return o_hat, y_mask, (z, z_p, z_hat) 481 | 482 | def save_synthesizer(self, path): 483 | enc_q = self.enc_q.state_dict() 484 | dec = self.dec.state_dict() 485 | emb_g = self.emb_g.state_dict() 486 | torch.save({'enc_q': enc_q,'dec': dec, 'emb_g': emb_g}, path) 487 | 488 | def load_synthesizer(self, path): 489 | dict = torch.load(path, map_location='cpu') 490 | enc_q = dict['enc_q'] 491 | dec = dict['dec'] 492 | emb_g = dict['emb_g'] 493 | self.enc_q.load_state_dict(enc_q) 494 | self.dec.load_state_dict(dec) 495 | self.emb_g.load_state_dict(emb_g) 496 | 497 | -------------------------------------------------------------------------------- /python/mmvc_client.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | #use thread limit 3 | import os 4 | os.environ["OMP_NUM_THREADS"] = "1" 5 | import sys 6 | import json 7 | import csv 8 | import numpy as np 9 | import torch 10 | import onnxruntime as ort 11 | import pyaudio 12 | import sounddevice as sd 13 | import soundfile as sf 14 | import wave 15 | #noice reduce 16 | import noisereduce as nr 17 | #ファイルダイアログ関連 18 | import tkinter as tk #add 19 | from tkinter import filedialog #add 20 | 21 | #user lib 22 | from models import SynthesizerTrn 23 | 24 | #remove F0_SCALE 25 | 26 | import time 27 | import pyworld as pw 28 | from scipy.interpolate import interp1d 29 | from features import SignalGenerator, dilated_factor 30 | 31 | 32 | def load_checkpoint(checkpoint_path, model, optimizer=None): 33 | assert os.path.isfile(checkpoint_path), f"No such file or directory: {checkpoint_path}" 34 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') 35 | iteration = checkpoint_dict['iteration'] 36 | learning_rate = checkpoint_dict['learning_rate'] 37 | if optimizer is not None: 38 | 
optimizer.load_state_dict(checkpoint_dict['optimizer']) 39 | saved_state_dict = { 40 | **checkpoint_dict['pe'], 41 | **checkpoint_dict['flow'], 42 | **checkpoint_dict['text_enc'], 43 | **checkpoint_dict['dec'], 44 | **checkpoint_dict['emb_g'] 45 | } 46 | if hasattr(model, 'module'): 47 | state_dict = model.module.state_dict() 48 | else: 49 | state_dict = model.state_dict() 50 | new_state_dict= {} 51 | for k, v in state_dict.items(): 52 | try: 53 | new_state_dict[k] = saved_state_dict[k] 54 | except: 55 | new_state_dict[k] = v 56 | if hasattr(model, 'module'): 57 | model.module.load_state_dict(new_state_dict) 58 | else: 59 | model.load_state_dict(new_state_dict) 60 | return model, optimizer, learning_rate, iteration 61 | 62 | 63 | def get_hparams_from_file(config_path): 64 | with open(config_path, "r", encoding="utf-8") as f: 65 | data = f.read() 66 | config = json.loads(data) 67 | 68 | hparams =HParams(**config) 69 | return hparams 70 | 71 | 72 | def read_correspondence_file(filename, delimiter='|', newline='\n'): 73 | data = {} 74 | with open(filename, "r", encoding="utf-8", newline=newline) as f: 75 | csv_reader = csv.reader(f, delimiter=delimiter) 76 | for row in csv_reader: 77 | sid = int(row[0]) 78 | f0 = float(row[1]) 79 | data[sid] = f0 80 | return data 81 | 82 | class HParams(): 83 | def __init__(self, **kwargs): 84 | for k, v in kwargs.items(): 85 | if type(v) == dict: 86 | v = HParams(**v) 87 | self[k] = v 88 | 89 | def keys(self): 90 | return self.__dict__.keys() 91 | 92 | def items(self): 93 | return self.__dict__.items() 94 | 95 | def values(self): 96 | return self.__dict__.values() 97 | 98 | def __len__(self): 99 | return len(self.__dict__) 100 | 101 | def __getitem__(self, key): 102 | return getattr(self, key) 103 | 104 | def __setitem__(self, key, value): 105 | return setattr(self, key, value) 106 | 107 | def __contains__(self, key): 108 | return key in self.__dict__ 109 | 110 | def __repr__(self): 111 | return self.__dict__.__repr__() 112 | 113 | 114 | class Hyperparameters(): 115 | CHANNELS = 1 #モノラル 116 | FORMAT = pyaudio.paInt16 117 | INPUT_DEVICE_1 = None 118 | INPUT_DEVICE_2 = None 119 | OUTPUT_DEVICE_1 = None 120 | CONFIG_JSON_PATH = None 121 | MODEL_PATH = None 122 | NOISE_FILE = None 123 | CORRESPONDENCE_PATH = None 124 | FLAME_LENGTH = None 125 | SOURCE_ID = None 126 | TARGET_ID = None 127 | F0_SCALE = None 128 | MIC_SCALE = None 129 | USE_NR = None 130 | VOICE_LIST = None 131 | VOICE_LABEL = None 132 | VOICE_F0 = None 133 | #jsonから取得 134 | SAMPLE_RATE = None 135 | MAX_WAV_VALUE = None 136 | FILTER_LENGTH = None 137 | HOP_LENGTH = None 138 | SEGMENT_SIZE = None 139 | N_SPEAKERS = None 140 | CONFIG_JSON_Body = None 141 | DELAY_FLAMES = None 142 | #thread share var 143 | REC_NOISE_END_FLAG = False 144 | VC_END_FLAG = False 145 | OVERLAP = None 146 | DISPOSE_STFT_SPECS = 0 147 | DISPOSE_CONV1D_SPECS = 0 148 | INPUT_FILENAME = None 149 | OUTPUT_FILENAME = None 150 | GPU_ID = 0 151 | Voice_Selector_Flag = None 152 | USE_ONNX = None 153 | ONNX_PROVIDERS = None 154 | ORT_ENABLE_BASIC = None 155 | hps = None 156 | 157 | def set_input_device_1(self, value): 158 | Hyperparameters.INPUT_DEVICE_1 = value 159 | 160 | def set_input_device_2(self, value): 161 | Hyperparameters.INPUT_DEVICE_2 = value 162 | 163 | def set_output_device_1(self, value): 164 | Hyperparameters.OUTPUT_DEVICE_1 = value 165 | 166 | def set_config_path(self, value): 167 | Hyperparameters.CONFIG_JSON_PATH = value 168 | self.hps = get_hparams_from_file(Hyperparameters.CONFIG_JSON_PATH) 169 | 
Hyperparameters.CONFIG_JSON_Body = self.hps 170 | Hyperparameters.SAMPLE_RATE = self.hps.data.sampling_rate 171 | Hyperparameters.MAX_WAV_VALUE = self.hps.data.max_wav_value 172 | Hyperparameters.FILTER_LENGTH = self.hps.data.filter_length 173 | Hyperparameters.HOP_LENGTH = self.hps.data.hop_length 174 | Hyperparameters.SEGMENT_SIZE = self.hps.train.segment_size 175 | Hyperparameters.N_SPEAKERS = self.hps.data.n_speakers 176 | if not hasattr(self.hps.model, "use_mel_train"): 177 | self.hps.model.use_mel_train = False 178 | 179 | def set_model_path(self, value): 180 | Hyperparameters.MODEL_PATH = value 181 | 182 | def set_NOISE_FILE(self, value): 183 | Hyperparameters.NOISE_FILE = value 184 | 185 | def set_CORRESPONDENCE_PATH(self, value): 186 | Hyperparameters.CORRESPONDENCE_PATH = value 187 | 188 | def set_FLAME_LENGTH(self, value): 189 | Hyperparameters.FLAME_LENGTH = value 190 | 191 | def set_SOURCE_ID(self, value): 192 | Hyperparameters.SOURCE_ID = value 193 | 194 | def set_TARGET_ID(self, value): 195 | Hyperparameters.TARGET_ID = value 196 | 197 | def set_F0_SCALE(self, value): 198 | Hyperparameters.F0_SCALE = value 199 | 200 | def set_MIC_SCALE(self, value): 201 | Hyperparameters.MIC_SCALE = value 202 | 203 | def set_OVERLAP(self, value): 204 | Hyperparameters.OVERLAP = value 205 | 206 | def set_USE_NR(self, value): 207 | Hyperparameters.USE_NR = value 208 | 209 | def set_VOICE_LIST(self, value): 210 | Hyperparameters.VOICE_LIST = value 211 | 212 | def set_VOICE_LABEL(self, value): 213 | Hyperparameters.VOICE_LABEL = value 214 | 215 | def set_VOICE_F0(self, value): 216 | Hyperparameters.VOICE_F0 = value 217 | 218 | def set_DELAY_FLAMES(self, value): 219 | Hyperparameters.DELAY_FLAMES = value 220 | 221 | def set_DISPOSE_STFT_SPECS(self, value): 222 | Hyperparameters.DISPOSE_STFT_SPECS = value 223 | 224 | def set_DISPOSE_CONV1D_SPECS(self, value): 225 | Hyperparameters.DISPOSE_CONV1D_SPECS = value 226 | 227 | def set_INPUT_FILENAME(self, value): 228 | Hyperparameters.INPUT_FILENAME = value 229 | 230 | def set_OUTPUT_FILENAME(self, value): 231 | Hyperparameters.OUTPUT_FILENAME = value 232 | 233 | def set_GPU_ID(self, value): 234 | Hyperparameters.GPU_ID = value 235 | 236 | def set_Voice_Selector(self, value): 237 | Hyperparameters.Voice_Selector_Flag = value 238 | 239 | def set_USE_ONNX(self, value): 240 | Hyperparameters.USE_ONNX = value 241 | 242 | def set_ONNX_PROVIDERS(self, value): 243 | Hyperparameters.ONNX_PROVIDERS = value 244 | 245 | def set_ONNX_ORT_ENABLE_BASIC(self, value): 246 | Hyperparameters.ORT_ENABLE_BASIC = value 247 | 248 | def set_profile(self, profile): 249 | sound_devices = sd.query_devices() 250 | if type(profile.device.input_device1) == str: 251 | self.set_input_device_1(sound_devices.index(sd.query_devices(profile.device.input_device1, 'input'))) 252 | else: 253 | self.set_input_device_1(profile.device.input_device1) 254 | 255 | if type(profile.device.input_device2) == str: 256 | self.set_input_device_2(sound_devices.index(sd.query_devices(profile.device.input_device2, 'input'))) 257 | else: 258 | self.set_input_device_2(profile.device.input_device2) 259 | 260 | if type(profile.device.output_device) == str: 261 | self.set_output_device_1(sound_devices.index(sd.query_devices(profile.device.output_device, 'output'))) 262 | else: 263 | self.set_output_device_1(profile.device.output_device) 264 | 265 | self.set_config_path(profile.path.json) 266 | self.set_model_path(profile.path.model) 267 | self.set_NOISE_FILE(profile.path.noise) 268 | 
self.set_CORRESPONDENCE_PATH(profile.path.correspondence) 269 | self.set_FLAME_LENGTH(profile.vc_conf.frame_length) 270 | self.set_SOURCE_ID(profile.vc_conf.source_id) 271 | self.set_TARGET_ID(profile.vc_conf.target_id) 272 | self.set_F0_SCALE(profile.vc_conf.f0_scale) 273 | self.set_MIC_SCALE(profile.vc_conf.mic_scale) 274 | self.set_OVERLAP(profile.vc_conf.overlap) 275 | self.set_USE_NR(profile.others.use_nr) 276 | self.set_VOICE_LIST(profile.others.voice_list) 277 | self.set_VOICE_LABEL(profile.others.voice_label) 278 | self.set_VOICE_F0(profile.others.voice_f0) 279 | self.set_DELAY_FLAMES(profile.vc_conf.delay_flames) 280 | self.set_DISPOSE_STFT_SPECS(profile.vc_conf.dispose_stft_specs) 281 | self.set_DISPOSE_CONV1D_SPECS(profile.vc_conf.dispose_conv1d_specs) 282 | if hasattr(profile.others, "input_filename"): 283 | self.set_INPUT_FILENAME(profile.others.input_filename) 284 | if hasattr(profile.others, "output_filename"): 285 | self.set_OUTPUT_FILENAME(profile.others.output_filename) 286 | self.set_GPU_ID(profile.device.gpu_id) 287 | self.set_Voice_Selector(profile.others.voice_selector) 288 | if hasattr(profile.vc_conf, "onnx"): 289 | self.set_USE_ONNX(profile.vc_conf.onnx.use_onnx) 290 | self.set_ONNX_PROVIDERS(profile.vc_conf.onnx.onnx_providers) 291 | if hasattr(profile.vc_conf.onnx, "ort_enable_basic"): 292 | self.set_ONNX_ORT_ENABLE_BASIC(profile.vc_conf.onnx.ort_enable_basic) 293 | else: 294 | self.set_ONNX_ORT_ENABLE_BASIC(False) 295 | 296 | def launch_model(self): 297 | if self.hps.model.use_mel_train: 298 | channels = self.hps.data.n_mel_channels 299 | else: 300 | channels = self.hps.data.filter_length // 2 + 1 301 | 302 | net_g = SynthesizerTrn( 303 | spec_channels = channels, 304 | segment_size = self.hps.train.segment_size // self.hps.data.hop_length, 305 | inter_channels = self.hps.model.inter_channels, 306 | hidden_channels = self.hps.model.hidden_channels, 307 | upsample_rates = self.hps.model.upsample_rates, 308 | upsample_initial_channel = self.hps.model.upsample_initial_channel, 309 | upsample_kernel_sizes = self.hps.model.upsample_kernel_sizes, 310 | n_flow = self.hps.model.n_flow, 311 | dec_out_channels=1, 312 | dec_kernel_size=7, 313 | n_speakers = self.hps.data.n_speakers, 314 | gin_channels = self.hps.model.gin_channels, 315 | requires_grad_pe = self.hps.requires_grad.pe, 316 | requires_grad_flow = self.hps.requires_grad.flow, 317 | requires_grad_text_enc = self.hps.requires_grad.text_enc, 318 | requires_grad_dec = self.hps.requires_grad.dec, 319 | requires_grad_emb_g = self.hps.requires_grad.emb_g, 320 | sample_rate = self.hps.data.sampling_rate, 321 | hop_size = self.hps.data.hop_length, 322 | sine_amp = self.hps.data.sine_amp, 323 | noise_amp = self.hps.data.noise_amp, 324 | signal_types = self.hps.data.signal_types, 325 | dense_factors = self.hps.data.dense_factors, 326 | upsample_scales = self.hps.model.upsample_rates, 327 | ) 328 | _ = net_g.eval() 329 | 330 | return net_g 331 | 332 | #f0からcf0を推定する 333 | def convert_continuos_f0(self, f0, f0_size): 334 | """Convert F0 to continuous F0 335 | 336 | Args: 337 | f0 (ndarray): original f0 sequence with the shape (T) 338 | 339 | Return: 340 | (ndarray): continuous f0 with the shape (T) 341 | 342 | """ 343 | # get start and end of f0 344 | if (f0 == 0).all(): 345 | return np.zeros((f0_size,)) 346 | start_f0 = f0[f0 != 0][0] 347 | end_f0 = f0[f0 != 0][-1] 348 | # padding start and end of f0 sequence 349 | cf0 = f0 350 | start_idx = np.where(cf0 == start_f0)[0][0] 351 | end_idx = np.where(cf0 == end_f0)[0][-1] 352 
| cf0[:start_idx] = start_f0 353 | cf0[end_idx:] = end_f0 354 | # get non-zero frame index 355 | nz_frames = np.where(cf0 != 0)[0] 356 | # perform linear interpolation 357 | f = interp1d(nz_frames, cf0[nz_frames], bounds_error=False, fill_value=0.0) 358 | return f(np.arange(0, f0_size)) 359 | 360 | def audio_trans(self, tdbm, input, net_g, noise_data, target_id, f0_scale, dispose_stft_specs, dispose_conv1d_specs, ort_session=None): 361 | gpu_id = Hyperparameters.GPU_ID 362 | mic_scale = Hyperparameters.MIC_SCALE 363 | hop_length = Hyperparameters.HOP_LENGTH 364 | delay_frames = Hyperparameters.DELAY_FLAMES 365 | overlap_length = Hyperparameters.OVERLAP 366 | dispose_conv1d_length = dispose_conv1d_specs * hop_length 367 | dispose_specs = dispose_stft_specs * 2 + dispose_conv1d_specs * 2 368 | dispose_length = dispose_specs * hop_length 369 | fixed_length = (delay_frames + dispose_length + overlap_length) // hop_length - dispose_stft_specs * 2 370 | 371 | # byte => torch 372 | signal = np.frombuffer(input, dtype='int16') 373 | #signal = torch.frombuffer(input, dtype=torch.float32) 374 | signal = signal * mic_scale / Hyperparameters.MAX_WAV_VALUE 375 | #F0推定テスト 5.5が奇跡的にぴったり 376 | _f0, _time = pw.dio(signal, Hyperparameters.SAMPLE_RATE,frame_period = 5.5) # 基本周波数の抽出 377 | f0 = pw.stonemask(signal, _f0, _time, Hyperparameters.SAMPLE_RATE) # 基本周波数の修正 378 | f0 = self.convert_continuos_f0(f0, int(signal.shape[0] / hop_length)) 379 | f0 = torch.from_numpy(f0.astype(np.float32)) 380 | 381 | if Hyperparameters.USE_NR: 382 | signal = nr.reduce_noise(y=signal, sr=Hyperparameters.SAMPLE_RATE, y_noise = noise_data, n_std_thresh_stationary=2.5,stationary=True) 383 | # any to many への取り組み(失敗) 384 | # f0を変えるだけでは枯れた声は直らなかった 385 | #f0trans = Shifter(Hyperparameters.SAMPLE_RATE, 1.75, frame_ms=20, shift_ms=10) 386 | #transformed = f0trans.transform(signal) 387 | signal = torch.from_numpy(signal.astype(np.float32)).clone() 388 | 389 | #voice conversion 390 | with torch.no_grad(): 391 | #SID 392 | trans_length = signal.size()[0] 393 | spec, sid = tdbm.get_audio_text_speaker_pair(signal.view(1, trans_length), Hyperparameters.SOURCE_ID) 394 | if dispose_stft_specs != 0: 395 | # specの頭と終がstft paddingの影響受けるので2コマを削る 396 | # wavもspecで削るぶんと同じだけ頭256と終256を削る 397 | spec = spec[:, dispose_stft_specs:-dispose_stft_specs] 398 | f0 = f0[dispose_stft_specs:-dispose_stft_specs] 399 | sid_src = sid 400 | sid_target = torch.LongTensor([target_id]) # 話者IDはJVSの番号を100で割った余りです 401 | spec = spec.unsqueeze(0) 402 | spec_lengths = torch.tensor([spec.size(2)]) 403 | f0 = (f0 * f0_scale).unsqueeze(0).unsqueeze(0) 404 | if Hyperparameters.USE_ONNX: 405 | if spec_lengths.numpy() != fixed_length: # 固定長に足りない場合は0パディング 406 | spec_padding_size = (1, spec.size(1), fixed_length - spec.size(2)) 407 | spec_zero_padding = torch.zeros(spec_padding_size) 408 | spec = torch.cat([spec, spec_zero_padding], dim=2) 409 | f0_padding_size = (1, 1, fixed_length - f0.size(2)) 410 | f0_zero_padding = torch.zeros(f0_padding_size) 411 | f0 = torch.cat([f0, f0_zero_padding], dim=2) 412 | spec_lengths = torch.tensor([spec.size(2)]) 413 | sin, d = net_g.make_sin_d(f0) 414 | (d0, d1, d2, d3) = d 415 | audio = ort_session.run( 416 | ["audio"], 417 | { 418 | "specs": spec.numpy(), 419 | "lengths": spec_lengths.numpy(), 420 | "sin": sin.numpy(), 421 | "d0": d0.numpy(), 422 | "d1": d1.numpy(), 423 | "d2": d2.numpy(), 424 | "d3": d3.numpy(), 425 | "sid_src": sid_src.numpy(), 426 | "sid_tgt": sid_target.numpy() 427 | })[0][0,0] 428 | else: 429 | if gpu_id >= 0: 430 | #spec, 
spec_lengths, sid_src, sin, d = [x.cuda(gpu_id) for x in data] 431 | spec = spec.cuda(gpu_id) 432 | spec_lengths = spec_lengths.cuda(gpu_id) 433 | sid_src = sid_src.cuda(gpu_id) 434 | sid_target = sid_target.cuda(gpu_id) # 話者IDはJVSの番号を100で割った余りです 435 | f0 = f0.cuda(gpu_id) 436 | audio = net_g.cuda(gpu_id).voice_conversion(spec, spec_lengths, f0, sid_src, sid_target)[0,0].data.cpu().float().numpy() 437 | else: 438 | audio = net_g.voice_conversion(spec, spec_lengths, f0, sid_src, sid_target)[0,0].data.cpu().float().numpy() 439 | 440 | if dispose_conv1d_specs != 0: 441 | # 出力されたwavでconv1d paddingの影響受けるところを削る 442 | audio = audio[dispose_conv1d_length:-dispose_conv1d_length] 443 | audio = audio * Hyperparameters.MAX_WAV_VALUE 444 | audio = audio.astype(np.int16).tobytes() 445 | 446 | return audio 447 | 448 | def overlap_merge(self, now_wav, prev_wav, overlap_length): 449 | """ 450 | 生成したwavデータを前回生成したwavデータとoverlap_lengthだけ重ねてグラデーション的にマージします 451 | 終端のoverlap_lengthぶんは次回マージしてから再生するので削除します 452 | 453 | Parameters 454 | ---------- 455 | now_wav: 今回生成した音声wavデータ 456 | prev_wav: 前回生成した音声wavデータ 457 | overlap_length: 重ねる長さ 458 | """ 459 | if overlap_length == 0: 460 | return now_wav 461 | gradation = np.arange(overlap_length) / overlap_length 462 | now = np.frombuffer(now_wav, dtype='int16') 463 | prev = np.frombuffer(prev_wav, dtype='int16') 464 | now_head = now[:overlap_length] 465 | prev_tail = prev[-overlap_length:] 466 | merged = prev_tail * (np.cos(gradation * np.pi * 0.5) ** 2) + now_head * (np.cos((1-gradation) * np.pi * 0.5) ** 2) 467 | #merged = prev_tail * (1 - gradation) + now_head * gradation 468 | overlapped = np.append(merged, now[overlap_length:-overlap_length]) 469 | signal = np.round(overlapped, decimals=0) 470 | signal = signal.astype(np.int16).tobytes() 471 | return signal 472 | 473 | def vc_run(self): 474 | audio = pyaudio.PyAudio() 475 | print("モデルを読み込んでいます。少々お待ちください。") 476 | net_g = self.launch_model() 477 | ort_session = None 478 | if Hyperparameters.USE_ONNX : 479 | # DirectMLで動かすための設定 480 | ort_options = ort.SessionOptions() 481 | ort_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL 482 | ort_options.enable_mem_pattern = False 483 | if Hyperparameters.ORT_ENABLE_BASIC: 484 | ort_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC # https://kazuhito00.hatenablog.com/entry/2022/10/13/133248 485 | #ort_options.enable_profiling = True 486 | ort_session = ort.InferenceSession( 487 | Hyperparameters.MODEL_PATH, 488 | sess_options=ort_options, 489 | providers=Hyperparameters.ONNX_PROVIDERS) 490 | else: 491 | _ = load_checkpoint(Hyperparameters.MODEL_PATH, net_g, None) 492 | 493 | print("モデルの読み込みが完了しました。音声の入出力の準備を行います。少々お待ちください。") 494 | tdbm = Transform_Data_By_Model() 495 | 496 | if Hyperparameters.USE_NR: 497 | noise_data, noise_rate = sf.read(Hyperparameters.NOISE_FILE) 498 | else: 499 | noise_data = 0 500 | 501 | # audio stream voice 502 | #マイク 503 | audio_input_stream = audio.open(format=Hyperparameters.FORMAT, 504 | channels=1, 505 | rate=Hyperparameters.SAMPLE_RATE, 506 | frames_per_buffer=Hyperparameters.DELAY_FLAMES, 507 | input_device_index=Hyperparameters.INPUT_DEVICE_1, 508 | input=True) 509 | 510 | #Realtek Digital Output 511 | audio_output_stream = audio.open(format=Hyperparameters.FORMAT, 512 | channels=1, 513 | rate=Hyperparameters.SAMPLE_RATE, 514 | frames_per_buffer=Hyperparameters.DELAY_FLAMES, 515 | output_device_index = Hyperparameters.OUTPUT_DEVICE_1, 516 | output=True) 517 | 518 | # テストファイル入出力のモックアップ 519 | mock_stream = 
MockStream(Hyperparameters.SAMPLE_RATE) 520 | if Hyperparameters.INPUT_FILENAME != None: 521 | mock_stream.open_inputfile(Hyperparameters.INPUT_FILENAME) 522 | audio_input_stream = mock_stream 523 | if Hyperparameters.OUTPUT_FILENAME != None: 524 | mock_stream.open_outputfile(Hyperparameters.OUTPUT_FILENAME) 525 | audio_output_stream = mock_stream 526 | 527 | #CABLE Output 528 | if Hyperparameters.INPUT_DEVICE_2 != False: 529 | back_audio_input_stream = audio.open(format=Hyperparameters.FORMAT, 530 | channels=1, 531 | rate=Hyperparameters.SAMPLE_RATE, 532 | frames_per_buffer=Hyperparameters.DELAY_FLAMES, 533 | input_device_index=Hyperparameters.INPUT_DEVICE_2, 534 | input=True) 535 | else: 536 | back_audio_input_stream = audio.open(format=Hyperparameters.FORMAT, 537 | channels=1, 538 | rate=Hyperparameters.SAMPLE_RATE, 539 | frames_per_buffer=Hyperparameters.DELAY_FLAMES, 540 | input_device_index=Hyperparameters.INPUT_DEVICE_1, 541 | input=True) 542 | 543 | #Realtek Digital Output 544 | back_audio_output_stream = audio.open(format=Hyperparameters.FORMAT, 545 | channels=1, 546 | rate=Hyperparameters.SAMPLE_RATE, 547 | frames_per_buffer=Hyperparameters.DELAY_FLAMES, 548 | output_device_index = Hyperparameters.OUTPUT_DEVICE_1, 549 | output=True) 550 | 551 | with_bgm = (Hyperparameters.INPUT_DEVICE_2 != False) 552 | with_voice_selector = (Hyperparameters.INPUT_FILENAME == None) # 入力ファイルがない場合は音声選択ウィンドウあり 553 | voice_selector_flag = Hyperparameters.Voice_Selector_Flag # 音声選択ウィンドウの有無 554 | delay_frames = Hyperparameters.DELAY_FLAMES 555 | overlap_length = Hyperparameters.OVERLAP 556 | source_id = Hyperparameters.SOURCE_ID 557 | target_id = Hyperparameters.TARGET_ID 558 | target_f0_scale = 1.0 559 | f0_scale = Hyperparameters.F0_SCALE 560 | wav_bytes = 2 # 1音声データあたりのデータサイズ(2bytes) (math.log2(max_wav_value)+1)/8 で算出してもよいけど 561 | hop_length = Hyperparameters.HOP_LENGTH 562 | dispose_stft_specs = Hyperparameters.DISPOSE_STFT_SPECS 563 | dispose_conv1d_specs = Hyperparameters.DISPOSE_CONV1D_SPECS 564 | dispose_specs = dispose_stft_specs * 2 + dispose_conv1d_specs * 2 565 | dispose_length = dispose_specs * hop_length 566 | assert delay_frames >= dispose_length + overlap_length, "delay_frames have to be larger than dispose_length + overlap_length" 567 | 568 | #第一節を取得する 569 | try: 570 | print("準備が完了しました。VC開始します。") 571 | if with_voice_selector and voice_selector_flag: 572 | voice_selector = VoiceSelector() 573 | voice_selector.open_window() 574 | 575 | # in_wav: delay_frames * wav_bytes = 4096 * 2 = 8192 576 | # prev_wav_tail: (dispose_length + overlap_length) * wav_bytes = (1536 + 128) * 2 = 3328 577 | # prev_trans_wav: (delay_frames + overlap_length) * wav_bytes = (4096 + 128) * 2 = 8448 578 | prev_wav_tail = bytes((dispose_length + overlap_length) * wav_bytes) 579 | prev_trans_wav = bytes((delay_frames + overlap_length) * wav_bytes) 580 | #prev_wav_tail = bytes(0) 581 | #in_wav = prev_wav_tail + audio_input_stream.read(delay_frames, exception_on_overflow=False) 582 | #trans_wav = self.audio_trans(tdbm, in_wav, net_g, noise_data, target_id, 0, 0, ort_session=ort_session) # 遅延減らすため初回だけpadding対策使わない 583 | #overlapped_wav = trans_wav 584 | #prev_trans_wav = trans_wav 585 | #if dispose_length + overlap_length != 0: 586 | # prev_wav_tail = in_wav[-((dispose_length + overlap_length) * wav_bytes):] # 次回の頭のデータとして終端データを保持する 587 | #if with_bgm: 588 | # back_in_raw = back_audio_input_stream.read(delay_frames, exception_on_overflow = False) # 背景BGMを取得 589 | while True: 590 | f0_factor = tdbm.get_f0_scale(source_id, 
target_id) * f0_scale * target_f0_scale 591 | in_wav = prev_wav_tail + audio_input_stream.read(delay_frames, exception_on_overflow=False) 592 | trans_wav = self.audio_trans(tdbm, in_wav, net_g, noise_data, target_id, f0_factor, dispose_stft_specs, dispose_conv1d_specs, ort_session=ort_session) 593 | overlapped_wav = self.overlap_merge(trans_wav, prev_trans_wav, overlap_length) 594 | audio_output_stream.write(overlapped_wav) 595 | prev_trans_wav = trans_wav 596 | if dispose_length + overlap_length != 0: 597 | prev_wav_tail = in_wav[-((dispose_length + overlap_length) * wav_bytes):] # 今回の終端の捨てデータぶんだけ次回の頭のデータとして保持する 598 | if with_bgm: 599 | back_in_raw = back_audio_input_stream.read(delay_frames, exception_on_overflow=False) # 背景BGMを取得 600 | back_audio_output_stream.write(back_in_raw) 601 | 602 | if with_voice_selector and voice_selector_flag: 603 | target_id = voice_selector.voice_select_id 604 | target_f0_scale = voice_selector.voice_select_f0 605 | voice_selector.update_window() 606 | 607 | if Hyperparameters.VC_END_FLAG: #エスケープ 608 | print("vc_finish") 609 | break 610 | 611 | except KeyboardInterrupt: 612 | audio_input_stream.stop_stream() 613 | audio_input_stream.close() 614 | audio_output_stream.stop_stream() 615 | audio_output_stream.close() 616 | back_audio_input_stream.stop_stream() 617 | back_audio_input_stream.close() 618 | back_audio_output_stream.stop_stream() 619 | back_audio_output_stream.close() 620 | audio.terminate() 621 | #prof_file = ort_session.end_profiling() 622 | #print(prof_file) 623 | print("Stop Streaming") 624 | 625 | if with_voice_selector and voice_selector_flag: 626 | voice_selector.close_window() 627 | 628 | class Transform_Data_By_Model(): 629 | hann_window = {} 630 | FILTER_LENGTH = 0 631 | HOP_LENGTH = 0 632 | SAMPLE_RATE = 0 633 | HPS = None 634 | CONFIG = None 635 | correspondence_dict = None 636 | 637 | def __init__(self): 638 | self.G_HP = Hyperparameters() 639 | self.HPS = get_hparams_from_file(self.G_HP.CONFIG_JSON_PATH) 640 | self.correspondence_dict = read_correspondence_file(self.G_HP.CORRESPONDENCE_PATH) 641 | #define samplerate 642 | self.SAMPLE_RATE =self.HPS.data.sampling_rate 643 | #define filter size 644 | self.FILTER_LENGTH = self.HPS.data.filter_length 645 | self.HOP_LENGTH = self.HPS.data.hop_length 646 | 647 | def spectrogram_torch(self, y, n_fft, sampling_rate, hop_size, win_size, center=False): 648 | if torch.min(y) < -1.: 649 | print('min value is ', torch.min(y)) 650 | if torch.max(y) > 1.: 651 | print('max value is ', torch.max(y)) 652 | 653 | dtype_device = str(y.dtype) + '_' + str(y.device) 654 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 655 | if wnsize_dtype_device not in self.hann_window: 656 | self.hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 657 | 658 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 659 | y = y.squeeze(1) 660 | 661 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=self.hann_window[wnsize_dtype_device], 662 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) 663 | spec = torch.view_as_real(spec) 664 | 665 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 666 | return spec 667 | 668 | def get_audio_text_speaker_pair(self, wav, sid): 669 | spec = self.get_spec(wav) 670 | sid = self.get_sid(sid) 671 | return (spec, sid) 672 | 673 | def get_spec(self, audio_norm): 674 | filter_length = self.FILTER_LENGTH 675 | 
676 |         hop_length = self.HOP_LENGTH
677 |         win_length = self.FILTER_LENGTH
678 |         spec = self.spectrogram_torch(audio_norm, filter_length,
679 |                                       sampling_rate, hop_length, win_length,
680 |                                       center=False)
681 |         spec = torch.squeeze(spec, 0)
682 |         return spec
683 |
684 |     def get_text(self, text):
685 |         return text
686 |
687 |     def get_sid(self, sid):
688 |         sid = torch.LongTensor([int(sid)])
689 |         return sid
690 |
691 |     def get_f0_scale(self, sid_src, sid_target):
692 |         src_f0 = self.correspondence_dict[int(sid_src)]
693 |         target_f0 = self.correspondence_dict[int(sid_target)]
694 |         f0_scale = target_f0 / src_f0
695 |         return torch.FloatTensor([f0_scale])
696 |
697 | class MockStream:
698 |     """
699 |     Mock that replaces the streaming audio input/output with plain file input/output.
700 |     """
701 |     def __init__(self, sampling_rate):
702 |         self.sampling_rate = sampling_rate
703 |         self.start_count = 2
704 |         self.end_count = 2
705 |         self.fr = None
706 |         self.fw = None
707 |
708 |     def open_inputfile(self, input_filename):
709 |         self.fr = wave.open(input_filename, 'rb')
710 |
711 |     def open_outputfile(self, output_filename):
712 |         self.fw = wave.open(output_filename, 'wb')
713 |         self.fw.setnchannels(1)
714 |         self.fw.setsampwidth(2)
715 |         self.fw.setframerate(self.sampling_rate)
716 |
717 |     def read(self, length, exception_on_overflow=False):
718 |         if self.start_count > 0:
719 |             wav = bytes(length * 2)
720 |             self.start_count -= 1 # send dummy empty data for the first two reads
721 |         else:
722 |             wav = self.fr.readframes(length)
723 |             if len(wav) <= 0: # once the file is exhausted, send dummy empty data for the last two reads
724 |                 wav = bytes(length * 2)
725 |                 self.end_count -= 1
726 |                 if self.end_count < 0:
727 |                     Hyperparameters.VC_END_FLAG = True
728 |         return wav
729 |
730 |     def write(self, wav):
731 |         self.fw.writeframes(wav)
732 |
733 |     def stop_stream(self):
734 |         pass
735 |
736 |     def close(self):
737 |         if self.fr != None:
738 |             self.fr.close()
739 |             self.fr = None
740 |         if self.fw != None:
741 |             self.fw.close()
742 |             self.fw = None
743 |
744 | class VoiceSelector():
745 |     def get_closure(self, button, id, f0):
746 |
747 |         def on_click(event):
748 |             button.config(fg="red")
749 |             self.selected_button.config(fg="black")
750 |             self.selected_button = button
751 |             self.voice_select_id = id
752 |             self.voice_select_f0 = f0
753 |             #print(f"voice select id: {id}")
754 |
755 |         return on_click
756 |
757 |     def open_window(self):
758 |         self.voice_ids = Hyperparameters.VOICE_LIST
759 |         self.voice_labels = Hyperparameters.VOICE_LABEL
760 |         self.voice_f0s = Hyperparameters.VOICE_F0
761 |
762 |         self.root_win = tk.Tk()
763 |         height = int(len(self.voice_ids) * 30)
764 |         self.root_win.geometry(f"200x{height}")
765 |         self.root_win.title("MMVC Client")
766 |         self.root_win.protocol("WM_DELETE_WINDOW", self.close_window)
767 |
768 |         self.button_list = []
769 |         self.selected_button = None
770 |         self.voice_select_id = self.voice_ids[0]
771 |         self.voice_select_f0 = self.voice_f0s[0]
772 |
773 |         for voice_id, voice_label, voice_f0 in zip(self.voice_ids, self.voice_labels, self.voice_f0s):
774 |             button = tk.Button(self.root_win, text=f"{voice_label}")
775 |             if voice_id == self.voice_select_id:
776 |                 button.config(fg="red")
777 |                 self.selected_button = button
778 |             button_on_click = self.get_closure(button, voice_id, voice_f0)
779 |             button.bind("<Button-1>", button_on_click) # bind left mouse click
780 |             button.pack()
781 |             self.button_list.append(button)
782 |
783 |     def update_window(self):
784 |         self.root_win.update()
785 |
786 |     def close_window(self):
787 |         if self.root_win != None:
788 |             self.root_win.destroy()
789 |             self.root_win = None
790 |         Hyperparameters.VC_END_FLAG = True
791 |
792 | class VCPrifile():
793 |     def __init__(self, **kwargs):
794 |         for k, v in kwargs.items():
795 |             if type(v) == dict:
796 |                 v = VCPrifile(**v)
797 |             self[k] = v
798 |
799 |     def keys(self):
800 |         return self.__dict__.keys()
801 |
802 |     def items(self):
803 |         return self.__dict__.items()
804 |
805 |     def values(self):
806 |         return self.__dict__.values()
807 |
808 |     def __len__(self):
809 |         return len(self.__dict__)
810 |
811 |     def __getitem__(self, key):
812 |         return getattr(self, key)
813 |
814 |     def __setitem__(self, key, value):
815 |         return setattr(self, key, value)
816 |
817 |     def __contains__(self, key):
818 |         return key in self.__dict__
819 |
820 |     def __repr__(self):
821 |         return self.__dict__.__repr__()
822 |
823 | def config_get(conf):
824 |     config_path = conf
825 |     with open(config_path, "r", encoding="utf-8") as f:
826 |         data = f.read()
827 |     config = json.loads(data)
828 |     hparams = VCPrifile(**config)
829 |     return hparams
830 |
831 | if __name__ == '__main__':
832 |     try: #add
833 |         args = sys.argv
834 |         if len(args) < 2:
835 |             end_counter = 0
836 |             while True: # infinite loop
837 |                 tkroot = tk.Tk()
838 |                 tkroot.withdraw()
839 |                 print('Please select myprofile.conf')
840 |                 typ = [('json file', '*.conf')]
841 |                 dir = './'
842 |                 profile_path = filedialog.askopenfilename(filetypes = typ, initialdir = dir)
843 |                 tkroot.destroy()
844 |                 try:
845 |                     if profile_path:
846 |                         break
847 |                     else:
848 |                         print('The file does not exist')
849 |                         end_counter = end_counter + 1
850 |                         print(end_counter)
851 |                         if end_counter > 3:
852 |                             break
853 |                         continue
854 |
855 |                 except ValueError:
856 |                     # handle the ValueError exception
857 |                     print('Please enter a path.')
858 |                     continue
859 |         else:
860 |             profile_path = args[1]
861 |             print("The path to myprofile.conf was specified at startup.")
862 |             print(profile_path)
863 |
864 |         params = config_get(profile_path)
865 |         vc_main = Hyperparameters()
866 |
867 |         print(params.path.json)
868 |         vc_main.set_profile(params)
869 |         vc_main.vc_run()
870 |
871 |     except Exception as e:
872 |         print('An error occurred.')
873 |         print(e)
874 |         os.system('PAUSE')
875 |
--------------------------------------------------------------------------------
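The streaming loop in vc_run above converts audio in fixed chunks of delay_frames samples, discards dispose_length samples of padding artifacts at each chunk edge, and blends overlap_length samples between consecutive converted chunks before writing them out. The repository's own overlap_merge is defined earlier in mmvc_client.py and is not reproduced in this excerpt; the snippet below is only a minimal sketch of that overlap-add step, assuming 16-bit mono PCM and a linear cross-fade (the name overlap_merge_sketch and the fade shape are illustrative assumptions, not the project's code).

import numpy as np

def overlap_merge_sketch(now_wav: bytes, prev_wav: bytes, overlap_length: int) -> bytes:
    # Cross-fade the first overlap_length samples of the current chunk
    # with the last overlap_length samples of the previous chunk.
    if overlap_length == 0:
        return now_wav
    now = np.frombuffer(now_wav, dtype=np.int16).astype(np.float32)
    prev = np.frombuffer(prev_wav, dtype=np.int16).astype(np.float32)
    fade_in = np.linspace(0.0, 1.0, overlap_length, dtype=np.float32)
    head = now[:overlap_length] * fade_in + prev[-overlap_length:] * (1.0 - fade_in)
    merged = np.concatenate([head, now[overlap_length:]])
    return np.clip(merged, -32768, 32767).astype(np.int16).tobytes()

With the sizes noted in the loop's comments (delay_frames = 4096, overlap_length = 128, dispose_length = 1536, 2 bytes per sample), each converted chunk is 4224 samples (8448 bytes), of which the first 128 samples are blended with the tail of the previous chunk before the result is written to the output stream.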