├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── attentions.py ├── audio_processing.py ├── commons.py ├── configs └── config.json ├── data_utils.py ├── diffusion.py ├── extract_f0_mel.py ├── extract_vec.py ├── f0_extractor ├── F0Predictor.py ├── FCPEF0Predictor.py ├── __init__.py └── fcpe │ ├── __init__.py │ ├── model.py │ ├── nvSTFT.py │ └── pcmer.py ├── feature_extractor ├── __init__.py └── contentvec768.py ├── filelists └── .gitkeep ├── hifigan ├── __init__.py ├── modules │ ├── hifigan │ │ ├── hifigan.py │ │ └── mel_utils.py │ ├── nsf_hifigan │ │ ├── env.py │ │ ├── models.py │ │ ├── nvSTFT.py │ │ └── utils.py │ └── parallel_wavegan │ │ ├── __init__.py │ │ ├── layers │ │ ├── __init__.py │ │ ├── causal_conv.py │ │ ├── pqmf.py │ │ ├── residual_block.py │ │ ├── residual_stack.py │ │ ├── tf_layers.py │ │ └── upsample.py │ │ ├── losses │ │ ├── __init__.py │ │ └── stft_loss.py │ │ ├── models │ │ ├── __init__.py │ │ ├── melgan.py │ │ ├── parallel_wavegan.py │ │ └── source.py │ │ ├── optimizers │ │ ├── __init__.py │ │ └── radam.py │ │ ├── stft_loss.py │ │ └── utils │ │ ├── __init__.py │ │ └── utils.py └── network │ └── vocoders │ ├── __init__.py │ ├── base_vocoder.py │ ├── hifigan.py │ ├── nsf_hifigan.py │ ├── pwg.py │ └── vocoder_utils.py ├── mel_processing.py ├── models.py ├── modules.py ├── preprocess_flist_config.py ├── pretrain ├── content-vec-best │ └── config.json ├── fcpe │ └── .gitkeep └── nsf-hifigan │ ├── config.json │ └── put_441hifigan_ckpt_here ├── resample.py ├── stft.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | DATASETS 2 | DUMMY 3 | DUMMY2 4 | samples 5 | logs 6 | __pycache__ 7 | .ipynb_checkpoints 8 | .*.swp 9 | 10 | build 11 | *.c 12 | monotonic_align/monotonic_align 13 | vocos/pytorch_model.bin 14 | dataset 15 | hifigan/model -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "waveglow"] 2 | path = waveglow 3 | url = https://github.com/NVIDIA/waveglow.git 4 | [submodule "hifigan"] 5 | path = hifigan 6 | url = https://github.com/jik876/hifi-gan.git 7 | [submodule "hifi-gan"] 8 | path = hifi-gan 9 | url = https://github.com/jik876/hifi-gan.git 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jaehyeon Kim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Glow-SVC 2 | 3 | another implementation by rcell, based on the official [glow-tts](https://github.com/jaywalnut310/glow-tts) repo 4 | + content-vec + fcpe(f0) -> glowtts -> nsf-hifigan -> wav 5 | + plenty of pitfalls remain 6 | + fp16 training blows up 7 | 8 | pretrain: 9 | + [content-vec-best](https://huggingface.co/lengyue233/content-vec-best/resolve/main/pytorch_model.bin) 10 | + [fcpe](https://huggingface.co/datasets/ylzz1997/rmvpe_pretrain_model/resolve/main/fcpe.pt) 11 | + [nsf-hifigan](https://github.com/openvpi/vocoders/releases/download/nsf-hifigan-v1/nsf_hifigan_20221211.zip) 12 | 13 | 14 | preprocess: 15 | + [resample.py](resample.py) -> [preprocess_flist_config.py](preprocess_flist_config.py) 16 | -> [extract_vec.py](extract_vec.py) -> [extract_f0_mel.py](extract_f0_mel.py) 17 | 18 | train: 19 | + python train.py -c configs/config.json -m model_name 20 | 21 | infer: 22 | + not available yet -------------------------------------------------------------------------------- /attentions.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | import commons 9 | import modules 10 | from modules import LayerNorm 11 | 12 | 13 | class Encoder(nn.Module): 14 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=None, block_length=None, **kwargs): 15 | super().__init__() 16 | self.hidden_channels = hidden_channels 17 | self.filter_channels = filter_channels 18 | self.n_heads = n_heads 19 | self.n_layers = n_layers 20 | self.kernel_size = kernel_size 21 | self.p_dropout = p_dropout 22 | self.window_size = window_size 23 | self.block_length = block_length 24 | 25 | self.drop = nn.Dropout(p_dropout) 26 | self.attn_layers = nn.ModuleList() 27 | self.norm_layers_1 = nn.ModuleList() 28 | self.ffn_layers = nn.ModuleList() 29 | self.norm_layers_2 = nn.ModuleList() 30 | for i in range(self.n_layers): 31 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, window_size=window_size, p_dropout=p_dropout, block_length=block_length)) 32 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 33 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) 34 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 35 | 36 | def forward(self, x, x_mask): 37 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 38 | for i in range(self.n_layers): 39 | x = x * x_mask 40 | y = self.attn_layers[i](x, x, attn_mask) 41 | y = self.drop(y) 42 | x = self.norm_layers_1[i](x + y) 43 | 44 | y = self.ffn_layers[i](x, x_mask) 45 | y = self.drop(y) 46 | x = self.norm_layers_2[i](x + y) 47 | x = x * x_mask 48 | return x 49 | 50 | 51 | class CouplingBlock(nn.Module): 52 | def __init__(self, in_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0, sigmoid_scale=False): 53 | super().__init__() 54 | self.in_channels = in_channels 55 | self.hidden_channels = 
hidden_channels 56 | self.kernel_size = kernel_size 57 | self.dilation_rate = dilation_rate 58 | self.n_layers = n_layers 59 | self.gin_channels = gin_channels 60 | self.p_dropout = p_dropout 61 | self.sigmoid_scale = sigmoid_scale 62 | 63 | start = torch.nn.Conv1d(in_channels//2, hidden_channels, 1) 64 | start = torch.nn.utils.weight_norm(start) 65 | self.start = start 66 | # Initializing last layer to 0 makes the affine coupling layers 67 | # do nothing at first. It helps to stabilze training. 68 | end = torch.nn.Conv1d(hidden_channels, in_channels, 1) 69 | end.weight.data.zero_() 70 | end.bias.data.zero_() 71 | self.end = end 72 | 73 | self.wn = modules.WN(in_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels, p_dropout) 74 | 75 | 76 | def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs): 77 | b, c, t = x.size() 78 | if x_mask is None: 79 | x_mask = 1 80 | x_0, x_1 = x[:,:self.in_channels//2], x[:,self.in_channels//2:] 81 | 82 | x = self.start(x_0) * x_mask 83 | x = self.wn(x, x_mask, g) 84 | out = self.end(x) 85 | 86 | z_0 = x_0 87 | m = out[:, :self.in_channels//2, :] 88 | logs = out[:, self.in_channels//2:, :] 89 | if self.sigmoid_scale: 90 | logs = torch.log(1e-6 + torch.sigmoid(logs + 2)) 91 | 92 | if reverse: 93 | z_1 = (x_1 - m) * torch.exp(-logs) * x_mask 94 | logdet = None 95 | else: 96 | z_1 = (m + torch.exp(logs) * x_1) * x_mask 97 | logdet = torch.sum(logs * x_mask, [1, 2]) 98 | 99 | z = torch.cat([z_0, z_1], 1) 100 | return z, logdet 101 | 102 | def store_inverse(self): 103 | self.wn.remove_weight_norm() 104 | 105 | 106 | class MultiHeadAttention(nn.Module): 107 | def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., block_length=None, proximal_bias=False, proximal_init=False): 108 | super().__init__() 109 | assert channels % n_heads == 0 110 | 111 | self.channels = channels 112 | self.out_channels = out_channels 113 | self.n_heads = n_heads 114 | self.window_size = window_size 115 | self.heads_share = heads_share 116 | self.block_length = block_length 117 | self.proximal_bias = proximal_bias 118 | self.p_dropout = p_dropout 119 | self.attn = None 120 | 121 | self.k_channels = channels // n_heads 122 | self.conv_q = nn.Conv1d(channels, channels, 1) 123 | self.conv_k = nn.Conv1d(channels, channels, 1) 124 | self.conv_v = nn.Conv1d(channels, channels, 1) 125 | if window_size is not None: 126 | n_heads_rel = 1 if heads_share else n_heads 127 | rel_stddev = self.k_channels**-0.5 128 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 129 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 130 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 131 | self.drop = nn.Dropout(p_dropout) 132 | 133 | nn.init.xavier_uniform_(self.conv_q.weight) 134 | nn.init.xavier_uniform_(self.conv_k.weight) 135 | if proximal_init: 136 | self.conv_k.weight.data.copy_(self.conv_q.weight.data) 137 | self.conv_k.bias.data.copy_(self.conv_q.bias.data) 138 | nn.init.xavier_uniform_(self.conv_v.weight) 139 | 140 | def forward(self, x, c, attn_mask=None): 141 | q = self.conv_q(x) 142 | k = self.conv_k(c) 143 | v = self.conv_v(c) 144 | 145 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 146 | 147 | x = self.conv_o(x) 148 | return x 149 | 150 | def attention(self, query, key, value, mask=None): 151 | # reshape [b, d, t] -> [b, n_h, t, d_k] 152 | b, d, t_s, t_t = (*key.size(), query.size(2)) 153 | 
query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 154 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 155 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 156 | 157 | scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels) 158 | if self.window_size is not None: 159 | assert t_s == t_t, "Relative attention is only available for self-attention." 160 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 161 | rel_logits = self._matmul_with_relative_keys(query, key_relative_embeddings) 162 | rel_logits = self._relative_position_to_absolute_position(rel_logits) 163 | scores_local = rel_logits / math.sqrt(self.k_channels) 164 | scores = scores + scores_local 165 | if self.proximal_bias: 166 | assert t_s == t_t, "Proximal bias is only available for self-attention." 167 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) 168 | if mask is not None: 169 | scores = scores.masked_fill(mask == 0, -1e4) 170 | if self.block_length is not None: 171 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) 172 | scores = scores * block_mask + -1e4*(1 - block_mask) 173 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 174 | p_attn = self.drop(p_attn) 175 | output = torch.matmul(p_attn, value) 176 | if self.window_size is not None: 177 | relative_weights = self._absolute_position_to_relative_position(p_attn) 178 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) 179 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) 180 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] 181 | return output, p_attn 182 | 183 | def _matmul_with_relative_values(self, x, y): 184 | """ 185 | x: [b, h, l, m] 186 | y: [h or 1, m, d] 187 | ret: [b, h, l, d] 188 | """ 189 | ret = torch.matmul(x, y.unsqueeze(0)) 190 | return ret 191 | 192 | def _matmul_with_relative_keys(self, x, y): 193 | """ 194 | x: [b, h, l, d] 195 | y: [h or 1, m, d] 196 | ret: [b, h, l, m] 197 | """ 198 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 199 | return ret 200 | 201 | def _get_relative_embeddings(self, relative_embeddings, length): 202 | max_relative_position = 2 * self.window_size + 1 203 | # Pad first before slice to avoid using cond ops. 204 | pad_length = max(length - (self.window_size + 1), 0) 205 | slice_start_position = max((self.window_size + 1) - length, 0) 206 | slice_end_position = slice_start_position + 2 * length - 1 207 | if pad_length > 0: 208 | padded_relative_embeddings = F.pad( 209 | relative_embeddings, 210 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) 211 | else: 212 | padded_relative_embeddings = relative_embeddings 213 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] 214 | return used_relative_embeddings 215 | 216 | def _relative_position_to_absolute_position(self, x): 217 | """ 218 | x: [b, h, l, 2*l-1] 219 | ret: [b, h, l, l] 220 | """ 221 | batch, heads, length, _ = x.size() 222 | # Concat columns of pad to shift from relative to absolute indexing. 223 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) 224 | 225 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 
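# The skew trick: after the extra pad, the flattened tensor is re-padded with (length - 1) zeros so that
# the view as [b, h, length + 1, 2*length - 1] shifts each row by one position; slicing [:, :, :length, length-1:]
# then reads off the absolute-position scores without an explicit gather.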
226 | x_flat = x.view([batch, heads, length * 2 * length]) 227 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) 228 | 229 | # Reshape and slice out the padded elements. 230 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] 231 | return x_final 232 | 233 | def _absolute_position_to_relative_position(self, x): 234 | """ 235 | x: [b, h, l, l] 236 | ret: [b, h, l, 2*l-1] 237 | """ 238 | batch, heads, length, _ = x.size() 239 | # padd along column 240 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) 241 | x_flat = x.view([batch, heads, length**2 + length*(length -1)]) 242 | # add 0's in the beginning that will skew the elements after reshape 243 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 244 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] 245 | return x_final 246 | 247 | def _attention_bias_proximal(self, length): 248 | """Bias for self-attention to encourage attention to close positions. 249 | Args: 250 | length: an integer scalar. 251 | Returns: 252 | a Tensor with shape [1, 1, length, length] 253 | """ 254 | r = torch.arange(length, dtype=torch.float32) 255 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 256 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 257 | 258 | 259 | class FFN(nn.Module): 260 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None): 261 | super().__init__() 262 | self.in_channels = in_channels 263 | self.out_channels = out_channels 264 | self.filter_channels = filter_channels 265 | self.kernel_size = kernel_size 266 | self.p_dropout = p_dropout 267 | self.activation = activation 268 | 269 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) 270 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size, padding=kernel_size//2) 271 | self.drop = nn.Dropout(p_dropout) 272 | 273 | def forward(self, x, x_mask): 274 | x = self.conv_1(x * x_mask) 275 | if self.activation == "gelu": 276 | x = x * torch.sigmoid(1.702 * x) 277 | else: 278 | x = torch.relu(x) 279 | x = self.drop(x) 280 | x = self.conv_2(x * x_mask) 281 | return x * x_mask 282 | 283 | -------------------------------------------------------------------------------- /audio_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.signal import get_window 4 | import librosa.util as librosa_util 5 | 6 | 7 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800, 8 | n_fft=800, dtype=np.float32, norm=None): 9 | """ 10 | # from librosa 0.6 11 | Compute the sum-square envelope of a window function at a given hop length. 12 | 13 | This is used to estimate modulation effects induced by windowing 14 | observations in short-time fourier transforms. 15 | 16 | Parameters 17 | ---------- 18 | window : string, tuple, number, callable, or list-like 19 | Window specification, as in `get_window` 20 | 21 | n_frames : int > 0 22 | The number of analysis frames 23 | 24 | hop_length : int > 0 25 | The number of samples to advance between frames 26 | 27 | win_length : [optional] 28 | The length of the window function. By default, this matches `n_fft`. 29 | 30 | n_fft : int > 0 31 | The length of each analysis frame. 
32 | 33 | dtype : np.dtype 34 | The data type of the output 35 | 36 | Returns 37 | ------- 38 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 39 | The sum-squared envelope of the window function 40 | """ 41 | if win_length is None: 42 | win_length = n_fft 43 | 44 | n = n_fft + hop_length * (n_frames - 1) 45 | x = np.zeros(n, dtype=dtype) 46 | 47 | # Compute the squared window at the desired length 48 | win_sq = get_window(window, win_length, fftbins=True) 49 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 50 | win_sq = librosa_util.pad_center(win_sq, n_fft) 51 | 52 | # Fill the envelope 53 | for i in range(n_frames): 54 | sample = i * hop_length 55 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] 56 | return x 57 | 58 | 59 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 60 | """ 61 | PARAMS 62 | ------ 63 | magnitudes: spectrogram magnitudes 64 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 65 | """ 66 | 67 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 68 | angles = angles.astype(np.float32) 69 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 70 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 71 | 72 | for i in range(n_iters): 73 | _, angles = stft_fn.transform(signal) 74 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 75 | return signal 76 | 77 | 78 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 79 | """ 80 | PARAMS 81 | ------ 82 | C: compression factor 83 | """ 84 | return torch.log(torch.clamp(x, min=clip_val) * C) 85 | 86 | 87 | def dynamic_range_decompression(x, C=1): 88 | """ 89 | PARAMS 90 | ------ 91 | C: compression factor used to compress 92 | """ 93 | return torch.exp(x) / C 94 | -------------------------------------------------------------------------------- /commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | from librosa.filters import mel as librosa_mel_fn 8 | from audio_processing import dynamic_range_compression 9 | from audio_processing import dynamic_range_decompression 10 | from stft import STFT 11 | 12 | 13 | def intersperse(lst, item): 14 | result = [item] * (len(lst) * 2 + 1) 15 | result[1::2] = lst 16 | return result 17 | 18 | 19 | def mle_loss(z, m, logs, logdet, mask): 20 | l = torch.sum(logs) + 0.5 * torch.sum(torch.exp(-2 * logs) * ((z - m)**2)) # neg normal likelihood w/o the constant term 21 | l = l - torch.sum(logdet) # log jacobian determinant 22 | l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes 23 | l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term 24 | return l 25 | 26 | 27 | def duration_loss(logw, logw_, lengths): 28 | l = torch.sum((logw - logw_)**2) / torch.sum(lengths) 29 | return l 30 | 31 | 32 | @torch.jit.script 33 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 34 | n_channels_int = n_channels[0] 35 | in_act = input_a + input_b 36 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 37 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 38 | acts = t_act * s_act 39 | return acts 40 | 41 | 42 | def convert_pad_shape(pad_shape): 43 | l = pad_shape[::-1] 44 | pad_shape = [item for sublist in l for item in sublist] 45 | return pad_shape 46 | 47 | 48 | def shift_1d(x): 49 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 50 
| return x 51 | 52 | 53 | def sequence_mask(length, max_length=None): 54 | if max_length is None: 55 | max_length = length.max() 56 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 57 | return x.unsqueeze(0) < length.unsqueeze(1) 58 | 59 | 60 | def maximum_path(value, mask, max_neg_val=-np.inf): 61 | """ Numpy-friendly version. It's about 4 times faster than torch version. 62 | value: [b, t_x, t_y] 63 | mask: [b, t_x, t_y] 64 | """ 65 | value = value * mask 66 | 67 | device = value.device 68 | dtype = value.dtype 69 | value = value.cpu().detach().numpy() 70 | mask = mask.cpu().detach().numpy().astype(np.bool) 71 | 72 | b, t_x, t_y = value.shape 73 | direction = np.zeros(value.shape, dtype=np.int64) 74 | v = np.zeros((b, t_x), dtype=np.float32) 75 | x_range = np.arange(t_x, dtype=np.float32).reshape(1,-1) 76 | for j in range(t_y): 77 | v0 = np.pad(v, [[0,0],[1,0]], mode="constant", constant_values=max_neg_val)[:, :-1] 78 | v1 = v 79 | max_mask = (v1 >= v0) 80 | v_max = np.where(max_mask, v1, v0) 81 | direction[:, :, j] = max_mask 82 | 83 | index_mask = (x_range <= j) 84 | v = np.where(index_mask, v_max + value[:, :, j], max_neg_val) 85 | direction = np.where(mask, direction, 1) 86 | 87 | path = np.zeros(value.shape, dtype=np.float32) 88 | index = mask[:, :, 0].sum(1).astype(np.int64) - 1 89 | index_range = np.arange(b) 90 | for j in reversed(range(t_y)): 91 | path[index_range, index, j] = 1 92 | index = index + direction[index_range, index, j] - 1 93 | path = path * mask.astype(np.float32) 94 | path = torch.from_numpy(path).to(device=device, dtype=dtype) 95 | return path 96 | 97 | 98 | def generate_path(duration, mask): 99 | """ 100 | duration: [b, t_x] 101 | mask: [b, t_x, t_y] 102 | """ 103 | device = duration.device 104 | 105 | b, t_x, t_y = mask.shape 106 | cum_duration = torch.cumsum(duration, 1) 107 | path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device) 108 | 109 | cum_duration_flat = cum_duration.view(b * t_x) 110 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 111 | path = path.view(b, t_x, t_y) 112 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:,:-1] 113 | path = path * mask 114 | return path 115 | 116 | 117 | class Adam(): 118 | def __init__(self, params, scheduler, dim_model, warmup_steps=4000, lr=1e0, betas=(0.9, 0.98), eps=1e-9): 119 | self.params = params 120 | self.scheduler = scheduler 121 | self.dim_model = dim_model 122 | self.warmup_steps = warmup_steps 123 | self.lr = lr 124 | self.betas = betas 125 | self.eps = eps 126 | 127 | self.step_num = 1 128 | self.cur_lr = lr * self._get_lr_scale() 129 | 130 | self._optim = torch.optim.Adam(params, lr=self.cur_lr, betas=betas, eps=eps) 131 | def _get_lr_scale(self): 132 | if self.scheduler == "noam": 133 | return np.power(self.dim_model, -0.5) * np.min([np.power(self.step_num, -0.5), self.step_num * np.power(self.warmup_steps, -1.5)]) 134 | else: 135 | return 1 136 | 137 | def _update_learning_rate(self): 138 | self.step_num += 1 139 | if self.scheduler == "noam": 140 | self.cur_lr = self.lr * self._get_lr_scale() 141 | for param_group in self._optim.param_groups: 142 | param_group['lr'] = self.cur_lr 143 | 144 | def get_lr(self): 145 | return self.cur_lr 146 | 147 | def step(self): 148 | self._optim.step() 149 | self._update_learning_rate() 150 | 151 | def zero_grad(self): 152 | self._optim.zero_grad() 153 | 154 | def load_state_dict(self, d): 155 | self._optim.load_state_dict(d) 156 | 157 | def state_dict(self): 158 | return 
self._optim.state_dict() 159 | 160 | 161 | class TacotronSTFT(nn.Module): 162 | def __init__(self, filter_length=1024, hop_length=256, win_length=1024, 163 | n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, 164 | mel_fmax=8000.0): 165 | super(TacotronSTFT, self).__init__() 166 | self.n_mel_channels = n_mel_channels 167 | self.sampling_rate = sampling_rate 168 | self.stft_fn = STFT(filter_length, hop_length, win_length) 169 | mel_basis = librosa_mel_fn( 170 | sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) 171 | mel_basis = torch.from_numpy(mel_basis).float() 172 | self.register_buffer('mel_basis', mel_basis) 173 | 174 | def spectral_normalize(self, magnitudes): 175 | output = dynamic_range_compression(magnitudes) 176 | return output 177 | 178 | def spectral_de_normalize(self, magnitudes): 179 | output = dynamic_range_decompression(magnitudes) 180 | return output 181 | 182 | def mel_spectrogram(self, y): 183 | """Computes mel-spectrograms from a batch of waves 184 | PARAMS 185 | ------ 186 | y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] 187 | 188 | RETURNS 189 | ------- 190 | mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) 191 | """ 192 | assert(torch.min(y.data) >= -1) 193 | assert(torch.max(y.data) <= 1) 194 | 195 | magnitudes, phases = self.stft_fn.transform(y) 196 | magnitudes = magnitudes.data 197 | mel_output = torch.matmul(self.mel_basis, magnitudes) 198 | mel_output = self.spectral_normalize(mel_output) 199 | return mel_output 200 | 201 | 202 | def clip_grad_value_(parameters, clip_value, norm_type=2): 203 | if isinstance(parameters, torch.Tensor): 204 | parameters = [parameters] 205 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 206 | norm_type = float(norm_type) 207 | clip_value = float(clip_value) 208 | 209 | total_norm = 0 210 | for p in parameters: 211 | param_norm = p.grad.data.norm(norm_type) 212 | total_norm += param_norm.item() ** norm_type 213 | 214 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 215 | total_norm = total_norm ** (1. 
/ norm_type) 216 | return total_norm 217 | 218 | 219 | def squeeze(x, x_mask=None, n_sqz=2): 220 | b, c, t = x.size() 221 | 222 | t = (t // n_sqz) * n_sqz 223 | x = x[:,:,:t] 224 | x_sqz = x.view(b, c, t//n_sqz, n_sqz) 225 | x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c*n_sqz, t//n_sqz) 226 | 227 | if x_mask is not None: 228 | x_mask = x_mask[:,:,n_sqz-1::n_sqz] 229 | else: 230 | x_mask = torch.ones(b, 1, t//n_sqz).to(device=x.device, dtype=x.dtype) 231 | return x_sqz * x_mask, x_mask 232 | 233 | 234 | def unsqueeze(x, x_mask=None, n_sqz=2): 235 | b, c, t = x.size() 236 | 237 | x_unsqz = x.view(b, n_sqz, c//n_sqz, t) 238 | x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c//n_sqz, t*n_sqz) 239 | 240 | if x_mask is not None: 241 | x_mask = x_mask.unsqueeze(-1).repeat(1,1,1,n_sqz).view(b, 1, t*n_sqz) 242 | else: 243 | x_mask = torch.ones(b, 1, t*n_sqz).to(device=x.device, dtype=x.dtype) 244 | return x_unsqz * x_mask, x_mask 245 | 246 | -------------------------------------------------------------------------------- /configs/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "use_cuda": true, 4 | "log_interval": 50, 5 | "seed": 1234, 6 | "epochs": 10000, 7 | "learning_rate": 1e0, 8 | "betas": [0.9, 0.98], 9 | "eps": 1e-9, 10 | "warmup_steps": 4000, 11 | "scheduler": "noam", 12 | "batch_size": 12, 13 | "ddi": true, 14 | "fp16_run": false 15 | }, 16 | "data": { 17 | "load_mel_from_disk": false, 18 | "training_files":"filelists/train.list", 19 | "validation_files":"filelists/val.list", 20 | "text_cleaners":["english_cleaners"], 21 | "max_wav_value": 32768.0, 22 | 23 | "sampling_rate": 44100, 24 | "filter_length": 2048, 25 | "hop_length": 512, 26 | "win_length": 2048, 27 | "n_mel_channels": 128, 28 | "mel_fmin": 40, 29 | "mel_fmax": 16000, 30 | 31 | "add_noise": true, 32 | "add_blank": true, 33 | "spk2id": { 34 | "opencpop": 0 35 | } 36 | }, 37 | "model": { 38 | "hidden_channels": 192, 39 | "filter_channels": 768, 40 | "filter_channels_dp": 256, 41 | "kernel_size": 3, 42 | "p_dropout": 0.1, 43 | "n_blocks_dec": 12, 44 | "n_layers_enc": 6, 45 | "n_heads": 2, 46 | "p_dropout_dec": 0.05, 47 | "dilation_rate": 1, 48 | "kernel_size_dec": 5, 49 | "n_block_layers": 4, 50 | "n_sqz": 2, 51 | "prenet": true, 52 | "mean_only": true, 53 | "hidden_channels_enc": 192, 54 | "hidden_channels_dec": 192, 55 | "window_size": 4, 56 | "n_speakers": 200, 57 | "gin_channels": 192 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /data_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import random 4 | import numpy as np 5 | import torch 6 | import torch.utils.data 7 | 8 | import commons 9 | import mel_processing 10 | from utils import load_filepaths_and_text 11 | import torch.nn.functional as F 12 | """Multi speaker version""" 13 | 14 | 15 | class TextAudioSpeakerLoader(torch.utils.data.Dataset): 16 | """ 17 | 1) loads audio, speaker_id, text pairs 18 | 2) normalizes text and converts them to sequences of integers 19 | 3) computes spectrograms from audio files. 
20 | """ 21 | 22 | def __init__(self, audiopaths_sid_text, hparams, val=False): 23 | self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text) 24 | self.max_wav_value = hparams.max_wav_value 25 | self.sampling_rate = hparams.sampling_rate 26 | self.filter_length = hparams.filter_length 27 | self.hop_length = hparams.hop_length 28 | self.win_length = hparams.win_length 29 | self.sampling_rate = hparams.sampling_rate 30 | self.spk_map = hparams.spk2id 31 | 32 | self.cleaned_text = getattr(hparams, "cleaned_text", False) 33 | 34 | self.add_blank = hparams.add_blank 35 | self.min_text_len = getattr(hparams, "min_text_len", 1) 36 | self.max_text_len = getattr(hparams, "max_text_len", 300) 37 | self.hps = hparams 38 | random.seed(1234) 39 | random.shuffle(self.audiopaths_sid_text) 40 | self._filter(val) 41 | self.fcpe = None 42 | 43 | def _filter(self, val): 44 | """ 45 | Filter text & store spec lengths 46 | """ 47 | # Store spectrogram lengths for Bucketing 48 | # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) 49 | # spec_length = wav_length // hop_length 50 | 51 | audiopaths_sid_text_new = [] 52 | lengths = [] 53 | skipped = 0 54 | for item in self.audiopaths_sid_text: 55 | _id, spk = item[:2] 56 | audiopath = f'dataset/{spk}/{_id}.wav' 57 | if not os.path.exists(audiopath): 58 | skipped += 1 59 | continue 60 | length_ = os.path.getsize(audiopath) // (2 * self.hop_length) 61 | if (length_ < 120 or length_>1400 ) and not val: 62 | skipped += 1 63 | continue 64 | audiopaths_sid_text_new.append([audiopath, spk]) 65 | 66 | print("skipped: ", skipped, ", total: ", len(self.audiopaths_sid_text)) 67 | self.audiopaths_sid_text = audiopaths_sid_text_new 68 | self.lengths = lengths 69 | 70 | 71 | def get_audio_text_speaker_pair(self, audiopath_sid_text): 72 | # separate filename, speaker_id and text 73 | audiopath, sid = audiopath_sid_text 74 | 75 | mel, wav = self.get_spec(audiopath) 76 | 77 | ssl = torch.load(audiopath.replace(".wav", ".ssl.pt")) 78 | ssl = F.interpolate(ssl, size=mel.shape[-1], mode="nearest") 79 | 80 | sid = torch.LongTensor([int(self.spk_map[sid])]) 81 | f0 = self.get_pitch(wav[0], mel.shape[1], audiopath) 82 | return (ssl, mel, wav, sid, f0) 83 | 84 | def get_spec(self, filename): 85 | wav_torch, _ = mel_processing.load_wav_to_torch(filename, target_sr=self.hps.sampling_rate) 86 | mel_path = filename.replace(".wav", ".mel.pt") 87 | if os.path.exists(mel_path): 88 | mel = torch.load(mel_path) 89 | return mel, wav_torch.unsqueeze(0) 90 | 91 | mel = mel_processing.get_mel(wav_torch, 92 | self.hps.sampling_rate, 93 | self.hps.n_mel_channels, 94 | self.hps.filter_length, 95 | self.hps.win_length, 96 | self.hps.hop_length, 97 | self.hps.mel_fmin, 98 | self.hps.mel_fmax) 99 | torch.save(mel, mel_path) 100 | return mel, wav_torch.unsqueeze(0) 101 | 102 | def get_text(self, text, tone, language): 103 | text_norm, tone, language = cleaned_text_to_sequence(text, tone, language) 104 | if self.add_blank: 105 | text_norm = commons.intersperse(text_norm, 0) 106 | tone = commons.intersperse(tone, 0) 107 | language = commons.intersperse(language, 0) 108 | text_norm = torch.LongTensor(text_norm) 109 | tone = torch.LongTensor(tone) 110 | language = torch.LongTensor(language) 111 | return text_norm, tone, language 112 | 113 | def get_pitch(self, wav, p_len, wavpath): 114 | f0_path = wavpath.replace(".wav", ".f0.pt") 115 | if os.path.exists(f0_path): 116 | return torch.load(f0_path) 117 | 118 | 119 | if self.fcpe is None: 120 | from 
f0_extractor.FCPEF0Predictor import FCPEF0Predictor 121 | print("init fcpe") 122 | self.fcpe = FCPEF0Predictor(sampling_rate=self.sampling_rate, hop_length=self.hop_length) 123 | pred_f0, uv = self.fcpe.compute_f0_uv(wav, p_len=p_len) 124 | f0 = torch.FloatTensor(pred_f0) 125 | torch.save(f0, f0_path) 126 | return f0 127 | 128 | def get_sid(self, sid): 129 | sid = torch.LongTensor([int(sid)]) 130 | return sid 131 | 132 | def __getitem__(self, index): 133 | return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index]) 134 | 135 | def __len__(self): 136 | return len(self.audiopaths_sid_text) 137 | 138 | 139 | class TextAudioSpeakerCollate(): 140 | """ Zero-pads model inputs and targets 141 | """ 142 | 143 | def __init__(self, return_ids=False): 144 | self.return_ids = return_ids 145 | 146 | def __call__(self, batch): 147 | """Collate's training batch from normalized text, audio and speaker identities 148 | PARAMS 149 | ------ 150 | batch: [text_normalized, spec_normalized, wav_normalized, sid] 151 | """ 152 | # Right zero-pad all one-hot text sequences to max input length 153 | _, ids_sorted_decreasing = torch.sort( 154 | torch.LongTensor([x[1].size(1) for x in batch]), 155 | dim=0, descending=True) 156 | 157 | max_mel_len = max([x[1].size(1) for x in batch]) 158 | max_wav_len = max([x[2].size(1) for x in batch]) 159 | 160 | mel_lengths = torch.LongTensor(len(batch)) 161 | wav_lengths = torch.LongTensor(len(batch)) 162 | sid = torch.LongTensor(len(batch)) 163 | 164 | c_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_mel_len) 165 | wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) 166 | mel_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_mel_len) 167 | f0_padded = torch.FloatTensor(len(batch), max_mel_len) 168 | c_padded.zero_() 169 | mel_padded.zero_() 170 | wav_padded.zero_() 171 | f0_padded.zero_() 172 | 173 | for i in range(len(ids_sorted_decreasing)): 174 | row = batch[ids_sorted_decreasing[i]] 175 | 176 | content = row[0][0,:, :] 177 | c_padded[i,:, :content.size(1)] = content 178 | 179 | mel = row[1] 180 | mel_padded[i, :, :mel.size(1)] = mel 181 | mel_lengths[i] = mel.size(1) 182 | 183 | wav = row[2] 184 | wav_padded[i, :, :wav.size(1)] = wav 185 | wav_lengths[i] = wav.size(1) 186 | 187 | sid[i] = row[3] 188 | 189 | f0 = row[4] 190 | f0_padded[i, :f0.size(0)] = f0 191 | 192 | return c_padded, mel_padded, mel_lengths,wav_padded, wav_lengths,\ 193 | sid, f0_padded 194 | 195 | 196 | class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): 197 | """ 198 | Maintain similar input lengths in a batch. 199 | Length groups are specified by boundaries. 200 | Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}. 201 | 202 | It removes samples which are not included in the boundaries. 203 | Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. 
204 | """ 205 | 206 | def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True): 207 | super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) 208 | self.lengths = dataset.lengths 209 | self.batch_size = batch_size 210 | self.boundaries = boundaries 211 | 212 | self.buckets, self.num_samples_per_bucket = self._create_buckets() 213 | self.total_size = sum(self.num_samples_per_bucket) 214 | self.num_samples = self.total_size // self.num_replicas 215 | 216 | def _create_buckets(self): 217 | buckets = [[] for _ in range(len(self.boundaries) - 1)] 218 | for i in range(len(self.lengths)): 219 | length = self.lengths[i] 220 | idx_bucket = self._bisect(length) 221 | if idx_bucket != -1: 222 | buckets[idx_bucket].append(i) 223 | 224 | for i in range(len(buckets) - 1, 0, -1): 225 | if len(buckets[i]) == 0: 226 | buckets.pop(i) 227 | self.boundaries.pop(i + 1) 228 | 229 | num_samples_per_bucket = [] 230 | for i in range(len(buckets)): 231 | len_bucket = len(buckets[i]) 232 | total_batch_size = self.num_replicas * self.batch_size 233 | rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size 234 | num_samples_per_bucket.append(len_bucket + rem) 235 | return buckets, num_samples_per_bucket 236 | 237 | def __iter__(self): 238 | # deterministically shuffle based on epoch 239 | g = torch.Generator() 240 | g.manual_seed(self.epoch) 241 | 242 | indices = [] 243 | if self.shuffle: 244 | for bucket in self.buckets: 245 | indices.append(torch.randperm(len(bucket), generator=g).tolist()) 246 | else: 247 | for bucket in self.buckets: 248 | indices.append(list(range(len(bucket)))) 249 | 250 | batches = [] 251 | for i in range(len(self.buckets)): 252 | bucket = self.buckets[i] 253 | len_bucket = len(bucket) 254 | ids_bucket = indices[i] 255 | num_samples_bucket = self.num_samples_per_bucket[i] 256 | 257 | # add extra samples to make it evenly divisible 258 | rem = num_samples_bucket - len_bucket 259 | ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)] 260 | 261 | # subsample 262 | ids_bucket = ids_bucket[self.rank::self.num_replicas] 263 | 264 | # batching 265 | for j in range(len(ids_bucket) // self.batch_size): 266 | batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]] 267 | batches.append(batch) 268 | 269 | if self.shuffle: 270 | batch_ids = torch.randperm(len(batches), generator=g).tolist() 271 | batches = [batches[i] for i in batch_ids] 272 | self.batches = batches 273 | 274 | assert len(self.batches) * self.batch_size == self.num_samples 275 | return iter(self.batches) 276 | 277 | def _bisect(self, x, lo=0, hi=None): 278 | if hi is None: 279 | hi = len(self.boundaries) - 1 280 | 281 | if hi > lo: 282 | mid = (hi + lo) // 2 283 | if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]: 284 | return mid 285 | elif x <= self.boundaries[mid]: 286 | return self._bisect(x, lo, mid) 287 | else: 288 | return self._bisect(x, mid + 1, hi) 289 | else: 290 | return -1 291 | 292 | def __len__(self): 293 | return self.num_samples // self.batch_size 294 | -------------------------------------------------------------------------------- /extract_f0_mel.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from torch.utils.data import DataLoader 4 | 5 | import utils 6 | from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate 7 | from tqdm import tqdm 8 | import logging 9 | 
logging.getLogger('numba').setLevel(logging.INFO) 10 | config_path = 'configs/config.json' 11 | hps = utils.get_hparams_from_file(config_path) 12 | collate = TextAudioSpeakerCollate() 13 | train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data) 14 | eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data) 15 | 16 | for _ in tqdm(train_dataset): 17 | pass 18 | for _ in tqdm(eval_dataset): 19 | pass 20 | 21 | # train_loader = DataLoader(train_dataset, num_workers=0, shuffle=False, 22 | # batch_size=2, pin_memory=True, 23 | # drop_last=True, collate_fn=collate) 24 | # 25 | # for _ in tqdm(train_loader): 26 | # pass -------------------------------------------------------------------------------- /extract_vec.py: -------------------------------------------------------------------------------- 1 | import math 2 | import multiprocessing 3 | import os 4 | import argparse 5 | from pathlib import Path 6 | from random import shuffle 7 | 8 | import torch 9 | from glob import glob 10 | from tqdm import tqdm 11 | 12 | from feature_extractor import contentvec768 13 | import utils 14 | import logging 15 | 16 | logging.getLogger("numba").setLevel(logging.WARNING) 17 | import librosa 18 | 19 | 20 | def process_one(file_path, model): 21 | path = Path(file_path) 22 | 23 | ssl_path = file_path.replace(".wav", ".ssl.pt") 24 | # try: 25 | # torch.load(ssl_path) 26 | # except: 27 | if not os.path.exists(ssl_path): 28 | print(111) 29 | print(ssl_path) 30 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 31 | wav16k, sr = librosa.load(path, sr=16000) 32 | wav16k = torch.from_numpy(wav16k).to(device) 33 | ssl_content = contentvec768.get_content(model, wav_16k_tensor=wav16k) 34 | torch.save(ssl_content.cpu(), ssl_path) 35 | if not os.path.exists(ssl_path): 36 | print("errrrrrrrrrrrrrrrrr"*1000) 37 | # exit(0) 38 | 39 | 40 | def process_batch(filenames): 41 | print("Loading hubert for content...") 42 | device = "cuda" if torch.cuda.is_available() else "cpu" 43 | ssl_model = contentvec768.get_model().to(device) 44 | print("Loaded hubert.") 45 | for filename in tqdm(filenames): 46 | process_one(filename, ssl_model) 47 | 48 | 49 | if __name__ == "__main__": 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument( 52 | "--in_dir", type=str, default="dataset", help="path to input dir" 53 | ) 54 | 55 | args = parser.parse_args() 56 | filenames = glob(f"{args.in_dir}/*/*.wav", recursive=True) # [:10] 57 | print(len(filenames)) 58 | shuffle(filenames) 59 | multiprocessing.set_start_method("spawn", force=True) 60 | 61 | num_processes = 1 62 | chunk_size = int(math.ceil(len(filenames) / num_processes)) 63 | chunks = [ 64 | filenames[i : i + chunk_size] for i in range(0, len(filenames), chunk_size) 65 | ] 66 | print([len(c) for c in chunks]) 67 | processes = [ 68 | multiprocessing.Process(target=process_batch, args=(chunk,)) for chunk in chunks 69 | ] 70 | for p in processes: 71 | p.start() 72 | -------------------------------------------------------------------------------- /f0_extractor/F0Predictor.py: -------------------------------------------------------------------------------- 1 | class F0Predictor(object): 2 | def compute_f0(self,wav,p_len): 3 | ''' 4 | input: wav:[signal_length] 5 | p_len:int 6 | output: f0:[signal_length//hop_length] 7 | ''' 8 | pass 9 | 10 | def compute_f0_uv(self,wav,p_len): 11 | ''' 12 | input: wav:[signal_length] 13 | p_len:int 14 | output: f0:[signal_length//hop_length],uv:[signal_length//hop_length] 15 | ''' 16 | pass 
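F0Predictor only fixes the two method signatures above; data_utils.get_pitch just needs a per-frame f0 curve (plus an optional voiced/unvoiced mask) of length p_len. For comparison, a minimal sketch of a conforming predictor built on librosa.pyin (the class name and defaults here are hypothetical, not part of this repo) could look like this:

import librosa
import numpy as np

from f0_extractor.F0Predictor import F0Predictor


class PyinF0Predictor(F0Predictor):
    """Illustrative sketch only (not part of this repo): pYIN-based drop-in for the interface above."""

    def __init__(self, hop_length=512, sampling_rate=44100, f0_min=50, f0_max=1100):
        self.hop_length = hop_length
        self.sampling_rate = sampling_rate
        self.f0_min = f0_min
        self.f0_max = f0_max

    def compute_f0_uv(self, wav, p_len):
        # wav: 1-D float waveform; p_len: number of mel frames the f0 curve must match
        wav = np.asarray(wav, dtype=np.float32)
        f0, _, _ = librosa.pyin(
            wav, fmin=self.f0_min, fmax=self.f0_max,
            sr=self.sampling_rate, hop_length=self.hop_length,
        )
        f0 = np.nan_to_num(f0)[:p_len]                 # unvoiced frames become 0
        f0 = np.pad(f0, (0, max(0, p_len - len(f0))))  # pad/trim to exactly p_len frames
        uv = (f0 > 0).astype(np.float32)               # simple voiced/unvoiced mask
        return f0, uv

    def compute_f0(self, wav, p_len):
        return self.compute_f0_uv(wav, p_len)[0]

The FCPE-based predictor that the repo actually ships follows.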
-------------------------------------------------------------------------------- /f0_extractor/FCPEF0Predictor.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | 7 | from f0_extractor.F0Predictor import F0Predictor 8 | 9 | from .fcpe.model import FCPEInfer 10 | 11 | 12 | class FCPEF0Predictor(F0Predictor): 13 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, dtype=torch.float32, device=None, sampling_rate=44100, 14 | threshold=0.05): 15 | self.fcpe = FCPEInfer(model_path="pretrain/fcpe/fcpe.pt", device=device, dtype=dtype) 16 | self.hop_length = hop_length 17 | self.f0_min = f0_min 18 | self.f0_max = f0_max 19 | if device is None: 20 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 21 | else: 22 | self.device = device 23 | self.threshold = threshold 24 | self.sampling_rate = sampling_rate 25 | self.dtype = dtype 26 | self.name = "fcpe" 27 | 28 | def repeat_expand( 29 | self, content: Union[torch.Tensor, np.ndarray], target_len: int, mode: str = "nearest" 30 | ): 31 | ndim = content.ndim 32 | 33 | if content.ndim == 1: 34 | content = content[None, None] 35 | elif content.ndim == 2: 36 | content = content[None] 37 | 38 | assert content.ndim == 3 39 | 40 | is_np = isinstance(content, np.ndarray) 41 | if is_np: 42 | content = torch.from_numpy(content) 43 | 44 | results = torch.nn.functional.interpolate(content, size=target_len, mode=mode) 45 | 46 | if is_np: 47 | results = results.numpy() 48 | 49 | if ndim == 1: 50 | return results[0, 0] 51 | elif ndim == 2: 52 | return results[0] 53 | 54 | def post_process(self, x, sampling_rate, f0, pad_to): 55 | if isinstance(f0, np.ndarray): 56 | f0 = torch.from_numpy(f0).float().to(x.device) 57 | 58 | if pad_to is None: 59 | return f0 60 | 61 | f0 = self.repeat_expand(f0, pad_to) 62 | 63 | vuv_vector = torch.zeros_like(f0) 64 | vuv_vector[f0 > 0.0] = 1.0 65 | vuv_vector[f0 <= 0.0] = 0.0 66 | 67 | # drop the zero-f0 (unvoiced) frames, then linearly interpolate over them 68 | nzindex = torch.nonzero(f0).squeeze() 69 | f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() 70 | time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy() 71 | time_frame = np.arange(pad_to) * self.hop_length / sampling_rate 72 | 73 | vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0] 74 | 75 | if f0.shape[0] <= 0: 76 | return torch.zeros(pad_to, dtype=torch.float, device=x.device).cpu().numpy(), vuv_vector.cpu().numpy() 77 | if f0.shape[0] == 1: 78 | return (torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[ 79 | 0]).cpu().numpy(), vuv_vector.cpu().numpy() 80 | 81 | # this could probably be rewritten in torch? 
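# np.interp linearly interpolates f0 across the unvoiced (zero-f0) frames using the surrounding voiced
# frames, holding the first/last voiced value at the edges.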
82 | f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1]) 83 | # vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,pad_to/len(vuv_vector),order = 0)) 84 | 85 | return f0, vuv_vector.cpu().numpy() 86 | 87 | def compute_f0(self, wav, p_len=None): 88 | x = torch.FloatTensor(wav).to(self.dtype).to(self.device) 89 | if p_len is None: 90 | p_len = x.shape[0] // self.hop_length 91 | else: 92 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 93 | f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0,:,0] 94 | if torch.all(f0 == 0): 95 | rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) 96 | return rtn, rtn 97 | return self.post_process(x, self.sampling_rate, f0, p_len)[0] 98 | 99 | def compute_f0_uv(self, wav, p_len=None): 100 | x = torch.FloatTensor(wav).to(self.dtype).to(self.device) 101 | if p_len is None: 102 | p_len = x.shape[0] // self.hop_length 103 | else: 104 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 105 | f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0,:,0] 106 | if torch.all(f0 == 0): 107 | rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) 108 | return rtn, rtn 109 | return self.post_process(x, self.sampling_rate, f0, p_len) 110 | 111 | def get_activation(self, wav, down_sample=1): 112 | x = torch.FloatTensor(wav).to(self.dtype).to(self.device) 113 | activation, mel = self.fcpe.get_activation(x, sr=self.sampling_rate, threshold=self.threshold) 114 | activation = activation[0].T 115 | frame_length = activation.shape[-1] 116 | activation = torch.mean(activation.view(1, 360//down_sample, down_sample, frame_length), dim=2) 117 | 118 | return activation, mel 119 | -------------------------------------------------------------------------------- /f0_extractor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/innnky/glow-svc/15d280a2b87a5938da3cb68e85273102a42b0d75/f0_extractor/__init__.py -------------------------------------------------------------------------------- /f0_extractor/fcpe/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import FCPEInfer # noqa: F401 2 | from .nvSTFT import STFT # noqa: F401 3 | from .pcmer import PCmer # noqa: F401 4 | -------------------------------------------------------------------------------- /f0_extractor/fcpe/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.nn.utils import weight_norm 6 | from torchaudio.transforms import Resample 7 | 8 | from .nvSTFT import STFT 9 | from .pcmer import PCmer 10 | 11 | 12 | def l2_regularization(model, l2_alpha): 13 | l2_loss = [] 14 | for module in model.modules(): 15 | if type(module) is nn.Conv2d: 16 | l2_loss.append((module.weight ** 2).sum() / 2.0) 17 | return l2_alpha * sum(l2_loss) 18 | 19 | 20 | class FCPE(nn.Module): 21 | def __init__( 22 | self, 23 | input_channel=128, 24 | out_dims=360, 25 | n_layers=12, 26 | n_chans=512, 27 | use_siren=False, 28 | use_full=False, 29 | loss_mse_scale=10, 30 | loss_l2_regularization=False, 31 | loss_l2_regularization_scale=1, 32 | loss_grad1_mse=False, 33 | loss_grad1_mse_scale=1, 34 | f0_max=1975.5, 35 | f0_min=32.70, 36 | confidence=False, 37 | threshold=0.05, 38 | use_input_conv=True 39 | ): 40 | super().__init__() 41 | if use_siren is True: 42 | raise 
ValueError("Siren is not supported yet.") 43 | if use_full is True: 44 | raise ValueError("Full model is not supported yet.") 45 | 46 | self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10 47 | self.loss_l2_regularization = loss_l2_regularization if (loss_l2_regularization is not None) else False 48 | self.loss_l2_regularization_scale = loss_l2_regularization_scale if (loss_l2_regularization_scale 49 | is not None) else 1 50 | self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False 51 | self.loss_grad1_mse_scale = loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1 52 | self.f0_max = f0_max if (f0_max is not None) else 1975.5 53 | self.f0_min = f0_min if (f0_min is not None) else 32.70 54 | self.confidence = confidence if (confidence is not None) else False 55 | self.threshold = threshold if (threshold is not None) else 0.05 56 | self.use_input_conv = use_input_conv if (use_input_conv is not None) else True 57 | 58 | self.cent_table_b = torch.Tensor( 59 | np.linspace(self.f0_to_cent(torch.Tensor([f0_min]))[0], self.f0_to_cent(torch.Tensor([f0_max]))[0], 60 | out_dims)) 61 | self.register_buffer("cent_table", self.cent_table_b) 62 | 63 | # conv in stack 64 | _leaky = nn.LeakyReLU() 65 | self.stack = nn.Sequential( 66 | nn.Conv1d(input_channel, n_chans, 3, 1, 1), 67 | nn.GroupNorm(4, n_chans), 68 | _leaky, 69 | nn.Conv1d(n_chans, n_chans, 3, 1, 1)) 70 | 71 | # transformer 72 | self.decoder = PCmer( 73 | num_layers=n_layers, 74 | num_heads=8, 75 | dim_model=n_chans, 76 | dim_keys=n_chans, 77 | dim_values=n_chans, 78 | residual_dropout=0.1, 79 | attention_dropout=0.1) 80 | self.norm = nn.LayerNorm(n_chans) 81 | 82 | # out 83 | self.n_out = out_dims 84 | self.dense_out = weight_norm( 85 | nn.Linear(n_chans, self.n_out)) 86 | 87 | def forward(self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder = "local_argmax"): 88 | """ 89 | input: 90 | B x n_frames x n_unit 91 | return: 92 | dict of B x n_frames x feat 93 | """ 94 | if cdecoder == "argmax": 95 | self.cdecoder = self.cents_decoder 96 | elif cdecoder == "local_argmax": 97 | self.cdecoder = self.cents_local_decoder 98 | if self.use_input_conv: 99 | x = self.stack(mel.transpose(1, 2)).transpose(1, 2) 100 | else: 101 | x = mel 102 | x = self.decoder(x) 103 | x = self.norm(x) 104 | x = self.dense_out(x) # [B,N,D] 105 | x = torch.sigmoid(x) 106 | if not infer: 107 | gt_cent_f0 = self.f0_to_cent(gt_f0) # mel f0 #[B,N,1] 108 | gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0) # #[B,N,out_dim] 109 | loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, gt_cent_f0) # bce loss 110 | # l2 regularization 111 | if self.loss_l2_regularization: 112 | loss_all = loss_all + l2_regularization(model=self, l2_alpha=self.loss_l2_regularization_scale) 113 | x = loss_all 114 | if infer: 115 | x = self.cdecoder(x) 116 | x = self.cent_to_f0(x) 117 | if not return_hz_f0: 118 | x = (1 + x / 700).log() 119 | return x 120 | 121 | def cents_decoder(self, y, mask=True): 122 | B, N, _ = y.size() 123 | ci = self.cent_table[None, None, :].expand(B, N, -1) 124 | rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(y, dim=-1, keepdim=True) # cents: [B,N,1] 125 | if mask: 126 | confident = torch.max(y, dim=-1, keepdim=True)[0] 127 | confident_mask = torch.ones_like(confident) 128 | confident_mask[confident <= self.threshold] = float("-INF") 129 | rtn = rtn * confident_mask 130 | if self.confidence: 131 | return rtn, confident 132 | else: 133 | return rtn 134 | 135 | def cents_local_decoder(self, 
y, mask=True): 136 | B, N, _ = y.size() 137 | ci = self.cent_table[None, None, :].expand(B, N, -1) 138 | confident, max_index = torch.max(y, dim=-1, keepdim=True) 139 | local_argmax_index = torch.arange(0,9).to(max_index.device) + (max_index - 4) 140 | local_argmax_index[local_argmax_index<0] = 0 141 | local_argmax_index[local_argmax_index>=self.n_out] = self.n_out - 1 142 | ci_l = torch.gather(ci,-1,local_argmax_index) 143 | y_l = torch.gather(y,-1,local_argmax_index) 144 | rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum(y_l, dim=-1, keepdim=True) # cents: [B,N,1] 145 | if mask: 146 | confident_mask = torch.ones_like(confident) 147 | confident_mask[confident <= self.threshold] = float("-INF") 148 | rtn = rtn * confident_mask 149 | if self.confidence: 150 | return rtn, confident 151 | else: 152 | return rtn 153 | 154 | def cent_to_f0(self, cent): 155 | return 10. * 2 ** (cent / 1200.) 156 | 157 | def f0_to_cent(self, f0): 158 | return 1200. * torch.log2(f0 / 10.) 159 | 160 | def gaussian_blurred_cent(self, cents): # cents: [B,N,1] 161 | mask = (cents > 0.1) & (cents < (1200. * np.log2(self.f0_max / 10.))) 162 | B, N, _ = cents.size() 163 | ci = self.cent_table[None, None, :].expand(B, N, -1) 164 | return torch.exp(-torch.square(ci - cents) / 1250) * mask.float() 165 | 166 | def get_activation(self, mel): 167 | """ 168 | input: 169 | B x n_frames x n_unit 170 | return: 171 | dict of B x n_frames x feat 172 | """ 173 | if self.use_input_conv: 174 | x = self.stack(mel.transpose(1, 2)).transpose(1, 2) 175 | else: 176 | x = mel 177 | x = self.decoder(x) 178 | x = self.norm(x) 179 | x = self.dense_out(x) # [B,N,D] 180 | x = torch.sigmoid(x) 181 | return x 182 | 183 | class FCPEInfer: 184 | def __init__(self, model_path, device=None, dtype=torch.float32): 185 | if device is None: 186 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 187 | self.device = device 188 | ckpt = torch.load(model_path, map_location=torch.device(self.device)) 189 | self.args = DotDict(ckpt["config"]) 190 | self.dtype = dtype 191 | model = FCPE( 192 | input_channel=self.args.model.input_channel, 193 | out_dims=self.args.model.out_dims, 194 | n_layers=self.args.model.n_layers, 195 | n_chans=self.args.model.n_chans, 196 | use_siren=self.args.model.use_siren, 197 | use_full=self.args.model.use_full, 198 | loss_mse_scale=self.args.loss.loss_mse_scale, 199 | loss_l2_regularization=self.args.loss.loss_l2_regularization, 200 | loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale, 201 | loss_grad1_mse=self.args.loss.loss_grad1_mse, 202 | loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale, 203 | f0_max=self.args.model.f0_max, 204 | f0_min=self.args.model.f0_min, 205 | confidence=self.args.model.confidence, 206 | ) 207 | model.to(self.device).to(self.dtype) 208 | model.load_state_dict(ckpt['model']) 209 | model.eval() 210 | self.model = model 211 | self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device) 212 | 213 | @torch.no_grad() 214 | def __call__(self, audio, sr, threshold=0.05): 215 | self.model.threshold = threshold 216 | audio = audio[None,:] 217 | mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype) 218 | f0 = self.model(mel=mel, infer=True, return_hz_f0=True) 219 | return f0 220 | 221 | @torch.no_grad() 222 | def get_activation(self, audio, sr, threshold=0.05): 223 | self.model.threshold = threshold 224 | audio = audio[None, :] 225 | mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype) 226 | activation = self.model.get_activation(mel=mel) 
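# activation: [B, n_frames, out_dims] sigmoid scores over the cent bins; mel: [B, n_frames, n_mels]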
227 | return activation, mel 228 | 229 | class Wav2Mel: 230 | 231 | def __init__(self, args, device=None, dtype=torch.float32): 232 | # self.args = args 233 | self.sampling_rate = args.mel.sampling_rate 234 | self.hop_size = args.mel.hop_size 235 | if device is None: 236 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 237 | self.device = device 238 | self.dtype = dtype 239 | self.stft = STFT( 240 | args.mel.sampling_rate, 241 | args.mel.num_mels, 242 | args.mel.n_fft, 243 | args.mel.win_size, 244 | args.mel.hop_size, 245 | args.mel.fmin, 246 | args.mel.fmax 247 | ) 248 | self.resample_kernel = {} 249 | 250 | def extract_nvstft(self, audio, keyshift=0, train=False): 251 | mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2) # B, n_frames, bins 252 | return mel 253 | 254 | def extract_mel(self, audio, sample_rate, keyshift=0, train=False): 255 | audio = audio.to(self.dtype).to(self.device) 256 | # resample 257 | if sample_rate == self.sampling_rate: 258 | audio_res = audio 259 | else: 260 | key_str = str(sample_rate) 261 | if key_str not in self.resample_kernel: 262 | self.resample_kernel[key_str] = Resample(sample_rate, self.sampling_rate, lowpass_filter_width=128) 263 | self.resample_kernel[key_str] = self.resample_kernel[key_str].to(self.dtype).to(self.device) 264 | audio_res = self.resample_kernel[key_str](audio) 265 | 266 | # extract 267 | mel = self.extract_nvstft(audio_res, keyshift=keyshift, train=train) # B, n_frames, bins 268 | n_frames = int(audio.shape[1] // self.hop_size) + 1 269 | if n_frames > int(mel.shape[1]): 270 | mel = torch.cat((mel, mel[:, -1:, :]), 1) 271 | if n_frames < int(mel.shape[1]): 272 | mel = mel[:, :n_frames, :] 273 | return mel 274 | 275 | def __call__(self, audio, sample_rate, keyshift=0, train=False): 276 | return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train) 277 | 278 | 279 | class DotDict(dict): 280 | def __getattr__(*args): 281 | val = dict.get(*args) 282 | return DotDict(val) if type(val) is dict else val 283 | 284 | __setattr__ = dict.__setitem__ 285 | __delattr__ = dict.__delitem__ 286 | -------------------------------------------------------------------------------- /f0_extractor/fcpe/nvSTFT.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import librosa 4 | import numpy as np 5 | import soundfile as sf 6 | import torch 7 | import torch.nn.functional as F 8 | import torch.utils.data 9 | from librosa.filters import mel as librosa_mel_fn 10 | 11 | os.environ["LRU_CACHE_CAPACITY"] = "3" 12 | 13 | def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False): 14 | sampling_rate = None 15 | try: 16 | data, sampling_rate = sf.read(full_path, always_2d=True)# than soundfile. 
17 | except Exception as ex: 18 | print(f"'{full_path}' failed to load.\nException:") 19 | print(ex) 20 | if return_empty_on_exception: 21 | return [], sampling_rate or target_sr or 48000 22 | else: 23 | raise Exception(ex) 24 | 25 | if len(data.shape) > 1: 26 | data = data[:, 0] 27 | assert len(data) > 2# check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension) 28 | 29 | if np.issubdtype(data.dtype, np.integer): # if audio data is type int 30 | max_mag = -np.iinfo(data.dtype).min # maximum magnitude = min possible value of intXX 31 | else: # if audio data is type fp32 32 | max_mag = max(np.amax(data), -np.amin(data)) 33 | max_mag = (2**31)+1 if max_mag > (2**15) else ((2**15)+1 if max_mag > 1.01 else 1.0) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32 34 | 35 | data = torch.FloatTensor(data.astype(np.float32))/max_mag 36 | 37 | if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:# resample will crash with inf/NaN inputs. return_empty_on_exception will return empty arr instead of except 38 | return [], sampling_rate or target_sr or 48000 39 | if target_sr is not None and sampling_rate != target_sr: 40 | data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr)) 41 | sampling_rate = target_sr 42 | 43 | return data, sampling_rate 44 | 45 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 46 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 47 | 48 | def dynamic_range_decompression(x, C=1): 49 | return np.exp(x) / C 50 | 51 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 52 | return torch.log(torch.clamp(x, min=clip_val) * C) 53 | 54 | def dynamic_range_decompression_torch(x, C=1): 55 | return torch.exp(x) / C 56 | 57 | class STFT(): 58 | def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5): 59 | self.target_sr = sr 60 | 61 | self.n_mels = n_mels 62 | self.n_fft = n_fft 63 | self.win_size = win_size 64 | self.hop_length = hop_length 65 | self.fmin = fmin 66 | self.fmax = fmax 67 | self.clip_val = clip_val 68 | self.mel_basis = {} 69 | self.hann_window = {} 70 | 71 | def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): 72 | sampling_rate = self.target_sr 73 | n_mels = self.n_mels 74 | n_fft = self.n_fft 75 | win_size = self.win_size 76 | hop_length = self.hop_length 77 | fmin = self.fmin 78 | fmax = self.fmax 79 | clip_val = self.clip_val 80 | 81 | factor = 2 ** (keyshift / 12) 82 | n_fft_new = int(np.round(n_fft * factor)) 83 | win_size_new = int(np.round(win_size * factor)) 84 | hop_length_new = int(np.round(hop_length * speed)) 85 | if not train: 86 | mel_basis = self.mel_basis 87 | hann_window = self.hann_window 88 | else: 89 | mel_basis = {} 90 | hann_window = {} 91 | 92 | if torch.min(y) < -1.: 93 | print('min value is ', torch.min(y)) 94 | if torch.max(y) > 1.: 95 | print('max value is ', torch.max(y)) 96 | 97 | mel_basis_key = str(fmax)+'_'+str(y.device) 98 | if mel_basis_key not in mel_basis: 99 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) 100 | mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device) 101 | 102 | keyshift_key = str(keyshift)+'_'+str(y.device) 103 | if keyshift_key not in hann_window: 104 | hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device) 105 | 106 | pad_left = (win_size_new - hop_length_new) //2 107 | pad_right = max((win_size_new- 
hop_length_new + 1) //2, win_size_new - y.size(-1) - pad_left) 108 | if pad_right < y.size(-1): 109 | mode = 'reflect' 110 | else: 111 | mode = 'constant' 112 | y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode = mode) 113 | y = y.squeeze(1) 114 | 115 | spec = torch.stft(y, n_fft_new, hop_length=hop_length_new, win_length=win_size_new, window=hann_window[keyshift_key], 116 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) 117 | spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9)) 118 | if keyshift != 0: 119 | size = n_fft // 2 + 1 120 | resize = spec.size(1) 121 | if resize < size: 122 | spec = F.pad(spec, (0, 0, 0, size-resize)) 123 | spec = spec[:, :size, :] * win_size / win_size_new 124 | spec = torch.matmul(mel_basis[mel_basis_key], spec) 125 | spec = dynamic_range_compression_torch(spec, clip_val=clip_val) 126 | return spec 127 | 128 | def __call__(self, audiopath): 129 | audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr) 130 | spect = self.get_mel(audio.unsqueeze(0)).squeeze(0) 131 | return spect 132 | 133 | stft = STFT() 134 | -------------------------------------------------------------------------------- /f0_extractor/fcpe/pcmer.py: -------------------------------------------------------------------------------- 1 | import math 2 | from functools import partial 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | from einops import rearrange, repeat 7 | from local_attention import LocalAttention 8 | from torch import nn 9 | 10 | #import fast_transformers.causal_product.causal_product_cuda 11 | 12 | def softmax_kernel(data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device = None): 13 | b, h, *_ = data.shape 14 | # (batch size, head, length, model_dim) 15 | 16 | # normalize model dim 17 | data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1. 
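    # (descriptive note, not in the original file) This function implements the
    # positive random-feature map from Performer / FAVOR+ (Choromanski et al.):
    #   phi(x) = exp(W x / d**0.25 - ||x||**2 / (2 * sqrt(d))) / sqrt(m)
    # where W is the (m x d) Gaussian projection matrix. The `ratio` computed
    # below is the 1/sqrt(m) normalizer, and the max-subtraction applied to
    # queries only improves numerical stability of the exponential.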
18 | 19 | # what is ration?, projection_matrix.shape[0] --> 266 20 | 21 | ratio = (projection_matrix.shape[0] ** -0.5) 22 | 23 | projection = repeat(projection_matrix, 'j d -> b h j d', b = b, h = h) 24 | projection = projection.type_as(data) 25 | 26 | #data_dash = w^T x 27 | data_dash = torch.einsum('...id,...jd->...ij', (data_normalizer * data), projection) 28 | 29 | 30 | # diag_data = D**2 31 | diag_data = data ** 2 32 | diag_data = torch.sum(diag_data, dim=-1) 33 | diag_data = (diag_data / 2.0) * (data_normalizer ** 2) 34 | diag_data = diag_data.unsqueeze(dim=-1) 35 | 36 | #print () 37 | if is_query: 38 | data_dash = ratio * ( 39 | torch.exp(data_dash - diag_data - 40 | torch.max(data_dash, dim=-1, keepdim=True).values) + eps) 41 | else: 42 | data_dash = ratio * ( 43 | torch.exp(data_dash - diag_data + eps))#- torch.max(data_dash)) + eps) 44 | 45 | return data_dash.type_as(data) 46 | 47 | def orthogonal_matrix_chunk(cols, qr_uniform_q = False, device = None): 48 | unstructured_block = torch.randn((cols, cols), device = device) 49 | q, r = torch.linalg.qr(unstructured_block.cpu(), mode='reduced') 50 | q, r = map(lambda t: t.to(device), (q, r)) 51 | 52 | # proposed by @Parskatt 53 | # to make sure Q is uniform https://arxiv.org/pdf/math-ph/0609050.pdf 54 | if qr_uniform_q: 55 | d = torch.diag(r, 0) 56 | q *= d.sign() 57 | return q.t() 58 | def exists(val): 59 | return val is not None 60 | 61 | def empty(tensor): 62 | return tensor.numel() == 0 63 | 64 | def default(val, d): 65 | return val if exists(val) else d 66 | 67 | def cast_tuple(val): 68 | return (val,) if not isinstance(val, tuple) else val 69 | 70 | class PCmer(nn.Module): 71 | """The encoder that is used in the Transformer model.""" 72 | 73 | def __init__(self, 74 | num_layers, 75 | num_heads, 76 | dim_model, 77 | dim_keys, 78 | dim_values, 79 | residual_dropout, 80 | attention_dropout): 81 | super().__init__() 82 | self.num_layers = num_layers 83 | self.num_heads = num_heads 84 | self.dim_model = dim_model 85 | self.dim_values = dim_values 86 | self.dim_keys = dim_keys 87 | self.residual_dropout = residual_dropout 88 | self.attention_dropout = attention_dropout 89 | 90 | self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)]) 91 | 92 | # METHODS ######################################################################################################## 93 | 94 | def forward(self, phone, mask=None): 95 | 96 | # apply all layers to the input 97 | for (i, layer) in enumerate(self._layers): 98 | phone = layer(phone, mask) 99 | # provide the final sequence 100 | return phone 101 | 102 | 103 | # ==================================================================================================================== # 104 | # CLASS _ E N C O D E R L A Y E R # 105 | # ==================================================================================================================== # 106 | 107 | 108 | class _EncoderLayer(nn.Module): 109 | """One layer of the encoder. 110 | 111 | Attributes: 112 | attn: (:class:`mha.MultiHeadAttention`): The attention mechanism that is used to read the input sequence. 113 | feed_forward (:class:`ffl.FeedForwardLayer`): The feed-forward layer on top of the attention mechanism. 114 | """ 115 | 116 | def __init__(self, parent: PCmer): 117 | """Creates a new instance of ``_EncoderLayer``. 118 | 119 | Args: 120 | parent (Encoder): The encoder that the layers is created for. 
121 | """ 122 | super().__init__() 123 | 124 | 125 | self.conformer = ConformerConvModule(parent.dim_model) 126 | self.norm = nn.LayerNorm(parent.dim_model) 127 | self.dropout = nn.Dropout(parent.residual_dropout) 128 | 129 | # selfatt -> fastatt: performer! 130 | self.attn = SelfAttention(dim = parent.dim_model, 131 | heads = parent.num_heads, 132 | causal = False) 133 | 134 | # METHODS ######################################################################################################## 135 | 136 | def forward(self, phone, mask=None): 137 | 138 | # compute attention sub-layer 139 | phone = phone + (self.attn(self.norm(phone), mask=mask)) 140 | 141 | phone = phone + (self.conformer(phone)) 142 | 143 | return phone 144 | 145 | def calc_same_padding(kernel_size): 146 | pad = kernel_size // 2 147 | return (pad, pad - (kernel_size + 1) % 2) 148 | 149 | # helper classes 150 | 151 | class Swish(nn.Module): 152 | def forward(self, x): 153 | return x * x.sigmoid() 154 | 155 | class Transpose(nn.Module): 156 | def __init__(self, dims): 157 | super().__init__() 158 | assert len(dims) == 2, 'dims must be a tuple of two dimensions' 159 | self.dims = dims 160 | 161 | def forward(self, x): 162 | return x.transpose(*self.dims) 163 | 164 | class GLU(nn.Module): 165 | def __init__(self, dim): 166 | super().__init__() 167 | self.dim = dim 168 | 169 | def forward(self, x): 170 | out, gate = x.chunk(2, dim=self.dim) 171 | return out * gate.sigmoid() 172 | 173 | class DepthWiseConv1d(nn.Module): 174 | def __init__(self, chan_in, chan_out, kernel_size, padding): 175 | super().__init__() 176 | self.padding = padding 177 | self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups = chan_in) 178 | 179 | def forward(self, x): 180 | x = F.pad(x, self.padding) 181 | return self.conv(x) 182 | 183 | class ConformerConvModule(nn.Module): 184 | def __init__( 185 | self, 186 | dim, 187 | causal = False, 188 | expansion_factor = 2, 189 | kernel_size = 31, 190 | dropout = 0.): 191 | super().__init__() 192 | 193 | inner_dim = dim * expansion_factor 194 | padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0) 195 | 196 | self.net = nn.Sequential( 197 | nn.LayerNorm(dim), 198 | Transpose((1, 2)), 199 | nn.Conv1d(dim, inner_dim * 2, 1), 200 | GLU(dim=1), 201 | DepthWiseConv1d(inner_dim, inner_dim, kernel_size = kernel_size, padding = padding), 202 | #nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(), 203 | Swish(), 204 | nn.Conv1d(inner_dim, dim, 1), 205 | Transpose((1, 2)), 206 | nn.Dropout(dropout) 207 | ) 208 | 209 | def forward(self, x): 210 | return self.net(x) 211 | 212 | def linear_attention(q, k, v): 213 | if v is None: 214 | #print (k.size(), q.size()) 215 | out = torch.einsum('...ed,...nd->...ne', k, q) 216 | return out 217 | 218 | else: 219 | k_cumsum = k.sum(dim = -2) 220 | #k_cumsum = k.sum(dim = -2) 221 | D_inv = 1. 
/ (torch.einsum('...nd,...d->...n', q, k_cumsum.type_as(q)) + 1e-8) 222 | 223 | context = torch.einsum('...nd,...ne->...de', k, v) 224 | #print ("TRUEEE: ", context.size(), q.size(), D_inv.size()) 225 | out = torch.einsum('...de,...nd,...n->...ne', context, q, D_inv) 226 | return out 227 | 228 | def gaussian_orthogonal_random_matrix(nb_rows, nb_columns, scaling = 0, qr_uniform_q = False, device = None): 229 | nb_full_blocks = int(nb_rows / nb_columns) 230 | #print (nb_full_blocks) 231 | block_list = [] 232 | 233 | for _ in range(nb_full_blocks): 234 | q = orthogonal_matrix_chunk(nb_columns, qr_uniform_q = qr_uniform_q, device = device) 235 | block_list.append(q) 236 | # block_list[n] is a orthogonal matrix ... (model_dim * model_dim) 237 | #print (block_list[0].size(), torch.einsum('...nd,...nd->...n', block_list[0], torch.roll(block_list[0],1,1))) 238 | #print (nb_rows, nb_full_blocks, nb_columns) 239 | remaining_rows = nb_rows - nb_full_blocks * nb_columns 240 | #print (remaining_rows) 241 | if remaining_rows > 0: 242 | q = orthogonal_matrix_chunk(nb_columns, qr_uniform_q = qr_uniform_q, device = device) 243 | #print (q[:remaining_rows].size()) 244 | block_list.append(q[:remaining_rows]) 245 | 246 | final_matrix = torch.cat(block_list) 247 | 248 | if scaling == 0: 249 | multiplier = torch.randn((nb_rows, nb_columns), device = device).norm(dim = 1) 250 | elif scaling == 1: 251 | multiplier = math.sqrt((float(nb_columns))) * torch.ones((nb_rows,), device = device) 252 | else: 253 | raise ValueError(f'Invalid scaling {scaling}') 254 | 255 | return torch.diag(multiplier) @ final_matrix 256 | 257 | class FastAttention(nn.Module): 258 | def __init__(self, dim_heads, nb_features = None, ortho_scaling = 0, causal = False, generalized_attention = False, kernel_fn = nn.ReLU(), qr_uniform_q = False, no_projection = False): 259 | super().__init__() 260 | nb_features = default(nb_features, int(dim_heads * math.log(dim_heads))) 261 | 262 | self.dim_heads = dim_heads 263 | self.nb_features = nb_features 264 | self.ortho_scaling = ortho_scaling 265 | 266 | self.create_projection = partial(gaussian_orthogonal_random_matrix, nb_rows = self.nb_features, nb_columns = dim_heads, scaling = ortho_scaling, qr_uniform_q = qr_uniform_q) 267 | projection_matrix = self.create_projection() 268 | self.register_buffer('projection_matrix', projection_matrix) 269 | 270 | self.generalized_attention = generalized_attention 271 | self.kernel_fn = kernel_fn 272 | 273 | # if this is turned on, no projection will be used 274 | # queries and keys will be softmax-ed as in the original efficient attention paper 275 | self.no_projection = no_projection 276 | 277 | self.causal = causal 278 | 279 | @torch.no_grad() 280 | def redraw_projection_matrix(self): 281 | projections = self.create_projection() 282 | self.projection_matrix.copy_(projections) 283 | del projections 284 | 285 | def forward(self, q, k, v): 286 | device = q.device 287 | 288 | if self.no_projection: 289 | q = q.softmax(dim = -1) 290 | k = torch.exp(k) if self.causal else k.softmax(dim = -2) 291 | else: 292 | create_kernel = partial(softmax_kernel, projection_matrix = self.projection_matrix, device = device) 293 | 294 | q = create_kernel(q, is_query = True) 295 | k = create_kernel(k, is_query = False) 296 | 297 | attn_fn = linear_attention if not self.causal else self.causal_linear_fn 298 | if v is None: 299 | out = attn_fn(q, k, None) 300 | return out 301 | else: 302 | out = attn_fn(q, k, v) 303 | return out 304 | class SelfAttention(nn.Module): 305 | def 
__init__(self, dim, causal = False, heads = 8, dim_head = 64, local_heads = 0, local_window_size = 256, nb_features = None, feature_redraw_interval = 1000, generalized_attention = False, kernel_fn = nn.ReLU(), qr_uniform_q = False, dropout = 0., no_projection = False): 306 | super().__init__() 307 | assert dim % heads == 0, 'dimension must be divisible by number of heads' 308 | dim_head = default(dim_head, dim // heads) 309 | inner_dim = dim_head * heads 310 | self.fast_attention = FastAttention(dim_head, nb_features, causal = causal, generalized_attention = generalized_attention, kernel_fn = kernel_fn, qr_uniform_q = qr_uniform_q, no_projection = no_projection) 311 | 312 | self.heads = heads 313 | self.global_heads = heads - local_heads 314 | self.local_attn = LocalAttention(window_size = local_window_size, causal = causal, autopad = True, dropout = dropout, look_forward = int(not causal), rel_pos_emb_config = (dim_head, local_heads)) if local_heads > 0 else None 315 | 316 | #print (heads, nb_features, dim_head) 317 | #name_embedding = torch.zeros(110, heads, dim_head, dim_head) 318 | #self.name_embedding = nn.Parameter(name_embedding, requires_grad=True) 319 | 320 | 321 | self.to_q = nn.Linear(dim, inner_dim) 322 | self.to_k = nn.Linear(dim, inner_dim) 323 | self.to_v = nn.Linear(dim, inner_dim) 324 | self.to_out = nn.Linear(inner_dim, dim) 325 | self.dropout = nn.Dropout(dropout) 326 | 327 | @torch.no_grad() 328 | def redraw_projection_matrix(self): 329 | self.fast_attention.redraw_projection_matrix() 330 | #torch.nn.init.zeros_(self.name_embedding) 331 | #print (torch.sum(self.name_embedding)) 332 | def forward(self, x, context = None, mask = None, context_mask = None, name=None, inference=False, **kwargs): 333 | _, _, _, h, gh = *x.shape, self.heads, self.global_heads 334 | 335 | cross_attend = exists(context) 336 | 337 | context = default(context, x) 338 | context_mask = default(context_mask, mask) if not cross_attend else context_mask 339 | #print (torch.sum(self.name_embedding)) 340 | q, k, v = self.to_q(x), self.to_k(context), self.to_v(context) 341 | 342 | q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v)) 343 | (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v)) 344 | 345 | attn_outs = [] 346 | #print (name) 347 | #print (self.name_embedding[name].size()) 348 | if not empty(q): 349 | if exists(context_mask): 350 | global_mask = context_mask[:, None, :, None] 351 | v.masked_fill_(~global_mask, 0.) 
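            # (descriptive note, not in the original file) The cross-attention
            # branch below is a stub: when `context` is supplied it only runs
            # `pass` and produces no output. PCmer's _EncoderLayer always calls
            # this module with context=None, so only the self-attention path
            # (context = x) is exercised in this repo.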
352 | if cross_attend: 353 | pass 354 | #print (torch.sum(self.name_embedding)) 355 | #out = self.fast_attention(q,self.name_embedding[name],None) 356 | #print (torch.sum(self.name_embedding[...,-1:])) 357 | else: 358 | out = self.fast_attention(q, k, v) 359 | attn_outs.append(out) 360 | 361 | if not empty(lq): 362 | assert not cross_attend, 'local attention is not compatible with cross attention' 363 | out = self.local_attn(lq, lk, lv, input_mask = mask) 364 | attn_outs.append(out) 365 | 366 | out = torch.cat(attn_outs, dim = 1) 367 | out = rearrange(out, 'b h n d -> b n (h d)') 368 | out = self.to_out(out) 369 | return self.dropout(out) -------------------------------------------------------------------------------- /feature_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /feature_extractor/contentvec768.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import HubertModel 3 | 4 | from torch import nn 5 | import logging 6 | 7 | logging.getLogger("numba").setLevel(logging.WARNING) 8 | 9 | class HubertModelWithFinalProj(HubertModel): 10 | def __init__(self, config): 11 | super().__init__(config) 12 | 13 | # Remove this layer is necessary to achieve the desired outcome. 14 | self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size) 15 | 16 | def get_model(): 17 | model = HubertModelWithFinalProj.from_pretrained("./pretrain/content-vec-best") 18 | return model 19 | 20 | 21 | def get_content(hmodel, wav_16k_tensor): 22 | with torch.no_grad(): 23 | feats = hmodel(wav_16k_tensor.unsqueeze(0))["last_hidden_state"] 24 | return feats.transpose(1,2) 25 | 26 | -------------------------------------------------------------------------------- /filelists/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/innnky/glow-svc/15d280a2b87a5938da3cb68e85273102a42b0d75/filelists/.gitkeep -------------------------------------------------------------------------------- /hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | from hifigan.network.vocoders.nsf_hifigan import NsfHifiGAN 2 | # from https://github.com/openvpi/diffsinger 3 | -------------------------------------------------------------------------------- /hifigan/modules/hifigan/hifigan.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.nn as nn 4 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 5 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 6 | 7 | from hifigan.modules.parallel_wavegan.layers import UpsampleNetwork, ConvInUpsampleNetwork 8 | from hifigan.modules.parallel_wavegan.models.source import SourceModuleHnNSF 9 | import numpy as np 10 | 11 | LRELU_SLOPE = 0.1 12 | 13 | 14 | def init_weights(m, mean=0.0, std=0.01): 15 | classname = m.__class__.__name__ 16 | if classname.find("Conv") != -1: 17 | m.weight.data.normal_(mean, std) 18 | 19 | 20 | def apply_weight_norm(m): 21 | classname = m.__class__.__name__ 22 | if classname.find("Conv") != -1: 23 | weight_norm(m) 24 | 25 | 26 | def get_padding(kernel_size, dilation=1): 27 | return int((kernel_size * dilation - dilation) / 2) 28 | 29 | 30 | class ResBlock1(torch.nn.Module): 31 | def 
__init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): 32 | super(ResBlock1, self).__init__() 33 | self.h = h 34 | self.convs1 = nn.ModuleList([ 35 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 36 | padding=get_padding(kernel_size, dilation[0]))), 37 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 38 | padding=get_padding(kernel_size, dilation[1]))), 39 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 40 | padding=get_padding(kernel_size, dilation[2]))) 41 | ]) 42 | self.convs1.apply(init_weights) 43 | 44 | self.convs2 = nn.ModuleList([ 45 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 46 | padding=get_padding(kernel_size, 1))), 47 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 48 | padding=get_padding(kernel_size, 1))), 49 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 50 | padding=get_padding(kernel_size, 1))) 51 | ]) 52 | self.convs2.apply(init_weights) 53 | 54 | def forward(self, x): 55 | for c1, c2 in zip(self.convs1, self.convs2): 56 | xt = F.leaky_relu(x, LRELU_SLOPE) 57 | xt = c1(xt) 58 | xt = F.leaky_relu(xt, LRELU_SLOPE) 59 | xt = c2(xt) 60 | x = xt + x 61 | return x 62 | 63 | def remove_weight_norm(self): 64 | for l in self.convs1: 65 | remove_weight_norm(l) 66 | for l in self.convs2: 67 | remove_weight_norm(l) 68 | 69 | 70 | class ResBlock2(torch.nn.Module): 71 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): 72 | super(ResBlock2, self).__init__() 73 | self.h = h 74 | self.convs = nn.ModuleList([ 75 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 76 | padding=get_padding(kernel_size, dilation[0]))), 77 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 78 | padding=get_padding(kernel_size, dilation[1]))) 79 | ]) 80 | self.convs.apply(init_weights) 81 | 82 | def forward(self, x): 83 | for c in self.convs: 84 | xt = F.leaky_relu(x, LRELU_SLOPE) 85 | xt = c(xt) 86 | x = xt + x 87 | return x 88 | 89 | def remove_weight_norm(self): 90 | for l in self.convs: 91 | remove_weight_norm(l) 92 | 93 | 94 | class Conv1d1x1(Conv1d): 95 | """1x1 Conv1d with customized initialization.""" 96 | 97 | def __init__(self, in_channels, out_channels, bias): 98 | """Initialize 1x1 Conv1d module.""" 99 | super(Conv1d1x1, self).__init__(in_channels, out_channels, 100 | kernel_size=1, padding=0, 101 | dilation=1, bias=bias) 102 | 103 | 104 | class HifiGanGenerator(torch.nn.Module): 105 | def __init__(self, h, c_out=1): 106 | super(HifiGanGenerator, self).__init__() 107 | self.h = h 108 | self.num_kernels = len(h['resblock_kernel_sizes']) 109 | self.num_upsamples = len(h['upsample_rates']) 110 | 111 | if h['use_pitch_embed']: 112 | self.harmonic_num = 8 113 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h['upsample_rates'])) 114 | self.m_source = SourceModuleHnNSF( 115 | sampling_rate=h['audio_sample_rate'], 116 | harmonic_num=self.harmonic_num) 117 | self.noise_convs = nn.ModuleList() 118 | self.conv_pre = weight_norm(Conv1d(80, h['upsample_initial_channel'], 7, 1, padding=3)) 119 | resblock = ResBlock1 if h['resblock'] == '1' else ResBlock2 120 | 121 | self.ups = nn.ModuleList() 122 | for i, (u, k) in enumerate(zip(h['upsample_rates'], h['upsample_kernel_sizes'])): 123 | c_cur = h['upsample_initial_channel'] // (2 ** (i + 1)) 124 | self.ups.append(weight_norm( 125 | ConvTranspose1d(c_cur * 2, c_cur, k, u, padding=(k - u) // 2))) 126 | if 
h['use_pitch_embed']: 127 | if i + 1 < len(h['upsample_rates']): 128 | stride_f0 = np.prod(h['upsample_rates'][i + 1:]) 129 | self.noise_convs.append(Conv1d( 130 | 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2)) 131 | else: 132 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) 133 | 134 | self.resblocks = nn.ModuleList() 135 | for i in range(len(self.ups)): 136 | ch = h['upsample_initial_channel'] // (2 ** (i + 1)) 137 | for j, (k, d) in enumerate(zip(h['resblock_kernel_sizes'], h['resblock_dilation_sizes'])): 138 | self.resblocks.append(resblock(h, ch, k, d)) 139 | 140 | self.conv_post = weight_norm(Conv1d(ch, c_out, 7, 1, padding=3)) 141 | self.ups.apply(init_weights) 142 | self.conv_post.apply(init_weights) 143 | 144 | def forward(self, x, f0=None): 145 | if f0 is not None: 146 | # harmonic-source signal, noise-source signal, uv flag 147 | f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) 148 | har_source, noi_source, uv = self.m_source(f0) 149 | har_source = har_source.transpose(1, 2) 150 | 151 | x = self.conv_pre(x) 152 | for i in range(self.num_upsamples): 153 | x = F.leaky_relu(x, LRELU_SLOPE) 154 | x = self.ups[i](x) 155 | if f0 is not None: 156 | x_source = self.noise_convs[i](har_source) 157 | x = x + x_source 158 | xs = None 159 | for j in range(self.num_kernels): 160 | if xs is None: 161 | xs = self.resblocks[i * self.num_kernels + j](x) 162 | else: 163 | xs += self.resblocks[i * self.num_kernels + j](x) 164 | x = xs / self.num_kernels 165 | x = F.leaky_relu(x) 166 | x = self.conv_post(x) 167 | x = torch.tanh(x) 168 | 169 | return x 170 | 171 | def remove_weight_norm(self): 172 | print('Removing weight norm...') 173 | for l in self.ups: 174 | remove_weight_norm(l) 175 | for l in self.resblocks: 176 | l.remove_weight_norm() 177 | remove_weight_norm(self.conv_pre) 178 | remove_weight_norm(self.conv_post) 179 | 180 | 181 | class DiscriminatorP(torch.nn.Module): 182 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False, use_cond=False, c_in=1): 183 | super(DiscriminatorP, self).__init__() 184 | self.use_cond = use_cond 185 | if use_cond: 186 | from utils.hparams import hparams 187 | t = hparams['hop_size'] 188 | self.cond_net = torch.nn.ConvTranspose1d(80, 1, t * 2, stride=t, padding=t // 2) 189 | c_in = 2 190 | 191 | self.period = period 192 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 193 | self.convs = nn.ModuleList([ 194 | norm_f(Conv2d(c_in, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 195 | norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 196 | norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 197 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 198 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), 199 | ]) 200 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 201 | 202 | def forward(self, x, mel): 203 | fmap = [] 204 | if self.use_cond: 205 | x_mel = self.cond_net(mel) 206 | x = torch.cat([x_mel, x], 1) 207 | # 1d to 2d 208 | b, c, t = x.shape 209 | if t % self.period != 0: # pad first 210 | n_pad = self.period - (t % self.period) 211 | x = F.pad(x, (0, n_pad), "reflect") 212 | t = t + n_pad 213 | x = x.view(b, c, t // self.period, self.period) 214 | 215 | for l in self.convs: 216 | x = l(x) 217 | x = F.leaky_relu(x, LRELU_SLOPE) 218 | fmap.append(x) 219 | x = self.conv_post(x) 220 | fmap.append(x) 221 | x 
= torch.flatten(x, 1, -1) 222 | 223 | return x, fmap 224 | 225 | 226 | class MultiPeriodDiscriminator(torch.nn.Module): 227 | def __init__(self, use_cond=False, c_in=1): 228 | super(MultiPeriodDiscriminator, self).__init__() 229 | self.discriminators = nn.ModuleList([ 230 | DiscriminatorP(2, use_cond=use_cond, c_in=c_in), 231 | DiscriminatorP(3, use_cond=use_cond, c_in=c_in), 232 | DiscriminatorP(5, use_cond=use_cond, c_in=c_in), 233 | DiscriminatorP(7, use_cond=use_cond, c_in=c_in), 234 | DiscriminatorP(11, use_cond=use_cond, c_in=c_in), 235 | ]) 236 | 237 | def forward(self, y, y_hat, mel=None): 238 | y_d_rs = [] 239 | y_d_gs = [] 240 | fmap_rs = [] 241 | fmap_gs = [] 242 | for i, d in enumerate(self.discriminators): 243 | y_d_r, fmap_r = d(y, mel) 244 | y_d_g, fmap_g = d(y_hat, mel) 245 | y_d_rs.append(y_d_r) 246 | fmap_rs.append(fmap_r) 247 | y_d_gs.append(y_d_g) 248 | fmap_gs.append(fmap_g) 249 | 250 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 251 | 252 | 253 | class DiscriminatorS(torch.nn.Module): 254 | def __init__(self, use_spectral_norm=False, use_cond=False, upsample_rates=None, c_in=1): 255 | super(DiscriminatorS, self).__init__() 256 | self.use_cond = use_cond 257 | if use_cond: 258 | t = np.prod(upsample_rates) 259 | self.cond_net = torch.nn.ConvTranspose1d(80, 1, t * 2, stride=t, padding=t // 2) 260 | c_in = 2 261 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 262 | self.convs = nn.ModuleList([ 263 | norm_f(Conv1d(c_in, 128, 15, 1, padding=7)), 264 | norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), 265 | norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), 266 | norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), 267 | norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), 268 | norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), 269 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 270 | ]) 271 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 272 | 273 | def forward(self, x, mel): 274 | if self.use_cond: 275 | x_mel = self.cond_net(mel) 276 | x = torch.cat([x_mel, x], 1) 277 | fmap = [] 278 | for l in self.convs: 279 | x = l(x) 280 | x = F.leaky_relu(x, LRELU_SLOPE) 281 | fmap.append(x) 282 | x = self.conv_post(x) 283 | fmap.append(x) 284 | x = torch.flatten(x, 1, -1) 285 | 286 | return x, fmap 287 | 288 | 289 | class MultiScaleDiscriminator(torch.nn.Module): 290 | def __init__(self, use_cond=False, c_in=1): 291 | super(MultiScaleDiscriminator, self).__init__() 292 | from utils.hparams import hparams 293 | self.discriminators = nn.ModuleList([ 294 | DiscriminatorS(use_spectral_norm=True, use_cond=use_cond, 295 | upsample_rates=[4, 4, hparams['hop_size'] // 16], 296 | c_in=c_in), 297 | DiscriminatorS(use_cond=use_cond, 298 | upsample_rates=[4, 4, hparams['hop_size'] // 32], 299 | c_in=c_in), 300 | DiscriminatorS(use_cond=use_cond, 301 | upsample_rates=[4, 4, hparams['hop_size'] // 64], 302 | c_in=c_in), 303 | ]) 304 | self.meanpools = nn.ModuleList([ 305 | AvgPool1d(4, 2, padding=1), 306 | AvgPool1d(4, 2, padding=1) 307 | ]) 308 | 309 | def forward(self, y, y_hat, mel=None): 310 | y_d_rs = [] 311 | y_d_gs = [] 312 | fmap_rs = [] 313 | fmap_gs = [] 314 | for i, d in enumerate(self.discriminators): 315 | if i != 0: 316 | y = self.meanpools[i - 1](y) 317 | y_hat = self.meanpools[i - 1](y_hat) 318 | y_d_r, fmap_r = d(y, mel) 319 | y_d_g, fmap_g = d(y_hat, mel) 320 | y_d_rs.append(y_d_r) 321 | fmap_rs.append(fmap_r) 322 | y_d_gs.append(y_d_g) 323 | fmap_gs.append(fmap_g) 324 | 325 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 326 | 
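# --- Illustrative sketch (not part of the original file) ---------------------
# A hedged example of how the discriminators above and the loss helpers defined
# below are typically combined in a HiFi-GAN style training step. The names
# `gen`, `mel_fn`, `lambda_fm` and `lambda_mel` are assumptions for illustration,
# not symbols from this repo; the function is never called here.
def _example_gan_step(gen, mpd, msd, mel, f0, y, mel_fn, lambda_fm=2.0, lambda_mel=45.0):
    y_hat = gen(mel, f0)  # generated waveform, same shape as the real audio y

    # discriminator losses (generator output detached)
    y_dr, y_dg, _, _ = mpd(y, y_hat.detach())
    loss_d = sum(discriminator_loss(y_dr, y_dg))
    y_dr, y_dg, _, _ = msd(y, y_hat.detach())
    loss_d = loss_d + sum(discriminator_loss(y_dr, y_dg))

    # generator losses: adversarial + feature matching + mel reconstruction
    _, y_dg, fmap_r, fmap_g = mpd(y, y_hat)
    loss_g = generator_loss(y_dg) + lambda_fm * feature_loss(fmap_r, fmap_g)
    _, y_dg, fmap_r, fmap_g = msd(y, y_hat)
    loss_g = loss_g + generator_loss(y_dg) + lambda_fm * feature_loss(fmap_r, fmap_g)
    loss_g = loss_g + lambda_mel * F.l1_loss(mel_fn(y_hat), mel)
    return loss_d, loss_g
# ------------------------------------------------------------------------------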
327 | 328 | def feature_loss(fmap_r, fmap_g): 329 | loss = 0 330 | for dr, dg in zip(fmap_r, fmap_g): 331 | for rl, gl in zip(dr, dg): 332 | loss += torch.mean(torch.abs(rl - gl)) 333 | 334 | return loss * 2 335 | 336 | 337 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 338 | r_losses = 0 339 | g_losses = 0 340 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 341 | r_loss = torch.mean((1 - dr) ** 2) 342 | g_loss = torch.mean(dg ** 2) 343 | r_losses += r_loss 344 | g_losses += g_loss 345 | r_losses = r_losses / len(disc_real_outputs) 346 | g_losses = g_losses / len(disc_real_outputs) 347 | return r_losses, g_losses 348 | 349 | 350 | def cond_discriminator_loss(outputs): 351 | loss = 0 352 | for dg in outputs: 353 | g_loss = torch.mean(dg ** 2) 354 | loss += g_loss 355 | loss = loss / len(outputs) 356 | return loss 357 | 358 | 359 | def generator_loss(disc_outputs): 360 | loss = 0 361 | for dg in disc_outputs: 362 | l = torch.mean((1 - dg) ** 2) 363 | loss += l 364 | loss = loss / len(disc_outputs) 365 | return loss 366 | -------------------------------------------------------------------------------- /hifigan/modules/hifigan/mel_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.utils.data 4 | from librosa.filters import mel as librosa_mel_fn 5 | from scipy.io.wavfile import read 6 | 7 | MAX_WAV_VALUE = 32768.0 8 | 9 | 10 | def load_wav(full_path): 11 | sampling_rate, data = read(full_path) 12 | return data, sampling_rate 13 | 14 | 15 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 16 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 17 | 18 | 19 | def dynamic_range_decompression(x, C=1): 20 | return np.exp(x) / C 21 | 22 | 23 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 24 | return torch.log(torch.clamp(x, min=clip_val) * C) 25 | 26 | 27 | def dynamic_range_decompression_torch(x, C=1): 28 | return torch.exp(x) / C 29 | 30 | 31 | def spectral_normalize_torch(magnitudes): 32 | output = dynamic_range_compression_torch(magnitudes) 33 | return output 34 | 35 | 36 | def spectral_de_normalize_torch(magnitudes): 37 | output = dynamic_range_decompression_torch(magnitudes) 38 | return output 39 | 40 | 41 | mel_basis = {} 42 | hann_window = {} 43 | 44 | 45 | def mel_spectrogram(y, hparams, center=False, complex=False): 46 | # hop_size: 512 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate) 47 | # win_size: 2048 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate) 48 | # fmin: 55 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 49 | # fmax: 10000 # To be increased/reduced depending on data. 50 | # fft_size: 2048 # Extra window size is filled with 0 paddings to match this parameter 51 | # n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, 52 | n_fft = hparams['fft_size'] 53 | num_mels = hparams['audio_num_mel_bins'] 54 | sampling_rate = hparams['audio_sample_rate'] 55 | hop_size = hparams['hop_size'] 56 | win_size = hparams['win_size'] 57 | fmin = hparams['fmin'] 58 | fmax = hparams['fmax'] 59 | y = y.clamp(min=-1., max=1.) 
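    # (descriptive note, not in the original file) The steps below follow the
    # usual HiFi-GAN recipe: cache a mel filterbank and Hann window per device,
    # pad the waveform by (n_fft - hop_size) / 2 on each side, take the STFT
    # magnitude, project it onto the mel basis and apply log compression,
    # yielding roughly len(y) // hop_size frames.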
60 | global mel_basis, hann_window 61 | if fmax not in mel_basis: 62 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 63 | mel_basis[str(fmax) + '_' + str(y.device)] = torch.from_numpy(mel).float().to(y.device) 64 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 65 | 66 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 67 | mode='reflect') 68 | y = y.squeeze(1) 69 | 70 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], 71 | center=center, pad_mode='reflect', normalized=False, onesided=True) 72 | 73 | if not complex: 74 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 75 | spec = torch.matmul(mel_basis[str(fmax) + '_' + str(y.device)], spec) 76 | spec = spectral_normalize_torch(spec) 77 | else: 78 | B, C, T, _ = spec.shape 79 | spec = spec.transpose(1, 2) # [B, T, n_fft, 2] 80 | return spec 81 | -------------------------------------------------------------------------------- /hifigan/modules/nsf_hifigan/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | class AttrDict(dict): 6 | def __init__(self, *args, **kwargs): 7 | super(AttrDict, self).__init__(*args, **kwargs) 8 | self.__dict__ = self 9 | 10 | 11 | def build_env(config, config_name, path): 12 | t_path = os.path.join(path, config_name) 13 | if config != t_path: 14 | os.makedirs(path, exist_ok=True) 15 | shutil.copyfile(config, os.path.join(path, config_name)) -------------------------------------------------------------------------------- /hifigan/modules/nsf_hifigan/nvSTFT.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | os.environ["LRU_CACHE_CAPACITY"] = "3" 4 | import random 5 | import torch 6 | import torch.utils.data 7 | import numpy as np 8 | import librosa 9 | from librosa.util import normalize 10 | from librosa.filters import mel as librosa_mel_fn 11 | from scipy.io.wavfile import read 12 | import soundfile as sf 13 | 14 | def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False): 15 | sampling_rate = None 16 | try: 17 | data, sampling_rate = sf.read(full_path, always_2d=True)# than soundfile. 18 | except Exception as ex: 19 | print(f"'{full_path}' failed to load.\nException:") 20 | print(ex) 21 | if return_empty_on_exception: 22 | return [], sampling_rate or target_sr or 48000 23 | else: 24 | raise Exception(ex) 25 | 26 | if len(data.shape) > 1: 27 | data = data[:, 0] 28 | assert len(data) > 2# check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension) 29 | 30 | if np.issubdtype(data.dtype, np.integer): # if audio data is type int 31 | max_mag = -np.iinfo(data.dtype).min # maximum magnitude = min possible value of intXX 32 | else: # if audio data is type fp32 33 | max_mag = max(np.amax(data), -np.amin(data)) 34 | max_mag = (2**31)+1 if max_mag > (2**15) else ((2**15)+1 if max_mag > 1.01 else 1.0) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32 35 | 36 | data = torch.FloatTensor(data.astype(np.float32))/max_mag 37 | 38 | if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:# resample will crash with inf/NaN inputs. 
return_empty_on_exception will return empty arr instead of except 39 | return [], sampling_rate or target_sr or 48000 40 | if target_sr is not None and sampling_rate != target_sr: 41 | data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr)) 42 | sampling_rate = target_sr 43 | 44 | return data, sampling_rate 45 | 46 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 47 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 48 | 49 | def dynamic_range_decompression(x, C=1): 50 | return np.exp(x) / C 51 | 52 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 53 | return torch.log(torch.clamp(x, min=clip_val) * C) 54 | 55 | def dynamic_range_decompression_torch(x, C=1): 56 | return torch.exp(x) / C 57 | 58 | class STFT(): 59 | def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5): 60 | self.target_sr = sr 61 | 62 | self.n_mels = n_mels 63 | self.n_fft = n_fft 64 | self.win_size = win_size 65 | self.hop_length = hop_length 66 | self.fmin = fmin 67 | self.fmax = fmax 68 | self.clip_val = clip_val 69 | self.mel_basis = {} 70 | self.hann_window = {} 71 | 72 | def get_mel(self, y, center=False): 73 | sampling_rate = self.target_sr 74 | n_mels = self.n_mels 75 | n_fft = self.n_fft 76 | win_size = self.win_size 77 | hop_length = self.hop_length 78 | fmin = self.fmin 79 | fmax = self.fmax 80 | clip_val = self.clip_val 81 | 82 | if torch.min(y) < -1.: 83 | print('min value is ', torch.min(y)) 84 | if torch.max(y) > 1.: 85 | print('max value is ', torch.max(y)) 86 | 87 | if fmax not in self.mel_basis: 88 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) 89 | self.mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device) 90 | self.hann_window[str(y.device)] = torch.hann_window(self.win_size).to(y.device) 91 | 92 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_length)/2), int((n_fft-hop_length)/2)), mode='reflect') 93 | y = y.squeeze(1) 94 | 95 | spec = torch.stft(y, n_fft, hop_length=hop_length, win_length=win_size, window=self.hann_window[str(y.device)], 96 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 97 | # print(111,spec) 98 | spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) 99 | # print(222,spec) 100 | spec = torch.matmul(self.mel_basis[str(fmax)+'_'+str(y.device)], spec) 101 | # print(333,spec) 102 | spec = dynamic_range_compression_torch(spec, clip_val=clip_val) 103 | # print(444,spec) 104 | return spec 105 | 106 | def __call__(self, audiopath): 107 | audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr) 108 | spect = self.get_mel(audio.unsqueeze(0)).squeeze(0) 109 | return spect 110 | 111 | stft = STFT() -------------------------------------------------------------------------------- /hifigan/modules/nsf_hifigan/utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import matplotlib 4 | import torch 5 | from torch.nn.utils import weight_norm 6 | matplotlib.use("Agg") 7 | import matplotlib.pylab as plt 8 | 9 | 10 | def plot_spectrogram(spectrogram): 11 | fig, ax = plt.subplots(figsize=(10, 2)) 12 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 13 | interpolation='none') 14 | plt.colorbar(im, ax=ax) 15 | 16 | fig.canvas.draw() 17 | plt.close() 18 | 19 | return fig 20 | 21 | 22 | def init_weights(m, mean=0.0, std=0.01): 23 | classname = 
m.__class__.__name__ 24 | if classname.find("Conv") != -1: 25 | m.weight.data.normal_(mean, std) 26 | 27 | 28 | def apply_weight_norm(m): 29 | classname = m.__class__.__name__ 30 | if classname.find("Conv") != -1: 31 | weight_norm(m) 32 | 33 | 34 | def get_padding(kernel_size, dilation=1): 35 | return int((kernel_size*dilation - dilation)/2) 36 | 37 | 38 | def load_checkpoint(filepath, device): 39 | assert os.path.isfile(filepath) 40 | print("Loading '{}'".format(filepath)) 41 | checkpoint_dict = torch.load(filepath, map_location=device) 42 | print("Complete.") 43 | return checkpoint_dict 44 | 45 | 46 | def save_checkpoint(filepath, obj): 47 | print("Saving checkpoint to {}".format(filepath)) 48 | torch.save(obj, filepath) 49 | print("Complete.") 50 | 51 | 52 | def del_old_checkpoints(cp_dir, prefix, n_models=2): 53 | pattern = os.path.join(cp_dir, prefix + '????????') 54 | cp_list = glob.glob(pattern) # get checkpoint paths 55 | cp_list = sorted(cp_list)# sort by iter 56 | if len(cp_list) > n_models: # if more than n_models models are found 57 | for cp in cp_list[:-n_models]:# delete the oldest models other than lastest n_models 58 | open(cp, 'w').close()# empty file contents 59 | os.unlink(cp)# delete file (move to trash when using Colab) 60 | 61 | 62 | def scan_checkpoint(cp_dir, prefix): 63 | pattern = os.path.join(cp_dir, prefix + '????????') 64 | cp_list = glob.glob(pattern) 65 | if len(cp_list) == 0: 66 | return None 67 | return sorted(cp_list)[-1] -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/innnky/glow-svc/15d280a2b87a5938da3cb68e85273102a42b0d75/hifigan/modules/parallel_wavegan/__init__.py -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .causal_conv import * # NOQA 2 | from .pqmf import * # NOQA 3 | from .residual_block import * # NOQA 4 | from hifigan.modules.parallel_wavegan.layers.residual_stack import * # NOQA 5 | from .upsample import * # NOQA 6 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/layers/causal_conv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Causal convolusion layer modules.""" 7 | 8 | 9 | import torch 10 | 11 | 12 | class CausalConv1d(torch.nn.Module): 13 | """CausalConv1d module with customized initialization.""" 14 | 15 | def __init__(self, in_channels, out_channels, kernel_size, 16 | dilation=1, bias=True, pad="ConstantPad1d", pad_params={"value": 0.0}): 17 | """Initialize CausalConv1d module.""" 18 | super(CausalConv1d, self).__init__() 19 | self.pad = getattr(torch.nn, pad)((kernel_size - 1) * dilation, **pad_params) 20 | self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size, 21 | dilation=dilation, bias=bias) 22 | 23 | def forward(self, x): 24 | """Calculate forward propagation. 25 | 26 | Args: 27 | x (Tensor): Input tensor (B, in_channels, T). 28 | 29 | Returns: 30 | Tensor: Output tensor (B, out_channels, T). 
31 | 32 | """ 33 | return self.conv(self.pad(x))[:, :, :x.size(2)] 34 | 35 | 36 | class CausalConvTranspose1d(torch.nn.Module): 37 | """CausalConvTranspose1d module with customized initialization.""" 38 | 39 | def __init__(self, in_channels, out_channels, kernel_size, stride, bias=True): 40 | """Initialize CausalConvTranspose1d module.""" 41 | super(CausalConvTranspose1d, self).__init__() 42 | self.deconv = torch.nn.ConvTranspose1d( 43 | in_channels, out_channels, kernel_size, stride, bias=bias) 44 | self.stride = stride 45 | 46 | def forward(self, x): 47 | """Calculate forward propagation. 48 | 49 | Args: 50 | x (Tensor): Input tensor (B, in_channels, T_in). 51 | 52 | Returns: 53 | Tensor: Output tensor (B, out_channels, T_out). 54 | 55 | """ 56 | return self.deconv(x)[:, :, :-self.stride] 57 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/layers/pqmf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Pseudo QMF modules.""" 7 | 8 | import numpy as np 9 | import torch 10 | import torch.nn.functional as F 11 | 12 | from scipy.signal import kaiser 13 | 14 | 15 | def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0): 16 | """Design prototype filter for PQMF. 17 | 18 | This method is based on `A Kaiser window approach for the design of prototype 19 | filters of cosine modulated filterbanks`_. 20 | 21 | Args: 22 | taps (int): The number of filter taps. 23 | cutoff_ratio (float): Cut-off frequency ratio. 24 | beta (float): Beta coefficient for kaiser window. 25 | 26 | Returns: 27 | ndarray: Impluse response of prototype filter (taps + 1,). 28 | 29 | .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`: 30 | https://ieeexplore.ieee.org/abstract/document/681427 31 | 32 | """ 33 | # check the arguments are valid 34 | assert taps % 2 == 0, "The number of taps mush be even number." 35 | assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0." 36 | 37 | # make initial filter 38 | omega_c = np.pi * cutoff_ratio 39 | with np.errstate(invalid='ignore'): 40 | h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) \ 41 | / (np.pi * (np.arange(taps + 1) - 0.5 * taps)) 42 | h_i[taps // 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form 43 | 44 | # apply kaiser window 45 | w = kaiser(taps + 1, beta) 46 | h = h_i * w 47 | 48 | return h 49 | 50 | 51 | class PQMF(torch.nn.Module): 52 | """PQMF module. 53 | 54 | This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_. 55 | 56 | .. _`Near-perfect-reconstruction pseudo-QMF banks`: 57 | https://ieeexplore.ieee.org/document/258122 58 | 59 | """ 60 | 61 | def __init__(self, subbands=4, taps=62, cutoff_ratio=0.15, beta=9.0): 62 | """Initilize PQMF module. 63 | 64 | Args: 65 | subbands (int): The number of subbands. 66 | taps (int): The number of filter taps. 67 | cutoff_ratio (float): Cut-off frequency ratio. 68 | beta (float): Beta coefficient for kaiser window. 
69 | 70 | """ 71 | super(PQMF, self).__init__() 72 | 73 | # define filter coefficient 74 | h_proto = design_prototype_filter(taps, cutoff_ratio, beta) 75 | h_analysis = np.zeros((subbands, len(h_proto))) 76 | h_synthesis = np.zeros((subbands, len(h_proto))) 77 | for k in range(subbands): 78 | h_analysis[k] = 2 * h_proto * np.cos( 79 | (2 * k + 1) * (np.pi / (2 * subbands)) * 80 | (np.arange(taps + 1) - ((taps - 1) / 2)) + 81 | (-1) ** k * np.pi / 4) 82 | h_synthesis[k] = 2 * h_proto * np.cos( 83 | (2 * k + 1) * (np.pi / (2 * subbands)) * 84 | (np.arange(taps + 1) - ((taps - 1) / 2)) - 85 | (-1) ** k * np.pi / 4) 86 | 87 | # convert to tensor 88 | analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1) 89 | synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0) 90 | 91 | # register coefficients as beffer 92 | self.register_buffer("analysis_filter", analysis_filter) 93 | self.register_buffer("synthesis_filter", synthesis_filter) 94 | 95 | # filter for downsampling & upsampling 96 | updown_filter = torch.zeros((subbands, subbands, subbands)).float() 97 | for k in range(subbands): 98 | updown_filter[k, k, 0] = 1.0 99 | self.register_buffer("updown_filter", updown_filter) 100 | self.subbands = subbands 101 | 102 | # keep padding info 103 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0) 104 | 105 | def analysis(self, x): 106 | """Analysis with PQMF. 107 | 108 | Args: 109 | x (Tensor): Input tensor (B, 1, T). 110 | 111 | Returns: 112 | Tensor: Output tensor (B, subbands, T // subbands). 113 | 114 | """ 115 | x = F.conv1d(self.pad_fn(x), self.analysis_filter) 116 | return F.conv1d(x, self.updown_filter, stride=self.subbands) 117 | 118 | def synthesis(self, x): 119 | """Synthesis with PQMF. 120 | 121 | Args: 122 | x (Tensor): Input tensor (B, subbands, T // subbands). 123 | 124 | Returns: 125 | Tensor: Output tensor (B, 1, T). 126 | 127 | """ 128 | x = F.conv_transpose1d(x, self.updown_filter * self.subbands, stride=self.subbands) 129 | return F.conv1d(self.pad_fn(x), self.synthesis_filter) 130 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/layers/residual_block.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Residual block module in WaveNet. 4 | 5 | This code is modified from https://github.com/r9y9/wavenet_vocoder. 
6 | 7 | """ 8 | 9 | import math 10 | 11 | import torch 12 | import torch.nn.functional as F 13 | 14 | 15 | class Conv1d(torch.nn.Conv1d): 16 | """Conv1d module with customized initialization.""" 17 | 18 | def __init__(self, *args, **kwargs): 19 | """Initialize Conv1d module.""" 20 | super(Conv1d, self).__init__(*args, **kwargs) 21 | 22 | def reset_parameters(self): 23 | """Reset parameters.""" 24 | torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu") 25 | if self.bias is not None: 26 | torch.nn.init.constant_(self.bias, 0.0) 27 | 28 | 29 | class Conv1d1x1(Conv1d): 30 | """1x1 Conv1d with customized initialization.""" 31 | 32 | def __init__(self, in_channels, out_channels, bias): 33 | """Initialize 1x1 Conv1d module.""" 34 | super(Conv1d1x1, self).__init__(in_channels, out_channels, 35 | kernel_size=1, padding=0, 36 | dilation=1, bias=bias) 37 | 38 | 39 | class ResidualBlock(torch.nn.Module): 40 | """Residual block module in WaveNet.""" 41 | 42 | def __init__(self, 43 | kernel_size=3, 44 | residual_channels=64, 45 | gate_channels=128, 46 | skip_channels=64, 47 | aux_channels=80, 48 | dropout=0.0, 49 | dilation=1, 50 | bias=True, 51 | use_causal_conv=False 52 | ): 53 | """Initialize ResidualBlock module. 54 | 55 | Args: 56 | kernel_size (int): Kernel size of dilation convolution layer. 57 | residual_channels (int): Number of channels for residual connection. 58 | skip_channels (int): Number of channels for skip connection. 59 | aux_channels (int): Local conditioning channels i.e. auxiliary input dimension. 60 | dropout (float): Dropout probability. 61 | dilation (int): Dilation factor. 62 | bias (bool): Whether to add bias parameter in convolution layers. 63 | use_causal_conv (bool): Whether to use use_causal_conv or non-use_causal_conv convolution. 64 | 65 | """ 66 | super(ResidualBlock, self).__init__() 67 | self.dropout = dropout 68 | # no future time stamps available 69 | if use_causal_conv: 70 | padding = (kernel_size - 1) * dilation 71 | else: 72 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 73 | padding = (kernel_size - 1) // 2 * dilation 74 | self.use_causal_conv = use_causal_conv 75 | 76 | # dilation conv 77 | self.conv = Conv1d(residual_channels, gate_channels, kernel_size, 78 | padding=padding, dilation=dilation, bias=bias) 79 | 80 | # local conditioning 81 | if aux_channels > 0: 82 | self.conv1x1_aux = Conv1d1x1(aux_channels, gate_channels, bias=False) 83 | else: 84 | self.conv1x1_aux = None 85 | 86 | # conv output is split into two groups 87 | gate_out_channels = gate_channels // 2 88 | self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, bias=bias) 89 | self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_channels, bias=bias) 90 | 91 | def forward(self, x, c): 92 | """Calculate forward propagation. 93 | 94 | Args: 95 | x (Tensor): Input tensor (B, residual_channels, T). 96 | c (Tensor): Local conditioning auxiliary tensor (B, aux_channels, T). 97 | 98 | Returns: 99 | Tensor: Output tensor for residual connection (B, residual_channels, T). 100 | Tensor: Output tensor for skip connection (B, skip_channels, T). 
101 | 102 | """ 103 | residual = x 104 | x = F.dropout(x, p=self.dropout, training=self.training) 105 | x = self.conv(x) 106 | 107 | # remove future time steps if use_causal_conv conv 108 | x = x[:, :, :residual.size(-1)] if self.use_causal_conv else x 109 | 110 | # split into two part for gated activation 111 | splitdim = 1 112 | xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim) 113 | 114 | # local conditioning 115 | if c is not None: 116 | assert self.conv1x1_aux is not None 117 | c = self.conv1x1_aux(c) 118 | ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim) 119 | xa, xb = xa + ca, xb + cb 120 | 121 | x = torch.tanh(xa) * torch.sigmoid(xb) 122 | 123 | # for skip connection 124 | s = self.conv1x1_skip(x) 125 | 126 | # for residual connection 127 | x = (self.conv1x1_out(x) + residual) * math.sqrt(0.5) 128 | 129 | return x, s 130 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/layers/residual_stack.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Residual stack module in MelGAN.""" 7 | 8 | import torch 9 | 10 | from . import CausalConv1d 11 | 12 | 13 | class ResidualStack(torch.nn.Module): 14 | """Residual stack module introduced in MelGAN.""" 15 | 16 | def __init__(self, 17 | kernel_size=3, 18 | channels=32, 19 | dilation=1, 20 | bias=True, 21 | nonlinear_activation="LeakyReLU", 22 | nonlinear_activation_params={"negative_slope": 0.2}, 23 | pad="ReflectionPad1d", 24 | pad_params={}, 25 | use_causal_conv=False, 26 | ): 27 | """Initialize ResidualStack module. 28 | 29 | Args: 30 | kernel_size (int): Kernel size of dilation convolution layer. 31 | channels (int): Number of channels of convolution layers. 32 | dilation (int): Dilation factor. 33 | bias (bool): Whether to add bias parameter in convolution layers. 34 | nonlinear_activation (str): Activation function module name. 35 | nonlinear_activation_params (dict): Hyperparameters for activation function. 36 | pad (str): Padding function module name before dilated convolution layer. 37 | pad_params (dict): Hyperparameters for padding function. 38 | use_causal_conv (bool): Whether to use causal convolution. 39 | 40 | """ 41 | super(ResidualStack, self).__init__() 42 | 43 | # defile residual stack part 44 | if not use_causal_conv: 45 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 46 | self.stack = torch.nn.Sequential( 47 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 48 | getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params), 49 | torch.nn.Conv1d(channels, channels, kernel_size, dilation=dilation, bias=bias), 50 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 51 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 52 | ) 53 | else: 54 | self.stack = torch.nn.Sequential( 55 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 56 | CausalConv1d(channels, channels, kernel_size, dilation=dilation, 57 | bias=bias, pad=pad, pad_params=pad_params), 58 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 59 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 60 | ) 61 | 62 | # defile extra layer for skip connection 63 | self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias) 64 | 65 | def forward(self, c): 66 | """Calculate forward propagation. 
67 | 68 | Args: 69 | c (Tensor): Input tensor (B, channels, T). 70 | 71 | Returns: 72 | Tensor: Output tensor (B, channels, T). 73 | 74 | """ 75 | return self.stack(c) + self.skip_layer(c) 76 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/layers/tf_layers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 MINH ANH (@dathudeptrai) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Tensorflow Layer modules compatible with pytorch.""" 7 | 8 | import tensorflow as tf 9 | 10 | 11 | class TFReflectionPad1d(tf.keras.layers.Layer): 12 | """Tensorflow ReflectionPad1d module.""" 13 | 14 | def __init__(self, padding_size): 15 | """Initialize TFReflectionPad1d module. 16 | 17 | Args: 18 | padding_size (int): Padding size. 19 | 20 | """ 21 | super(TFReflectionPad1d, self).__init__() 22 | self.padding_size = padding_size 23 | 24 | @tf.function 25 | def call(self, x): 26 | """Calculate forward propagation. 27 | 28 | Args: 29 | x (Tensor): Input tensor (B, T, 1, C). 30 | 31 | Returns: 32 | Tensor: Padded tensor (B, T + 2 * padding_size, 1, C). 33 | 34 | """ 35 | return tf.pad(x, [[0, 0], [self.padding_size, self.padding_size], [0, 0], [0, 0]], "REFLECT") 36 | 37 | 38 | class TFConvTranspose1d(tf.keras.layers.Layer): 39 | """Tensorflow ConvTranspose1d module.""" 40 | 41 | def __init__(self, channels, kernel_size, stride, padding): 42 | """Initialize TFConvTranspose1d module. 43 | 44 | Args: 45 | channels (int): Number of channels. 46 | kernel_size (int): Kernel size. 47 | stride (int): Stride width. 48 | padding (str): Padding type ("same" or "valid"). 49 | 50 | """ 51 | super(TFConvTranspose1d, self).__init__() 52 | self.conv1d_transpose = tf.keras.layers.Conv2DTranspose( 53 | filters=channels, 54 | kernel_size=(kernel_size, 1), 55 | strides=(stride, 1), 56 | padding=padding, 57 | ) 58 | 59 | @tf.function 60 | def call(self, x): 61 | """Calculate forward propagation. 62 | 63 | Args: 64 | x (Tensor): Input tensor (B, T, 1, C). 65 | 66 | Returns: 67 | Tensor: Output tensor (B, T', 1, C'). 68 | 69 | """ 70 | x = self.conv1d_transpose(x) 71 | return x 72 | 73 | 74 | class TFResidualStack(tf.keras.layers.Layer): 75 | """Tensorflow ResidualStack module.""" 76 | 77 | def __init__(self, 78 | kernel_size, 79 | channels, 80 | dilation, 81 | bias, 82 | nonlinear_activation, 83 | nonlinear_activation_params, 84 | padding, 85 | ): 86 | """Initialize TFResidualStack module. 87 | 88 | Args: 89 | kernel_size (int): Kernel size. 90 | channels (int): Number of channels. 91 | dilation (int): Dilation factor. 92 | bias (bool): Whether to add bias parameter in convolution layers. 93 | nonlinear_activation (str): Activation function module name. 94 | nonlinear_activation_params (dict): Hyperparameters for activation function. 95 | padding (str): Padding type ("same" or "valid").
96 | 97 | """ 98 | super(TFResidualStack, self).__init__() 99 | self.block = [ 100 | getattr(tf.keras.layers, nonlinear_activation)(**nonlinear_activation_params), 101 | TFReflectionPad1d(dilation), 102 | tf.keras.layers.Conv2D( 103 | filters=channels, 104 | kernel_size=(kernel_size, 1), 105 | dilation_rate=(dilation, 1), 106 | use_bias=bias, 107 | padding="valid", 108 | ), 109 | getattr(tf.keras.layers, nonlinear_activation)(**nonlinear_activation_params), 110 | tf.keras.layers.Conv2D(filters=channels, kernel_size=1, use_bias=bias) 111 | ] 112 | self.shortcut = tf.keras.layers.Conv2D(filters=channels, kernel_size=1, use_bias=bias) 113 | 114 | @tf.function 115 | def call(self, x): 116 | """Calculate forward propagation. 117 | 118 | Args: 119 | x (Tensor): Input tensor (B, T, 1, C). 120 | 121 | Returns: 122 | Tensor: Output tensor (B, T, 1, C). 123 | 124 | """ 125 | _x = tf.identity(x) 126 | for i, layer in enumerate(self.block): 127 | _x = layer(_x) 128 | shortcut = self.shortcut(x) 129 | return shortcut + _x 130 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/layers/upsample.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Upsampling module. 4 | 5 | This code is modified from https://github.com/r9y9/wavenet_vocoder. 6 | 7 | """ 8 | 9 | import numpy as np 10 | import torch 11 | import torch.nn.functional as F 12 | 13 | from . import Conv1d 14 | 15 | 16 | class Stretch2d(torch.nn.Module): 17 | """Stretch2d module.""" 18 | 19 | def __init__(self, x_scale, y_scale, mode="nearest"): 20 | """Initialize Stretch2d module. 21 | 22 | Args: 23 | x_scale (int): X scaling factor (Time axis in spectrogram). 24 | y_scale (int): Y scaling factor (Frequency axis in spectrogram). 25 | mode (str): Interpolation mode. 26 | 27 | """ 28 | super(Stretch2d, self).__init__() 29 | self.x_scale = x_scale 30 | self.y_scale = y_scale 31 | self.mode = mode 32 | 33 | def forward(self, x): 34 | """Calculate forward propagation. 35 | 36 | Args: 37 | x (Tensor): Input tensor (B, C, F, T). 38 | 39 | Returns: 40 | Tensor: Interpolated tensor (B, C, F * y_scale, T * x_scale), 41 | 42 | """ 43 | return F.interpolate( 44 | x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode) 45 | 46 | 47 | class Conv2d(torch.nn.Conv2d): 48 | """Conv2d module with customized initialization.""" 49 | 50 | def __init__(self, *args, **kwargs): 51 | """Initialize Conv2d module.""" 52 | super(Conv2d, self).__init__(*args, **kwargs) 53 | 54 | def reset_parameters(self): 55 | """Reset parameters.""" 56 | self.weight.data.fill_(1. / np.prod(self.kernel_size)) 57 | if self.bias is not None: 58 | torch.nn.init.constant_(self.bias, 0.0) 59 | 60 | 61 | class UpsampleNetwork(torch.nn.Module): 62 | """Upsampling network module.""" 63 | 64 | def __init__(self, 65 | upsample_scales, 66 | nonlinear_activation=None, 67 | nonlinear_activation_params={}, 68 | interpolate_mode="nearest", 69 | freq_axis_kernel_size=1, 70 | use_causal_conv=False, 71 | ): 72 | """Initialize upsampling network module. 73 | 74 | Args: 75 | upsample_scales (list): List of upsampling scales. 76 | nonlinear_activation (str): Activation function name. 77 | nonlinear_activation_params (dict): Arguments for specified activation function. 78 | interpolate_mode (str): Interpolation mode. 79 | freq_axis_kernel_size (int): Kernel size in the direction of frequency axis. 
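            use_causal_conv (bool): Whether to use causal structure.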
80 | 81 | """ 82 | super(UpsampleNetwork, self).__init__() 83 | self.use_causal_conv = use_causal_conv 84 | self.up_layers = torch.nn.ModuleList() 85 | for scale in upsample_scales: 86 | # interpolation layer 87 | stretch = Stretch2d(scale, 1, interpolate_mode) 88 | self.up_layers += [stretch] 89 | 90 | # conv layer 91 | assert (freq_axis_kernel_size - 1) % 2 == 0, "Not support even number freq axis kernel size." 92 | freq_axis_padding = (freq_axis_kernel_size - 1) // 2 93 | kernel_size = (freq_axis_kernel_size, scale * 2 + 1) 94 | if use_causal_conv: 95 | padding = (freq_axis_padding, scale * 2) 96 | else: 97 | padding = (freq_axis_padding, scale) 98 | conv = Conv2d(1, 1, kernel_size=kernel_size, padding=padding, bias=False) 99 | self.up_layers += [conv] 100 | 101 | # nonlinear 102 | if nonlinear_activation is not None: 103 | nonlinear = getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params) 104 | self.up_layers += [nonlinear] 105 | 106 | def forward(self, c): 107 | """Calculate forward propagation. 108 | 109 | Args: 110 | c : Input tensor (B, C, T). 111 | 112 | Returns: 113 | Tensor: Upsampled tensor (B, C, T'), where T' = T * prod(upsample_scales). 114 | 115 | """ 116 | c = c.unsqueeze(1) # (B, 1, C, T) 117 | for f in self.up_layers: 118 | if self.use_causal_conv and isinstance(f, Conv2d): 119 | c = f(c)[..., :c.size(-1)] 120 | else: 121 | c = f(c) 122 | return c.squeeze(1) # (B, C, T') 123 | 124 | 125 | class ConvInUpsampleNetwork(torch.nn.Module): 126 | """Convolution + upsampling network module.""" 127 | 128 | def __init__(self, 129 | upsample_scales, 130 | nonlinear_activation=None, 131 | nonlinear_activation_params={}, 132 | interpolate_mode="nearest", 133 | freq_axis_kernel_size=1, 134 | aux_channels=80, 135 | aux_context_window=0, 136 | use_causal_conv=False 137 | ): 138 | """Initialize convolution + upsampling network module. 139 | 140 | Args: 141 | upsample_scales (list): List of upsampling scales. 142 | nonlinear_activation (str): Activation function name. 143 | nonlinear_activation_params (dict): Arguments for specified activation function. 144 | mode (str): Interpolation mode. 145 | freq_axis_kernel_size (int): Kernel size in the direction of frequency axis. 146 | aux_channels (int): Number of channels of pre-convolutional layer. 147 | aux_context_window (int): Context window size of the pre-convolutional layer. 148 | use_causal_conv (bool): Whether to use causal structure. 149 | 150 | """ 151 | super(ConvInUpsampleNetwork, self).__init__() 152 | self.aux_context_window = aux_context_window 153 | self.use_causal_conv = use_causal_conv and aux_context_window > 0 154 | # To capture wide-context information in conditional features 155 | kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1 156 | # NOTE(kan-bayashi): Here do not use padding because the input is already padded 157 | self.conv_in = Conv1d(aux_channels, aux_channels, kernel_size=kernel_size, bias=False) 158 | self.upsample = UpsampleNetwork( 159 | upsample_scales=upsample_scales, 160 | nonlinear_activation=nonlinear_activation, 161 | nonlinear_activation_params=nonlinear_activation_params, 162 | interpolate_mode=interpolate_mode, 163 | freq_axis_kernel_size=freq_axis_kernel_size, 164 | use_causal_conv=use_causal_conv, 165 | ) 166 | 167 | def forward(self, c): 168 | """Calculate forward propagation. 169 | 170 | Args: 171 | c : Input tensor (B, C, T'). 
172 | 173 | Returns: 174 | Tensor: Upsampled tensor (B, C, T), 175 | where T = (T' - aux_context_window * 2) * prod(upsample_scales). 176 | 177 | Note: 178 | The length of inputs considers the context window size. 179 | 180 | """ 181 | c_ = self.conv_in(c) 182 | c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_ 183 | return self.upsample(c) 184 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .stft_loss import * # NOQA 2 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/losses/stft_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """STFT-based Loss modules.""" 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | def stft(x, fft_size, hop_size, win_length, window): 13 | """Perform STFT and convert to magnitude spectrogram. 14 | 15 | Args: 16 | x (Tensor): Input signal tensor (B, T). 17 | fft_size (int): FFT size. 18 | hop_size (int): Hop size. 19 | win_length (int): Window length. 20 | window (str): Window function type. 21 | 22 | Returns: 23 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). 24 | 25 | """ 26 | x_stft = torch.stft(x, fft_size, hop_size, win_length, window) 27 | real = x_stft[..., 0] 28 | imag = x_stft[..., 1] 29 | 30 | # NOTE(kan-bayashi): clamp is needed to avoid nan or inf 31 | return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1) 32 | 33 | 34 | class SpectralConvergengeLoss(torch.nn.Module): 35 | """Spectral convergence loss module.""" 36 | 37 | def __init__(self): 38 | """Initilize spectral convergence loss module.""" 39 | super(SpectralConvergengeLoss, self).__init__() 40 | 41 | def forward(self, x_mag, y_mag): 42 | """Calculate forward propagation. 43 | 44 | Args: 45 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 46 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 47 | 48 | Returns: 49 | Tensor: Spectral convergence loss value. 50 | 51 | """ 52 | return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro") 53 | 54 | 55 | class LogSTFTMagnitudeLoss(torch.nn.Module): 56 | """Log STFT magnitude loss module.""" 57 | 58 | def __init__(self): 59 | """Initilize los STFT magnitude loss module.""" 60 | super(LogSTFTMagnitudeLoss, self).__init__() 61 | 62 | def forward(self, x_mag, y_mag): 63 | """Calculate forward propagation. 64 | 65 | Args: 66 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 67 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 68 | 69 | Returns: 70 | Tensor: Log STFT magnitude loss value. 
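        Example (illustrative sketch added to this listing, not part of the upstream docstring; the magnitude tensors are assumed strictly positive, as produced by the clamped stft() helper above):

            >>> criterion = LogSTFTMagnitudeLoss()
            >>> x_mag = torch.rand(4, 100, 513) + 1e-7   # predicted magnitudes
            >>> y_mag = torch.rand(4, 100, 513) + 1e-7   # ground-truth magnitudes
            >>> loss = criterion(x_mag, y_mag)           # scalar L1 distance between log spectra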
71 | 72 | """ 73 | return F.l1_loss(torch.log(y_mag), torch.log(x_mag)) 74 | 75 | 76 | class STFTLoss(torch.nn.Module): 77 | """STFT loss module.""" 78 | 79 | def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"): 80 | """Initialize STFT loss module.""" 81 | super(STFTLoss, self).__init__() 82 | self.fft_size = fft_size 83 | self.shift_size = shift_size 84 | self.win_length = win_length 85 | self.window = getattr(torch, window)(win_length) 86 | self.spectral_convergenge_loss = SpectralConvergengeLoss() 87 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() 88 | 89 | def forward(self, x, y): 90 | """Calculate forward propagation. 91 | 92 | Args: 93 | x (Tensor): Predicted signal (B, T). 94 | y (Tensor): Groundtruth signal (B, T). 95 | 96 | Returns: 97 | Tensor: Spectral convergence loss value. 98 | Tensor: Log STFT magnitude loss value. 99 | 100 | """ 101 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) 102 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) 103 | sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) 104 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) 105 | 106 | return sc_loss, mag_loss 107 | 108 | 109 | class MultiResolutionSTFTLoss(torch.nn.Module): 110 | """Multi resolution STFT loss module.""" 111 | 112 | def __init__(self, 113 | fft_sizes=[1024, 2048, 512], 114 | hop_sizes=[120, 240, 50], 115 | win_lengths=[600, 1200, 240], 116 | window="hann_window"): 117 | """Initialize Multi resolution STFT loss module. 118 | 119 | Args: 120 | fft_sizes (list): List of FFT sizes. 121 | hop_sizes (list): List of hop sizes. 122 | win_lengths (list): List of window lengths. 123 | window (str): Window function type. 124 | 125 | """ 126 | super(MultiResolutionSTFTLoss, self).__init__() 127 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) 128 | self.stft_losses = torch.nn.ModuleList() 129 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): 130 | self.stft_losses += [STFTLoss(fs, ss, wl, window)] 131 | 132 | def forward(self, x, y): 133 | """Calculate forward propagation. 134 | 135 | Args: 136 | x (Tensor): Predicted signal (B, T). 137 | y (Tensor): Groundtruth signal (B, T). 138 | 139 | Returns: 140 | Tensor: Multi resolution spectral convergence loss value. 141 | Tensor: Multi resolution log STFT magnitude loss value. 142 | 143 | """ 144 | sc_loss = 0.0 145 | mag_loss = 0.0 146 | for f in self.stft_losses: 147 | sc_l, mag_l = f(x, y) 148 | sc_loss += sc_l 149 | mag_loss += mag_l 150 | sc_loss /= len(self.stft_losses) 151 | mag_loss /= len(self.stft_losses) 152 | 153 | return sc_loss, mag_loss 154 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .melgan import * # NOQA 2 | from .parallel_wavegan import * # NOQA 3 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from torch.optim import * # NOQA 2 | from .radam import * # NOQA 3 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/optimizers/radam.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """RAdam optimizer. 
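Implements the variance-rectified Adam update proposed by Liu et al. in "On the Variance of the Adaptive Learning Rate and Beyond" (ICLR 2020).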
4 | 5 | This code is drived from https://github.com/LiyuanLucasLiu/RAdam. 6 | """ 7 | 8 | import math 9 | import torch 10 | 11 | from torch.optim.optimizer import Optimizer 12 | 13 | 14 | class RAdam(Optimizer): 15 | """Rectified Adam optimizer.""" 16 | 17 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 18 | """Initilize RAdam optimizer.""" 19 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 20 | self.buffer = [[None, None, None] for ind in range(10)] 21 | super(RAdam, self).__init__(params, defaults) 22 | 23 | def __setstate__(self, state): 24 | """Set state.""" 25 | super(RAdam, self).__setstate__(state) 26 | 27 | def step(self, closure=None): 28 | """Run one step.""" 29 | loss = None 30 | if closure is not None: 31 | loss = closure() 32 | 33 | for group in self.param_groups: 34 | 35 | for p in group['params']: 36 | if p.grad is None: 37 | continue 38 | grad = p.grad.data.float() 39 | if grad.is_sparse: 40 | raise RuntimeError('RAdam does not support sparse gradients') 41 | 42 | p_data_fp32 = p.data.float() 43 | 44 | state = self.state[p] 45 | 46 | if len(state) == 0: 47 | state['step'] = 0 48 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 49 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 50 | else: 51 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 52 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 53 | 54 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 55 | beta1, beta2 = group['betas'] 56 | 57 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 58 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 59 | 60 | state['step'] += 1 61 | buffered = self.buffer[int(state['step'] % 10)] 62 | if state['step'] == buffered[0]: 63 | N_sma, step_size = buffered[1], buffered[2] 64 | else: 65 | buffered[0] = state['step'] 66 | beta2_t = beta2 ** state['step'] 67 | N_sma_max = 2 / (1 - beta2) - 1 68 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 69 | buffered[1] = N_sma 70 | 71 | # more conservative since it's an approximated value 72 | if N_sma >= 5: 73 | step_size = math.sqrt( 74 | (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) # NOQA 75 | else: 76 | step_size = 1.0 / (1 - beta1 ** state['step']) 77 | buffered[2] = step_size 78 | 79 | if group['weight_decay'] != 0: 80 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 81 | 82 | # more conservative since it's an approximated value 83 | if N_sma >= 5: 84 | denom = exp_avg_sq.sqrt().add_(group['eps']) 85 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 86 | else: 87 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 88 | 89 | p.data.copy_(p_data_fp32) 90 | 91 | return loss 92 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/stft_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """STFT-based Loss modules.""" 7 | import librosa 8 | import torch 9 | 10 | from modules.parallel_wavegan.losses import LogSTFTMagnitudeLoss, SpectralConvergengeLoss, stft 11 | 12 | 13 | class STFTLoss(torch.nn.Module): 14 | """STFT loss module.""" 15 | 16 | def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window", 17 | use_mel_loss=False): 18 | """Initialize STFT 
loss module.""" 19 | super(STFTLoss, self).__init__() 20 | self.fft_size = fft_size 21 | self.shift_size = shift_size 22 | self.win_length = win_length 23 | self.window = getattr(torch, window)(win_length) 24 | self.spectral_convergenge_loss = SpectralConvergengeLoss() 25 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() 26 | self.use_mel_loss = use_mel_loss 27 | self.mel_basis = None 28 | 29 | def forward(self, x, y): 30 | """Calculate forward propagation. 31 | 32 | Args: 33 | x (Tensor): Predicted signal (B, T). 34 | y (Tensor): Groundtruth signal (B, T). 35 | 36 | Returns: 37 | Tensor: Spectral convergence loss value. 38 | Tensor: Log STFT magnitude loss value. 39 | 40 | """ 41 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) 42 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) 43 | if self.use_mel_loss: 44 | if self.mel_basis is None: 45 | self.mel_basis = torch.from_numpy(librosa.filters.mel(22050, self.fft_size, 80)).cuda().T 46 | x_mag = x_mag @ self.mel_basis 47 | y_mag = y_mag @ self.mel_basis 48 | 49 | sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) 50 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) 51 | 52 | return sc_loss, mag_loss 53 | 54 | 55 | class MultiResolutionSTFTLoss(torch.nn.Module): 56 | """Multi resolution STFT loss module.""" 57 | 58 | def __init__(self, 59 | fft_sizes=[1024, 2048, 512], 60 | hop_sizes=[120, 240, 50], 61 | win_lengths=[600, 1200, 240], 62 | window="hann_window", 63 | use_mel_loss=False): 64 | """Initialize Multi resolution STFT loss module. 65 | 66 | Args: 67 | fft_sizes (list): List of FFT sizes. 68 | hop_sizes (list): List of hop sizes. 69 | win_lengths (list): List of window lengths. 70 | window (str): Window function type. 71 | 72 | """ 73 | super(MultiResolutionSTFTLoss, self).__init__() 74 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) 75 | self.stft_losses = torch.nn.ModuleList() 76 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): 77 | self.stft_losses += [STFTLoss(fs, ss, wl, window, use_mel_loss)] 78 | 79 | def forward(self, x, y): 80 | """Calculate forward propagation. 81 | 82 | Args: 83 | x (Tensor): Predicted signal (B, T). 84 | y (Tensor): Groundtruth signal (B, T). 85 | 86 | Returns: 87 | Tensor: Multi resolution spectral convergence loss value. 88 | Tensor: Multi resolution log STFT magnitude loss value. 89 | 90 | """ 91 | sc_loss = 0.0 92 | mag_loss = 0.0 93 | for f in self.stft_losses: 94 | sc_l, mag_l = f(x, y) 95 | sc_loss += sc_l 96 | mag_loss += mag_l 97 | sc_loss /= len(self.stft_losses) 98 | mag_loss /= len(self.stft_losses) 99 | 100 | return sc_loss, mag_loss 101 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * # NOQA 2 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/utils/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Utility functions.""" 7 | 8 | import fnmatch 9 | import logging 10 | import os 11 | import sys 12 | 13 | import h5py 14 | import numpy as np 15 | 16 | 17 | def find_files(root_dir, query="*.wav", include_root_dir=True): 18 | """Find files recursively. 
19 | 20 | Args: 21 | root_dir (str): Root directory to search. 22 | query (str): Query to find. 23 | include_root_dir (bool): If False, root_dir name is not included. 24 | 25 | Returns: 26 | list: List of found filenames. 27 | 28 | """ 29 | files = [] 30 | for root, dirnames, filenames in os.walk(root_dir, followlinks=True): 31 | for filename in fnmatch.filter(filenames, query): 32 | files.append(os.path.join(root, filename)) 33 | if not include_root_dir: 34 | files = [file_.replace(root_dir + "/", "") for file_ in files] 35 | 36 | return files 37 | 38 | 39 | def read_hdf5(hdf5_name, hdf5_path): 40 | """Read hdf5 dataset. 41 | 42 | Args: 43 | hdf5_name (str): Filename of hdf5 file. 44 | hdf5_path (str): Dataset name in hdf5 file. 45 | 46 | Returns: 47 | any: Dataset values. 48 | 49 | """ 50 | if not os.path.exists(hdf5_name): 51 | logging.error(f"There is no such hdf5 file ({hdf5_name}).") 52 | sys.exit(1) 53 | 54 | hdf5_file = h5py.File(hdf5_name, "r") 55 | 56 | if hdf5_path not in hdf5_file: 57 | logging.error(f"There is no such data in the hdf5 file ({hdf5_path}).") 58 | sys.exit(1) 59 | 60 | hdf5_data = hdf5_file[hdf5_path][()] 61 | hdf5_file.close() 62 | 63 | return hdf5_data 64 | 65 | 66 | def write_hdf5(hdf5_name, hdf5_path, write_data, is_overwrite=True): 67 | """Write dataset to hdf5. 68 | 69 | Args: 70 | hdf5_name (str): Hdf5 dataset filename. 71 | hdf5_path (str): Dataset path in hdf5. 72 | write_data (ndarray): Data to write. 73 | is_overwrite (bool): Whether to overwrite dataset. 74 | 75 | """ 76 | # convert to numpy array 77 | write_data = np.array(write_data) 78 | 79 | # check folder existence 80 | folder_name, _ = os.path.split(hdf5_name) 81 | if not os.path.exists(folder_name) and len(folder_name) != 0: 82 | os.makedirs(folder_name) 83 | 84 | # check hdf5 existence 85 | if os.path.exists(hdf5_name): 86 | # if already exists, open with r+ mode 87 | hdf5_file = h5py.File(hdf5_name, "r+") 88 | # check dataset existence 89 | if hdf5_path in hdf5_file: 90 | if is_overwrite: 91 | logging.warning("Dataset in hdf5 file already exists. " 92 | "Recreating dataset in hdf5.") 93 | hdf5_file.__delitem__(hdf5_path) 94 | else: 95 | logging.error("Dataset in hdf5 file already exists. " 96 | "If you want to overwrite, please set is_overwrite = True.") 97 | hdf5_file.close() 98 | sys.exit(1) 99 | else: 100 | # if not exists, open with w mode 101 | hdf5_file = h5py.File(hdf5_name, "w") 102 | 103 | # write data to hdf5 104 | hdf5_file.create_dataset(hdf5_path, data=write_data) 105 | hdf5_file.flush() 106 | hdf5_file.close() 107 | 108 | 109 | class HDF5ScpLoader(object): 110 | """Loader class for a feats.scp file of hdf5 files. 111 | 112 | Examples: 113 | key1 /some/path/a.h5:feats 114 | key2 /some/path/b.h5:feats 115 | key3 /some/path/c.h5:feats 116 | key4 /some/path/d.h5:feats 117 | ... 118 | >>> loader = HDF5ScpLoader("hdf5.scp") 119 | >>> array = loader["key1"] 120 | 121 | key1 /some/path/a.h5 122 | key2 /some/path/b.h5 123 | key3 /some/path/c.h5 124 | key4 /some/path/d.h5 125 | ... 126 | >>> loader = HDF5ScpLoader("hdf5.scp", "feats") 127 | >>> array = loader["key1"] 128 | 129 | """ 130 | 131 | def __init__(self, feats_scp, default_hdf5_path="feats"): 132 | """Initialize HDF5 scp loader. 133 | 134 | Args: 135 | feats_scp (str): Kaldi-style feats.scp file with hdf5 format. 136 | default_hdf5_path (str): Path in hdf5 file. Not used if the scp file already contains it.
137 | 138 | """ 139 | self.default_hdf5_path = default_hdf5_path 140 | with open(feats_scp, encoding='utf-8') as f: 141 | lines = [line.replace("\n", "") for line in f.readlines()] 142 | self.data = {} 143 | for line in lines: 144 | key, value = line.split() 145 | self.data[key] = value 146 | 147 | def get_path(self, key): 148 | """Get hdf5 file path for a given key.""" 149 | return self.data[key] 150 | 151 | def __getitem__(self, key): 152 | """Get ndarray for a given key.""" 153 | p = self.data[key] 154 | if ":" in p: 155 | return read_hdf5(*p.split(":")) 156 | else: 157 | return read_hdf5(p, self.default_hdf5_path) 158 | 159 | def __len__(self): 160 | """Return the length of the scp file.""" 161 | return len(self.data) 162 | 163 | def __iter__(self): 164 | """Return the iterator of the scp file.""" 165 | return iter(self.data) 166 | 167 | def keys(self): 168 | """Return the keys of the scp file.""" 169 | return self.data.keys() 170 | -------------------------------------------------------------------------------- /hifigan/network/vocoders/__init__.py: -------------------------------------------------------------------------------- 1 | from hifigan.network.vocoders import hifigan 2 | from hifigan.network.vocoders import nsf_hifigan 3 | -------------------------------------------------------------------------------- /hifigan/network/vocoders/base_vocoder.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | VOCODERS = {} 3 | 4 | 5 | def register_vocoder(cls): 6 | VOCODERS[cls.__name__.lower()] = cls 7 | VOCODERS[cls.__name__] = cls 8 | return cls 9 | 10 | 11 | def get_vocoder_cls(hparams): 12 | if hparams['vocoder'] in VOCODERS: 13 | return VOCODERS[hparams['vocoder']] 14 | else: 15 | vocoder_cls = hparams['vocoder'] 16 | pkg = ".".join(vocoder_cls.split(".")[:-1]) 17 | cls_name = vocoder_cls.split(".")[-1] 18 | vocoder_cls = getattr(importlib.import_module(pkg), cls_name) 19 | return vocoder_cls 20 | 21 | 22 | class BaseVocoder: 23 | def spec2wav(self, mel): 24 | """ 25 | 26 | :param mel: [T, 80] 27 | :return: wav: [T'] 28 | """ 29 | 30 | raise NotImplementedError 31 | 32 | @staticmethod 33 | def wav2spec(wav_fn): 34 | """ 35 | 36 | :param wav_fn: str 37 | :return: wav, mel: [T, 80] 38 | """ 39 | raise NotImplementedError 40 | -------------------------------------------------------------------------------- /hifigan/network/vocoders/hifigan.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os 4 | import re 5 | 6 | import librosa 7 | import torch 8 | 9 | import utils 10 | from hifigan.modules.hifigan.hifigan import HifiGanGenerator 11 | from hifigan.network.vocoders.base_vocoder import register_vocoder 12 | from hifigan.network.vocoders.pwg import PWG 13 | from hifigan.network.vocoders.vocoder_utils import denoise 14 | 15 | 16 | def load_model(config_path, file_path): 17 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 18 | ext = os.path.splitext(file_path)[-1] 19 | if ext == '.pth': 20 | if '.yaml' in config_path: 21 | config = set_hparams(config_path, global_hparams=False) 22 | elif '.json' in config_path: 23 | config = json.load(open(config_path, 'r', encoding='utf-8')) 24 | model = torch.load(file_path, map_location="cpu") 25 | elif ext == '.ckpt': 26 | ckpt_dict = torch.load(file_path, map_location="cpu") 27 | if '.yaml' in config_path: 28 | config = set_hparams(config_path, global_hparams=False) 29 | state = 
ckpt_dict["state_dict"]["model_gen"] 30 | elif '.json' in config_path: 31 | config = json.load(open(config_path, 'r', encoding='utf-8')) 32 | state = ckpt_dict["generator"] 33 | model = HifiGanGenerator(config) 34 | model.load_state_dict(state, strict=True) 35 | model.remove_weight_norm() 36 | model = model.eval().to(device) 37 | print(f"| Loaded model parameters from {file_path}.") 38 | print(f"| HifiGAN device: {device}.") 39 | return model, config, device 40 | 41 | 42 | total_time = 0 43 | 44 | 45 | @register_vocoder 46 | class HifiGAN(PWG): 47 | def __init__(self): 48 | base_dir = hparams['vocoder_ckpt'] 49 | config_path = f'{base_dir}/config.yaml' 50 | if os.path.exists(config_path): 51 | file_path = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.*'), key= 52 | lambda x: int(re.findall(f'{base_dir}/model_ckpt_steps_(\d+).*', x.replace('\\','/'))[0]))[-1] 53 | print('| load HifiGAN: ', file_path) 54 | self.model, self.config, self.device = load_model(config_path=config_path, file_path=file_path) 55 | else: 56 | config_path = f'{base_dir}/config.json' 57 | ckpt = f'{base_dir}/generator_v1' 58 | if os.path.exists(config_path): 59 | self.model, self.config, self.device = load_model(config_path=config_path, file_path=ckpt) 60 | 61 | def spec2wav(self, mel, **kwargs): 62 | device = self.device 63 | with torch.no_grad(): 64 | c = torch.FloatTensor(mel).unsqueeze(0).transpose(2, 1).to(device) 65 | with utils.Timer('hifigan', print_time=hparams['profile_infer']): 66 | f0 = kwargs.get('f0') 67 | if f0 is not None and hparams.get('use_nsf'): 68 | f0 = torch.FloatTensor(f0[None, :]).to(device) 69 | y = self.model(c, f0).view(-1) 70 | else: 71 | y = self.model(c).view(-1) 72 | wav_out = y.cpu().numpy() 73 | if hparams.get('vocoder_denoise_c', 0.0) > 0: 74 | wav_out = denoise(wav_out, v=hparams['vocoder_denoise_c']) 75 | return wav_out 76 | 77 | # @staticmethod 78 | # def wav2spec(wav_fn, **kwargs): 79 | # wav, _ = librosa.core.load(wav_fn, sr=hparams['audio_sample_rate']) 80 | # wav_torch = torch.FloatTensor(wav)[None, :] 81 | # mel = mel_spectrogram(wav_torch, hparams).numpy()[0] 82 | # return wav, mel.T 83 | -------------------------------------------------------------------------------- /hifigan/network/vocoders/nsf_hifigan.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from hifigan.modules.nsf_hifigan.models import load_model, Generator 4 | from hifigan.modules.nsf_hifigan.nvSTFT import load_wav_to_torch, STFT 5 | from hifigan.network.vocoders.base_vocoder import BaseVocoder, register_vocoder 6 | 7 | @register_vocoder 8 | class NsfHifiGAN(BaseVocoder): 9 | def __init__(self, device=None): 10 | if device is None: 11 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 12 | self.device = device 13 | model_path = "pretrain/nsf-hifigan/model" 14 | if os.path.exists(model_path): 15 | print('| Load HifiGAN: ', model_path) 16 | self.model, self.h = load_model(model_path, device=self.device) 17 | else: 18 | print('Error: HifiGAN model file not found!') 19 | 20 | def spec2wav_torch(self, mel, **kwargs): # mel: [B, T, bins] 21 | if self.h.sampling_rate != self.h['audio_sample_rate']: 22 | print('Mismatch parameters: self.h[\'audio_sample_rate\']=',self.h['audio_sample_rate'],'!=',self.h.sampling_rate,'(vocoder)') 23 | if self.h.num_mels != self.h['audio_num_mel_bins']: 24 | print('Mismatch parameters: self.h[\'audio_num_mel_bins\']=',self.h['audio_num_mel_bins'],'!=',self.h.num_mels,'(vocoder)') 25 | if self.h.n_fft 
!= self.h['fft_size']: 26 | print('Mismatch parameters: self.h[\'fft_size\']=',self.h['fft_size'],'!=',self.h.n_fft,'(vocoder)') 27 | if self.h.win_size != self.h['win_size']: 28 | print('Mismatch parameters: self.h[\'win_size\']=',self.h['win_size'],'!=',self.h.win_size,'(vocoder)') 29 | if self.h.hop_size != self.h['hop_size']: 30 | print('Mismatch parameters: self.h[\'hop_size\']=',self.h['hop_size'],'!=',self.h.hop_size,'(vocoder)') 31 | if self.h.fmin != self.h['fmin']: 32 | print('Mismatch parameters: self.h[\'fmin\']=',self.h['fmin'],'!=',self.h.fmin,'(vocoder)') 33 | if self.h.fmax != self.h['fmax']: 34 | print('Mismatch parameters: self.h[\'fmax\']=',self.h['fmax'],'!=',self.h.fmax,'(vocoder)') 35 | with torch.no_grad(): 36 | c = mel.transpose(2, 1) #[B, T, bins] 37 | #log10 to log mel 38 | c = 2.30259 * c 39 | f0 = kwargs.get('f0') #[B, T] 40 | if f0 is not None and self.h.get('use_nsf'): 41 | y = self.model(c, f0).view(-1) 42 | else: 43 | y = self.model(c).view(-1) 44 | return y 45 | 46 | def spec2wav(self, mel, **kwargs): 47 | with torch.no_grad(): 48 | c = torch.FloatTensor(mel).unsqueeze(0).transpose(2, 1).to(self.device) 49 | #log10 to log mel 50 | c = 2.30259 * c 51 | f0 = kwargs.get('f0') 52 | if f0 is not None : 53 | f0 = torch.FloatTensor(f0[None, :]).to(self.device) 54 | y = self.model(c, f0).view(-1) 55 | wav_out = y.cpu().numpy() 56 | return wav_out 57 | 58 | def decode(self, mel, f0) -> torch.Tensor: 59 | with torch.no_grad(): 60 | c = mel.to(self.device) 61 | #log10 to log mel 62 | c = 2.30259 * c 63 | f0 = f0.to(self.device) 64 | y = self.model(c, f0).view(-1) 65 | wav_out = y.cpu().numpy() 66 | return wav_out 67 | def wav2spec(self, inp_path): 68 | assert inp_path.endswith('.wav') 69 | save_path = inp_path.replace(".wav", ".mel.pt") 70 | if os.path.exists(save_path): 71 | return torch.load(save_path) 72 | sampling_rate = self.h['sampling_rate'] 73 | num_mels = self.h['num_mels'] 74 | n_fft = self.h['n_fft'] 75 | win_size =self.h['win_size'] 76 | hop_size = self.h['hop_size'] 77 | fmin = self.h['fmin'] 78 | fmax = self.h['fmax'] 79 | stft = STFT(sampling_rate, num_mels, n_fft, win_size, hop_size, fmin, fmax) 80 | with torch.no_grad(): 81 | wav_torch, _ = load_wav_to_torch(inp_path, target_sr=stft.target_sr) 82 | mel_torch = stft.get_mel(wav_torch.unsqueeze(0)).squeeze(0).T 83 | #log mel to log10 mel 84 | mel_torch = 0.434294 * mel_torch.T 85 | torch.save(mel_torch, save_path) 86 | return mel_torch -------------------------------------------------------------------------------- /hifigan/network/vocoders/pwg.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import re 3 | import librosa 4 | import torch 5 | import yaml 6 | from sklearn.preprocessing import StandardScaler 7 | from torch import nn 8 | from hifigan.modules.parallel_wavegan.models import ParallelWaveGANGenerator 9 | from hifigan.modules.parallel_wavegan.utils import read_hdf5 10 | from hifigan.network.vocoders.base_vocoder import BaseVocoder, register_vocoder 11 | import numpy as np 12 | 13 | 14 | def load_pwg_model(config_path, checkpoint_path, stats_path): 15 | # load config 16 | with open(config_path, encoding='utf-8') as f: 17 | config = yaml.load(f, Loader=yaml.Loader) 18 | 19 | # setup 20 | if torch.cuda.is_available(): 21 | device = torch.device("cuda") 22 | else: 23 | device = torch.device("cpu") 24 | model = ParallelWaveGANGenerator(**config["generator_params"]) 25 | 26 | ckpt_dict = torch.load(checkpoint_path, map_location="cpu") 27 | if 
'state_dict' not in ckpt_dict: # official vocoder 28 | model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")["model"]["generator"]) 29 | scaler = StandardScaler() 30 | if config["format"] == "hdf5": 31 | scaler.mean_ = read_hdf5(stats_path, "mean") 32 | scaler.scale_ = read_hdf5(stats_path, "scale") 33 | elif config["format"] == "npy": 34 | scaler.mean_ = np.load(stats_path)[0] 35 | scaler.scale_ = np.load(stats_path)[1] 36 | else: 37 | raise ValueError("support only hdf5 or npy format.") 38 | else: # custom PWG vocoder 39 | fake_task = nn.Module() 40 | fake_task.model_gen = model 41 | fake_task.load_state_dict(torch.load(checkpoint_path, map_location="cpu")["state_dict"], strict=False) 42 | scaler = None 43 | 44 | model.remove_weight_norm() 45 | model = model.eval().to(device) 46 | print(f"| Loaded model parameters from {checkpoint_path}.") 47 | print(f"| PWG device: {device}.") 48 | return model, scaler, config, device 49 | 50 | 51 | @register_vocoder 52 | class PWG(BaseVocoder): 53 | def __init__(self): 54 | if hparams['vocoder_ckpt'] == '': # load LJSpeech PWG pretrained model 55 | base_dir = 'wavegan_pretrained' 56 | ckpts = glob.glob(f'{base_dir}/checkpoint-*steps.pkl') 57 | ckpt = sorted(ckpts, key= 58 | lambda x: int(re.findall(f'{base_dir}/checkpoint-(\d+)steps.pkl', x)[0]))[-1] 59 | config_path = f'{base_dir}/config.yaml' 60 | print('| load PWG: ', ckpt) 61 | self.model, self.scaler, self.config, self.device = load_pwg_model( 62 | config_path=config_path, 63 | checkpoint_path=ckpt, 64 | stats_path=f'{base_dir}/stats.h5', 65 | ) 66 | else: 67 | base_dir = hparams['vocoder_ckpt'] 68 | print(base_dir) 69 | config_path = f'{base_dir}/config.yaml' 70 | ckpt = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.ckpt'), key= 71 | lambda x: int(re.findall(f'{base_dir}/model_ckpt_steps_(\d+).ckpt', x)[0]))[-1] 72 | print('| load PWG: ', ckpt) 73 | self.scaler = None 74 | self.model, _, self.config, self.device = load_pwg_model( 75 | config_path=config_path, 76 | checkpoint_path=ckpt, 77 | stats_path=f'{base_dir}/stats.h5', 78 | ) 79 | 80 | def spec2wav(self, mel, **kwargs): 81 | # start generation 82 | config = self.config 83 | device = self.device 84 | pad_size = (config["generator_params"]["aux_context_window"], 85 | config["generator_params"]["aux_context_window"]) 86 | c = mel 87 | if self.scaler is not None: 88 | c = self.scaler.transform(c) 89 | 90 | with torch.no_grad(): 91 | z = torch.randn(1, 1, c.shape[0] * config["hop_size"]).to(device) 92 | c = np.pad(c, (pad_size, (0, 0)), "edge") 93 | c = torch.FloatTensor(c).unsqueeze(0).transpose(2, 1).to(device) 94 | p = kwargs.get('f0') 95 | if p is not None: 96 | p = f0_to_coarse(p) 97 | p = np.pad(p, (pad_size,), "edge") 98 | p = torch.LongTensor(p[None, :]).to(device) 99 | y = self.model(z, c, p).view(-1) 100 | wav_out = y.cpu().numpy() 101 | return wav_out 102 | 103 | @staticmethod 104 | def wav2spec(wav_fn, return_linear=False): 105 | from preprocessing.data_gen_utils import process_utterance 106 | res = process_utterance( 107 | wav_fn, fft_size=hparams['fft_size'], 108 | hop_size=hparams['hop_size'], 109 | win_length=hparams['win_size'], 110 | num_mels=hparams['audio_num_mel_bins'], 111 | fmin=hparams['fmin'], 112 | fmax=hparams['fmax'], 113 | sample_rate=hparams['audio_sample_rate'], 114 | loud_norm=hparams['loud_norm'], 115 | min_level_db=hparams['min_level_db'], 116 | return_linear=return_linear, vocoder='pwg', eps=float(hparams.get('wav2spec_eps', 1e-10))) 117 | if return_linear: 118 | return res[0], res[1].T, 
res[2].T # [T, 80], [T, n_fft] 119 | else: 120 | return res[0], res[1].T 121 | 122 | @staticmethod 123 | def wav2mfcc(wav_fn): 124 | fft_size = hparams['fft_size'] 125 | hop_size = hparams['hop_size'] 126 | win_length = hparams['win_size'] 127 | sample_rate = hparams['audio_sample_rate'] 128 | wav, _ = librosa.core.load(wav_fn, sr=sample_rate) 129 | mfcc = librosa.feature.mfcc(y=wav, sr=sample_rate, n_mfcc=13, 130 | n_fft=fft_size, hop_length=hop_size, 131 | win_length=win_length, pad_mode="constant", power=1.0) 132 | mfcc_delta = librosa.feature.delta(mfcc, order=1) 133 | mfcc_delta_delta = librosa.feature.delta(mfcc, order=2) 134 | mfcc = np.concatenate([mfcc, mfcc_delta, mfcc_delta_delta]).T 135 | return mfcc 136 | -------------------------------------------------------------------------------- /hifigan/network/vocoders/vocoder_utils.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | 3 | import numpy as np 4 | 5 | 6 | def denoise(wav, v=0.1): 7 | spec = librosa.stft(y=wav, n_fft=hparams['fft_size'], hop_length=hparams['hop_size'], 8 | win_length=hparams['win_size'], pad_mode='constant') 9 | spec_m = np.abs(spec) 10 | spec_m = np.clip(spec_m - v, a_min=0, a_max=None) 11 | spec_a = np.angle(spec) 12 | 13 | return librosa.istft(spec_m * np.exp(1j * spec_a), hop_length=hparams['hop_size'], 14 | win_length=hparams['win_size']) 15 | -------------------------------------------------------------------------------- /mel_processing.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | os.environ["LRU_CACHE_CAPACITY"] = "3" 4 | import random 5 | import torch 6 | import torch.utils.data 7 | import numpy as np 8 | import librosa 9 | from librosa.util import normalize 10 | from librosa.filters import mel as librosa_mel_fn 11 | from scipy.io.wavfile import read 12 | import soundfile as sf 13 | 14 | def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False): 15 | sampling_rate = None 16 | try: 17 | data, sampling_rate = sf.read(full_path, always_2d=True)# than soundfile. 18 | except Exception as ex: 19 | print(f"'{full_path}' failed to load.\nException:") 20 | print(ex) 21 | if return_empty_on_exception: 22 | return [], sampling_rate or target_sr or 48000 23 | else: 24 | raise Exception(ex) 25 | 26 | if len(data.shape) > 1: 27 | data = data[:, 0] 28 | assert len(data) > 2# check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension) 29 | 30 | if np.issubdtype(data.dtype, np.integer): # if audio data is type int 31 | max_mag = -np.iinfo(data.dtype).min # maximum magnitude = min possible value of intXX 32 | else: # if audio data is type fp32 33 | max_mag = max(np.amax(data), -np.amin(data)) 34 | max_mag = (2**31)+1 if max_mag > (2**15) else ((2**15)+1 if max_mag > 1.01 else 1.0) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32 35 | 36 | data = torch.FloatTensor(data.astype(np.float32))/max_mag 37 | 38 | if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:# resample will crash with inf/NaN inputs. 
return_empty_on_exception will return empty arr instead of except 39 | return [], sampling_rate or target_sr or 48000 40 | if target_sr is not None and sampling_rate != target_sr: 41 | data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr)) 42 | sampling_rate = target_sr 43 | 44 | return data, sampling_rate 45 | 46 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 47 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 48 | 49 | def dynamic_range_decompression(x, C=1): 50 | return np.exp(x) / C 51 | 52 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 53 | return torch.log(torch.clamp(x, min=clip_val) * C) 54 | 55 | def dynamic_range_decompression_torch(x, C=1): 56 | return torch.exp(x) / C 57 | 58 | class STFT(): 59 | def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5): 60 | self.target_sr = sr 61 | 62 | self.n_mels = n_mels 63 | self.n_fft = n_fft 64 | self.win_size = win_size 65 | self.hop_length = hop_length 66 | self.fmin = fmin 67 | self.fmax = fmax 68 | self.clip_val = clip_val 69 | self.mel_basis = {} 70 | self.hann_window = {} 71 | 72 | def get_mel(self, y, center=False): 73 | sampling_rate = self.target_sr 74 | n_mels = self.n_mels 75 | n_fft = self.n_fft 76 | win_size = self.win_size 77 | hop_length = self.hop_length 78 | fmin = self.fmin 79 | fmax = self.fmax 80 | clip_val = self.clip_val 81 | 82 | if torch.min(y) < -1.: 83 | print('min value is ', torch.min(y)) 84 | if torch.max(y) > 1.: 85 | print('max value is ', torch.max(y)) 86 | 87 | if fmax not in self.mel_basis: 88 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) 89 | self.mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device) 90 | self.hann_window[str(y.device)] = torch.hann_window(self.win_size).to(y.device) 91 | 92 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_length)/2), int((n_fft-hop_length)/2)), mode='reflect') 93 | y = y.squeeze(1) 94 | 95 | spec = torch.stft(y, n_fft, hop_length=hop_length, win_length=win_size, window=self.hann_window[str(y.device)], 96 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 97 | # print(111,spec) 98 | spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) 99 | # print(222,spec) 100 | spec = torch.matmul(self.mel_basis[str(fmax)+'_'+str(y.device)], spec) 101 | # print(333,spec) 102 | spec = dynamic_range_compression_torch(spec, clip_val=clip_val) 103 | # print(444,spec) 104 | return spec 105 | 106 | def __call__(self, audiopath): 107 | audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr) 108 | spect = self.get_mel(audio.unsqueeze(0)).squeeze(0) 109 | return spect 110 | 111 | 112 | def get_mel(wav_torch, sampling_rate, num_mels, n_fft, win_size, hop_size, fmin, fmax): 113 | stft = STFT(sampling_rate, num_mels, n_fft, win_size, hop_size, fmin, fmax) 114 | with torch.no_grad(): 115 | mel_torch = stft.get_mel(wav_torch.unsqueeze(0)).squeeze(0).T 116 | # log mel to log10 mel 117 | mel_torch = 0.434294 * mel_torch.T 118 | return mel_torch 119 | 120 | if __name__ == '__main__': 121 | mel, wav = get_mel("/Users/xingyijin/Downloads/api.wav", 16000, 80, 1024, 256, 80, 20, 11025) 122 | print(mel.shape, wav.shape) -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | 
import numpy as np 4 | import scipy 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | 9 | import commons 10 | 11 | 12 | class LayerNorm(nn.Module): 13 | def __init__(self, channels, eps=1e-4): 14 | super().__init__() 15 | self.channels = channels 16 | self.eps = eps 17 | 18 | self.gamma = nn.Parameter(torch.ones(channels)) 19 | self.beta = nn.Parameter(torch.zeros(channels)) 20 | 21 | def forward(self, x): 22 | n_dims = len(x.shape) 23 | mean = torch.mean(x, 1, keepdim=True) 24 | variance = torch.mean((x -mean)**2, 1, keepdim=True) 25 | 26 | x = (x - mean) * torch.rsqrt(variance + self.eps) 27 | 28 | shape = [1, -1] + [1] * (n_dims - 2) 29 | x = x * self.gamma.view(*shape) + self.beta.view(*shape) 30 | return x 31 | 32 | 33 | class ConvReluNorm(nn.Module): 34 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 35 | super().__init__() 36 | self.in_channels = in_channels 37 | self.hidden_channels = hidden_channels 38 | self.out_channels = out_channels 39 | self.kernel_size = kernel_size 40 | self.n_layers = n_layers 41 | self.p_dropout = p_dropout 42 | assert n_layers > 1, "Number of layers should be larger than 0." 43 | 44 | self.conv_layers = nn.ModuleList() 45 | self.norm_layers = nn.ModuleList() 46 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 47 | self.norm_layers.append(LayerNorm(hidden_channels)) 48 | self.relu_drop = nn.Sequential( 49 | nn.ReLU(), 50 | nn.Dropout(p_dropout)) 51 | for _ in range(n_layers-1): 52 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 53 | self.norm_layers.append(LayerNorm(hidden_channels)) 54 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 55 | self.proj.weight.data.zero_() 56 | self.proj.bias.data.zero_() 57 | 58 | def forward(self, x, x_mask): 59 | x_org = x 60 | for i in range(self.n_layers): 61 | x = self.conv_layers[i](x * x_mask) 62 | x = self.norm_layers[i](x) 63 | x = self.relu_drop(x) 64 | x = x_org + self.proj(x) 65 | return x * x_mask 66 | 67 | 68 | class WN(torch.nn.Module): 69 | def __init__(self, in_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 70 | super(WN, self).__init__() 71 | assert(kernel_size % 2 == 1) 72 | assert(hidden_channels % 2 == 0) 73 | self.in_channels = in_channels 74 | self.hidden_channels =hidden_channels 75 | self.kernel_size = kernel_size, 76 | self.dilation_rate = dilation_rate 77 | self.n_layers = n_layers 78 | self.gin_channels = gin_channels 79 | self.p_dropout = p_dropout 80 | 81 | self.in_layers = torch.nn.ModuleList() 82 | self.res_skip_layers = torch.nn.ModuleList() 83 | self.drop = nn.Dropout(p_dropout) 84 | 85 | if gin_channels != 0: 86 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) 87 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 88 | 89 | for i in range(n_layers): 90 | dilation = dilation_rate ** i 91 | padding = int((kernel_size * dilation - dilation) / 2) 92 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 93 | dilation=dilation, padding=padding) 94 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 95 | self.in_layers.append(in_layer) 96 | 97 | # last one is not necessary 98 | if i < n_layers - 1: 99 | res_skip_channels = 2 * hidden_channels 100 | else: 101 | res_skip_channels = hidden_channels 102 | 103 | res_skip_layer = torch.nn.Conv1d(hidden_channels, 
res_skip_channels, 1) 104 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 105 | self.res_skip_layers.append(res_skip_layer) 106 | 107 | def forward(self, x, x_mask=None, g=None, **kwargs): 108 | output = torch.zeros_like(x) 109 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 110 | 111 | if g is not None: 112 | g = self.cond_layer(g) 113 | 114 | for i in range(self.n_layers): 115 | x_in = self.in_layers[i](x) 116 | x_in = self.drop(x_in) 117 | if g is not None: 118 | cond_offset = i * 2 * self.hidden_channels 119 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 120 | else: 121 | g_l = torch.zeros_like(x_in) 122 | 123 | acts = commons.fused_add_tanh_sigmoid_multiply( 124 | x_in, 125 | g_l, 126 | n_channels_tensor) 127 | 128 | res_skip_acts = self.res_skip_layers[i](acts) 129 | if i < self.n_layers - 1: 130 | x = (x + res_skip_acts[:,:self.hidden_channels,:]) * x_mask 131 | output = output + res_skip_acts[:,self.hidden_channels:,:] 132 | else: 133 | output = output + res_skip_acts 134 | return output * x_mask 135 | 136 | def remove_weight_norm(self): 137 | if self.gin_channels != 0: 138 | torch.nn.utils.remove_weight_norm(self.cond_layer) 139 | for l in self.in_layers: 140 | torch.nn.utils.remove_weight_norm(l) 141 | for l in self.res_skip_layers: 142 | torch.nn.utils.remove_weight_norm(l) 143 | 144 | 145 | class ActNorm(nn.Module): 146 | def __init__(self, channels, ddi=False, **kwargs): 147 | super().__init__() 148 | self.channels = channels 149 | self.initialized = not ddi 150 | 151 | self.logs = nn.Parameter(torch.zeros(1, channels, 1)) 152 | self.bias = nn.Parameter(torch.zeros(1, channels, 1)) 153 | 154 | def forward(self, x, x_mask=None, reverse=False, **kwargs): 155 | if x_mask is None: 156 | x_mask = torch.ones(x.size(0), 1, x.size(2)).to(device=x.device, dtype=x.dtype) 157 | x_len = torch.sum(x_mask, [1, 2]) 158 | if not self.initialized: 159 | self.initialize(x, x_mask) 160 | self.initialized = True 161 | 162 | if reverse: 163 | z = (x - self.bias) * torch.exp(-self.logs) * x_mask 164 | logdet = None 165 | else: 166 | z = (self.bias + torch.exp(self.logs) * x) * x_mask 167 | logdet = torch.sum(self.logs) * x_len # [b] 168 | 169 | return z, logdet 170 | 171 | def store_inverse(self): 172 | pass 173 | 174 | def set_ddi(self, ddi): 175 | self.initialized = not ddi 176 | 177 | def initialize(self, x, x_mask): 178 | with torch.no_grad(): 179 | denom = torch.sum(x_mask, [0, 2]) 180 | m = torch.sum(x * x_mask, [0, 2]) / denom 181 | m_sq = torch.sum(x * x * x_mask, [0, 2]) / denom 182 | v = m_sq - (m ** 2) 183 | logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6)) 184 | 185 | bias_init = (-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype) 186 | logs_init = (-logs).view(*self.logs.shape).to(dtype=self.logs.dtype) 187 | 188 | self.bias.data.copy_(bias_init) 189 | self.logs.data.copy_(logs_init) 190 | 191 | 192 | class InvConvNear(nn.Module): 193 | def __init__(self, channels, n_split=4, no_jacobian=False, **kwargs): 194 | super().__init__() 195 | assert(n_split % 2 == 0) 196 | self.channels = channels 197 | self.n_split = n_split 198 | self.no_jacobian = no_jacobian 199 | 200 | w_init = torch.qr(torch.FloatTensor(self.n_split, self.n_split).normal_())[0] 201 | if torch.det(w_init) < 0: 202 | w_init[:,0] = -1 * w_init[:,0] 203 | self.weight = nn.Parameter(w_init) 204 | 205 | def forward(self, x, x_mask=None, reverse=False, **kwargs): 206 | b, c, t = x.size() 207 | assert(c % self.n_split == 0) 208 | if x_mask is None: 
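            # note added to this listing: with no mask provided, a scalar 1 is broadcast so the
            # masking below becomes a no-op and x_len falls back to the full length t per batch item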
209 | x_mask = 1 210 | x_len = torch.ones((b,), dtype=x.dtype, device=x.device) * t 211 | else: 212 | x_len = torch.sum(x_mask, [1, 2]) 213 | 214 | x = x.view(b, 2, c // self.n_split, self.n_split // 2, t) 215 | x = x.permute(0, 1, 3, 2, 4).contiguous().view(b, self.n_split, c // self.n_split, t) 216 | 217 | if reverse: 218 | if hasattr(self, "weight_inv"): 219 | weight = self.weight_inv 220 | else: 221 | weight = torch.inverse(self.weight.float()).to(dtype=self.weight.dtype) 222 | logdet = None 223 | else: 224 | weight = self.weight 225 | if self.no_jacobian: 226 | logdet = 0 227 | else: 228 | logdet = torch.logdet(self.weight) * (c / self.n_split) * x_len # [b] 229 | 230 | weight = weight.view(self.n_split, self.n_split, 1, 1) 231 | z = F.conv2d(x, weight) 232 | 233 | z = z.view(b, 2, self.n_split // 2, c // self.n_split, t) 234 | z = z.permute(0, 1, 3, 2, 4).contiguous().view(b, c, t) * x_mask 235 | return z, logdet 236 | 237 | def store_inverse(self): 238 | self.weight_inv = torch.inverse(self.weight.float()).to(dtype=self.weight.dtype) 239 | -------------------------------------------------------------------------------- /preprocess_flist_config.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import tqdm 4 | 5 | import os 6 | data_all = [] 7 | spk2id = {} 8 | current_spk = 0 9 | for spk in os.listdir('dataset'): 10 | if os.path.isdir(os.path.join('dataset', spk)): 11 | for wav in os.listdir(os.path.join('dataset', spk)): 12 | if wav.endswith('wav'): 13 | name = wav.split('.')[0] 14 | data_all.append(f"{name}|{spk}\n") 15 | if spk not in spk2id.keys(): 16 | spk2id[spk] = current_spk 17 | current_spk+=1 18 | 19 | 20 | import random 21 | random.shuffle(data_all) 22 | data_train = data_all[:-5] 23 | data_val = data_all[-5:] 24 | with open('filelists/train.list', 'w', encoding='utf-8') as f: 25 | for line in data_train: 26 | f.write(line) 27 | 28 | with open('filelists/val.list', 'w', encoding='utf-8') as f: 29 | for line in data_val: 30 | f.write(line) 31 | 32 | template = json.load(open('configs/config.json', 'r', encoding='utf-8')) 33 | template["data"]['spk2id'] = spk2id 34 | json.dump(template, open('configs/config.json', 'w', encoding='utf-8'), indent=4, ensure_ascii=False) 35 | 36 | -------------------------------------------------------------------------------- /pretrain/content-vec-best/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_dropout": 0.1, 3 | "apply_spec_augment": true, 4 | "architectures": [ 5 | "HubertModelWithFinalProj" 6 | ], 7 | "attention_dropout": 0.1, 8 | "bos_token_id": 1, 9 | "classifier_proj_size": 256, 10 | "conv_bias": false, 11 | "conv_dim": [ 12 | 512, 13 | 512, 14 | 512, 15 | 512, 16 | 512, 17 | 512, 18 | 512 19 | ], 20 | "conv_kernel": [ 21 | 10, 22 | 3, 23 | 3, 24 | 3, 25 | 3, 26 | 2, 27 | 2 28 | ], 29 | "conv_stride": [ 30 | 5, 31 | 2, 32 | 2, 33 | 2, 34 | 2, 35 | 2, 36 | 2 37 | ], 38 | "ctc_loss_reduction": "sum", 39 | "ctc_zero_infinity": false, 40 | "do_stable_layer_norm": false, 41 | "eos_token_id": 2, 42 | "feat_extract_activation": "gelu", 43 | "feat_extract_norm": "group", 44 | "feat_proj_dropout": 0.0, 45 | "feat_proj_layer_norm": true, 46 | "final_dropout": 0.1, 47 | "hidden_act": "gelu", 48 | "hidden_dropout": 0.1, 49 | "hidden_size": 768, 50 | "initializer_range": 0.02, 51 | "intermediate_size": 3072, 52 | "layer_norm_eps": 1e-05, 53 | "layerdrop": 0.1, 54 | "mask_feature_length": 10, 55 | "mask_feature_min_masks": 
0, 56 | "mask_feature_prob": 0.0, 57 | "mask_time_length": 10, 58 | "mask_time_min_masks": 2, 59 | "mask_time_prob": 0.05, 60 | "model_type": "hubert", 61 | "num_attention_heads": 12, 62 | "num_conv_pos_embedding_groups": 16, 63 | "num_conv_pos_embeddings": 128, 64 | "num_feat_extract_layers": 7, 65 | "num_hidden_layers": 12, 66 | "pad_token_id": 0, 67 | "torch_dtype": "float32", 68 | "transformers_version": "4.27.3", 69 | "use_weighted_layer_sum": false, 70 | "vocab_size": 32 71 | } -------------------------------------------------------------------------------- /pretrain/fcpe/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/innnky/glow-svc/15d280a2b87a5938da3cb68e85273102a42b0d75/pretrain/fcpe/.gitkeep -------------------------------------------------------------------------------- /pretrain/nsf-hifigan/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 4, 4 | "batch_size": 10, 5 | "learning_rate": 0.0002, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [ 8, 8, 2, 2, 2], 12 | "upsample_kernel_sizes": [16,16, 4, 4, 4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | "discriminator_periods": [3, 5, 7, 11, 17, 23, 37], 17 | 18 | "segment_size": 16384, 19 | "num_mels": 128, 20 | "num_freq": 1025, 21 | "n_fft" : 2048, 22 | "hop_size": 512, 23 | "win_size": 2048, 24 | 25 | "sampling_rate": 44100, 26 | 27 | "fmin": 40, 28 | "fmax": 16000, 29 | "fmax_for_loss": null, 30 | 31 | "num_workers": 16, 32 | 33 | "dist_config": { 34 | "dist_backend": "nccl", 35 | "dist_url": "tcp://localhost:54321", 36 | "world_size": 1 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /pretrain/nsf-hifigan/put_441hifigan_ckpt_here: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/innnky/glow-svc/15d280a2b87a5938da3cb68e85273102a42b0d75/pretrain/nsf-hifigan/put_441hifigan_ckpt_here -------------------------------------------------------------------------------- /resample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import librosa 4 | import numpy as np 5 | from multiprocessing import Pool, cpu_count 6 | 7 | import soundfile 8 | from scipy.io import wavfile 9 | from tqdm import tqdm 10 | 11 | 12 | def process(item): 13 | spkdir, wav_name, args = item 14 | # speaker 's5', 'p280', 'p315' are excluded, 15 | speaker = spkdir.replace("\\", "/").split("/")[-1] 16 | wav_path = os.path.join(args.in_dir, speaker, wav_name) 17 | if os.path.exists(wav_path) and '.wav' in wav_path: 18 | os.makedirs(os.path.join(args.out_dir2, speaker), exist_ok=True) 19 | wav, sr = librosa.load(wav_path, sr=args.sr2) 20 | soundfile.write( 21 | os.path.join(args.out_dir2, speaker, wav_name), 22 | wav, 23 | sr 24 | ) 25 | 26 | 27 | 28 | if __name__ == "__main__": 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--sr2", type=int, default=44100, help="sampling rate") 31 | parser.add_argument("--in_dir", type=str, default="./raw", help="path to source dir") 32 | parser.add_argument("--out_dir2", type=str, default="./dataset", help="path to target dir") 33 | args = parser.parse_args() 34 | processs = 8 35 | pool = 
Pool(processes=processs) 36 | 37 | for speaker in os.listdir(args.in_dir): 38 | spk_dir = os.path.join(args.in_dir, speaker) 39 | if os.path.isdir(spk_dir): 40 | print(spk_dir) 41 | for _ in tqdm(pool.imap_unordered(process, [(spk_dir, i, args) for i in os.listdir(spk_dir) if i.endswith("wav")])): 42 | pass 43 | -------------------------------------------------------------------------------- /stft.py: -------------------------------------------------------------------------------- 1 | """ 2 | BSD 3-Clause License 3 | 4 | Copyright (c) 2017, Prem Seetharaman 5 | All rights reserved. 6 | 7 | * Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, this 14 | list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from this 19 | software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | """ 32 | 33 | import torch 34 | import numpy as np 35 | import torch.nn.functional as F 36 | from torch.autograd import Variable 37 | from scipy.signal import get_window 38 | from librosa.util import pad_center, tiny 39 | from librosa import stft, istft 40 | from audio_processing import window_sumsquare 41 | 42 | 43 | class STFT(torch.nn.Module): 44 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 45 | def __init__(self, filter_length=800, hop_length=200, win_length=800, 46 | window='hann'): 47 | super(STFT, self).__init__() 48 | self.filter_length = filter_length 49 | self.hop_length = hop_length 50 | self.win_length = win_length 51 | self.window = window 52 | self.forward_transform = None 53 | scale = self.filter_length / self.hop_length 54 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 55 | 56 | cutoff = int((self.filter_length / 2 + 1)) 57 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 58 | np.imag(fourier_basis[:cutoff, :])]) 59 | 60 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 61 | inverse_basis = torch.FloatTensor( 62 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 63 | 64 | if window is not None: 65 | assert(filter_length >= win_length) 66 | # get window and zero center pad it to filter_length 67 | fft_window = get_window(window, win_length, fftbins=True) 68 | fft_window = pad_center(fft_window, filter_length) 69 | fft_window = torch.from_numpy(fft_window).float() 70 | 71 | # window the bases 72 | forward_basis *= fft_window 73 | inverse_basis *= fft_window 74 | 75 | self.register_buffer('forward_basis', forward_basis.float()) 76 | self.register_buffer('inverse_basis', inverse_basis.float()) 77 | 78 | def transform(self, input_data): 79 | num_batches = input_data.size(0) 80 | num_samples = input_data.size(1) 81 | 82 | self.num_samples = num_samples 83 | 84 | if input_data.device.type == "cuda": 85 | # similar to librosa, reflect-pad the input 86 | input_data = input_data.view(num_batches, 1, num_samples) 87 | input_data = F.pad( 88 | input_data.unsqueeze(1), 89 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 90 | mode='reflect') 91 | input_data = input_data.squeeze(1) 92 | 93 | forward_transform = F.conv1d( 94 | input_data, 95 | self.forward_basis, 96 | stride=self.hop_length, 97 | padding=0) 98 | 99 | cutoff = int((self.filter_length / 2) + 1) 100 | real_part = forward_transform[:, :cutoff, :] 101 | imag_part = forward_transform[:, cutoff:, :] 102 | else: 103 | x = input_data.detach().numpy() 104 | real_part = [] 105 | imag_part = [] 106 | for y in x: 107 | y_ = stft(y, self.filter_length, self.hop_length, self.win_length, self.window) 108 | real_part.append(y_.real[None,:,:]) 109 | imag_part.append(y_.imag[None,:,:]) 110 | real_part = np.concatenate(real_part, 0) 111 | imag_part = np.concatenate(imag_part, 0) 112 | 113 | real_part = torch.from_numpy(real_part).to(input_data.dtype) 114 | imag_part = torch.from_numpy(imag_part).to(input_data.dtype) 115 | 116 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 117 | phase = torch.atan2(imag_part.data, real_part.data) 118 | 119 | return magnitude, phase 120 | 121 | def inverse(self, magnitude, phase): 122 | recombine_magnitude_phase = torch.cat( 123 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 124 | 125 | if magnitude.device.type == "cuda": 126 | inverse_transform = F.conv_transpose1d( 127 | recombine_magnitude_phase, 128 | self.inverse_basis, 129 | stride=self.hop_length, 130 | padding=0) 131 | 132 | if 
self.window is not None: 133 | window_sum = window_sumsquare( 134 | self.window, magnitude.size(-1), hop_length=self.hop_length, 135 | win_length=self.win_length, n_fft=self.filter_length, 136 | dtype=np.float32) 137 | # remove modulation effects 138 | approx_nonzero_indices = torch.from_numpy( 139 | np.where(window_sum > tiny(window_sum))[0]) 140 | window_sum = torch.from_numpy(window_sum).to(inverse_transform.device) 141 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 142 | 143 | # scale by hop ratio 144 | inverse_transform *= float(self.filter_length) / self.hop_length 145 | 146 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 147 | inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] 148 | inverse_transform = inverse_transform.squeeze(1) 149 | else: 150 | x_org = recombine_magnitude_phase.detach().numpy() 151 | n_b, n_f, n_t = x_org.shape 152 | x = np.empty([n_b, n_f//2, n_t], dtype=np.complex64) 153 | x.real = x_org[:,:n_f//2] 154 | x.imag = x_org[:,n_f//2:] 155 | inverse_transform = [] 156 | for y in x: 157 | y_ = istft(y, self.hop_length, self.win_length, self.window) 158 | inverse_transform.append(y_[None,:]) 159 | inverse_transform = np.concatenate(inverse_transform, 0) 160 | inverse_transform = torch.from_numpy(inverse_transform).to(recombine_magnitude_phase.dtype) 161 | 162 | return inverse_transform 163 | 164 | def forward(self, input_data): 165 | self.magnitude, self.phase = self.transform(input_data) 166 | reconstruction = self.inverse(self.magnitude, self.phase) 167 | return reconstruction 168 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from tqdm import tqdm 4 | from torch.utils.data import DataLoader 5 | from torch.utils.tensorboard import SummaryWriter 6 | import torch.multiprocessing as mp 7 | import torch.distributed as dist 8 | from torch.nn.parallel import DistributedDataParallel as DDP 9 | import logging 10 | logging.getLogger("matplotlib").setLevel(logging.INFO) 11 | logging.getLogger("h5py").setLevel(logging.INFO) 12 | logging.getLogger("numba").setLevel(logging.INFO) 13 | 14 | from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate 15 | import models 16 | import commons 17 | import utils 18 | from hifigan import NsfHifiGAN 19 | 20 | global_step = 0 21 | 22 | 23 | def main(): 24 | """Assume Single Node Multi GPUs Training Only""" 25 | assert torch.cuda.is_available(), "CPU training is not allowed." 
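# Single-node multi-GPU training: main() spawns one worker per visible GPU via mp.spawn, and each worker joins the default process group through the env:// rendezvous configured below (MASTER_ADDR=localhost, MASTER_PORT=7998; the backend is gloo on Windows and nccl elsewhere, see train_and_eval).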
26 | 27 | n_gpus = torch.cuda.device_count() 28 | os.environ['MASTER_ADDR'] = 'localhost' 29 | os.environ['MASTER_PORT'] = '7998' 30 | 31 | hps = utils.get_hparams() 32 | mp.spawn(train_and_eval, nprocs=n_gpus, args=(n_gpus, hps,)) 33 | 34 | 35 | def train_and_eval(rank, n_gpus, hps): 36 | global global_step 37 | if rank == 0: 38 | logger = utils.get_logger(hps.model_dir) 39 | logger.info(hps) 40 | utils.check_git_hash(hps.model_dir) 41 | writer = SummaryWriter(log_dir=hps.model_dir) 42 | writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) 43 | 44 | dist.init_process_group(backend= 'gloo' if os.name == 'nt' else 'nccl', init_method='env://', world_size=n_gpus, rank=rank) 45 | torch.manual_seed(hps.train.seed) 46 | torch.cuda.set_device(rank) 47 | 48 | train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data) 49 | train_sampler = torch.utils.data.distributed.DistributedSampler( 50 | train_dataset, 51 | num_replicas=n_gpus, 52 | rank=rank, 53 | shuffle=True) 54 | collate_fn = TextAudioSpeakerCollate() 55 | train_loader = DataLoader(train_dataset, num_workers=3, shuffle=False, 56 | batch_size=hps.train.batch_size, pin_memory=True, 57 | drop_last=True, collate_fn=collate_fn, sampler=train_sampler, persistent_workers=True) 58 | if rank == 0: 59 | val_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data, val=True) 60 | val_loader = DataLoader(val_dataset, num_workers=0, shuffle=False, 61 | batch_size=1, pin_memory=True, 62 | drop_last=True, collate_fn=collate_fn) 63 | 64 | generator = models.FlowGenerator( 65 | n_vocab=0, 66 | out_channels=hps.data.n_mel_channels, 67 | **hps.model).cuda(rank) 68 | # vocoder = Vocos.from_pretrained('vocos/config.yaml', 'vocos/pytorch_model.bin').cuda() 69 | vocoder = NsfHifiGAN('cuda') 70 | optimizer_g = commons.Adam(generator.parameters(), scheduler=hps.train.scheduler, 71 | dim_model=hps.model.hidden_channels, warmup_steps=hps.train.warmup_steps, 72 | lr=hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps) 73 | 74 | 75 | # optimizer_g = commons.Adam(generator.parameters(), scheduler=hps.train.scheduler, 76 | # dim_model=hps.model.hidden_channels, warmup_steps=hps.train.warmup_steps, 77 | # lr=hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps) 78 | generator = DDP(generator) 79 | epoch_str = 1 80 | global_step = 0 81 | # 82 | # try: 83 | # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), generator, 84 | # optimizer_g) 85 | # optimizer_g.step_num = (epoch_str - 1) * len(train_loader) 86 | # optimizer_g._update_learning_rate() 87 | # global_step = (epoch_str - 1) * len(train_loader) 88 | # except: 89 | # if hps.train.ddi and os.path.isfile(os.path.join(hps.model_dir, "ddi_G.pth")): 90 | # _ = utils.load_checkpoint(os.path.join(hps.model_dir, "ddi_G.pth"), generator, optimizer_g) 91 | # 92 | skip_optimizer = False 93 | try: 94 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), generator, 95 | optimizer_g, False) 96 | epoch_str += 1 97 | optimizer_g.step_num = (epoch_str - 1) * len(train_loader) 98 | optimizer_g._update_learning_rate() 99 | global_step = (epoch_str - 1) * len(train_loader) 100 | except: 101 | epoch_str = 1 102 | global_step = 0 103 | if skip_optimizer: 104 | epoch_str = 1 105 | global_step = 0 106 | 107 | for epoch in range(epoch_str, hps.train.epochs + 1): 108 | if rank == 0: 109 | save_interval = 5 110 | if epoch % save_interval == 0: 111 | evaluate(rank, epoch, hps, 
generator, optimizer_g, val_loader, logger, writer_eval, vocoder) 112 | train(rank, epoch, hps, generator, optimizer_g, train_loader, logger, writer) 113 | if epoch % save_interval == 0: 114 | utils.save_checkpoint(generator, optimizer_g, hps.train.learning_rate, epoch, 115 | os.path.join(hps.model_dir, "G_{}.pth".format(epoch))) 116 | try: 117 | to_remove_path = os.path.join(hps.model_dir, "G_{}.pth".format(epoch - save_interval* 3)) 118 | os.remove(to_remove_path) 119 | print(f'removing {to_remove_path}') 120 | except: 121 | print(f'removing {to_remove_path} failed') 122 | else: 123 | train(rank, epoch, hps, generator, optimizer_g, train_loader, None, None) 124 | 125 | 126 | def train(rank, epoch, hps, generator, optimizer_g, train_loader, logger, writer): 127 | train_loader.sampler.set_epoch(epoch) 128 | global global_step 129 | 130 | generator.train() 131 | for batch_idx, (x, mel,mel_lengths,wav, wav_lengths, speakers, f0) in enumerate(tqdm(train_loader)): 132 | mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda(rank, non_blocking=True) 133 | speakers = speakers.cuda(rank, non_blocking=True) 134 | x = x.cuda(rank, non_blocking=True) 135 | f0 = f0.cuda(rank, non_blocking=True) 136 | 137 | # Train Generator 138 | optimizer_g.zero_grad() 139 | 140 | (z, z_m, z_logs, logdet, z_mask), l_noise = generator(x, mel, mel_lengths,f0, g=speakers, gen=False) 141 | l_mle = commons.mle_loss(z, z_m, z_logs, logdet, z_mask) 142 | 143 | loss_gs = [l_mle, l_noise] 144 | loss_g = sum(loss_gs) 145 | 146 | loss_g.backward() 147 | grad_norm = commons.clip_grad_value_(generator.parameters(), 5) 148 | optimizer_g.step() 149 | if rank == 0: 150 | if batch_idx % hps.train.log_interval == 0: 151 | y_gen, _ = generator.module(x[:1],f0=f0[:1], g=speakers[:1], gen=True, glow=True) 152 | logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 153 | epoch, batch_idx * len(x), len(train_loader.dataset), 154 | 100. 
* batch_idx / len(train_loader), 155 | loss_g.item())) 156 | lr = optimizer_g._optim.param_groups[0]['lr'] 157 | logger.info([x.item() for x in loss_gs] + [global_step, lr]) 158 | 159 | scalar_dict = {"loss/g/total": loss_g, "learning_rate": lr, "grad_norm": grad_norm} 160 | scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(loss_gs)}) 161 | utils.summarize( 162 | writer=writer, 163 | global_step=global_step, 164 | images={"train/gt/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), 165 | "train/gen/mel": utils.plot_spectrogram_to_numpy(y_gen[0].data.cpu().numpy()) 166 | }, 167 | scalars=scalar_dict) 168 | global_step += 1 169 | 170 | if rank == 0: 171 | logger.info('====> Epoch: {}'.format(epoch)) 172 | 173 | 174 | def evaluate(rank, epoch, hps, generator, optimizer_g, val_loader, logger, writer_eval, vocoder): 175 | if rank == 0: 176 | global global_step 177 | generator.eval() 178 | audio_dict = {} 179 | img_dict = {} 180 | with torch.no_grad(): 181 | for batch_idx, (x, mel,mel_lengths,wav, wav_lengths, speakers, f0) in enumerate( 182 | val_loader): 183 | mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda(rank, non_blocking=True) 184 | speakers = speakers.cuda(rank, non_blocking=True) 185 | x = x.cuda(rank, non_blocking=True) 186 | f0 = f0.cuda(rank, non_blocking=True) 187 | 188 | mel_flow, pred_f0 = generator.module(x, f0=f0, g=speakers, gen=True, glow=True) 189 | y_flow = vocoder.spec2wav(mel_flow.squeeze(0).transpose(0, 1).cpu().numpy(), 190 | f0=pred_f0[0, 0, :].cpu().numpy()) 191 | 192 | # mel_diff, pred_f0 = generator.module(x, f0=f0,g=speakers, gen=True, glow=False) 193 | # y_diff = vocoder.spec2wav(mel_diff.squeeze(0).transpose(0, 1).cpu().numpy(), 194 | # f0=pred_f0[0, 0, :].cpu().numpy()) 195 | 196 | 197 | y_rec = vocoder.spec2wav(mel.squeeze(0).transpose(0, 1).cpu().numpy(), 198 | f0=f0[0, :].cpu().numpy()) 199 | 200 | img_dict.update({f"gt/mel_{batch_idx}": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), 201 | f"gen/mel_flow_{batch_idx}": utils.plot_spectrogram_to_numpy(mel_flow[0].data.cpu().numpy()), 202 | # f"gen/mel_diff_{batch_idx}": utils.plot_spectrogram_to_numpy(mel_diff[0].data.cpu().numpy()), 203 | }) 204 | audio_dict.update({ 205 | # "gen/wav_gen_{}_diff".format(batch_idx): y_diff, 206 | "gen/wav_gen_{}_flow".format(batch_idx): y_flow, 207 | "gt/wav_gen_{}_rec".format(batch_idx): y_rec, 208 | }) 209 | 210 | utils.summarize( 211 | writer=writer_eval, 212 | global_step=global_step, 213 | images=img_dict, 214 | audios=audio_dict, 215 | audio_sampling_rate=hps.data.sampling_rate 216 | ) 217 | logger.info('====> Epoch: {}'.format(epoch)) 218 | 219 | 220 | if __name__ == "__main__": 221 | main() 222 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import sys 4 | import argparse 5 | import logging 6 | import json 7 | import subprocess 8 | import numpy as np 9 | from scipy.io.wavfile import read 10 | import torch 11 | 12 | MATPLOTLIB_FLAG = False 13 | 14 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 15 | logger = logging 16 | 17 | 18 | def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False): 19 | assert os.path.isfile(checkpoint_path) 20 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') 21 | iteration = 1 22 | if 'iteration' in checkpoint_dict.keys(): 23 | iteration = 
checkpoint_dict['iteration'] 24 | # default to None so the value returned below is always defined 25 | learning_rate = checkpoint_dict.get('learning_rate', None) 26 | if optimizer is not None and 'optimizer' in checkpoint_dict.keys() and not skip_optimizer: 27 | optimizer.load_state_dict(checkpoint_dict['optimizer']) 28 | saved_state_dict = checkpoint_dict['model'] 29 | if hasattr(model, 'module'): 30 | state_dict = model.module.state_dict() 31 | else: 32 | state_dict = model.state_dict() 33 | new_state_dict = {} 34 | for k, v in state_dict.items(): 35 | try: 36 | new_state_dict[k] = saved_state_dict[k] 37 | assert saved_state_dict[k].shape == v.shape, (saved_state_dict[k].shape, v.shape) 38 | except: 39 | print("%s is not in the checkpoint" % k) 40 | new_state_dict[k] = v 41 | if hasattr(model, 'module'): 42 | model.module.load_state_dict(new_state_dict) 43 | else: 44 | model.load_state_dict(new_state_dict) 45 | logger.info("Loaded checkpoint '{}' (iteration {})".format( 46 | checkpoint_path, iteration)) 47 | return model, optimizer, learning_rate, iteration 48 | 49 | 50 | def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): 51 | logger.info("Saving model and optimizer state at iteration {} to {}".format( 52 | iteration, checkpoint_path)) 53 | if hasattr(model, 'module'): 54 | state_dict = model.module.state_dict() 55 | else: 56 | state_dict = model.state_dict() 57 | torch.save({'model': state_dict, 58 | 'iteration': iteration, 59 | 'optimizer': optimizer.state_dict(), 60 | 'learning_rate': learning_rate}, checkpoint_path) 61 | 62 | 63 | def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=None): 64 | for k, v in scalars.items(): 65 | writer.add_scalar(k, v, global_step) 66 | for k, v in histograms.items(): 67 | writer.add_histogram(k, v, global_step) 68 | for k, v in images.items(): 69 | writer.add_image(k, v, global_step, dataformats='HWC') 70 | for k, v in audios.items(): 71 | writer.add_audio(k, v, global_step, audio_sampling_rate) 72 | 73 | 74 | def latest_checkpoint_path(dir_path, regex="G_*.pth"): 75 | f_list = glob.glob(os.path.join(dir_path, regex)) 76 | f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) 77 | x = f_list[-1] 78 | print(x) 79 | return x 80 | 81 | 82 | def plot_spectrogram_to_numpy(spectrogram): 83 | global MATPLOTLIB_FLAG 84 | if not MATPLOTLIB_FLAG: 85 | import matplotlib 86 | matplotlib.use("Agg") 87 | MATPLOTLIB_FLAG = True 88 | mpl_logger = logging.getLogger('matplotlib') 89 | mpl_logger.setLevel(logging.WARNING) 90 | import matplotlib.pylab as plt 91 | import numpy as np 92 | 93 | fig, ax = plt.subplots(figsize=(10, 2)) 94 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 95 | interpolation='none') 96 | plt.colorbar(im, ax=ax) 97 | plt.xlabel("Frames") 98 | plt.ylabel("Channels") 99 | plt.tight_layout() 100 | 101 | fig.canvas.draw() 102 | data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8) 103 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 104 | plt.close() 105 | return data 106 | 107 | 108 | def plot_alignment_to_numpy(alignment, info=None): 109 | global MATPLOTLIB_FLAG 110 | if not MATPLOTLIB_FLAG: 111 | import matplotlib 112 | matplotlib.use("Agg") 113 | MATPLOTLIB_FLAG = True 114 | mpl_logger = logging.getLogger('matplotlib') 115 | mpl_logger.setLevel(logging.WARNING) 116 | import matplotlib.pylab as plt 117 | import numpy as np 118 | 119 | fig, ax = plt.subplots(figsize=(6, 4)) 120 | im = ax.imshow(alignment, aspect='auto', origin='lower', 
121 | interpolation='none') 122 | fig.colorbar(im, ax=ax) 123 | xlabel = 'Decoder timestep' 124 | if info is not None: 125 | xlabel += '\n\n' + info 126 | plt.xlabel(xlabel) 127 | plt.ylabel('Encoder timestep') 128 | plt.tight_layout() 129 | 130 | fig.canvas.draw() 131 | data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8) 132 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 133 | plt.close() 134 | return data 135 | 136 | 137 | def load_wav_to_torch(full_path): 138 | sampling_rate, data = read(full_path) 139 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate 140 | 141 | 142 | def load_filepaths_and_text(filename, split="|"): 143 | with open(filename, encoding='utf-8') as f: 144 | filepaths_and_text = [line.strip().split(split) for line in f] 145 | return filepaths_and_text 146 | 147 | 148 | def get_hparams(init=True): 149 | parser = argparse.ArgumentParser() 150 | parser.add_argument('-c', '--config', type=str, default="./configs/config.json", 151 | help='JSON file for configuration') 152 | parser.add_argument('-m', '--model', type=str, required=True, 153 | help='Model name') 154 | 155 | args = parser.parse_args() 156 | model_dir = os.path.join("./logs", args.model) 157 | 158 | if not os.path.exists(model_dir): 159 | os.makedirs(model_dir) 160 | 161 | config_path = args.config 162 | config_save_path = os.path.join(model_dir, "config.json") 163 | if init: 164 | with open(config_path, "r") as f: 165 | data = f.read() 166 | with open(config_save_path, "w") as f: 167 | f.write(data) 168 | else: 169 | with open(config_save_path, "r") as f: 170 | data = f.read() 171 | config = json.loads(data) 172 | 173 | hparams = HParams(**config) 174 | hparams.model_dir = model_dir 175 | return hparams 176 | 177 | 178 | def get_hparams_from_dir(model_dir): 179 | config_save_path = os.path.join(model_dir, "config.json") 180 | with open(config_save_path, "r") as f: 181 | data = f.read() 182 | config = json.loads(data) 183 | 184 | hparams = HParams(**config) 185 | hparams.model_dir = model_dir 186 | return hparams 187 | 188 | 189 | def get_hparams_from_file(config_path): 190 | with open(config_path, "r") as f: 191 | data = f.read() 192 | config = json.loads(data) 193 | 194 | hparams = HParams(**config) 195 | return hparams 196 | 197 | 198 | def check_git_hash(model_dir): 199 | source_dir = os.path.dirname(os.path.realpath(__file__)) 200 | if not os.path.exists(os.path.join(source_dir, ".git")): 201 | logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format( 202 | source_dir 203 | )) 204 | return 205 | 206 | cur_hash = subprocess.getoutput("git rev-parse HEAD") 207 | 208 | path = os.path.join(model_dir, "githash") 209 | if os.path.exists(path): 210 | saved_hash = open(path).read() 211 | if saved_hash != cur_hash: 212 | logger.warning("git hash values are different. 
{}(saved) != {}(current)".format( 213 | saved_hash[:8], cur_hash[:8])) 214 | else: 215 | open(path, "w").write(cur_hash) 216 | 217 | 218 | def get_logger(model_dir, filename="train.log"): 219 | global logger 220 | logger = logging.getLogger(os.path.basename(model_dir)) 221 | logger.setLevel(logging.DEBUG) 222 | 223 | formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") 224 | if not os.path.exists(model_dir): 225 | os.makedirs(model_dir) 226 | h = logging.FileHandler(os.path.join(model_dir, filename)) 227 | h.setLevel(logging.DEBUG) 228 | h.setFormatter(formatter) 229 | logger.addHandler(h) 230 | return logger 231 | 232 | 233 | class HParams(): 234 | def __init__(self, **kwargs): 235 | for k, v in kwargs.items(): 236 | if type(v) == dict: 237 | v = HParams(**v) 238 | self[k] = v 239 | 240 | def keys(self): 241 | return self.__dict__.keys() 242 | 243 | def items(self): 244 | return self.__dict__.items() 245 | 246 | def values(self): 247 | return self.__dict__.values() 248 | 249 | def __len__(self): 250 | return len(self.__dict__) 251 | 252 | def __getitem__(self, key): 253 | return getattr(self, key) 254 | 255 | def __setitem__(self, key, value): 256 | return setattr(self, key, value) 257 | 258 | def __contains__(self, key): 259 | return key in self.__dict__ 260 | 261 | def __repr__(self): 262 | return self.__dict__.__repr__() 263 | 264 | 265 | 266 | def plot_data_to_numpy(x, y): 267 | global MATPLOTLIB_FLAG 268 | if not MATPLOTLIB_FLAG: 269 | import matplotlib 270 | matplotlib.use("Agg") 271 | MATPLOTLIB_FLAG = True 272 | mpl_logger = logging.getLogger('matplotlib') 273 | mpl_logger.setLevel(logging.WARNING) 274 | import matplotlib.pylab as plt 275 | import numpy as np 276 | 277 | fig, ax = plt.subplots(figsize=(10, 2)) 278 | plt.plot(x) 279 | plt.plot(y) 280 | plt.tight_layout() 281 | 282 | fig.canvas.draw() 283 | data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8) 284 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 285 | plt.close() 286 | return data 287 | --------------------------------------------------------------------------------
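Illustrative note (not a file in the repository): the generation path exercised by evaluate() in train.py — the FlowGenerator called with gen=True, glow=True, then NsfHifiGAN.spec2wav applied to the predicted mel and f0 — can also be driven standalone. The sketch below only rearranges calls that appear in the code above (models.FlowGenerator, utils.get_hparams_from_file, utils.latest_checkpoint_path, utils.load_checkpoint, NsfHifiGAN.spec2wav, soundfile.write). The script name, the content.npy / f0.npy inputs, the logs/model_name directory and the "spk0" speaker key are placeholders, and the feature/f0 layout is assumed to match whatever the training data pipeline feeds the model; treat it as a minimal sketch under those assumptions, not a shipped inference tool.

```python
# infer_sketch.py -- hypothetical standalone generation, mirroring evaluate() in train.py
import numpy as np
import soundfile
import torch

import models
import utils
from hifigan import NsfHifiGAN

hps = utils.get_hparams_from_file("configs/config.json")

# build the flow model exactly as train.py does and load the latest checkpoint
generator = models.FlowGenerator(
    n_vocab=0,
    out_channels=hps.data.n_mel_channels,
    **hps.model).cuda()
ckpt = utils.latest_checkpoint_path("logs/model_name", "G_*.pth")  # placeholder run dir
utils.load_checkpoint(ckpt, generator, None, skip_optimizer=True)
generator.eval()

vocoder = NsfHifiGAN('cuda')

# Assumption: contentvec features and f0 were precomputed by the extraction
# scripts and saved as .npy; their layout must match what the training
# data pipeline produces (not reproduced here).
x = torch.from_numpy(np.load("content.npy")).float().unsqueeze(0).cuda()
f0 = torch.from_numpy(np.load("f0.npy")).float().unsqueeze(0).cuda()
speakers = torch.LongTensor([hps.data.spk2id["spk0"]]).cuda()  # "spk0" is a placeholder key

with torch.no_grad():
    # same call pattern as evaluate(): reverse the flow and take the predicted f0
    mel, pred_f0 = generator(x, f0=f0, g=speakers, gen=True, glow=True)

wav = vocoder.spec2wav(mel.squeeze(0).transpose(0, 1).cpu().numpy(),
                       f0=pred_f0[0, 0, :].cpu().numpy())
soundfile.write("out.wav", wav, hps.data.sampling_rate)
```

As in evaluate(), the vocoder is driven by the model's predicted f0 (pred_f0); passing the input f0 instead would reproduce the ground-truth-pitch reconstruction path.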