├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── attentions.py ├── audio_processing.py ├── commons.py ├── configs └── config.json ├── data_utils.py ├── diffusion.py ├── extract_f0_mel.py ├── extract_vec.py ├── f0_extractor ├── F0Predictor.py ├── FCPEF0Predictor.py ├── __init__.py └── fcpe │ ├── __init__.py │ ├── model.py │ ├── nvSTFT.py │ └── pcmer.py ├── feature_extractor ├── __init__.py └── contentvec768.py ├── filelists └── .gitkeep ├── hifigan ├── __init__.py ├── modules │ ├── hifigan │ │ ├── hifigan.py │ │ └── mel_utils.py │ ├── nsf_hifigan │ │ ├── env.py │ │ ├── models.py │ │ ├── nvSTFT.py │ │ └── utils.py │ └── parallel_wavegan │ │ ├── __init__.py │ │ ├── layers │ │ ├── __init__.py │ │ ├── causal_conv.py │ │ ├── pqmf.py │ │ ├── residual_block.py │ │ ├── residual_stack.py │ │ ├── tf_layers.py │ │ └── upsample.py │ │ ├── losses │ │ ├── __init__.py │ │ └── stft_loss.py │ │ ├── models │ │ ├── __init__.py │ │ ├── melgan.py │ │ ├── parallel_wavegan.py │ │ └── source.py │ │ ├── optimizers │ │ ├── __init__.py │ │ └── radam.py │ │ ├── stft_loss.py │ │ └── utils │ │ ├── __init__.py │ │ └── utils.py └── network │ └── vocoders │ ├── __init__.py │ ├── base_vocoder.py │ ├── hifigan.py │ ├── nsf_hifigan.py │ ├── pwg.py │ └── vocoder_utils.py ├── mel_processing.py ├── models.py ├── modules.py ├── preprocess_flist_config.py ├── pretrain ├── content-vec-best │ └── config.json ├── fcpe │ └── .gitkeep └── nsf-hifigan │ ├── config.json │ └── put_441hifigan_ckpt_here ├── resample.py ├── stft.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | DATASETS 2 | DUMMY 3 | DUMMY2 4 | samples 5 | logs 6 | __pycache__ 7 | .ipynb_checkpoints 8 | .*.swp 9 | 10 | build 11 | *.c 12 | monotonic_align/monotonic_align 13 | vocos/pytorch_model.bin 14 | dataset 15 | hifigan/model -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "waveglow"] 2 | path = waveglow 3 | url = https://github.com/NVIDIA/waveglow.git 4 | [submodule "hifigan"] 5 | path = hifigan 6 | url = https://github.com/jik876/hifi-gan.git 7 | [submodule "hifi-gan"] 8 | path = hifi-gan 9 | url = https://github.com/jik876/hifi-gan.git 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jaehyeon Kim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Glow-SVC 2 | 3 | another implementation by rcell, based on the official [glow-tts](https://github.com/jaywalnut310/glow-tts) repo 4 | + content-vec + fcpe(f0) -> glowtts -> nsf-hifigan -> wav 5 | + plenty of pitfalls remain 6 | + fp16 training blows up 7 | 8 | pretrain: 9 | + [content-vec-best](https://huggingface.co/lengyue233/content-vec-best/resolve/main/pytorch_model.bin) 10 | + [fcpe](https://huggingface.co/datasets/ylzz1997/rmvpe_pretrain_model/resolve/main/fcpe.pt) 11 | + [nsf-hifigan](https://github.com/openvpi/vocoders/releases/download/nsf-hifigan-v1/nsf_hifigan_20221211.zip) 12 | 13 | 14 | preprocess: 15 | + [resample.py](resample.py) -> [preprocess_flist_config.py](preprocess_flist_config.py) 16 | -> [extract_vec.py](extract_vec.py) -> [extract_f0_mel.py](extract_f0_mel.py) 17 | 18 | train: 19 | + python train.py -c configs/config.json -m model_name 20 | 21 | infer: 22 | + not available yet -------------------------------------------------------------------------------- /attentions.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | import commons 9 | import modules 10 | from modules import LayerNorm 11 | 12 | 13 | class Encoder(nn.Module): 14 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=None, block_length=None, **kwargs): 15 | super().__init__() 16 | self.hidden_channels = hidden_channels 17 | self.filter_channels = filter_channels 18 | self.n_heads = n_heads 19 | self.n_layers = n_layers 20 | self.kernel_size = kernel_size 21 | self.p_dropout = p_dropout 22 | self.window_size = window_size 23 | self.block_length = block_length 24 | 25 | self.drop = nn.Dropout(p_dropout) 26 | self.attn_layers = nn.ModuleList() 27 | self.norm_layers_1 = nn.ModuleList() 28 | self.ffn_layers = nn.ModuleList() 29 | self.norm_layers_2 = nn.ModuleList() 30 | for i in range(self.n_layers): 31 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, window_size=window_size, p_dropout=p_dropout, block_length=block_length)) 32 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 33 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) 34 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 35 | 36 | def forward(self, x, x_mask): 37 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 38 | for i in range(self.n_layers): 39 | x = x * x_mask 40 | y = self.attn_layers[i](x, x, attn_mask) 41 | y = self.drop(y) 42 | x = self.norm_layers_1[i](x + y) 43 | 44 | y = self.ffn_layers[i](x, x_mask) 45 | y = self.drop(y) 46 | x = self.norm_layers_2[i](x + y) 47 | x = x * x_mask 48 | return x 49 | 50 | 51 | class CouplingBlock(nn.Module): 52 | def __init__(self, in_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0, sigmoid_scale=False): 53 | super().__init__() 54 | self.in_channels = in_channels 55 | self.hidden_channels = 
hidden_channels 56 | self.kernel_size = kernel_size 57 | self.dilation_rate = dilation_rate 58 | self.n_layers = n_layers 59 | self.gin_channels = gin_channels 60 | self.p_dropout = p_dropout 61 | self.sigmoid_scale = sigmoid_scale 62 | 63 | start = torch.nn.Conv1d(in_channels//2, hidden_channels, 1) 64 | start = torch.nn.utils.weight_norm(start) 65 | self.start = start 66 | # Initializing last layer to 0 makes the affine coupling layers 67 | # do nothing at first. It helps to stabilze training. 68 | end = torch.nn.Conv1d(hidden_channels, in_channels, 1) 69 | end.weight.data.zero_() 70 | end.bias.data.zero_() 71 | self.end = end 72 | 73 | self.wn = modules.WN(in_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels, p_dropout) 74 | 75 | 76 | def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs): 77 | b, c, t = x.size() 78 | if x_mask is None: 79 | x_mask = 1 80 | x_0, x_1 = x[:,:self.in_channels//2], x[:,self.in_channels//2:] 81 | 82 | x = self.start(x_0) * x_mask 83 | x = self.wn(x, x_mask, g) 84 | out = self.end(x) 85 | 86 | z_0 = x_0 87 | m = out[:, :self.in_channels//2, :] 88 | logs = out[:, self.in_channels//2:, :] 89 | if self.sigmoid_scale: 90 | logs = torch.log(1e-6 + torch.sigmoid(logs + 2)) 91 | 92 | if reverse: 93 | z_1 = (x_1 - m) * torch.exp(-logs) * x_mask 94 | logdet = None 95 | else: 96 | z_1 = (m + torch.exp(logs) * x_1) * x_mask 97 | logdet = torch.sum(logs * x_mask, [1, 2]) 98 | 99 | z = torch.cat([z_0, z_1], 1) 100 | return z, logdet 101 | 102 | def store_inverse(self): 103 | self.wn.remove_weight_norm() 104 | 105 | 106 | class MultiHeadAttention(nn.Module): 107 | def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., block_length=None, proximal_bias=False, proximal_init=False): 108 | super().__init__() 109 | assert channels % n_heads == 0 110 | 111 | self.channels = channels 112 | self.out_channels = out_channels 113 | self.n_heads = n_heads 114 | self.window_size = window_size 115 | self.heads_share = heads_share 116 | self.block_length = block_length 117 | self.proximal_bias = proximal_bias 118 | self.p_dropout = p_dropout 119 | self.attn = None 120 | 121 | self.k_channels = channels // n_heads 122 | self.conv_q = nn.Conv1d(channels, channels, 1) 123 | self.conv_k = nn.Conv1d(channels, channels, 1) 124 | self.conv_v = nn.Conv1d(channels, channels, 1) 125 | if window_size is not None: 126 | n_heads_rel = 1 if heads_share else n_heads 127 | rel_stddev = self.k_channels**-0.5 128 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 129 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 130 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 131 | self.drop = nn.Dropout(p_dropout) 132 | 133 | nn.init.xavier_uniform_(self.conv_q.weight) 134 | nn.init.xavier_uniform_(self.conv_k.weight) 135 | if proximal_init: 136 | self.conv_k.weight.data.copy_(self.conv_q.weight.data) 137 | self.conv_k.bias.data.copy_(self.conv_q.bias.data) 138 | nn.init.xavier_uniform_(self.conv_v.weight) 139 | 140 | def forward(self, x, c, attn_mask=None): 141 | q = self.conv_q(x) 142 | k = self.conv_k(c) 143 | v = self.conv_v(c) 144 | 145 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 146 | 147 | x = self.conv_o(x) 148 | return x 149 | 150 | def attention(self, query, key, value, mask=None): 151 | # reshape [b, d, t] -> [b, n_h, t, d_k] 152 | b, d, t_s, t_t = (*key.size(), query.size(2)) 153 | 
query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 154 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 155 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 156 | 157 | scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels) 158 | if self.window_size is not None: 159 | assert t_s == t_t, "Relative attention is only available for self-attention." 160 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 161 | rel_logits = self._matmul_with_relative_keys(query, key_relative_embeddings) 162 | rel_logits = self._relative_position_to_absolute_position(rel_logits) 163 | scores_local = rel_logits / math.sqrt(self.k_channels) 164 | scores = scores + scores_local 165 | if self.proximal_bias: 166 | assert t_s == t_t, "Proximal bias is only available for self-attention." 167 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) 168 | if mask is not None: 169 | scores = scores.masked_fill(mask == 0, -1e4) 170 | if self.block_length is not None: 171 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) 172 | scores = scores * block_mask + -1e4*(1 - block_mask) 173 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 174 | p_attn = self.drop(p_attn) 175 | output = torch.matmul(p_attn, value) 176 | if self.window_size is not None: 177 | relative_weights = self._absolute_position_to_relative_position(p_attn) 178 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) 179 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) 180 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] 181 | return output, p_attn 182 | 183 | def _matmul_with_relative_values(self, x, y): 184 | """ 185 | x: [b, h, l, m] 186 | y: [h or 1, m, d] 187 | ret: [b, h, l, d] 188 | """ 189 | ret = torch.matmul(x, y.unsqueeze(0)) 190 | return ret 191 | 192 | def _matmul_with_relative_keys(self, x, y): 193 | """ 194 | x: [b, h, l, d] 195 | y: [h or 1, m, d] 196 | ret: [b, h, l, m] 197 | """ 198 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 199 | return ret 200 | 201 | def _get_relative_embeddings(self, relative_embeddings, length): 202 | max_relative_position = 2 * self.window_size + 1 203 | # Pad first before slice to avoid using cond ops. 204 | pad_length = max(length - (self.window_size + 1), 0) 205 | slice_start_position = max((self.window_size + 1) - length, 0) 206 | slice_end_position = slice_start_position + 2 * length - 1 207 | if pad_length > 0: 208 | padded_relative_embeddings = F.pad( 209 | relative_embeddings, 210 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) 211 | else: 212 | padded_relative_embeddings = relative_embeddings 213 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] 214 | return used_relative_embeddings 215 | 216 | def _relative_position_to_absolute_position(self, x): 217 | """ 218 | x: [b, h, l, 2*l-1] 219 | ret: [b, h, l, l] 220 | """ 221 | batch, heads, length, _ = x.size() 222 | # Concat columns of pad to shift from relative to absolute indexing. 223 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) 224 | 225 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 
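# The skew trick: after the extra pad, the flattened tensor is re-padded with (length - 1) zeros so that
# the view as [b, h, length + 1, 2*length - 1] shifts each row by one position; slicing [:, :, :length, length-1:]
# then reads off the absolute-position scores without an explicit gather.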
226 | x_flat = x.view([batch, heads, length * 2 * length]) 227 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) 228 | 229 | # Reshape and slice out the padded elements. 230 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] 231 | return x_final 232 | 233 | def _absolute_position_to_relative_position(self, x): 234 | """ 235 | x: [b, h, l, l] 236 | ret: [b, h, l, 2*l-1] 237 | """ 238 | batch, heads, length, _ = x.size() 239 | # padd along column 240 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) 241 | x_flat = x.view([batch, heads, length**2 + length*(length -1)]) 242 | # add 0's in the beginning that will skew the elements after reshape 243 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 244 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] 245 | return x_final 246 | 247 | def _attention_bias_proximal(self, length): 248 | """Bias for self-attention to encourage attention to close positions. 249 | Args: 250 | length: an integer scalar. 251 | Returns: 252 | a Tensor with shape [1, 1, length, length] 253 | """ 254 | r = torch.arange(length, dtype=torch.float32) 255 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 256 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 257 | 258 | 259 | class FFN(nn.Module): 260 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None): 261 | super().__init__() 262 | self.in_channels = in_channels 263 | self.out_channels = out_channels 264 | self.filter_channels = filter_channels 265 | self.kernel_size = kernel_size 266 | self.p_dropout = p_dropout 267 | self.activation = activation 268 | 269 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) 270 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size, padding=kernel_size//2) 271 | self.drop = nn.Dropout(p_dropout) 272 | 273 | def forward(self, x, x_mask): 274 | x = self.conv_1(x * x_mask) 275 | if self.activation == "gelu": 276 | x = x * torch.sigmoid(1.702 * x) 277 | else: 278 | x = torch.relu(x) 279 | x = self.drop(x) 280 | x = self.conv_2(x * x_mask) 281 | return x * x_mask 282 | 283 | -------------------------------------------------------------------------------- /audio_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.signal import get_window 4 | import librosa.util as librosa_util 5 | 6 | 7 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800, 8 | n_fft=800, dtype=np.float32, norm=None): 9 | """ 10 | # from librosa 0.6 11 | Compute the sum-square envelope of a window function at a given hop length. 12 | 13 | This is used to estimate modulation effects induced by windowing 14 | observations in short-time fourier transforms. 15 | 16 | Parameters 17 | ---------- 18 | window : string, tuple, number, callable, or list-like 19 | Window specification, as in `get_window` 20 | 21 | n_frames : int > 0 22 | The number of analysis frames 23 | 24 | hop_length : int > 0 25 | The number of samples to advance between frames 26 | 27 | win_length : [optional] 28 | The length of the window function. By default, this matches `n_fft`. 29 | 30 | n_fft : int > 0 31 | The length of each analysis frame. 
32 | 33 | dtype : np.dtype 34 | The data type of the output 35 | 36 | Returns 37 | ------- 38 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 39 | The sum-squared envelope of the window function 40 | """ 41 | if win_length is None: 42 | win_length = n_fft 43 | 44 | n = n_fft + hop_length * (n_frames - 1) 45 | x = np.zeros(n, dtype=dtype) 46 | 47 | # Compute the squared window at the desired length 48 | win_sq = get_window(window, win_length, fftbins=True) 49 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 50 | win_sq = librosa_util.pad_center(win_sq, n_fft) 51 | 52 | # Fill the envelope 53 | for i in range(n_frames): 54 | sample = i * hop_length 55 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] 56 | return x 57 | 58 | 59 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 60 | """ 61 | PARAMS 62 | ------ 63 | magnitudes: spectrogram magnitudes 64 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 65 | """ 66 | 67 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 68 | angles = angles.astype(np.float32) 69 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 70 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 71 | 72 | for i in range(n_iters): 73 | _, angles = stft_fn.transform(signal) 74 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 75 | return signal 76 | 77 | 78 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 79 | """ 80 | PARAMS 81 | ------ 82 | C: compression factor 83 | """ 84 | return torch.log(torch.clamp(x, min=clip_val) * C) 85 | 86 | 87 | def dynamic_range_decompression(x, C=1): 88 | """ 89 | PARAMS 90 | ------ 91 | C: compression factor used to compress 92 | """ 93 | return torch.exp(x) / C 94 | -------------------------------------------------------------------------------- /commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | from librosa.filters import mel as librosa_mel_fn 8 | from audio_processing import dynamic_range_compression 9 | from audio_processing import dynamic_range_decompression 10 | from stft import STFT 11 | 12 | 13 | def intersperse(lst, item): 14 | result = [item] * (len(lst) * 2 + 1) 15 | result[1::2] = lst 16 | return result 17 | 18 | 19 | def mle_loss(z, m, logs, logdet, mask): 20 | l = torch.sum(logs) + 0.5 * torch.sum(torch.exp(-2 * logs) * ((z - m)**2)) # neg normal likelihood w/o the constant term 21 | l = l - torch.sum(logdet) # log jacobian determinant 22 | l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes 23 | l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term 24 | return l 25 | 26 | 27 | def duration_loss(logw, logw_, lengths): 28 | l = torch.sum((logw - logw_)**2) / torch.sum(lengths) 29 | return l 30 | 31 | 32 | @torch.jit.script 33 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 34 | n_channels_int = n_channels[0] 35 | in_act = input_a + input_b 36 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 37 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 38 | acts = t_act * s_act 39 | return acts 40 | 41 | 42 | def convert_pad_shape(pad_shape): 43 | l = pad_shape[::-1] 44 | pad_shape = [item for sublist in l for item in sublist] 45 | return pad_shape 46 | 47 | 48 | def shift_1d(x): 49 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 50 
| return x 51 | 52 | 53 | def sequence_mask(length, max_length=None): 54 | if max_length is None: 55 | max_length = length.max() 56 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 57 | return x.unsqueeze(0) < length.unsqueeze(1) 58 | 59 | 60 | def maximum_path(value, mask, max_neg_val=-np.inf): 61 | """ Numpy-friendly version. It's about 4 times faster than torch version. 62 | value: [b, t_x, t_y] 63 | mask: [b, t_x, t_y] 64 | """ 65 | value = value * mask 66 | 67 | device = value.device 68 | dtype = value.dtype 69 | value = value.cpu().detach().numpy() 70 | mask = mask.cpu().detach().numpy().astype(np.bool) 71 | 72 | b, t_x, t_y = value.shape 73 | direction = np.zeros(value.shape, dtype=np.int64) 74 | v = np.zeros((b, t_x), dtype=np.float32) 75 | x_range = np.arange(t_x, dtype=np.float32).reshape(1,-1) 76 | for j in range(t_y): 77 | v0 = np.pad(v, [[0,0],[1,0]], mode="constant", constant_values=max_neg_val)[:, :-1] 78 | v1 = v 79 | max_mask = (v1 >= v0) 80 | v_max = np.where(max_mask, v1, v0) 81 | direction[:, :, j] = max_mask 82 | 83 | index_mask = (x_range <= j) 84 | v = np.where(index_mask, v_max + value[:, :, j], max_neg_val) 85 | direction = np.where(mask, direction, 1) 86 | 87 | path = np.zeros(value.shape, dtype=np.float32) 88 | index = mask[:, :, 0].sum(1).astype(np.int64) - 1 89 | index_range = np.arange(b) 90 | for j in reversed(range(t_y)): 91 | path[index_range, index, j] = 1 92 | index = index + direction[index_range, index, j] - 1 93 | path = path * mask.astype(np.float32) 94 | path = torch.from_numpy(path).to(device=device, dtype=dtype) 95 | return path 96 | 97 | 98 | def generate_path(duration, mask): 99 | """ 100 | duration: [b, t_x] 101 | mask: [b, t_x, t_y] 102 | """ 103 | device = duration.device 104 | 105 | b, t_x, t_y = mask.shape 106 | cum_duration = torch.cumsum(duration, 1) 107 | path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device) 108 | 109 | cum_duration_flat = cum_duration.view(b * t_x) 110 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 111 | path = path.view(b, t_x, t_y) 112 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:,:-1] 113 | path = path * mask 114 | return path 115 | 116 | 117 | class Adam(): 118 | def __init__(self, params, scheduler, dim_model, warmup_steps=4000, lr=1e0, betas=(0.9, 0.98), eps=1e-9): 119 | self.params = params 120 | self.scheduler = scheduler 121 | self.dim_model = dim_model 122 | self.warmup_steps = warmup_steps 123 | self.lr = lr 124 | self.betas = betas 125 | self.eps = eps 126 | 127 | self.step_num = 1 128 | self.cur_lr = lr * self._get_lr_scale() 129 | 130 | self._optim = torch.optim.Adam(params, lr=self.cur_lr, betas=betas, eps=eps) 131 | def _get_lr_scale(self): 132 | if self.scheduler == "noam": 133 | return np.power(self.dim_model, -0.5) * np.min([np.power(self.step_num, -0.5), self.step_num * np.power(self.warmup_steps, -1.5)]) 134 | else: 135 | return 1 136 | 137 | def _update_learning_rate(self): 138 | self.step_num += 1 139 | if self.scheduler == "noam": 140 | self.cur_lr = self.lr * self._get_lr_scale() 141 | for param_group in self._optim.param_groups: 142 | param_group['lr'] = self.cur_lr 143 | 144 | def get_lr(self): 145 | return self.cur_lr 146 | 147 | def step(self): 148 | self._optim.step() 149 | self._update_learning_rate() 150 | 151 | def zero_grad(self): 152 | self._optim.zero_grad() 153 | 154 | def load_state_dict(self, d): 155 | self._optim.load_state_dict(d) 156 | 157 | def state_dict(self): 158 | return 
self._optim.state_dict() 159 | 160 | 161 | class TacotronSTFT(nn.Module): 162 | def __init__(self, filter_length=1024, hop_length=256, win_length=1024, 163 | n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, 164 | mel_fmax=8000.0): 165 | super(TacotronSTFT, self).__init__() 166 | self.n_mel_channels = n_mel_channels 167 | self.sampling_rate = sampling_rate 168 | self.stft_fn = STFT(filter_length, hop_length, win_length) 169 | mel_basis = librosa_mel_fn( 170 | sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) 171 | mel_basis = torch.from_numpy(mel_basis).float() 172 | self.register_buffer('mel_basis', mel_basis) 173 | 174 | def spectral_normalize(self, magnitudes): 175 | output = dynamic_range_compression(magnitudes) 176 | return output 177 | 178 | def spectral_de_normalize(self, magnitudes): 179 | output = dynamic_range_decompression(magnitudes) 180 | return output 181 | 182 | def mel_spectrogram(self, y): 183 | """Computes mel-spectrograms from a batch of waves 184 | PARAMS 185 | ------ 186 | y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] 187 | 188 | RETURNS 189 | ------- 190 | mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) 191 | """ 192 | assert(torch.min(y.data) >= -1) 193 | assert(torch.max(y.data) <= 1) 194 | 195 | magnitudes, phases = self.stft_fn.transform(y) 196 | magnitudes = magnitudes.data 197 | mel_output = torch.matmul(self.mel_basis, magnitudes) 198 | mel_output = self.spectral_normalize(mel_output) 199 | return mel_output 200 | 201 | 202 | def clip_grad_value_(parameters, clip_value, norm_type=2): 203 | if isinstance(parameters, torch.Tensor): 204 | parameters = [parameters] 205 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 206 | norm_type = float(norm_type) 207 | clip_value = float(clip_value) 208 | 209 | total_norm = 0 210 | for p in parameters: 211 | param_norm = p.grad.data.norm(norm_type) 212 | total_norm += param_norm.item() ** norm_type 213 | 214 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 215 | total_norm = total_norm ** (1. 
/ norm_type) 216 | return total_norm 217 | 218 | 219 | def squeeze(x, x_mask=None, n_sqz=2): 220 | b, c, t = x.size() 221 | 222 | t = (t // n_sqz) * n_sqz 223 | x = x[:,:,:t] 224 | x_sqz = x.view(b, c, t//n_sqz, n_sqz) 225 | x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c*n_sqz, t//n_sqz) 226 | 227 | if x_mask is not None: 228 | x_mask = x_mask[:,:,n_sqz-1::n_sqz] 229 | else: 230 | x_mask = torch.ones(b, 1, t//n_sqz).to(device=x.device, dtype=x.dtype) 231 | return x_sqz * x_mask, x_mask 232 | 233 | 234 | def unsqueeze(x, x_mask=None, n_sqz=2): 235 | b, c, t = x.size() 236 | 237 | x_unsqz = x.view(b, n_sqz, c//n_sqz, t) 238 | x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c//n_sqz, t*n_sqz) 239 | 240 | if x_mask is not None: 241 | x_mask = x_mask.unsqueeze(-1).repeat(1,1,1,n_sqz).view(b, 1, t*n_sqz) 242 | else: 243 | x_mask = torch.ones(b, 1, t*n_sqz).to(device=x.device, dtype=x.dtype) 244 | return x_unsqz * x_mask, x_mask 245 | 246 | -------------------------------------------------------------------------------- /configs/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "use_cuda": true, 4 | "log_interval": 50, 5 | "seed": 1234, 6 | "epochs": 10000, 7 | "learning_rate": 1e0, 8 | "betas": [0.9, 0.98], 9 | "eps": 1e-9, 10 | "warmup_steps": 4000, 11 | "scheduler": "noam", 12 | "batch_size": 12, 13 | "ddi": true, 14 | "fp16_run": false 15 | }, 16 | "data": { 17 | "load_mel_from_disk": false, 18 | "training_files":"filelists/train.list", 19 | "validation_files":"filelists/val.list", 20 | "text_cleaners":["english_cleaners"], 21 | "max_wav_value": 32768.0, 22 | 23 | "sampling_rate": 44100, 24 | "filter_length": 2048, 25 | "hop_length": 512, 26 | "win_length": 2048, 27 | "n_mel_channels": 128, 28 | "mel_fmin": 40, 29 | "mel_fmax": 16000, 30 | 31 | "add_noise": true, 32 | "add_blank": true, 33 | "spk2id": { 34 | "opencpop": 0 35 | } 36 | }, 37 | "model": { 38 | "hidden_channels": 192, 39 | "filter_channels": 768, 40 | "filter_channels_dp": 256, 41 | "kernel_size": 3, 42 | "p_dropout": 0.1, 43 | "n_blocks_dec": 12, 44 | "n_layers_enc": 6, 45 | "n_heads": 2, 46 | "p_dropout_dec": 0.05, 47 | "dilation_rate": 1, 48 | "kernel_size_dec": 5, 49 | "n_block_layers": 4, 50 | "n_sqz": 2, 51 | "prenet": true, 52 | "mean_only": true, 53 | "hidden_channels_enc": 192, 54 | "hidden_channels_dec": 192, 55 | "window_size": 4, 56 | "n_speakers": 200, 57 | "gin_channels": 192 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /data_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import random 4 | import numpy as np 5 | import torch 6 | import torch.utils.data 7 | 8 | import commons 9 | import mel_processing 10 | from utils import load_filepaths_and_text 11 | import torch.nn.functional as F 12 | """Multi speaker version""" 13 | 14 | 15 | class TextAudioSpeakerLoader(torch.utils.data.Dataset): 16 | """ 17 | 1) loads audio, speaker_id, text pairs 18 | 2) normalizes text and converts them to sequences of integers 19 | 3) computes spectrograms from audio files. 
20 | """ 21 | 22 | def __init__(self, audiopaths_sid_text, hparams, val=False): 23 | self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text) 24 | self.max_wav_value = hparams.max_wav_value 25 | self.sampling_rate = hparams.sampling_rate 26 | self.filter_length = hparams.filter_length 27 | self.hop_length = hparams.hop_length 28 | self.win_length = hparams.win_length 29 | self.sampling_rate = hparams.sampling_rate 30 | self.spk_map = hparams.spk2id 31 | 32 | self.cleaned_text = getattr(hparams, "cleaned_text", False) 33 | 34 | self.add_blank = hparams.add_blank 35 | self.min_text_len = getattr(hparams, "min_text_len", 1) 36 | self.max_text_len = getattr(hparams, "max_text_len", 300) 37 | self.hps = hparams 38 | random.seed(1234) 39 | random.shuffle(self.audiopaths_sid_text) 40 | self._filter(val) 41 | self.fcpe = None 42 | 43 | def _filter(self, val): 44 | """ 45 | Filter text & store spec lengths 46 | """ 47 | # Store spectrogram lengths for Bucketing 48 | # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) 49 | # spec_length = wav_length // hop_length 50 | 51 | audiopaths_sid_text_new = [] 52 | lengths = [] 53 | skipped = 0 54 | for item in self.audiopaths_sid_text: 55 | _id, spk = item[:2] 56 | audiopath = f'dataset/{spk}/{_id}.wav' 57 | if not os.path.exists(audiopath): 58 | skipped += 1 59 | continue 60 | length_ = os.path.getsize(audiopath) // (2 * self.hop_length) 61 | if (length_ < 120 or length_>1400 ) and not val: 62 | skipped += 1 63 | continue 64 | audiopaths_sid_text_new.append([audiopath, spk]) 65 | 66 | print("skipped: ", skipped, ", total: ", len(self.audiopaths_sid_text)) 67 | self.audiopaths_sid_text = audiopaths_sid_text_new 68 | self.lengths = lengths 69 | 70 | 71 | def get_audio_text_speaker_pair(self, audiopath_sid_text): 72 | # separate filename, speaker_id and text 73 | audiopath, sid = audiopath_sid_text 74 | 75 | mel, wav = self.get_spec(audiopath) 76 | 77 | ssl = torch.load(audiopath.replace(".wav", ".ssl.pt")) 78 | ssl = F.interpolate(ssl, size=mel.shape[-1], mode="nearest") 79 | 80 | sid = torch.LongTensor([int(self.spk_map[sid])]) 81 | f0 = self.get_pitch(wav[0], mel.shape[1], audiopath) 82 | return (ssl, mel, wav, sid, f0) 83 | 84 | def get_spec(self, filename): 85 | wav_torch, _ = mel_processing.load_wav_to_torch(filename, target_sr=self.hps.sampling_rate) 86 | mel_path = filename.replace(".wav", ".mel.pt") 87 | if os.path.exists(mel_path): 88 | mel = torch.load(mel_path) 89 | return mel, wav_torch.unsqueeze(0) 90 | 91 | mel = mel_processing.get_mel(wav_torch, 92 | self.hps.sampling_rate, 93 | self.hps.n_mel_channels, 94 | self.hps.filter_length, 95 | self.hps.win_length, 96 | self.hps.hop_length, 97 | self.hps.mel_fmin, 98 | self.hps.mel_fmax) 99 | torch.save(mel, mel_path) 100 | return mel, wav_torch.unsqueeze(0) 101 | 102 | def get_text(self, text, tone, language): 103 | text_norm, tone, language = cleaned_text_to_sequence(text, tone, language) 104 | if self.add_blank: 105 | text_norm = commons.intersperse(text_norm, 0) 106 | tone = commons.intersperse(tone, 0) 107 | language = commons.intersperse(language, 0) 108 | text_norm = torch.LongTensor(text_norm) 109 | tone = torch.LongTensor(tone) 110 | language = torch.LongTensor(language) 111 | return text_norm, tone, language 112 | 113 | def get_pitch(self, wav, p_len, wavpath): 114 | f0_path = wavpath.replace(".wav", ".f0.pt") 115 | if os.path.exists(f0_path): 116 | return torch.load(f0_path) 117 | 118 | 119 | if self.fcpe is None: 120 | from 
f0_extractor.FCPEF0Predictor import FCPEF0Predictor 121 | print("init fcpe") 122 | self.fcpe = FCPEF0Predictor(sampling_rate=self.sampling_rate, hop_length=self.hop_length) 123 | pred_f0, uv = self.fcpe.compute_f0_uv(wav, p_len=p_len) 124 | f0 = torch.FloatTensor(pred_f0) 125 | torch.save(f0, f0_path) 126 | return f0 127 | 128 | def get_sid(self, sid): 129 | sid = torch.LongTensor([int(sid)]) 130 | return sid 131 | 132 | def __getitem__(self, index): 133 | return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index]) 134 | 135 | def __len__(self): 136 | return len(self.audiopaths_sid_text) 137 | 138 | 139 | class TextAudioSpeakerCollate(): 140 | """ Zero-pads model inputs and targets 141 | """ 142 | 143 | def __init__(self, return_ids=False): 144 | self.return_ids = return_ids 145 | 146 | def __call__(self, batch): 147 | """Collate's training batch from normalized text, audio and speaker identities 148 | PARAMS 149 | ------ 150 | batch: [text_normalized, spec_normalized, wav_normalized, sid] 151 | """ 152 | # Right zero-pad all one-hot text sequences to max input length 153 | _, ids_sorted_decreasing = torch.sort( 154 | torch.LongTensor([x[1].size(1) for x in batch]), 155 | dim=0, descending=True) 156 | 157 | max_mel_len = max([x[1].size(1) for x in batch]) 158 | max_wav_len = max([x[2].size(1) for x in batch]) 159 | 160 | mel_lengths = torch.LongTensor(len(batch)) 161 | wav_lengths = torch.LongTensor(len(batch)) 162 | sid = torch.LongTensor(len(batch)) 163 | 164 | c_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_mel_len) 165 | wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) 166 | mel_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_mel_len) 167 | f0_padded = torch.FloatTensor(len(batch), max_mel_len) 168 | c_padded.zero_() 169 | mel_padded.zero_() 170 | wav_padded.zero_() 171 | f0_padded.zero_() 172 | 173 | for i in range(len(ids_sorted_decreasing)): 174 | row = batch[ids_sorted_decreasing[i]] 175 | 176 | content = row[0][0,:, :] 177 | c_padded[i,:, :content.size(1)] = content 178 | 179 | mel = row[1] 180 | mel_padded[i, :, :mel.size(1)] = mel 181 | mel_lengths[i] = mel.size(1) 182 | 183 | wav = row[2] 184 | wav_padded[i, :, :wav.size(1)] = wav 185 | wav_lengths[i] = wav.size(1) 186 | 187 | sid[i] = row[3] 188 | 189 | f0 = row[4] 190 | f0_padded[i, :f0.size(0)] = f0 191 | 192 | return c_padded, mel_padded, mel_lengths,wav_padded, wav_lengths,\ 193 | sid, f0_padded 194 | 195 | 196 | class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): 197 | """ 198 | Maintain similar input lengths in a batch. 199 | Length groups are specified by boundaries. 200 | Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}. 201 | 202 | It removes samples which are not included in the boundaries. 203 | Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. 
204 | """ 205 | 206 | def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True): 207 | super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) 208 | self.lengths = dataset.lengths 209 | self.batch_size = batch_size 210 | self.boundaries = boundaries 211 | 212 | self.buckets, self.num_samples_per_bucket = self._create_buckets() 213 | self.total_size = sum(self.num_samples_per_bucket) 214 | self.num_samples = self.total_size // self.num_replicas 215 | 216 | def _create_buckets(self): 217 | buckets = [[] for _ in range(len(self.boundaries) - 1)] 218 | for i in range(len(self.lengths)): 219 | length = self.lengths[i] 220 | idx_bucket = self._bisect(length) 221 | if idx_bucket != -1: 222 | buckets[idx_bucket].append(i) 223 | 224 | for i in range(len(buckets) - 1, 0, -1): 225 | if len(buckets[i]) == 0: 226 | buckets.pop(i) 227 | self.boundaries.pop(i + 1) 228 | 229 | num_samples_per_bucket = [] 230 | for i in range(len(buckets)): 231 | len_bucket = len(buckets[i]) 232 | total_batch_size = self.num_replicas * self.batch_size 233 | rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size 234 | num_samples_per_bucket.append(len_bucket + rem) 235 | return buckets, num_samples_per_bucket 236 | 237 | def __iter__(self): 238 | # deterministically shuffle based on epoch 239 | g = torch.Generator() 240 | g.manual_seed(self.epoch) 241 | 242 | indices = [] 243 | if self.shuffle: 244 | for bucket in self.buckets: 245 | indices.append(torch.randperm(len(bucket), generator=g).tolist()) 246 | else: 247 | for bucket in self.buckets: 248 | indices.append(list(range(len(bucket)))) 249 | 250 | batches = [] 251 | for i in range(len(self.buckets)): 252 | bucket = self.buckets[i] 253 | len_bucket = len(bucket) 254 | ids_bucket = indices[i] 255 | num_samples_bucket = self.num_samples_per_bucket[i] 256 | 257 | # add extra samples to make it evenly divisible 258 | rem = num_samples_bucket - len_bucket 259 | ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)] 260 | 261 | # subsample 262 | ids_bucket = ids_bucket[self.rank::self.num_replicas] 263 | 264 | # batching 265 | for j in range(len(ids_bucket) // self.batch_size): 266 | batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]] 267 | batches.append(batch) 268 | 269 | if self.shuffle: 270 | batch_ids = torch.randperm(len(batches), generator=g).tolist() 271 | batches = [batches[i] for i in batch_ids] 272 | self.batches = batches 273 | 274 | assert len(self.batches) * self.batch_size == self.num_samples 275 | return iter(self.batches) 276 | 277 | def _bisect(self, x, lo=0, hi=None): 278 | if hi is None: 279 | hi = len(self.boundaries) - 1 280 | 281 | if hi > lo: 282 | mid = (hi + lo) // 2 283 | if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]: 284 | return mid 285 | elif x <= self.boundaries[mid]: 286 | return self._bisect(x, lo, mid) 287 | else: 288 | return self._bisect(x, mid + 1, hi) 289 | else: 290 | return -1 291 | 292 | def __len__(self): 293 | return self.num_samples // self.batch_size 294 | -------------------------------------------------------------------------------- /extract_f0_mel.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from torch.utils.data import DataLoader 4 | 5 | import utils 6 | from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate 7 | from tqdm import tqdm 8 | import logging 9 | 
logging.getLogger('numba').setLevel(logging.INFO) 10 | config_path = 'configs/config.json' 11 | hps = utils.get_hparams_from_file(config_path) 12 | collate = TextAudioSpeakerCollate() 13 | train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data) 14 | eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data) 15 | 16 | for _ in tqdm(train_dataset): 17 | pass 18 | for _ in tqdm(eval_dataset): 19 | pass 20 | 21 | # train_loader = DataLoader(train_dataset, num_workers=0, shuffle=False, 22 | # batch_size=2, pin_memory=True, 23 | # drop_last=True, collate_fn=collate) 24 | # 25 | # for _ in tqdm(train_loader): 26 | # pass -------------------------------------------------------------------------------- /extract_vec.py: -------------------------------------------------------------------------------- 1 | import math 2 | import multiprocessing 3 | import os 4 | import argparse 5 | from pathlib import Path 6 | from random import shuffle 7 | 8 | import torch 9 | from glob import glob 10 | from tqdm import tqdm 11 | 12 | from feature_extractor import contentvec768 13 | import utils 14 | import logging 15 | 16 | logging.getLogger("numba").setLevel(logging.WARNING) 17 | import librosa 18 | 19 | 20 | def process_one(file_path, model): 21 | path = Path(file_path) 22 | 23 | ssl_path = file_path.replace(".wav", ".ssl.pt") 24 | # try: 25 | # torch.load(ssl_path) 26 | # except: 27 | if not os.path.exists(ssl_path): 28 | print(111) 29 | print(ssl_path) 30 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 31 | wav16k, sr = librosa.load(path, sr=16000) 32 | wav16k = torch.from_numpy(wav16k).to(device) 33 | ssl_content = contentvec768.get_content(model, wav_16k_tensor=wav16k) 34 | torch.save(ssl_content.cpu(), ssl_path) 35 | if not os.path.exists(ssl_path): 36 | print("errrrrrrrrrrrrrrrrr"*1000) 37 | # exit(0) 38 | 39 | 40 | def process_batch(filenames): 41 | print("Loading hubert for content...") 42 | device = "cuda" if torch.cuda.is_available() else "cpu" 43 | ssl_model = contentvec768.get_model().to(device) 44 | print("Loaded hubert.") 45 | for filename in tqdm(filenames): 46 | process_one(filename, ssl_model) 47 | 48 | 49 | if __name__ == "__main__": 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument( 52 | "--in_dir", type=str, default="dataset", help="path to input dir" 53 | ) 54 | 55 | args = parser.parse_args() 56 | filenames = glob(f"{args.in_dir}/*/*.wav", recursive=True) # [:10] 57 | print(len(filenames)) 58 | shuffle(filenames) 59 | multiprocessing.set_start_method("spawn", force=True) 60 | 61 | num_processes = 1 62 | chunk_size = int(math.ceil(len(filenames) / num_processes)) 63 | chunks = [ 64 | filenames[i : i + chunk_size] for i in range(0, len(filenames), chunk_size) 65 | ] 66 | print([len(c) for c in chunks]) 67 | processes = [ 68 | multiprocessing.Process(target=process_batch, args=(chunk,)) for chunk in chunks 69 | ] 70 | for p in processes: 71 | p.start() 72 | -------------------------------------------------------------------------------- /f0_extractor/F0Predictor.py: -------------------------------------------------------------------------------- 1 | class F0Predictor(object): 2 | def compute_f0(self,wav,p_len): 3 | ''' 4 | input: wav:[signal_length] 5 | p_len:int 6 | output: f0:[signal_length//hop_length] 7 | ''' 8 | pass 9 | 10 | def compute_f0_uv(self,wav,p_len): 11 | ''' 12 | input: wav:[signal_length] 13 | p_len:int 14 | output: f0:[signal_length//hop_length],uv:[signal_length//hop_length] 15 | ''' 16 | pass 
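F0Predictor only fixes the two method signatures above; data_utils.get_pitch just needs a per-frame f0 curve (plus an optional voiced/unvoiced mask) of length p_len. For comparison, a minimal sketch of a conforming predictor built on librosa.pyin (the class name and defaults here are hypothetical, not part of this repo) could look like this:

import librosa
import numpy as np

from f0_extractor.F0Predictor import F0Predictor


class PyinF0Predictor(F0Predictor):
    """Illustrative sketch only (not part of this repo): pYIN-based drop-in for the interface above."""

    def __init__(self, hop_length=512, sampling_rate=44100, f0_min=50, f0_max=1100):
        self.hop_length = hop_length
        self.sampling_rate = sampling_rate
        self.f0_min = f0_min
        self.f0_max = f0_max

    def compute_f0_uv(self, wav, p_len):
        # wav: 1-D float waveform; p_len: number of mel frames the f0 curve must match
        wav = np.asarray(wav, dtype=np.float32)
        f0, _, _ = librosa.pyin(
            wav, fmin=self.f0_min, fmax=self.f0_max,
            sr=self.sampling_rate, hop_length=self.hop_length,
        )
        f0 = np.nan_to_num(f0)[:p_len]                 # unvoiced frames become 0
        f0 = np.pad(f0, (0, max(0, p_len - len(f0))))  # pad/trim to exactly p_len frames
        uv = (f0 > 0).astype(np.float32)               # simple voiced/unvoiced mask
        return f0, uv

    def compute_f0(self, wav, p_len):
        return self.compute_f0_uv(wav, p_len)[0]

The FCPE-based predictor that the repo actually ships follows.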
-------------------------------------------------------------------------------- /f0_extractor/FCPEF0Predictor.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | 7 | from f0_extractor.F0Predictor import F0Predictor 8 | 9 | from .fcpe.model import FCPEInfer 10 | 11 | 12 | class FCPEF0Predictor(F0Predictor): 13 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, dtype=torch.float32, device=None, sampling_rate=44100, 14 | threshold=0.05): 15 | self.fcpe = FCPEInfer(model_path="pretrain/fcpe/fcpe.pt", device=device, dtype=dtype) 16 | self.hop_length = hop_length 17 | self.f0_min = f0_min 18 | self.f0_max = f0_max 19 | if device is None: 20 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 21 | else: 22 | self.device = device 23 | self.threshold = threshold 24 | self.sampling_rate = sampling_rate 25 | self.dtype = dtype 26 | self.name = "fcpe" 27 | 28 | def repeat_expand( 29 | self, content: Union[torch.Tensor, np.ndarray], target_len: int, mode: str = "nearest" 30 | ): 31 | ndim = content.ndim 32 | 33 | if content.ndim == 1: 34 | content = content[None, None] 35 | elif content.ndim == 2: 36 | content = content[None] 37 | 38 | assert content.ndim == 3 39 | 40 | is_np = isinstance(content, np.ndarray) 41 | if is_np: 42 | content = torch.from_numpy(content) 43 | 44 | results = torch.nn.functional.interpolate(content, size=target_len, mode=mode) 45 | 46 | if is_np: 47 | results = results.numpy() 48 | 49 | if ndim == 1: 50 | return results[0, 0] 51 | elif ndim == 2: 52 | return results[0] 53 | 54 | def post_process(self, x, sampling_rate, f0, pad_to): 55 | if isinstance(f0, np.ndarray): 56 | f0 = torch.from_numpy(f0).float().to(x.device) 57 | 58 | if pad_to is None: 59 | return f0 60 | 61 | f0 = self.repeat_expand(f0, pad_to) 62 | 63 | vuv_vector = torch.zeros_like(f0) 64 | vuv_vector[f0 > 0.0] = 1.0 65 | vuv_vector[f0 <= 0.0] = 0.0 66 | 67 | # drop the zero-f0 (unvoiced) frames, then linearly interpolate over them 68 | nzindex = torch.nonzero(f0).squeeze() 69 | f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() 70 | time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy() 71 | time_frame = np.arange(pad_to) * self.hop_length / sampling_rate 72 | 73 | vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0] 74 | 75 | if f0.shape[0] <= 0: 76 | return torch.zeros(pad_to, dtype=torch.float, device=x.device).cpu().numpy(), vuv_vector.cpu().numpy() 77 | if f0.shape[0] == 1: 78 | return (torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[ 79 | 0]).cpu().numpy(), vuv_vector.cpu().numpy() 80 | 81 | # this could probably be rewritten in torch? 
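# np.interp linearly interpolates f0 across the unvoiced (zero-f0) frames using the surrounding voiced
# frames, holding the first/last voiced value at the edges.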
82 | f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1]) 83 | # vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,pad_to/len(vuv_vector),order = 0)) 84 | 85 | return f0, vuv_vector.cpu().numpy() 86 | 87 | def compute_f0(self, wav, p_len=None): 88 | x = torch.FloatTensor(wav).to(self.dtype).to(self.device) 89 | if p_len is None: 90 | p_len = x.shape[0] // self.hop_length 91 | else: 92 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 93 | f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0,:,0] 94 | if torch.all(f0 == 0): 95 | rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) 96 | return rtn, rtn 97 | return self.post_process(x, self.sampling_rate, f0, p_len)[0] 98 | 99 | def compute_f0_uv(self, wav, p_len=None): 100 | x = torch.FloatTensor(wav).to(self.dtype).to(self.device) 101 | if p_len is None: 102 | p_len = x.shape[0] // self.hop_length 103 | else: 104 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 105 | f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0,:,0] 106 | if torch.all(f0 == 0): 107 | rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) 108 | return rtn, rtn 109 | return self.post_process(x, self.sampling_rate, f0, p_len) 110 | 111 | def get_activation(self, wav, down_sample=1): 112 | x = torch.FloatTensor(wav).to(self.dtype).to(self.device) 113 | activation, mel = self.fcpe.get_activation(x, sr=self.sampling_rate, threshold=self.threshold) 114 | activation = activation[0].T 115 | frame_length = activation.shape[-1] 116 | activation = torch.mean(activation.view(1, 360//down_sample, down_sample, frame_length), dim=2) 117 | 118 | return activation, mel 119 | -------------------------------------------------------------------------------- /f0_extractor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/innnky/glow-svc/15d280a2b87a5938da3cb68e85273102a42b0d75/f0_extractor/__init__.py -------------------------------------------------------------------------------- /f0_extractor/fcpe/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import FCPEInfer # noqa: F401 2 | from .nvSTFT import STFT # noqa: F401 3 | from .pcmer import PCmer # noqa: F401 4 | -------------------------------------------------------------------------------- /f0_extractor/fcpe/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.nn.utils import weight_norm 6 | from torchaudio.transforms import Resample 7 | 8 | from .nvSTFT import STFT 9 | from .pcmer import PCmer 10 | 11 | 12 | def l2_regularization(model, l2_alpha): 13 | l2_loss = [] 14 | for module in model.modules(): 15 | if type(module) is nn.Conv2d: 16 | l2_loss.append((module.weight ** 2).sum() / 2.0) 17 | return l2_alpha * sum(l2_loss) 18 | 19 | 20 | class FCPE(nn.Module): 21 | def __init__( 22 | self, 23 | input_channel=128, 24 | out_dims=360, 25 | n_layers=12, 26 | n_chans=512, 27 | use_siren=False, 28 | use_full=False, 29 | loss_mse_scale=10, 30 | loss_l2_regularization=False, 31 | loss_l2_regularization_scale=1, 32 | loss_grad1_mse=False, 33 | loss_grad1_mse_scale=1, 34 | f0_max=1975.5, 35 | f0_min=32.70, 36 | confidence=False, 37 | threshold=0.05, 38 | use_input_conv=True 39 | ): 40 | super().__init__() 41 | if use_siren is True: 42 | raise 
ValueError("Siren is not supported yet.") 43 | if use_full is True: 44 | raise ValueError("Full model is not supported yet.") 45 | 46 | self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10 47 | self.loss_l2_regularization = loss_l2_regularization if (loss_l2_regularization is not None) else False 48 | self.loss_l2_regularization_scale = loss_l2_regularization_scale if (loss_l2_regularization_scale 49 | is not None) else 1 50 | self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False 51 | self.loss_grad1_mse_scale = loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1 52 | self.f0_max = f0_max if (f0_max is not None) else 1975.5 53 | self.f0_min = f0_min if (f0_min is not None) else 32.70 54 | self.confidence = confidence if (confidence is not None) else False 55 | self.threshold = threshold if (threshold is not None) else 0.05 56 | self.use_input_conv = use_input_conv if (use_input_conv is not None) else True 57 | 58 | self.cent_table_b = torch.Tensor( 59 | np.linspace(self.f0_to_cent(torch.Tensor([f0_min]))[0], self.f0_to_cent(torch.Tensor([f0_max]))[0], 60 | out_dims)) 61 | self.register_buffer("cent_table", self.cent_table_b) 62 | 63 | # conv in stack 64 | _leaky = nn.LeakyReLU() 65 | self.stack = nn.Sequential( 66 | nn.Conv1d(input_channel, n_chans, 3, 1, 1), 67 | nn.GroupNorm(4, n_chans), 68 | _leaky, 69 | nn.Conv1d(n_chans, n_chans, 3, 1, 1)) 70 | 71 | # transformer 72 | self.decoder = PCmer( 73 | num_layers=n_layers, 74 | num_heads=8, 75 | dim_model=n_chans, 76 | dim_keys=n_chans, 77 | dim_values=n_chans, 78 | residual_dropout=0.1, 79 | attention_dropout=0.1) 80 | self.norm = nn.LayerNorm(n_chans) 81 | 82 | # out 83 | self.n_out = out_dims 84 | self.dense_out = weight_norm( 85 | nn.Linear(n_chans, self.n_out)) 86 | 87 | def forward(self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder = "local_argmax"): 88 | """ 89 | input: 90 | B x n_frames x n_unit 91 | return: 92 | dict of B x n_frames x feat 93 | """ 94 | if cdecoder == "argmax": 95 | self.cdecoder = self.cents_decoder 96 | elif cdecoder == "local_argmax": 97 | self.cdecoder = self.cents_local_decoder 98 | if self.use_input_conv: 99 | x = self.stack(mel.transpose(1, 2)).transpose(1, 2) 100 | else: 101 | x = mel 102 | x = self.decoder(x) 103 | x = self.norm(x) 104 | x = self.dense_out(x) # [B,N,D] 105 | x = torch.sigmoid(x) 106 | if not infer: 107 | gt_cent_f0 = self.f0_to_cent(gt_f0) # mel f0 #[B,N,1] 108 | gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0) # #[B,N,out_dim] 109 | loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, gt_cent_f0) # bce loss 110 | # l2 regularization 111 | if self.loss_l2_regularization: 112 | loss_all = loss_all + l2_regularization(model=self, l2_alpha=self.loss_l2_regularization_scale) 113 | x = loss_all 114 | if infer: 115 | x = self.cdecoder(x) 116 | x = self.cent_to_f0(x) 117 | if not return_hz_f0: 118 | x = (1 + x / 700).log() 119 | return x 120 | 121 | def cents_decoder(self, y, mask=True): 122 | B, N, _ = y.size() 123 | ci = self.cent_table[None, None, :].expand(B, N, -1) 124 | rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(y, dim=-1, keepdim=True) # cents: [B,N,1] 125 | if mask: 126 | confident = torch.max(y, dim=-1, keepdim=True)[0] 127 | confident_mask = torch.ones_like(confident) 128 | confident_mask[confident <= self.threshold] = float("-INF") 129 | rtn = rtn * confident_mask 130 | if self.confidence: 131 | return rtn, confident 132 | else: 133 | return rtn 134 | 135 | def cents_local_decoder(self, 
y, mask=True): 136 | B, N, _ = y.size() 137 | ci = self.cent_table[None, None, :].expand(B, N, -1) 138 | confident, max_index = torch.max(y, dim=-1, keepdim=True) 139 | local_argmax_index = torch.arange(0,9).to(max_index.device) + (max_index - 4) 140 | local_argmax_index[local_argmax_index<0] = 0 141 | local_argmax_index[local_argmax_index>=self.n_out] = self.n_out - 1 142 | ci_l = torch.gather(ci,-1,local_argmax_index) 143 | y_l = torch.gather(y,-1,local_argmax_index) 144 | rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum(y_l, dim=-1, keepdim=True) # cents: [B,N,1] 145 | if mask: 146 | confident_mask = torch.ones_like(confident) 147 | confident_mask[confident <= self.threshold] = float("-INF") 148 | rtn = rtn * confident_mask 149 | if self.confidence: 150 | return rtn, confident 151 | else: 152 | return rtn 153 | 154 | def cent_to_f0(self, cent): 155 | return 10. * 2 ** (cent / 1200.) 156 | 157 | def f0_to_cent(self, f0): 158 | return 1200. * torch.log2(f0 / 10.) 159 | 160 | def gaussian_blurred_cent(self, cents): # cents: [B,N,1] 161 | mask = (cents > 0.1) & (cents < (1200. * np.log2(self.f0_max / 10.))) 162 | B, N, _ = cents.size() 163 | ci = self.cent_table[None, None, :].expand(B, N, -1) 164 | return torch.exp(-torch.square(ci - cents) / 1250) * mask.float() 165 | 166 | def get_activation(self, mel): 167 | """ 168 | input: 169 | B x n_frames x n_unit 170 | return: 171 | dict of B x n_frames x feat 172 | """ 173 | if self.use_input_conv: 174 | x = self.stack(mel.transpose(1, 2)).transpose(1, 2) 175 | else: 176 | x = mel 177 | x = self.decoder(x) 178 | x = self.norm(x) 179 | x = self.dense_out(x) # [B,N,D] 180 | x = torch.sigmoid(x) 181 | return x 182 | 183 | class FCPEInfer: 184 | def __init__(self, model_path, device=None, dtype=torch.float32): 185 | if device is None: 186 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 187 | self.device = device 188 | ckpt = torch.load(model_path, map_location=torch.device(self.device)) 189 | self.args = DotDict(ckpt["config"]) 190 | self.dtype = dtype 191 | model = FCPE( 192 | input_channel=self.args.model.input_channel, 193 | out_dims=self.args.model.out_dims, 194 | n_layers=self.args.model.n_layers, 195 | n_chans=self.args.model.n_chans, 196 | use_siren=self.args.model.use_siren, 197 | use_full=self.args.model.use_full, 198 | loss_mse_scale=self.args.loss.loss_mse_scale, 199 | loss_l2_regularization=self.args.loss.loss_l2_regularization, 200 | loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale, 201 | loss_grad1_mse=self.args.loss.loss_grad1_mse, 202 | loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale, 203 | f0_max=self.args.model.f0_max, 204 | f0_min=self.args.model.f0_min, 205 | confidence=self.args.model.confidence, 206 | ) 207 | model.to(self.device).to(self.dtype) 208 | model.load_state_dict(ckpt['model']) 209 | model.eval() 210 | self.model = model 211 | self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device) 212 | 213 | @torch.no_grad() 214 | def __call__(self, audio, sr, threshold=0.05): 215 | self.model.threshold = threshold 216 | audio = audio[None,:] 217 | mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype) 218 | f0 = self.model(mel=mel, infer=True, return_hz_f0=True) 219 | return f0 220 | 221 | @torch.no_grad() 222 | def get_activation(self, audio, sr, threshold=0.05): 223 | self.model.threshold = threshold 224 | audio = audio[None, :] 225 | mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype) 226 | activation = self.model.get_activation(mel=mel) 
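# activation: [B, n_frames, out_dims] sigmoid scores over the cent bins; mel: [B, n_frames, n_mels]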
227 | return activation, mel 228 | 229 | class Wav2Mel: 230 | 231 | def __init__(self, args, device=None, dtype=torch.float32): 232 | # self.args = args 233 | self.sampling_rate = args.mel.sampling_rate 234 | self.hop_size = args.mel.hop_size 235 | if device is None: 236 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 237 | self.device = device 238 | self.dtype = dtype 239 | self.stft = STFT( 240 | args.mel.sampling_rate, 241 | args.mel.num_mels, 242 | args.mel.n_fft, 243 | args.mel.win_size, 244 | args.mel.hop_size, 245 | args.mel.fmin, 246 | args.mel.fmax 247 | ) 248 | self.resample_kernel = {} 249 | 250 | def extract_nvstft(self, audio, keyshift=0, train=False): 251 | mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2) # B, n_frames, bins 252 | return mel 253 | 254 | def extract_mel(self, audio, sample_rate, keyshift=0, train=False): 255 | audio = audio.to(self.dtype).to(self.device) 256 | # resample 257 | if sample_rate == self.sampling_rate: 258 | audio_res = audio 259 | else: 260 | key_str = str(sample_rate) 261 | if key_str not in self.resample_kernel: 262 | self.resample_kernel[key_str] = Resample(sample_rate, self.sampling_rate, lowpass_filter_width=128) 263 | self.resample_kernel[key_str] = self.resample_kernel[key_str].to(self.dtype).to(self.device) 264 | audio_res = self.resample_kernel[key_str](audio) 265 | 266 | # extract 267 | mel = self.extract_nvstft(audio_res, keyshift=keyshift, train=train) # B, n_frames, bins 268 | n_frames = int(audio.shape[1] // self.hop_size) + 1 269 | if n_frames > int(mel.shape[1]): 270 | mel = torch.cat((mel, mel[:, -1:, :]), 1) 271 | if n_frames < int(mel.shape[1]): 272 | mel = mel[:, :n_frames, :] 273 | return mel 274 | 275 | def __call__(self, audio, sample_rate, keyshift=0, train=False): 276 | return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train) 277 | 278 | 279 | class DotDict(dict): 280 | def __getattr__(*args): 281 | val = dict.get(*args) 282 | return DotDict(val) if type(val) is dict else val 283 | 284 | __setattr__ = dict.__setitem__ 285 | __delattr__ = dict.__delitem__ 286 | -------------------------------------------------------------------------------- /f0_extractor/fcpe/nvSTFT.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import librosa 4 | import numpy as np 5 | import soundfile as sf 6 | import torch 7 | import torch.nn.functional as F 8 | import torch.utils.data 9 | from librosa.filters import mel as librosa_mel_fn 10 | 11 | os.environ["LRU_CACHE_CAPACITY"] = "3" 12 | 13 | def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False): 14 | sampling_rate = None 15 | try: 16 | data, sampling_rate = sf.read(full_path, always_2d=True)# than soundfile. 
17 | except Exception as ex: 18 | print(f"'{full_path}' failed to load.\nException:") 19 | print(ex) 20 | if return_empty_on_exception: 21 | return [], sampling_rate or target_sr or 48000 22 | else: 23 | raise Exception(ex) 24 | 25 | if len(data.shape) > 1: 26 | data = data[:, 0] 27 | assert len(data) > 2# check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension) 28 | 29 | if np.issubdtype(data.dtype, np.integer): # if audio data is type int 30 | max_mag = -np.iinfo(data.dtype).min # maximum magnitude = min possible value of intXX 31 | else: # if audio data is type fp32 32 | max_mag = max(np.amax(data), -np.amin(data)) 33 | max_mag = (2**31)+1 if max_mag > (2**15) else ((2**15)+1 if max_mag > 1.01 else 1.0) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32 34 | 35 | data = torch.FloatTensor(data.astype(np.float32))/max_mag 36 | 37 | if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:# resample will crash with inf/NaN inputs. return_empty_on_exception will return empty arr instead of except 38 | return [], sampling_rate or target_sr or 48000 39 | if target_sr is not None and sampling_rate != target_sr: 40 | data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr)) 41 | sampling_rate = target_sr 42 | 43 | return data, sampling_rate 44 | 45 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 46 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 47 | 48 | def dynamic_range_decompression(x, C=1): 49 | return np.exp(x) / C 50 | 51 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 52 | return torch.log(torch.clamp(x, min=clip_val) * C) 53 | 54 | def dynamic_range_decompression_torch(x, C=1): 55 | return torch.exp(x) / C 56 | 57 | class STFT(): 58 | def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5): 59 | self.target_sr = sr 60 | 61 | self.n_mels = n_mels 62 | self.n_fft = n_fft 63 | self.win_size = win_size 64 | self.hop_length = hop_length 65 | self.fmin = fmin 66 | self.fmax = fmax 67 | self.clip_val = clip_val 68 | self.mel_basis = {} 69 | self.hann_window = {} 70 | 71 | def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): 72 | sampling_rate = self.target_sr 73 | n_mels = self.n_mels 74 | n_fft = self.n_fft 75 | win_size = self.win_size 76 | hop_length = self.hop_length 77 | fmin = self.fmin 78 | fmax = self.fmax 79 | clip_val = self.clip_val 80 | 81 | factor = 2 ** (keyshift / 12) 82 | n_fft_new = int(np.round(n_fft * factor)) 83 | win_size_new = int(np.round(win_size * factor)) 84 | hop_length_new = int(np.round(hop_length * speed)) 85 | if not train: 86 | mel_basis = self.mel_basis 87 | hann_window = self.hann_window 88 | else: 89 | mel_basis = {} 90 | hann_window = {} 91 | 92 | if torch.min(y) < -1.: 93 | print('min value is ', torch.min(y)) 94 | if torch.max(y) > 1.: 95 | print('max value is ', torch.max(y)) 96 | 97 | mel_basis_key = str(fmax)+'_'+str(y.device) 98 | if mel_basis_key not in mel_basis: 99 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) 100 | mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device) 101 | 102 | keyshift_key = str(keyshift)+'_'+str(y.device) 103 | if keyshift_key not in hann_window: 104 | hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device) 105 | 106 | pad_left = (win_size_new - hop_length_new) //2 107 | pad_right = max((win_size_new- 
hop_length_new + 1) //2, win_size_new - y.size(-1) - pad_left) 108 | if pad_right < y.size(-1): 109 | mode = 'reflect' 110 | else: 111 | mode = 'constant' 112 | y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode = mode) 113 | y = y.squeeze(1) 114 | 115 | spec = torch.stft(y, n_fft_new, hop_length=hop_length_new, win_length=win_size_new, window=hann_window[keyshift_key], 116 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) 117 | spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9)) 118 | if keyshift != 0: 119 | size = n_fft // 2 + 1 120 | resize = spec.size(1) 121 | if resize < size: 122 | spec = F.pad(spec, (0, 0, 0, size-resize)) 123 | spec = spec[:, :size, :] * win_size / win_size_new 124 | spec = torch.matmul(mel_basis[mel_basis_key], spec) 125 | spec = dynamic_range_compression_torch(spec, clip_val=clip_val) 126 | return spec 127 | 128 | def __call__(self, audiopath): 129 | audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr) 130 | spect = self.get_mel(audio.unsqueeze(0)).squeeze(0) 131 | return spect 132 | 133 | stft = STFT() 134 | -------------------------------------------------------------------------------- /f0_extractor/fcpe/pcmer.py: -------------------------------------------------------------------------------- 1 | import math 2 | from functools import partial 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | from einops import rearrange, repeat 7 | from local_attention import LocalAttention 8 | from torch import nn 9 | 10 | #import fast_transformers.causal_product.causal_product_cuda 11 | 12 | def softmax_kernel(data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device = None): 13 | b, h, *_ = data.shape 14 | # (batch size, head, length, model_dim) 15 | 16 | # normalize model dim 17 | data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1. 
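    # (descriptive note, not in the original file) This function implements the
    # positive random-feature map from Performer / FAVOR+ (Choromanski et al.):
    #   phi(x) = exp(W x / d**0.25 - ||x||**2 / (2 * sqrt(d))) / sqrt(m)
    # where W is the (m x d) Gaussian projection matrix. The `ratio` computed
    # below is the 1/sqrt(m) normalizer, and the max-subtraction applied to
    # queries only improves numerical stability of the exponential.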
18 | 19 | # what is ration?, projection_matrix.shape[0] --> 266 20 | 21 | ratio = (projection_matrix.shape[0] ** -0.5) 22 | 23 | projection = repeat(projection_matrix, 'j d -> b h j d', b = b, h = h) 24 | projection = projection.type_as(data) 25 | 26 | #data_dash = w^T x 27 | data_dash = torch.einsum('...id,...jd->...ij', (data_normalizer * data), projection) 28 | 29 | 30 | # diag_data = D**2 31 | diag_data = data ** 2 32 | diag_data = torch.sum(diag_data, dim=-1) 33 | diag_data = (diag_data / 2.0) * (data_normalizer ** 2) 34 | diag_data = diag_data.unsqueeze(dim=-1) 35 | 36 | #print () 37 | if is_query: 38 | data_dash = ratio * ( 39 | torch.exp(data_dash - diag_data - 40 | torch.max(data_dash, dim=-1, keepdim=True).values) + eps) 41 | else: 42 | data_dash = ratio * ( 43 | torch.exp(data_dash - diag_data + eps))#- torch.max(data_dash)) + eps) 44 | 45 | return data_dash.type_as(data) 46 | 47 | def orthogonal_matrix_chunk(cols, qr_uniform_q = False, device = None): 48 | unstructured_block = torch.randn((cols, cols), device = device) 49 | q, r = torch.linalg.qr(unstructured_block.cpu(), mode='reduced') 50 | q, r = map(lambda t: t.to(device), (q, r)) 51 | 52 | # proposed by @Parskatt 53 | # to make sure Q is uniform https://arxiv.org/pdf/math-ph/0609050.pdf 54 | if qr_uniform_q: 55 | d = torch.diag(r, 0) 56 | q *= d.sign() 57 | return q.t() 58 | def exists(val): 59 | return val is not None 60 | 61 | def empty(tensor): 62 | return tensor.numel() == 0 63 | 64 | def default(val, d): 65 | return val if exists(val) else d 66 | 67 | def cast_tuple(val): 68 | return (val,) if not isinstance(val, tuple) else val 69 | 70 | class PCmer(nn.Module): 71 | """The encoder that is used in the Transformer model.""" 72 | 73 | def __init__(self, 74 | num_layers, 75 | num_heads, 76 | dim_model, 77 | dim_keys, 78 | dim_values, 79 | residual_dropout, 80 | attention_dropout): 81 | super().__init__() 82 | self.num_layers = num_layers 83 | self.num_heads = num_heads 84 | self.dim_model = dim_model 85 | self.dim_values = dim_values 86 | self.dim_keys = dim_keys 87 | self.residual_dropout = residual_dropout 88 | self.attention_dropout = attention_dropout 89 | 90 | self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)]) 91 | 92 | # METHODS ######################################################################################################## 93 | 94 | def forward(self, phone, mask=None): 95 | 96 | # apply all layers to the input 97 | for (i, layer) in enumerate(self._layers): 98 | phone = layer(phone, mask) 99 | # provide the final sequence 100 | return phone 101 | 102 | 103 | # ==================================================================================================================== # 104 | # CLASS _ E N C O D E R L A Y E R # 105 | # ==================================================================================================================== # 106 | 107 | 108 | class _EncoderLayer(nn.Module): 109 | """One layer of the encoder. 110 | 111 | Attributes: 112 | attn: (:class:`mha.MultiHeadAttention`): The attention mechanism that is used to read the input sequence. 113 | feed_forward (:class:`ffl.FeedForwardLayer`): The feed-forward layer on top of the attention mechanism. 114 | """ 115 | 116 | def __init__(self, parent: PCmer): 117 | """Creates a new instance of ``_EncoderLayer``. 118 | 119 | Args: 120 | parent (Encoder): The encoder that the layers is created for. 
121 | """ 122 | super().__init__() 123 | 124 | 125 | self.conformer = ConformerConvModule(parent.dim_model) 126 | self.norm = nn.LayerNorm(parent.dim_model) 127 | self.dropout = nn.Dropout(parent.residual_dropout) 128 | 129 | # selfatt -> fastatt: performer! 130 | self.attn = SelfAttention(dim = parent.dim_model, 131 | heads = parent.num_heads, 132 | causal = False) 133 | 134 | # METHODS ######################################################################################################## 135 | 136 | def forward(self, phone, mask=None): 137 | 138 | # compute attention sub-layer 139 | phone = phone + (self.attn(self.norm(phone), mask=mask)) 140 | 141 | phone = phone + (self.conformer(phone)) 142 | 143 | return phone 144 | 145 | def calc_same_padding(kernel_size): 146 | pad = kernel_size // 2 147 | return (pad, pad - (kernel_size + 1) % 2) 148 | 149 | # helper classes 150 | 151 | class Swish(nn.Module): 152 | def forward(self, x): 153 | return x * x.sigmoid() 154 | 155 | class Transpose(nn.Module): 156 | def __init__(self, dims): 157 | super().__init__() 158 | assert len(dims) == 2, 'dims must be a tuple of two dimensions' 159 | self.dims = dims 160 | 161 | def forward(self, x): 162 | return x.transpose(*self.dims) 163 | 164 | class GLU(nn.Module): 165 | def __init__(self, dim): 166 | super().__init__() 167 | self.dim = dim 168 | 169 | def forward(self, x): 170 | out, gate = x.chunk(2, dim=self.dim) 171 | return out * gate.sigmoid() 172 | 173 | class DepthWiseConv1d(nn.Module): 174 | def __init__(self, chan_in, chan_out, kernel_size, padding): 175 | super().__init__() 176 | self.padding = padding 177 | self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups = chan_in) 178 | 179 | def forward(self, x): 180 | x = F.pad(x, self.padding) 181 | return self.conv(x) 182 | 183 | class ConformerConvModule(nn.Module): 184 | def __init__( 185 | self, 186 | dim, 187 | causal = False, 188 | expansion_factor = 2, 189 | kernel_size = 31, 190 | dropout = 0.): 191 | super().__init__() 192 | 193 | inner_dim = dim * expansion_factor 194 | padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0) 195 | 196 | self.net = nn.Sequential( 197 | nn.LayerNorm(dim), 198 | Transpose((1, 2)), 199 | nn.Conv1d(dim, inner_dim * 2, 1), 200 | GLU(dim=1), 201 | DepthWiseConv1d(inner_dim, inner_dim, kernel_size = kernel_size, padding = padding), 202 | #nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(), 203 | Swish(), 204 | nn.Conv1d(inner_dim, dim, 1), 205 | Transpose((1, 2)), 206 | nn.Dropout(dropout) 207 | ) 208 | 209 | def forward(self, x): 210 | return self.net(x) 211 | 212 | def linear_attention(q, k, v): 213 | if v is None: 214 | #print (k.size(), q.size()) 215 | out = torch.einsum('...ed,...nd->...ne', k, q) 216 | return out 217 | 218 | else: 219 | k_cumsum = k.sum(dim = -2) 220 | #k_cumsum = k.sum(dim = -2) 221 | D_inv = 1. 
/ (torch.einsum('...nd,...d->...n', q, k_cumsum.type_as(q)) + 1e-8) 222 | 223 | context = torch.einsum('...nd,...ne->...de', k, v) 224 | #print ("TRUEEE: ", context.size(), q.size(), D_inv.size()) 225 | out = torch.einsum('...de,...nd,...n->...ne', context, q, D_inv) 226 | return out 227 | 228 | def gaussian_orthogonal_random_matrix(nb_rows, nb_columns, scaling = 0, qr_uniform_q = False, device = None): 229 | nb_full_blocks = int(nb_rows / nb_columns) 230 | #print (nb_full_blocks) 231 | block_list = [] 232 | 233 | for _ in range(nb_full_blocks): 234 | q = orthogonal_matrix_chunk(nb_columns, qr_uniform_q = qr_uniform_q, device = device) 235 | block_list.append(q) 236 | # block_list[n] is a orthogonal matrix ... (model_dim * model_dim) 237 | #print (block_list[0].size(), torch.einsum('...nd,...nd->...n', block_list[0], torch.roll(block_list[0],1,1))) 238 | #print (nb_rows, nb_full_blocks, nb_columns) 239 | remaining_rows = nb_rows - nb_full_blocks * nb_columns 240 | #print (remaining_rows) 241 | if remaining_rows > 0: 242 | q = orthogonal_matrix_chunk(nb_columns, qr_uniform_q = qr_uniform_q, device = device) 243 | #print (q[:remaining_rows].size()) 244 | block_list.append(q[:remaining_rows]) 245 | 246 | final_matrix = torch.cat(block_list) 247 | 248 | if scaling == 0: 249 | multiplier = torch.randn((nb_rows, nb_columns), device = device).norm(dim = 1) 250 | elif scaling == 1: 251 | multiplier = math.sqrt((float(nb_columns))) * torch.ones((nb_rows,), device = device) 252 | else: 253 | raise ValueError(f'Invalid scaling {scaling}') 254 | 255 | return torch.diag(multiplier) @ final_matrix 256 | 257 | class FastAttention(nn.Module): 258 | def __init__(self, dim_heads, nb_features = None, ortho_scaling = 0, causal = False, generalized_attention = False, kernel_fn = nn.ReLU(), qr_uniform_q = False, no_projection = False): 259 | super().__init__() 260 | nb_features = default(nb_features, int(dim_heads * math.log(dim_heads))) 261 | 262 | self.dim_heads = dim_heads 263 | self.nb_features = nb_features 264 | self.ortho_scaling = ortho_scaling 265 | 266 | self.create_projection = partial(gaussian_orthogonal_random_matrix, nb_rows = self.nb_features, nb_columns = dim_heads, scaling = ortho_scaling, qr_uniform_q = qr_uniform_q) 267 | projection_matrix = self.create_projection() 268 | self.register_buffer('projection_matrix', projection_matrix) 269 | 270 | self.generalized_attention = generalized_attention 271 | self.kernel_fn = kernel_fn 272 | 273 | # if this is turned on, no projection will be used 274 | # queries and keys will be softmax-ed as in the original efficient attention paper 275 | self.no_projection = no_projection 276 | 277 | self.causal = causal 278 | 279 | @torch.no_grad() 280 | def redraw_projection_matrix(self): 281 | projections = self.create_projection() 282 | self.projection_matrix.copy_(projections) 283 | del projections 284 | 285 | def forward(self, q, k, v): 286 | device = q.device 287 | 288 | if self.no_projection: 289 | q = q.softmax(dim = -1) 290 | k = torch.exp(k) if self.causal else k.softmax(dim = -2) 291 | else: 292 | create_kernel = partial(softmax_kernel, projection_matrix = self.projection_matrix, device = device) 293 | 294 | q = create_kernel(q, is_query = True) 295 | k = create_kernel(k, is_query = False) 296 | 297 | attn_fn = linear_attention if not self.causal else self.causal_linear_fn 298 | if v is None: 299 | out = attn_fn(q, k, None) 300 | return out 301 | else: 302 | out = attn_fn(q, k, v) 303 | return out 304 | class SelfAttention(nn.Module): 305 | def 
__init__(self, dim, causal = False, heads = 8, dim_head = 64, local_heads = 0, local_window_size = 256, nb_features = None, feature_redraw_interval = 1000, generalized_attention = False, kernel_fn = nn.ReLU(), qr_uniform_q = False, dropout = 0., no_projection = False): 306 | super().__init__() 307 | assert dim % heads == 0, 'dimension must be divisible by number of heads' 308 | dim_head = default(dim_head, dim // heads) 309 | inner_dim = dim_head * heads 310 | self.fast_attention = FastAttention(dim_head, nb_features, causal = causal, generalized_attention = generalized_attention, kernel_fn = kernel_fn, qr_uniform_q = qr_uniform_q, no_projection = no_projection) 311 | 312 | self.heads = heads 313 | self.global_heads = heads - local_heads 314 | self.local_attn = LocalAttention(window_size = local_window_size, causal = causal, autopad = True, dropout = dropout, look_forward = int(not causal), rel_pos_emb_config = (dim_head, local_heads)) if local_heads > 0 else None 315 | 316 | #print (heads, nb_features, dim_head) 317 | #name_embedding = torch.zeros(110, heads, dim_head, dim_head) 318 | #self.name_embedding = nn.Parameter(name_embedding, requires_grad=True) 319 | 320 | 321 | self.to_q = nn.Linear(dim, inner_dim) 322 | self.to_k = nn.Linear(dim, inner_dim) 323 | self.to_v = nn.Linear(dim, inner_dim) 324 | self.to_out = nn.Linear(inner_dim, dim) 325 | self.dropout = nn.Dropout(dropout) 326 | 327 | @torch.no_grad() 328 | def redraw_projection_matrix(self): 329 | self.fast_attention.redraw_projection_matrix() 330 | #torch.nn.init.zeros_(self.name_embedding) 331 | #print (torch.sum(self.name_embedding)) 332 | def forward(self, x, context = None, mask = None, context_mask = None, name=None, inference=False, **kwargs): 333 | _, _, _, h, gh = *x.shape, self.heads, self.global_heads 334 | 335 | cross_attend = exists(context) 336 | 337 | context = default(context, x) 338 | context_mask = default(context_mask, mask) if not cross_attend else context_mask 339 | #print (torch.sum(self.name_embedding)) 340 | q, k, v = self.to_q(x), self.to_k(context), self.to_v(context) 341 | 342 | q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v)) 343 | (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v)) 344 | 345 | attn_outs = [] 346 | #print (name) 347 | #print (self.name_embedding[name].size()) 348 | if not empty(q): 349 | if exists(context_mask): 350 | global_mask = context_mask[:, None, :, None] 351 | v.masked_fill_(~global_mask, 0.) 
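            # (descriptive note, not in the original file) The cross-attention
            # branch below is a stub: when `context` is supplied it only runs
            # `pass` and produces no output. PCmer's _EncoderLayer always calls
            # this module with context=None, so only the self-attention path
            # (context = x) is exercised in this repo.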
352 | if cross_attend: 353 | pass 354 | #print (torch.sum(self.name_embedding)) 355 | #out = self.fast_attention(q,self.name_embedding[name],None) 356 | #print (torch.sum(self.name_embedding[...,-1:])) 357 | else: 358 | out = self.fast_attention(q, k, v) 359 | attn_outs.append(out) 360 | 361 | if not empty(lq): 362 | assert not cross_attend, 'local attention is not compatible with cross attention' 363 | out = self.local_attn(lq, lk, lv, input_mask = mask) 364 | attn_outs.append(out) 365 | 366 | out = torch.cat(attn_outs, dim = 1) 367 | out = rearrange(out, 'b h n d -> b n (h d)') 368 | out = self.to_out(out) 369 | return self.dropout(out) -------------------------------------------------------------------------------- /feature_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /feature_extractor/contentvec768.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import HubertModel 3 | 4 | from torch import nn 5 | import logging 6 | 7 | logging.getLogger("numba").setLevel(logging.WARNING) 8 | 9 | class HubertModelWithFinalProj(HubertModel): 10 | def __init__(self, config): 11 | super().__init__(config) 12 | 13 | # Remove this layer is necessary to achieve the desired outcome. 14 | self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size) 15 | 16 | def get_model(): 17 | model = HubertModelWithFinalProj.from_pretrained("./pretrain/content-vec-best") 18 | return model 19 | 20 | 21 | def get_content(hmodel, wav_16k_tensor): 22 | with torch.no_grad(): 23 | feats = hmodel(wav_16k_tensor.unsqueeze(0))["last_hidden_state"] 24 | return feats.transpose(1,2) 25 | 26 | -------------------------------------------------------------------------------- /filelists/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/innnky/glow-svc/15d280a2b87a5938da3cb68e85273102a42b0d75/filelists/.gitkeep -------------------------------------------------------------------------------- /hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | from hifigan.network.vocoders.nsf_hifigan import NsfHifiGAN 2 | # from https://github.com/openvpi/diffsinger 3 | -------------------------------------------------------------------------------- /hifigan/modules/hifigan/hifigan.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.nn as nn 4 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 5 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 6 | 7 | from hifigan.modules.parallel_wavegan.layers import UpsampleNetwork, ConvInUpsampleNetwork 8 | from hifigan.modules.parallel_wavegan.models.source import SourceModuleHnNSF 9 | import numpy as np 10 | 11 | LRELU_SLOPE = 0.1 12 | 13 | 14 | def init_weights(m, mean=0.0, std=0.01): 15 | classname = m.__class__.__name__ 16 | if classname.find("Conv") != -1: 17 | m.weight.data.normal_(mean, std) 18 | 19 | 20 | def apply_weight_norm(m): 21 | classname = m.__class__.__name__ 22 | if classname.find("Conv") != -1: 23 | weight_norm(m) 24 | 25 | 26 | def get_padding(kernel_size, dilation=1): 27 | return int((kernel_size * dilation - dilation) / 2) 28 | 29 | 30 | class ResBlock1(torch.nn.Module): 31 | def 
__init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): 32 | super(ResBlock1, self).__init__() 33 | self.h = h 34 | self.convs1 = nn.ModuleList([ 35 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 36 | padding=get_padding(kernel_size, dilation[0]))), 37 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 38 | padding=get_padding(kernel_size, dilation[1]))), 39 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 40 | padding=get_padding(kernel_size, dilation[2]))) 41 | ]) 42 | self.convs1.apply(init_weights) 43 | 44 | self.convs2 = nn.ModuleList([ 45 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 46 | padding=get_padding(kernel_size, 1))), 47 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 48 | padding=get_padding(kernel_size, 1))), 49 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 50 | padding=get_padding(kernel_size, 1))) 51 | ]) 52 | self.convs2.apply(init_weights) 53 | 54 | def forward(self, x): 55 | for c1, c2 in zip(self.convs1, self.convs2): 56 | xt = F.leaky_relu(x, LRELU_SLOPE) 57 | xt = c1(xt) 58 | xt = F.leaky_relu(xt, LRELU_SLOPE) 59 | xt = c2(xt) 60 | x = xt + x 61 | return x 62 | 63 | def remove_weight_norm(self): 64 | for l in self.convs1: 65 | remove_weight_norm(l) 66 | for l in self.convs2: 67 | remove_weight_norm(l) 68 | 69 | 70 | class ResBlock2(torch.nn.Module): 71 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): 72 | super(ResBlock2, self).__init__() 73 | self.h = h 74 | self.convs = nn.ModuleList([ 75 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 76 | padding=get_padding(kernel_size, dilation[0]))), 77 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 78 | padding=get_padding(kernel_size, dilation[1]))) 79 | ]) 80 | self.convs.apply(init_weights) 81 | 82 | def forward(self, x): 83 | for c in self.convs: 84 | xt = F.leaky_relu(x, LRELU_SLOPE) 85 | xt = c(xt) 86 | x = xt + x 87 | return x 88 | 89 | def remove_weight_norm(self): 90 | for l in self.convs: 91 | remove_weight_norm(l) 92 | 93 | 94 | class Conv1d1x1(Conv1d): 95 | """1x1 Conv1d with customized initialization.""" 96 | 97 | def __init__(self, in_channels, out_channels, bias): 98 | """Initialize 1x1 Conv1d module.""" 99 | super(Conv1d1x1, self).__init__(in_channels, out_channels, 100 | kernel_size=1, padding=0, 101 | dilation=1, bias=bias) 102 | 103 | 104 | class HifiGanGenerator(torch.nn.Module): 105 | def __init__(self, h, c_out=1): 106 | super(HifiGanGenerator, self).__init__() 107 | self.h = h 108 | self.num_kernels = len(h['resblock_kernel_sizes']) 109 | self.num_upsamples = len(h['upsample_rates']) 110 | 111 | if h['use_pitch_embed']: 112 | self.harmonic_num = 8 113 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h['upsample_rates'])) 114 | self.m_source = SourceModuleHnNSF( 115 | sampling_rate=h['audio_sample_rate'], 116 | harmonic_num=self.harmonic_num) 117 | self.noise_convs = nn.ModuleList() 118 | self.conv_pre = weight_norm(Conv1d(80, h['upsample_initial_channel'], 7, 1, padding=3)) 119 | resblock = ResBlock1 if h['resblock'] == '1' else ResBlock2 120 | 121 | self.ups = nn.ModuleList() 122 | for i, (u, k) in enumerate(zip(h['upsample_rates'], h['upsample_kernel_sizes'])): 123 | c_cur = h['upsample_initial_channel'] // (2 ** (i + 1)) 124 | self.ups.append(weight_norm( 125 | ConvTranspose1d(c_cur * 2, c_cur, k, u, padding=(k - u) // 2))) 126 | if 
h['use_pitch_embed']: 127 | if i + 1 < len(h['upsample_rates']): 128 | stride_f0 = np.prod(h['upsample_rates'][i + 1:]) 129 | self.noise_convs.append(Conv1d( 130 | 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2)) 131 | else: 132 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) 133 | 134 | self.resblocks = nn.ModuleList() 135 | for i in range(len(self.ups)): 136 | ch = h['upsample_initial_channel'] // (2 ** (i + 1)) 137 | for j, (k, d) in enumerate(zip(h['resblock_kernel_sizes'], h['resblock_dilation_sizes'])): 138 | self.resblocks.append(resblock(h, ch, k, d)) 139 | 140 | self.conv_post = weight_norm(Conv1d(ch, c_out, 7, 1, padding=3)) 141 | self.ups.apply(init_weights) 142 | self.conv_post.apply(init_weights) 143 | 144 | def forward(self, x, f0=None): 145 | if f0 is not None: 146 | # harmonic-source signal, noise-source signal, uv flag 147 | f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) 148 | har_source, noi_source, uv = self.m_source(f0) 149 | har_source = har_source.transpose(1, 2) 150 | 151 | x = self.conv_pre(x) 152 | for i in range(self.num_upsamples): 153 | x = F.leaky_relu(x, LRELU_SLOPE) 154 | x = self.ups[i](x) 155 | if f0 is not None: 156 | x_source = self.noise_convs[i](har_source) 157 | x = x + x_source 158 | xs = None 159 | for j in range(self.num_kernels): 160 | if xs is None: 161 | xs = self.resblocks[i * self.num_kernels + j](x) 162 | else: 163 | xs += self.resblocks[i * self.num_kernels + j](x) 164 | x = xs / self.num_kernels 165 | x = F.leaky_relu(x) 166 | x = self.conv_post(x) 167 | x = torch.tanh(x) 168 | 169 | return x 170 | 171 | def remove_weight_norm(self): 172 | print('Removing weight norm...') 173 | for l in self.ups: 174 | remove_weight_norm(l) 175 | for l in self.resblocks: 176 | l.remove_weight_norm() 177 | remove_weight_norm(self.conv_pre) 178 | remove_weight_norm(self.conv_post) 179 | 180 | 181 | class DiscriminatorP(torch.nn.Module): 182 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False, use_cond=False, c_in=1): 183 | super(DiscriminatorP, self).__init__() 184 | self.use_cond = use_cond 185 | if use_cond: 186 | from utils.hparams import hparams 187 | t = hparams['hop_size'] 188 | self.cond_net = torch.nn.ConvTranspose1d(80, 1, t * 2, stride=t, padding=t // 2) 189 | c_in = 2 190 | 191 | self.period = period 192 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 193 | self.convs = nn.ModuleList([ 194 | norm_f(Conv2d(c_in, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 195 | norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 196 | norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 197 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 198 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), 199 | ]) 200 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 201 | 202 | def forward(self, x, mel): 203 | fmap = [] 204 | if self.use_cond: 205 | x_mel = self.cond_net(mel) 206 | x = torch.cat([x_mel, x], 1) 207 | # 1d to 2d 208 | b, c, t = x.shape 209 | if t % self.period != 0: # pad first 210 | n_pad = self.period - (t % self.period) 211 | x = F.pad(x, (0, n_pad), "reflect") 212 | t = t + n_pad 213 | x = x.view(b, c, t // self.period, self.period) 214 | 215 | for l in self.convs: 216 | x = l(x) 217 | x = F.leaky_relu(x, LRELU_SLOPE) 218 | fmap.append(x) 219 | x = self.conv_post(x) 220 | fmap.append(x) 221 | x 
= torch.flatten(x, 1, -1) 222 | 223 | return x, fmap 224 | 225 | 226 | class MultiPeriodDiscriminator(torch.nn.Module): 227 | def __init__(self, use_cond=False, c_in=1): 228 | super(MultiPeriodDiscriminator, self).__init__() 229 | self.discriminators = nn.ModuleList([ 230 | DiscriminatorP(2, use_cond=use_cond, c_in=c_in), 231 | DiscriminatorP(3, use_cond=use_cond, c_in=c_in), 232 | DiscriminatorP(5, use_cond=use_cond, c_in=c_in), 233 | DiscriminatorP(7, use_cond=use_cond, c_in=c_in), 234 | DiscriminatorP(11, use_cond=use_cond, c_in=c_in), 235 | ]) 236 | 237 | def forward(self, y, y_hat, mel=None): 238 | y_d_rs = [] 239 | y_d_gs = [] 240 | fmap_rs = [] 241 | fmap_gs = [] 242 | for i, d in enumerate(self.discriminators): 243 | y_d_r, fmap_r = d(y, mel) 244 | y_d_g, fmap_g = d(y_hat, mel) 245 | y_d_rs.append(y_d_r) 246 | fmap_rs.append(fmap_r) 247 | y_d_gs.append(y_d_g) 248 | fmap_gs.append(fmap_g) 249 | 250 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 251 | 252 | 253 | class DiscriminatorS(torch.nn.Module): 254 | def __init__(self, use_spectral_norm=False, use_cond=False, upsample_rates=None, c_in=1): 255 | super(DiscriminatorS, self).__init__() 256 | self.use_cond = use_cond 257 | if use_cond: 258 | t = np.prod(upsample_rates) 259 | self.cond_net = torch.nn.ConvTranspose1d(80, 1, t * 2, stride=t, padding=t // 2) 260 | c_in = 2 261 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 262 | self.convs = nn.ModuleList([ 263 | norm_f(Conv1d(c_in, 128, 15, 1, padding=7)), 264 | norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), 265 | norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), 266 | norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), 267 | norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), 268 | norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), 269 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 270 | ]) 271 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 272 | 273 | def forward(self, x, mel): 274 | if self.use_cond: 275 | x_mel = self.cond_net(mel) 276 | x = torch.cat([x_mel, x], 1) 277 | fmap = [] 278 | for l in self.convs: 279 | x = l(x) 280 | x = F.leaky_relu(x, LRELU_SLOPE) 281 | fmap.append(x) 282 | x = self.conv_post(x) 283 | fmap.append(x) 284 | x = torch.flatten(x, 1, -1) 285 | 286 | return x, fmap 287 | 288 | 289 | class MultiScaleDiscriminator(torch.nn.Module): 290 | def __init__(self, use_cond=False, c_in=1): 291 | super(MultiScaleDiscriminator, self).__init__() 292 | from utils.hparams import hparams 293 | self.discriminators = nn.ModuleList([ 294 | DiscriminatorS(use_spectral_norm=True, use_cond=use_cond, 295 | upsample_rates=[4, 4, hparams['hop_size'] // 16], 296 | c_in=c_in), 297 | DiscriminatorS(use_cond=use_cond, 298 | upsample_rates=[4, 4, hparams['hop_size'] // 32], 299 | c_in=c_in), 300 | DiscriminatorS(use_cond=use_cond, 301 | upsample_rates=[4, 4, hparams['hop_size'] // 64], 302 | c_in=c_in), 303 | ]) 304 | self.meanpools = nn.ModuleList([ 305 | AvgPool1d(4, 2, padding=1), 306 | AvgPool1d(4, 2, padding=1) 307 | ]) 308 | 309 | def forward(self, y, y_hat, mel=None): 310 | y_d_rs = [] 311 | y_d_gs = [] 312 | fmap_rs = [] 313 | fmap_gs = [] 314 | for i, d in enumerate(self.discriminators): 315 | if i != 0: 316 | y = self.meanpools[i - 1](y) 317 | y_hat = self.meanpools[i - 1](y_hat) 318 | y_d_r, fmap_r = d(y, mel) 319 | y_d_g, fmap_g = d(y_hat, mel) 320 | y_d_rs.append(y_d_r) 321 | fmap_rs.append(fmap_r) 322 | y_d_gs.append(y_d_g) 323 | fmap_gs.append(fmap_g) 324 | 325 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 326 | 
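# --- Illustrative sketch (not part of the original file) ---------------------
# A hedged example of how the discriminators above and the loss helpers defined
# below are typically combined in a HiFi-GAN style training step. The names
# `gen`, `mel_fn`, `lambda_fm` and `lambda_mel` are assumptions for illustration,
# not symbols from this repo; the function is never called here.
def _example_gan_step(gen, mpd, msd, mel, f0, y, mel_fn, lambda_fm=2.0, lambda_mel=45.0):
    y_hat = gen(mel, f0)  # generated waveform, same shape as the real audio y

    # discriminator losses (generator output detached)
    y_dr, y_dg, _, _ = mpd(y, y_hat.detach())
    loss_d = sum(discriminator_loss(y_dr, y_dg))
    y_dr, y_dg, _, _ = msd(y, y_hat.detach())
    loss_d = loss_d + sum(discriminator_loss(y_dr, y_dg))

    # generator losses: adversarial + feature matching + mel reconstruction
    _, y_dg, fmap_r, fmap_g = mpd(y, y_hat)
    loss_g = generator_loss(y_dg) + lambda_fm * feature_loss(fmap_r, fmap_g)
    _, y_dg, fmap_r, fmap_g = msd(y, y_hat)
    loss_g = loss_g + generator_loss(y_dg) + lambda_fm * feature_loss(fmap_r, fmap_g)
    loss_g = loss_g + lambda_mel * F.l1_loss(mel_fn(y_hat), mel)
    return loss_d, loss_g
# ------------------------------------------------------------------------------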
327 | 328 | def feature_loss(fmap_r, fmap_g): 329 | loss = 0 330 | for dr, dg in zip(fmap_r, fmap_g): 331 | for rl, gl in zip(dr, dg): 332 | loss += torch.mean(torch.abs(rl - gl)) 333 | 334 | return loss * 2 335 | 336 | 337 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 338 | r_losses = 0 339 | g_losses = 0 340 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 341 | r_loss = torch.mean((1 - dr) ** 2) 342 | g_loss = torch.mean(dg ** 2) 343 | r_losses += r_loss 344 | g_losses += g_loss 345 | r_losses = r_losses / len(disc_real_outputs) 346 | g_losses = g_losses / len(disc_real_outputs) 347 | return r_losses, g_losses 348 | 349 | 350 | def cond_discriminator_loss(outputs): 351 | loss = 0 352 | for dg in outputs: 353 | g_loss = torch.mean(dg ** 2) 354 | loss += g_loss 355 | loss = loss / len(outputs) 356 | return loss 357 | 358 | 359 | def generator_loss(disc_outputs): 360 | loss = 0 361 | for dg in disc_outputs: 362 | l = torch.mean((1 - dg) ** 2) 363 | loss += l 364 | loss = loss / len(disc_outputs) 365 | return loss 366 | -------------------------------------------------------------------------------- /hifigan/modules/hifigan/mel_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.utils.data 4 | from librosa.filters import mel as librosa_mel_fn 5 | from scipy.io.wavfile import read 6 | 7 | MAX_WAV_VALUE = 32768.0 8 | 9 | 10 | def load_wav(full_path): 11 | sampling_rate, data = read(full_path) 12 | return data, sampling_rate 13 | 14 | 15 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 16 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 17 | 18 | 19 | def dynamic_range_decompression(x, C=1): 20 | return np.exp(x) / C 21 | 22 | 23 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 24 | return torch.log(torch.clamp(x, min=clip_val) * C) 25 | 26 | 27 | def dynamic_range_decompression_torch(x, C=1): 28 | return torch.exp(x) / C 29 | 30 | 31 | def spectral_normalize_torch(magnitudes): 32 | output = dynamic_range_compression_torch(magnitudes) 33 | return output 34 | 35 | 36 | def spectral_de_normalize_torch(magnitudes): 37 | output = dynamic_range_decompression_torch(magnitudes) 38 | return output 39 | 40 | 41 | mel_basis = {} 42 | hann_window = {} 43 | 44 | 45 | def mel_spectrogram(y, hparams, center=False, complex=False): 46 | # hop_size: 512 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate) 47 | # win_size: 2048 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate) 48 | # fmin: 55 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 49 | # fmax: 10000 # To be increased/reduced depending on data. 50 | # fft_size: 2048 # Extra window size is filled with 0 paddings to match this parameter 51 | # n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, 52 | n_fft = hparams['fft_size'] 53 | num_mels = hparams['audio_num_mel_bins'] 54 | sampling_rate = hparams['audio_sample_rate'] 55 | hop_size = hparams['hop_size'] 56 | win_size = hparams['win_size'] 57 | fmin = hparams['fmin'] 58 | fmax = hparams['fmax'] 59 | y = y.clamp(min=-1., max=1.) 
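    # (descriptive note, not in the original file) The steps below follow the
    # usual HiFi-GAN recipe: cache a mel filterbank and Hann window per device,
    # pad the waveform by (n_fft - hop_size) / 2 on each side, take the STFT
    # magnitude, project it onto the mel basis and apply log compression,
    # yielding roughly len(y) // hop_size frames.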
60 | global mel_basis, hann_window 61 | if fmax not in mel_basis: 62 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 63 | mel_basis[str(fmax) + '_' + str(y.device)] = torch.from_numpy(mel).float().to(y.device) 64 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 65 | 66 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 67 | mode='reflect') 68 | y = y.squeeze(1) 69 | 70 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], 71 | center=center, pad_mode='reflect', normalized=False, onesided=True) 72 | 73 | if not complex: 74 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 75 | spec = torch.matmul(mel_basis[str(fmax) + '_' + str(y.device)], spec) 76 | spec = spectral_normalize_torch(spec) 77 | else: 78 | B, C, T, _ = spec.shape 79 | spec = spec.transpose(1, 2) # [B, T, n_fft, 2] 80 | return spec 81 | -------------------------------------------------------------------------------- /hifigan/modules/nsf_hifigan/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | class AttrDict(dict): 6 | def __init__(self, *args, **kwargs): 7 | super(AttrDict, self).__init__(*args, **kwargs) 8 | self.__dict__ = self 9 | 10 | 11 | def build_env(config, config_name, path): 12 | t_path = os.path.join(path, config_name) 13 | if config != t_path: 14 | os.makedirs(path, exist_ok=True) 15 | shutil.copyfile(config, os.path.join(path, config_name)) -------------------------------------------------------------------------------- /hifigan/modules/nsf_hifigan/nvSTFT.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | os.environ["LRU_CACHE_CAPACITY"] = "3" 4 | import random 5 | import torch 6 | import torch.utils.data 7 | import numpy as np 8 | import librosa 9 | from librosa.util import normalize 10 | from librosa.filters import mel as librosa_mel_fn 11 | from scipy.io.wavfile import read 12 | import soundfile as sf 13 | 14 | def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False): 15 | sampling_rate = None 16 | try: 17 | data, sampling_rate = sf.read(full_path, always_2d=True)# than soundfile. 18 | except Exception as ex: 19 | print(f"'{full_path}' failed to load.\nException:") 20 | print(ex) 21 | if return_empty_on_exception: 22 | return [], sampling_rate or target_sr or 48000 23 | else: 24 | raise Exception(ex) 25 | 26 | if len(data.shape) > 1: 27 | data = data[:, 0] 28 | assert len(data) > 2# check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension) 29 | 30 | if np.issubdtype(data.dtype, np.integer): # if audio data is type int 31 | max_mag = -np.iinfo(data.dtype).min # maximum magnitude = min possible value of intXX 32 | else: # if audio data is type fp32 33 | max_mag = max(np.amax(data), -np.amin(data)) 34 | max_mag = (2**31)+1 if max_mag > (2**15) else ((2**15)+1 if max_mag > 1.01 else 1.0) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32 35 | 36 | data = torch.FloatTensor(data.astype(np.float32))/max_mag 37 | 38 | if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:# resample will crash with inf/NaN inputs. 
return_empty_on_exception will return empty arr instead of except 39 | return [], sampling_rate or target_sr or 48000 40 | if target_sr is not None and sampling_rate != target_sr: 41 | data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr)) 42 | sampling_rate = target_sr 43 | 44 | return data, sampling_rate 45 | 46 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 47 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 48 | 49 | def dynamic_range_decompression(x, C=1): 50 | return np.exp(x) / C 51 | 52 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 53 | return torch.log(torch.clamp(x, min=clip_val) * C) 54 | 55 | def dynamic_range_decompression_torch(x, C=1): 56 | return torch.exp(x) / C 57 | 58 | class STFT(): 59 | def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5): 60 | self.target_sr = sr 61 | 62 | self.n_mels = n_mels 63 | self.n_fft = n_fft 64 | self.win_size = win_size 65 | self.hop_length = hop_length 66 | self.fmin = fmin 67 | self.fmax = fmax 68 | self.clip_val = clip_val 69 | self.mel_basis = {} 70 | self.hann_window = {} 71 | 72 | def get_mel(self, y, center=False): 73 | sampling_rate = self.target_sr 74 | n_mels = self.n_mels 75 | n_fft = self.n_fft 76 | win_size = self.win_size 77 | hop_length = self.hop_length 78 | fmin = self.fmin 79 | fmax = self.fmax 80 | clip_val = self.clip_val 81 | 82 | if torch.min(y) < -1.: 83 | print('min value is ', torch.min(y)) 84 | if torch.max(y) > 1.: 85 | print('max value is ', torch.max(y)) 86 | 87 | if fmax not in self.mel_basis: 88 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) 89 | self.mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device) 90 | self.hann_window[str(y.device)] = torch.hann_window(self.win_size).to(y.device) 91 | 92 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_length)/2), int((n_fft-hop_length)/2)), mode='reflect') 93 | y = y.squeeze(1) 94 | 95 | spec = torch.stft(y, n_fft, hop_length=hop_length, win_length=win_size, window=self.hann_window[str(y.device)], 96 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 97 | # print(111,spec) 98 | spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) 99 | # print(222,spec) 100 | spec = torch.matmul(self.mel_basis[str(fmax)+'_'+str(y.device)], spec) 101 | # print(333,spec) 102 | spec = dynamic_range_compression_torch(spec, clip_val=clip_val) 103 | # print(444,spec) 104 | return spec 105 | 106 | def __call__(self, audiopath): 107 | audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr) 108 | spect = self.get_mel(audio.unsqueeze(0)).squeeze(0) 109 | return spect 110 | 111 | stft = STFT() -------------------------------------------------------------------------------- /hifigan/modules/nsf_hifigan/utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import matplotlib 4 | import torch 5 | from torch.nn.utils import weight_norm 6 | matplotlib.use("Agg") 7 | import matplotlib.pylab as plt 8 | 9 | 10 | def plot_spectrogram(spectrogram): 11 | fig, ax = plt.subplots(figsize=(10, 2)) 12 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 13 | interpolation='none') 14 | plt.colorbar(im, ax=ax) 15 | 16 | fig.canvas.draw() 17 | plt.close() 18 | 19 | return fig 20 | 21 | 22 | def init_weights(m, mean=0.0, std=0.01): 23 | classname = 
m.__class__.__name__ 24 | if classname.find("Conv") != -1: 25 | m.weight.data.normal_(mean, std) 26 | 27 | 28 | def apply_weight_norm(m): 29 | classname = m.__class__.__name__ 30 | if classname.find("Conv") != -1: 31 | weight_norm(m) 32 | 33 | 34 | def get_padding(kernel_size, dilation=1): 35 | return int((kernel_size*dilation - dilation)/2) 36 | 37 | 38 | def load_checkpoint(filepath, device): 39 | assert os.path.isfile(filepath) 40 | print("Loading '{}'".format(filepath)) 41 | checkpoint_dict = torch.load(filepath, map_location=device) 42 | print("Complete.") 43 | return checkpoint_dict 44 | 45 | 46 | def save_checkpoint(filepath, obj): 47 | print("Saving checkpoint to {}".format(filepath)) 48 | torch.save(obj, filepath) 49 | print("Complete.") 50 | 51 | 52 | def del_old_checkpoints(cp_dir, prefix, n_models=2): 53 | pattern = os.path.join(cp_dir, prefix + '????????') 54 | cp_list = glob.glob(pattern) # get checkpoint paths 55 | cp_list = sorted(cp_list)# sort by iter 56 | if len(cp_list) > n_models: # if more than n_models models are found 57 | for cp in cp_list[:-n_models]:# delete the oldest models other than lastest n_models 58 | open(cp, 'w').close()# empty file contents 59 | os.unlink(cp)# delete file (move to trash when using Colab) 60 | 61 | 62 | def scan_checkpoint(cp_dir, prefix): 63 | pattern = os.path.join(cp_dir, prefix + '????????') 64 | cp_list = glob.glob(pattern) 65 | if len(cp_list) == 0: 66 | return None 67 | return sorted(cp_list)[-1] -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/innnky/glow-svc/15d280a2b87a5938da3cb68e85273102a42b0d75/hifigan/modules/parallel_wavegan/__init__.py -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .causal_conv import * # NOQA 2 | from .pqmf import * # NOQA 3 | from .residual_block import * # NOQA 4 | from hifigan.modules.parallel_wavegan.layers.residual_stack import * # NOQA 5 | from .upsample import * # NOQA 6 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/layers/causal_conv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Causal convolusion layer modules.""" 7 | 8 | 9 | import torch 10 | 11 | 12 | class CausalConv1d(torch.nn.Module): 13 | """CausalConv1d module with customized initialization.""" 14 | 15 | def __init__(self, in_channels, out_channels, kernel_size, 16 | dilation=1, bias=True, pad="ConstantPad1d", pad_params={"value": 0.0}): 17 | """Initialize CausalConv1d module.""" 18 | super(CausalConv1d, self).__init__() 19 | self.pad = getattr(torch.nn, pad)((kernel_size - 1) * dilation, **pad_params) 20 | self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size, 21 | dilation=dilation, bias=bias) 22 | 23 | def forward(self, x): 24 | """Calculate forward propagation. 25 | 26 | Args: 27 | x (Tensor): Input tensor (B, in_channels, T). 28 | 29 | Returns: 30 | Tensor: Output tensor (B, out_channels, T). 
31 | 32 | """ 33 | return self.conv(self.pad(x))[:, :, :x.size(2)] 34 | 35 | 36 | class CausalConvTranspose1d(torch.nn.Module): 37 | """CausalConvTranspose1d module with customized initialization.""" 38 | 39 | def __init__(self, in_channels, out_channels, kernel_size, stride, bias=True): 40 | """Initialize CausalConvTranspose1d module.""" 41 | super(CausalConvTranspose1d, self).__init__() 42 | self.deconv = torch.nn.ConvTranspose1d( 43 | in_channels, out_channels, kernel_size, stride, bias=bias) 44 | self.stride = stride 45 | 46 | def forward(self, x): 47 | """Calculate forward propagation. 48 | 49 | Args: 50 | x (Tensor): Input tensor (B, in_channels, T_in). 51 | 52 | Returns: 53 | Tensor: Output tensor (B, out_channels, T_out). 54 | 55 | """ 56 | return self.deconv(x)[:, :, :-self.stride] 57 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/layers/pqmf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Pseudo QMF modules.""" 7 | 8 | import numpy as np 9 | import torch 10 | import torch.nn.functional as F 11 | 12 | from scipy.signal import kaiser 13 | 14 | 15 | def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0): 16 | """Design prototype filter for PQMF. 17 | 18 | This method is based on `A Kaiser window approach for the design of prototype 19 | filters of cosine modulated filterbanks`_. 20 | 21 | Args: 22 | taps (int): The number of filter taps. 23 | cutoff_ratio (float): Cut-off frequency ratio. 24 | beta (float): Beta coefficient for kaiser window. 25 | 26 | Returns: 27 | ndarray: Impluse response of prototype filter (taps + 1,). 28 | 29 | .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`: 30 | https://ieeexplore.ieee.org/abstract/document/681427 31 | 32 | """ 33 | # check the arguments are valid 34 | assert taps % 2 == 0, "The number of taps mush be even number." 35 | assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0." 36 | 37 | # make initial filter 38 | omega_c = np.pi * cutoff_ratio 39 | with np.errstate(invalid='ignore'): 40 | h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) \ 41 | / (np.pi * (np.arange(taps + 1) - 0.5 * taps)) 42 | h_i[taps // 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form 43 | 44 | # apply kaiser window 45 | w = kaiser(taps + 1, beta) 46 | h = h_i * w 47 | 48 | return h 49 | 50 | 51 | class PQMF(torch.nn.Module): 52 | """PQMF module. 53 | 54 | This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_. 55 | 56 | .. _`Near-perfect-reconstruction pseudo-QMF banks`: 57 | https://ieeexplore.ieee.org/document/258122 58 | 59 | """ 60 | 61 | def __init__(self, subbands=4, taps=62, cutoff_ratio=0.15, beta=9.0): 62 | """Initilize PQMF module. 63 | 64 | Args: 65 | subbands (int): The number of subbands. 66 | taps (int): The number of filter taps. 67 | cutoff_ratio (float): Cut-off frequency ratio. 68 | beta (float): Beta coefficient for kaiser window. 
69 | 70 | """ 71 | super(PQMF, self).__init__() 72 | 73 | # define filter coefficient 74 | h_proto = design_prototype_filter(taps, cutoff_ratio, beta) 75 | h_analysis = np.zeros((subbands, len(h_proto))) 76 | h_synthesis = np.zeros((subbands, len(h_proto))) 77 | for k in range(subbands): 78 | h_analysis[k] = 2 * h_proto * np.cos( 79 | (2 * k + 1) * (np.pi / (2 * subbands)) * 80 | (np.arange(taps + 1) - ((taps - 1) / 2)) + 81 | (-1) ** k * np.pi / 4) 82 | h_synthesis[k] = 2 * h_proto * np.cos( 83 | (2 * k + 1) * (np.pi / (2 * subbands)) * 84 | (np.arange(taps + 1) - ((taps - 1) / 2)) - 85 | (-1) ** k * np.pi / 4) 86 | 87 | # convert to tensor 88 | analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1) 89 | synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0) 90 | 91 | # register coefficients as beffer 92 | self.register_buffer("analysis_filter", analysis_filter) 93 | self.register_buffer("synthesis_filter", synthesis_filter) 94 | 95 | # filter for downsampling & upsampling 96 | updown_filter = torch.zeros((subbands, subbands, subbands)).float() 97 | for k in range(subbands): 98 | updown_filter[k, k, 0] = 1.0 99 | self.register_buffer("updown_filter", updown_filter) 100 | self.subbands = subbands 101 | 102 | # keep padding info 103 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0) 104 | 105 | def analysis(self, x): 106 | """Analysis with PQMF. 107 | 108 | Args: 109 | x (Tensor): Input tensor (B, 1, T). 110 | 111 | Returns: 112 | Tensor: Output tensor (B, subbands, T // subbands). 113 | 114 | """ 115 | x = F.conv1d(self.pad_fn(x), self.analysis_filter) 116 | return F.conv1d(x, self.updown_filter, stride=self.subbands) 117 | 118 | def synthesis(self, x): 119 | """Synthesis with PQMF. 120 | 121 | Args: 122 | x (Tensor): Input tensor (B, subbands, T // subbands). 123 | 124 | Returns: 125 | Tensor: Output tensor (B, 1, T). 126 | 127 | """ 128 | x = F.conv_transpose1d(x, self.updown_filter * self.subbands, stride=self.subbands) 129 | return F.conv1d(self.pad_fn(x), self.synthesis_filter) 130 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/layers/residual_block.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Residual block module in WaveNet. 4 | 5 | This code is modified from https://github.com/r9y9/wavenet_vocoder. 
6 | 7 | """ 8 | 9 | import math 10 | 11 | import torch 12 | import torch.nn.functional as F 13 | 14 | 15 | class Conv1d(torch.nn.Conv1d): 16 | """Conv1d module with customized initialization.""" 17 | 18 | def __init__(self, *args, **kwargs): 19 | """Initialize Conv1d module.""" 20 | super(Conv1d, self).__init__(*args, **kwargs) 21 | 22 | def reset_parameters(self): 23 | """Reset parameters.""" 24 | torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu") 25 | if self.bias is not None: 26 | torch.nn.init.constant_(self.bias, 0.0) 27 | 28 | 29 | class Conv1d1x1(Conv1d): 30 | """1x1 Conv1d with customized initialization.""" 31 | 32 | def __init__(self, in_channels, out_channels, bias): 33 | """Initialize 1x1 Conv1d module.""" 34 | super(Conv1d1x1, self).__init__(in_channels, out_channels, 35 | kernel_size=1, padding=0, 36 | dilation=1, bias=bias) 37 | 38 | 39 | class ResidualBlock(torch.nn.Module): 40 | """Residual block module in WaveNet.""" 41 | 42 | def __init__(self, 43 | kernel_size=3, 44 | residual_channels=64, 45 | gate_channels=128, 46 | skip_channels=64, 47 | aux_channels=80, 48 | dropout=0.0, 49 | dilation=1, 50 | bias=True, 51 | use_causal_conv=False 52 | ): 53 | """Initialize ResidualBlock module. 54 | 55 | Args: 56 | kernel_size (int): Kernel size of dilation convolution layer. 57 | residual_channels (int): Number of channels for residual connection. 58 | skip_channels (int): Number of channels for skip connection. 59 | aux_channels (int): Local conditioning channels i.e. auxiliary input dimension. 60 | dropout (float): Dropout probability. 61 | dilation (int): Dilation factor. 62 | bias (bool): Whether to add bias parameter in convolution layers. 63 | use_causal_conv (bool): Whether to use use_causal_conv or non-use_causal_conv convolution. 64 | 65 | """ 66 | super(ResidualBlock, self).__init__() 67 | self.dropout = dropout 68 | # no future time stamps available 69 | if use_causal_conv: 70 | padding = (kernel_size - 1) * dilation 71 | else: 72 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 73 | padding = (kernel_size - 1) // 2 * dilation 74 | self.use_causal_conv = use_causal_conv 75 | 76 | # dilation conv 77 | self.conv = Conv1d(residual_channels, gate_channels, kernel_size, 78 | padding=padding, dilation=dilation, bias=bias) 79 | 80 | # local conditioning 81 | if aux_channels > 0: 82 | self.conv1x1_aux = Conv1d1x1(aux_channels, gate_channels, bias=False) 83 | else: 84 | self.conv1x1_aux = None 85 | 86 | # conv output is split into two groups 87 | gate_out_channels = gate_channels // 2 88 | self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, bias=bias) 89 | self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_channels, bias=bias) 90 | 91 | def forward(self, x, c): 92 | """Calculate forward propagation. 93 | 94 | Args: 95 | x (Tensor): Input tensor (B, residual_channels, T). 96 | c (Tensor): Local conditioning auxiliary tensor (B, aux_channels, T). 97 | 98 | Returns: 99 | Tensor: Output tensor for residual connection (B, residual_channels, T). 100 | Tensor: Output tensor for skip connection (B, skip_channels, T). 
101 | 102 | """ 103 | residual = x 104 | x = F.dropout(x, p=self.dropout, training=self.training) 105 | x = self.conv(x) 106 | 107 | # remove future time steps if use_causal_conv conv 108 | x = x[:, :, :residual.size(-1)] if self.use_causal_conv else x 109 | 110 | # split into two part for gated activation 111 | splitdim = 1 112 | xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim) 113 | 114 | # local conditioning 115 | if c is not None: 116 | assert self.conv1x1_aux is not None 117 | c = self.conv1x1_aux(c) 118 | ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim) 119 | xa, xb = xa + ca, xb + cb 120 | 121 | x = torch.tanh(xa) * torch.sigmoid(xb) 122 | 123 | # for skip connection 124 | s = self.conv1x1_skip(x) 125 | 126 | # for residual connection 127 | x = (self.conv1x1_out(x) + residual) * math.sqrt(0.5) 128 | 129 | return x, s 130 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/layers/residual_stack.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Residual stack module in MelGAN.""" 7 | 8 | import torch 9 | 10 | from . import CausalConv1d 11 | 12 | 13 | class ResidualStack(torch.nn.Module): 14 | """Residual stack module introduced in MelGAN.""" 15 | 16 | def __init__(self, 17 | kernel_size=3, 18 | channels=32, 19 | dilation=1, 20 | bias=True, 21 | nonlinear_activation="LeakyReLU", 22 | nonlinear_activation_params={"negative_slope": 0.2}, 23 | pad="ReflectionPad1d", 24 | pad_params={}, 25 | use_causal_conv=False, 26 | ): 27 | """Initialize ResidualStack module. 28 | 29 | Args: 30 | kernel_size (int): Kernel size of dilation convolution layer. 31 | channels (int): Number of channels of convolution layers. 32 | dilation (int): Dilation factor. 33 | bias (bool): Whether to add bias parameter in convolution layers. 34 | nonlinear_activation (str): Activation function module name. 35 | nonlinear_activation_params (dict): Hyperparameters for activation function. 36 | pad (str): Padding function module name before dilated convolution layer. 37 | pad_params (dict): Hyperparameters for padding function. 38 | use_causal_conv (bool): Whether to use causal convolution. 39 | 40 | """ 41 | super(ResidualStack, self).__init__() 42 | 43 | # defile residual stack part 44 | if not use_causal_conv: 45 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 46 | self.stack = torch.nn.Sequential( 47 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 48 | getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params), 49 | torch.nn.Conv1d(channels, channels, kernel_size, dilation=dilation, bias=bias), 50 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 51 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 52 | ) 53 | else: 54 | self.stack = torch.nn.Sequential( 55 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 56 | CausalConv1d(channels, channels, kernel_size, dilation=dilation, 57 | bias=bias, pad=pad, pad_params=pad_params), 58 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 59 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 60 | ) 61 | 62 | # defile extra layer for skip connection 63 | self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias) 64 | 65 | def forward(self, c): 66 | """Calculate forward propagation. 
67 | 68 | Args: 69 | c (Tensor): Input tensor (B, channels, T). 70 | 71 | Returns: 72 | Tensor: Output tensor (B, channels, T). 73 | 74 | """ 75 | return self.stack(c) + self.skip_layer(c) 76 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/layers/tf_layers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 MINH ANH (@dathudeptrai) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Tensorflow Layer modules compatible with pytorch.""" 7 | 8 | import tensorflow as tf 9 | 10 | 11 | class TFReflectionPad1d(tf.keras.layers.Layer): 12 | """Tensorflow ReflectionPad1d module.""" 13 | 14 | def __init__(self, padding_size): 15 | """Initialize TFReflectionPad1d module. 16 | 17 | Args: 18 | padding_size (int): Padding size. 19 | 20 | """ 21 | super(TFReflectionPad1d, self).__init__() 22 | self.padding_size = padding_size 23 | 24 | @tf.function 25 | def call(self, x): 26 | """Calculate forward propagation. 27 | 28 | Args: 29 | x (Tensor): Input tensor (B, T, 1, C). 30 | 31 | Returns: 32 | Tensor: Padded tensor (B, T + 2 * padding_size, 1, C). 33 | 34 | """ 35 | return tf.pad(x, [[0, 0], [self.padding_size, self.padding_size], [0, 0], [0, 0]], "REFLECT") 36 | 37 | 38 | class TFConvTranspose1d(tf.keras.layers.Layer): 39 | """Tensorflow ConvTranspose1d module.""" 40 | 41 | def __init__(self, channels, kernel_size, stride, padding): 42 | """Initialize TFConvTranspose1d module. 43 | 44 | Args: 45 | channels (int): Number of channels. 46 | kernel_size (int): Kernel size. 47 | stride (int): Stride width. 48 | padding (str): Padding type ("same" or "valid"). 49 | 50 | """ 51 | super(TFConvTranspose1d, self).__init__() 52 | self.conv1d_transpose = tf.keras.layers.Conv2DTranspose( 53 | filters=channels, 54 | kernel_size=(kernel_size, 1), 55 | strides=(stride, 1), 56 | padding=padding, 57 | ) 58 | 59 | @tf.function 60 | def call(self, x): 61 | """Calculate forward propagation. 62 | 63 | Args: 64 | x (Tensor): Input tensor (B, T, 1, C). 65 | 66 | Returns: 67 | Tensor: Output tensor (B, T', 1, C'). 68 | 69 | """ 70 | x = self.conv1d_transpose(x) 71 | return x 72 | 73 | 74 | class TFResidualStack(tf.keras.layers.Layer): 75 | """Tensorflow ResidualStack module.""" 76 | 77 | def __init__(self, 78 | kernel_size, 79 | channels, 80 | dilation, 81 | bias, 82 | nonlinear_activation, 83 | nonlinear_activation_params, 84 | padding, 85 | ): 86 | """Initialize TFResidualStack module. 87 | 88 | Args: 89 | kernel_size (int): Kernel size. 90 | channels (int): Number of channels. 91 | dilation (int): Dilation factor. 92 | bias (bool): Whether to add bias parameter in convolution layers. 93 | nonlinear_activation (str): Activation function module name. 94 | nonlinear_activation_params (dict): Hyperparameters for activation function. 95 | padding (str): Padding type ("same" or "valid").
96 | 97 | """ 98 | super(TFResidualStack, self).__init__() 99 | self.block = [ 100 | getattr(tf.keras.layers, nonlinear_activation)(**nonlinear_activation_params), 101 | TFReflectionPad1d(dilation), 102 | tf.keras.layers.Conv2D( 103 | filters=channels, 104 | kernel_size=(kernel_size, 1), 105 | dilation_rate=(dilation, 1), 106 | use_bias=bias, 107 | padding="valid", 108 | ), 109 | getattr(tf.keras.layers, nonlinear_activation)(**nonlinear_activation_params), 110 | tf.keras.layers.Conv2D(filters=channels, kernel_size=1, use_bias=bias) 111 | ] 112 | self.shortcut = tf.keras.layers.Conv2D(filters=channels, kernel_size=1, use_bias=bias) 113 | 114 | @tf.function 115 | def call(self, x): 116 | """Calculate forward propagation. 117 | 118 | Args: 119 | x (Tensor): Input tensor (B, T, 1, C). 120 | 121 | Returns: 122 | Tensor: Output tensor (B, T, 1, C). 123 | 124 | """ 125 | _x = tf.identity(x) 126 | for i, layer in enumerate(self.block): 127 | _x = layer(_x) 128 | shortcut = self.shortcut(x) 129 | return shortcut + _x 130 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/layers/upsample.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Upsampling module. 4 | 5 | This code is modified from https://github.com/r9y9/wavenet_vocoder. 6 | 7 | """ 8 | 9 | import numpy as np 10 | import torch 11 | import torch.nn.functional as F 12 | 13 | from . import Conv1d 14 | 15 | 16 | class Stretch2d(torch.nn.Module): 17 | """Stretch2d module.""" 18 | 19 | def __init__(self, x_scale, y_scale, mode="nearest"): 20 | """Initialize Stretch2d module. 21 | 22 | Args: 23 | x_scale (int): X scaling factor (Time axis in spectrogram). 24 | y_scale (int): Y scaling factor (Frequency axis in spectrogram). 25 | mode (str): Interpolation mode. 26 | 27 | """ 28 | super(Stretch2d, self).__init__() 29 | self.x_scale = x_scale 30 | self.y_scale = y_scale 31 | self.mode = mode 32 | 33 | def forward(self, x): 34 | """Calculate forward propagation. 35 | 36 | Args: 37 | x (Tensor): Input tensor (B, C, F, T). 38 | 39 | Returns: 40 | Tensor: Interpolated tensor (B, C, F * y_scale, T * x_scale), 41 | 42 | """ 43 | return F.interpolate( 44 | x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode) 45 | 46 | 47 | class Conv2d(torch.nn.Conv2d): 48 | """Conv2d module with customized initialization.""" 49 | 50 | def __init__(self, *args, **kwargs): 51 | """Initialize Conv2d module.""" 52 | super(Conv2d, self).__init__(*args, **kwargs) 53 | 54 | def reset_parameters(self): 55 | """Reset parameters.""" 56 | self.weight.data.fill_(1. / np.prod(self.kernel_size)) 57 | if self.bias is not None: 58 | torch.nn.init.constant_(self.bias, 0.0) 59 | 60 | 61 | class UpsampleNetwork(torch.nn.Module): 62 | """Upsampling network module.""" 63 | 64 | def __init__(self, 65 | upsample_scales, 66 | nonlinear_activation=None, 67 | nonlinear_activation_params={}, 68 | interpolate_mode="nearest", 69 | freq_axis_kernel_size=1, 70 | use_causal_conv=False, 71 | ): 72 | """Initialize upsampling network module. 73 | 74 | Args: 75 | upsample_scales (list): List of upsampling scales. 76 | nonlinear_activation (str): Activation function name. 77 | nonlinear_activation_params (dict): Arguments for specified activation function. 78 | interpolate_mode (str): Interpolation mode. 79 | freq_axis_kernel_size (int): Kernel size in the direction of frequency axis. 
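            use_causal_conv (bool): Whether to use causal structure.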
80 | 81 | """ 82 | super(UpsampleNetwork, self).__init__() 83 | self.use_causal_conv = use_causal_conv 84 | self.up_layers = torch.nn.ModuleList() 85 | for scale in upsample_scales: 86 | # interpolation layer 87 | stretch = Stretch2d(scale, 1, interpolate_mode) 88 | self.up_layers += [stretch] 89 | 90 | # conv layer 91 | assert (freq_axis_kernel_size - 1) % 2 == 0, "Not support even number freq axis kernel size." 92 | freq_axis_padding = (freq_axis_kernel_size - 1) // 2 93 | kernel_size = (freq_axis_kernel_size, scale * 2 + 1) 94 | if use_causal_conv: 95 | padding = (freq_axis_padding, scale * 2) 96 | else: 97 | padding = (freq_axis_padding, scale) 98 | conv = Conv2d(1, 1, kernel_size=kernel_size, padding=padding, bias=False) 99 | self.up_layers += [conv] 100 | 101 | # nonlinear 102 | if nonlinear_activation is not None: 103 | nonlinear = getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params) 104 | self.up_layers += [nonlinear] 105 | 106 | def forward(self, c): 107 | """Calculate forward propagation. 108 | 109 | Args: 110 | c : Input tensor (B, C, T). 111 | 112 | Returns: 113 | Tensor: Upsampled tensor (B, C, T'), where T' = T * prod(upsample_scales). 114 | 115 | """ 116 | c = c.unsqueeze(1) # (B, 1, C, T) 117 | for f in self.up_layers: 118 | if self.use_causal_conv and isinstance(f, Conv2d): 119 | c = f(c)[..., :c.size(-1)] 120 | else: 121 | c = f(c) 122 | return c.squeeze(1) # (B, C, T') 123 | 124 | 125 | class ConvInUpsampleNetwork(torch.nn.Module): 126 | """Convolution + upsampling network module.""" 127 | 128 | def __init__(self, 129 | upsample_scales, 130 | nonlinear_activation=None, 131 | nonlinear_activation_params={}, 132 | interpolate_mode="nearest", 133 | freq_axis_kernel_size=1, 134 | aux_channels=80, 135 | aux_context_window=0, 136 | use_causal_conv=False 137 | ): 138 | """Initialize convolution + upsampling network module. 139 | 140 | Args: 141 | upsample_scales (list): List of upsampling scales. 142 | nonlinear_activation (str): Activation function name. 143 | nonlinear_activation_params (dict): Arguments for specified activation function. 144 | mode (str): Interpolation mode. 145 | freq_axis_kernel_size (int): Kernel size in the direction of frequency axis. 146 | aux_channels (int): Number of channels of pre-convolutional layer. 147 | aux_context_window (int): Context window size of the pre-convolutional layer. 148 | use_causal_conv (bool): Whether to use causal structure. 149 | 150 | """ 151 | super(ConvInUpsampleNetwork, self).__init__() 152 | self.aux_context_window = aux_context_window 153 | self.use_causal_conv = use_causal_conv and aux_context_window > 0 154 | # To capture wide-context information in conditional features 155 | kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1 156 | # NOTE(kan-bayashi): Here do not use padding because the input is already padded 157 | self.conv_in = Conv1d(aux_channels, aux_channels, kernel_size=kernel_size, bias=False) 158 | self.upsample = UpsampleNetwork( 159 | upsample_scales=upsample_scales, 160 | nonlinear_activation=nonlinear_activation, 161 | nonlinear_activation_params=nonlinear_activation_params, 162 | interpolate_mode=interpolate_mode, 163 | freq_axis_kernel_size=freq_axis_kernel_size, 164 | use_causal_conv=use_causal_conv, 165 | ) 166 | 167 | def forward(self, c): 168 | """Calculate forward propagation. 169 | 170 | Args: 171 | c : Input tensor (B, C, T'). 
172 | 173 | Returns: 174 | Tensor: Upsampled tensor (B, C, T), 175 | where T = (T' - aux_context_window * 2) * prod(upsample_scales). 176 | 177 | Note: 178 | The length of inputs considers the context window size. 179 | 180 | """ 181 | c_ = self.conv_in(c) 182 | c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_ 183 | return self.upsample(c) 184 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .stft_loss import * # NOQA 2 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/losses/stft_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """STFT-based Loss modules.""" 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | def stft(x, fft_size, hop_size, win_length, window): 13 | """Perform STFT and convert to magnitude spectrogram. 14 | 15 | Args: 16 | x (Tensor): Input signal tensor (B, T). 17 | fft_size (int): FFT size. 18 | hop_size (int): Hop size. 19 | win_length (int): Window length. 20 | window (str): Window function type. 21 | 22 | Returns: 23 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). 24 | 25 | """ 26 | x_stft = torch.stft(x, fft_size, hop_size, win_length, window) 27 | real = x_stft[..., 0] 28 | imag = x_stft[..., 1] 29 | 30 | # NOTE(kan-bayashi): clamp is needed to avoid nan or inf 31 | return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1) 32 | 33 | 34 | class SpectralConvergengeLoss(torch.nn.Module): 35 | """Spectral convergence loss module.""" 36 | 37 | def __init__(self): 38 | """Initilize spectral convergence loss module.""" 39 | super(SpectralConvergengeLoss, self).__init__() 40 | 41 | def forward(self, x_mag, y_mag): 42 | """Calculate forward propagation. 43 | 44 | Args: 45 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 46 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 47 | 48 | Returns: 49 | Tensor: Spectral convergence loss value. 50 | 51 | """ 52 | return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro") 53 | 54 | 55 | class LogSTFTMagnitudeLoss(torch.nn.Module): 56 | """Log STFT magnitude loss module.""" 57 | 58 | def __init__(self): 59 | """Initilize los STFT magnitude loss module.""" 60 | super(LogSTFTMagnitudeLoss, self).__init__() 61 | 62 | def forward(self, x_mag, y_mag): 63 | """Calculate forward propagation. 64 | 65 | Args: 66 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 67 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 68 | 69 | Returns: 70 | Tensor: Log STFT magnitude loss value. 
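        Example (illustrative sketch added to this listing, not part of the upstream docstring; the magnitude tensors are assumed strictly positive, as produced by the clamped stft() helper above):

            >>> criterion = LogSTFTMagnitudeLoss()
            >>> x_mag = torch.rand(4, 100, 513) + 1e-7   # predicted magnitudes
            >>> y_mag = torch.rand(4, 100, 513) + 1e-7   # ground-truth magnitudes
            >>> loss = criterion(x_mag, y_mag)           # scalar L1 distance between log spectra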
71 | 72 | """ 73 | return F.l1_loss(torch.log(y_mag), torch.log(x_mag)) 74 | 75 | 76 | class STFTLoss(torch.nn.Module): 77 | """STFT loss module.""" 78 | 79 | def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"): 80 | """Initialize STFT loss module.""" 81 | super(STFTLoss, self).__init__() 82 | self.fft_size = fft_size 83 | self.shift_size = shift_size 84 | self.win_length = win_length 85 | self.window = getattr(torch, window)(win_length) 86 | self.spectral_convergenge_loss = SpectralConvergengeLoss() 87 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() 88 | 89 | def forward(self, x, y): 90 | """Calculate forward propagation. 91 | 92 | Args: 93 | x (Tensor): Predicted signal (B, T). 94 | y (Tensor): Groundtruth signal (B, T). 95 | 96 | Returns: 97 | Tensor: Spectral convergence loss value. 98 | Tensor: Log STFT magnitude loss value. 99 | 100 | """ 101 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) 102 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) 103 | sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) 104 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) 105 | 106 | return sc_loss, mag_loss 107 | 108 | 109 | class MultiResolutionSTFTLoss(torch.nn.Module): 110 | """Multi resolution STFT loss module.""" 111 | 112 | def __init__(self, 113 | fft_sizes=[1024, 2048, 512], 114 | hop_sizes=[120, 240, 50], 115 | win_lengths=[600, 1200, 240], 116 | window="hann_window"): 117 | """Initialize Multi resolution STFT loss module. 118 | 119 | Args: 120 | fft_sizes (list): List of FFT sizes. 121 | hop_sizes (list): List of hop sizes. 122 | win_lengths (list): List of window lengths. 123 | window (str): Window function type. 124 | 125 | """ 126 | super(MultiResolutionSTFTLoss, self).__init__() 127 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) 128 | self.stft_losses = torch.nn.ModuleList() 129 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): 130 | self.stft_losses += [STFTLoss(fs, ss, wl, window)] 131 | 132 | def forward(self, x, y): 133 | """Calculate forward propagation. 134 | 135 | Args: 136 | x (Tensor): Predicted signal (B, T). 137 | y (Tensor): Groundtruth signal (B, T). 138 | 139 | Returns: 140 | Tensor: Multi resolution spectral convergence loss value. 141 | Tensor: Multi resolution log STFT magnitude loss value. 142 | 143 | """ 144 | sc_loss = 0.0 145 | mag_loss = 0.0 146 | for f in self.stft_losses: 147 | sc_l, mag_l = f(x, y) 148 | sc_loss += sc_l 149 | mag_loss += mag_l 150 | sc_loss /= len(self.stft_losses) 151 | mag_loss /= len(self.stft_losses) 152 | 153 | return sc_loss, mag_loss 154 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .melgan import * # NOQA 2 | from .parallel_wavegan import * # NOQA 3 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from torch.optim import * # NOQA 2 | from .radam import * # NOQA 3 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/optimizers/radam.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """RAdam optimizer. 
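Implements the variance-rectified Adam update proposed by Liu et al. in "On the Variance of the Adaptive Learning Rate and Beyond" (ICLR 2020).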
4 | 5 | This code is drived from https://github.com/LiyuanLucasLiu/RAdam. 6 | """ 7 | 8 | import math 9 | import torch 10 | 11 | from torch.optim.optimizer import Optimizer 12 | 13 | 14 | class RAdam(Optimizer): 15 | """Rectified Adam optimizer.""" 16 | 17 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 18 | """Initilize RAdam optimizer.""" 19 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 20 | self.buffer = [[None, None, None] for ind in range(10)] 21 | super(RAdam, self).__init__(params, defaults) 22 | 23 | def __setstate__(self, state): 24 | """Set state.""" 25 | super(RAdam, self).__setstate__(state) 26 | 27 | def step(self, closure=None): 28 | """Run one step.""" 29 | loss = None 30 | if closure is not None: 31 | loss = closure() 32 | 33 | for group in self.param_groups: 34 | 35 | for p in group['params']: 36 | if p.grad is None: 37 | continue 38 | grad = p.grad.data.float() 39 | if grad.is_sparse: 40 | raise RuntimeError('RAdam does not support sparse gradients') 41 | 42 | p_data_fp32 = p.data.float() 43 | 44 | state = self.state[p] 45 | 46 | if len(state) == 0: 47 | state['step'] = 0 48 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 49 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 50 | else: 51 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 52 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 53 | 54 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 55 | beta1, beta2 = group['betas'] 56 | 57 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 58 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 59 | 60 | state['step'] += 1 61 | buffered = self.buffer[int(state['step'] % 10)] 62 | if state['step'] == buffered[0]: 63 | N_sma, step_size = buffered[1], buffered[2] 64 | else: 65 | buffered[0] = state['step'] 66 | beta2_t = beta2 ** state['step'] 67 | N_sma_max = 2 / (1 - beta2) - 1 68 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 69 | buffered[1] = N_sma 70 | 71 | # more conservative since it's an approximated value 72 | if N_sma >= 5: 73 | step_size = math.sqrt( 74 | (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) # NOQA 75 | else: 76 | step_size = 1.0 / (1 - beta1 ** state['step']) 77 | buffered[2] = step_size 78 | 79 | if group['weight_decay'] != 0: 80 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 81 | 82 | # more conservative since it's an approximated value 83 | if N_sma >= 5: 84 | denom = exp_avg_sq.sqrt().add_(group['eps']) 85 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 86 | else: 87 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 88 | 89 | p.data.copy_(p_data_fp32) 90 | 91 | return loss 92 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/stft_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """STFT-based Loss modules.""" 7 | import librosa 8 | import torch 9 | 10 | from modules.parallel_wavegan.losses import LogSTFTMagnitudeLoss, SpectralConvergengeLoss, stft 11 | 12 | 13 | class STFTLoss(torch.nn.Module): 14 | """STFT loss module.""" 15 | 16 | def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window", 17 | use_mel_loss=False): 18 | """Initialize STFT 
loss module.""" 19 | super(STFTLoss, self).__init__() 20 | self.fft_size = fft_size 21 | self.shift_size = shift_size 22 | self.win_length = win_length 23 | self.window = getattr(torch, window)(win_length) 24 | self.spectral_convergenge_loss = SpectralConvergengeLoss() 25 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() 26 | self.use_mel_loss = use_mel_loss 27 | self.mel_basis = None 28 | 29 | def forward(self, x, y): 30 | """Calculate forward propagation. 31 | 32 | Args: 33 | x (Tensor): Predicted signal (B, T). 34 | y (Tensor): Groundtruth signal (B, T). 35 | 36 | Returns: 37 | Tensor: Spectral convergence loss value. 38 | Tensor: Log STFT magnitude loss value. 39 | 40 | """ 41 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) 42 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) 43 | if self.use_mel_loss: 44 | if self.mel_basis is None: 45 | self.mel_basis = torch.from_numpy(librosa.filters.mel(22050, self.fft_size, 80)).cuda().T 46 | x_mag = x_mag @ self.mel_basis 47 | y_mag = y_mag @ self.mel_basis 48 | 49 | sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) 50 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) 51 | 52 | return sc_loss, mag_loss 53 | 54 | 55 | class MultiResolutionSTFTLoss(torch.nn.Module): 56 | """Multi resolution STFT loss module.""" 57 | 58 | def __init__(self, 59 | fft_sizes=[1024, 2048, 512], 60 | hop_sizes=[120, 240, 50], 61 | win_lengths=[600, 1200, 240], 62 | window="hann_window", 63 | use_mel_loss=False): 64 | """Initialize Multi resolution STFT loss module. 65 | 66 | Args: 67 | fft_sizes (list): List of FFT sizes. 68 | hop_sizes (list): List of hop sizes. 69 | win_lengths (list): List of window lengths. 70 | window (str): Window function type. 71 | 72 | """ 73 | super(MultiResolutionSTFTLoss, self).__init__() 74 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) 75 | self.stft_losses = torch.nn.ModuleList() 76 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): 77 | self.stft_losses += [STFTLoss(fs, ss, wl, window, use_mel_loss)] 78 | 79 | def forward(self, x, y): 80 | """Calculate forward propagation. 81 | 82 | Args: 83 | x (Tensor): Predicted signal (B, T). 84 | y (Tensor): Groundtruth signal (B, T). 85 | 86 | Returns: 87 | Tensor: Multi resolution spectral convergence loss value. 88 | Tensor: Multi resolution log STFT magnitude loss value. 89 | 90 | """ 91 | sc_loss = 0.0 92 | mag_loss = 0.0 93 | for f in self.stft_losses: 94 | sc_l, mag_l = f(x, y) 95 | sc_loss += sc_l 96 | mag_loss += mag_l 97 | sc_loss /= len(self.stft_losses) 98 | mag_loss /= len(self.stft_losses) 99 | 100 | return sc_loss, mag_loss 101 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * # NOQA 2 | -------------------------------------------------------------------------------- /hifigan/modules/parallel_wavegan/utils/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Utility functions.""" 7 | 8 | import fnmatch 9 | import logging 10 | import os 11 | import sys 12 | 13 | import h5py 14 | import numpy as np 15 | 16 | 17 | def find_files(root_dir, query="*.wav", include_root_dir=True): 18 | """Find files recursively. 
19 | 20 | Args: 21 | root_dir (str): Root directory to search. 22 | query (str): Query to find. 23 | include_root_dir (bool): If False, root_dir name is not included. 24 | 25 | Returns: 26 | list: List of found filenames. 27 | 28 | """ 29 | files = [] 30 | for root, dirnames, filenames in os.walk(root_dir, followlinks=True): 31 | for filename in fnmatch.filter(filenames, query): 32 | files.append(os.path.join(root, filename)) 33 | if not include_root_dir: 34 | files = [file_.replace(root_dir + "/", "") for file_ in files] 35 | 36 | return files 37 | 38 | 39 | def read_hdf5(hdf5_name, hdf5_path): 40 | """Read hdf5 dataset. 41 | 42 | Args: 43 | hdf5_name (str): Filename of hdf5 file. 44 | hdf5_path (str): Dataset name in hdf5 file. 45 | 46 | Returns: 47 | any: Dataset values. 48 | 49 | """ 50 | if not os.path.exists(hdf5_name): 51 | logging.error(f"There is no such hdf5 file ({hdf5_name}).") 52 | sys.exit(1) 53 | 54 | hdf5_file = h5py.File(hdf5_name, "r") 55 | 56 | if hdf5_path not in hdf5_file: 57 | logging.error(f"There is no such data in the hdf5 file ({hdf5_path}).") 58 | sys.exit(1) 59 | 60 | hdf5_data = hdf5_file[hdf5_path][()] 61 | hdf5_file.close() 62 | 63 | return hdf5_data 64 | 65 | 66 | def write_hdf5(hdf5_name, hdf5_path, write_data, is_overwrite=True): 67 | """Write dataset to hdf5. 68 | 69 | Args: 70 | hdf5_name (str): Hdf5 dataset filename. 71 | hdf5_path (str): Dataset path in hdf5. 72 | write_data (ndarray): Data to write. 73 | is_overwrite (bool): Whether to overwrite dataset. 74 | 75 | """ 76 | # convert to numpy array 77 | write_data = np.array(write_data) 78 | 79 | # check folder existence 80 | folder_name, _ = os.path.split(hdf5_name) 81 | if not os.path.exists(folder_name) and len(folder_name) != 0: 82 | os.makedirs(folder_name) 83 | 84 | # check hdf5 existence 85 | if os.path.exists(hdf5_name): 86 | # if already exists, open with r+ mode 87 | hdf5_file = h5py.File(hdf5_name, "r+") 88 | # check dataset existence 89 | if hdf5_path in hdf5_file: 90 | if is_overwrite: 91 | logging.warning("Dataset in hdf5 file already exists. " 92 | "Recreating dataset in hdf5.") 93 | hdf5_file.__delitem__(hdf5_path) 94 | else: 95 | logging.error("Dataset in hdf5 file already exists. " 96 | "If you want to overwrite, please set is_overwrite = True.") 97 | hdf5_file.close() 98 | sys.exit(1) 99 | else: 100 | # if not exists, open with w mode 101 | hdf5_file = h5py.File(hdf5_name, "w") 102 | 103 | # write data to hdf5 104 | hdf5_file.create_dataset(hdf5_path, data=write_data) 105 | hdf5_file.flush() 106 | hdf5_file.close() 107 | 108 | 109 | class HDF5ScpLoader(object): 110 | """Loader class for a feats.scp file of hdf5 files. 111 | 112 | Examples: 113 | key1 /some/path/a.h5:feats 114 | key2 /some/path/b.h5:feats 115 | key3 /some/path/c.h5:feats 116 | key4 /some/path/d.h5:feats 117 | ... 118 | >>> loader = HDF5ScpLoader("hdf5.scp") 119 | >>> array = loader["key1"] 120 | 121 | key1 /some/path/a.h5 122 | key2 /some/path/b.h5 123 | key3 /some/path/c.h5 124 | key4 /some/path/d.h5 125 | ... 126 | >>> loader = HDF5ScpLoader("hdf5.scp", "feats") 127 | >>> array = loader["key1"] 128 | 129 | """ 130 | 131 | def __init__(self, feats_scp, default_hdf5_path="feats"): 132 | """Initialize HDF5 scp loader. 133 | 134 | Args: 135 | feats_scp (str): Kaldi-style feats.scp file with hdf5 format. 136 | default_hdf5_path (str): Path in hdf5 file. Not used if the scp file already contains it.
137 | 138 | """ 139 | self.default_hdf5_path = default_hdf5_path 140 | with open(feats_scp, encoding='utf-8') as f: 141 | lines = [line.replace("\n", "") for line in f.readlines()] 142 | self.data = {} 143 | for line in lines: 144 | key, value = line.split() 145 | self.data[key] = value 146 | 147 | def get_path(self, key): 148 | """Get hdf5 file path for a given key.""" 149 | return self.data[key] 150 | 151 | def __getitem__(self, key): 152 | """Get ndarray for a given key.""" 153 | p = self.data[key] 154 | if ":" in p: 155 | return read_hdf5(*p.split(":")) 156 | else: 157 | return read_hdf5(p, self.default_hdf5_path) 158 | 159 | def __len__(self): 160 | """Return the length of the scp file.""" 161 | return len(self.data) 162 | 163 | def __iter__(self): 164 | """Return the iterator of the scp file.""" 165 | return iter(self.data) 166 | 167 | def keys(self): 168 | """Return the keys of the scp file.""" 169 | return self.data.keys() 170 | -------------------------------------------------------------------------------- /hifigan/network/vocoders/__init__.py: -------------------------------------------------------------------------------- 1 | from hifigan.network.vocoders import hifigan 2 | from hifigan.network.vocoders import nsf_hifigan 3 | -------------------------------------------------------------------------------- /hifigan/network/vocoders/base_vocoder.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | VOCODERS = {} 3 | 4 | 5 | def register_vocoder(cls): 6 | VOCODERS[cls.__name__.lower()] = cls 7 | VOCODERS[cls.__name__] = cls 8 | return cls 9 | 10 | 11 | def get_vocoder_cls(hparams): 12 | if hparams['vocoder'] in VOCODERS: 13 | return VOCODERS[hparams['vocoder']] 14 | else: 15 | vocoder_cls = hparams['vocoder'] 16 | pkg = ".".join(vocoder_cls.split(".")[:-1]) 17 | cls_name = vocoder_cls.split(".")[-1] 18 | vocoder_cls = getattr(importlib.import_module(pkg), cls_name) 19 | return vocoder_cls 20 | 21 | 22 | class BaseVocoder: 23 | def spec2wav(self, mel): 24 | """ 25 | 26 | :param mel: [T, 80] 27 | :return: wav: [T'] 28 | """ 29 | 30 | raise NotImplementedError 31 | 32 | @staticmethod 33 | def wav2spec(wav_fn): 34 | """ 35 | 36 | :param wav_fn: str 37 | :return: wav, mel: [T, 80] 38 | """ 39 | raise NotImplementedError 40 | -------------------------------------------------------------------------------- /hifigan/network/vocoders/hifigan.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os 4 | import re 5 | 6 | import librosa 7 | import torch 8 | 9 | import utils 10 | from hifigan.modules.hifigan.hifigan import HifiGanGenerator 11 | from hifigan.network.vocoders.base_vocoder import register_vocoder 12 | from hifigan.network.vocoders.pwg import PWG 13 | from hifigan.network.vocoders.vocoder_utils import denoise 14 | 15 | 16 | def load_model(config_path, file_path): 17 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 18 | ext = os.path.splitext(file_path)[-1] 19 | if ext == '.pth': 20 | if '.yaml' in config_path: 21 | config = set_hparams(config_path, global_hparams=False) 22 | elif '.json' in config_path: 23 | config = json.load(open(config_path, 'r', encoding='utf-8')) 24 | model = torch.load(file_path, map_location="cpu") 25 | elif ext == '.ckpt': 26 | ckpt_dict = torch.load(file_path, map_location="cpu") 27 | if '.yaml' in config_path: 28 | config = set_hparams(config_path, global_hparams=False) 29 | state = 
ckpt_dict["state_dict"]["model_gen"] 30 | elif '.json' in config_path: 31 | config = json.load(open(config_path, 'r', encoding='utf-8')) 32 | state = ckpt_dict["generator"] 33 | model = HifiGanGenerator(config) 34 | model.load_state_dict(state, strict=True) 35 | model.remove_weight_norm() 36 | model = model.eval().to(device) 37 | print(f"| Loaded model parameters from {file_path}.") 38 | print(f"| HifiGAN device: {device}.") 39 | return model, config, device 40 | 41 | 42 | total_time = 0 43 | 44 | 45 | @register_vocoder 46 | class HifiGAN(PWG): 47 | def __init__(self): 48 | base_dir = hparams['vocoder_ckpt'] 49 | config_path = f'{base_dir}/config.yaml' 50 | if os.path.exists(config_path): 51 | file_path = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.*'), key= 52 | lambda x: int(re.findall(f'{base_dir}/model_ckpt_steps_(\d+).*', x.replace('\\','/'))[0]))[-1] 53 | print('| load HifiGAN: ', file_path) 54 | self.model, self.config, self.device = load_model(config_path=config_path, file_path=file_path) 55 | else: 56 | config_path = f'{base_dir}/config.json' 57 | ckpt = f'{base_dir}/generator_v1' 58 | if os.path.exists(config_path): 59 | self.model, self.config, self.device = load_model(config_path=config_path, file_path=ckpt) 60 | 61 | def spec2wav(self, mel, **kwargs): 62 | device = self.device 63 | with torch.no_grad(): 64 | c = torch.FloatTensor(mel).unsqueeze(0).transpose(2, 1).to(device) 65 | with utils.Timer('hifigan', print_time=hparams['profile_infer']): 66 | f0 = kwargs.get('f0') 67 | if f0 is not None and hparams.get('use_nsf'): 68 | f0 = torch.FloatTensor(f0[None, :]).to(device) 69 | y = self.model(c, f0).view(-1) 70 | else: 71 | y = self.model(c).view(-1) 72 | wav_out = y.cpu().numpy() 73 | if hparams.get('vocoder_denoise_c', 0.0) > 0: 74 | wav_out = denoise(wav_out, v=hparams['vocoder_denoise_c']) 75 | return wav_out 76 | 77 | # @staticmethod 78 | # def wav2spec(wav_fn, **kwargs): 79 | # wav, _ = librosa.core.load(wav_fn, sr=hparams['audio_sample_rate']) 80 | # wav_torch = torch.FloatTensor(wav)[None, :] 81 | # mel = mel_spectrogram(wav_torch, hparams).numpy()[0] 82 | # return wav, mel.T 83 | -------------------------------------------------------------------------------- /hifigan/network/vocoders/nsf_hifigan.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from hifigan.modules.nsf_hifigan.models import load_model, Generator 4 | from hifigan.modules.nsf_hifigan.nvSTFT import load_wav_to_torch, STFT 5 | from hifigan.network.vocoders.base_vocoder import BaseVocoder, register_vocoder 6 | 7 | @register_vocoder 8 | class NsfHifiGAN(BaseVocoder): 9 | def __init__(self, device=None): 10 | if device is None: 11 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 12 | self.device = device 13 | model_path = "pretrain/nsf-hifigan/model" 14 | if os.path.exists(model_path): 15 | print('| Load HifiGAN: ', model_path) 16 | self.model, self.h = load_model(model_path, device=self.device) 17 | else: 18 | print('Error: HifiGAN model file not found!') 19 | 20 | def spec2wav_torch(self, mel, **kwargs): # mel: [B, T, bins] 21 | if self.h.sampling_rate != self.h['audio_sample_rate']: 22 | print('Mismatch parameters: self.h[\'audio_sample_rate\']=',self.h['audio_sample_rate'],'!=',self.h.sampling_rate,'(vocoder)') 23 | if self.h.num_mels != self.h['audio_num_mel_bins']: 24 | print('Mismatch parameters: self.h[\'audio_num_mel_bins\']=',self.h['audio_num_mel_bins'],'!=',self.h.num_mels,'(vocoder)') 25 | if self.h.n_fft 
!= self.h['fft_size']: 26 | print('Mismatch parameters: self.h[\'fft_size\']=',self.h['fft_size'],'!=',self.h.n_fft,'(vocoder)') 27 | if self.h.win_size != self.h['win_size']: 28 | print('Mismatch parameters: self.h[\'win_size\']=',self.h['win_size'],'!=',self.h.win_size,'(vocoder)') 29 | if self.h.hop_size != self.h['hop_size']: 30 | print('Mismatch parameters: self.h[\'hop_size\']=',self.h['hop_size'],'!=',self.h.hop_size,'(vocoder)') 31 | if self.h.fmin != self.h['fmin']: 32 | print('Mismatch parameters: self.h[\'fmin\']=',self.h['fmin'],'!=',self.h.fmin,'(vocoder)') 33 | if self.h.fmax != self.h['fmax']: 34 | print('Mismatch parameters: self.h[\'fmax\']=',self.h['fmax'],'!=',self.h.fmax,'(vocoder)') 35 | with torch.no_grad(): 36 | c = mel.transpose(2, 1) #[B, T, bins] 37 | #log10 to log mel 38 | c = 2.30259 * c 39 | f0 = kwargs.get('f0') #[B, T] 40 | if f0 is not None and self.h.get('use_nsf'): 41 | y = self.model(c, f0).view(-1) 42 | else: 43 | y = self.model(c).view(-1) 44 | return y 45 | 46 | def spec2wav(self, mel, **kwargs): 47 | with torch.no_grad(): 48 | c = torch.FloatTensor(mel).unsqueeze(0).transpose(2, 1).to(self.device) 49 | #log10 to log mel 50 | c = 2.30259 * c 51 | f0 = kwargs.get('f0') 52 | if f0 is not None : 53 | f0 = torch.FloatTensor(f0[None, :]).to(self.device) 54 | y = self.model(c, f0).view(-1) 55 | wav_out = y.cpu().numpy() 56 | return wav_out 57 | 58 | def decode(self, mel, f0) -> torch.Tensor: 59 | with torch.no_grad(): 60 | c = mel.to(self.device) 61 | #log10 to log mel 62 | c = 2.30259 * c 63 | f0 = f0.to(self.device) 64 | y = self.model(c, f0).view(-1) 65 | wav_out = y.cpu().numpy() 66 | return wav_out 67 | def wav2spec(self, inp_path): 68 | assert inp_path.endswith('.wav') 69 | save_path = inp_path.replace(".wav", ".mel.pt") 70 | if os.path.exists(save_path): 71 | return torch.load(save_path) 72 | sampling_rate = self.h['sampling_rate'] 73 | num_mels = self.h['num_mels'] 74 | n_fft = self.h['n_fft'] 75 | win_size =self.h['win_size'] 76 | hop_size = self.h['hop_size'] 77 | fmin = self.h['fmin'] 78 | fmax = self.h['fmax'] 79 | stft = STFT(sampling_rate, num_mels, n_fft, win_size, hop_size, fmin, fmax) 80 | with torch.no_grad(): 81 | wav_torch, _ = load_wav_to_torch(inp_path, target_sr=stft.target_sr) 82 | mel_torch = stft.get_mel(wav_torch.unsqueeze(0)).squeeze(0).T 83 | #log mel to log10 mel 84 | mel_torch = 0.434294 * mel_torch.T 85 | torch.save(mel_torch, save_path) 86 | return mel_torch -------------------------------------------------------------------------------- /hifigan/network/vocoders/pwg.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import re 3 | import librosa 4 | import torch 5 | import yaml 6 | from sklearn.preprocessing import StandardScaler 7 | from torch import nn 8 | from hifigan.modules.parallel_wavegan.models import ParallelWaveGANGenerator 9 | from hifigan.modules.parallel_wavegan.utils import read_hdf5 10 | from hifigan.network.vocoders.base_vocoder import BaseVocoder, register_vocoder 11 | import numpy as np 12 | 13 | 14 | def load_pwg_model(config_path, checkpoint_path, stats_path): 15 | # load config 16 | with open(config_path, encoding='utf-8') as f: 17 | config = yaml.load(f, Loader=yaml.Loader) 18 | 19 | # setup 20 | if torch.cuda.is_available(): 21 | device = torch.device("cuda") 22 | else: 23 | device = torch.device("cpu") 24 | model = ParallelWaveGANGenerator(**config["generator_params"]) 25 | 26 | ckpt_dict = torch.load(checkpoint_path, map_location="cpu") 27 | if 
'state_dict' not in ckpt_dict: # official vocoder 28 | model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")["model"]["generator"]) 29 | scaler = StandardScaler() 30 | if config["format"] == "hdf5": 31 | scaler.mean_ = read_hdf5(stats_path, "mean") 32 | scaler.scale_ = read_hdf5(stats_path, "scale") 33 | elif config["format"] == "npy": 34 | scaler.mean_ = np.load(stats_path)[0] 35 | scaler.scale_ = np.load(stats_path)[1] 36 | else: 37 | raise ValueError("support only hdf5 or npy format.") 38 | else: # custom PWG vocoder 39 | fake_task = nn.Module() 40 | fake_task.model_gen = model 41 | fake_task.load_state_dict(torch.load(checkpoint_path, map_location="cpu")["state_dict"], strict=False) 42 | scaler = None 43 | 44 | model.remove_weight_norm() 45 | model = model.eval().to(device) 46 | print(f"| Loaded model parameters from {checkpoint_path}.") 47 | print(f"| PWG device: {device}.") 48 | return model, scaler, config, device 49 | 50 | 51 | @register_vocoder 52 | class PWG(BaseVocoder): 53 | def __init__(self): 54 | if hparams['vocoder_ckpt'] == '': # load LJSpeech PWG pretrained model 55 | base_dir = 'wavegan_pretrained' 56 | ckpts = glob.glob(f'{base_dir}/checkpoint-*steps.pkl') 57 | ckpt = sorted(ckpts, key= 58 | lambda x: int(re.findall(f'{base_dir}/checkpoint-(\d+)steps.pkl', x)[0]))[-1] 59 | config_path = f'{base_dir}/config.yaml' 60 | print('| load PWG: ', ckpt) 61 | self.model, self.scaler, self.config, self.device = load_pwg_model( 62 | config_path=config_path, 63 | checkpoint_path=ckpt, 64 | stats_path=f'{base_dir}/stats.h5', 65 | ) 66 | else: 67 | base_dir = hparams['vocoder_ckpt'] 68 | print(base_dir) 69 | config_path = f'{base_dir}/config.yaml' 70 | ckpt = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.ckpt'), key= 71 | lambda x: int(re.findall(f'{base_dir}/model_ckpt_steps_(\d+).ckpt', x)[0]))[-1] 72 | print('| load PWG: ', ckpt) 73 | self.scaler = None 74 | self.model, _, self.config, self.device = load_pwg_model( 75 | config_path=config_path, 76 | checkpoint_path=ckpt, 77 | stats_path=f'{base_dir}/stats.h5', 78 | ) 79 | 80 | def spec2wav(self, mel, **kwargs): 81 | # start generation 82 | config = self.config 83 | device = self.device 84 | pad_size = (config["generator_params"]["aux_context_window"], 85 | config["generator_params"]["aux_context_window"]) 86 | c = mel 87 | if self.scaler is not None: 88 | c = self.scaler.transform(c) 89 | 90 | with torch.no_grad(): 91 | z = torch.randn(1, 1, c.shape[0] * config["hop_size"]).to(device) 92 | c = np.pad(c, (pad_size, (0, 0)), "edge") 93 | c = torch.FloatTensor(c).unsqueeze(0).transpose(2, 1).to(device) 94 | p = kwargs.get('f0') 95 | if p is not None: 96 | p = f0_to_coarse(p) 97 | p = np.pad(p, (pad_size,), "edge") 98 | p = torch.LongTensor(p[None, :]).to(device) 99 | y = self.model(z, c, p).view(-1) 100 | wav_out = y.cpu().numpy() 101 | return wav_out 102 | 103 | @staticmethod 104 | def wav2spec(wav_fn, return_linear=False): 105 | from preprocessing.data_gen_utils import process_utterance 106 | res = process_utterance( 107 | wav_fn, fft_size=hparams['fft_size'], 108 | hop_size=hparams['hop_size'], 109 | win_length=hparams['win_size'], 110 | num_mels=hparams['audio_num_mel_bins'], 111 | fmin=hparams['fmin'], 112 | fmax=hparams['fmax'], 113 | sample_rate=hparams['audio_sample_rate'], 114 | loud_norm=hparams['loud_norm'], 115 | min_level_db=hparams['min_level_db'], 116 | return_linear=return_linear, vocoder='pwg', eps=float(hparams.get('wav2spec_eps', 1e-10))) 117 | if return_linear: 118 | return res[0], res[1].T, 
res[2].T # [T, 80], [T, n_fft] 119 | else: 120 | return res[0], res[1].T 121 | 122 | @staticmethod 123 | def wav2mfcc(wav_fn): 124 | fft_size = hparams['fft_size'] 125 | hop_size = hparams['hop_size'] 126 | win_length = hparams['win_size'] 127 | sample_rate = hparams['audio_sample_rate'] 128 | wav, _ = librosa.core.load(wav_fn, sr=sample_rate) 129 | mfcc = librosa.feature.mfcc(y=wav, sr=sample_rate, n_mfcc=13, 130 | n_fft=fft_size, hop_length=hop_size, 131 | win_length=win_length, pad_mode="constant", power=1.0) 132 | mfcc_delta = librosa.feature.delta(mfcc, order=1) 133 | mfcc_delta_delta = librosa.feature.delta(mfcc, order=2) 134 | mfcc = np.concatenate([mfcc, mfcc_delta, mfcc_delta_delta]).T 135 | return mfcc 136 | -------------------------------------------------------------------------------- /hifigan/network/vocoders/vocoder_utils.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | 3 | import numpy as np 4 | 5 | 6 | def denoise(wav, v=0.1): 7 | spec = librosa.stft(y=wav, n_fft=hparams['fft_size'], hop_length=hparams['hop_size'], 8 | win_length=hparams['win_size'], pad_mode='constant') 9 | spec_m = np.abs(spec) 10 | spec_m = np.clip(spec_m - v, a_min=0, a_max=None) 11 | spec_a = np.angle(spec) 12 | 13 | return librosa.istft(spec_m * np.exp(1j * spec_a), hop_length=hparams['hop_size'], 14 | win_length=hparams['win_size']) 15 | -------------------------------------------------------------------------------- /mel_processing.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | os.environ["LRU_CACHE_CAPACITY"] = "3" 4 | import random 5 | import torch 6 | import torch.utils.data 7 | import numpy as np 8 | import librosa 9 | from librosa.util import normalize 10 | from librosa.filters import mel as librosa_mel_fn 11 | from scipy.io.wavfile import read 12 | import soundfile as sf 13 | 14 | def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False): 15 | sampling_rate = None 16 | try: 17 | data, sampling_rate = sf.read(full_path, always_2d=True)# than soundfile. 18 | except Exception as ex: 19 | print(f"'{full_path}' failed to load.\nException:") 20 | print(ex) 21 | if return_empty_on_exception: 22 | return [], sampling_rate or target_sr or 48000 23 | else: 24 | raise Exception(ex) 25 | 26 | if len(data.shape) > 1: 27 | data = data[:, 0] 28 | assert len(data) > 2# check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension) 29 | 30 | if np.issubdtype(data.dtype, np.integer): # if audio data is type int 31 | max_mag = -np.iinfo(data.dtype).min # maximum magnitude = min possible value of intXX 32 | else: # if audio data is type fp32 33 | max_mag = max(np.amax(data), -np.amin(data)) 34 | max_mag = (2**31)+1 if max_mag > (2**15) else ((2**15)+1 if max_mag > 1.01 else 1.0) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32 35 | 36 | data = torch.FloatTensor(data.astype(np.float32))/max_mag 37 | 38 | if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:# resample will crash with inf/NaN inputs. 
return_empty_on_exception will return empty arr instead of except 39 | return [], sampling_rate or target_sr or 48000 40 | if target_sr is not None and sampling_rate != target_sr: 41 | data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr)) 42 | sampling_rate = target_sr 43 | 44 | return data, sampling_rate 45 | 46 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 47 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 48 | 49 | def dynamic_range_decompression(x, C=1): 50 | return np.exp(x) / C 51 | 52 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 53 | return torch.log(torch.clamp(x, min=clip_val) * C) 54 | 55 | def dynamic_range_decompression_torch(x, C=1): 56 | return torch.exp(x) / C 57 | 58 | class STFT(): 59 | def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5): 60 | self.target_sr = sr 61 | 62 | self.n_mels = n_mels 63 | self.n_fft = n_fft 64 | self.win_size = win_size 65 | self.hop_length = hop_length 66 | self.fmin = fmin 67 | self.fmax = fmax 68 | self.clip_val = clip_val 69 | self.mel_basis = {} 70 | self.hann_window = {} 71 | 72 | def get_mel(self, y, center=False): 73 | sampling_rate = self.target_sr 74 | n_mels = self.n_mels 75 | n_fft = self.n_fft 76 | win_size = self.win_size 77 | hop_length = self.hop_length 78 | fmin = self.fmin 79 | fmax = self.fmax 80 | clip_val = self.clip_val 81 | 82 | if torch.min(y) < -1.: 83 | print('min value is ', torch.min(y)) 84 | if torch.max(y) > 1.: 85 | print('max value is ', torch.max(y)) 86 | 87 | if fmax not in self.mel_basis: 88 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) 89 | self.mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device) 90 | self.hann_window[str(y.device)] = torch.hann_window(self.win_size).to(y.device) 91 | 92 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_length)/2), int((n_fft-hop_length)/2)), mode='reflect') 93 | y = y.squeeze(1) 94 | 95 | spec = torch.stft(y, n_fft, hop_length=hop_length, win_length=win_size, window=self.hann_window[str(y.device)], 96 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 97 | # print(111,spec) 98 | spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) 99 | # print(222,spec) 100 | spec = torch.matmul(self.mel_basis[str(fmax)+'_'+str(y.device)], spec) 101 | # print(333,spec) 102 | spec = dynamic_range_compression_torch(spec, clip_val=clip_val) 103 | # print(444,spec) 104 | return spec 105 | 106 | def __call__(self, audiopath): 107 | audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr) 108 | spect = self.get_mel(audio.unsqueeze(0)).squeeze(0) 109 | return spect 110 | 111 | 112 | def get_mel(wav_torch, sampling_rate, num_mels, n_fft, win_size, hop_size, fmin, fmax): 113 | stft = STFT(sampling_rate, num_mels, n_fft, win_size, hop_size, fmin, fmax) 114 | with torch.no_grad(): 115 | mel_torch = stft.get_mel(wav_torch.unsqueeze(0)).squeeze(0).T 116 | # log mel to log10 mel 117 | mel_torch = 0.434294 * mel_torch.T 118 | return mel_torch 119 | 120 | if __name__ == '__main__': 121 | mel, wav = get_mel("/Users/xingyijin/Downloads/api.wav", 16000, 80, 1024, 256, 80, 20, 11025) 122 | print(mel.shape, wav.shape) -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | 
import numpy as np 4 | import scipy 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | 9 | import commons 10 | 11 | 12 | class LayerNorm(nn.Module): 13 | def __init__(self, channels, eps=1e-4): 14 | super().__init__() 15 | self.channels = channels 16 | self.eps = eps 17 | 18 | self.gamma = nn.Parameter(torch.ones(channels)) 19 | self.beta = nn.Parameter(torch.zeros(channels)) 20 | 21 | def forward(self, x): 22 | n_dims = len(x.shape) 23 | mean = torch.mean(x, 1, keepdim=True) 24 | variance = torch.mean((x -mean)**2, 1, keepdim=True) 25 | 26 | x = (x - mean) * torch.rsqrt(variance + self.eps) 27 | 28 | shape = [1, -1] + [1] * (n_dims - 2) 29 | x = x * self.gamma.view(*shape) + self.beta.view(*shape) 30 | return x 31 | 32 | 33 | class ConvReluNorm(nn.Module): 34 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 35 | super().__init__() 36 | self.in_channels = in_channels 37 | self.hidden_channels = hidden_channels 38 | self.out_channels = out_channels 39 | self.kernel_size = kernel_size 40 | self.n_layers = n_layers 41 | self.p_dropout = p_dropout 42 | assert n_layers > 1, "Number of layers should be larger than 0." 43 | 44 | self.conv_layers = nn.ModuleList() 45 | self.norm_layers = nn.ModuleList() 46 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 47 | self.norm_layers.append(LayerNorm(hidden_channels)) 48 | self.relu_drop = nn.Sequential( 49 | nn.ReLU(), 50 | nn.Dropout(p_dropout)) 51 | for _ in range(n_layers-1): 52 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 53 | self.norm_layers.append(LayerNorm(hidden_channels)) 54 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 55 | self.proj.weight.data.zero_() 56 | self.proj.bias.data.zero_() 57 | 58 | def forward(self, x, x_mask): 59 | x_org = x 60 | for i in range(self.n_layers): 61 | x = self.conv_layers[i](x * x_mask) 62 | x = self.norm_layers[i](x) 63 | x = self.relu_drop(x) 64 | x = x_org + self.proj(x) 65 | return x * x_mask 66 | 67 | 68 | class WN(torch.nn.Module): 69 | def __init__(self, in_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 70 | super(WN, self).__init__() 71 | assert(kernel_size % 2 == 1) 72 | assert(hidden_channels % 2 == 0) 73 | self.in_channels = in_channels 74 | self.hidden_channels =hidden_channels 75 | self.kernel_size = kernel_size, 76 | self.dilation_rate = dilation_rate 77 | self.n_layers = n_layers 78 | self.gin_channels = gin_channels 79 | self.p_dropout = p_dropout 80 | 81 | self.in_layers = torch.nn.ModuleList() 82 | self.res_skip_layers = torch.nn.ModuleList() 83 | self.drop = nn.Dropout(p_dropout) 84 | 85 | if gin_channels != 0: 86 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) 87 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 88 | 89 | for i in range(n_layers): 90 | dilation = dilation_rate ** i 91 | padding = int((kernel_size * dilation - dilation) / 2) 92 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 93 | dilation=dilation, padding=padding) 94 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 95 | self.in_layers.append(in_layer) 96 | 97 | # last one is not necessary 98 | if i < n_layers - 1: 99 | res_skip_channels = 2 * hidden_channels 100 | else: 101 | res_skip_channels = hidden_channels 102 | 103 | res_skip_layer = torch.nn.Conv1d(hidden_channels, 
res_skip_channels, 1) 104 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 105 | self.res_skip_layers.append(res_skip_layer) 106 | 107 | def forward(self, x, x_mask=None, g=None, **kwargs): 108 | output = torch.zeros_like(x) 109 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 110 | 111 | if g is not None: 112 | g = self.cond_layer(g) 113 | 114 | for i in range(self.n_layers): 115 | x_in = self.in_layers[i](x) 116 | x_in = self.drop(x_in) 117 | if g is not None: 118 | cond_offset = i * 2 * self.hidden_channels 119 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 120 | else: 121 | g_l = torch.zeros_like(x_in) 122 | 123 | acts = commons.fused_add_tanh_sigmoid_multiply( 124 | x_in, 125 | g_l, 126 | n_channels_tensor) 127 | 128 | res_skip_acts = self.res_skip_layers[i](acts) 129 | if i < self.n_layers - 1: 130 | x = (x + res_skip_acts[:,:self.hidden_channels,:]) * x_mask 131 | output = output + res_skip_acts[:,self.hidden_channels:,:] 132 | else: 133 | output = output + res_skip_acts 134 | return output * x_mask 135 | 136 | def remove_weight_norm(self): 137 | if self.gin_channels != 0: 138 | torch.nn.utils.remove_weight_norm(self.cond_layer) 139 | for l in self.in_layers: 140 | torch.nn.utils.remove_weight_norm(l) 141 | for l in self.res_skip_layers: 142 | torch.nn.utils.remove_weight_norm(l) 143 | 144 | 145 | class ActNorm(nn.Module): 146 | def __init__(self, channels, ddi=False, **kwargs): 147 | super().__init__() 148 | self.channels = channels 149 | self.initialized = not ddi 150 | 151 | self.logs = nn.Parameter(torch.zeros(1, channels, 1)) 152 | self.bias = nn.Parameter(torch.zeros(1, channels, 1)) 153 | 154 | def forward(self, x, x_mask=None, reverse=False, **kwargs): 155 | if x_mask is None: 156 | x_mask = torch.ones(x.size(0), 1, x.size(2)).to(device=x.device, dtype=x.dtype) 157 | x_len = torch.sum(x_mask, [1, 2]) 158 | if not self.initialized: 159 | self.initialize(x, x_mask) 160 | self.initialized = True 161 | 162 | if reverse: 163 | z = (x - self.bias) * torch.exp(-self.logs) * x_mask 164 | logdet = None 165 | else: 166 | z = (self.bias + torch.exp(self.logs) * x) * x_mask 167 | logdet = torch.sum(self.logs) * x_len # [b] 168 | 169 | return z, logdet 170 | 171 | def store_inverse(self): 172 | pass 173 | 174 | def set_ddi(self, ddi): 175 | self.initialized = not ddi 176 | 177 | def initialize(self, x, x_mask): 178 | with torch.no_grad(): 179 | denom = torch.sum(x_mask, [0, 2]) 180 | m = torch.sum(x * x_mask, [0, 2]) / denom 181 | m_sq = torch.sum(x * x * x_mask, [0, 2]) / denom 182 | v = m_sq - (m ** 2) 183 | logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6)) 184 | 185 | bias_init = (-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype) 186 | logs_init = (-logs).view(*self.logs.shape).to(dtype=self.logs.dtype) 187 | 188 | self.bias.data.copy_(bias_init) 189 | self.logs.data.copy_(logs_init) 190 | 191 | 192 | class InvConvNear(nn.Module): 193 | def __init__(self, channels, n_split=4, no_jacobian=False, **kwargs): 194 | super().__init__() 195 | assert(n_split % 2 == 0) 196 | self.channels = channels 197 | self.n_split = n_split 198 | self.no_jacobian = no_jacobian 199 | 200 | w_init = torch.qr(torch.FloatTensor(self.n_split, self.n_split).normal_())[0] 201 | if torch.det(w_init) < 0: 202 | w_init[:,0] = -1 * w_init[:,0] 203 | self.weight = nn.Parameter(w_init) 204 | 205 | def forward(self, x, x_mask=None, reverse=False, **kwargs): 206 | b, c, t = x.size() 207 | assert(c % self.n_split == 0) 208 | if x_mask is None: 
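            # note added to this listing: with no mask provided, a scalar 1 is broadcast so the
            # masking below becomes a no-op and x_len falls back to the full length t per batch item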
209 | x_mask = 1 210 | x_len = torch.ones((b,), dtype=x.dtype, device=x.device) * t 211 | else: 212 | x_len = torch.sum(x_mask, [1, 2]) 213 | 214 | x = x.view(b, 2, c // self.n_split, self.n_split // 2, t) 215 | x = x.permute(0, 1, 3, 2, 4).contiguous().view(b, self.n_split, c // self.n_split, t) 216 | 217 | if reverse: 218 | if hasattr(self, "weight_inv"): 219 | weight = self.weight_inv 220 | else: 221 | weight = torch.inverse(self.weight.float()).to(dtype=self.weight.dtype) 222 | logdet = None 223 | else: 224 | weight = self.weight 225 | if self.no_jacobian: 226 | logdet = 0 227 | else: 228 | logdet = torch.logdet(self.weight) * (c / self.n_split) * x_len # [b] 229 | 230 | weight = weight.view(self.n_split, self.n_split, 1, 1) 231 | z = F.conv2d(x, weight) 232 | 233 | z = z.view(b, 2, self.n_split // 2, c // self.n_split, t) 234 | z = z.permute(0, 1, 3, 2, 4).contiguous().view(b, c, t) * x_mask 235 | return z, logdet 236 | 237 | def store_inverse(self): 238 | self.weight_inv = torch.inverse(self.weight.float()).to(dtype=self.weight.dtype) 239 | -------------------------------------------------------------------------------- /preprocess_flist_config.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import tqdm 4 | 5 | import os 6 | data_all = [] 7 | spk2id = {} 8 | current_spk = 0 9 | for spk in os.listdir('dataset'): 10 | if os.path.isdir(os.path.join('dataset', spk)): 11 | for wav in os.listdir(os.path.join('dataset', spk)): 12 | if wav.endswith('wav'): 13 | name = wav.split('.')[0] 14 | data_all.append(f"{name}|{spk}\n") 15 | if spk not in spk2id.keys(): 16 | spk2id[spk] = current_spk 17 | current_spk+=1 18 | 19 | 20 | import random 21 | random.shuffle(data_all) 22 | data_train = data_all[:-5] 23 | data_val = data_all[-5:] 24 | with open('filelists/train.list', 'w', encoding='utf-8') as f: 25 | for line in data_train: 26 | f.write(line) 27 | 28 | with open('filelists/val.list', 'w', encoding='utf-8') as f: 29 | for line in data_val: 30 | f.write(line) 31 | 32 | template = json.load(open('configs/config.json', 'r', encoding='utf-8')) 33 | template["data"]['spk2id'] = spk2id 34 | json.dump(template, open('configs/config.json', 'w', encoding='utf-8'), indent=4, ensure_ascii=False) 35 | 36 | -------------------------------------------------------------------------------- /pretrain/content-vec-best/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_dropout": 0.1, 3 | "apply_spec_augment": true, 4 | "architectures": [ 5 | "HubertModelWithFinalProj" 6 | ], 7 | "attention_dropout": 0.1, 8 | "bos_token_id": 1, 9 | "classifier_proj_size": 256, 10 | "conv_bias": false, 11 | "conv_dim": [ 12 | 512, 13 | 512, 14 | 512, 15 | 512, 16 | 512, 17 | 512, 18 | 512 19 | ], 20 | "conv_kernel": [ 21 | 10, 22 | 3, 23 | 3, 24 | 3, 25 | 3, 26 | 2, 27 | 2 28 | ], 29 | "conv_stride": [ 30 | 5, 31 | 2, 32 | 2, 33 | 2, 34 | 2, 35 | 2, 36 | 2 37 | ], 38 | "ctc_loss_reduction": "sum", 39 | "ctc_zero_infinity": false, 40 | "do_stable_layer_norm": false, 41 | "eos_token_id": 2, 42 | "feat_extract_activation": "gelu", 43 | "feat_extract_norm": "group", 44 | "feat_proj_dropout": 0.0, 45 | "feat_proj_layer_norm": true, 46 | "final_dropout": 0.1, 47 | "hidden_act": "gelu", 48 | "hidden_dropout": 0.1, 49 | "hidden_size": 768, 50 | "initializer_range": 0.02, 51 | "intermediate_size": 3072, 52 | "layer_norm_eps": 1e-05, 53 | "layerdrop": 0.1, 54 | "mask_feature_length": 10, 55 | "mask_feature_min_masks": 
0, 56 | "mask_feature_prob": 0.0, 57 | "mask_time_length": 10, 58 | "mask_time_min_masks": 2, 59 | "mask_time_prob": 0.05, 60 | "model_type": "hubert", 61 | "num_attention_heads": 12, 62 | "num_conv_pos_embedding_groups": 16, 63 | "num_conv_pos_embeddings": 128, 64 | "num_feat_extract_layers": 7, 65 | "num_hidden_layers": 12, 66 | "pad_token_id": 0, 67 | "torch_dtype": "float32", 68 | "transformers_version": "4.27.3", 69 | "use_weighted_layer_sum": false, 70 | "vocab_size": 32 71 | } -------------------------------------------------------------------------------- /pretrain/fcpe/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/innnky/glow-svc/15d280a2b87a5938da3cb68e85273102a42b0d75/pretrain/fcpe/.gitkeep -------------------------------------------------------------------------------- /pretrain/nsf-hifigan/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 4, 4 | "batch_size": 10, 5 | "learning_rate": 0.0002, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [ 8, 8, 2, 2, 2], 12 | "upsample_kernel_sizes": [16,16, 4, 4, 4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | "discriminator_periods": [3, 5, 7, 11, 17, 23, 37], 17 | 18 | "segment_size": 16384, 19 | "num_mels": 128, 20 | "num_freq": 1025, 21 | "n_fft" : 2048, 22 | "hop_size": 512, 23 | "win_size": 2048, 24 | 25 | "sampling_rate": 44100, 26 | 27 | "fmin": 40, 28 | "fmax": 16000, 29 | "fmax_for_loss": null, 30 | 31 | "num_workers": 16, 32 | 33 | "dist_config": { 34 | "dist_backend": "nccl", 35 | "dist_url": "tcp://localhost:54321", 36 | "world_size": 1 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /pretrain/nsf-hifigan/put_441hifigan_ckpt_here: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/innnky/glow-svc/15d280a2b87a5938da3cb68e85273102a42b0d75/pretrain/nsf-hifigan/put_441hifigan_ckpt_here -------------------------------------------------------------------------------- /resample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import librosa 4 | import numpy as np 5 | from multiprocessing import Pool, cpu_count 6 | 7 | import soundfile 8 | from scipy.io import wavfile 9 | from tqdm import tqdm 10 | 11 | 12 | def process(item): 13 | spkdir, wav_name, args = item 14 | # speaker 's5', 'p280', 'p315' are excluded, 15 | speaker = spkdir.replace("\\", "/").split("/")[-1] 16 | wav_path = os.path.join(args.in_dir, speaker, wav_name) 17 | if os.path.exists(wav_path) and '.wav' in wav_path: 18 | os.makedirs(os.path.join(args.out_dir2, speaker), exist_ok=True) 19 | wav, sr = librosa.load(wav_path, sr=args.sr2) 20 | soundfile.write( 21 | os.path.join(args.out_dir2, speaker, wav_name), 22 | wav, 23 | sr 24 | ) 25 | 26 | 27 | 28 | if __name__ == "__main__": 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--sr2", type=int, default=44100, help="sampling rate") 31 | parser.add_argument("--in_dir", type=str, default="./raw", help="path to source dir") 32 | parser.add_argument("--out_dir2", type=str, default="./dataset", help="path to target dir") 33 | args = parser.parse_args() 34 | processs = 8 35 | pool = 
Pool(processes=processs) 36 | 37 | for speaker in os.listdir(args.in_dir): 38 | spk_dir = os.path.join(args.in_dir, speaker) 39 | if os.path.isdir(spk_dir): 40 | print(spk_dir) 41 | for _ in tqdm(pool.imap_unordered(process, [(spk_dir, i, args) for i in os.listdir(spk_dir) if i.endswith("wav")])): 42 | pass 43 | -------------------------------------------------------------------------------- /stft.py: -------------------------------------------------------------------------------- 1 | """ 2 | BSD 3-Clause License 3 | 4 | Copyright (c) 2017, Prem Seetharaman 5 | All rights reserved. 6 | 7 | * Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, this 14 | list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from this 19 | software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | """ 32 | 33 | import torch 34 | import numpy as np 35 | import torch.nn.functional as F 36 | from torch.autograd import Variable 37 | from scipy.signal import get_window 38 | from librosa.util import pad_center, tiny 39 | from librosa import stft, istft 40 | from audio_processing import window_sumsquare 41 | 42 | 43 | class STFT(torch.nn.Module): 44 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 45 | def __init__(self, filter_length=800, hop_length=200, win_length=800, 46 | window='hann'): 47 | super(STFT, self).__init__() 48 | self.filter_length = filter_length 49 | self.hop_length = hop_length 50 | self.win_length = win_length 51 | self.window = window 52 | self.forward_transform = None 53 | scale = self.filter_length / self.hop_length 54 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 55 | 56 | cutoff = int((self.filter_length / 2 + 1)) 57 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 58 | np.imag(fourier_basis[:cutoff, :])]) 59 | 60 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 61 | inverse_basis = torch.FloatTensor( 62 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 63 | 64 | if window is not None: 65 | assert(filter_length >= win_length) 66 | # get window and zero center pad it to filter_length 67 | fft_window = get_window(window, win_length, fftbins=True) 68 | fft_window = pad_center(fft_window, filter_length) 69 | fft_window = torch.from_numpy(fft_window).float() 70 | 71 | # window the bases 72 | forward_basis *= fft_window 73 | inverse_basis *= fft_window 74 | 75 | self.register_buffer('forward_basis', forward_basis.float()) 76 | self.register_buffer('inverse_basis', inverse_basis.float()) 77 | 78 | def transform(self, input_data): 79 | num_batches = input_data.size(0) 80 | num_samples = input_data.size(1) 81 | 82 | self.num_samples = num_samples 83 | 84 | if input_data.device.type == "cuda": 85 | # similar to librosa, reflect-pad the input 86 | input_data = input_data.view(num_batches, 1, num_samples) 87 | input_data = F.pad( 88 | input_data.unsqueeze(1), 89 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 90 | mode='reflect') 91 | input_data = input_data.squeeze(1) 92 | 93 | forward_transform = F.conv1d( 94 | input_data, 95 | self.forward_basis, 96 | stride=self.hop_length, 97 | padding=0) 98 | 99 | cutoff = int((self.filter_length / 2) + 1) 100 | real_part = forward_transform[:, :cutoff, :] 101 | imag_part = forward_transform[:, cutoff:, :] 102 | else: 103 | x = input_data.detach().numpy() 104 | real_part = [] 105 | imag_part = [] 106 | for y in x: 107 | y_ = stft(y, self.filter_length, self.hop_length, self.win_length, self.window) 108 | real_part.append(y_.real[None,:,:]) 109 | imag_part.append(y_.imag[None,:,:]) 110 | real_part = np.concatenate(real_part, 0) 111 | imag_part = np.concatenate(imag_part, 0) 112 | 113 | real_part = torch.from_numpy(real_part).to(input_data.dtype) 114 | imag_part = torch.from_numpy(imag_part).to(input_data.dtype) 115 | 116 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 117 | phase = torch.atan2(imag_part.data, real_part.data) 118 | 119 | return magnitude, phase 120 | 121 | def inverse(self, magnitude, phase): 122 | recombine_magnitude_phase = torch.cat( 123 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 124 | 125 | if magnitude.device.type == "cuda": 126 | inverse_transform = F.conv_transpose1d( 127 | recombine_magnitude_phase, 128 | self.inverse_basis, 129 | stride=self.hop_length, 130 | padding=0) 131 | 132 | if 
self.window is not None: 133 | window_sum = window_sumsquare( 134 | self.window, magnitude.size(-1), hop_length=self.hop_length, 135 | win_length=self.win_length, n_fft=self.filter_length, 136 | dtype=np.float32) 137 | # remove modulation effects 138 | approx_nonzero_indices = torch.from_numpy( 139 | np.where(window_sum > tiny(window_sum))[0]) 140 | window_sum = torch.from_numpy(window_sum).to(inverse_transform.device) 141 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 142 | 143 | # scale by hop ratio 144 | inverse_transform *= float(self.filter_length) / self.hop_length 145 | 146 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 147 | inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] 148 | inverse_transform = inverse_transform.squeeze(1) 149 | else: 150 | x_org = recombine_magnitude_phase.detach().numpy() 151 | n_b, n_f, n_t = x_org.shape 152 | x = np.empty([n_b, n_f//2, n_t], dtype=np.complex64) 153 | x.real = x_org[:,:n_f//2] 154 | x.imag = x_org[:,n_f//2:] 155 | inverse_transform = [] 156 | for y in x: 157 | y_ = istft(y, self.hop_length, self.win_length, self.window) 158 | inverse_transform.append(y_[None,:]) 159 | inverse_transform = np.concatenate(inverse_transform, 0) 160 | inverse_transform = torch.from_numpy(inverse_transform).to(recombine_magnitude_phase.dtype) 161 | 162 | return inverse_transform 163 | 164 | def forward(self, input_data): 165 | self.magnitude, self.phase = self.transform(input_data) 166 | reconstruction = self.inverse(self.magnitude, self.phase) 167 | return reconstruction 168 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from tqdm import tqdm 4 | from torch.utils.data import DataLoader 5 | from torch.utils.tensorboard import SummaryWriter 6 | import torch.multiprocessing as mp 7 | import torch.distributed as dist 8 | from torch.nn.parallel import DistributedDataParallel as DDP 9 | import logging 10 | logging.getLogger("matplotlib").setLevel(logging.INFO) 11 | logging.getLogger("h5py").setLevel(logging.INFO) 12 | logging.getLogger("numba").setLevel(logging.INFO) 13 | 14 | from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate 15 | import models 16 | import commons 17 | import utils 18 | from hifigan import NsfHifiGAN 19 | 20 | global_step = 0 21 | 22 | 23 | def main(): 24 | """Assume Single Node Multi GPUs Training Only""" 25 | assert torch.cuda.is_available(), "CPU training is not allowed." 
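# Single-node multi-GPU training: main() spawns one worker per visible GPU via mp.spawn, and each worker joins the default process group through the env:// rendezvous configured below (MASTER_ADDR=localhost, MASTER_PORT=7998; the backend is gloo on Windows and nccl elsewhere, see train_and_eval).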
26 | 27 | n_gpus = torch.cuda.device_count() 28 | os.environ['MASTER_ADDR'] = 'localhost' 29 | os.environ['MASTER_PORT'] = '7998' 30 | 31 | hps = utils.get_hparams() 32 | mp.spawn(train_and_eval, nprocs=n_gpus, args=(n_gpus, hps,)) 33 | 34 | 35 | def train_and_eval(rank, n_gpus, hps): 36 | global global_step 37 | if rank == 0: 38 | logger = utils.get_logger(hps.model_dir) 39 | logger.info(hps) 40 | utils.check_git_hash(hps.model_dir) 41 | writer = SummaryWriter(log_dir=hps.model_dir) 42 | writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) 43 | 44 | dist.init_process_group(backend= 'gloo' if os.name == 'nt' else 'nccl', init_method='env://', world_size=n_gpus, rank=rank) 45 | torch.manual_seed(hps.train.seed) 46 | torch.cuda.set_device(rank) 47 | 48 | train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data) 49 | train_sampler = torch.utils.data.distributed.DistributedSampler( 50 | train_dataset, 51 | num_replicas=n_gpus, 52 | rank=rank, 53 | shuffle=True) 54 | collate_fn = TextAudioSpeakerCollate() 55 | train_loader = DataLoader(train_dataset, num_workers=3, shuffle=False, 56 | batch_size=hps.train.batch_size, pin_memory=True, 57 | drop_last=True, collate_fn=collate_fn, sampler=train_sampler, persistent_workers=True) 58 | if rank == 0: 59 | val_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data, val=True) 60 | val_loader = DataLoader(val_dataset, num_workers=0, shuffle=False, 61 | batch_size=1, pin_memory=True, 62 | drop_last=True, collate_fn=collate_fn) 63 | 64 | generator = models.FlowGenerator( 65 | n_vocab=0, 66 | out_channels=hps.data.n_mel_channels, 67 | **hps.model).cuda(rank) 68 | # vocoder = Vocos.from_pretrained('vocos/config.yaml', 'vocos/pytorch_model.bin').cuda() 69 | vocoder = NsfHifiGAN('cuda') 70 | optimizer_g = commons.Adam(generator.parameters(), scheduler=hps.train.scheduler, 71 | dim_model=hps.model.hidden_channels, warmup_steps=hps.train.warmup_steps, 72 | lr=hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps) 73 | 74 | 75 | # optimizer_g = commons.Adam(generator.parameters(), scheduler=hps.train.scheduler, 76 | # dim_model=hps.model.hidden_channels, warmup_steps=hps.train.warmup_steps, 77 | # lr=hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps) 78 | generator = DDP(generator) 79 | epoch_str = 1 80 | global_step = 0 81 | # 82 | # try: 83 | # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), generator, 84 | # optimizer_g) 85 | # optimizer_g.step_num = (epoch_str - 1) * len(train_loader) 86 | # optimizer_g._update_learning_rate() 87 | # global_step = (epoch_str - 1) * len(train_loader) 88 | # except: 89 | # if hps.train.ddi and os.path.isfile(os.path.join(hps.model_dir, "ddi_G.pth")): 90 | # _ = utils.load_checkpoint(os.path.join(hps.model_dir, "ddi_G.pth"), generator, optimizer_g) 91 | # 92 | skip_optimizer = False 93 | try: 94 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), generator, 95 | optimizer_g, False) 96 | epoch_str += 1 97 | optimizer_g.step_num = (epoch_str - 1) * len(train_loader) 98 | optimizer_g._update_learning_rate() 99 | global_step = (epoch_str - 1) * len(train_loader) 100 | except: 101 | epoch_str = 1 102 | global_step = 0 103 | if skip_optimizer: 104 | epoch_str = 1 105 | global_step = 0 106 | 107 | for epoch in range(epoch_str, hps.train.epochs + 1): 108 | if rank == 0: 109 | save_interval = 5 110 | if epoch % save_interval == 0: 111 | evaluate(rank, epoch, hps, 
generator, optimizer_g, val_loader, logger, writer_eval, vocoder) 112 | train(rank, epoch, hps, generator, optimizer_g, train_loader, logger, writer) 113 | if epoch % save_interval == 0: 114 | utils.save_checkpoint(generator, optimizer_g, hps.train.learning_rate, epoch, 115 | os.path.join(hps.model_dir, "G_{}.pth".format(epoch))) 116 | try: 117 | to_remove_path = os.path.join(hps.model_dir, "G_{}.pth".format(epoch - save_interval* 3)) 118 | os.remove(to_remove_path) 119 | print(f'removing {to_remove_path}') 120 | except: 121 | print(f'removing {to_remove_path} failed') 122 | else: 123 | train(rank, epoch, hps, generator, optimizer_g, train_loader, None, None) 124 | 125 | 126 | def train(rank, epoch, hps, generator, optimizer_g, train_loader, logger, writer): 127 | train_loader.sampler.set_epoch(epoch) 128 | global global_step 129 | 130 | generator.train() 131 | for batch_idx, (x, mel,mel_lengths,wav, wav_lengths, speakers, f0) in enumerate(tqdm(train_loader)): 132 | mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda(rank, non_blocking=True) 133 | speakers = speakers.cuda(rank, non_blocking=True) 134 | x = x.cuda(rank, non_blocking=True) 135 | f0 = f0.cuda(rank, non_blocking=True) 136 | 137 | # Train Generator 138 | optimizer_g.zero_grad() 139 | 140 | (z, z_m, z_logs, logdet, z_mask), l_noise = generator(x, mel, mel_lengths,f0, g=speakers, gen=False) 141 | l_mle = commons.mle_loss(z, z_m, z_logs, logdet, z_mask) 142 | 143 | loss_gs = [l_mle, l_noise] 144 | loss_g = sum(loss_gs) 145 | 146 | loss_g.backward() 147 | grad_norm = commons.clip_grad_value_(generator.parameters(), 5) 148 | optimizer_g.step() 149 | if rank == 0: 150 | if batch_idx % hps.train.log_interval == 0: 151 | y_gen, _ = generator.module(x[:1],f0=f0[:1], g=speakers[:1], gen=True, glow=True) 152 | logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 153 | epoch, batch_idx * len(x), len(train_loader.dataset), 154 | 100. 
* batch_idx / len(train_loader), 155 | loss_g.item())) 156 | lr = optimizer_g._optim.param_groups[0]['lr'] 157 | logger.info([x.item() for x in loss_gs] + [global_step, lr]) 158 | 159 | scalar_dict = {"loss/g/total": loss_g, "learning_rate": lr, "grad_norm": grad_norm} 160 | scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(loss_gs)}) 161 | utils.summarize( 162 | writer=writer, 163 | global_step=global_step, 164 | images={"train/gt/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), 165 | "train/gen/mel": utils.plot_spectrogram_to_numpy(y_gen[0].data.cpu().numpy()) 166 | }, 167 | scalars=scalar_dict) 168 | global_step += 1 169 | 170 | if rank == 0: 171 | logger.info('====> Epoch: {}'.format(epoch)) 172 | 173 | 174 | def evaluate(rank, epoch, hps, generator, optimizer_g, val_loader, logger, writer_eval, vocoder): 175 | if rank == 0: 176 | global global_step 177 | generator.eval() 178 | audio_dict = {} 179 | img_dict = {} 180 | with torch.no_grad(): 181 | for batch_idx, (x, mel,mel_lengths,wav, wav_lengths, speakers, f0) in enumerate( 182 | val_loader): 183 | mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda(rank, non_blocking=True) 184 | speakers = speakers.cuda(rank, non_blocking=True) 185 | x = x.cuda(rank, non_blocking=True) 186 | f0 = f0.cuda(rank, non_blocking=True) 187 | 188 | mel_flow, pred_f0 = generator.module(x, f0=f0, g=speakers, gen=True, glow=True) 189 | y_flow = vocoder.spec2wav(mel_flow.squeeze(0).transpose(0, 1).cpu().numpy(), 190 | f0=pred_f0[0, 0, :].cpu().numpy()) 191 | 192 | # mel_diff, pred_f0 = generator.module(x, f0=f0,g=speakers, gen=True, glow=False) 193 | # y_diff = vocoder.spec2wav(mel_diff.squeeze(0).transpose(0, 1).cpu().numpy(), 194 | # f0=pred_f0[0, 0, :].cpu().numpy()) 195 | 196 | 197 | y_rec = vocoder.spec2wav(mel.squeeze(0).transpose(0, 1).cpu().numpy(), 198 | f0=f0[0, :].cpu().numpy()) 199 | 200 | img_dict.update({f"gt/mel_{batch_idx}": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), 201 | f"gen/mel_flow_{batch_idx}": utils.plot_spectrogram_to_numpy(mel_flow[0].data.cpu().numpy()), 202 | # f"gen/mel_diff_{batch_idx}": utils.plot_spectrogram_to_numpy(mel_diff[0].data.cpu().numpy()), 203 | }) 204 | audio_dict.update({ 205 | # "gen/wav_gen_{}_diff".format(batch_idx): y_diff, 206 | "gen/wav_gen_{}_flow".format(batch_idx): y_flow, 207 | "gt/wav_gen_{}_rec".format(batch_idx): y_rec, 208 | }) 209 | 210 | utils.summarize( 211 | writer=writer_eval, 212 | global_step=global_step, 213 | images=img_dict, 214 | audios=audio_dict, 215 | audio_sampling_rate=hps.data.sampling_rate 216 | ) 217 | logger.info('====> Epoch: {}'.format(epoch)) 218 | 219 | 220 | if __name__ == "__main__": 221 | main() 222 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import sys 4 | import argparse 5 | import logging 6 | import json 7 | import subprocess 8 | import numpy as np 9 | from scipy.io.wavfile import read 10 | import torch 11 | 12 | MATPLOTLIB_FLAG = False 13 | 14 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 15 | logger = logging 16 | 17 | 18 | def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False): 19 | assert os.path.isfile(checkpoint_path) 20 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') 21 | iteration = 1 22 | if 'iteration' in checkpoint_dict.keys(): 23 | iteration = 
checkpoint_dict['iteration'] 24 | # default to None so the value returned below is always defined 25 | learning_rate = checkpoint_dict.get('learning_rate', None) 26 | if optimizer is not None and 'optimizer' in checkpoint_dict.keys() and not skip_optimizer: 27 | optimizer.load_state_dict(checkpoint_dict['optimizer']) 28 | saved_state_dict = checkpoint_dict['model'] 29 | if hasattr(model, 'module'): 30 | state_dict = model.module.state_dict() 31 | else: 32 | state_dict = model.state_dict() 33 | new_state_dict = {} 34 | for k, v in state_dict.items(): 35 | try: 36 | new_state_dict[k] = saved_state_dict[k] 37 | assert saved_state_dict[k].shape == v.shape, (saved_state_dict[k].shape, v.shape) 38 | except: 39 | print("%s is not in the checkpoint" % k) 40 | new_state_dict[k] = v 41 | if hasattr(model, 'module'): 42 | model.module.load_state_dict(new_state_dict) 43 | else: 44 | model.load_state_dict(new_state_dict) 45 | logger.info("Loaded checkpoint '{}' (iteration {})".format( 46 | checkpoint_path, iteration)) 47 | return model, optimizer, learning_rate, iteration 48 | 49 | 50 | def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): 51 | logger.info("Saving model and optimizer state at iteration {} to {}".format( 52 | iteration, checkpoint_path)) 53 | if hasattr(model, 'module'): 54 | state_dict = model.module.state_dict() 55 | else: 56 | state_dict = model.state_dict() 57 | torch.save({'model': state_dict, 58 | 'iteration': iteration, 59 | 'optimizer': optimizer.state_dict(), 60 | 'learning_rate': learning_rate}, checkpoint_path) 61 | 62 | 63 | def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=None): 64 | for k, v in scalars.items(): 65 | writer.add_scalar(k, v, global_step) 66 | for k, v in histograms.items(): 67 | writer.add_histogram(k, v, global_step) 68 | for k, v in images.items(): 69 | writer.add_image(k, v, global_step, dataformats='HWC') 70 | for k, v in audios.items(): 71 | writer.add_audio(k, v, global_step, audio_sampling_rate) 72 | 73 | 74 | def latest_checkpoint_path(dir_path, regex="G_*.pth"): 75 | f_list = glob.glob(os.path.join(dir_path, regex)) 76 | f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) 77 | x = f_list[-1] 78 | print(x) 79 | return x 80 | 81 | 82 | def plot_spectrogram_to_numpy(spectrogram): 83 | global MATPLOTLIB_FLAG 84 | if not MATPLOTLIB_FLAG: 85 | import matplotlib 86 | matplotlib.use("Agg") 87 | MATPLOTLIB_FLAG = True 88 | mpl_logger = logging.getLogger('matplotlib') 89 | mpl_logger.setLevel(logging.WARNING) 90 | import matplotlib.pylab as plt 91 | import numpy as np 92 | 93 | fig, ax = plt.subplots(figsize=(10, 2)) 94 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 95 | interpolation='none') 96 | plt.colorbar(im, ax=ax) 97 | plt.xlabel("Frames") 98 | plt.ylabel("Channels") 99 | plt.tight_layout() 100 | 101 | fig.canvas.draw() 102 | data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8) 103 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 104 | plt.close() 105 | return data 106 | 107 | 108 | def plot_alignment_to_numpy(alignment, info=None): 109 | global MATPLOTLIB_FLAG 110 | if not MATPLOTLIB_FLAG: 111 | import matplotlib 112 | matplotlib.use("Agg") 113 | MATPLOTLIB_FLAG = True 114 | mpl_logger = logging.getLogger('matplotlib') 115 | mpl_logger.setLevel(logging.WARNING) 116 | import matplotlib.pylab as plt 117 | import numpy as np 118 | 119 | fig, ax = plt.subplots(figsize=(6, 4)) 120 | im = ax.imshow(alignment, aspect='auto', origin='lower', 
121 | interpolation='none') 122 | fig.colorbar(im, ax=ax) 123 | xlabel = 'Decoder timestep' 124 | if info is not None: 125 | xlabel += '\n\n' + info 126 | plt.xlabel(xlabel) 127 | plt.ylabel('Encoder timestep') 128 | plt.tight_layout() 129 | 130 | fig.canvas.draw() 131 | data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8) 132 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 133 | plt.close() 134 | return data 135 | 136 | 137 | def load_wav_to_torch(full_path): 138 | sampling_rate, data = read(full_path) 139 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate 140 | 141 | 142 | def load_filepaths_and_text(filename, split="|"): 143 | with open(filename, encoding='utf-8') as f: 144 | filepaths_and_text = [line.strip().split(split) for line in f] 145 | return filepaths_and_text 146 | 147 | 148 | def get_hparams(init=True): 149 | parser = argparse.ArgumentParser() 150 | parser.add_argument('-c', '--config', type=str, default="./configs/config.json", 151 | help='JSON file for configuration') 152 | parser.add_argument('-m', '--model', type=str, required=True, 153 | help='Model name') 154 | 155 | args = parser.parse_args() 156 | model_dir = os.path.join("./logs", args.model) 157 | 158 | if not os.path.exists(model_dir): 159 | os.makedirs(model_dir) 160 | 161 | config_path = args.config 162 | config_save_path = os.path.join(model_dir, "config.json") 163 | if init: 164 | with open(config_path, "r") as f: 165 | data = f.read() 166 | with open(config_save_path, "w") as f: 167 | f.write(data) 168 | else: 169 | with open(config_save_path, "r") as f: 170 | data = f.read() 171 | config = json.loads(data) 172 | 173 | hparams = HParams(**config) 174 | hparams.model_dir = model_dir 175 | return hparams 176 | 177 | 178 | def get_hparams_from_dir(model_dir): 179 | config_save_path = os.path.join(model_dir, "config.json") 180 | with open(config_save_path, "r") as f: 181 | data = f.read() 182 | config = json.loads(data) 183 | 184 | hparams = HParams(**config) 185 | hparams.model_dir = model_dir 186 | return hparams 187 | 188 | 189 | def get_hparams_from_file(config_path): 190 | with open(config_path, "r") as f: 191 | data = f.read() 192 | config = json.loads(data) 193 | 194 | hparams = HParams(**config) 195 | return hparams 196 | 197 | 198 | def check_git_hash(model_dir): 199 | source_dir = os.path.dirname(os.path.realpath(__file__)) 200 | if not os.path.exists(os.path.join(source_dir, ".git")): 201 | logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format( 202 | source_dir 203 | )) 204 | return 205 | 206 | cur_hash = subprocess.getoutput("git rev-parse HEAD") 207 | 208 | path = os.path.join(model_dir, "githash") 209 | if os.path.exists(path): 210 | saved_hash = open(path).read() 211 | if saved_hash != cur_hash: 212 | logger.warning("git hash values are different. 
{}(saved) != {}(current)".format( 213 | saved_hash[:8], cur_hash[:8])) 214 | else: 215 | open(path, "w").write(cur_hash) 216 | 217 | 218 | def get_logger(model_dir, filename="train.log"): 219 | global logger 220 | logger = logging.getLogger(os.path.basename(model_dir)) 221 | logger.setLevel(logging.DEBUG) 222 | 223 | formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") 224 | if not os.path.exists(model_dir): 225 | os.makedirs(model_dir) 226 | h = logging.FileHandler(os.path.join(model_dir, filename)) 227 | h.setLevel(logging.DEBUG) 228 | h.setFormatter(formatter) 229 | logger.addHandler(h) 230 | return logger 231 | 232 | 233 | class HParams(): 234 | def __init__(self, **kwargs): 235 | for k, v in kwargs.items(): 236 | if type(v) == dict: 237 | v = HParams(**v) 238 | self[k] = v 239 | 240 | def keys(self): 241 | return self.__dict__.keys() 242 | 243 | def items(self): 244 | return self.__dict__.items() 245 | 246 | def values(self): 247 | return self.__dict__.values() 248 | 249 | def __len__(self): 250 | return len(self.__dict__) 251 | 252 | def __getitem__(self, key): 253 | return getattr(self, key) 254 | 255 | def __setitem__(self, key, value): 256 | return setattr(self, key, value) 257 | 258 | def __contains__(self, key): 259 | return key in self.__dict__ 260 | 261 | def __repr__(self): 262 | return self.__dict__.__repr__() 263 | 264 | 265 | 266 | def plot_data_to_numpy(x, y): 267 | global MATPLOTLIB_FLAG 268 | if not MATPLOTLIB_FLAG: 269 | import matplotlib 270 | matplotlib.use("Agg") 271 | MATPLOTLIB_FLAG = True 272 | mpl_logger = logging.getLogger('matplotlib') 273 | mpl_logger.setLevel(logging.WARNING) 274 | import matplotlib.pylab as plt 275 | import numpy as np 276 | 277 | fig, ax = plt.subplots(figsize=(10, 2)) 278 | plt.plot(x) 279 | plt.plot(y) 280 | plt.tight_layout() 281 | 282 | fig.canvas.draw() 283 | data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8) 284 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 285 | plt.close() 286 | return data 287 | --------------------------------------------------------------------------------
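Illustrative note (not a file in the repository): the generation path exercised by evaluate() in train.py — the FlowGenerator called with gen=True, glow=True, then NsfHifiGAN.spec2wav applied to the predicted mel and f0 — can also be driven standalone. The sketch below only rearranges calls that appear in the code above (models.FlowGenerator, utils.get_hparams_from_file, utils.latest_checkpoint_path, utils.load_checkpoint, NsfHifiGAN.spec2wav, soundfile.write). The script name, the content.npy / f0.npy inputs, the logs/model_name directory and the "spk0" speaker key are placeholders, and the feature/f0 layout is assumed to match whatever the training data pipeline feeds the model; treat it as a minimal sketch under those assumptions, not a shipped inference tool.

```python
# infer_sketch.py -- hypothetical standalone generation, mirroring evaluate() in train.py
import numpy as np
import soundfile
import torch

import models
import utils
from hifigan import NsfHifiGAN

hps = utils.get_hparams_from_file("configs/config.json")

# build the flow model exactly as train.py does and load the latest checkpoint
generator = models.FlowGenerator(
    n_vocab=0,
    out_channels=hps.data.n_mel_channels,
    **hps.model).cuda()
ckpt = utils.latest_checkpoint_path("logs/model_name", "G_*.pth")  # placeholder run dir
utils.load_checkpoint(ckpt, generator, None, skip_optimizer=True)
generator.eval()

vocoder = NsfHifiGAN('cuda')

# Assumption: contentvec features and f0 were precomputed by the extraction
# scripts and saved as .npy; their layout must match what the training
# data pipeline produces (not reproduced here).
x = torch.from_numpy(np.load("content.npy")).float().unsqueeze(0).cuda()
f0 = torch.from_numpy(np.load("f0.npy")).float().unsqueeze(0).cuda()
speakers = torch.LongTensor([hps.data.spk2id["spk0"]]).cuda()  # "spk0" is a placeholder key

with torch.no_grad():
    # same call pattern as evaluate(): reverse the flow and take the predicted f0
    mel, pred_f0 = generator(x, f0=f0, g=speakers, gen=True, glow=True)

wav = vocoder.spec2wav(mel.squeeze(0).transpose(0, 1).cpu().numpy(),
                       f0=pred_f0[0, 0, :].cpu().numpy())
soundfile.write("out.wav", wav, hps.data.sampling_rate)
```

As in evaluate(), the vocoder is driven by the model's predicted f0 (pred_f0); passing the input f0 instead would reproduce the ground-truth-pitch reconstruction path.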