├── .gitignore
├── LICENSE
├── README.md
├── attentions.py
├── commons.py
├── configs
    └── config.json
├── convert.py
├── convert.txt
├── data_utils.py
├── downsample.py
├── filelists
    ├── test.txt
    ├── train.txt
    └── val.txt
├── hubert_model.py
├── losses.py
├── mel_processing.py
├── models.py
├── modules.py
├── preprocess_flist.py
├── preprocess_hubert_f0.py
├── raw
    └── wav_structure.txt
├── requirements.txt
├── resources
    ├── infer.png
    └── train.png
├── speaker_encoder
    ├── __init__.py
    ├── audio.py
    ├── ckpt
    │   ├── pretrained_bak_5805000.pt
    │   └── pretrained_bak_5805000.pt.txt
    ├── compute_embed.py
    ├── config.py
    ├── data_objects
    │   ├── __init__.py
    │   ├── random_cycler.py
    │   ├── speaker.py
    │   ├── speaker_batch.py
    │   ├── speaker_verification_dataset.py
    │   └── utterance.py
    ├── hparams.py
    ├── inference.py
    ├── model.py
    ├── params_data.py
    ├── params_model.py
    ├── preprocess.py
    ├── train.py
    ├── visualizations.py
    └── voice_encoder.py
├── train.py
├── utils.py
└── vdecoder
    ├── __init__.py
    └── hifigan
        ├── env.py
        ├── models.py
        ├── nvSTFT.py
        └── utils.py


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | DUMMY
3 | dataset
4 | logs
5 | outputs
6 | hifigan/generator_v1
7 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Jingyi Li
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Free SVC
2 | 
3 | 基于 [FreeVC](https://github.com/olawod/freevc) 的歌声音色转换模型
4 | 
5 | 已弃坑，还是使用sovits方案
6 | 


--------------------------------------------------------------------------------
/attentions.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | import math
  3 | import numpy as np
  4 | import torch
  5 | from torch import nn
  6 | from torch.nn import functional as F
  7 | 
  8 | import commons
  9 | import modules
 10 | from modules import LayerNorm
 11 |    
 12 | 
 13 | class Encoder(nn.Module):
 14 |   def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
 15 |     super().__init__()
 16 |     self.hidden_channels = hidden_channels
 17 |     self.filter_channels = filter_channels
 18 |     self.n_heads = n_heads
 19 |     self.n_layers = n_layers
 20 |     self.kernel_size = kernel_size
 21 |     self.p_dropout = p_dropout
 22 |     self.window_size = window_size
 23 | 
 24 |     self.drop = nn.Dropout(p_dropout)
 25 |     self.attn_layers = nn.ModuleList()
 26 |     self.norm_layers_1 = nn.ModuleList()
 27 |     self.ffn_layers = nn.ModuleList()
 28 |     self.norm_layers_2 = nn.ModuleList()
 29 |     for i in range(self.n_layers):
 30 |       self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
 31 |       self.norm_layers_1.append(LayerNorm(hidden_channels))
 32 |       self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
 33 |       self.norm_layers_2.append(LayerNorm(hidden_channels))
 34 | 
 35 |   def forward(self, x, x_mask):
 36 |     attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
 37 |     x = x * x_mask
 38 |     for i in range(self.n_layers):
 39 |       y = self.attn_layers[i](x, x, attn_mask)
 40 |       y = self.drop(y)
 41 |       x = self.norm_layers_1[i](x + y)
 42 | 
 43 |       y = self.ffn_layers[i](x, x_mask)
 44 |       y = self.drop(y)
 45 |       x = self.norm_layers_2[i](x + y)
 46 |     x = x * x_mask
 47 |     return x
 48 | 
 49 | 
 50 | class Decoder(nn.Module):
 51 |   def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
 52 |     super().__init__()
 53 |     self.hidden_channels = hidden_channels
 54 |     self.filter_channels = filter_channels
 55 |     self.n_heads = n_heads
 56 |     self.n_layers = n_layers
 57 |     self.kernel_size = kernel_size
 58 |     self.p_dropout = p_dropout
 59 |     self.proximal_bias = proximal_bias
 60 |     self.proximal_init = proximal_init
 61 | 
 62 |     self.drop = nn.Dropout(p_dropout)
 63 |     self.self_attn_layers = nn.ModuleList()
 64 |     self.norm_layers_0 = nn.ModuleList()
 65 |     self.encdec_attn_layers = nn.ModuleList()
 66 |     self.norm_layers_1 = nn.ModuleList()
 67 |     self.ffn_layers = nn.ModuleList()
 68 |     self.norm_layers_2 = nn.ModuleList()
 69 |     for i in range(self.n_layers):
 70 |       self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
 71 |       self.norm_layers_0.append(LayerNorm(hidden_channels))
 72 |       self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
 73 |       self.norm_layers_1.append(LayerNorm(hidden_channels))
 74 |       self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
 75 |       self.norm_layers_2.append(LayerNorm(hidden_channels))
 76 | 
 77 |   def forward(self, x, x_mask, h, h_mask):
 78 |     """
 79 |     x: decoder input
 80 |     h: encoder output
 81 |     """
 82 |     self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
 83 |     encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
 84 |     x = x * x_mask
 85 |     for i in range(self.n_layers):
 86 |       y = self.self_attn_layers[i](x, x, self_attn_mask)
 87 |       y = self.drop(y)
 88 |       x = self.norm_layers_0[i](x + y)
 89 | 
 90 |       y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
 91 |       y = self.drop(y)
 92 |       x = self.norm_layers_1[i](x + y)
 93 |       
 94 |       y = self.ffn_layers[i](x, x_mask)
 95 |       y = self.drop(y)
 96 |       x = self.norm_layers_2[i](x + y)
 97 |     x = x * x_mask
 98 |     return x
 99 | 
100 | 
class MultiHeadAttention(nn.Module):
  """Multi-head scaled dot-product attention on ``[b, d, t]`` tensors.

  Optional extras, each enabled by its constructor argument:
    window_size: learned relative-position embeddings for keys and values
      covering offsets in [-window_size, window_size] (self-attention only).
    proximal_bias: adds ``-log(1 + |i - j|)`` to the logits (self-attention only).
    block_length: restricts attention to a band of that half-width; only
      applied when a mask is passed (self-attention only).
    proximal_init: starts the key projection as a copy of the query projection.

  The attention probabilities of the last call are kept in ``self.attn``.
  """

  def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
    super().__init__()
    assert channels % n_heads == 0

    self.channels = channels
    self.out_channels = out_channels
    self.n_heads = n_heads
    self.p_dropout = p_dropout
    self.window_size = window_size
    self.heads_share = heads_share
    self.block_length = block_length
    self.proximal_bias = proximal_bias
    self.proximal_init = proximal_init
    self.attn = None

    self.k_channels = channels // n_heads
    self.conv_q = nn.Conv1d(channels, channels, 1)
    self.conv_k = nn.Conv1d(channels, channels, 1)
    self.conv_v = nn.Conv1d(channels, channels, 1)
    self.conv_o = nn.Conv1d(channels, out_channels, 1)
    self.drop = nn.Dropout(p_dropout)

    if window_size is not None:
      # One embedding table shared by all heads, or one table per head.
      rel_heads = 1 if heads_share else n_heads
      rel_shape = (rel_heads, window_size * 2 + 1, self.k_channels)
      rel_scale = self.k_channels ** -0.5
      self.emb_rel_k = nn.Parameter(torch.randn(*rel_shape) * rel_scale)
      self.emb_rel_v = nn.Parameter(torch.randn(*rel_shape) * rel_scale)

    for projection in (self.conv_q, self.conv_k, self.conv_v):
      nn.init.xavier_uniform_(projection.weight)
    if proximal_init:
      with torch.no_grad():
        self.conv_k.weight.copy_(self.conv_q.weight)
        self.conv_k.bias.copy_(self.conv_q.bias)

  def forward(self, x, c, attn_mask=None):
    """Attend from ``x`` (queries) to ``c`` (keys/values); returns [b, out_channels, t_x]."""
    queries = self.conv_q(x)
    keys = self.conv_k(c)
    values = self.conv_v(c)

    attended, self.attn = self.attention(queries, keys, values, mask=attn_mask)
    return self.conv_o(attended)

  def attention(self, query, key, value, mask=None):
    """Return ``(output [b, d, t_t], probabilities [b, n_h, t_t, t_s])``."""
    b, d, t_s = key.size()
    t_t = query.size(2)

    def split_heads(tensor, length):
      # [b, d, t] -> [b, n_h, t, d_k]
      return tensor.view(b, self.n_heads, self.k_channels, length).transpose(2, 3)

    query = split_heads(query, t_t)
    key = split_heads(key, t_s)
    value = split_heads(value, t_s)

    scaled_query = query / math.sqrt(self.k_channels)
    scores = torch.matmul(scaled_query, key.transpose(-2, -1))
    if self.window_size is not None:
      assert t_s == t_t, "Relative attention is only available for self-attention."
      rel_keys = self._get_relative_embeddings(self.emb_rel_k, t_s)
      rel_logits = self._matmul_with_relative_keys(scaled_query, rel_keys)
      scores = scores + self._relative_position_to_absolute_position(rel_logits)
    if self.proximal_bias:
      assert t_s == t_t, "Proximal bias is only available for self-attention."
      scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
    if mask is not None:
      # A large negative logit rather than -inf keeps softmax finite.
      scores = scores.masked_fill(mask == 0, -1e4)
      if self.block_length is not None:
        assert t_s == t_t, "Local attention is only available for self-attention."
        band = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
        scores = scores.masked_fill(band == 0, -1e4)
    p_attn = self.drop(F.softmax(scores, dim=-1))  # [b, n_h, t_t, t_s]
    output = torch.matmul(p_attn, value)
    if self.window_size is not None:
      rel_weights = self._absolute_position_to_relative_position(p_attn)
      rel_values = self._get_relative_embeddings(self.emb_rel_v, t_s)
      output = output + self._matmul_with_relative_values(rel_weights, rel_values)
    # [b, n_h, t_t, d_k] -> [b, d, t_t]
    output = output.transpose(2, 3).contiguous().view(b, d, t_t)
    return output, p_attn

  def _matmul_with_relative_values(self, x, y):
    """
    x: [b, h, l, m]
    y: [h or 1, m, d]
    ret: [b, h, l, d]
    """
    return torch.matmul(x, y.unsqueeze(0))

  def _matmul_with_relative_keys(self, x, y):
    """
    x: [b, h, l, d]
    y: [h or 1, m, d]
    ret: [b, h, l, m]
    """
    return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))

  def _get_relative_embeddings(self, relative_embeddings, length):
    """Return the ``2 * length - 1`` embedding rows for a sequence of ``length``.

    The table is zero-padded when the sequence reaches beyond the window and
    sliced when it is shorter, so no branching on tensor values is needed.
    """
    reach = self.window_size + 1
    pad_length = max(length - reach, 0)
    start = max(reach - length, 0)
    end = start + 2 * length - 1
    if pad_length > 0:
      relative_embeddings = F.pad(
          relative_embeddings,
          commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
    return relative_embeddings[:, start:end]

  def _relative_position_to_absolute_position(self, x):
    """
    x: [b, h, l, 2*l-1]
    ret: [b, h, l, l]
    """
    batch, heads, length, _ = x.size()
    # One extra column shifts each row by one once the tensor is flattened.
    padded = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

    # Flatten and append elements so the total reshapes to (l+1, 2*l-1).
    flat = padded.view([batch, heads, length * 2 * length])
    flat = F.pad(flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))

    # Reshape, then drop the padded row and the leading columns.
    skewed = flat.view([batch, heads, length + 1, 2 * length - 1])
    return skewed[:, :, :length, length - 1:]

  def _absolute_position_to_relative_position(self, x):
    """
    x: [b, h, l, l]
    ret: [b, h, l, 2*l-1]
    """
    batch, heads, length, _ = x.size()
    # Pad along the column axis.
    padded = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
    flat = padded.view([batch, heads, length**2 + length * (length - 1)])
    # Leading zeros skew the elements after the reshape below.
    flat = F.pad(flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
    return flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]

  def _attention_bias_proximal(self, length):
    """Bias for self-attention to encourage attention to close positions.
    Args:
      length: an integer scalar.
    Returns:
      a Tensor with shape [1, 1, length, length]
    """
    positions = torch.arange(length, dtype=torch.float32)
    distance = torch.abs(positions.unsqueeze(0) - positions.unsqueeze(1))
    return (-torch.log1p(distance)).unsqueeze(0).unsqueeze(0)
256 | 
class FFN(nn.Module):
  """Two-layer 1-D convolutional feed-forward block with masking.

  ``activation="gelu"`` selects the sigmoid approximation ``x * sigmoid(1.702 * x)``;
  any other value selects ReLU. ``causal=True`` pads only on the left so
  an output frame never depends on later input frames.
  """

  def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
    super().__init__()
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.filter_channels = filter_channels
    self.kernel_size = kernel_size
    self.p_dropout = p_dropout
    self.activation = activation
    self.causal = causal

    # Bound method used by forward() to pad before each convolution.
    self.padding = self._causal_padding if causal else self._same_padding

    self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
    self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
    self.drop = nn.Dropout(p_dropout)

  def forward(self, x, x_mask):
    """Apply conv -> activation -> dropout -> conv, masking before each conv and at the end."""
    hidden = self.conv_1(self.padding(x * x_mask))
    if self.activation == "gelu":
      hidden = hidden * torch.sigmoid(1.702 * hidden)
    else:
      hidden = torch.relu(hidden)
    hidden = self.drop(hidden)
    hidden = self.conv_2(self.padding(hidden * x_mask))
    return hidden * x_mask

  def _causal_padding(self, x):
    """Left-pad the last axis by ``kernel_size - 1`` (no-op for kernel size 1)."""
    return self._pad_last_axis(x, self.kernel_size - 1, 0)

  def _same_padding(self, x):
    """Pad both sides of the last axis so the convolution keeps the length (no-op for kernel size 1)."""
    return self._pad_last_axis(x, (self.kernel_size - 1) // 2, self.kernel_size // 2)

  def _pad_last_axis(self, x, pad_l, pad_r):
    """Zero-pad the last axis of a 3-D tensor; returns ``x`` itself when the kernel size is 1."""
    if self.kernel_size == 1:
      return x
    return F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [pad_l, pad_r]]))
304 | 


--------------------------------------------------------------------------------
/commons.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import torch
  4 | from torch import nn
  5 | from torch.nn import functional as F
  6 | 
  7 | def slice_pitch_segments(x, ids_str, segment_size=4):
  8 |   ret = torch.zeros_like(x[:, :segment_size])
  9 |   for i in range(x.size(0)):
 10 |     idx_str = ids_str[i]
 11 |     idx_end = idx_str + segment_size
 12 |     ret[i] = x[i, idx_str:idx_end]
 13 |   return ret
 14 | 
 15 | def rand_slice_segments_with_pitch(x, pitch, x_lengths=None, segment_size=4):
 16 |   b, d, t = x.size()
 17 |   if x_lengths is None:
 18 |     x_lengths = t
 19 |   ids_str_max = x_lengths - segment_size + 1
 20 |   ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
 21 |   ret = slice_segments(x, ids_str, segment_size)
 22 |   ret_pitch = slice_pitch_segments(pitch, ids_str, segment_size)
 23 |   return ret, ret_pitch, ids_str
 24 | 
 25 | def init_weights(m, mean=0.0, std=0.01):
 26 |   classname = m.__class__.__name__
 27 |   if classname.find("Conv") != -1:
 28 |     m.weight.data.normal_(mean, std)
 29 | 
 30 | 
 31 | def get_padding(kernel_size, dilation=1):
 32 |   return int((kernel_size*dilation - dilation)/2)
 33 | 
 34 | 
 35 | def convert_pad_shape(pad_shape):
 36 |   l = pad_shape[::-1]
 37 |   pad_shape = [item for sublist in l for item in sublist]
 38 |   return pad_shape
 39 | 
 40 | 
 41 | def intersperse(lst, item):
 42 |   result = [item] * (len(lst) * 2 + 1)
 43 |   result[1::2] = lst
 44 |   return result
 45 | 
 46 | 
 47 | def kl_divergence(m_p, logs_p, m_q, logs_q):
 48 |   """KL(P||Q)"""
 49 |   kl = (logs_q - logs_p) - 0.5
 50 |   kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
 51 |   return kl
 52 | 
 53 | 
 54 | def rand_gumbel(shape):
 55 |   """Sample from the Gumbel distribution, protect from overflows."""
 56 |   uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
 57 |   return -torch.log(-torch.log(uniform_samples))
 58 | 
 59 | 
 60 | def rand_gumbel_like(x):
 61 |   g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
 62 |   return g
 63 | 
 64 | 
 65 | def slice_segments(x, ids_str, segment_size=4):
 66 |   ret = torch.zeros_like(x[:, :, :segment_size])
 67 |   for i in range(x.size(0)):
 68 |     idx_str = ids_str[i]
 69 |     idx_end = idx_str + segment_size
 70 |     ret[i] = x[i, :, idx_str:idx_end]
 71 |   return ret
 72 | 
 73 | 
 74 | def rand_slice_segments(x, x_lengths=None, segment_size=4):
 75 |   b, d, t = x.size()
 76 |   if x_lengths is None:
 77 |     x_lengths = t
 78 |   ids_str_max = x_lengths - segment_size + 1
 79 |   ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
 80 |   ret = slice_segments(x, ids_str, segment_size)
 81 |   return ret, ids_str
 82 | 
 83 | 
 84 | def rand_spec_segments(x, x_lengths=None, segment_size=4):
 85 |   b, d, t = x.size()
 86 |   if x_lengths is None:
 87 |     x_lengths = t
 88 |   ids_str_max = x_lengths - segment_size
 89 |   ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
 90 |   ret = slice_segments(x, ids_str, segment_size)
 91 |   return ret, ids_str
 92 | 
 93 | 
 94 | def get_timing_signal_1d(
 95 |     length, channels, min_timescale=1.0, max_timescale=1.0e4):
 96 |   position = torch.arange(length, dtype=torch.float)
 97 |   num_timescales = channels // 2
 98 |   log_timescale_increment = (
 99 |       math.log(float(max_timescale) / float(min_timescale)) /
100 |       (num_timescales - 1))
101 |   inv_timescales = min_timescale * torch.exp(
102 |       torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
103 |   scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
104 |   signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
105 |   signal = F.pad(signal, [0, 0, 0, channels % 2])
106 |   signal = signal.view(1, channels, length)
107 |   return signal
108 | 
109 | 
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
  """Add the sinusoidal position signal to ``x`` ([b, channels, length])."""
  _, channels, length = x.size()
  timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
  timing = timing.to(dtype=x.dtype, device=x.device)
  return x + timing
114 | 
115 | 
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
  """Concatenate the sinusoidal position signal to ``x`` ([b, channels, length]) along ``axis``."""
  _, channels, length = x.size()
  timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
  timing = timing.to(dtype=x.dtype, device=x.device)
  return torch.cat([x, timing], axis)
120 | 
121 | 
def subsequent_mask(length):
  """Return a [1, 1, length, length] lower-triangular mask of ones (diagonal included)."""
  lower_triangle = torch.ones(length, length).tril()
  return lower_triangle[None, None]
125 | 
126 | 
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
  """Gated activation: ``tanh(first part) * sigmoid(second part)`` of ``input_a + input_b``.

  The sum is split along dim 1 at index ``n_channels[0]``: channels before
  the split go through tanh, the remaining channels through sigmoid, and the
  two halves are multiplied element-wise.

  input_a, input_b: tensors indexed as [b, channels, t]
  n_channels: indexable whose first element is the split point -- presumably
    a one-element integer tensor; confirm against the callers.
  """
  n_channels_int = n_channels[0]
  in_act = input_a + input_b
  t_act = torch.tanh(in_act[:, :n_channels_int, :])
  s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
  acts = t_act * s_act
  return acts
135 | 
136 | 
def convert_pad_shape(pad_shape):
  """Flatten per-dimension ``[before, after]`` pairs, last dimension first.

  NOTE: this repeats the definition of the same name earlier in this module;
  being the later one, it is the binding in effect after import.
  """
  flattened = []
  for dim_pads in pad_shape[::-1]:
    flattened.extend(dim_pads)
  return flattened
141 | 
142 | 
def shift_1d(x):
  """Shift a 3-D tensor one step to the right along its last axis.

  A zero is inserted at the front and the last element is dropped, so the
  shape is unchanged.
  """
  left_pad = convert_pad_shape([[0, 0], [0, 0], [1, 0]])
  padded = F.pad(x, left_pad)
  return padded[:, :, :-1]
146 | 
147 | 
def sequence_mask(length, max_length=None):
  """Return a boolean mask whose row ``i`` is True at positions below ``length[i]``.

  length: 1-D tensor of sequence lengths
  max_length: number of mask columns; defaults to ``length.max()``
  """
  if max_length is None:
    max_length = length.max()
  positions = torch.arange(max_length, dtype=length.dtype, device=length.device)
  return positions.unsqueeze(0) < length.unsqueeze(1)
153 | 
154 | 
def generate_path(duration, mask):
  """Turn per-step durations into a hard monotonic alignment path.

  duration: [b, 1, t_x]
  mask: [b, 1, t_y, t_x]
  ret: tensor shaped like ``mask``; output frames up to the cumulative
    duration of step ``j`` and past that of step ``j - 1`` are set for column ``j``.
  """
  b, _, t_y, t_x = mask.shape
  cum_duration_flat = torch.cumsum(duration, -1).view(b * t_x)

  # Row j marks every frame before the end of step j ...
  covered = sequence_mask(cum_duration_flat, t_y).to(mask.dtype).view(b, t_x, t_y)
  # ... and subtracting the previous row leaves only the frames of step j.
  previous = F.pad(covered, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
  path = covered - previous
  return path.unsqueeze(1).transpose(2, 3) * mask
171 | 
172 | 
def clip_grad_value_(parameters, clip_value, norm_type=2):
  """Clamp gradients in place to [-clip_value, clip_value] and return their total norm.

  parameters: a tensor or an iterable of tensors; entries without a gradient are skipped
  clip_value: clamp bound, or None to only measure the norm
  norm_type: order of the norm
  ret: total ``norm_type``-norm of the gradients, measured before clamping
  """
  if isinstance(parameters, torch.Tensor):
    parameters = [parameters]
  with_grad = [p for p in parameters if p.grad is not None]
  norm_type = float(norm_type)
  bound = None if clip_value is None else float(clip_value)

  accumulated = 0
  for param in with_grad:
    grad = param.grad.data
    accumulated += grad.norm(norm_type).item() ** norm_type
    if bound is not None:
      grad.clamp_(min=-bound, max=bound)
  return accumulated ** (1. / norm_type)
189 | 


--------------------------------------------------------------------------------
/configs/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "train": {
 3 |     "log_interval": 200,
 4 |     "eval_interval": 200,
 5 |     "seed": 1234,
 6 |     "epochs": 10000,
 7 |     "learning_rate": 2e-4,
 8 |     "betas": [0.8, 0.99],
 9 |     "eps": 1e-9,
10 |     "batch_size": 16,
11 |     "fp16_run": false,
12 |     "lr_decay": 0.999875,
13 |     "segment_size": 17920,
14 |     "init_lr_ratio": 1,
15 |     "warmup_epochs": 0,
16 |     "c_mel": 45,
17 |     "c_kl": 1.0,
18 |     "use_sr": true,
19 |     "max_speclen": 384,
20 |     "port": "8001"
21 |   },
22 |   "data": {
23 |     "training_files":"filelists/train.txt",
24 |     "validation_files":"filelists/val.txt",
25 |     "max_wav_value": 32768.0,
26 |     "sampling_rate": 48000,
27 |     "filter_length": 1280,
28 |     "hop_length": 320,
29 |     "win_length": 1280,
30 |     "n_mel_channels": 80,
31 |     "mel_fmin": 0.0,
32 |     "mel_fmax": null
33 |   },
34 |   "model": {
35 |     "inter_channels": 192,
36 |     "hidden_channels": 192,
37 |     "filter_channels": 768,
38 |     "n_heads": 2,
39 |     "n_layers": 6,
40 |     "kernel_size": 3,
41 |     "p_dropout": 0.1,
42 |     "resblock": "1",
43 |     "resblock_kernel_sizes": [3,7,11],
44 |     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
45 |     "upsample_rates": [10,8,2,2],
46 |     "upsample_initial_channel": 512,
47 |     "upsample_kernel_sizes": [16,16,4,4],
48 |     "n_layers_q": 3,
49 |     "use_spectral_norm": false,
50 |     "gin_channels": 256,
51 |     "ssl_dim": 256
52 |   },
53 |   "spk":{
54 |     "nen": 0,
55 |     "paimon": 1,
56 |     "yunhao": 2
57 |   }
58 | }
59 | 


--------------------------------------------------------------------------------
/convert.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import argparse
  3 | 
  4 | import numpy
  5 | import torch
  6 | import librosa
  7 | import time
  8 | from scipy.io.wavfile import write
  9 | from tqdm import tqdm
 10 | 
 11 | import utils
 12 | from models import SynthesizerTrn
 13 | from mel_processing import mel_spectrogram_torch
 14 | from speaker_encoder.voice_encoder import SpeakerEncoder
 15 | import logging
 16 | 
 17 | import parselmouth
 18 | import numpy as np
 19 | 
 20 | 
 21 | def stft(y):
 22 |     return librosa.stft(
 23 |         y=y,
 24 |         n_fft=1280,
 25 |         hop_length=160,
 26 |         win_length=1280,
 27 |     )
 28 | 
 29 | 
 30 | def energy(y):
 31 |     # Extract energy
 32 |     S = librosa.magphase(stft(y))[0]
 33 |     e = np.sqrt(np.sum(S ** 2, axis=0))  # np.linalg.norm(S, axis=0)
 34 |     return e.squeeze()  # (Number of frames) => (654,)
 35 | 
 36 | 
 37 | def get_energy(path, p_len=None):
 38 |     wav, sr = librosa.load(path, 16000)
 39 |     e = energy(wav)
 40 |     if p_len is None:
 41 |         p_len = wav.shape[0] // 160
 42 |     assert e.shape[0] - p_len < 2, (e.shape[0], p_len)
 43 |     e = e[: p_len]
 44 |     return e
 45 | 
 46 | 
 47 | def get_f0(path, p_len=None, f0_up_key=0):
 48 |     x, _ = librosa.load(path, 16000)
 49 |     if p_len is None:
 50 |         p_len = x.shape[0] // 160
 51 |     else:
 52 |         assert abs(p_len - x.shape[0] // 160) < 2, (path, p_len, x.shape)
 53 |     time_step = 160 / 16000 * 1000
 54 |     f0_min = 50
 55 |     f0_max = 1100
 56 |     f0_mel_min = 1127 * np.log(1 + f0_min / 700)
 57 |     f0_mel_max = 1127 * np.log(1 + f0_max / 700)
 58 | 
 59 |     f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
 60 |         time_step=time_step / 1000, voicing_threshold=0.6,
 61 |         pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
 62 | 
 63 |     pad_size = (p_len - len(f0) + 1) // 2
 64 |     if (pad_size > 0 or p_len - len(f0) - pad_size > 0):
 65 |         f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')
 66 | 
 67 |     f0bak = f0.copy()
 68 |     f0 *= pow(2, f0_up_key / 12)
 69 |     f0_mel = 1127 * np.log(1 + f0 / 700)
 70 |     f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
 71 |     f0_mel[f0_mel <= 1] = 1
 72 |     f0_mel[f0_mel > 255] = 255
 73 |     f0_coarse = np.rint(f0_mel).astype(np.int)
 74 |     return f0_coarse, f0bak
 75 | 
 76 | 
 77 | logging.getLogger('numba').setLevel(logging.WARNING)
 78 | 
 79 | if __name__ == "__main__":
 80 |     parser = argparse.ArgumentParser()
 81 |     parser.add_argument("--hpfile", type=str, default="configs/freevc.json", help="path to json config file")
 82 |     parser.add_argument("--ptfile", type=str, default="logs/freevc/G_14000.pth", help="path to pth file")
 83 |     parser.add_argument("--outdir", type=str, default="output", help="path to output dir")
 84 |     parser.add_argument("--use_timestamp", default=False, action="store_true")
 85 |     args = parser.parse_args()
 86 | 
 87 |     os.makedirs(args.outdir, exist_ok=True)
 88 |     hps = utils.get_hparams_from_file(args.hpfile)
 89 | 
 90 |     print("Loading model...")
 91 |     net_g = SynthesizerTrn(
 92 |         hps.data.filter_length // 2 + 1,
 93 |         hps.train.segment_size // hps.data.hop_length,
 94 |         **hps.model).cuda()
 95 |     _ = net_g.eval()
 96 |     print("Loading checkpoint...")
 97 |     _ = utils.load_checkpoint(args.ptfile, net_g, None)
 98 | 
 99 |     print("Loading WavLM for content...")
100 |     cmodel = utils.get_hubert_model(0)
101 | 
102 |     print("Processing text...")
103 |     titles, srcs, tgts, pshifts, eshifts = [], [], [], [], []
104 | 
105 |     for line in open("convert.txt").readlines():
106 |         sample, i, pshift, eshift = line.strip().split("|")
107 |         title = f"{sample[:-4]}-{i}-{pshift}-{eshift}"
108 |         src = f"sample/{sample}"
109 |         tgt = int(i)
110 |         titles.append(title)
111 |         srcs.append(src)
112 |         tgts.append(tgt)
113 | 
114 |         pshifts.append(int(pshift))
115 |         eshifts.append(float(eshift))
116 | 
117 |     print("Synthesizing...")
118 |     with torch.no_grad():
119 |         for line in tqdm(zip(titles, srcs, tgts, pshifts, eshifts)):
120 |             title, src, tgt, pshift, eshift = line
121 |             # src
122 |             wav_src, _ = librosa.load(src, sr=16000)
123 |             wav_src = torch.from_numpy(wav_src).unsqueeze(0).cuda()
124 |             c = utils.get_hubert_content(cmodel, wav_src)
125 |             c = torch.repeat_interleave(c, repeats=2, dim=2)
126 |             # print(c.shape)
127 |             g = torch.LongTensor([[tgt]]).cuda()
128 |             cf0, f0bk = get_f0(src, c.shape[-1], f0_up_key=pshift)
129 |             f0 = torch.LongTensor(cf0).unsqueeze(0).cuda()
130 | 
131 |             e = get_energy(src, c.shape[-1]) * eshift
132 |             e = torch.LongTensor(e).unsqueeze(0).cuda()
133 | 
134 |             audio = net_g.infer(c, f0=f0, energy=e, g=g)
135 |             audio = audio[0][0].data.cpu().float().numpy()
136 |             if args.use_timestamp:
137 |                 timestamp = time.strftime("%m-%d_%H-%M", time.localtime())
138 |                 write(os.path.join(args.outdir, "{}.wav".format(timestamp + "_" + title)), hps.data.sampling_rate,
139 |                       audio)
140 |             else:
141 |                 write(os.path.join(args.outdir, f"{title}.wav"), hps.data.sampling_rate, audio)
142 | 
143 | 


--------------------------------------------------------------------------------
/convert.txt:
--------------------------------------------------------------------------------
1 | cxk.wav|0|12|0.5
2 | 


--------------------------------------------------------------------------------
/data_utils.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import os
  3 | import random
  4 | import numpy as np
  5 | import torch
  6 | import torch.utils.data
  7 | 
  8 | import commons
  9 | from mel_processing import spectrogram_torch, spec_to_mel_torch
 10 | from utils import load_wav_to_torch, load_filepaths_and_text, transform
 11 | 
 12 | # import h5py
 13 | 
 14 | 
 15 | """Multi speaker version"""
 16 | 
 17 | 
 18 | class TextAudioSpeakerLoader(torch.utils.data.Dataset):
 19 |     """
 20 |         1) loads audio, speaker_id, text pairs
 21 |         2) normalizes text and converts them to sequences of integers
 22 |         3) computes spectrograms from audio files.
 23 |     """
 24 | 
 25 |     def __init__(self, audiopaths, hparams):
 26 |         self.audiopaths = load_filepaths_and_text(audiopaths)
 27 |         self.max_wav_value = hparams.data.max_wav_value
 28 |         self.sampling_rate = hparams.data.sampling_rate
 29 |         self.filter_length = hparams.data.filter_length
 30 |         self.hop_length = hparams.data.hop_length
 31 |         self.win_length = hparams.data.win_length
 32 |         self.sampling_rate = hparams.data.sampling_rate
 33 |         self.use_sr = hparams.train.use_sr
 34 |         self.spec_len = hparams.train.max_speclen
 35 |         self.spk_map = hparams.spk
 36 | 
 37 |         random.seed(1234)
 38 |         random.shuffle(self.audiopaths)
 39 | 
 40 |     def get_audio(self, filename):
 41 |         audio, sampling_rate = load_wav_to_torch(filename)
 42 |         if sampling_rate != self.sampling_rate:
 43 |             raise ValueError("{} SR doesn't match target {} SR".format(
 44 |                 sampling_rate, self.sampling_rate))
 45 |         audio_norm = audio / self.max_wav_value
 46 |         audio_norm = audio_norm.unsqueeze(0)
 47 |         spec_filename = filename.replace(".wav", ".spec.pt")
 48 |         if os.path.exists(spec_filename):
 49 |             spec = torch.load(spec_filename)
 50 |         else:
 51 |             spec = spectrogram_torch(audio_norm, self.filter_length,
 52 |                                      self.sampling_rate, self.hop_length, self.win_length,
 53 |                                      center=False)
 54 |             spec = torch.squeeze(spec, 0)
 55 |             torch.save(spec, spec_filename)
 56 | 
 57 |         spk = filename.split("/")[-2]
 58 |         spk = torch.LongTensor([self.spk_map[spk]])
 59 | 
 60 |         c = torch.load(filename + ".soft.pt").squeeze(0)
 61 |         c = torch.repeat_interleave(c, repeats=3, dim=1)
 62 | 
 63 |         f0 = np.load(filename + ".f0.npy")
 64 |         f0 = torch.FloatTensor(f0)
 65 |         lmin = min(c.size(-1), spec.size(-1), f0.shape[0])
 66 |         assert abs(c.size(-1) - spec.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape, filename)
 67 |         assert abs(lmin - spec.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape)
 68 |         assert abs(lmin - c.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape)
 69 |         spec, c, f0 = spec[:, :lmin], c[:, :lmin], f0[:lmin]
 70 |         audio_norm = audio_norm[:, :lmin * self.hop_length]
 71 |         _spec, _c, _audio_norm, _f0 = spec, c, audio_norm, f0
 72 |         while spec.size(-1) < self.spec_len:
 73 |             spec = torch.cat((spec, _spec), -1)
 74 |             c = torch.cat((c, _c), -1)
 75 |             f0 = torch.cat((f0, _f0), -1)
 76 |             audio_norm = torch.cat((audio_norm, _audio_norm), -1)
 77 |         start = random.randint(0, spec.size(-1) - self.spec_len)
 78 |         end = start + self.spec_len
 79 |         spec = spec[:, start:end]
 80 |         c = c[:, start:end]
 81 |         f0 = f0[start:end]
 82 |         audio_norm = audio_norm[:, start * self.hop_length:end * self.hop_length]
 83 | 
 84 |         return c, f0, spec, audio_norm, spk
 85 | 
 86 |     def __getitem__(self, index):
 87 |         return self.get_audio(self.audiopaths[index][0])
 88 | 
 89 |     def __len__(self):
 90 |         return len(self.audiopaths)
 91 | 
 92 | 
 93 | class EvalDataLoader(torch.utils.data.Dataset):
 94 |     """
 95 |         1) loads audio, speaker_id, text pairs
 96 |         2) normalizes text and converts them to sequences of integers
 97 |         3) computes spectrograms from audio files.
 98 |     """
 99 | 
100 |     def __init__(self, audiopaths, hparams):
101 |         self.audiopaths = load_filepaths_and_text(audiopaths)
102 |         self.max_wav_value = hparams.data.max_wav_value
103 |         self.sampling_rate = hparams.data.sampling_rate
104 |         self.filter_length = hparams.data.filter_length
105 |         self.hop_length = hparams.data.hop_length
106 |         self.win_length = hparams.data.win_length
107 |         self.sampling_rate = hparams.data.sampling_rate
108 |         self.use_sr = hparams.train.use_sr
109 |         self.audiopaths = self.audiopaths[:10]
110 |         self.spk_map = hparams.spk
111 | 
112 | 
113 |     def get_audio(self, filename):
114 |         audio, sampling_rate = load_wav_to_torch(filename)
115 |         if sampling_rate != self.sampling_rate:
116 |             raise ValueError("{} SR doesn't match target {} SR".format(
117 |                 sampling_rate, self.sampling_rate))
118 |         audio_norm = audio / self.max_wav_value
119 |         audio_norm = audio_norm.unsqueeze(0)
120 |         spec_filename = filename.replace(".wav", ".spec.pt")
121 |         if os.path.exists(spec_filename):
122 |             spec = torch.load(spec_filename)
123 |         else:
124 |             spec = spectrogram_torch(audio_norm, self.filter_length,
125 |                                      self.sampling_rate, self.hop_length, self.win_length,
126 |                                      center=False)
127 |             spec = torch.squeeze(spec, 0)
128 |             torch.save(spec, spec_filename)
129 | 
130 |         spk = filename.split("/")[-2]
131 |         spk = torch.LongTensor([self.spk_map[spk]])
132 | 
133 |         c = torch.load(filename + ".soft.pt").squeeze(0)
134 | 
135 |         c = torch.repeat_interleave(c, repeats=3, dim=1)
136 | 
137 |         f0 = np.load(filename + ".f0.npy")
138 |         f0 = torch.FloatTensor(f0)
139 |         lmin = min(c.size(-1), spec.size(-1), f0.shape[0])
140 |         assert abs(c.size(-1) - spec.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape)
141 |         assert abs(f0.shape[0] - spec.shape[-1]) < 4, (c.size(-1), spec.size(-1), f0.shape)
142 |         spec, c, f0 = spec[:, :lmin], c[:, :lmin], f0[:lmin]
143 |         audio_norm = audio_norm[:, :lmin * self.hop_length]
144 | 
145 |         return c, f0, spec, audio_norm, spk
146 | 
147 |     def __getitem__(self, index):
148 |         return self.get_audio(self.audiopaths[index][0])
149 | 
150 |     def __len__(self):
151 |         return len(self.audiopaths)
152 | 
153 | 


--------------------------------------------------------------------------------
/downsample.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import librosa
 4 | import numpy as np
 5 | from multiprocessing import Pool, cpu_count
 6 | from scipy.io import wavfile
 7 | from tqdm import tqdm
 8 | 
 9 | 
def process(item):
    """Trim, peak-limit, resample and save one wav file as 16-bit PCM.

    Args:
        item: tuple ``(spkdir, wav_name, args)``; ``args`` must provide
            ``in_dir``, ``out_dir2`` and ``sr2``.

    Files that do not exist or whose path does not contain ``.wav`` are
    skipped silently.
    """
    spkdir, wav_name, args = item
    # basename handles both "/" and the platform separator used by os.path.join.
    speaker = os.path.basename(spkdir)
    wav_path = os.path.join(args.in_dir, speaker, wav_name)
    if not (os.path.exists(wav_path) and '.wav' in wav_path):
        return

    os.makedirs(os.path.join(args.out_dir2, speaker), exist_ok=True)
    # `sr` must be passed by keyword: newer librosa releases reject it positionally.
    wav, sr = librosa.load(wav_path, sr=None)
    wav, _ = librosa.effects.trim(wav, top_db=20)
    peak = np.abs(wav).max()
    if peak > 1.0:
        wav = 0.98 * wav / peak
    wav2 = librosa.resample(wav, orig_sr=sr, target_sr=args.sr2)
    # Resampling may overshoot [-1, 1]; clip so the int16 cast cannot wrap around.
    wav2 = np.clip(wav2, -1.0, 1.0)
    save_path2 = os.path.join(args.out_dir2, speaker, wav_name)
    wavfile.write(
        save_path2,
        args.sr2,
        (wav2 * np.iinfo(np.int16).max).astype(np.int16)
    )
30 | 
31 | 
32 | 
if __name__ == "__main__":
    # Resample every "<in_dir>/<speaker>/*wav" file into "<out_dir2>/<speaker>/".
    parser = argparse.ArgumentParser()
    parser.add_argument("--sr2", type=int, default=48000, help="sampling rate")
    parser.add_argument("--in_dir", type=str, default="./raw", help="path to source dir")
    parser.add_argument("--out_dir2", type=str, default="./dataset/48k", help="path to target dir")
    args = parser.parse_args()
    # Leave two cores free on machines with more than 4; otherwise use one worker.
    n_workers = cpu_count() - 2 if cpu_count() > 4 else 1

    # The context manager releases the worker processes once all speakers are done.
    with Pool(processes=n_workers) as pool:
        for speaker in os.listdir(args.in_dir):
            spk_dir = os.path.join(args.in_dir, speaker)
            if not os.path.isdir(spk_dir):
                continue
            # List the directory once and reuse it for the log line and the jobs.
            wav_names = [i for i in os.listdir(spk_dir) if i.endswith("wav")]
            print([(spk_dir, i) for i in wav_names])
            jobs = [(spk_dir, i, args) for i in wav_names]
            # Drain the iterator so every job finishes; results are not used.
            for _ in tqdm(pool.imap_unordered(process, jobs)):
                pass
48 | 


--------------------------------------------------------------------------------
/filelists/test.txt:
--------------------------------------------------------------------------------
 1 | ./dataset/48k/paimon/vo_ABLQ004_5_paimon_02.wav
 2 | ./dataset/48k/paimon/vo_ABLQ005_2_paimon_01.wav
 3 | ./dataset/48k/nen/kne110_005.wav
 4 | ./dataset/48k/paimon/vo_ABLQ004_6_paimon_02.wav
 5 | ./dataset/48k/paimon/vo_ABLQ004_6_paimon_01.wav
 6 | ./dataset/48k/nen/kne110_003.wav
 7 | ./dataset/48k/paimon/vo_ABLQ004_7_paimon_01.wav
 8 | ./dataset/48k/nen/kne110_004.wav
 9 | ./dataset/48k/paimon/vo_ABLQ005_2_paimon_02.wav
10 | ./dataset/48k/nen/kne110_001.wav
11 | ./dataset/48k/nen/kne110_006.wav
12 | ./dataset/48k/nen/kne110_002.wav
13 | 


--------------------------------------------------------------------------------
/filelists/train.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/innnky/FreeSVC/47f589e855ac1a4861985e4cb42586b573fedbff/filelists/train.txt


--------------------------------------------------------------------------------
/filelists/val.txt:
--------------------------------------------------------------------------------
1 | ./dataset/48k/paimon/vo_ABLQ005_2_paimon_02.wav
2 | ./dataset/48k/nen/kne110_006.wav
3 | ./dataset/48k/nen/kne110_002.wav
4 | ./dataset/48k/paimon/vo_ABLQ004_5_paimon_02.wav
5 | 


--------------------------------------------------------------------------------
/hubert_model.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | import random
  3 | from typing import Optional, Tuple
  4 | 
  5 | import torch
  6 | import torch.nn as nn
  7 | import torch.nn.functional as t_func
  8 | from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
  9 | 
 10 | 
 11 | class Hubert(nn.Module):
 12 |     def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
 13 |         super().__init__()
 14 |         self._mask = mask
 15 |         self.feature_extractor = FeatureExtractor()
 16 |         self.feature_projection = FeatureProjection()
 17 |         self.positional_embedding = PositionalConvEmbedding()
 18 |         self.norm = nn.LayerNorm(768)
 19 |         self.dropout = nn.Dropout(0.1)
 20 |         self.encoder = TransformerEncoder(
 21 |             nn.TransformerEncoderLayer(
 22 |                 768, 12, 3072, activation="gelu", batch_first=True
 23 |             ),
 24 |             12,
 25 |         )
 26 |         self.proj = nn.Linear(768, 256)
 27 | 
 28 |         self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
 29 |         self.label_embedding = nn.Embedding(num_label_embeddings, 256)
 30 | 
 31 |     def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
 32 |         mask = None
 33 |         if self.training and self._mask:
 34 |             mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
 35 |             x[mask] = self.masked_spec_embed.to(x.dtype)
 36 |         return x, mask
 37 | 
 38 |     def encode(
 39 |             self, x: torch.Tensor, layer: Optional[int] = None
 40 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
 41 |         x = self.feature_extractor(x)
 42 |         x = self.feature_projection(x.transpose(1, 2))
 43 |         x, mask = self.mask(x)
 44 |         x = x + self.positional_embedding(x)
 45 |         x = self.dropout(self.norm(x))
 46 |         x = self.encoder(x, output_layer=layer)
 47 |         return x, mask
 48 | 
 49 |     def logits(self, x: torch.Tensor) -> torch.Tensor:
 50 |         logits = torch.cosine_similarity(
 51 |             x.unsqueeze(2),
 52 |             self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
 53 |             dim=-1,
 54 |         )
 55 |         return logits / 0.1
 56 | 
 57 |     def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
 58 |         x, mask = self.encode(x)
 59 |         x = self.proj(x)
 60 |         logits = self.logits(x)
 61 |         return logits, mask
 62 | 
 63 | 
 64 | class HubertSoft(Hubert):
 65 |     def __init__(self):
 66 |         super().__init__()
 67 | 
 68 |     @torch.inference_mode()
 69 |     def units(self, wav: torch.Tensor) -> torch.Tensor:
 70 |         wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
 71 |         x, _ = self.encode(wav)
 72 |         return self.proj(x)
 73 | 
 74 | 
 75 | class FeatureExtractor(nn.Module):
 76 |     def __init__(self):
 77 |         super().__init__()
 78 |         self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
 79 |         self.norm0 = nn.GroupNorm(512, 512)
 80 |         self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
 81 |         self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
 82 |         self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
 83 |         self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
 84 |         self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
 85 |         self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
 86 | 
 87 |     def forward(self, x: torch.Tensor) -> torch.Tensor:
 88 |         x = t_func.gelu(self.norm0(self.conv0(x)))
 89 |         x = t_func.gelu(self.conv1(x))
 90 |         x = t_func.gelu(self.conv2(x))
 91 |         x = t_func.gelu(self.conv3(x))
 92 |         x = t_func.gelu(self.conv4(x))
 93 |         x = t_func.gelu(self.conv5(x))
 94 |         x = t_func.gelu(self.conv6(x))
 95 |         return x
 96 | 
 97 | 
 98 | class FeatureProjection(nn.Module):
 99 |     def __init__(self):
100 |         super().__init__()
101 |         self.norm = nn.LayerNorm(512)
102 |         self.projection = nn.Linear(512, 768)
103 |         self.dropout = nn.Dropout(0.1)
104 | 
105 |     def forward(self, x: torch.Tensor) -> torch.Tensor:
106 |         x = self.norm(x)
107 |         x = self.projection(x)
108 |         x = self.dropout(x)
109 |         return x
110 | 
111 | 
class PositionalConvEmbedding(nn.Module):
    """Weight-normalised grouped 1-d convolution used as positional embedding."""

    def __init__(self):
        super().__init__()
        grouped_conv = nn.Conv1d(
            768,
            768,
            kernel_size=128,
            padding=128 // 2,
            groups=16,
        )
        self.conv = nn.utils.weight_norm(grouped_conv, name="weight", dim=2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Convolve along dim 1 of ``x`` and apply GELU.

        The convolution runs on ``x.transpose(1, 2)``; the last output step is
        dropped before the activation and the result is transposed back.
        """
        convolved = self.conv(x.transpose(1, 2))
        activated = t_func.gelu(convolved[:, :, :-1])
        return activated.transpose(1, 2)
128 | 
129 | 
class TransformerEncoder(nn.Module):
    """Stack of deep-copied encoder layers that can stop after a given layer."""

    def __init__(
            self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
    ) -> None:
        """Clone ``encoder_layer`` ``num_layers`` times into ``self.layers``."""
        super().__init__()
        clones = []
        for _ in range(num_layers):
            clones.append(copy.deepcopy(encoder_layer))
        self.layers = nn.ModuleList(clones)
        self.num_layers = num_layers

    def forward(
            self,
            src: torch.Tensor,
            mask: torch.Tensor = None,
            src_key_padding_mask: torch.Tensor = None,
            output_layer: Optional[int] = None,
    ) -> torch.Tensor:
        """Run ``src`` through the first ``output_layer`` layers (all if ``None``)."""
        hidden = src
        active_layers = self.layers[:output_layer]
        for block in active_layers:
            hidden = block(
                hidden, src_mask=mask, src_key_padding_mask=src_key_padding_mask
            )
        return hidden
153 | 
154 | 
155 | def _compute_mask(
156 |         shape: Tuple[int, int],
157 |         mask_prob: float,
158 |         mask_length: int,
159 |         device: torch.device,
160 |         min_masks: int = 0,
161 | ) -> torch.Tensor:
162 |     batch_size, sequence_length = shape
163 | 
164 |     if mask_length < 1:
165 |         raise ValueError("`mask_length` has to be bigger than 0.")
166 | 
167 |     if mask_length > sequence_length:
168 |         raise ValueError(
169 |             f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
170 |         )
171 | 
172 |     # compute number of masked spans in batch
173 |     num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
174 |     num_masked_spans = max(num_masked_spans, min_masks)
175 | 
176 |     # make sure num masked indices <= sequence_length
177 |     if num_masked_spans * mask_length > sequence_length:
178 |         num_masked_spans = sequence_length // mask_length
179 | 
180 |     # SpecAugment mask to fill
181 |     mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
182 | 
183 |     # uniform distribution to sample from, make sure that offset samples are < sequence_length
184 |     uniform_dist = torch.ones(
185 |         (batch_size, sequence_length - (mask_length - 1)), device=device
186 |     )
187 | 
188 |     # get random indices to mask
189 |     mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
190 | 
191 |     # expand masked indices to masked spans
192 |     mask_indices = (
193 |         mask_indices.unsqueeze(dim=-1)
194 |         .expand((batch_size, num_masked_spans, mask_length))
195 |         .reshape(batch_size, num_masked_spans * mask_length)
196 |     )
197 |     offsets = (
198 |         torch.arange(mask_length, device=device)[None, None, :]
199 |         .expand((batch_size, num_masked_spans, mask_length))
200 |         .reshape(batch_size, num_masked_spans * mask_length)
201 |     )
202 |     mask_idxs = mask_indices + offsets
203 | 
204 |     # scatter indices to mask
205 |     mask = mask.scatter(1, mask_idxs, True)
206 | 
207 |     return mask
208 | 
209 | 
def hubert_soft(
        path: str,
) -> HubertSoft:
    r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.

    Builds a ``HubertSoft`` model, loads the checkpoint found at ``path`` and
    switches the model to eval mode.

    Args:
        path (str): path of a pretrained model
    Returns:
        HubertSoft: the model with loaded weights, in eval mode.
    """
    hubert = HubertSoft()
    # map_location="cpu" lets checkpoints saved from a GPU load on CPU-only
    # machines; load_state_dict copies the values into the model afterwards.
    # NOTE: torch.load unpickles the file - only load trusted checkpoints.
    checkpoint = torch.load(path, map_location="cpu")
    # Drop the "module." prefix present in checkpoints saved from wrapped models.
    consume_prefix_in_state_dict_if_present(checkpoint, "module.")
    hubert.load_state_dict(checkpoint)
    hubert.eval()
    return hubert
223 | 


--------------------------------------------------------------------------------
/losses.py:
--------------------------------------------------------------------------------
 1 | import torch 
 2 | from torch.nn import functional as F
 3 | 
 4 | import commons
 5 | 
 6 | 
 7 | def feature_loss(fmap_r, fmap_g):
 8 |   loss = 0
 9 |   for dr, dg in zip(fmap_r, fmap_g):
10 |     for rl, gl in zip(dr, dg):
11 |       rl = rl.float().detach()
12 |       gl = gl.float()
13 |       loss += torch.mean(torch.abs(rl - gl))
14 | 
15 |   return loss * 2 
16 | 
17 | 
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
  """Least-squares discriminator loss.

  Real outputs are pushed towards 1 and generated outputs towards 0.
  Returns the summed loss plus the per-discriminator real and generated
  terms as Python floats.
  """
  total = 0
  real_terms, fake_terms = [], []
  for real_out, fake_out in zip(disc_real_outputs, disc_generated_outputs):
    real_term = torch.mean((1 - real_out.float()) ** 2)
    fake_term = torch.mean(fake_out.float() ** 2)
    total = total + (real_term + fake_term)
    real_terms.append(real_term.item())
    fake_terms.append(fake_term.item())

  return total, real_terms, fake_terms
32 | 
33 | 
def generator_loss(disc_outputs):
  """Least-squares generator loss: generated outputs are pushed towards 1.

  Returns the summed loss and the list of per-discriminator loss tensors.
  """
  gen_losses = [torch.mean((1 - out.float()) ** 2) for out in disc_outputs]
  total = 0
  for term in gen_losses:
    total = total + term

  return total, gen_losses
44 | 
45 | 
def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
  """
  Masked KL term, averaged over the mask.

  z_p, logs_q: [b, h, t_t]
  m_p, logs_p: [b, h, t_t]
  z_mask: multiplied element-wise with the KL term; its sum is the normaliser
  """
  z_p, logs_q, m_p, logs_p, z_mask = (
      t.float() for t in (z_p, logs_q, m_p, logs_p, z_mask)
  )
  log_ratio = logs_p - logs_q - 0.5
  scaled_sq_err = 0.5 * ((z_p - m_p)**2) * torch.exp(-2. * logs_p)
  masked = (log_ratio + scaled_sq_err) * z_mask
  return torch.sum(masked) / torch.sum(z_mask)
62 | 


--------------------------------------------------------------------------------
/mel_processing.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import os
  3 | import random
  4 | import torch
  5 | from torch import nn
  6 | import torch.nn.functional as F
  7 | import torch.utils.data
  8 | import numpy as np
  9 | import librosa
 10 | import librosa.util as librosa_util
 11 | from librosa.util import normalize, pad_center, tiny
 12 | from scipy.signal import get_window
 13 | from scipy.io.wavfile import read
 14 | from librosa.filters import mel as librosa_mel_fn
 15 | 
# 32768.0 == 2 ** 15; presumably the int16 full-scale value used to normalise
# PCM samples - TODO confirm, the constant is not referenced in this module.
MAX_WAV_VALUE = 32768.0
 17 | 
 18 | 
 19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
 20 |     """
 21 |     PARAMS
 22 |     ------
 23 |     C: compression factor
 24 |     """
 25 |     return torch.log(torch.clamp(x, min=clip_val) * C)
 26 | 
 27 | 
 28 | def dynamic_range_decompression_torch(x, C=1):
 29 |     """
 30 |     PARAMS
 31 |     ------
 32 |     C: compression factor used to compress
 33 |     """
 34 |     return torch.exp(x) / C
 35 | 
 36 | 
 37 | def spectral_normalize_torch(magnitudes):
 38 |     output = dynamic_range_compression_torch(magnitudes)
 39 |     return output
 40 | 
 41 | 
 42 | def spectral_de_normalize_torch(magnitudes):
 43 |     output = dynamic_range_decompression_torch(magnitudes)
 44 |     return output
 45 | 
 46 | 
# Module-level caches filled lazily by the spectrogram functions below.
# Keys are strings that include the tensor dtype and device, so each
# dtype/device combination gets its own tensor.
mel_basis = {}  # mel filterbank tensors
hann_window = {}  # Hann window tensors
 49 | 
 50 | 
 51 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
 52 |     if torch.min(y) < -1.:
 53 |         print('min value is ', torch.min(y))
 54 |     if torch.max(y) > 1.:
 55 |         print('max value is ', torch.max(y))
 56 | 
 57 |     global hann_window
 58 |     dtype_device = str(y.dtype) + '_' + str(y.device)
 59 |     wnsize_dtype_device = str(win_size) + '_' + dtype_device
 60 |     if wnsize_dtype_device not in hann_window:
 61 |         hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
 62 | 
 63 |     y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
 64 |     y = y.squeeze(1)
 65 | 
 66 |     spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
 67 |                       center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
 68 | 
 69 |     spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
 70 |     return spec
 71 | 
 72 | 
 73 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
 74 |     global mel_basis
 75 |     dtype_device = str(spec.dtype) + '_' + str(spec.device)
 76 |     fmax_dtype_device = str(fmax) + '_' + dtype_device
 77 |     if fmax_dtype_device not in mel_basis:
 78 |         mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
 79 |         mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
 80 |     spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
 81 |     spec = spectral_normalize_torch(spec)
 82 |     return spec
 83 | 
 84 | 
 85 | def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
 86 |     if torch.min(y) < -1.:
 87 |         print('min value is ', torch.min(y))
 88 |     if torch.max(y) > 1.:
 89 |         print('max value is ', torch.max(y))
 90 | 
 91 |     global mel_basis, hann_window
 92 |     dtype_device = str(y.dtype) + '_' + str(y.device)
 93 |     fmax_dtype_device = str(fmax) + '_' + dtype_device
 94 |     wnsize_dtype_device = str(win_size) + '_' + dtype_device
 95 |     if fmax_dtype_device not in mel_basis:
 96 |         mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
 97 |         mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
 98 |     if wnsize_dtype_device not in hann_window:
 99 |         hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
100 | 
101 |     y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
102 |     y = y.squeeze(1)
103 | 
104 |     spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
105 |                       center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
106 | 
107 |     spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
108 | 
109 |     spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
110 |     spec = spectral_normalize_torch(spec)
111 | 
112 |     return spec
113 | 


--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | import math
  3 | import torch
  4 | from torch import nn
  5 | from torch.nn import functional as F
  6 | 
  7 | import attentions
  8 | import commons
  9 | import modules
 10 | 
 11 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 12 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 13 | from commons import init_weights, get_padding
 14 | from vdecoder.hifigan.models import Generator
 15 | from utils import f0_to_coarse
 16 | 
 17 | class ResidualCouplingBlock(nn.Module):
 18 |   def __init__(self,
 19 |       channels,
 20 |       hidden_channels,
 21 |       kernel_size,
 22 |       dilation_rate,
 23 |       n_layers,
 24 |       n_flows=4,
 25 |       gin_channels=0):
 26 |     super().__init__()
 27 |     self.channels = channels
 28 |     self.hidden_channels = hidden_channels
 29 |     self.kernel_size = kernel_size
 30 |     self.dilation_rate = dilation_rate
 31 |     self.n_layers = n_layers
 32 |     self.n_flows = n_flows
 33 |     self.gin_channels = gin_channels
 34 | 
 35 |     self.flows = nn.ModuleList()
 36 |     for i in range(n_flows):
 37 |       self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
 38 |       self.flows.append(modules.Flip())
 39 | 
 40 |   def forward(self, x, x_mask, g=None, reverse=False):
 41 |     if not reverse:
 42 |       for flow in self.flows:
 43 |         x, _ = flow(x, x_mask, g=g, reverse=reverse)
 44 |     else:
 45 |       for flow in reversed(self.flows):
 46 |         x = flow(x, x_mask, g=g, reverse=reverse)
 47 |     return x
 48 | 
 49 | 
 50 | class Encoder(nn.Module):
 51 |   def __init__(self,
 52 |       in_channels,
 53 |       out_channels,
 54 |       hidden_channels,
 55 |       kernel_size,
 56 |       dilation_rate,
 57 |       n_layers,
 58 |       gin_channels=0):
 59 |     super().__init__()
 60 |     self.in_channels = in_channels
 61 |     self.out_channels = out_channels
 62 |     self.hidden_channels = hidden_channels
 63 |     self.kernel_size = kernel_size
 64 |     self.dilation_rate = dilation_rate
 65 |     self.n_layers = n_layers
 66 |     self.gin_channels = gin_channels
 67 | 
 68 |     self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
 69 |     self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
 70 |     self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
 71 | 
 72 |   def forward(self, x, x_lengths, g=None):
 73 |     # print(x.shape,x_lengths.shape)
 74 |     x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
 75 |     x = self.pre(x) * x_mask
 76 |     x = self.enc(x, x_mask, g=g)
 77 |     stats = self.proj(x) * x_mask
 78 |     m, logs = torch.split(stats, self.out_channels, dim=1)
 79 |     z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
 80 |     return z, m, logs, x_mask
 81 | 
 82 | 
 83 | class TextEncoder(nn.Module):
 84 |   def __init__(self,
 85 |       in_channels,
 86 |       out_channels,
 87 |       hidden_channels,
 88 |       kernel_size,
 89 |       dilation_rate,
 90 |       n_layers,
 91 |       gin_channels=0,
 92 |       filter_channels=None,
 93 |       n_heads=None,
 94 |       p_dropout=None):
 95 |     super().__init__()
 96 |     self.in_channels = in_channels
 97 |     self.out_channels = out_channels
 98 |     self.hidden_channels = hidden_channels
 99 |     self.kernel_size = kernel_size
100 |     self.dilation_rate = dilation_rate
101 |     self.n_layers = n_layers
102 |     self.gin_channels = gin_channels
103 |     self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
104 |     self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
105 |     self.f0_emb = nn.Embedding(256, hidden_channels)
106 | 
107 |     self.enc_ =  attentions.Encoder(
108 |         hidden_channels,
109 |         filter_channels,
110 |         n_heads,
111 |         n_layers,
112 |         kernel_size,
113 |         p_dropout)
114 | 
115 |   def forward(self, x, x_lengths, f0=None):
116 |     x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
117 |     x = self.pre(x) * x_mask
118 |     x = x + self.f0_emb(f0).transpose(1,2)
119 |     x = self.enc_(x * x_mask, x_mask)
120 |     stats = self.proj(x) * x_mask
121 |     m, logs = torch.split(stats, self.out_channels, dim=1)
122 |     z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
123 | 
124 |     return z, m, logs, x_mask
125 | 
126 | 
127 | 
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the signal by ``period`` and applies 2-d convs."""

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        if use_spectral_norm == False:
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        def make_conv(c_in, c_out, strides):
            # All layers convolve along the first spatial axis only.
            return norm_f(Conv2d(c_in, c_out, (kernel_size, 1), strides,
                                 padding=(get_padding(kernel_size, 1), 0)))

        widths = [1, 32, 128, 512, 1024]
        stack = [make_conv(c_in, c_out, (stride, 1)) for c_in, c_out in zip(widths, widths[1:])]
        stack.append(make_conv(1024, 1024, 1))
        self.convs = nn.ModuleList(stack)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return the flattened score and the list of intermediate feature maps."""
        fmap = []

        # 1d to 2d: reflect-pad the time axis to a multiple of the period first
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
163 | 
164 | 
class DiscriminatorS(torch.nn.Module):
    """Discriminator made of a stack of (grouped) 1-D convolutions."""

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        # Explicit comparison with False is kept on purpose so that only a
        # value equal to False selects weight normalisation.
        if use_spectral_norm == False:
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        layer_specs = [
            # (in_ch, out_ch, kernel, stride, groups, padding)
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList([
            norm_f(Conv1d(c_in, c_out, k, s, groups=g, padding=p))
            for c_in, c_out, k, s, g, p in layer_specs
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(scores, fmap)``.

        ``scores`` is the output of ``conv_post`` flattened per batch item and
        ``fmap`` lists every intermediate activation, the final one included.
        """
        fmap = []

        for layer in self.convs:
            x = F.leaky_relu(layer(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
191 | 
192 | 
class MultiPeriodDiscriminator(torch.nn.Module):
    """Bundle of one ``DiscriminatorS`` and one ``DiscriminatorP`` per period."""

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        periods = [2, 3, 5, 7, 11]

        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        members.extend(
            DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods
        )
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Run every sub-discriminator on both ``y`` and ``y_hat``.

        Returns four lists with one entry per sub-discriminator: outputs for
        ``y``, outputs for ``y_hat``, feature maps for ``y`` and feature maps
        for ``y_hat``.
        """
        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
        for disc in self.discriminators:
            score_r, feats_r = disc(y)
            score_g, feats_g = disc(y_hat)
            y_d_rs.append(score_r)
            y_d_gs.append(score_g)
            fmap_rs.append(feats_r)
            fmap_gs.append(feats_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
216 |         
217 |         
class SpeakerEncoder(torch.nn.Module):
    """LSTM encoder that maps mel frames to L2-normalised embeddings."""

    def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
        super(SpeakerEncoder, self).__init__()
        self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
        self.linear = nn.Linear(model_hidden_size, model_embedding_size)
        self.relu = nn.ReLU()

    def forward(self, mels):
        """Embed a batch of mel segments.

        Args:
            mels: batch-first input for the LSTM, i.e. (batch, frames, mel_n_channels).

        Returns:
            Tensor of shape (batch, model_embedding_size) whose rows have unit
            L2 norm, or are all zero when the ReLU output is all zero.
        """
        self.lstm.flatten_parameters()
        _, (hidden, _) = self.lstm(mels)
        # Only the final hidden state of the last LSTM layer is used.
        embeds_raw = self.relu(self.linear(hidden[-1]))
        # normalize() divides by max(norm, eps), so an all-zero ReLU output
        # yields zeros instead of the NaN a plain division by the norm gives.
        return torch.nn.functional.normalize(embeds_raw, p=2, dim=1)

    def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
        """Return index tensors for windows of ``partial_frames`` frames.

        Window starts are ``0, partial_hop, 2 * partial_hop, ...`` and stay
        strictly below ``total_frames - partial_frames``.
        """
        return [
            torch.arange(start, start + partial_frames)
            for start in range(0, total_frames - partial_frames, partial_hop)
        ]

    def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
        """Embed one utterance, averaging over overlapping windows if it is long.

        Args:
            mel: mel frames; ``mel.size(1)`` is taken as the number of frames
                (assumes shape (1, frames, n_mels) — TODO confirm with callers).
            partial_frames: window length in frames.
            partial_hop: hop between consecutive window starts.

        Returns:
            Embedding of shape (1, embedding_size), computed without gradients.
            For long inputs it is the mean of the window embeddings and is not
            re-normalised afterwards.
        """
        mel_len = mel.size(1)
        # Trailing window; the whole input when it is shorter than a window.
        last_mel = mel[:, -partial_frames:]

        if mel_len > partial_frames:
            mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
            mels = [mel[:, s] for s in mel_slices]
            # The trailing window makes sure the end of the input is covered.
            mels.append(last_mel)
            mels = torch.stack(tuple(mels), 0).squeeze(1)

            with torch.no_grad():
                partial_embeds = self(mels)
            embed = torch.mean(partial_embeds, dim=0).unsqueeze(0)
        else:
            with torch.no_grad():
                embed = self(last_mel)

        return embed
258 | 
259 | 
class SynthesizerTrn(nn.Module):
  """
  Synthesizer for Training

  Wires together a speaker embedding table (``emb_g``), a content/pitch
  encoder (``enc_p_``), a spectrogram encoder (``enc_q``), a coupling flow
  (``flow``) and a waveform decoder (``dec``).

  NOTE: the decoder is built from a hyper-parameter dict that is hard-coded in
  ``__init__``, so the ``resblock*`` / ``upsample*`` constructor arguments are
  only stored as attributes. The speaker table is fixed at 10 entries.
  """

  def __init__(self, 
    spec_channels,
    segment_size,
    inter_channels,
    hidden_channels,
    filter_channels,
    n_heads,
    n_layers,
    kernel_size,
    p_dropout,
    resblock, 
    resblock_kernel_sizes, 
    resblock_dilation_sizes, 
    upsample_rates, 
    upsample_initial_channel, 
    upsample_kernel_sizes,
    gin_channels,
    ssl_dim,
    **kwargs):

    super().__init__()
    self.spec_channels = spec_channels
    self.inter_channels = inter_channels
    self.hidden_channels = hidden_channels
    self.filter_channels = filter_channels
    self.n_heads = n_heads
    self.n_layers = n_layers
    self.kernel_size = kernel_size
    self.p_dropout = p_dropout
    self.resblock = resblock
    self.resblock_kernel_sizes = resblock_kernel_sizes
    self.resblock_dilation_sizes = resblock_dilation_sizes
    self.upsample_rates = upsample_rates
    self.upsample_initial_channel = upsample_initial_channel
    self.upsample_kernel_sizes = upsample_kernel_sizes
    self.segment_size = segment_size
    self.gin_channels = gin_channels
    self.ssl_dim = ssl_dim
    # Speaker lookup table with a fixed capacity of 10 ids.
    self.emb_g = nn.Embedding(10, gin_channels)

    self.enc_p_ = TextEncoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16,0, filter_channels, n_heads, p_dropout)
    # Decoder settings are fixed here and do not follow the constructor arguments.
    hps = {
        "sampling_rate": 48000,
        "inter_channels": 192,
        "resblock": "1",
        "resblock_kernel_sizes": [3, 7, 11],
        "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        "upsample_rates": [10, 8, 2, 2],
        "upsample_initial_channel": 512,
        "upsample_kernel_sizes": [16, 16, 4, 4],
        "gin_channels": 256,
    }
    self.dec = Generator(h=hps)
    self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
    self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)

  def forward(self, c, f0, spec, g=None, mel=None, c_lengths=None, spec_lengths=None):
    """Training pass.

    Args:
      c: content features fed to ``enc_p_``.
      f0: pitch track; quantised with ``f0_to_coarse`` for ``enc_p_`` and
        sliced alongside the latent for the decoder.
      spec: spectrogram fed to ``enc_q``.
      g: input to the ``emb_g`` embedding table (presumably speaker ids —
        TODO confirm shape with callers). Required despite the ``None`` default.
      mel: unused.
      c_lengths: valid lengths of ``c``; defaults to the full last-axis size.
      spec_lengths: valid lengths of ``spec``; defaults to the full last-axis size.

    Returns:
      ``(o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q))``.
    """
    if c_lengths is None:
      c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
    if spec_lengths is None:
      spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device)

    g = self.emb_g(g).transpose(1,2)

    # Only the statistics of the content encoder are returned; its sample is discarded.
    _, m_p, logs_p, _ = self.enc_p_(c, c_lengths, f0=f0_to_coarse(f0))
    z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) 

    z_p = self.flow(z, spec_mask, g=g)
    z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(z, f0, spec_lengths, self.segment_size)

    o = self.dec(z_slice, g=g, f0=pitch_slice)

    return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

  def infer(self, c, f0, g=None, mel=None, c_lengths=None):
    """Inference pass: sample from ``enc_p_``, invert the flow, then decode.

    Args:
      c: content features fed to ``enc_p_``.
      f0: pitch track; quantised with ``f0_to_coarse`` for ``enc_p_`` and passed
        unquantised to the decoder.
      g: input to the ``emb_g`` embedding table. Required despite the ``None`` default.
      mel: unused.
      c_lengths: valid lengths of ``c``; defaults to the full last-axis size.

    Returns:
      The decoder output.
    """
    if c_lengths is None:
      c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
    g = self.emb_g(g).transpose(1,2)

    z_p, m_p, logs_p, c_mask = self.enc_p_(c, c_lengths, f0=f0_to_coarse(f0))
    z = self.flow(z_p, c_mask, g=g, reverse=True)
    o = self.dec(z * c_mask, g=g, f0=f0)

    return o
351 | 


--------------------------------------------------------------------------------
/modules.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | import math
  3 | import numpy as np
  4 | import scipy
  5 | import torch
  6 | from torch import nn
  7 | from torch.nn import functional as F
  8 | 
  9 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 10 | from torch.nn.utils import weight_norm, remove_weight_norm
 11 | 
 12 | import commons
 13 | from commons import init_weights, get_padding
 14 | 
 15 | 
 16 | LRELU_SLOPE = 0.1
 17 | 
 18 | 
 19 | class LayerNorm(nn.Module):
 20 |   def __init__(self, channels, eps=1e-5):
 21 |     super().__init__()
 22 |     self.channels = channels
 23 |     self.eps = eps
 24 | 
 25 |     self.gamma = nn.Parameter(torch.ones(channels))
 26 |     self.beta = nn.Parameter(torch.zeros(channels))
 27 | 
 28 |   def forward(self, x):
 29 |     x = x.transpose(1, -1)
 30 |     x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
 31 |     return x.transpose(1, -1)
 32 | 
 33 |  
 34 | class ConvReluNorm(nn.Module):
 35 |   def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
 36 |     super().__init__()
 37 |     self.in_channels = in_channels
 38 |     self.hidden_channels = hidden_channels
 39 |     self.out_channels = out_channels
 40 |     self.kernel_size = kernel_size
 41 |     self.n_layers = n_layers
 42 |     self.p_dropout = p_dropout
 43 |     assert n_layers > 1, "Number of layers should be larger than 0."
 44 | 
 45 |     self.conv_layers = nn.ModuleList()
 46 |     self.norm_layers = nn.ModuleList()
 47 |     self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
 48 |     self.norm_layers.append(LayerNorm(hidden_channels))
 49 |     self.relu_drop = nn.Sequential(
 50 |         nn.ReLU(),
 51 |         nn.Dropout(p_dropout))
 52 |     for _ in range(n_layers-1):
 53 |       self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
 54 |       self.norm_layers.append(LayerNorm(hidden_channels))
 55 |     self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
 56 |     self.proj.weight.data.zero_()
 57 |     self.proj.bias.data.zero_()
 58 | 
 59 |   def forward(self, x, x_mask):
 60 |     x_org = x
 61 |     for i in range(self.n_layers):
 62 |       x = self.conv_layers[i](x * x_mask)
 63 |       x = self.norm_layers[i](x)
 64 |       x = self.relu_drop(x)
 65 |     x = x_org + self.proj(x)
 66 |     return x * x_mask
 67 | 
 68 | 
 69 | class DDSConv(nn.Module):
 70 |   """
 71 |   Dialted and Depth-Separable Convolution
 72 |   """
 73 |   def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
 74 |     super().__init__()
 75 |     self.channels = channels
 76 |     self.kernel_size = kernel_size
 77 |     self.n_layers = n_layers
 78 |     self.p_dropout = p_dropout
 79 | 
 80 |     self.drop = nn.Dropout(p_dropout)
 81 |     self.convs_sep = nn.ModuleList()
 82 |     self.convs_1x1 = nn.ModuleList()
 83 |     self.norms_1 = nn.ModuleList()
 84 |     self.norms_2 = nn.ModuleList()
 85 |     for i in range(n_layers):
 86 |       dilation = kernel_size ** i
 87 |       padding = (kernel_size * dilation - dilation) // 2
 88 |       self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, 
 89 |           groups=channels, dilation=dilation, padding=padding
 90 |       ))
 91 |       self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
 92 |       self.norms_1.append(LayerNorm(channels))
 93 |       self.norms_2.append(LayerNorm(channels))
 94 | 
 95 |   def forward(self, x, x_mask, g=None):
 96 |     if g is not None:
 97 |       x = x + g
 98 |     for i in range(self.n_layers):
 99 |       y = self.convs_sep[i](x * x_mask)
100 |       y = self.norms_1[i](y)
101 |       y = F.gelu(y)
102 |       y = self.convs_1x1[i](y)
103 |       y = self.norms_2[i](y)
104 |       y = F.gelu(y)
105 |       y = self.drop(y)
106 |       x = x + y
107 |     return x * x_mask
108 | 
109 | 
class WN(torch.nn.Module):
  """Stack of weight-normalised dilated convolutions with residual and skip paths.

  Layer ``i`` uses dilation ``dilation_rate ** i``. When ``gin_channels`` is
  non-zero a 1x1 ``cond_layer`` maps the conditioning input ``g`` to one slice
  of ``2 * hidden_channels`` channels per layer.
  """

  def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
    super(WN, self).__init__()
    assert(kernel_size % 2 == 1)
    self.hidden_channels = hidden_channels
    # Plain int, like the other hyper-parameter attributes (not a 1-tuple).
    self.kernel_size = kernel_size
    self.dilation_rate = dilation_rate
    self.n_layers = n_layers
    self.gin_channels = gin_channels
    self.p_dropout = p_dropout

    self.in_layers = torch.nn.ModuleList()
    self.res_skip_layers = torch.nn.ModuleList()
    self.drop = nn.Dropout(p_dropout)

    if gin_channels != 0:
      cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
      self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')

    for i in range(n_layers):
      dilation = dilation_rate ** i
      # Padding that keeps the sequence length for an odd kernel size.
      padding = int((kernel_size * dilation - dilation) / 2)
      in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
                                 dilation=dilation, padding=padding)
      in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
      self.in_layers.append(in_layer)

      # last one is not necessary: the final layer only feeds the skip output
      if i < n_layers - 1:
        res_skip_channels = 2 * hidden_channels
      else:
        res_skip_channels = hidden_channels

      res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
      res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
      self.res_skip_layers.append(res_skip_layer)

  def forward(self, x, x_mask, g=None, **kwargs):
    """Return the masked sum of the skip outputs of all layers.

    Args:
      x: input with ``hidden_channels`` channels on dimension 1.
      x_mask: mask multiplied into the residual stream and the output.
      g: optional conditioning input; requires ``gin_channels != 0``.
    """
    output = torch.zeros_like(x)
    n_channels_tensor = torch.IntTensor([self.hidden_channels])

    if g is not None:
      g = self.cond_layer(g)

    for i in range(self.n_layers):
      x_in = self.in_layers[i](x)
      if g is not None:
        # Slice of the conditioning projection that belongs to this layer.
        cond_offset = i * 2 * self.hidden_channels
        g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
      else:
        g_l = torch.zeros_like(x_in)

      acts = commons.fused_add_tanh_sigmoid_multiply(
          x_in,
          g_l,
          n_channels_tensor)
      acts = self.drop(acts)

      res_skip_acts = self.res_skip_layers[i](acts)
      if i < self.n_layers - 1:
        # First half updates the residual stream, second half is the skip term.
        res_acts = res_skip_acts[:,:self.hidden_channels,:]
        x = (x + res_acts) * x_mask
        output = output + res_skip_acts[:,self.hidden_channels:,:]
      else:
        output = output + res_skip_acts
    return output * x_mask

  def remove_weight_norm(self):
    """Strip the weight-norm reparametrisation from every convolution."""
    if self.gin_channels != 0:
      torch.nn.utils.remove_weight_norm(self.cond_layer)
    for l in self.in_layers:
      torch.nn.utils.remove_weight_norm(l)
    for l in self.res_skip_layers:
      torch.nn.utils.remove_weight_norm(l)
184 | 
185 | 
class ResBlock1(torch.nn.Module):
    """Three residual units, each a dilated conv followed by a dilation-1 conv."""

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()
        # First conv of each unit uses the matching entry of ``dilation``.
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1,
                               dilation=dilation[idx],
                               padding=get_padding(kernel_size, dilation[idx])))
            for idx in range(3)
        ])
        self.convs1.apply(init_weights)

        # Second conv of each unit always uses dilation 1.
        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
            for _ in range(3)
        ])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply the residual units; ``x_mask`` (if given) is applied before each conv and at the end."""
        masked = x_mask is not None
        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if masked:
                xt = xt * x_mask
            xt = F.leaky_relu(dilated_conv(xt), LRELU_SLOPE)
            if masked:
                xt = xt * x_mask
            x = plain_conv(xt) + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip the weight-norm reparametrisation from every convolution."""
        for conv in list(self.convs1) + list(self.convs2):
            remove_weight_norm(conv)
229 | 
230 | 
class ResBlock2(torch.nn.Module):
    """Two residual units, each made of a single dilated convolution."""

    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1,
                               dilation=dilation[idx],
                               padding=get_padding(kernel_size, dilation[idx])))
            for idx in range(2)
        ])
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply the residual units; ``x_mask`` (if given) is applied before each conv and at the end."""
        masked = x_mask is not None
        for conv in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if masked:
                xt = xt * x_mask
            x = conv(xt) + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip the weight-norm reparametrisation from every convolution."""
        for conv in self.convs:
            remove_weight_norm(conv)
256 | 
257 | 
class Log(nn.Module):
  """Element-wise logarithm layer; ``reverse=True`` applies the exponential instead."""

  def forward(self, x, x_mask, reverse=False, **kwargs):
    if reverse:
      return torch.exp(x) * x_mask
    # Inputs are clamped to at least 1e-5 before the logarithm.
    y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
    return y, torch.sum(-y, [1, 2])
267 |     
268 | 
class Flip(nn.Module):
  """Reverse the order of dimension 1; the forward direction also returns a zero log-determinant."""

  def forward(self, x, *args, reverse=False, **kwargs):
    flipped = torch.flip(x, [1])
    if reverse:
      return flipped
    # One zero per batch item, matching the dtype and device of the input.
    logdet = flipped.new_zeros(flipped.size(0))
    return flipped, logdet
277 | 
278 | 
class ElementwiseAffine(nn.Module):
  """Per-channel affine layer ``y = m + exp(logs) * x`` with an exact inverse.

  ``m`` and ``logs`` are learnable (channels, 1) parameters initialised to zero.
  """

  def __init__(self, channels):
    super().__init__()
    self.channels = channels
    self.m = nn.Parameter(torch.zeros(channels,1))
    self.logs = nn.Parameter(torch.zeros(channels,1))

  def forward(self, x, x_mask, reverse=False, **kwargs):
    """Forward returns ``(y, logdet)``; ``reverse=True`` returns only the inverse-mapped tensor."""
    if reverse:
      return (x - self.m) * torch.exp(-self.logs) * x_mask
    y = (self.m + torch.exp(self.logs) * x) * x_mask
    # Log-determinant: ``logs`` summed over the unmasked positions.
    logdet = torch.sum(self.logs * x_mask, [1,2])
    return y, logdet
295 | 
296 | 
class ResidualCouplingLayer(nn.Module):
  """Coupling layer: the first half of the channels parameterises an affine map of the second half.

  The first half passes through unchanged. With ``mean_only=True`` only a shift
  is predicted and the log-scale is fixed at zero.
  """

  def __init__(self,
      channels,
      hidden_channels,
      kernel_size,
      dilation_rate,
      n_layers,
      p_dropout=0,
      gin_channels=0,
      mean_only=False):
    assert channels % 2 == 0, "channels should be divisible by 2"
    super().__init__()
    self.channels = channels
    self.hidden_channels = hidden_channels
    self.kernel_size = kernel_size
    self.dilation_rate = dilation_rate
    self.n_layers = n_layers
    self.half_channels = channels // 2
    self.mean_only = mean_only

    self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
    self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
    # One output group (shift) when mean_only, otherwise two (shift and log-scale).
    n_stat_groups = 2 - mean_only
    self.post = nn.Conv1d(hidden_channels, self.half_channels * n_stat_groups, 1)
    # Zero initialisation makes the layer start out as the identity on unmasked positions.
    self.post.weight.data.zero_()
    self.post.bias.data.zero_()

  def forward(self, x, x_mask, g=None, reverse=False):
    """Forward returns ``(x, logdet)``; ``reverse=True`` returns only the inverse-mapped tensor."""
    x0, x1 = torch.split(x, [self.half_channels]*2, 1)
    h = self.enc(self.pre(x0) * x_mask, x_mask, g=g)
    stats = self.post(h) * x_mask
    if self.mean_only:
      m = stats
      logs = torch.zeros_like(m)
    else:
      m, logs = torch.split(stats, [self.half_channels]*2, 1)

    if reverse:
      x1 = (x1 - m) * torch.exp(-logs) * x_mask
      return torch.cat([x0, x1], 1)

    x1 = m + x1 * torch.exp(logs) * x_mask
    logdet = torch.sum(logs, [1,2])
    return torch.cat([x0, x1], 1), logdet
343 | 


--------------------------------------------------------------------------------
/preprocess_flist.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | from tqdm import tqdm
 4 | from random import shuffle
 5 | 
 6 | 
 7 | if __name__ == "__main__":
 8 |     parser = argparse.ArgumentParser()
 9 |     parser.add_argument("--train_list", type=str, default="./filelists/train.txt", help="path to train list")
10 |     parser.add_argument("--val_list", type=str, default="./filelists/val.txt", help="path to val list")
11 |     parser.add_argument("--test_list", type=str, default="./filelists/test.txt", help="path to test list")
12 |     parser.add_argument("--source_dir", type=str, default="./dataset/48k", help="path to source dir")
13 |     args = parser.parse_args()
14 |     
15 |     train = []
16 |     val = []
17 |     test = []
18 |     idx = 0
19 |     
20 |     for speaker in tqdm(os.listdir(args.source_dir)):
21 |         wavs = [os.path.join(args.source_dir, speaker, i)for i in os.listdir(os.path.join(args.source_dir, speaker))]
22 |         wavs = [i for i in wavs if i.endswith("wav")]
23 |         shuffle(wavs)
24 |         train += wavs[2:-10]
25 |         val += wavs[:2]
26 |         test += wavs[-10:]
27 |         
28 |     shuffle(train)
29 |     shuffle(val)
30 |     shuffle(test)
31 |             
32 |     print("Writing", args.train_list)
33 |     with open(args.train_list, "w") as f:
34 |         for fname in tqdm(train):
35 |             wavpath = fname
36 |             f.write(wavpath + "\n")
37 |         
38 |     print("Writing", args.val_list)
39 |     with open(args.val_list, "w") as f:
40 |         for fname in tqdm(val):
41 |             wavpath = fname
42 |             f.write(wavpath + "\n")
43 |             
44 |     print("Writing", args.test_list)
45 |     with open(args.test_list, "w") as f:
46 |         for fname in tqdm(test):
47 |             wavpath = fname
48 |             f.write(wavpath + "\n")
49 |             


--------------------------------------------------------------------------------
/preprocess_hubert_f0.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import argparse
  3 | 
  4 | import torch
  5 | import json
  6 | from glob import glob
  7 | 
  8 | from pyworld import pyworld
  9 | from tqdm import tqdm
 10 | from scipy.io import wavfile
 11 | 
 12 | import utils
 13 | from mel_processing import mel_spectrogram_torch
 14 | #import h5py
 15 | import logging
 16 | logging.getLogger('numba').setLevel(logging.WARNING)
 17 | 
 18 | import parselmouth
 19 | import librosa
 20 | import numpy as np
 21 | def stft(y):
 22 |     return librosa.stft(
 23 |         y=y,
 24 |         n_fft=1280,
 25 |         hop_length=320,
 26 |         win_length=1280,
 27 |     )
 28 | 
 29 | def energy(y):
 30 |     # Extract energy
 31 |     S = librosa.magphase(stft(y))[0]
 32 |     e = np.sqrt(np.sum(S ** 2, axis=0))  # np.linalg.norm(S, axis=0)
 33 |     return e.squeeze()  # (Number of frames) => (654,)
 34 | 
 35 | def get_energy(path, p_len=None):
 36 |     wav, sr = librosa.load(path, 48000)
 37 |     e = energy(wav)
 38 |     if p_len is None:
 39 |         p_len = wav.shape[0] // 320
 40 |     assert e.shape[0] -p_len <2 ,(e.shape[0] ,p_len)
 41 |     e = e[: p_len]
 42 |     return e
 43 | 
 44 | 
 45 | 
 46 | def get_f0(path,p_len=None, f0_up_key=0):
 47 |     x, _ = librosa.load(path, 48000)
 48 |     if p_len is None:
 49 |         p_len = x.shape[0]//320
 50 |     else:
 51 |         assert abs(p_len-x.shape[0]//320) < 3, (path, p_len, x.shape)
 52 |     time_step = 320 / 48000 * 1000
 53 |     f0_min = 50
 54 |     f0_max = 1100
 55 |     f0_mel_min = 1127 * np.log(1 + f0_min / 700)
 56 |     f0_mel_max = 1127 * np.log(1 + f0_max / 700)
 57 | 
 58 |     f0 = parselmouth.Sound(x, 48000).to_pitch_ac(
 59 |         time_step=time_step / 1000, voicing_threshold=0.6,
 60 |         pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
 61 | 
 62 |     pad_size=(p_len - len(f0) + 1) // 2
 63 |     if(pad_size>0 or p_len - len(f0) - pad_size>0):
 64 |         f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
 65 | 
 66 |     f0bak = f0.copy()
 67 |     f0 *= pow(2, f0_up_key / 12)
 68 |     f0_mel = 1127 * np.log(1 + f0 / 700)
 69 |     f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
 70 |     f0_mel[f0_mel <= 1] = 1
 71 |     f0_mel[f0_mel > 255] = 255
 72 |     f0_coarse = np.rint(f0_mel).astype(np.int)
 73 |     return f0_coarse, f0bak
 74 | 
 75 | def resize2d(x, target_len):
 76 |     source = np.array(x)
 77 |     source[source<0.001] = np.nan
 78 |     target = np.interp(np.arange(0, len(source)*target_len, len(source))/ target_len, np.arange(0, len(source)), source)
 79 |     res = np.nan_to_num(target)
 80 |     return res
 81 | 
 82 | def compute_f0(path, c_len):
 83 |     x, sr = librosa.load(path, sr=48000)
 84 |     f0, t = pyworld.dio(
 85 |         x.astype(np.double),
 86 |         fs=sr,
 87 |         f0_ceil=800,
 88 |         frame_period=1000 * 320 / sr,
 89 |     )
 90 |     f0 = pyworld.stonemask(x.astype(np.double), f0, t, 48000)
 91 |     for index, pitch in enumerate(f0):
 92 |         f0[index] = round(pitch, 1)
 93 |     assert abs(c_len - x.shape[0]//320) < 3, (c_len, f0.shape)
 94 | 
 95 |     return None, resize2d(f0, c_len)
 96 | 
 97 | 
 98 | def process(filename):
 99 |     print(filename)
100 |     save_name = filename+".soft.pt"
101 |     if not os.path.exists(save_name):
102 |         devive = torch.device("cuda" if torch.cuda.is_available() else "cpu")
103 |         wav, _ = librosa.load(filename, sr=16000)
104 |         wav = torch.from_numpy(wav).unsqueeze(0).to(devive)
105 |         c = utils.get_hubert_content(hmodel, wav)
106 |         torch.save(c.cpu(), save_name)
107 |     else:
108 |         c = torch.load(save_name)
109 |     f0path = filename+".f0.npy"
110 |     if not os.path.exists(f0path):
111 |         cf0, f0 = compute_f0(filename, c.shape[-1] * 3)
112 |         np.save(f0path, f0)
113 | 
114 | 
115 | 
if __name__ == "__main__":
    # Run ``process`` on every wav found one directory level below ``--in_dir``.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in_dir", type=str, default="dataset/48k", help="path to input dir")
    args = parser.parse_args()

    print("Loading hubert for content...")
    hubert_device = 0 if torch.cuda.is_available() else None
    # ``hmodel`` stays a module-level name because ``process`` reads it as a global.
    hmodel = utils.get_hubert_model(hubert_device)
    print("Loaded hubert.")

    wav_pattern = f'{args.in_dir}/*/*.wav'
    for filename in tqdm(glob(wav_pattern, recursive=True)):
        process(filename)
129 |     


--------------------------------------------------------------------------------
/raw/wav_structure.txt:
--------------------------------------------------------------------------------
 1 | 数据集准备
 2 | 
 3 | raw
 4 | ├───speaker0
 5 | │   ├───xxx1-xxx1.wav
 6 | │   ├───...
 7 | │   └───Lxx-0xx8.wav
 8 | └───speaker1
 9 |     ├───xx2-0xxx2.wav
10 |     ├───...
11 |     └───xxx7-xxx007.wav
12 | 
13 | 此外还需要编辑config.json
14 | 
15 | "n_speakers": 10
16 | 
17 | "spk":{
18 |     "speaker0": 0,
19 |     "speaker1": 1,
20 | }
21 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | glob2==0.7
2 | tqdm==4.62.3
3 | librosa==0.8.1
4 | numpy==1.21.6
5 | scipy==1.7.2
6 | tensorboard==2.7.0
7 | torch==1.10.0
8 | torchvision==0.9.0


--------------------------------------------------------------------------------
/resources/infer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/innnky/FreeSVC/47f589e855ac1a4861985e4cb42586b573fedbff/resources/infer.png


--------------------------------------------------------------------------------
/resources/train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/innnky/FreeSVC/47f589e855ac1a4861985e4cb42586b573fedbff/resources/train.png


--------------------------------------------------------------------------------
/speaker_encoder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/innnky/FreeSVC/47f589e855ac1a4861985e4cb42586b573fedbff/speaker_encoder/__init__.py


--------------------------------------------------------------------------------
/speaker_encoder/audio.py:
--------------------------------------------------------------------------------
  1 | from scipy.ndimage.morphology import binary_dilation
  2 | from speaker_encoder.params_data import *
  3 | from pathlib import Path
  4 | from typing import Optional, Union
  5 | import numpy as np
  6 | import webrtcvad
  7 | import librosa
  8 | import struct
  9 | 
 10 | int16_max = (2 ** 15) - 1
 11 | 
 12 | 
 13 | def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
 14 |                    source_sr: Optional[int] = None):
 15 |     """
 16 |     Applies the preprocessing operations used in training the Speaker Encoder to a waveform 
 17 |     either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
 18 | 
 19 |     :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not 
 20 |     just .wav), either the waveform as a numpy array of floats.
 21 |     :param source_sr: if passing an audio waveform, the sampling rate of the waveform before 
 22 |     preprocessing. After preprocessing, the waveform's sampling rate will match the data 
 23 |     hyperparameters. If passing a filepath, the sampling rate will be automatically detected and 
 24 |     this argument will be ignored.
 25 |     """
 26 |     # Load the wav from disk if needed
 27 |     if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
 28 |         wav, source_sr = librosa.load(fpath_or_wav, sr=None)
 29 |     else:
 30 |         wav = fpath_or_wav
 31 |     
 32 |     # Resample the wav if needed
 33 |     if source_sr is not None and source_sr != sampling_rate:
 34 |         wav = librosa.resample(wav, source_sr, sampling_rate)
 35 |         
 36 |     # Apply the preprocessing: normalize volume and shorten long silences 
 37 |     wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
 38 |     wav = trim_long_silences(wav)
 39 |     
 40 |     return wav
 41 | 
 42 | 
 43 | def wav_to_mel_spectrogram(wav):
 44 |     """
 45 |     Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
 46 |     Note: this not a log-mel spectrogram.
 47 |     """
 48 |     frames = librosa.feature.melspectrogram(
 49 |         y=wav,
 50 |         sr=sampling_rate,
 51 |         n_fft=int(sampling_rate * mel_window_length / 1000),
 52 |         hop_length=int(sampling_rate * mel_window_step / 1000),
 53 |         n_mels=mel_n_channels
 54 |     )
 55 |     return frames.astype(np.float32).T
 56 | 
 57 | 
 58 | def trim_long_silences(wav):
 59 |     """
 60 |     Ensures that segments without voice in the waveform remain no longer than a 
 61 |     threshold determined by the VAD parameters in params.py.
 62 | 
 63 |     :param wav: the raw waveform as a numpy array of floats 
 64 |     :return: the same waveform with silences trimmed away (length <= original wav length)
 65 |     """
 66 |     # Compute the voice detection window size
 67 |     samples_per_window = (vad_window_length * sampling_rate) // 1000
 68 |     
 69 |     # Trim the end of the audio to have a multiple of the window size
 70 |     wav = wav[:len(wav) - (len(wav) % samples_per_window)]
 71 |     
 72 |     # Convert the float waveform to 16-bit mono PCM
 73 |     pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
 74 |     
 75 |     # Perform voice activation detection
 76 |     voice_flags = []
 77 |     vad = webrtcvad.Vad(mode=3)
 78 |     for window_start in range(0, len(wav), samples_per_window):
 79 |         window_end = window_start + samples_per_window
 80 |         voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
 81 |                                          sample_rate=sampling_rate))
 82 |     voice_flags = np.array(voice_flags)
 83 |     
 84 |     # Smooth the voice detection with a moving average
 85 |     def moving_average(array, width):
 86 |         array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
 87 |         ret = np.cumsum(array_padded, dtype=float)
 88 |         ret[width:] = ret[width:] - ret[:-width]
 89 |         return ret[width - 1:] / width
 90 |     
 91 |     audio_mask = moving_average(voice_flags, vad_moving_average_width)
 92 |     audio_mask = np.round(audio_mask).astype(np.bool)
 93 |     
 94 |     # Dilate the voiced regions
 95 |     audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
 96 |     audio_mask = np.repeat(audio_mask, samples_per_window)
 97 |     
 98 |     return wav[audio_mask == True]
 99 | 
100 | 
def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
    """
    Scales a waveform so that its mean power matches a target level.

    :param wav: the waveform as a numpy array of floats
    :param target_dBFS: the level to reach, in dBFS
    :param increase_only: if True, the waveform is returned unchanged when reaching the target
    would require lowering the volume
    :param decrease_only: if True, the waveform is returned unchanged when reaching the target
    would require raising the volume
    :return: the scaled waveform, or the input itself when no scaling applies
    :raises ValueError: if both increase_only and decrease_only are set
    """
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")

    # An empty or perfectly silent waveform has no defined level: log10(0) is -inf, and applying
    # the resulting infinite gain would fill the output with NaNs. Leave such input untouched.
    mean_square = np.mean(wav ** 2) if np.size(wav) else 0.0
    if mean_square == 0:
        return wav

    dBFS_change = target_dBFS - 10 * np.log10(mean_square)
    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
        return wav
    return wav * (10 ** (dBFS_change / 20))
108 | 


--------------------------------------------------------------------------------
/speaker_encoder/ckpt/pretrained_bak_5805000.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/innnky/FreeSVC/47f589e855ac1a4861985e4cb42586b573fedbff/speaker_encoder/ckpt/pretrained_bak_5805000.pt


--------------------------------------------------------------------------------
/speaker_encoder/ckpt/pretrained_bak_5805000.pt.txt:
--------------------------------------------------------------------------------
1 | https://github.com/liusongxiang/ppg-vc/tree/main/speaker_encoder/ckpt


--------------------------------------------------------------------------------
/speaker_encoder/compute_embed.py:
--------------------------------------------------------------------------------
 1 | from speaker_encoder import inference as encoder
 2 | from multiprocessing.pool import Pool
 3 | from functools import partial
 4 | from pathlib import Path
 5 | # from utils import logmmse
 6 | # from tqdm import tqdm
 7 | # import numpy as np
 8 | # import librosa
 9 | 
10 | 
def embed_utterance(fpaths, encoder_model_fpath):
    """
    Computes the speaker embedding of one utterance and saves it to disk.

    :param fpaths: a tuple (wav_fpath, embed_fpath): the .npy file holding the waveform to embed
    and the destination file for the resulting embedding
    :param encoder_model_fpath: path to the encoder weights, loaded on first use in the current
    process
    """
    # The module-level numpy import is commented out, so import it here to keep this function
    # self-contained.
    import numpy as np

    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
21 |     
22 |  
def create_embeddings(outdir_root: Path, wav_dir: Path, encoder_model_fpath: Path, n_processes: int):
    """
    Computes and saves a speaker embedding for every utterance listed in <outdir_root>/train.txt.

    :param outdir_root: directory containing the "audio" directory and the "train.txt" metadata
    file; embeddings are written to its "embeds" subdirectory
    :param wav_dir: currently unused, see the note below
    :param encoder_model_fpath: path to the encoder weights
    :param n_processes: number of worker processes
    """
    # NOTE(review): the wav_dir argument has always been overridden by this assignment; that
    # behaviour is kept as is. Confirm whether callers expect their own wav_dir to be honoured.
    wav_dir = outdir_root.joinpath("audio")
    metadata_fpath = outdir_root.joinpath("train.txt")
    assert wav_dir.exists() and metadata_fpath.exists()
    embed_dir = outdir_root.joinpath("embeds")
    embed_dir.mkdir(exist_ok=True)

    # Gather the input wave filepath and the target output embed filepath. Lines are stripped of
    # their newline so that it cannot leak into a filename, and blank lines are skipped.
    with metadata_fpath.open("r") as metadata_file:
        metadata = [line.rstrip("\n").split("|") for line in metadata_file if line.strip()]
    fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]

    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
    # Embed the utterances in separate processes. The pool is used as a context manager so that
    # the workers are released once every result has been consumed.
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    n_total = len(fpaths)
    with Pool(n_processes) as pool:
        for n_done, _ in enumerate(pool.imap(func, fpaths), start=1):
            if n_done % 100 == 0 or n_done == n_total:
                print("Embedding: %d/%d utterances" % (n_done, n_total))


--------------------------------------------------------------------------------
/speaker_encoder/config.py:
--------------------------------------------------------------------------------
# Dataset directory tables. The strings look like directory paths relative to a common datasets
# root -- presumably resolved by the preprocessing code; verify against the callers.

# LibriSpeech subsets, keyed by split ("train"/"test"/"dev") and then by "clean"/"other".
librispeech_datasets = {
    "train": {
        "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"],
        "other": ["LibriSpeech/train-other-500"]
    },
    "test": {
        "clean": ["LibriSpeech/test-clean"],
        "other": ["LibriSpeech/test-other"]
    },
    "dev": {
        "clean": ["LibriSpeech/dev-clean"],
        "other": ["LibriSpeech/dev-other"]
    },
}
# LibriTTS subsets, with the same layout as librispeech_datasets.
libritts_datasets = {
    "train": {
        "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"],
        "other": ["LibriTTS/train-other-500"]
    },
    "test": {
        "clean": ["LibriTTS/test-clean"],
        "other": ["LibriTTS/test-other"]
    },
    "dev": {
        "clean": ["LibriTTS/dev-clean"],
        "other": ["LibriTTS/dev-other"]
    },
}
# VoxCeleb subsets, keyed by corpus ("voxceleb1"/"voxceleb2") and then by split.
voxceleb_datasets = {
    "voxceleb1" : {
        "train": ["VoxCeleb1/wav"],
        "test": ["VoxCeleb1/test_wav"]
    },
    "voxceleb2" : {
        "train": ["VoxCeleb2/dev/aac"],
        "test": ["VoxCeleb2/test_wav"]
    }
}

# Additional datasets that are not organised into splits.
other_datasets = [
    "LJSpeech-1.1",
    "VCTK-Corpus/wav48",
]

# Lower-case nationality labels treated as anglophone. The identifier's spelling
# ("nationalites") is kept as is because other modules may import it by this name.
anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]
46 | 


--------------------------------------------------------------------------------
/speaker_encoder/data_objects/__init__.py:
--------------------------------------------------------------------------------
1 | from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
2 | from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader
3 | 


--------------------------------------------------------------------------------
/speaker_encoder/data_objects/random_cycler.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | class RandomCycler:
 4 |     """
 5 |     Creates an internal copy of a sequence and allows access to its items in a constrained random 
 6 |     order. For a source sequence of n items and one or several consecutive queries of a total 
 7 |     of m items, the following guarantees hold (one implies the other):
 8 |         - Each item will be returned between m // n and ((m - 1) // n) + 1 times.
 9 |         - Between two appearances of the same item, there may be at most 2 * (n - 1) other items.
10 |     """
11 |     
12 |     def __init__(self, source):
13 |         if len(source) == 0:
14 |             raise Exception("Can't create RandomCycler from an empty collection")
15 |         self.all_items = list(source)
16 |         self.next_items = []
17 |     
18 |     def sample(self, count: int):
19 |         shuffle = lambda l: random.sample(l, len(l))
20 |         
21 |         out = []
22 |         while count > 0:
23 |             if count >= len(self.all_items):
24 |                 out.extend(shuffle(list(self.all_items)))
25 |                 count -= len(self.all_items)
26 |                 continue
27 |             n = min(count, len(self.next_items))
28 |             out.extend(self.next_items[:n])
29 |             count -= n
30 |             self.next_items = self.next_items[n:]
31 |             if len(self.next_items) == 0:
32 |                 self.next_items = shuffle(list(self.all_items))
33 |         return out
34 |     
35 |     def __next__(self):
36 |         return self.sample(1)[0]
37 | 
38 | 


--------------------------------------------------------------------------------
/speaker_encoder/data_objects/speaker.py:
--------------------------------------------------------------------------------
 1 | from speaker_encoder.data_objects.random_cycler import RandomCycler
 2 | from speaker_encoder.data_objects.utterance import Utterance
 3 | from pathlib import Path
 4 | 
 5 | # Contains the set of utterances of a single speaker
 6 | class Speaker:
 7 |     def __init__(self, root: Path):
 8 |         self.root = root
 9 |         self.name = root.name
10 |         self.utterances = None
11 |         self.utterance_cycler = None
12 |         
13 |     def _load_utterances(self):
14 |         with self.root.joinpath("_sources.txt").open("r") as sources_file:
15 |             sources = [l.split(",") for l in sources_file]
16 |         sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
17 |         self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
18 |         self.utterance_cycler = RandomCycler(self.utterances)
19 |                
20 |     def random_partial(self, count, n_frames):
21 |         """
22 |         Samples a batch of <count> unique partial utterances from the disk in a way that all 
23 |         utterances come up at least once every two cycles and in a random order every time.
24 |         
25 |         :param count: The number of partial utterances to sample from the set of utterances from 
26 |         that speaker. Utterances are guaranteed not to be repeated if <count> is not larger than 
27 |         the number of utterances available.
28 |         :param n_frames: The number of frames in the partial utterance.
29 |         :return: A list of tuples (utterance, frames, range) where utterance is an Utterance, 
30 |         frames are the frames of the partial utterances and range is the range of the partial 
31 |         utterance with regard to the complete utterance.
32 |         """
33 |         if self.utterances is None:
34 |             self._load_utterances()
35 | 
36 |         utterances = self.utterance_cycler.sample(count)
37 | 
38 |         a = [(u,) + u.random_partial(n_frames) for u in utterances]
39 | 
40 |         return a
41 | 


--------------------------------------------------------------------------------
/speaker_encoder/data_objects/speaker_batch.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from typing import List
 3 | from speaker_encoder.data_objects.speaker import Speaker
 4 | 
 5 | class SpeakerBatch:
 6 |     def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
 7 |         self.speakers = speakers
 8 |         self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers}
 9 |         
10 |         # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with
11 |         # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40)
12 |         self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]])
13 | 


--------------------------------------------------------------------------------
/speaker_encoder/data_objects/speaker_verification_dataset.py:
--------------------------------------------------------------------------------
 1 | from speaker_encoder.data_objects.random_cycler import RandomCycler
 2 | from speaker_encoder.data_objects.speaker_batch import SpeakerBatch
 3 | from speaker_encoder.data_objects.speaker import Speaker
 4 | from speaker_encoder.params_data import partials_n_frames
 5 | from torch.utils.data import Dataset, DataLoader
 6 | from pathlib import Path
 7 | 
 8 | # TODO: improve with a pool of speakers for data efficiency
 9 | 
class SpeakerVerificationDataset(Dataset):
    """
    Dataset over the speaker directories found directly under <datasets_root>. Indexing ignores
    the index and returns the next speaker from a random cycler.
    """

    def __init__(self, datasets_root: Path):
        self.root = datasets_root
        speaker_dirs = [entry for entry in self.root.glob("*") if entry.is_dir()]
        if not speaker_dirs:
            raise Exception("No speakers found. Make sure you are pointing to the directory "
                            "containing all preprocessed speaker directories.")
        self.speakers = list(map(Speaker, speaker_dirs))
        self.speaker_cycler = RandomCycler(self.speakers)

    def __len__(self):
        # Fixed, very large length: items come from the cycler, not from an indexable collection
        return 10 ** 10

    def __getitem__(self, index):
        speaker = next(self.speaker_cycler)
        return speaker

    def get_logs(self):
        """Returns the concatenated contents of every .txt file located in the dataset root."""
        chunks = []
        for log_fpath in self.root.glob("*.txt"):
            with log_fpath.open("r") as log_file:
                chunks.append(log_file.read())
        return "".join(chunks)
32 |     
33 |     
class SpeakerVerificationDataLoader(DataLoader):
    """
    DataLoader whose batches are SpeakerBatch objects: <speakers_per_batch> speakers are drawn
    from the dataset and collated with <utterances_per_speaker> partial utterances each.
    """

    def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None,
                 batch_sampler=None, num_workers=0, pin_memory=False, timeout=0,
                 worker_init_fn=None):
        # Stored before the parent constructor runs, as collate() reads it
        self.utterances_per_speaker = utterances_per_speaker

        loader_options = dict(
            batch_size=speakers_per_batch,
            shuffle=False,
            sampler=sampler,
            batch_sampler=batch_sampler,
            num_workers=num_workers,
            collate_fn=self.collate,
            pin_memory=pin_memory,
            drop_last=False,
            timeout=timeout,
            worker_init_fn=worker_init_fn,
        )
        super().__init__(dataset=dataset, **loader_options)

    def collate(self, speakers):
        batch = SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames)
        return batch
56 |     


--------------------------------------------------------------------------------
/speaker_encoder/data_objects/utterance.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | 
 4 | class Utterance:
 5 |     def __init__(self, frames_fpath, wave_fpath):
 6 |         self.frames_fpath = frames_fpath
 7 |         self.wave_fpath = wave_fpath
 8 |         
 9 |     def get_frames(self):
10 |         return np.load(self.frames_fpath)
11 | 
12 |     def random_partial(self, n_frames):
13 |         """
14 |         Crops the frames into a partial utterance of n_frames
15 |         
16 |         :param n_frames: The number of frames of the partial utterance
17 |         :return: the partial utterance frames and a tuple indicating the start and end of the 
18 |         partial utterance in the complete utterance.
19 |         """
20 |         frames = self.get_frames()
21 |         if frames.shape[0] == n_frames:
22 |             start = 0
23 |         else:
24 |             start = np.random.randint(0, frames.shape[0] - n_frames)
25 |         end = start + n_frames
26 |         return frames[start:end], (start, end)


--------------------------------------------------------------------------------
/speaker_encoder/hparams.py:
--------------------------------------------------------------------------------
# Hyperparameters of the speaker encoder: mel front-end, audio, VAD, volume normalization and
# model sizes.

## Mel-filterbank
mel_window_length = 25  # In milliseconds
mel_window_step = 10    # In milliseconds
mel_n_channels = 40     # Number of mel channels


## Audio
sampling_rate = 16000   # In Hz
# Number of spectrogram frames in a partial utterance
partials_n_frames = 160     # 1600 ms (160 frames at a 10 ms step)


## Voice Activation Detection
# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
# This sets the granularity of the VAD. Should not need to be changed.
vad_window_length = 30  # In milliseconds
# Number of frames to average together when performing the moving average smoothing.
# The larger this value, the larger the VAD variations must be to not get smoothed out. 
vad_moving_average_width = 8
# Maximum number of consecutive silent frames a segment can have.
vad_max_silence_length = 6


## Audio volume normalization
audio_norm_target_dBFS = -30    # Target level, in dBFS


## Model parameters
model_hidden_size = 256
model_embedding_size = 256
model_num_layers = 3


--------------------------------------------------------------------------------
/speaker_encoder/inference.py:
--------------------------------------------------------------------------------
  1 | from speaker_encoder.params_data import *
  2 | from speaker_encoder.model import SpeakerEncoder
  3 | from speaker_encoder.audio import preprocess_wav   # We want to expose this function from here
  4 | from matplotlib import cm
  5 | from speaker_encoder import audio
  6 | from pathlib import Path
  7 | import matplotlib.pyplot as plt
  8 | import numpy as np
  9 | import torch
 10 | 
# Module-level state shared by the functions below: the loaded encoder and the device it runs
# on. Both stay None until load_model() is called.
_model = None # type: SpeakerEncoder
_device = None # type: torch.device
 13 | 
 14 | 
 15 | def load_model(weights_fpath: Path, device=None):
 16 |     """
 17 |     Loads the model in memory. If this function is not explicitely called, it will be run on the 
 18 |     first call to embed_frames() with the default weights file.
 19 |     
 20 |     :param weights_fpath: the path to saved model weights.
 21 |     :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The 
 22 |     model will be loaded and will run on this device. Outputs will however always be on the cpu. 
 23 |     If None, will default to your GPU if it"s available, otherwise your CPU.
 24 |     """
 25 |     # TODO: I think the slow loading of the encoder might have something to do with the device it
 26 |     #   was saved on. Worth investigating.
 27 |     global _model, _device
 28 |     if device is None:
 29 |         _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 30 |     elif isinstance(device, str):
 31 |         _device = torch.device(device)
 32 |     _model = SpeakerEncoder(_device, torch.device("cpu"))
 33 |     checkpoint = torch.load(weights_fpath)
 34 |     _model.load_state_dict(checkpoint["model_state"])
 35 |     _model.eval()
 36 |     print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
 37 |     
 38 |     
 39 | def is_loaded():
 40 |     return _model is not None
 41 | 
 42 | 
 43 | def embed_frames_batch(frames_batch):
 44 |     """
 45 |     Computes embeddings for a batch of mel spectrogram.
 46 |     
 47 |     :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape 
 48 |     (batch_size, n_frames, n_channels)
 49 |     :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
 50 |     """
 51 |     if _model is None:
 52 |         raise Exception("Model was not loaded. Call load_model() before inference.")
 53 |     
 54 |     frames = torch.from_numpy(frames_batch).to(_device)
 55 |     embed = _model.forward(frames).detach().cpu().numpy()
 56 |     return embed
 57 | 
 58 | 
 59 | def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
 60 |                            min_pad_coverage=0.75, overlap=0.5):
 61 |     """
 62 |     Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain 
 63 |     partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel 
 64 |     spectrogram slices are returned, so as to make each partial utterance waveform correspond to 
 65 |     its spectrogram. This function assumes that the mel spectrogram parameters used are those 
 66 |     defined in params_data.py.
 67 |     
 68 |     The returned ranges may be indexing further than the length of the waveform. It is 
 69 |     recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
 70 |     
 71 |     :param n_samples: the number of samples in the waveform
 72 |     :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial 
 73 |     utterance
 74 |     :param min_pad_coverage: when reaching the last partial utterance, it may or may not have 
 75 |     enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present, 
 76 |     then the last partial utterance will be considered, as if we padded the audio. Otherwise, 
 77 |     it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial 
 78 |     utterance, this parameter is ignored so that the function always returns at least 1 slice.
 79 |     :param overlap: by how much the partial utterance should overlap. If set to 0, the partial 
 80 |     utterances are entirely disjoint. 
 81 |     :return: the waveform slices and mel spectrogram slices as lists of array slices. Index 
 82 |     respectively the waveform and the mel spectrogram with these slices to obtain the partial 
 83 |     utterances.
 84 |     """
 85 |     assert 0 <= overlap < 1
 86 |     assert 0 < min_pad_coverage <= 1
 87 |     
 88 |     samples_per_frame = int((sampling_rate * mel_window_step / 1000))
 89 |     n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
 90 |     frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
 91 | 
 92 |     # Compute the slices
 93 |     wav_slices, mel_slices = [], []
 94 |     steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
 95 |     for i in range(0, steps, frame_step):
 96 |         mel_range = np.array([i, i + partial_utterance_n_frames])
 97 |         wav_range = mel_range * samples_per_frame
 98 |         mel_slices.append(slice(*mel_range))
 99 |         wav_slices.append(slice(*wav_range))
100 |         
101 |     # Evaluate whether extra padding is warranted or not
102 |     last_wav_range = wav_slices[-1]
103 |     coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
104 |     if coverage < min_pad_coverage and len(mel_slices) > 1:
105 |         mel_slices = mel_slices[:-1]
106 |         wav_slices = wav_slices[:-1]
107 |     
108 |     return wav_slices, mel_slices
109 | 
110 | 
def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
    """
    Embeds one utterance.

    # TODO: handle multiple wavs to benefit from batching on GPU
    :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
    :param using_partials: when True, the utterance is cut into partial utterances and the
    utterance embedding is the normalized average of their embeddings. When False, the whole
    spectrogram is fed to the network at once.
    :param return_partials: when True, the partial embeddings and the wav slices they were
    computed from are returned as well.
    :param kwargs: extra arguments forwarded to compute_partial_slices()
    :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). With
    <return_partials> set, a tuple (embedding, partial embeddings of shape (n_partials,
    model_embedding_size), list of wav slices); the last two are None when <using_partials> is
    False.
    """
    if using_partials:
        # Work out the partials, zero-padding the waveform so the last one is complete
        wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
        needed_length = wave_slices[-1].stop
        if needed_length >= len(wav):
            wav = np.pad(wav, (0, needed_length - len(wav)), "constant")

        # Embed every partial in a single batch
        mel = audio.wav_to_mel_spectrogram(wav)
        partial_embeds = embed_frames_batch(np.array([mel[s] for s in mel_slices]))

        # The utterance embedding is the L2-normalized mean of the partial embeddings
        mean_embed = np.mean(partial_embeds, axis=0)
        embed = mean_embed / np.linalg.norm(mean_embed, 2)

        return (embed, partial_embeds, wave_slices) if return_partials else embed

    # Not using partials: embed the whole utterance as a batch of one
    mel = audio.wav_to_mel_spectrogram(wav)
    embed = embed_frames_batch(mel[None, ...])[0]
    return (embed, None, None) if return_partials else embed
156 | 
157 | 
def embed_speaker(wavs, **kwargs):
    """
    Placeholder for computing a speaker embedding from several utterances.

    :raises NotImplementedError: always, the function is not implemented.
    """
    # NotImplemented is a comparison sentinel, not an exception: calling it raises a TypeError.
    raise NotImplementedError()
160 | 
161 | 
def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
    """
    Draws an embedding as a heatmap with a colorbar.

    :param embed: the embedding as a numpy array
    :param ax: the matplotlib axes to draw on. Defaults to the current axes.
    :param title: the title given to the axes
    :param shape: the 2D shape the embedding is reshaped to. Defaults to
    (int(sqrt(len(embed))), -1).
    :param color_range: the (min, max) values mapped to the ends of the colormap
    """
    if ax is None:
        ax = plt.gca()

    if shape is None:
        height = int(np.sqrt(len(embed)))
        shape = (height, -1)
    embed = embed.reshape(shape)

    # plt.get_cmap() and the image's own set_clim() are used here because cm.get_cmap() and
    # Colorbar.set_clim() were removed from matplotlib.
    cmap = plt.get_cmap()
    mappable = ax.imshow(embed, cmap=cmap)
    mappable.set_clim(*color_range)
    plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)

    ax.set_xticks([]), ax.set_yticks([])
    ax.set_title(title)
178 | 


--------------------------------------------------------------------------------
/speaker_encoder/model.py:
--------------------------------------------------------------------------------
  1 | from speaker_encoder.params_model import *
  2 | from speaker_encoder.params_data import *
  3 | from scipy.interpolate import interp1d
  4 | from sklearn.metrics import roc_curve
  5 | from torch.nn.utils import clip_grad_norm_
  6 | from scipy.optimize import brentq
  7 | from torch import nn
  8 | import numpy as np
  9 | import torch
 10 | 
 11 | 
 12 | class SpeakerEncoder(nn.Module):
 13 |     def __init__(self, device, loss_device):
 14 |         super().__init__()
 15 |         self.loss_device = loss_device
 16 |         
 17 |         # Network defition
 18 |         self.lstm = nn.LSTM(input_size=mel_n_channels,     # 40
 19 |                             hidden_size=model_hidden_size, # 256 
 20 |                             num_layers=model_num_layers,   # 3 
 21 |                             batch_first=True).to(device)
 22 |         self.linear = nn.Linear(in_features=model_hidden_size, 
 23 |                                 out_features=model_embedding_size).to(device)
 24 |         self.relu = torch.nn.ReLU().to(device)
 25 |         
 26 |         # Cosine similarity scaling (with fixed initial parameter values)
 27 |         self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
 28 |         self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
 29 | 
 30 |         # Loss
 31 |         self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
 32 |         
 33 |     def do_gradient_ops(self):
 34 |         # Gradient scale
 35 |         self.similarity_weight.grad *= 0.01
 36 |         self.similarity_bias.grad *= 0.01
 37 |             
 38 |         # Gradient clipping
 39 |         clip_grad_norm_(self.parameters(), 3, norm_type=2)
 40 |     
 41 |     def forward(self, utterances, hidden_init=None):
 42 |         """
 43 |         Computes the embeddings of a batch of utterance spectrograms.
 44 |         
 45 |         :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape 
 46 |         (batch_size, n_frames, n_channels) 
 47 |         :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, 
 48 |         batch_size, hidden_size). Will default to a tensor of zeros if None.
 49 |         :return: the embeddings as a tensor of shape (batch_size, embedding_size)
 50 |         """
 51 |         # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
 52 |         # and the final cell state.
 53 |         out, (hidden, cell) = self.lstm(utterances, hidden_init)
 54 |         
 55 |         # We take only the hidden state of the last layer
 56 |         embeds_raw = self.relu(self.linear(hidden[-1]))
 57 |         
 58 |         # L2-normalize it
 59 |         embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
 60 |         
 61 |         return embeds
 62 |     
 63 |     def similarity_matrix(self, embeds):
 64 |         """
 65 |         Computes the similarity matrix according the section 2.1 of GE2E.
 66 | 
 67 |         :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 
 68 |         utterances_per_speaker, embedding_size)
 69 |         :return: the similarity matrix as a tensor of shape (speakers_per_batch,
 70 |         utterances_per_speaker, speakers_per_batch)
 71 |         """
 72 |         speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
 73 |         
 74 |         # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
 75 |         centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
 76 |         centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True)
 77 | 
 78 |         # Exclusive centroids (1 per utterance)
 79 |         centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
 80 |         centroids_excl /= (utterances_per_speaker - 1)
 81 |         centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True)
 82 | 
 83 |         # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
 84 |         # product of these vectors (which is just an element-wise multiplication reduced by a sum).
 85 |         # We vectorize the computation for efficiency.
 86 |         sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
 87 |                                  speakers_per_batch).to(self.loss_device)
 88 |         mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int)
 89 |         for j in range(speakers_per_batch):
 90 |             mask = np.where(mask_matrix[j])[0]
 91 |             sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
 92 |             sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)
 93 |         
 94 |         ## Even more vectorized version (slower maybe because of transpose)
 95 |         # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
 96 |         #                           ).to(self.loss_device)
 97 |         # eye = np.eye(speakers_per_batch, dtype=np.int)
 98 |         # mask = np.where(1 - eye)
 99 |         # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
100 |         # mask = np.where(eye)
101 |         # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
102 |         # sim_matrix2 = sim_matrix2.transpose(1, 2)
103 |         
104 |         sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
105 |         return sim_matrix
106 |     
107 |     def loss(self, embeds):
108 |         """
109 |         Computes the softmax loss according the section 2.1 of GE2E.
110 |         
111 |         :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 
112 |         utterances_per_speaker, embedding_size)
113 |         :return: the loss and the EER for this batch of embeddings.
114 |         """
115 |         speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
116 |         
117 |         # Loss
118 |         sim_matrix = self.similarity_matrix(embeds)
119 |         sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker, 
120 |                                          speakers_per_batch))
121 |         ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
122 |         target = torch.from_numpy(ground_truth).long().to(self.loss_device)
123 |         loss = self.loss_fn(sim_matrix, target)
124 |         
125 |         # EER (not backpropagated)
126 |         with torch.no_grad():
127 |             inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0]
128 |             labels = np.array([inv_argmax(i) for i in ground_truth])
129 |             preds = sim_matrix.detach().cpu().numpy()
130 | 
131 |             # Snippet from https://yangcha.github.io/EER-ROC/
132 |             fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())           
133 |             eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
134 |             
135 |         return loss, eer


--------------------------------------------------------------------------------
/speaker_encoder/params_data.py:
--------------------------------------------------------------------------------
 1 | 
## Mel-filterbank
# Window length and hop between consecutive windows of the mel spectrogram.
mel_window_length = 25  # In milliseconds
mel_window_step = 10    # In milliseconds
mel_n_channels = 40


## Audio
sampling_rate = 16000
# Number of spectrogram frames in a partial utterance
# (160 frames x 10 ms mel_window_step = 1600 ms)
partials_n_frames = 160     # 1600 ms
# Number of spectrogram frames at inference
# (80 frames x 10 ms mel_window_step = 800 ms)
inference_n_frames = 80     #  800 ms


## Voice Activation Detection
# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
# This sets the granularity of the VAD. Should not need to be changed.
vad_window_length = 30  # In milliseconds
# Number of frames to average together when performing the moving average smoothing.
# The larger this value, the larger the VAD variations must be to not get smoothed out.
vad_moving_average_width = 8
# Maximum number of consecutive silent frames a segment can have.
vad_max_silence_length = 6


## Audio volume normalization
# Target level for volume normalization (presumably in dBFS, going by the name).
audio_norm_target_dBFS = -30
29 | 
30 | 


--------------------------------------------------------------------------------
/speaker_encoder/params_model.py:
--------------------------------------------------------------------------------
 1 | 
## Model parameters
# NOTE(review): presumably the LSTM hidden size, the size of the output embedding and the
# number of stacked LSTM layers of the speaker encoder -- confirm against model.py.
model_hidden_size = 256
model_embedding_size = 256
model_num_layers = 3


## Training parameters
# Initial learning rate handed to the optimizer.
learning_rate_init = 1e-4
# Batch layout: each batch holds speakers_per_batch speakers with
# utterances_per_speaker utterances each.
speakers_per_batch = 64
utterances_per_speaker = 10
12 | 


--------------------------------------------------------------------------------
/speaker_encoder/preprocess.py:
--------------------------------------------------------------------------------
  1 | from multiprocess.pool import ThreadPool
  2 | from speaker_encoder.params_data import *
  3 | from speaker_encoder.config import librispeech_datasets, anglophone_nationalites
  4 | from datetime import datetime
  5 | from speaker_encoder import audio
  6 | from pathlib import Path
  7 | from tqdm import tqdm
  8 | import numpy as np
  9 | 
 10 | 
 11 | class DatasetLog:
 12 |     """
 13 |     Registers metadata about the dataset in a text file.
 14 |     """
 15 |     def __init__(self, root, name):
 16 |         self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
 17 |         self.sample_data = dict()
 18 |         
 19 |         start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
 20 |         self.write_line("Creating dataset %s on %s" % (name, start_time))
 21 |         self.write_line("-----")
 22 |         self._log_params()
 23 |         
 24 |     def _log_params(self):
 25 |         from speaker_encoder import params_data
 26 |         self.write_line("Parameter values:")
 27 |         for param_name in (p for p in dir(params_data) if not p.startswith("__")):
 28 |             value = getattr(params_data, param_name)
 29 |             self.write_line("\t%s: %s" % (param_name, value))
 30 |         self.write_line("-----")
 31 |     
 32 |     def write_line(self, line):
 33 |         self.text_file.write("%s\n" % line)
 34 |         
 35 |     def add_sample(self, **kwargs):
 36 |         for param_name, value in kwargs.items():
 37 |             if not param_name in self.sample_data:
 38 |                 self.sample_data[param_name] = []
 39 |             self.sample_data[param_name].append(value)
 40 |             
 41 |     def finalize(self):
 42 |         self.write_line("Statistics:")
 43 |         for param_name, values in self.sample_data.items():
 44 |             self.write_line("\t%s:" % param_name)
 45 |             self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
 46 |             self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
 47 |         self.write_line("-----")
 48 |         end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
 49 |         self.write_line("Finished on %s" % end_time)
 50 |         self.text_file.close()
 51 |        
 52 |         
 53 | def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
 54 |     dataset_root = datasets_root.joinpath(dataset_name)
 55 |     if not dataset_root.exists():
 56 |         print("Couldn\'t find %s, skipping this dataset." % dataset_root)
 57 |         return None, None
 58 |     return dataset_root, DatasetLog(out_dir, dataset_name)
 59 | 
 60 | 
 61 | def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
 62 |                              skip_existing, logger):
 63 |     print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
 64 |     
 65 |     # Function to preprocess utterances for one speaker
 66 |     def preprocess_speaker(speaker_dir: Path):
 67 |         # Give a name to the speaker that includes its dataset
 68 |         speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
 69 |         
 70 |         # Create an output directory with that name, as well as a txt file containing a 
 71 |         # reference to each source file.
 72 |         speaker_out_dir = out_dir.joinpath(speaker_name)
 73 |         speaker_out_dir.mkdir(exist_ok=True)
 74 |         sources_fpath = speaker_out_dir.joinpath("_sources.txt")
 75 |         
 76 |         # There's a possibility that the preprocessing was interrupted earlier, check if 
 77 |         # there already is a sources file.
 78 |         if sources_fpath.exists():
 79 |             try:
 80 |                 with sources_fpath.open("r") as sources_file:
 81 |                     existing_fnames = {line.split(",")[0] for line in sources_file}
 82 |             except:
 83 |                 existing_fnames = {}
 84 |         else:
 85 |             existing_fnames = {}
 86 |         
 87 |         # Gather all audio files for that speaker recursively
 88 |         sources_file = sources_fpath.open("a" if skip_existing else "w")
 89 |         for in_fpath in speaker_dir.glob("**/*.%s" % extension):
 90 |             # Check if the target output file already exists
 91 |             out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
 92 |             out_fname = out_fname.replace(".%s" % extension, ".npy")
 93 |             if skip_existing and out_fname in existing_fnames:
 94 |                 continue
 95 |                 
 96 |             # Load and preprocess the waveform
 97 |             wav = audio.preprocess_wav(in_fpath)
 98 |             if len(wav) == 0:
 99 |                 continue
100 |             
101 |             # Create the mel spectrogram, discard those that are too short
102 |             frames = audio.wav_to_mel_spectrogram(wav)
103 |             if len(frames) < partials_n_frames:
104 |                 continue
105 |             
106 |             out_fpath = speaker_out_dir.joinpath(out_fname)
107 |             np.save(out_fpath, frames)
108 |             logger.add_sample(duration=len(wav) / sampling_rate)
109 |             sources_file.write("%s,%s\n" % (out_fname, in_fpath))
110 |         
111 |         sources_file.close()
112 |     
113 |     # Process the utterances for each speaker
114 |     with ThreadPool(8) as pool:
115 |         list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
116 |                   unit="speakers"))
117 |     logger.finalize()
118 |     print("Done preprocessing %s.\n" % dataset_name)
119 | 
120 | 
121 | # Function to preprocess utterances for one speaker
def __preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, extension: str, skip_existing: bool):
    """
    Converts the audio files of one speaker to mel spectrograms saved as .npy files.

    An output directory named after the speaker is created under out_dir, together with a
    "_sources.txt" file mapping every saved spectrogram to its source audio file.

    NOTE: the list of already processed files is never read here, so skip_existing only
    selects whether "_sources.txt" is appended to (True) or overwritten (False).

    :return: the number of samples of the last waveform that was loaded, or 0 if the speaker
    directory contains no file with the given extension.
    """
    # Give a name to the speaker that includes its dataset
    speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

    # Create an output directory with that name, as well as a txt file containing a
    # reference to each source file.
    speaker_out_dir = out_dir.joinpath(speaker_name)
    speaker_out_dir.mkdir(exist_ok=True)
    sources_fpath = speaker_out_dir.joinpath("_sources.txt")

    existing_fnames = set()
    # Tracked explicitly so that an empty speaker directory yields 0 instead of failing on
    # an unbound local when the function returns.
    last_wav_len = 0

    # Gather all audio files for that speaker recursively. The context manager makes sure
    # the sources file is closed even if the preprocessing of a file fails.
    with sources_fpath.open("a" if skip_existing else "w") as sources_file:
        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
            # Check if the target output file already exists
            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
            out_fname = out_fname.replace(".%s" % extension, ".npy")
            if skip_existing and out_fname in existing_fnames:
                continue

            # Load and preprocess the waveform
            wav = audio.preprocess_wav(in_fpath)
            last_wav_len = len(wav)
            if len(wav) == 0:
                continue

            # Create the mel spectrogram, discard those that are too short
            frames = audio.wav_to_mel_spectrogram(wav)
            if len(frames) < partials_n_frames:
                continue

            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)
            sources_file.write("%s,%s\n" % (out_fname, in_fpath))

    return last_wav_len
170 | 
def _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
                             skip_existing, logger):
    """
    Process-pool variant of _preprocess_speaker_dirs, used for VoxCeleb2.

    Every speaker directory is converted to mel spectrograms (.npy) by a pathos pool of 20
    processes. Workers return the lengths of the waveforms they saved; the durations are
    added to the logger in the calling process, and the logger is finalized at the end.

    NOTE: the list of already processed files is never read here, so skip_existing only
    selects whether "_sources.txt" is appended to (True) or overwritten (False).

    :param speaker_dirs: the directories to process, one per speaker
    :param dataset_name: name used in the progress messages
    :param datasets_root: common root of the datasets; speaker names are derived from the
    path of the speaker directory relative to it
    :param out_dir: directory under which the speaker output directories are created
    :param extension: extension (without the dot) of the audio files to look for
    :param skip_existing: see the note above
    :param logger: a DatasetLog that receives the duration of every saved utterance
    """
    from pathos.multiprocessing import ProcessingPool as Pool

    # Function to preprocess utterances for one speaker
    def __preprocess_speaker(speaker_dir: Path):
        # Give a name to the speaker that includes its dataset
        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
        
        # Create an output directory with that name, as well as a txt file containing a 
        # reference to each source file.
        speaker_out_dir = out_dir.joinpath(speaker_name)
        speaker_out_dir.mkdir(exist_ok=True)
        sources_fpath = speaker_out_dir.joinpath("_sources.txt")
        
        existing_fnames = set()
        wav_lens = []
        # Gather all audio files for that speaker recursively. The context manager makes
        # sure the sources file is closed even if the preprocessing of a file fails.
        with sources_fpath.open("a" if skip_existing else "w") as sources_file:
            for in_fpath in speaker_dir.glob("**/*.%s" % extension):
                # Check if the target output file already exists
                out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
                out_fname = out_fname.replace(".%s" % extension, ".npy")
                if skip_existing and out_fname in existing_fnames:
                    continue
                    
                # Load and preprocess the waveform
                wav = audio.preprocess_wav(in_fpath)
                if len(wav) == 0:
                    continue
                
                # Create the mel spectrogram, discard those that are too short
                frames = audio.wav_to_mel_spectrogram(wav)
                if len(frames) < partials_n_frames:
                    continue
                
                out_fpath = speaker_out_dir.joinpath(out_fname)
                np.save(out_fpath, frames)
                sources_file.write("%s,%s\n" % (out_fname, in_fpath))
                # The logger lives in the parent process, so the lengths are returned
                # instead of being logged from the worker.
                wav_lens.append(len(wav))
        return wav_lens

    print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
    # Process the utterances for each speaker
    pool = Pool(processes=20)
    for i, wav_lens in enumerate(pool.map(__preprocess_speaker, speaker_dirs), 1):
        for wav_len in wav_lens:
            logger.add_sample(duration=wav_len / sampling_rate)
        print(f'{i}/{len(speaker_dirs)} \r')

    logger.finalize()
    print("Done preprocessing %s.\n" % dataset_name)
228 | 
229 | 
def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
    """
    Preprocesses every dataset listed in librispeech_datasets["train"]["other"].

    A dataset whose directory is missing under datasets_root is skipped; the remaining
    datasets are still processed.

    :param datasets_root: directory containing the datasets
    :param out_dir: directory receiving the spectrograms and the logs
    :param skip_existing: forwarded to _preprocess_speaker_dirs
    """
    for dataset_name in librispeech_datasets["train"]["other"]:
        # Initialize the preprocessing
        dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
        if not dataset_root:
            # Only this dataset is missing: move on to the next one, as the message printed
            # by _init_preprocess_dataset announces.
            continue
        
        # Preprocess all speakers
        speaker_dirs = list(dataset_root.glob("*"))
        _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac",
                                 skip_existing, logger)
241 | 
242 | 
def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
    """
    Preprocesses the VoxCeleb1 dataset found under datasets_root.

    The speaker IDs are read from "vox1_meta.csv"; every speaker listed there whose
    directory exists under "wav" is processed. The nationality is still read from the
    metadata, but no nationality filter is applied.

    :param datasets_root: directory containing the "VoxCeleb1" directory
    :param out_dir: directory receiving the spectrograms and the log
    :param skip_existing: forwarded to _preprocess_speaker_dirs
    """
    # Initialize the preprocessing
    dataset_name = "VoxCeleb1"
    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
    if not dataset_root:
        return

    # Get the contents of the meta file (tab-separated, the first line is the header)
    with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
        metadata = [line.split("\t") for line in metafile][1:]
    
    # Select the ID and the nationality. All speakers are kept: the anglophone-only filter
    # is disabled. A set makes the membership test below constant-time.
    nationalities = {line[0]: line[3] for line in metadata}
    keep_speaker_ids = set(nationalities)
    print("VoxCeleb1: using samples from %d speakers out of %d (no nationality filter)." % 
          (len(keep_speaker_ids), len(nationalities)))
    
    # Get the directories of the speakers listed in the metadata
    speaker_dirs = dataset_root.joinpath("wav").glob("*")
    speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if
                    speaker_dir.name in keep_speaker_ids]
    print("VoxCeleb1: found %d speakers on the disk, %d missing (this is normal)." % 
          (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs)))

    # Preprocess all speakers
    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav",
                             skip_existing, logger)
272 | 
273 | 
def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
    """
    Preprocesses the VoxCeleb2 dataset found under datasets_root.

    Every directory directly under "dev/aac" is treated as one speaker and its .m4a files
    are handed to _preprocess_speaker_dirs_vox2. Nothing is done if the dataset is missing.
    """
    dataset_name = "VoxCeleb2"
    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
    if dataset_root:
        # One directory per speaker
        aac_root = dataset_root.joinpath("dev", "aac")
        speaker_dirs = list(aac_root.glob("*"))
        _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir,
                                      "m4a", skip_existing, logger)
286 | 


--------------------------------------------------------------------------------
/speaker_encoder/train.py:
--------------------------------------------------------------------------------
  1 | from speaker_encoder.visualizations import Visualizations
  2 | from speaker_encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
  3 | from speaker_encoder.params_model import *
  4 | from speaker_encoder.model import SpeakerEncoder
  5 | from utils.profiler import Profiler
  6 | from pathlib import Path
  7 | import torch
  8 | 
def sync(device: torch.device):
    """
    Intended to synchronize the given device so that profiling measures finished work.

    Currently a no-op: the function returns immediately and the synchronization code below
    the return statement is never executed.
    """
    # FIXME
    return 
    # For correct profiling (cuda operations are async)
    if device.type == "cuda":
        torch.cuda.synchronize(device)
 15 | 
def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
          backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
          no_visdom: bool):
    """
    Trains the speaker encoder, resuming from "<models_dir>/<run_id>.pt" when it exists.

    The loop runs for as long as the data loader yields batches.

    :param run_id: name of the run; used for the checkpoint file, the backup directory and
    the visualization environment
    :param clean_data_root: passed to SpeakerVerificationDataset
    :param models_dir: directory holding the checkpoint and the "<run_id>_backups" directory
    :param umap_every: draw and save a projection every that many steps (0 disables it)
    :param save_every: overwrite the checkpoint every that many steps (0 disables it)
    :param backup_every: write a separate backup every that many steps (0 disables it)
    :param vis_every: update period passed to Visualizations
    :param force_restart: if True, an existing checkpoint is ignored
    :param visdom_server: server address passed to Visualizations
    :param no_visdom: if True, Visualizations is created in disabled mode
    """
    # Create a dataset and a dataloader
    dataset = SpeakerVerificationDataset(clean_data_root)
    loader = SpeakerVerificationDataLoader(
        dataset,
        speakers_per_batch,       # 64
        utterances_per_speaker,   # 10
        num_workers=8,
    )
    
    # Setup the device on which to run the forward pass and the loss. These can be different, 
    # because the forward pass is faster on the GPU whereas the loss is often (depending on your
    # hyperparameters) faster on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # FIXME: currently, the gradient is None if loss_device is cuda
    loss_device = torch.device("cpu")
    
    # Create the model and the optimizer
    model = SpeakerEncoder(device, loss_device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
    init_step = 1
    
    # Configure file path for the model
    state_fpath = models_dir.joinpath(run_id + ".pt")
    backup_dir = models_dir.joinpath(run_id + "_backups")

    # Load any existing model
    if not force_restart:
        if state_fpath.exists():
            print("Found existing model \"%s\", loading it and resuming training." % run_id)
            checkpoint = torch.load(state_fpath)
            # Checkpoints store the index of the next step to run (see the saving code below)
            init_step = checkpoint["step"]
            model.load_state_dict(checkpoint["model_state"])
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            # The learning rate of the current configuration overrides the stored one
            optimizer.param_groups[0]["lr"] = learning_rate_init
        else:
            print("No model \"%s\" found, starting training from scratch." % run_id)
    else:
        print("Starting the training from scratch.")
    model.train()
    
    # Initialize the visualization environment
    vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom)
    vis.log_dataset(dataset)
    vis.log_params()
    device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
    vis.log_implementation({"Device": device_name})
    
    # Training loop
    profiler = Profiler(summarize_every=10, disabled=False)
    for step, speaker_batch in enumerate(loader, init_step):
        profiler.tick("Blocking, waiting for batch (threaded)")
        
        # Forward pass
        inputs = torch.from_numpy(speaker_batch.data).to(device)
        sync(device)
        profiler.tick("Data to %s" % device)
        embeds = model(inputs)
        sync(device)
        profiler.tick("Forward pass")
        # Group the embeddings by speaker and move them to the device computing the loss
        embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
        loss, eer = model.loss(embeds_loss)
        sync(loss_device)
        profiler.tick("Loss")

        # Backward pass
        model.zero_grad()
        loss.backward()
        profiler.tick("Backward pass")
        model.do_gradient_ops()
        optimizer.step()
        profiler.tick("Parameter update")
        
        # Update visualizations
        # learning_rate = optimizer.param_groups[0]["lr"]
        vis.update(loss.item(), eer, step)
        
        # Draw projections and save them to the backup folder
        if umap_every != 0 and step % umap_every == 0:
            print("Drawing and saving projections (step %d)" % step)
            backup_dir.mkdir(exist_ok=True)
            projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step))
            embeds = embeds.detach().cpu().numpy()
            vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath)
            vis.save()

        # Overwrite the latest version of the model
        if save_every != 0 and step % save_every == 0:
            print("Saving the model (step %d)" % step)
            torch.save({
                "step": step + 1,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }, state_fpath)
            
        # Make a backup
        if backup_every != 0 and step % backup_every == 0:
            print("Making a backup (step %d)" % step)
            backup_dir.mkdir(exist_ok=True)
            backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step))
            torch.save({
                "step": step + 1,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }, backup_fpath)
            
        profiler.tick("Extras (visualizations, saving)")
125 |         


--------------------------------------------------------------------------------
/speaker_encoder/visualizations.py:
--------------------------------------------------------------------------------
  1 | from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
  2 | from datetime import datetime
  3 | from time import perf_counter as timer
  4 | import matplotlib.pyplot as plt
  5 | import numpy as np
  6 | # import webbrowser
  7 | import visdom
  8 | import umap
  9 | 
 10 | colormap = np.array([
 11 |     [76, 255, 0],
 12 |     [0, 127, 70],
 13 |     [255, 0, 0],
 14 |     [255, 217, 38],
 15 |     [0, 135, 255],
 16 |     [165, 0, 165],
 17 |     [255, 167, 255],
 18 |     [0, 255, 255],
 19 |     [255, 96, 38],
 20 |     [142, 76, 0],
 21 |     [33, 0, 127],
 22 |     [0, 0, 0],
 23 |     [183, 183, 183],
 24 | ], dtype=np.float) / 255 
 25 | 
 26 | 
 27 | class Visualizations:
 28 |     def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False):
 29 |         # Tracking data
 30 |         self.last_update_timestamp = timer()
 31 |         self.update_every = update_every
 32 |         self.step_times = []
 33 |         self.losses = []
 34 |         self.eers = []
 35 |         print("Updating the visualizations every %d steps." % update_every)
 36 |         
 37 |         # If visdom is disabled TODO: use a better paradigm for that
 38 |         self.disabled = disabled    
 39 |         if self.disabled:
 40 |             return 
 41 |         
 42 |         # Set the environment name
 43 |         now = str(datetime.now().strftime("%d-%m %Hh%M"))
 44 |         if env_name is None:
 45 |             self.env_name = now
 46 |         else:
 47 |             self.env_name = "%s (%s)" % (env_name, now)
 48 |         
 49 |         # Connect to visdom and open the corresponding window in the browser
 50 |         try:
 51 |             self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
 52 |         except ConnectionError:
 53 |             raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to "
 54 |                             "start it.")
 55 |         # webbrowser.open("http://localhost:8097/env/" + self.env_name)
 56 |         
 57 |         # Create the windows
 58 |         self.loss_win = None
 59 |         self.eer_win = None
 60 |         # self.lr_win = None
 61 |         self.implementation_win = None
 62 |         self.projection_win = None
 63 |         self.implementation_string = ""
 64 |         
 65 |     def log_params(self):
 66 |         if self.disabled:
 67 |             return 
 68 |         from speaker_encoder import params_data
 69 |         from speaker_encoder import params_model
 70 |         param_string = "<b>Model parameters</b>:<br>"
 71 |         for param_name in (p for p in dir(params_model) if not p.startswith("__")):
 72 |             value = getattr(params_model, param_name)
 73 |             param_string += "\t%s: %s<br>" % (param_name, value)
 74 |         param_string += "<b>Data parameters</b>:<br>"
 75 |         for param_name in (p for p in dir(params_data) if not p.startswith("__")):
 76 |             value = getattr(params_data, param_name)
 77 |             param_string += "\t%s: %s<br>" % (param_name, value)
 78 |         self.vis.text(param_string, opts={"title": "Parameters"})
 79 |         
 80 |     def log_dataset(self, dataset: SpeakerVerificationDataset):
 81 |         if self.disabled:
 82 |             return 
 83 |         dataset_string = ""
 84 |         dataset_string += "<b>Speakers</b>: %s\n" % len(dataset.speakers)
 85 |         dataset_string += "\n" + dataset.get_logs()
 86 |         dataset_string = dataset_string.replace("\n", "<br>")
 87 |         self.vis.text(dataset_string, opts={"title": "Dataset"})
 88 |         
 89 |     def log_implementation(self, params):
 90 |         if self.disabled:
 91 |             return 
 92 |         implementation_string = ""
 93 |         for param, value in params.items():
 94 |             implementation_string += "<b>%s</b>: %s\n" % (param, value)
 95 |             implementation_string = implementation_string.replace("\n", "<br>")
 96 |         self.implementation_string = implementation_string
 97 |         self.implementation_win = self.vis.text(
 98 |             implementation_string, 
 99 |             opts={"title": "Training implementation"}
100 |         )
101 | 
102 |     def update(self, loss, eer, step):
103 |         # Update the tracking data
104 |         now = timer()
105 |         self.step_times.append(1000 * (now - self.last_update_timestamp))
106 |         self.last_update_timestamp = now
107 |         self.losses.append(loss)
108 |         self.eers.append(eer)
109 |         print(".", end="")
110 |         
111 |         # Update the plots every <update_every> steps
112 |         if step % self.update_every != 0:
113 |             return
114 |         time_string = "Step time:  mean: %5dms  std: %5dms" % \
115 |                       (int(np.mean(self.step_times)), int(np.std(self.step_times)))
116 |         print("\nStep %6d   Loss: %.4f   EER: %.4f   %s" %
117 |               (step, np.mean(self.losses), np.mean(self.eers), time_string))
118 |         if not self.disabled:
119 |             self.loss_win = self.vis.line(
120 |                 [np.mean(self.losses)],
121 |                 [step],
122 |                 win=self.loss_win,
123 |                 update="append" if self.loss_win else None,
124 |                 opts=dict(
125 |                     legend=["Avg. loss"],
126 |                     xlabel="Step",
127 |                     ylabel="Loss",
128 |                     title="Loss",
129 |                 )
130 |             )
131 |             self.eer_win = self.vis.line(
132 |                 [np.mean(self.eers)],
133 |                 [step],
134 |                 win=self.eer_win,
135 |                 update="append" if self.eer_win else None,
136 |                 opts=dict(
137 |                     legend=["Avg. EER"],
138 |                     xlabel="Step",
139 |                     ylabel="EER",
140 |                     title="Equal error rate"
141 |                 )
142 |             )
143 |             if self.implementation_win is not None:
144 |                 self.vis.text(
145 |                     self.implementation_string + ("<b>%s</b>" % time_string), 
146 |                     win=self.implementation_win,
147 |                     opts={"title": "Training implementation"},
148 |                 )
149 | 
150 |         # Reset the tracking
151 |         self.losses.clear()
152 |         self.eers.clear()
153 |         self.step_times.clear()
154 |         
155 |     def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None,
156 |                          max_speakers=10):
157 |         max_speakers = min(max_speakers, len(colormap))
158 |         embeds = embeds[:max_speakers * utterances_per_speaker]
159 |         
160 |         n_speakers = len(embeds) // utterances_per_speaker
161 |         ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
162 |         colors = [colormap[i] for i in ground_truth]
163 |         
164 |         reducer = umap.UMAP()
165 |         projected = reducer.fit_transform(embeds)
166 |         plt.scatter(projected[:, 0], projected[:, 1], c=colors)
167 |         plt.gca().set_aspect("equal", "datalim")
168 |         plt.title("UMAP projection (step %d)" % step)
169 |         if not self.disabled:
170 |             self.projection_win = self.vis.matplot(plt, win=self.projection_win)
171 |         if out_fpath is not None:
172 |             plt.savefig(out_fpath)
173 |         plt.clf()
174 |         
175 |     def save(self):
176 |         if not self.disabled:
177 |             self.vis.save([self.env_name])
178 |         


--------------------------------------------------------------------------------
/speaker_encoder/voice_encoder.py:
--------------------------------------------------------------------------------
  1 | from speaker_encoder.hparams import *
  2 | from speaker_encoder import audio
  3 | from pathlib import Path
  4 | from typing import Union, List
  5 | from torch import nn
  6 | from time import perf_counter as timer
  7 | import numpy as np
  8 | import torch
  9 | 
 10 | 
 11 | class SpeakerEncoder(nn.Module):
 12 |     def __init__(self, weights_fpath, device: Union[str, torch.device]=None, verbose=True):
 13 |         """
 14 |         :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). 
 15 |         If None, defaults to cuda if it is available on your machine, otherwise the model will 
 16 |         run on cpu. Outputs are always returned on the cpu, as numpy arrays.
 17 |         """
 18 |         super().__init__()
 19 |         
 20 |         # Define the network
 21 |         self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
 22 |         self.linear = nn.Linear(model_hidden_size, model_embedding_size)
 23 |         self.relu = nn.ReLU()
 24 |         
 25 |         # Get the target device
 26 |         if device is None:
 27 |             device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 28 |         elif isinstance(device, str):
 29 |             device = torch.device(device)
 30 |         self.device = device
 31 |             
 32 |         # Load the pretrained model'speaker weights
 33 |         # weights_fpath = Path(__file__).resolve().parent.joinpath("pretrained.pt")
 34 |         # if not weights_fpath.exists():
 35 |         #     raise Exception("Couldn't find the voice encoder pretrained model at %s." % 
 36 |         #                     weights_fpath)
 37 | 
 38 |         start = timer()
 39 |         checkpoint = torch.load(weights_fpath, map_location="cpu")
 40 | 
 41 |         self.load_state_dict(checkpoint["model_state"], strict=False)
 42 |         self.to(device)
 43 |         
 44 |         if verbose:
 45 |             print("Loaded the voice encoder model on %s in %.2f seconds." % 
 46 |                   (device.type, timer() - start))
 47 | 
 48 |     def forward(self, mels: torch.FloatTensor):
 49 |         """
 50 |         Computes the embeddings of a batch of utterance spectrograms.
 51 |         :param mels: a batch of mel spectrograms of same duration as a float32 tensor of shape 
 52 |         (batch_size, n_frames, n_channels) 
 53 |         :return: the embeddings as a float 32 tensor of shape (batch_size, embedding_size). 
 54 |         Embeddings are positive and L2-normed, thus they lay in the range [0, 1].
 55 |         """
 56 |         # Pass the input through the LSTM layers and retrieve the final hidden state of the last 
 57 |         # layer. Apply a cutoff to 0 for negative values and L2 normalize the embeddings.
 58 |         _, (hidden, _) = self.lstm(mels)
 59 |         embeds_raw = self.relu(self.linear(hidden[-1]))
 60 |         return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
 61 |     
 62 |     @staticmethod
 63 |     def compute_partial_slices(n_samples: int, rate, min_coverage):
 64 |         """
 65 |         Computes where to split an utterance waveform and its corresponding mel spectrogram to 
 66 |         obtain partial utterances of <partials_n_frames> each. Both the waveform and the 
 67 |         mel spectrogram slices are returned, so as to make each partial utterance waveform 
 68 |         correspond to its spectrogram.
 69 |     
 70 |         The returned ranges may be indexing further than the length of the waveform. It is 
 71 |         recommended that you pad the waveform with zeros up to wav_slices[-1].stop.
 72 |     
 73 |         :param n_samples: the number of samples in the waveform
 74 |         :param rate: how many partial utterances should occur per second. Partial utterances must 
 75 |         cover the span of the entire utterance, thus the rate should not be lower than the inverse 
 76 |         of the duration of a partial utterance. By default, partial utterances are 1.6s long and 
 77 |         the minimum rate is thus 0.625.
 78 |         :param min_coverage: when reaching the last partial utterance, it may or may not have 
 79 |         enough frames. If at least <min_pad_coverage> of <partials_n_frames> are present, 
 80 |         then the last partial utterance will be considered by zero-padding the audio. Otherwise, 
 81 |         it will be discarded. If there aren't enough frames for one partial utterance, 
 82 |         this parameter is ignored so that the function always returns at least one slice.
 83 |         :return: the waveform slices and mel spectrogram slices as lists of array slices. Index 
 84 |         respectively the waveform and the mel spectrogram with these slices to obtain the partial 
 85 |         utterances.
 86 |         """
 87 |         assert 0 < min_coverage <= 1
 88 |         
 89 |         # Compute how many frames separate two partial utterances
 90 |         samples_per_frame = int((sampling_rate * mel_window_step / 1000))
 91 |         n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
 92 |         frame_step = int(np.round((sampling_rate / rate) / samples_per_frame))
 93 |         assert 0 < frame_step, "The rate is too high"
 94 |         assert frame_step <= partials_n_frames, "The rate is too low, it should be %f at least" % \
 95 |             (sampling_rate / (samples_per_frame * partials_n_frames))
 96 |         
 97 |         # Compute the slices
 98 |         wav_slices, mel_slices = [], []
 99 |         steps = max(1, n_frames - partials_n_frames + frame_step + 1)
100 |         for i in range(0, steps, frame_step):
101 |             mel_range = np.array([i, i + partials_n_frames])
102 |             wav_range = mel_range * samples_per_frame
103 |             mel_slices.append(slice(*mel_range))
104 |             wav_slices.append(slice(*wav_range))
105 |         
106 |         # Evaluate whether extra padding is warranted or not
107 |         last_wav_range = wav_slices[-1]
108 |         coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
109 |         if coverage < min_coverage and len(mel_slices) > 1:
110 |             mel_slices = mel_slices[:-1]
111 |             wav_slices = wav_slices[:-1]
112 |         
113 |         return wav_slices, mel_slices
114 |     
115 |     def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75):
116 |         """
117 |         Computes an embedding for a single utterance. The utterance is divided in partial 
118 |         utterances and an embedding is computed for each. The complete utterance embedding is the 
119 |         L2-normed average embedding of the partial utterances.
120 |         
121 |         TODO: independent batched version of this function
122 |     
123 |         :param wav: a preprocessed utterance waveform as a numpy array of float32
124 |         :param return_partials: if True, the partial embeddings will also be returned along with 
125 |         the wav slices corresponding to each partial utterance.
126 |         :param rate: how many partial utterances should occur per second. Partial utterances must 
127 |         cover the span of the entire utterance, thus the rate should not be lower than the inverse 
128 |         of the duration of a partial utterance. By default, partial utterances are 1.6s long and 
129 |         the minimum rate is thus 0.625.
130 |         :param min_coverage: when reaching the last partial utterance, it may or may not have 
131 |         enough frames. If at least <min_pad_coverage> of <partials_n_frames> are present, 
132 |         then the last partial utterance will be considered by zero-padding the audio. Otherwise, 
133 |         it will be discarded. If there aren't enough frames for one partial utterance, 
134 |         this parameter is ignored so that the function always returns at least one slice.
135 |         :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If 
136 |         <return_partials> is True, the partial utterances as a numpy array of float32 of shape 
137 |         (n_partials, model_embedding_size) and the wav partials as a list of slices will also be 
138 |         returned.
139 |         """
140 |         # Compute where to split the utterance into partials and pad the waveform with zeros if 
141 |         # the partial utterances cover a larger range. 
142 |         wav_slices, mel_slices = self.compute_partial_slices(len(wav), rate, min_coverage)
143 |         max_wave_length = wav_slices[-1].stop
144 |         if max_wave_length >= len(wav):
145 |             wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
146 |         
147 |         # Split the utterance into partials and forward them through the model
148 |         mel = audio.wav_to_mel_spectrogram(wav)
149 |         mels = np.array([mel[s] for s in mel_slices])
150 |         with torch.no_grad():
151 |             mels = torch.from_numpy(mels).to(self.device)
152 |             partial_embeds = self(mels).cpu().numpy()
153 |         
154 |         # Compute the utterance embedding from the partial embeddings
155 |         raw_embed = np.mean(partial_embeds, axis=0)
156 |         embed = raw_embed / np.linalg.norm(raw_embed, 2)
157 |         
158 |         if return_partials:
159 |             return embed, partial_embeds, wav_slices
160 |         return embed
161 |     
162 |     def embed_speaker(self, wavs: List[np.ndarray], **kwargs):
163 |         """
164 |         Compute the embedding of a collection of wavs (presumably from the same speaker) by 
165 |         averaging their embedding and L2-normalizing it.
166 |         
167 |         :param wavs: list of wavs a numpy arrays of float32.
168 |         :param kwargs: extra arguments to embed_utterance()
169 |         :return: the embedding as a numpy array of float32 of shape (model_embedding_size,).
170 |         """
171 |         raw_embed = np.mean([self.embed_utterance(wav, return_partials=False, **kwargs) \
172 |                              for wav in wavs], axis=0)
173 |         return raw_embed / np.linalg.norm(raw_embed, 2)


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | logging.getLogger('matplotlib').setLevel(logging.WARNING)
  3 | import os
  4 | import json
  5 | import argparse
  6 | import itertools
  7 | import math
  8 | import torch
  9 | from torch import nn, optim
 10 | from torch.nn import functional as F
 11 | from torch.utils.data import DataLoader
 12 | from torch.utils.tensorboard import SummaryWriter
 13 | import torch.multiprocessing as mp
 14 | import torch.distributed as dist
 15 | from torch.nn.parallel import DistributedDataParallel as DDP
 16 | from torch.cuda.amp import autocast, GradScaler
 17 | 
 18 | import commons
 19 | import utils
 20 | from data_utils import TextAudioSpeakerLoader, EvalDataLoader
 21 | from models import (
 22 |     SynthesizerTrn,
 23 |     MultiPeriodDiscriminator,
 24 | )
 25 | from losses import (
 26 |     kl_loss,
 27 |     generator_loss, discriminator_loss, feature_loss
 28 | )
 29 | 
 30 | from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
 31 | 
# Enable cuDNN's benchmark mode (auto-tuning of convolution algorithms, per the PyTorch docs).
torch.backends.cudnn.benchmark = True
# Module-level step counter: run() initialises it (from a checkpoint when one is found),
# train_and_evaluate() increments it once per batch, evaluate() reads it for logging.
global_step = 0
 34 | 
 35 | 
 36 | # os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'INFO'
 37 | 
 38 | 
 39 | def main():
 40 |     """Assume Single Node Multi GPUs Training Only"""
 41 |     assert torch.cuda.is_available(), "CPU training is not allowed."
 42 |     hps = utils.get_hparams()
 43 | 
 44 |     n_gpus = torch.cuda.device_count()
 45 |     os.environ['MASTER_ADDR'] = 'localhost'
 46 |     os.environ['MASTER_PORT'] = hps.train.port
 47 | 
 48 |     mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
 49 | 
 50 | 
 51 | def run(rank, n_gpus, hps):
 52 |     global global_step
 53 |     if rank == 0:
 54 |         logger = utils.get_logger(hps.model_dir)
 55 |         logger.info(hps)
 56 |         utils.check_git_hash(hps.model_dir)
 57 |         writer = SummaryWriter(log_dir=hps.model_dir)
 58 |         writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
 59 | 
 60 |     dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank)
 61 |     torch.manual_seed(hps.train.seed)
 62 |     torch.cuda.set_device(rank)
 63 | 
 64 |     train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps)
 65 |     train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True,
 66 |                               batch_size=hps.train.batch_size)
 67 |     if rank == 0:
 68 |         eval_dataset = EvalDataLoader(hps.data.validation_files, hps)
 69 |         eval_loader = DataLoader(eval_dataset, num_workers=1, shuffle=False,
 70 |                                  batch_size=1, pin_memory=False,
 71 |                                  drop_last=False)
 72 | 
 73 |     net_g = SynthesizerTrn(
 74 |         hps.data.filter_length // 2 + 1,
 75 |         hps.train.segment_size // hps.data.hop_length,
 76 |         **hps.model).cuda(rank)
 77 |     net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
 78 |     optim_g = torch.optim.AdamW(
 79 |         net_g.parameters(),
 80 |         hps.train.learning_rate,
 81 |         betas=hps.train.betas,
 82 |         eps=hps.train.eps)
 83 |     optim_d = torch.optim.AdamW(
 84 |         net_d.parameters(),
 85 |         hps.train.learning_rate,
 86 |         betas=hps.train.betas,
 87 |         eps=hps.train.eps)
 88 |     net_g = DDP(net_g, device_ids=[rank])  # , find_unused_parameters=True)
 89 |     net_d = DDP(net_d, device_ids=[rank])
 90 | 
 91 |     try:
 92 |         _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g,
 93 |                                                    optim_g)
 94 |         _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d,
 95 |                                                    optim_d)
 96 |         global_step = (epoch_str - 1) * len(train_loader)
 97 |     except:
 98 |         epoch_str = 1
 99 |         global_step = 0
100 | 
101 |     scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
102 |     scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
103 | 
104 |     scaler = GradScaler(enabled=hps.train.fp16_run)
105 | 
106 |     for epoch in range(epoch_str, hps.train.epochs + 1):
107 |         if rank == 0:
108 |             train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler,
109 |                                [train_loader, eval_loader], logger, [writer, writer_eval])
110 |         else:
111 |             train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler,
112 |                                [train_loader, None], None, None)
113 |         scheduler_g.step()
114 |         scheduler_d.step()
115 | 
116 | 
def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers):
    """Train both networks for one pass over the training loader.

    For every batch the discriminator is updated first, then the generator. On rank 0 the
    losses are logged every `hps.train.log_interval` steps, and every
    `hps.train.eval_interval` steps evaluate() is run and both networks are checkpointed.

    nets / optims / schedulers: two-element sequences ordered (generator, discriminator).
    loaders: (train_loader, eval_loader); run() passes None as eval_loader on ranks != 0.
    logger / writers: only used on rank 0; run() passes None on the other ranks.
    The module-level `global_step` is incremented once per batch.
    """
    net_g, net_d = nets
    optim_g, optim_d = optims
    scheduler_g, scheduler_d = schedulers
    train_loader, eval_loader = loaders
    if writers is not None:
        writer, writer_eval = writers

    # train_loader.batch_sampler.set_epoch(epoch)
    global global_step

    net_g.train()
    net_d.train()
    for batch_idx, items in enumerate(train_loader):
        # Move the batch to this rank's GPU
        c, f0, spec, y, spk = items
        g = spk.cuda(rank, non_blocking=True)
        spec, y = spec.cuda(rank, non_blocking=True), y.cuda(rank, non_blocking=True)
        c = c.cuda(rank, non_blocking=True)
        f0 = f0.cuda(rank, non_blocking=True)
        # Mel spectrogram derived from the linear spectrogram of the ground truth
        mel = spec_to_mel_torch(
            spec,
            hps.data.filter_length,
            hps.data.n_mel_channels,
            hps.data.sampling_rate,
            hps.data.mel_fmin,
            hps.data.mel_fmax)

        with autocast(enabled=hps.train.fp16_run):
            y_hat, ids_slice, z_mask, \
            (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(c, f0, spec, g=g, mel=mel)

            # Slice the targets with the same ids_slice the generator returned
            y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
            y_hat_mel = mel_spectrogram_torch(
                y_hat.squeeze(1),
                hps.data.filter_length,
                hps.data.n_mel_channels,
                hps.data.sampling_rate,
                hps.data.hop_length,
                hps.data.win_length,
                hps.data.mel_fmin,
                hps.data.mel_fmax
            )
            y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size)  # slice

            # Discriminator
            # y_hat is detached so the discriminator loss does not backpropagate into net_g
            y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())

            # Losses are computed with autocast disabled
            with autocast(enabled=False):
                loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
                loss_disc_all = loss_disc

        # Discriminator step
        optim_d.zero_grad()
        scaler.scale(loss_disc_all).backward()
        scaler.unscale_(optim_d)
        # Called with None as clip value; the return value is only logged as "grad_norm_d".
        # NOTE(review): presumably this measures the gradient norm without clipping - confirm in commons.
        grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
        scaler.step(optim_d)

        with autocast(enabled=hps.train.fp16_run):
            # Generator
            # Second discriminator pass, this time without detaching y_hat
            y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
            with autocast(enabled=False):
                loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
                loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
                loss_fm = feature_loss(fmap_r, fmap_g)
                loss_gen, losses_gen = generator_loss(y_d_hat_g)
                loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
        # Generator step
        optim_g.zero_grad()
        scaler.scale(loss_gen_all).backward()
        scaler.unscale_(optim_g)
        grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
        scaler.step(optim_g)
        # The shared scaler is updated once per batch, after both optimizers have stepped
        scaler.update()

        if rank == 0:
            if global_step % hps.train.log_interval == 0:
                lr = optim_g.param_groups[0]['lr']
                losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_kl]
                logger.info('Train Epoch: {} [{:.0f}%]'.format(
                    epoch,
                    100. * batch_idx / len(train_loader)))
                logger.info([x.item() for x in losses] + [global_step, lr])

                scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr,
                               "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g}
                scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl})

                # One scalar per entry of the per-discriminator loss lists
                scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
                scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
                scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
                # Only the first item of the batch is rendered / exported
                image_dict = {
                    "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
                    "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
                    "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()),
                }
                audio_dict={
                    f"train/gen": y_hat[0],
                    f"train/gt": y[0],
                }
                utils.summarize(
                    writer=writer,
                    global_step=global_step,
                    images=image_dict,
                    scalars=scalar_dict,
                    audios=audio_dict,
                    audio_sampling_rate = hps.data.sampling_rate
                )

            if global_step % hps.train.eval_interval == 0:
                evaluate(hps, net_g, eval_loader, writer_eval)
                # Checkpoint files are named after global_step; the stored "iteration" is the epoch
                utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch,
                                      os.path.join(hps.model_dir, "G_{}.pth".format(global_step)))
                utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch,
                                      os.path.join(hps.model_dir, "D_{}.pth".format(global_step)))
        global_step += 1

    if rank == 0:
        logger.info('====> Epoch: {}'.format(epoch))
234 | 
235 | 
def evaluate(hps, generator, eval_loader, writer_eval):
    """Run inference over the evaluation loader and log the results.

    For each batch only the first item is used. Generated and ground-truth audio of every
    batch are logged; the mel spectrogram images come from the last batch only. The
    generator is put in eval mode for the duration of the call and switched back to train
    mode afterwards. Inputs are moved to CUDA device 0 (callers only invoke this on rank 0).

    hps:         hyper-parameters (reads hps.data.*).
    generator:   wrapped generator; inference goes through `generator.module.infer`.
    eval_loader: iterable of (c, f0, spec, y, spk) batches; may be empty.
    writer_eval: summary writer passed to utils.summarize.
    """
    generator.eval()
    image_dict = {}
    audio_dict = {}
    # Stay None when the loader yields nothing, so the image logging below can be skipped
    # instead of failing with a NameError.
    mel = None
    y_hat_mel = None
    with torch.no_grad():
        for batch_idx, items in enumerate(eval_loader):
            c, f0, spec, y, spk = items
            g = spk[:1].cuda(0)
            spec, y = spec[:1].cuda(0), y[:1].cuda(0)
            c = c[:1].cuda(0)
            f0 = f0[:1].cuda(0)
            mel = spec_to_mel_torch(
                spec,
                hps.data.filter_length,
                hps.data.n_mel_channels,
                hps.data.sampling_rate,
                hps.data.mel_fmin,
                hps.data.mel_fmax)
            y_hat = generator.module.infer(c, f0, g=g, mel=mel)

            y_hat_mel = mel_spectrogram_torch(
                y_hat.squeeze(1).float(),
                hps.data.filter_length,
                hps.data.n_mel_channels,
                hps.data.sampling_rate,
                hps.data.hop_length,
                hps.data.win_length,
                hps.data.mel_fmin,
                hps.data.mel_fmax
            )

            audio_dict.update({
                f"gen/audio_{batch_idx}": y_hat[0],
                f"gt/audio_{batch_idx}": y[0]
            })
        # Spectrogram images are taken from the last evaluated batch
        if y_hat_mel is not None:
            image_dict.update({
                "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()),
                "gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())
            })
    utils.summarize(
        writer=writer_eval,
        global_step=global_step,
        images=image_dict,
        audios=audio_dict,
        audio_sampling_rate=hps.data.sampling_rate
    )
    generator.train()
283 | 
284 | 
# Script entry point: start training only when this file is executed directly.
if __name__ == "__main__":
    main()
287 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import glob
  3 | import sys
  4 | import argparse
  5 | import logging
  6 | import json
  7 | import subprocess
  8 | 
  9 | import librosa
 10 | import numpy as np
 11 | import torchaudio
 12 | from scipy.io.wavfile import read
 13 | import torch
 14 | import torchvision
 15 | from torch.nn import functional as F
 16 | from commons import sequence_mask
 17 | import hubert_model
# Set to True by the plot_*_to_numpy helpers once they have selected the "Agg" backend.
MATPLOTLIB_FLAG = False

# Root logging goes to stdout at DEBUG level; note this runs at import time.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# `logger` is the logging module itself, so logger.info(...) logs through the root logger.
logger = logging
 22 | 
 23 | f0_bin = 256
 24 | f0_max = 1100.0
 25 | f0_min = 50.0
 26 | f0_mel_min = 1127 * np.log(1 + f0_min / 700)
 27 | f0_mel_max = 1127 * np.log(1 + f0_max / 700)
 28 | 
 29 | def f0_to_coarse(f0):
 30 |   is_torch = isinstance(f0, torch.Tensor)
 31 |   f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
 32 |   f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
 33 | 
 34 |   f0_mel[f0_mel <= 1] = 1
 35 |   f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
 36 |   f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int)
 37 |   assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
 38 |   return f0_coarse
 39 | 
 40 | 
 41 | def get_hubert_model(rank=None):
 42 | 
 43 |   hubert_soft = hubert_model.hubert_soft("hubert/hubert-soft-0d54a1f4.pt")
 44 |   if rank is not None:
 45 |     hubert_soft = hubert_soft.cuda(rank)
 46 |   return hubert_soft
 47 | 
 48 | def get_hubert_content(hmodel, y=None, path=None):
 49 |   if path is not None:
 50 |     source, sr = torchaudio.load(path)
 51 |     source = torchaudio.functional.resample(source, sr, 16000)
 52 |     if len(source.shape) == 2 and source.shape[1] >= 2:
 53 |       source = torch.mean(source, dim=0).unsqueeze(0)
 54 |   else:
 55 |     source = y
 56 |   source = source.unsqueeze(0)
 57 |   with torch.inference_mode():
 58 |     units = hmodel.units(source)
 59 |     return units.transpose(1,2)
 60 | 
 61 | 
 62 | def get_content(cmodel, y):
 63 |     with torch.no_grad():
 64 |         c = cmodel.extract_features(y.squeeze(1))[0]
 65 |     c = c.transpose(1, 2)
 66 |     return c
 67 | 
 68 | 
 69 | 
 70 | def transform(mel, height): # 68-92
 71 |     #r = np.random.random()
 72 |     #rate = r * 0.3 + 0.85 # 0.85-1.15
 73 |     #height = int(mel.size(-2) * rate)
 74 |     tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1)))
 75 |     if height >= mel.size(-2):
 76 |         return tgt[:, :mel.size(-2), :]
 77 |     else:
 78 |         silence = tgt[:,-1:,:].repeat(1,mel.size(-2)-height,1)
 79 |         silence += torch.randn_like(silence) / 10
 80 |         return torch.cat((tgt, silence), 1)
 81 | 
 82 | 
 83 | def stretch(mel, width): # 0.5-2
 84 |     return torchvision.transforms.functional.resize(mel, (mel.size(-2), width))
 85 | 
 86 | 
 87 | def load_checkpoint(checkpoint_path, model, optimizer=None):
 88 |   assert os.path.isfile(checkpoint_path)
 89 |   checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
 90 |   iteration = checkpoint_dict['iteration']
 91 |   learning_rate = checkpoint_dict['learning_rate']
 92 |   if optimizer is not None:
 93 |     optimizer.load_state_dict(checkpoint_dict['optimizer'])
 94 |   saved_state_dict = checkpoint_dict['model']
 95 |   if hasattr(model, 'module'):
 96 |     state_dict = model.module.state_dict()
 97 |   else:
 98 |     state_dict = model.state_dict()
 99 |   new_state_dict= {}
100 |   for k, v in state_dict.items():
101 |     try:
102 |       new_state_dict[k] = saved_state_dict[k]
103 |     except:
104 |       logger.info("%s is not in the checkpoint" % k)
105 |       new_state_dict[k] = v
106 |   if hasattr(model, 'module'):
107 |     model.module.load_state_dict(new_state_dict)
108 |   else:
109 |     model.load_state_dict(new_state_dict)
110 |   logger.info("Loaded checkpoint '{}' (iteration {})" .format(
111 |     checkpoint_path, iteration))
112 |   return model, optimizer, learning_rate, iteration
113 | 
114 | 
def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
  """Save model and optimizer state, pruning the checkpoint from 3 * 2000 steps earlier.

  The file name must look like '<prefix>_<step>.<ext>' (e.g. 'G_6000.pth'): the step is
  parsed from it, and once it reaches 6000 the sibling file for step - 6000 is deleted if
  it exists. If `model` has a `.module` attribute, the wrapped module's state is saved.

  The saved dict has the keys 'model', 'iteration', 'optimizer' and 'learning_rate'.
  """
  ckpt_dir, ckptname = os.path.split(checkpoint_path)
  stem = ckptname.split(".")[0]
  stem_parts = stem.split("_")
  newest_step = int(stem_parts[1])
  val_steps = 2000
  if newest_step >= val_steps*3:
    # Rewrite only the step field of the file name, so digits that also occur in the
    # directory or prefix are never touched.
    stem_parts[1] = str(newest_step - val_steps*3)
    stale_name = "_".join(stem_parts) + ckptname[len(stem):]
    stale_path = os.path.join(ckpt_dir, stale_name)
    # Best-effort cleanup without going through a shell
    try:
      os.remove(stale_path)
    except FileNotFoundError:
      pass
    except OSError as err:
      logger.warning("Could not remove old checkpoint {}: {}".format(stale_path, err))
  logger.info("Saving model and optimizer state at iteration {} to {}".format(
    iteration, checkpoint_path))
  if hasattr(model, 'module'):
    state_dict = model.module.state_dict()
  else:
    state_dict = model.state_dict()
  torch.save({'model': state_dict,
              'iteration': iteration,
              'optimizer': optimizer.state_dict(),
              'learning_rate': learning_rate}, checkpoint_path)
132 | 
133 | 
def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
  """Write every entry of the given tag -> value dicts to `writer` at `global_step`.

  Scalars, histograms, images and audios are written in that order. Images are passed
  with dataformats='HWC' and audios with `audio_sampling_rate`. The default dicts are
  only read, never modified.
  """
  for tag in scalars:
    writer.add_scalar(tag, scalars[tag], global_step)
  for tag in histograms:
    writer.add_histogram(tag, histograms[tag], global_step)
  for tag in images:
    writer.add_image(tag, images[tag], global_step, dataformats='HWC')
  for tag in audios:
    writer.add_audio(tag, audios[tag], global_step, audio_sampling_rate)
143 | 
144 | 
def latest_checkpoint_path(dir_path, regex="G_*.pth"):
  """Return (and print) the checkpoint in `dir_path` with the largest numeric key.

  `regex` is a glob pattern. Each match is ranked by the integer formed from all digit
  characters of its full path. Raises IndexError when nothing matches.
  """
  def digits_as_int(path):
    return int("".join(ch for ch in path if ch.isdigit()))

  candidates = glob.glob(os.path.join(dir_path, regex))
  newest = sorted(candidates, key=digits_as_int)[-1]
  print(newest)
  return newest
151 | 
152 | 
def plot_spectrogram_to_numpy(spectrogram):
  """Render a 2D spectrogram with matplotlib and return the figure as an RGB image.

  spectrogram: 2D array passed to `imshow` (x axis is labelled "Frames", y axis "Channels").
  Returns a uint8 numpy array of shape (height, width, 3).
  The first call of either plot helper switches matplotlib to the "Agg" backend.
  """
  global MATPLOTLIB_FLAG
  if not MATPLOTLIB_FLAG:
    import matplotlib
    matplotlib.use("Agg")
    MATPLOTLIB_FLAG = True
    mpl_logger = logging.getLogger('matplotlib')
    mpl_logger.setLevel(logging.WARNING)
  import matplotlib.pylab as plt
  import numpy as np

  fig, ax = plt.subplots(figsize=(10,2))
  im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                  interpolation='none')
  plt.colorbar(im, ax=ax)
  plt.xlabel("Frames")
  plt.ylabel("Channels")
  plt.tight_layout()

  fig.canvas.draw()
  # Read the rendered RGBA buffer and drop alpha: canvas.tostring_rgb() is deprecated in
  # recent Matplotlib, and binary-mode np.fromstring() is deprecated in NumPy.
  width, height = fig.canvas.get_width_height()
  rgba = np.frombuffer(fig.canvas.buffer_rgba(), dtype=np.uint8).reshape(height, width, 4)
  # Copy so the result is a writable array that outlives the closed figure
  data = rgba[..., :3].copy()
  plt.close(fig)
  return data
177 | 
178 | 
def plot_alignment_to_numpy(alignment, info=None):
  """Render an alignment matrix with matplotlib and return the figure as an RGB image.

  alignment: 2D array; it is transposed before being shown (x axis is labelled
             "Decoder timestep", y axis "Encoder timestep").
  info:      optional text appended below the x-axis label.
  Returns a uint8 numpy array of shape (height, width, 3).
  The first call of either plot helper switches matplotlib to the "Agg" backend.
  """
  global MATPLOTLIB_FLAG
  if not MATPLOTLIB_FLAG:
    import matplotlib
    matplotlib.use("Agg")
    MATPLOTLIB_FLAG = True
    mpl_logger = logging.getLogger('matplotlib')
    mpl_logger.setLevel(logging.WARNING)
  import matplotlib.pylab as plt
  import numpy as np

  fig, ax = plt.subplots(figsize=(6, 4))
  im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
                  interpolation='none')
  fig.colorbar(im, ax=ax)
  xlabel = 'Decoder timestep'
  if info is not None:
      xlabel += '\n\n' + info
  plt.xlabel(xlabel)
  plt.ylabel('Encoder timestep')
  plt.tight_layout()

  fig.canvas.draw()
  # Read the rendered RGBA buffer and drop alpha: canvas.tostring_rgb() is deprecated in
  # recent Matplotlib, and binary-mode np.fromstring() is deprecated in NumPy.
  width, height = fig.canvas.get_width_height()
  rgba = np.frombuffer(fig.canvas.buffer_rgba(), dtype=np.uint8).reshape(height, width, 4)
  # Copy so the result is a writable array that outlives the closed figure
  data = rgba[..., :3].copy()
  plt.close(fig)
  return data
206 | 
207 | 
def load_wav_to_torch(full_path):
  """Read a wav file and return (float32 tensor of raw sample values, sampling rate).

  Samples are cast to float32 without rescaling. `read` comes from the file's
  imports (presumably scipy.io.wavfile.read -- confirm).
  """
  sampling_rate, data = read(full_path)
  samples = data.astype(np.float32)
  return torch.FloatTensor(samples), sampling_rate
211 | 
212 | 
def load_filepaths_and_text(filename, split="|"):
  """Parse a UTF-8 file list into one list of fields per line.

  Each line is stripped of surrounding whitespace and split on `split`.
  """
  rows = []
  with open(filename, encoding='utf-8') as f:
    for raw_line in f:
      fields = raw_line.strip().split(split)
      rows.append(fields)
  return rows
217 | 
218 | 
def get_hparams(init=True):
  """Build HParams from command-line arguments.

  Parses `-c/--config` (default "./configs/base.json") and the required
  `-m/--model`. The model directory is "./logs/<model>" and is created when
  missing. With `init` true the chosen config file is copied verbatim to
  "<model_dir>/config.json"; otherwise that saved copy is the one read.

  Returns:
    HParams built from the JSON config, with `model_dir` set.
  """
  cli = argparse.ArgumentParser()
  cli.add_argument('-c', '--config', type=str, default="./configs/base.json",
                   help='JSON file for configuration')
  cli.add_argument('-m', '--model', type=str, required=True,
                   help='Model name')
  parsed = cli.parse_args()

  model_dir = os.path.join("./logs", parsed.model)
  if not os.path.exists(model_dir):
    os.makedirs(model_dir)

  saved_config = os.path.join(model_dir, "config.json")
  source = parsed.config if init else saved_config
  with open(source, "r") as f:
    raw = f.read()
  if init:
    # Snapshot the config next to the checkpoints before parsing it.
    with open(saved_config, "w") as f:
      f.write(raw)

  hps = HParams(**json.loads(raw))
  hps.model_dir = model_dir
  return hps
247 | 
248 | 
def get_hparams_from_dir(model_dir):
  """Load "<model_dir>/config.json" into HParams and record `model_dir` on it."""
  with open(os.path.join(model_dir, "config.json"), "r") as f:
    settings = json.load(f)

  hps = HParams(**settings)
  hps.model_dir = model_dir
  return hps
258 | 
259 | 
def get_hparams_from_file(config_path):
  """Load the JSON file at `config_path` and wrap its contents in HParams."""
  with open(config_path, "r") as f:
    settings = json.load(f)
  return HParams(**settings)
267 | 
268 | 
def check_git_hash(model_dir):
  """Warn when the current git commit differs from the one saved in `model_dir`.

  If the directory containing this source file has no ".git" entry, a warning
  is logged and nothing else happens. Otherwise the output of
  `git rev-parse HEAD` is compared with "<model_dir>/githash"; the file is
  created with the current hash when it does not exist yet.

  NOTE(review): the git command runs in the process working directory, not
  in the source directory that was checked for ".git".
  """
  source_dir = os.path.dirname(os.path.realpath(__file__))
  if not os.path.exists(os.path.join(source_dir, ".git")):
    # Logger.warn is a deprecated alias; warning() is the supported spelling.
    logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format(
      source_dir
    ))
    return

  cur_hash = subprocess.getoutput("git rev-parse HEAD")

  path = os.path.join(model_dir, "githash")
  if os.path.exists(path):
    # Context managers close the handles the original left open.
    with open(path) as f:
      saved_hash = f.read()
    if saved_hash != cur_hash:
      logger.warning("git hash values are different. {}(saved) != {}(current)".format(
        saved_hash[:8], cur_hash[:8]))
  else:
    with open(path, "w") as f:
      f.write(cur_hash)
287 | 
288 | 
def get_logger(model_dir, filename="train.log"):
  """Create the module-global logger that writes to "<model_dir>/<filename>".

  The logger is named after the last path component of `model_dir`, set to
  DEBUG, and gets a new DEBUG-level FileHandler on every call. `model_dir`
  is created when missing. The logger is also stored in the global `logger`.
  """
  global logger
  logger = logging.getLogger(os.path.basename(model_dir))
  logger.setLevel(logging.DEBUG)

  if not os.path.exists(model_dir):
    os.makedirs(model_dir)

  file_handler = logging.FileHandler(os.path.join(model_dir, filename))
  file_handler.setLevel(logging.DEBUG)
  file_handler.setFormatter(
    logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s"))
  logger.addHandler(file_handler)
  return logger
302 | 
303 | 
class HParams():
  """Attribute-style container for hyperparameters with dict-like access.

  Keyword arguments become attributes; dict values (including dict
  subclasses) are converted recursively into nested HParams. Items can be
  read and written either as attributes or with `[]`.
  """

  def __init__(self, **kwargs):
    for k, v in kwargs.items():
      # isinstance also covers dict subclasses such as OrderedDict, which
      # the former `type(v) == dict` test left unconverted.
      if isinstance(v, dict):
        v = HParams(**v)
      self[k] = v

  def keys(self):
    """Return a view of the stored parameter names."""
    return self.__dict__.keys()

  def items(self):
    """Return a view of (name, value) pairs."""
    return self.__dict__.items()

  def values(self):
    """Return a view of the stored values."""
    return self.__dict__.values()

  def __len__(self):
    return len(self.__dict__)

  def __getitem__(self, key):
    return getattr(self, key)

  def __setitem__(self, key, value):
    return setattr(self, key, value)

  def __contains__(self, key):
    return key in self.__dict__

  def __repr__(self):
    return self.__dict__.__repr__()
334 | 
335 | 


--------------------------------------------------------------------------------
/vdecoder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/innnky/FreeSVC/47f589e855ac1a4861985e4cb42586b573fedbff/vdecoder/__init__.py


--------------------------------------------------------------------------------
/vdecoder/hifigan/env.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import shutil
 3 | 
 4 | 
 5 | class AttrDict(dict):
 6 |     def __init__(self, *args, **kwargs):
 7 |         super(AttrDict, self).__init__(*args, **kwargs)
 8 |         self.__dict__ = self
 9 | 
10 | 
def build_env(config, config_name, path):
    """Copy the file `config` to `path/config_name`, creating `path` if needed.

    Nothing is done when `config` already equals the target path string.
    """
    target = os.path.join(path, config_name)
    if config == target:
        return
    os.makedirs(path, exist_ok=True)
    shutil.copyfile(config, target)
16 | 


--------------------------------------------------------------------------------
/vdecoder/hifigan/models.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | from .env import AttrDict
  4 | import numpy as np
  5 | import torch
  6 | import torch.nn.functional as F
  7 | import torch.nn as nn
  8 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
  9 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 10 | from .utils import init_weights, get_padding
 11 | 
# Negative slope passed to F.leaky_relu by the residual blocks, generator
# and discriminators defined below.
LRELU_SLOPE = 0.1
 13 | 
 14 | 
 15 | def load_model(model_path, device='cuda'):
 16 |     config_file = os.path.join(os.path.split(model_path)[0], 'config.json')
 17 |     with open(config_file) as f:
 18 |         data = f.read()
 19 | 
 20 |     global h
 21 |     json_config = json.loads(data)
 22 |     h = AttrDict(json_config)
 23 | 
 24 |     generator = Generator(h).to(device)
 25 | 
 26 |     cp_dict = torch.load(model_path)
 27 |     generator.load_state_dict(cp_dict['generator'])
 28 |     generator.eval()
 29 |     generator.remove_weight_norm()
 30 |     del cp_dict
 31 |     return generator, h
 32 | 
 33 | 
 34 | class ResBlock1(torch.nn.Module):
 35 |     def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
 36 |         super(ResBlock1, self).__init__()
 37 |         self.h = h
 38 |         self.convs1 = nn.ModuleList([
 39 |             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
 40 |                                padding=get_padding(kernel_size, dilation[0]))),
 41 |             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
 42 |                                padding=get_padding(kernel_size, dilation[1]))),
 43 |             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
 44 |                                padding=get_padding(kernel_size, dilation[2])))
 45 |         ])
 46 |         self.convs1.apply(init_weights)
 47 | 
 48 |         self.convs2 = nn.ModuleList([
 49 |             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
 50 |                                padding=get_padding(kernel_size, 1))),
 51 |             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
 52 |                                padding=get_padding(kernel_size, 1))),
 53 |             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
 54 |                                padding=get_padding(kernel_size, 1)))
 55 |         ])
 56 |         self.convs2.apply(init_weights)
 57 | 
 58 |     def forward(self, x):
 59 |         for c1, c2 in zip(self.convs1, self.convs2):
 60 |             xt = F.leaky_relu(x, LRELU_SLOPE)
 61 |             xt = c1(xt)
 62 |             xt = F.leaky_relu(xt, LRELU_SLOPE)
 63 |             xt = c2(xt)
 64 |             x = xt + x
 65 |         return x
 66 | 
 67 |     def remove_weight_norm(self):
 68 |         for l in self.convs1:
 69 |             remove_weight_norm(l)
 70 |         for l in self.convs2:
 71 |             remove_weight_norm(l)
 72 | 
 73 | 
 74 | class ResBlock2(torch.nn.Module):
 75 |     def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
 76 |         super(ResBlock2, self).__init__()
 77 |         self.h = h
 78 |         self.convs = nn.ModuleList([
 79 |             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
 80 |                                padding=get_padding(kernel_size, dilation[0]))),
 81 |             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
 82 |                                padding=get_padding(kernel_size, dilation[1])))
 83 |         ])
 84 |         self.convs.apply(init_weights)
 85 | 
 86 |     def forward(self, x):
 87 |         for c in self.convs:
 88 |             xt = F.leaky_relu(x, LRELU_SLOPE)
 89 |             xt = c(xt)
 90 |             x = xt + x
 91 |         return x
 92 | 
 93 |     def remove_weight_norm(self):
 94 |         for l in self.convs:
 95 |             remove_weight_norm(l)
 96 | 
 97 | 
 98 | class SineGen(torch.nn.Module):
 99 |     """ Definition of sine generator
100 |     SineGen(samp_rate, harmonic_num = 0,
101 |             sine_amp = 0.1, noise_std = 0.003,
102 |             voiced_threshold = 0,
103 |             flag_for_pulse=False)
104 |     samp_rate: sampling rate in Hz
105 |     harmonic_num: number of harmonic overtones (default 0)
106 |     sine_amp: amplitude of sine-wavefrom (default 0.1)
107 |     noise_std: std of Gaussian noise (default 0.003)
108 |     voiced_thoreshold: F0 threshold for U/V classification (default 0)
109 |     flag_for_pulse: this SinGen is used inside PulseGen (default False)
110 |     Note: when flag_for_pulse is True, the first time step of a voiced
111 |         segment is always sin(np.pi) or cos(0)
112 |     """
113 | 
114 |     def __init__(self, samp_rate, harmonic_num=0,
115 |                  sine_amp=0.1, noise_std=0.003,
116 |                  voiced_threshold=0,
117 |                  flag_for_pulse=False):
118 |         super(SineGen, self).__init__()
119 |         self.sine_amp = sine_amp
120 |         self.noise_std = noise_std
121 |         self.harmonic_num = harmonic_num
122 |         self.dim = self.harmonic_num + 1
123 |         self.sampling_rate = samp_rate
124 |         self.voiced_threshold = voiced_threshold
125 |         self.flag_for_pulse = flag_for_pulse
126 | 
127 |     def _f02uv(self, f0):
128 |         # generate uv signal
129 |         uv = (f0 > self.voiced_threshold).type(torch.float32)
130 |         return uv
131 | 
132 |     def _f02sine(self, f0_values):
133 |         """ f0_values: (batchsize, length, dim)
134 |             where dim indicates fundamental tone and overtones
135 |         """
136 |         # convert to F0 in rad. The interger part n can be ignored
137 |         # because 2 * np.pi * n doesn't affect phase
138 |         rad_values = (f0_values / self.sampling_rate) % 1
139 | 
140 |         # initial phase noise (no noise for fundamental component)
141 |         rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \
142 |                               device=f0_values.device)
143 |         rand_ini[:, 0] = 0
144 |         rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
145 | 
146 |         # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
147 |         if not self.flag_for_pulse:
148 |             # for normal case
149 | 
150 |             # To prevent torch.cumsum numerical overflow,
151 |             # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
152 |             # Buffer tmp_over_one_idx indicates the time step to add -1.
153 |             # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
154 |             tmp_over_one = torch.cumsum(rad_values, 1) % 1
155 |             tmp_over_one_idx = (torch.diff(tmp_over_one, dim=1)) < 0
156 |             cumsum_shift = torch.zeros_like(rad_values)
157 |             cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
158 | 
159 |             sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1)
160 |                               * 2 * np.pi)
161 |         else:
162 |             # If necessary, make sure that the first time step of every
163 |             # voiced segments is sin(pi) or cos(0)
164 |             # This is used for pulse-train generation
165 | 
166 |             # identify the last time step in unvoiced segments
167 |             uv = self._f02uv(f0_values)
168 |             uv_1 = torch.roll(uv, shifts=-1, dims=1)
169 |             uv_1[:, -1, :] = 1
170 |             u_loc = (uv < 1) * (uv_1 > 0)
171 | 
172 |             # get the instantanouse phase
173 |             tmp_cumsum = torch.cumsum(rad_values, dim=1)
174 |             # different batch needs to be processed differently
175 |             for idx in range(f0_values.shape[0]):
176 |                 temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
177 |                 temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
178 |                 # stores the accumulation of i.phase within
179 |                 # each voiced segments
180 |                 tmp_cumsum[idx, :, :] = 0
181 |                 tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
182 | 
183 |             # rad_values - tmp_cumsum: remove the accumulation of i.phase
184 |             # within the previous voiced segment.
185 |             i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
186 | 
187 |             # get the sines
188 |             sines = torch.cos(i_phase * 2 * np.pi)
189 |         return sines
190 | 
191 |     def forward(self, f0):
192 |         """ sine_tensor, uv = forward(f0)
193 |         input F0: tensor(batchsize=1, length, dim=1)
194 |                   f0 for unvoiced steps should be 0
195 |         output sine_tensor: tensor(batchsize=1, length, dim)
196 |         output uv: tensor(batchsize=1, length, 1)
197 |         """
198 |         with torch.no_grad():
199 |             f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
200 |                                  device=f0.device)
201 |             # fundamental component
202 |             fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
203 | 
204 |             # generate sine waveforms
205 |             sine_waves = self._f02sine(fn) * self.sine_amp
206 | 
207 |             # generate uv signal
208 |             # uv = torch.ones(f0.shape)
209 |             # uv = uv * (f0 > self.voiced_threshold)
210 |             uv = self._f02uv(f0)
211 | 
212 |             # noise: for unvoiced should be similar to sine_amp
213 |             #        std = self.sine_amp/3 -> max value ~ self.sine_amp
214 |             # .       for voiced regions is self.noise_std
215 |             noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
216 |             noise = noise_amp * torch.randn_like(sine_waves)
217 | 
218 |             # first: set the unvoiced part to 0 by uv
219 |             # then: additive noise
220 |             sine_waves = sine_waves * uv + noise
221 |         return sine_waves, uv, noise
222 | 
223 | 
class SourceModuleHnNSF(torch.nn.Module):
    """Source module for hn-NSF: harmonic sine source plus a noise source.

    SourceModuleHnNSF(sampling_rate, harmonic_num=0, sine_amp=0.1,
                      add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling rate in Hz
    harmonic_num: number of harmonics above F0 (default: 0)
    sine_amp: amplitude of the sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003);
        the noise amplitude in unvoiced regions is decided by sine_amp
    voiced_threshod: threshold used to set U/V given F0 (default: 0)

    sine_merge, noise, uv = SourceModuleHnNSF(...)(F0_sampled)
    F0_sampled (batchsize, length, 1)
    sine_merge (batchsize, length, 1)
    noise (batchsize, length, 1)
    uv (batchsize, length, 1)
    """

    def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0):
        super().__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # Produces one sine channel per harmonic.
        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
                                 sine_amp, add_noise_std, voiced_threshod)

        # Collapses the harmonic channels into a single excitation channel.
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x):
        """Return (sine_merge, noise, uv) for the sampled F0 tensor `x`.

        x: F0_sampled (batchsize, length, 1)
        """
        # Harmonic branch: per-harmonic sines merged by linear + tanh.
        harmonics, uv, _ = self.l_sin_gen(x)
        merged = self.l_linear(harmonics)
        sine_merge = self.l_tanh(merged)

        # Noise branch: Gaussian noise shaped like uv, scaled by sine_amp / 3.
        noise = torch.randn_like(uv) * self.sine_amp / 3
        return sine_merge, noise, uv
271 | 
272 | 
class Generator(torch.nn.Module):
    """Upsampling generator with residual blocks and an F0-driven harmonic source.

    `h` is indexed by key and must provide: resblock, resblock_kernel_sizes,
    resblock_dilation_sizes, upsample_rates, upsample_kernel_sizes,
    upsample_initial_channel, inter_channels, gin_channels, sampling_rate.
    """

    def __init__(self, h):
        super(Generator, self).__init__()
        self.h = h

        self.num_kernels = len(h["resblock_kernel_sizes"])
        self.num_upsamples = len(h["upsample_rates"])
        # F0 is upsampled by the product of all upsample rates.
        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h["upsample_rates"]))
        self.m_source = SourceModuleHnNSF(
            sampling_rate=h["sampling_rate"],
            harmonic_num=8)
        self.noise_convs = nn.ModuleList()
        self.conv_pre = weight_norm(Conv1d(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3))
        resblock = ResBlock1 if h["resblock"] == '1' else ResBlock2
        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(h["upsample_rates"], h["upsample_kernel_sizes"])):
            # Channel count halves at every upsampling stage.
            c_cur = h["upsample_initial_channel"] // (2 ** (i + 1))
            self.ups.append(weight_norm(
                ConvTranspose1d(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)),
                                k, u, padding=(k - u) // 2)))
            if i + 1 < len(h["upsample_rates"]):
                # Stride the source by the product of the remaining upsample rates.
                stride_f0 = np.prod(h["upsample_rates"][i + 1:])
                self.noise_convs.append(Conv1d(
                    1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
            else:
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = h["upsample_initial_channel"] // (2 ** (i + 1))
            for j, (k, d) in enumerate(zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"])):
                self.resblocks.append(resblock(h, ch, k, d))

        # `ch` is the channel count of the last upsampling stage.
        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)
        self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1)

    def forward(self, x, f0, g=None):
        """Generate a waveform from features `x` and pitch contour `f0`.

        Args:
            x: input to conv_pre (h["inter_channels"] channels).
            f0: pitch contour; a channel axis is inserted at dim 1 before
                upsampling (presumably shaped (batch, frames) -- confirm).
            g: optional conditioning input for the 1x1 `cond` convolution
                (h["gin_channels"] channels); skipped when None.

        Returns:
            tanh-bounded single-channel output of conv_post.
        """
        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
        har_source, noi_source, uv = self.m_source(f0)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)
        # g defaults to None, so conditioning must be optional; previously
        # the default argument crashed inside self.cond.
        if g is not None:
            x = x + self.cond(g)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            # Inject the harmonic source at this stage's resolution.
            x_source = self.noise_convs[i](har_source)
            x = x + x_source
            # Average the outputs of this stage's parallel residual blocks.
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from all upsampling, residual, pre and post layers."""
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
347 | 
348 | 
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the signal into (t // period, period) and applies 2-D convs.

    Four strided convolutions widen the channels 1 -> 32 -> 128 -> 512 -> 1024,
    followed by one stride-1 convolution and a single-channel output conv.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super().__init__()
        self.period = period
        if use_spectral_norm == False:
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        channel_steps = [(1, 32), (32, 128), (128, 512), (512, 1024)]
        layers = [
            norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0)))
            for c_in, c_out in channel_steps
        ]
        layers.append(norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return (flattened score, list of feature maps) for a (b, c, t) input."""
        fmap = []

        # 1d to 2d: reflect-pad the tail so t divides evenly by the period.
        b, c, t = x.shape
        leftover = t % self.period
        if leftover != 0:
            n_pad = self.period - leftover
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
383 | 
384 | 
class MultiPeriodDiscriminator(torch.nn.Module):
    """Bank of DiscriminatorP instances, one per period (default 2, 3, 5, 7, 11)."""

    def __init__(self, periods=None):
        super().__init__()
        self.periods = [2, 3, 5, 7, 11] if periods is None else periods
        self.discriminators = nn.ModuleList(
            [DiscriminatorP(period) for period in self.periods])

    def forward(self, y, y_hat):
        """Run every sub-discriminator on real `y` and generated `y_hat`.

        Returns four lists, each ordered like the sub-discriminators:
        real scores, generated scores, real feature maps, generated feature maps.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            real_score, real_maps = disc(y)
            gen_score, gen_maps = disc(y_hat)
            y_d_rs.append(real_score)
            fmap_rs.append(real_maps)
            y_d_gs.append(gen_score)
            fmap_gs.append(gen_maps)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
407 | 
408 | 
class DiscriminatorS(torch.nn.Module):
    """Scale discriminator: a stack of seven 1-D convolutions plus an output conv.

    Layers are wrapped in weight norm, or in spectral norm when
    `use_spectral_norm` is set.
    """

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        if use_spectral_norm == False:
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        # (in_channels, out_channels, kernel_size, stride, groups, padding)
        layer_specs = [
            (1, 128, 15, 1, 1, 7),
            (128, 128, 41, 2, 4, 20),
            (128, 256, 41, 2, 16, 20),
            (256, 512, 41, 4, 16, 20),
            (512, 1024, 41, 4, 16, 20),
            (1024, 1024, 41, 1, 16, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList([
            norm_f(Conv1d(c_in, c_out, k, s, groups=g, padding=p))
            for c_in, c_out, k, s, g, p in layer_specs
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return (flattened score, list of feature maps) for input `x`."""
        fmap = []
        for conv in self.convs:
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
435 | 
436 | 
class MultiScaleDiscriminator(torch.nn.Module):
    """Three DiscriminatorS instances applied at successively average-pooled scales.

    The first sub-discriminator uses spectral norm; the other two use the
    default weight norm.
    """

    def __init__(self):
        super().__init__()
        scales = [
            DiscriminatorS(use_spectral_norm=True),
            DiscriminatorS(),
            DiscriminatorS(),
        ]
        self.discriminators = nn.ModuleList(scales)
        self.meanpools = nn.ModuleList(
            [AvgPool1d(4, 2, padding=2) for _ in range(2)])

    def forward(self, y, y_hat):
        """Score real `y` and generated `y_hat` at every scale.

        Both signals are pooled once more before each sub-discriminator after
        the first. Returns four lists: real scores, generated scores, real
        feature maps, generated feature maps.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for i, disc in enumerate(self.discriminators):
            if i != 0:
                pool = self.meanpools[i - 1]
                y = pool(y)
                y_hat = pool(y_hat)
            real_score, real_maps = disc(y)
            gen_score, gen_maps = disc(y_hat)
            y_d_rs.append(real_score)
            fmap_rs.append(real_maps)
            y_d_gs.append(gen_score)
            fmap_gs.append(gen_maps)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
467 | 
468 | 
def feature_loss(fmap_r, fmap_g):
    """Return twice the summed mean absolute difference between paired feature maps.

    `fmap_r` and `fmap_g` are parallel lists (one entry per discriminator) of
    lists of tensors (one per layer).
    """
    total = 0
    for real_maps, gen_maps in zip(fmap_r, fmap_g):
        for real, gen in zip(real_maps, gen_maps):
            total = total + torch.mean(torch.abs(real - gen))

    return total * 2
476 | 
477 | 
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Least-squares discriminator loss over paired real/generated outputs.

    Real outputs are pushed towards 1 and generated outputs towards 0.

    Returns:
        (total loss, per-discriminator real losses as floats,
         per-discriminator generated losses as floats)
    """
    total = 0
    r_losses, g_losses = [], []
    for real_out, gen_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = torch.mean((1 - real_out) ** 2)
        gen_term = torch.mean(gen_out ** 2)
        total = total + (real_term + gen_term)
        r_losses.append(real_term.item())
        g_losses.append(gen_term.item())

    return total, r_losses, g_losses
490 | 
491 | 
def generator_loss(disc_outputs):
    """Least-squares generator loss: push every discriminator output towards 1.

    Returns:
        (total loss, list of the per-discriminator loss tensors)
    """
    total = 0
    gen_losses = []
    for gen_out in disc_outputs:
        term = torch.mean((1 - gen_out) ** 2)
        gen_losses.append(term)
        total = total + term

    return total, gen_losses
501 | 


--------------------------------------------------------------------------------
/vdecoder/hifigan/nvSTFT.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import os
  3 | os.environ["LRU_CACHE_CAPACITY"] = "3"
  4 | import random
  5 | import torch
  6 | import torch.utils.data
  7 | import numpy as np
  8 | import librosa
  9 | from librosa.util import normalize
 10 | from librosa.filters import mel as librosa_mel_fn
 11 | from scipy.io.wavfile import read
 12 | import soundfile as sf
 13 | 
 14 | def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
 15 |     sampling_rate = None
 16 |     try:
 17 |         data, sampling_rate = sf.read(full_path, always_2d=True)# than soundfile.
 18 |     except Exception as ex:
 19 |         print(f"'{full_path}' failed to load.\nException:")
 20 |         print(ex)
 21 |         if return_empty_on_exception:
 22 |             return [], sampling_rate or target_sr or 48000
 23 |         else:
 24 |             raise Exception(ex)
 25 |     
 26 |     if len(data.shape) > 1:
 27 |         data = data[:, 0]
 28 |         assert len(data) > 2# check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension)
 29 |     
 30 |     if np.issubdtype(data.dtype, np.integer): # if audio data is type int
 31 |         max_mag = -np.iinfo(data.dtype).min # maximum magnitude = min possible value of intXX
 32 |     else: # if audio data is type fp32
 33 |         max_mag = max(np.amax(data), -np.amin(data))
 34 |         max_mag = (2**31)+1 if max_mag > (2**15) else ((2**15)+1 if max_mag > 1.01 else 1.0) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32
 35 |     
 36 |     data = torch.FloatTensor(data.astype(np.float32))/max_mag
 37 |     
 38 |     if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:# resample will crash with inf/NaN inputs. return_empty_on_exception will return empty arr instead of except
 39 |         return [], sampling_rate or target_sr or 48000
 40 |     if target_sr is not None and sampling_rate != target_sr:
 41 |         data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr))
 42 |         sampling_rate = target_sr
 43 |     
 44 |     return data, sampling_rate
 45 | 
 46 | def dynamic_range_compression(x, C=1, clip_val=1e-5):
 47 |     return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
 48 | 
 49 | def dynamic_range_decompression(x, C=1):
 50 |     return np.exp(x) / C
 51 | 
 52 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
 53 |     return torch.log(torch.clamp(x, min=clip_val) * C)
 54 | 
 55 | def dynamic_range_decompression_torch(x, C=1):
 56 |     return torch.exp(x) / C
 57 | 
 58 | class STFT():
 59 |     def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5):
 60 |         self.target_sr = sr
 61 |         
 62 |         self.n_mels     = n_mels
 63 |         self.n_fft      = n_fft
 64 |         self.win_size   = win_size
 65 |         self.hop_length = hop_length
 66 |         self.fmin     = fmin
 67 |         self.fmax     = fmax
 68 |         self.clip_val = clip_val
 69 |         self.mel_basis = {}
 70 |         self.hann_window = {}
 71 |     
 72 |     def get_mel(self, y, center=False):
 73 |         sampling_rate = self.target_sr
 74 |         n_mels     = self.n_mels
 75 |         n_fft      = self.n_fft
 76 |         win_size   = self.win_size
 77 |         hop_length = self.hop_length
 78 |         fmin       = self.fmin
 79 |         fmax       = self.fmax
 80 |         clip_val   = self.clip_val
 81 |         
 82 |         if torch.min(y) < -1.:
 83 |             print('min value is ', torch.min(y))
 84 |         if torch.max(y) > 1.:
 85 |             print('max value is ', torch.max(y))
 86 |         
 87 |         if fmax not in self.mel_basis:
 88 |             mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
 89 |             self.mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)
 90 |             self.hann_window[str(y.device)] = torch.hann_window(self.win_size).to(y.device)
 91 |         
 92 |         y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_length)/2), int((n_fft-hop_length)/2)), mode='reflect')
 93 |         y = y.squeeze(1)
 94 |         
 95 |         spec = torch.stft(y, n_fft, hop_length=hop_length, win_length=win_size, window=self.hann_window[str(y.device)],
 96 |                           center=center, pad_mode='reflect', normalized=False, onesided=True)
 97 |         # print(111,spec)
 98 |         spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))
 99 |         # print(222,spec)
100 |         spec = torch.matmul(self.mel_basis[str(fmax)+'_'+str(y.device)], spec)
101 |         # print(333,spec)
102 |         spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
103 |         # print(444,spec)
104 |         return spec
105 |     
106 |     def __call__(self, audiopath):
107 |         audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
108 |         spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
109 |         return spect
110 | 
# Module-level STFT instance built with the constructor's default parameters.
stft = STFT()
112 | 


--------------------------------------------------------------------------------
/vdecoder/hifigan/utils.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import os
 3 | import matplotlib
 4 | import torch
 5 | from torch.nn.utils import weight_norm
 6 | matplotlib.use("Agg")
 7 | import matplotlib.pylab as plt
 8 | 
 9 | 
def plot_spectrogram(spectrogram):
    """Draw ``spectrogram`` as an image with a colorbar and return the figure.

    Args:
        spectrogram: 2-D array-like accepted by ``Axes.imshow``.

    Returns:
        The matplotlib figure, already drawn and closed in pyplot.
    """
    figure, axes = plt.subplots(figsize=(10, 2))
    image = axes.imshow(
        spectrogram,
        aspect="auto",
        origin="lower",
        interpolation='none',
    )
    plt.colorbar(image, ax=axes)

    # Render the canvas before closing the current pyplot figure.
    figure.canvas.draw()
    plt.close()

    return figure
20 | 
21 | 
def init_weights(m, mean=0.0, std=0.01):
    """Re-initialise ``m.weight`` from a normal distribution for Conv-like modules.

    A module counts as Conv-like when its class name contains ``"Conv"``;
    every other module is left untouched.

    Args:
        m: Module to (possibly) initialise.
        mean: Mean of the normal distribution.
        std: Standard deviation of the normal distribution.
    """
    is_conv = "Conv" in m.__class__.__name__
    if not is_conv:
        return
    m.weight.data.normal_(mean, std)
26 | 
27 | 
def apply_weight_norm(m):
    """Apply ``weight_norm`` to ``m`` when its class name contains ``"Conv"``.

    Args:
        m: Module to (possibly) wrap with weight normalisation.
    """
    is_conv = "Conv" in m.__class__.__name__
    if not is_conv:
        return
    weight_norm(m)
32 | 
33 | 
def get_padding(kernel_size, dilation=1):
    """Return ``(kernel_size * dilation - dilation) / 2`` truncated to an int.

    Args:
        kernel_size: Convolution kernel size.
        dilation: Convolution dilation factor.

    Returns:
        The padding amount as an ``int`` (truncated toward zero).
    """
    dilated_span = kernel_size * dilation - dilation
    half_span = dilated_span / 2
    return int(half_span)
36 | 
37 | 
def load_checkpoint(filepath, device):
    """Load a checkpoint with ``torch.load`` and return it.

    Args:
        filepath: Path of the checkpoint file; it must be an existing file.
        device: Passed to ``torch.load`` as ``map_location``.

    Returns:
        Whatever object ``torch.load`` deserialises from ``filepath``.

    Raises:
        AssertionError: If ``filepath`` is not an existing file.
    """
    file_exists = os.path.isfile(filepath)
    assert file_exists
    start_message = "Loading '{}'".format(filepath)
    print(start_message)
    # NOTE(review): torch.load unpickles data; only load trusted checkpoints.
    loaded = torch.load(filepath, map_location=device)
    print("Complete.")
    return loaded
44 | 
45 | 
def save_checkpoint(filepath, obj):
    """Serialise ``obj`` to ``filepath`` with ``torch.save``, logging progress.

    Args:
        filepath: Destination path of the checkpoint.
        obj: Object to serialise.
    """
    start_message = "Saving checkpoint to {}".format(filepath)
    print(start_message)
    torch.save(obj, filepath)
    print("Complete.")
50 | 
51 | 
def del_old_checkpoints(cp_dir, prefix, n_models=2):
    """Delete all but the newest ``n_models`` checkpoints in ``cp_dir``.

    Checkpoints are the files named ``prefix`` followed by exactly eight
    characters. They are ordered by name, so the ordering matches iteration
    order only when the suffix is a zero-padded counter.

    Args:
        cp_dir: Directory holding the checkpoints.
        prefix: Filename prefix of the checkpoints to consider.
        n_models: Number of most recent checkpoints to keep. ``0`` removes
            every matching checkpoint; negative values are treated as ``0``.
    """
    pattern = os.path.join(cp_dir, prefix + '????????')
    cp_list = sorted(glob.glob(pattern))  # oldest first (lexicographic order)

    # Compute the cut-off explicitly: ``cp_list[:-n_models]`` is empty for
    # ``n_models == 0`` and would delete nothing instead of everything.
    keep = max(n_models, 0)
    n_stale = max(len(cp_list) - keep, 0)

    for cp in cp_list[:n_stale]:
        # Empty the file first so a trashed copy (e.g. on Colab) holds no data.
        with open(cp, 'w'):
            pass
        os.unlink(cp)
60 | 
61 | 
def scan_checkpoint(cp_dir, prefix):
    """Return the checkpoint path in ``cp_dir`` that sorts last, or ``None``.

    Checkpoints are the files named ``prefix`` followed by exactly eight
    characters.

    Args:
        cp_dir: Directory to search.
        prefix: Filename prefix of the checkpoints to consider.

    Returns:
        The lexicographically greatest matching path, or ``None`` when no
        file matches.
    """
    matches = glob.glob(os.path.join(cp_dir, prefix + '????????'))
    if not matches:
        return None
    return max(matches)
68 | 
69 | 


--------------------------------------------------------------------------------