├── .gitignore ├── LICENSE ├── README.md ├── attentions.py ├── baidu_translate.py ├── commons.py ├── config_sample.py ├── inference.py ├── main.py ├── models.py ├── modules.py ├── monotonic_align.py ├── openai_api.py ├── requirements.txt ├── static ├── css │ └── index.css ├── images │ ├── ja.png │ ├── record.png │ ├── recording.png │ ├── reload.png │ ├── text.png │ └── zh.png └── js │ ├── bundle.js │ ├── core-js.js │ └── live2dcubismcore.js ├── templates └── index.html ├── text ├── __init__.py ├── cleaners.py └── symbols.py ├── transforms.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | DUMMY1 2 | DUMMY2 3 | DUMMY3 4 | logs 5 | __pycache__ 6 | .ipynb_checkpoints 7 | .*.swp 8 | 9 | build 10 | *.c 11 | monotonic_align/monotonic_align 12 | 13 | /output 14 | 15 | /config.py 16 | /venv 17 | .env 18 | 19 | *.pth 20 | /model/*/config.json 21 | /static/atri 22 | 23 | /test.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 orange 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ChatVITS 2 | 3 | This is a voice chat web page based on ChatGPT, Whisper, VITS, and Live2D. 4 | 5 | ## Usage 6 | - Install Python 3.6 or later 7 | - Install CMake 8 | - Install the dependencies: `pip install -r requirements.txt` 9 | - Download the VITS model files, name them model.pth and config.json, and put them in the model folder. 10 | - Download the Live2D model files and put them in the static folder. 11 | - Copy config_sample.py to config.py and fill in the settings. 12 | - Run the server: `python main.py` 13 | -------------------------------------------------------------------------------- /attentions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | from modules import LayerNorm 8 | 9 | 10 | class Encoder(nn.Module): 11 | def __init__( 12 | self, 13 | hidden_channels, 14 | filter_channels, 15 | n_heads, 16 | n_layers, 17 | kernel_size=1, 18 | p_dropout=0.0, 19 | window_size=4, 20 | **kwargs 21 | ): 22 | super().__init__() 23 | self.hidden_channels = hidden_channels 24 | self.filter_channels = filter_channels 25 | self.n_heads = n_heads 26 | self.n_layers = n_layers 27 | self.kernel_size = kernel_size 28 | self.p_dropout = p_dropout 29 | self.window_size = window_size 30 | 31 | self.drop = nn.Dropout(p_dropout) 32 | self.attn_layers = nn.ModuleList() 33 | self.norm_layers_1 = nn.ModuleList() 34 | self.ffn_layers = nn.ModuleList() 35 | self.norm_layers_2 = nn.ModuleList() 36 | for i in range(self.n_layers): 37 | self.attn_layers.append( 38 | MultiHeadAttention( 39 | hidden_channels, 40 | hidden_channels, 41 | n_heads, 42 | p_dropout=p_dropout, 43 | window_size=window_size, 44 | ) 45 | ) 46 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 47 | self.ffn_layers.append( 48 | FFN( 49 | hidden_channels, 50 | hidden_channels, 51 | filter_channels, 52 | kernel_size, 53 | p_dropout=p_dropout, 54 | ) 55 | ) 56 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 57 | 58 | def forward(self, x, x_mask): 59 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 60 | x = x * x_mask 61 | for i in range(self.n_layers): 62 | y = self.attn_layers[i](x, x, attn_mask) 63 | y = self.drop(y) 64 | x = self.norm_layers_1[i](x + y) 65 | 66 | y = self.ffn_layers[i](x, x_mask) 67 | y = self.drop(y) 68 | x = self.norm_layers_2[i](x + y) 69 | x = x * x_mask 70 | return x 71 | 72 | 73 | class Decoder(nn.Module): 74 | def __init__( 75 | self, 76 | hidden_channels, 77 | filter_channels, 78 | n_heads, 79 | n_layers, 80 | kernel_size=1, 81 | p_dropout=0.0, 82 | proximal_bias=False, 83 | proximal_init=True, 84 | **kwargs 85 | ): 86 | super().__init__() 87 | self.hidden_channels = hidden_channels 88 | self.filter_channels = filter_channels 89 | self.n_heads = n_heads 90 | self.n_layers = n_layers 91 | self.kernel_size = kernel_size 92 | self.p_dropout = p_dropout 93 | self.proximal_bias = proximal_bias 94 | self.proximal_init = proximal_init 95 | 96 | self.drop = nn.Dropout(p_dropout) 97 | self.self_attn_layers = nn.ModuleList() 98 | self.norm_layers_0 = nn.ModuleList() 99 | self.encdec_attn_layers = nn.ModuleList() 100 | self.norm_layers_1 = nn.ModuleList() 101 | self.ffn_layers = nn.ModuleList() 102 | self.norm_layers_2 = nn.ModuleList() 103 | for i in range(self.n_layers): 104 | self.self_attn_layers.append( 105 | MultiHeadAttention( 106 | hidden_channels, 107 | hidden_channels, 108 | n_heads, 109 | p_dropout=p_dropout, 110 |
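# Descriptive note on the two flags passed just below: proximal_bias adds a
# -log(1 + |i - j|) penalty that biases decoder self-attention toward nearby
# positions, and proximal_init copies the query projection weights into the key
# projection at initialization (see MultiHeadAttention further down in this file).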
proximal_bias=proximal_bias, 111 | proximal_init=proximal_init, 112 | ) 113 | ) 114 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 115 | self.encdec_attn_layers.append( 116 | MultiHeadAttention( 117 | hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout 118 | ) 119 | ) 120 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 121 | self.ffn_layers.append( 122 | FFN( 123 | hidden_channels, 124 | hidden_channels, 125 | filter_channels, 126 | kernel_size, 127 | p_dropout=p_dropout, 128 | causal=True, 129 | ) 130 | ) 131 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 132 | 133 | def forward(self, x, x_mask, h, h_mask): 134 | """ 135 | x: decoder input 136 | h: encoder output 137 | """ 138 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( 139 | device=x.device, dtype=x.dtype 140 | ) 141 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 142 | x = x * x_mask 143 | for i in range(self.n_layers): 144 | y = self.self_attn_layers[i](x, x, self_attn_mask) 145 | y = self.drop(y) 146 | x = self.norm_layers_0[i](x + y) 147 | 148 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 149 | y = self.drop(y) 150 | x = self.norm_layers_1[i](x + y) 151 | 152 | y = self.ffn_layers[i](x, x_mask) 153 | y = self.drop(y) 154 | x = self.norm_layers_2[i](x + y) 155 | x = x * x_mask 156 | return x 157 | 158 | 159 | class MultiHeadAttention(nn.Module): 160 | def __init__( 161 | self, 162 | channels, 163 | out_channels, 164 | n_heads, 165 | p_dropout=0.0, 166 | window_size=None, 167 | heads_share=True, 168 | block_length=None, 169 | proximal_bias=False, 170 | proximal_init=False, 171 | ): 172 | super().__init__() 173 | assert channels % n_heads == 0 174 | 175 | self.channels = channels 176 | self.out_channels = out_channels 177 | self.n_heads = n_heads 178 | self.p_dropout = p_dropout 179 | self.window_size = window_size 180 | self.heads_share = heads_share 181 | self.block_length = block_length 182 | self.proximal_bias = proximal_bias 183 | self.proximal_init = proximal_init 184 | self.attn = None 185 | 186 | self.k_channels = channels // n_heads 187 | self.conv_q = nn.Conv1d(channels, channels, 1) 188 | self.conv_k = nn.Conv1d(channels, channels, 1) 189 | self.conv_v = nn.Conv1d(channels, channels, 1) 190 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 191 | self.drop = nn.Dropout(p_dropout) 192 | 193 | if window_size is not None: 194 | n_heads_rel = 1 if heads_share else n_heads 195 | rel_stddev = self.k_channels**-0.5 196 | self.emb_rel_k = nn.Parameter( 197 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) 198 | * rel_stddev 199 | ) 200 | self.emb_rel_v = nn.Parameter( 201 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) 202 | * rel_stddev 203 | ) 204 | 205 | nn.init.xavier_uniform_(self.conv_q.weight) 206 | nn.init.xavier_uniform_(self.conv_k.weight) 207 | nn.init.xavier_uniform_(self.conv_v.weight) 208 | if proximal_init: 209 | with torch.no_grad(): 210 | self.conv_k.weight.copy_(self.conv_q.weight) 211 | self.conv_k.bias.copy_(self.conv_q.bias) 212 | 213 | def forward(self, x, c, attn_mask=None): 214 | q = self.conv_q(x) 215 | k = self.conv_k(c) 216 | v = self.conv_v(c) 217 | 218 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 219 | 220 | x = self.conv_o(x) 221 | return x 222 | 223 | def attention(self, query, key, value, mask=None): 224 | # reshape [b, d, t] -> [b, n_h, t, d_k] 225 | b, d, t_s, t_t = (*key.size(), query.size(2)) 226 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 
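# key and value below get the same [b, d, t] -> [b, n_h, t_s, d_k] reshape, so the
# content scores Q K^T / sqrt(d_k) come out as [b, n_h, t_t, t_s]; when window_size
# is set, relative-position logits built from emb_rel_k are added to these scores
# before the softmax (windowed relative attention in the style of Shaw et al.).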
227 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 228 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 229 | 230 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 231 | if self.window_size is not None: 232 | assert ( 233 | t_s == t_t 234 | ), "Relative attention is only available for self-attention." 235 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 236 | rel_logits = self._matmul_with_relative_keys( 237 | query / math.sqrt(self.k_channels), key_relative_embeddings 238 | ) 239 | scores_local = self._relative_position_to_absolute_position(rel_logits) 240 | scores = scores + scores_local 241 | if self.proximal_bias: 242 | assert t_s == t_t, "Proximal bias is only available for self-attention." 243 | scores = scores + self._attention_bias_proximal(t_s).to( 244 | device=scores.device, dtype=scores.dtype 245 | ) 246 | if mask is not None: 247 | scores = scores.masked_fill(mask == 0, -1e4) 248 | if self.block_length is not None: 249 | assert ( 250 | t_s == t_t 251 | ), "Local attention is only available for self-attention." 252 | block_mask = ( 253 | torch.ones_like(scores) 254 | .triu(-self.block_length) 255 | .tril(self.block_length) 256 | ) 257 | scores = scores.masked_fill(block_mask == 0, -1e4) 258 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 259 | p_attn = self.drop(p_attn) 260 | output = torch.matmul(p_attn, value) 261 | if self.window_size is not None: 262 | relative_weights = self._absolute_position_to_relative_position(p_attn) 263 | value_relative_embeddings = self._get_relative_embeddings( 264 | self.emb_rel_v, t_s 265 | ) 266 | output = output + self._matmul_with_relative_values( 267 | relative_weights, value_relative_embeddings 268 | ) 269 | output = ( 270 | output.transpose(2, 3).contiguous().view(b, d, t_t) 271 | ) # [b, n_h, t_t, d_k] -> [b, d, t_t] 272 | return output, p_attn 273 | 274 | def _matmul_with_relative_values(self, x, y): 275 | """ 276 | x: [b, h, l, m] 277 | y: [h or 1, m, d] 278 | ret: [b, h, l, d] 279 | """ 280 | ret = torch.matmul(x, y.unsqueeze(0)) 281 | return ret 282 | 283 | def _matmul_with_relative_keys(self, x, y): 284 | """ 285 | x: [b, h, l, d] 286 | y: [h or 1, m, d] 287 | ret: [b, h, l, m] 288 | """ 289 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 290 | return ret 291 | 292 | def _get_relative_embeddings(self, relative_embeddings, length): 293 | max_relative_position = 2 * self.window_size + 1 294 | # Pad first before slice to avoid using cond ops. 295 | pad_length = max(length - (self.window_size + 1), 0) 296 | slice_start_position = max((self.window_size + 1) - length, 0) 297 | slice_end_position = slice_start_position + 2 * length - 1 298 | if pad_length > 0: 299 | padded_relative_embeddings = F.pad( 300 | relative_embeddings, 301 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), 302 | ) 303 | else: 304 | padded_relative_embeddings = relative_embeddings 305 | used_relative_embeddings = padded_relative_embeddings[ 306 | :, slice_start_position:slice_end_position 307 | ] 308 | return used_relative_embeddings 309 | 310 | def _relative_position_to_absolute_position(self, x): 311 | """ 312 | x: [b, h, l, 2*l-1] 313 | ret: [b, h, l, l] 314 | """ 315 | batch, heads, length, _ = x.size() 316 | # Concat columns of pad to shift from relative to absolute indexing. 
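# The pad/reshape/slice below converts the [b, h, l, 2l-1] relative logits to
# absolute [b, h, l, l] positions without gather ops: pad one column on the right,
# flatten, pad l - 1 more entries, view as [b, h, l+1, 2l-1], then keep the first l
# rows and the columns from l - 1 onward; the kept entry at (i, j) is exactly the
# relative logit stored at column (j - i) + (l - 1).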
317 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) 318 | 319 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 320 | x_flat = x.view([batch, heads, length * 2 * length]) 321 | x_flat = F.pad( 322 | x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) 323 | ) 324 | 325 | # Reshape and slice out the padded elements. 326 | x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ 327 | :, :, :length, length - 1 : 328 | ] 329 | return x_final 330 | 331 | def _absolute_position_to_relative_position(self, x): 332 | """ 333 | x: [b, h, l, l] 334 | ret: [b, h, l, 2*l-1] 335 | """ 336 | batch, heads, length, _ = x.size() 337 | # padd along column 338 | x = F.pad( 339 | x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) 340 | ) 341 | x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) 342 | # add 0's in the beginning that will skew the elements after reshape 343 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 344 | x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] 345 | return x_final 346 | 347 | def _attention_bias_proximal(self, length): 348 | """Bias for self-attention to encourage attention to close positions. 349 | Args: 350 | length: an integer scalar. 351 | Returns: 352 | a Tensor with shape [1, 1, length, length] 353 | """ 354 | r = torch.arange(length, dtype=torch.float32) 355 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 356 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 357 | 358 | 359 | class FFN(nn.Module): 360 | def __init__( 361 | self, 362 | in_channels, 363 | out_channels, 364 | filter_channels, 365 | kernel_size, 366 | p_dropout=0.0, 367 | activation=None, 368 | causal=False, 369 | ): 370 | super().__init__() 371 | self.in_channels = in_channels 372 | self.out_channels = out_channels 373 | self.filter_channels = filter_channels 374 | self.kernel_size = kernel_size 375 | self.p_dropout = p_dropout 376 | self.activation = activation 377 | self.causal = causal 378 | 379 | if causal: 380 | self.padding = self._causal_padding 381 | else: 382 | self.padding = self._same_padding 383 | 384 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 385 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 386 | self.drop = nn.Dropout(p_dropout) 387 | 388 | def forward(self, x, x_mask): 389 | x = self.conv_1(self.padding(x * x_mask)) 390 | if self.activation == "gelu": 391 | x = x * torch.sigmoid(1.702 * x) 392 | else: 393 | x = torch.relu(x) 394 | x = self.drop(x) 395 | x = self.conv_2(self.padding(x * x_mask)) 396 | return x * x_mask 397 | 398 | def _causal_padding(self, x): 399 | if self.kernel_size == 1: 400 | return x 401 | pad_l = self.kernel_size - 1 402 | pad_r = 0 403 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 404 | x = F.pad(x, commons.convert_pad_shape(padding)) 405 | return x 406 | 407 | def _same_padding(self, x): 408 | if self.kernel_size == 1: 409 | return x 410 | pad_l = (self.kernel_size - 1) // 2 411 | pad_r = self.kernel_size // 2 412 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 413 | x = F.pad(x, commons.convert_pad_shape(padding)) 414 | return x 415 | -------------------------------------------------------------------------------- /baidu_translate.py: -------------------------------------------------------------------------------- 1 | import config 2 | import requests 3 | import hashlib 4 | from uuid import uuid4 5 | 6 | 7 | def translate(text: 
str): 8 | text = text.replace("\n", "") 9 | salt = str(uuid4()) 10 | sign = hashlib.md5( 11 | ( 12 | config.BAIDU_TRANSLATE_APPID + text + salt + config.BAIDU_TRANSLATE_KEY 13 | ).encode("utf-8") 14 | ).hexdigest() 15 | params = { 16 | "q": text, 17 | "from": "auto", 18 | "to": "zh", 19 | "appid": config.BAIDU_TRANSLATE_APPID, 20 | "salt": salt, 21 | "sign": sign, 22 | } 23 | response = requests.get(config.BAIDU_TRANSLATE_URL, params=params) 24 | result = response.json() 25 | if "error_code" in result: 26 | return {"code": result["error_code"]} 27 | else: 28 | return {"code": 0, "result": replace_words(result["trans_result"][0]["dst"])} 29 | 30 | 31 | def replace_words(text: str): 32 | for key, value in config.REPLACEMENTS.items(): 33 | text = text.replace(key, value) 34 | return text 35 | -------------------------------------------------------------------------------- /commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.nn import functional as F 4 | 5 | 6 | def init_weights(m, mean=0.0, std=0.01): 7 | classname = m.__class__.__name__ 8 | if classname.find("Conv") != -1: 9 | m.weight.data.normal_(mean, std) 10 | 11 | 12 | def get_padding(kernel_size, dilation=1): 13 | return int((kernel_size * dilation - dilation) / 2) 14 | 15 | 16 | def convert_pad_shape(pad_shape): 17 | l = pad_shape[::-1] 18 | pad_shape = [item for sublist in l for item in sublist] 19 | return pad_shape 20 | 21 | 22 | def intersperse(lst, item): 23 | result = [item] * (len(lst) * 2 + 1) 24 | result[1::2] = lst 25 | return result 26 | 27 | 28 | def kl_divergence(m_p, logs_p, m_q, logs_q): 29 | """KL(P||Q)""" 30 | kl = (logs_q - logs_p) - 0.5 31 | kl += ( 32 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 33 | ) 34 | return kl 35 | 36 | 37 | def rand_gumbel(shape): 38 | """Sample from the Gumbel distribution, protect from overflows.""" 39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 40 | return -torch.log(-torch.log(uniform_samples)) 41 | 42 | 43 | def rand_gumbel_like(x): 44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 45 | return g 46 | 47 | 48 | def slice_segments(x, ids_str, segment_size=4): 49 | ret = torch.zeros_like(x[:, :, :segment_size]) 50 | for i in range(x.size(0)): 51 | idx_str = ids_str[i] 52 | idx_end = idx_str + segment_size 53 | ret[i] = x[i, :, idx_str:idx_end] 54 | return ret 55 | 56 | 57 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 58 | b, d, t = x.size() 59 | if x_lengths is None: 60 | x_lengths = t 61 | ids_str_max = x_lengths - segment_size + 1 62 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 63 | ret = slice_segments(x, ids_str, segment_size) 64 | return ret, ids_str 65 | 66 | 67 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 68 | position = torch.arange(length, dtype=torch.float) 69 | num_timescales = channels // 2 70 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 71 | num_timescales - 1 72 | ) 73 | inv_timescales = min_timescale * torch.exp( 74 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 75 | ) 76 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 77 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 78 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 79 | signal = signal.view(1, channels, length) 80 | return signal 81 | 82 | 83 | def 
add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 84 | b, channels, length = x.size() 85 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 86 | return x + signal.to(dtype=x.dtype, device=x.device) 87 | 88 | 89 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 90 | b, channels, length = x.size() 91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 92 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 93 | 94 | 95 | def subsequent_mask(length): 96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 97 | return mask 98 | 99 | 100 | @torch.jit.script 101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 102 | n_channels_int = n_channels[0] 103 | in_act = input_a + input_b 104 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 106 | acts = t_act * s_act 107 | return acts 108 | 109 | 110 | def convert_pad_shape(pad_shape): 111 | l = pad_shape[::-1] 112 | pad_shape = [item for sublist in l for item in sublist] 113 | return pad_shape 114 | 115 | 116 | def shift_1d(x): 117 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 118 | return x 119 | 120 | 121 | def sequence_mask(length, max_length=None): 122 | if max_length is None: 123 | max_length = length.max() 124 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 125 | return x.unsqueeze(0) < length.unsqueeze(1) 126 | 127 | 128 | def generate_path(duration, mask): 129 | """ 130 | duration: [b, 1, t_x] 131 | mask: [b, 1, t_y, t_x] 132 | """ 133 | device = duration.device 134 | 135 | b, _, t_y, t_x = mask.shape 136 | cum_duration = torch.cumsum(duration, -1) 137 | 138 | cum_duration_flat = cum_duration.view(b * t_x) 139 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 140 | path = path.view(b, t_x, t_y) 141 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 142 | path = path.unsqueeze(1).transpose(2, 3) * mask 143 | return path 144 | 145 | 146 | def clip_grad_value_(parameters, clip_value, norm_type=2): 147 | if isinstance(parameters, torch.Tensor): 148 | parameters = [parameters] 149 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 150 | norm_type = float(norm_type) 151 | if clip_value is not None: 152 | clip_value = float(clip_value) 153 | 154 | total_norm = 0 155 | for p in parameters: 156 | param_norm = p.grad.data.norm(norm_type) 157 | total_norm += param_norm.item() ** norm_type 158 | if clip_value is not None: 159 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 160 | total_norm = total_norm ** (1.0 / norm_type) 161 | return total_norm 162 | -------------------------------------------------------------------------------- /config_sample.py: -------------------------------------------------------------------------------- 1 | SECRET_KEY = "" 2 | OPENAI_API_KEY = "" 3 | BAIDU_TRANSLATE_URL = "https://fanyi-api.baidu.com/api/trans/vip/translate" 4 | BAIDU_TRANSLATE_APPID = "" 5 | BAIDU_TRANSLATE_KEY = "" 6 | REPLACEMENTS = {} 7 | TRANSCRIPT_PROMPT = {"zh": "", "ja": ""} 8 | ENHANCE_PROMPT = "" 9 | INITIAL_CONTEXT = [] 10 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pyexpat import model 3 | import torch 4 | 5 | import commons 6 | import utils 7 | from models import 
SynthesizerTrn 8 | 9 | from scipy.io.wavfile import write 10 | 11 | import text as text_utils 12 | 13 | 14 | class vits_inference: 15 | def __init__(self, model_name): 16 | self.load_model( 17 | os.path.join("model", model_name, "config.json"), 18 | os.path.join("model", model_name, "model.pth"), 19 | ) 20 | 21 | def get_text(self, text, hps): 22 | text_norm = text_utils.text_to_sequence(text, hps.data.text_cleaners) 23 | if hps.data.add_blank: 24 | text_norm = commons.intersperse(text_norm, 0) 25 | text_norm = torch.LongTensor(text_norm) 26 | return text_norm 27 | 28 | def load_model(self, config_file, model_file): 29 | self.hps = utils.get_hparams_from_file(config_file) 30 | self.net_g = SynthesizerTrn( 31 | len(text_utils.symbols), 32 | self.hps.data.filter_length // 2 + 1, 33 | self.hps.train.segment_size // self.hps.data.hop_length, 34 | **self.hps.model 35 | ).cuda() 36 | _ = self.net_g.eval() 37 | _ = utils.load_checkpoint(model_file, self.net_g, None) 38 | 39 | def synthesis(self, output_file, target_text, speaker_id=-1): 40 | stn_tst = self.get_text(target_text, self.hps) 41 | with torch.no_grad(): 42 | x_tst = stn_tst.cuda().unsqueeze(0) 43 | x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda() 44 | if speaker_id != -1: 45 | sid = torch.LongTensor([int(speaker_id)]).cuda() 46 | audio = ( 47 | self.net_g.infer( 48 | x_tst, 49 | x_tst_lengths, 50 | sid=sid, 51 | noise_scale=0.667, 52 | noise_scale_w=0.8, 53 | length_scale=1, 54 | )[0][0, 0] 55 | .data.cpu() 56 | .float() 57 | .numpy() 58 | ) 59 | else: 60 | audio = ( 61 | self.net_g.infer( 62 | x_tst, 63 | x_tst_lengths, 64 | noise_scale=0.667, 65 | noise_scale_w=0.8, 66 | length_scale=1, 67 | )[0][0, 0] 68 | .data.cpu() 69 | .float() 70 | .numpy() 71 | ) 72 | audio = audio * 32768.0 73 | audio = audio.squeeze() 74 | audio = audio.astype("int16") 75 | write(output_file, 22050, audio) 76 | 77 | 78 | if __name__ == "__main__": 79 | model_name = input("模型名称:") 80 | output_dir = input("输出目录:") 81 | model = vits_inference(model_name) 82 | while True: 83 | target_text = input("生成文本:") 84 | output_file = os.path.join( 85 | output_dir, "{}_{}.wav".format(model_name, target_text.replace(" ", "_")) 86 | ) 87 | model.synthesis(output_file, target_text) 88 | print("生成完成,输出文件:{}".format(output_file)) 89 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, render_template, send_file, abort, jsonify, session 2 | from inference import vits_inference 3 | import baidu_translate 4 | import openai_api 5 | from io import BytesIO 6 | import waitress 7 | 8 | vits_model = vits_inference("atri") 9 | 10 | app = Flask(__name__) 11 | app.config.from_object("config") 12 | 13 | 14 | @app.route("/") 15 | def index(): 16 | return render_template("index.html") 17 | 18 | 19 | @app.route("/synthesis", methods=["GET"]) 20 | def synthesis(): 21 | text = request.args.get("text", type=str) 22 | try: 23 | output_audio = BytesIO() 24 | vits_model.synthesis(output_audio, text, -1) 25 | return send_file(output_audio, mimetype="audio/wav") 26 | except: 27 | abort(500) 28 | 29 | 30 | @app.route("/translate", methods=["GET"]) 31 | def translate(): 32 | text = request.args.get("text", type=str) 33 | return jsonify(baidu_translate.translate(text)) 34 | 35 | 36 | @app.route("/transcript", methods=["POST"]) 37 | def transcript(): 38 | language = request.args.get("language", default="", type=str) 39 | audio = 
BytesIO(request.files["audio"].stream.read()) 40 | audio.name = "audio.wav" 41 | return jsonify(openai_api.transcript(audio, language)) 42 | 43 | 44 | @app.route("/chat_complete", methods=["GET"]) 45 | def chat_complete(): 46 | context = session.get("context", []) 47 | text = request.args.get("text", type=str) 48 | if text.strip(): 49 | new_context, message = openai_api.chat_complete(context, text) 50 | if message: 51 | session["context"] = new_context 52 | return jsonify(code=0, message=message) 53 | else: 54 | return jsonify(code=500) 55 | else: 56 | return jsonify(code=400) 57 | 58 | 59 | @app.route("/reset_context", methods=["GET"]) 60 | def reset_context(): 61 | session["context"] = [] 62 | return jsonify(code=0) 63 | 64 | 65 | if __name__ == "__main__": 66 | # app.run(debug=False, host="0.0.0.0", port=8080) 67 | waitress.serve(app, host="0.0.0.0", port=8080) 68 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | import modules 8 | import attentions 9 | import monotonic_align 10 | 11 | from torch.nn import Conv1d, ConvTranspose1d, Conv2d 12 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 13 | from commons import init_weights, get_padding 14 | 15 | 16 | class StochasticDurationPredictor(nn.Module): 17 | def __init__( 18 | self, 19 | in_channels, 20 | filter_channels, 21 | kernel_size, 22 | p_dropout, 23 | n_flows=4, 24 | gin_channels=0, 25 | ): 26 | super().__init__() 27 | filter_channels = in_channels # it needs to be removed from future version. 28 | self.in_channels = in_channels 29 | self.filter_channels = filter_channels 30 | self.kernel_size = kernel_size 31 | self.p_dropout = p_dropout 32 | self.n_flows = n_flows 33 | self.gin_channels = gin_channels 34 | 35 | self.log_flow = modules.Log() 36 | self.flows = nn.ModuleList() 37 | self.flows.append(modules.ElementwiseAffine(2)) 38 | for i in range(n_flows): 39 | self.flows.append( 40 | modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) 41 | ) 42 | self.flows.append(modules.Flip()) 43 | 44 | self.post_pre = nn.Conv1d(1, filter_channels, 1) 45 | self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) 46 | self.post_convs = modules.DDSConv( 47 | filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout 48 | ) 49 | self.post_flows = nn.ModuleList() 50 | self.post_flows.append(modules.ElementwiseAffine(2)) 51 | for i in range(4): 52 | self.post_flows.append( 53 | modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) 54 | ) 55 | self.post_flows.append(modules.Flip()) 56 | 57 | self.pre = nn.Conv1d(in_channels, filter_channels, 1) 58 | self.proj = nn.Conv1d(filter_channels, filter_channels, 1) 59 | self.convs = modules.DDSConv( 60 | filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout 61 | ) 62 | if gin_channels != 0: 63 | self.cond = nn.Conv1d(gin_channels, filter_channels, 1) 64 | 65 | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): 66 | x = torch.detach(x) 67 | x = self.pre(x) 68 | if g is not None: 69 | g = torch.detach(g) 70 | x = x + self.cond(g) 71 | x = self.convs(x, x_mask) 72 | x = self.proj(x) * x_mask 73 | 74 | if not reverse: 75 | flows = self.flows 76 | assert w is not None 77 | 78 | logdet_tot_q = 0 79 | h_w = self.post_pre(w) 80 | h_w = self.post_convs(h_w, x_mask) 81 | h_w = 
self.post_proj(h_w) * x_mask 82 | e_q = ( 83 | torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) 84 | * x_mask 85 | ) 86 | z_q = e_q 87 | for flow in self.post_flows: 88 | z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) 89 | logdet_tot_q += logdet_q 90 | z_u, z1 = torch.split(z_q, [1, 1], 1) 91 | u = torch.sigmoid(z_u) * x_mask 92 | z0 = (w - u) * x_mask 93 | logdet_tot_q += torch.sum( 94 | (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2] 95 | ) 96 | logq = ( 97 | torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) 98 | - logdet_tot_q 99 | ) 100 | 101 | logdet_tot = 0 102 | z0, logdet = self.log_flow(z0, x_mask) 103 | logdet_tot += logdet 104 | z = torch.cat([z0, z1], 1) 105 | for flow in flows: 106 | z, logdet = flow(z, x_mask, g=x, reverse=reverse) 107 | logdet_tot = logdet_tot + logdet 108 | nll = ( 109 | torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) 110 | - logdet_tot 111 | ) 112 | return nll + logq # [b] 113 | else: 114 | flows = list(reversed(self.flows)) 115 | flows = flows[:-2] + [flows[-1]] # remove a useless vflow 116 | z = ( 117 | torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) 118 | * noise_scale 119 | ) 120 | for flow in flows: 121 | z = flow(z, x_mask, g=x, reverse=reverse) 122 | z0, z1 = torch.split(z, [1, 1], 1) 123 | logw = z0 124 | return logw 125 | 126 | 127 | class DurationPredictor(nn.Module): 128 | def __init__( 129 | self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 130 | ): 131 | super().__init__() 132 | 133 | self.in_channels = in_channels 134 | self.filter_channels = filter_channels 135 | self.kernel_size = kernel_size 136 | self.p_dropout = p_dropout 137 | self.gin_channels = gin_channels 138 | 139 | self.drop = nn.Dropout(p_dropout) 140 | self.conv_1 = nn.Conv1d( 141 | in_channels, filter_channels, kernel_size, padding=kernel_size // 2 142 | ) 143 | self.norm_1 = modules.LayerNorm(filter_channels) 144 | self.conv_2 = nn.Conv1d( 145 | filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 146 | ) 147 | self.norm_2 = modules.LayerNorm(filter_channels) 148 | self.proj = nn.Conv1d(filter_channels, 1, 1) 149 | 150 | if gin_channels != 0: 151 | self.cond = nn.Conv1d(gin_channels, in_channels, 1) 152 | 153 | def forward(self, x, x_mask, g=None): 154 | x = torch.detach(x) 155 | if g is not None: 156 | g = torch.detach(g) 157 | x = x + self.cond(g) 158 | x = self.conv_1(x * x_mask) 159 | x = torch.relu(x) 160 | x = self.norm_1(x) 161 | x = self.drop(x) 162 | x = self.conv_2(x * x_mask) 163 | x = torch.relu(x) 164 | x = self.norm_2(x) 165 | x = self.drop(x) 166 | x = self.proj(x * x_mask) 167 | return x * x_mask 168 | 169 | 170 | class TextEncoder(nn.Module): 171 | def __init__( 172 | self, 173 | n_vocab, 174 | out_channels, 175 | hidden_channels, 176 | filter_channels, 177 | n_heads, 178 | n_layers, 179 | kernel_size, 180 | p_dropout, 181 | ): 182 | super().__init__() 183 | self.n_vocab = n_vocab 184 | self.out_channels = out_channels 185 | self.hidden_channels = hidden_channels 186 | self.filter_channels = filter_channels 187 | self.n_heads = n_heads 188 | self.n_layers = n_layers 189 | self.kernel_size = kernel_size 190 | self.p_dropout = p_dropout 191 | 192 | self.emb = nn.Embedding(n_vocab, hidden_channels) 193 | nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) 194 | 195 | self.encoder = attentions.Encoder( 196 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 197 | ) 198 | self.proj = 
nn.Conv1d(hidden_channels, out_channels * 2, 1) 199 | 200 | def forward(self, x, x_lengths): 201 | x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] 202 | x = torch.transpose(x, 1, -1) # [b, h, t] 203 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( 204 | x.dtype 205 | ) 206 | 207 | x = self.encoder(x * x_mask, x_mask) 208 | stats = self.proj(x) * x_mask 209 | 210 | m, logs = torch.split(stats, self.out_channels, dim=1) 211 | return x, m, logs, x_mask 212 | 213 | 214 | class ResidualCouplingBlock(nn.Module): 215 | def __init__( 216 | self, 217 | channels, 218 | hidden_channels, 219 | kernel_size, 220 | dilation_rate, 221 | n_layers, 222 | n_flows=4, 223 | gin_channels=0, 224 | ): 225 | super().__init__() 226 | self.channels = channels 227 | self.hidden_channels = hidden_channels 228 | self.kernel_size = kernel_size 229 | self.dilation_rate = dilation_rate 230 | self.n_layers = n_layers 231 | self.n_flows = n_flows 232 | self.gin_channels = gin_channels 233 | 234 | self.flows = nn.ModuleList() 235 | for i in range(n_flows): 236 | self.flows.append( 237 | modules.ResidualCouplingLayer( 238 | channels, 239 | hidden_channels, 240 | kernel_size, 241 | dilation_rate, 242 | n_layers, 243 | gin_channels=gin_channels, 244 | mean_only=True, 245 | ) 246 | ) 247 | self.flows.append(modules.Flip()) 248 | 249 | def forward(self, x, x_mask, g=None, reverse=False): 250 | if not reverse: 251 | for flow in self.flows: 252 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 253 | else: 254 | for flow in reversed(self.flows): 255 | x = flow(x, x_mask, g=g, reverse=reverse) 256 | return x 257 | 258 | 259 | class PosteriorEncoder(nn.Module): 260 | def __init__( 261 | self, 262 | in_channels, 263 | out_channels, 264 | hidden_channels, 265 | kernel_size, 266 | dilation_rate, 267 | n_layers, 268 | gin_channels=0, 269 | ): 270 | super().__init__() 271 | self.in_channels = in_channels 272 | self.out_channels = out_channels 273 | self.hidden_channels = hidden_channels 274 | self.kernel_size = kernel_size 275 | self.dilation_rate = dilation_rate 276 | self.n_layers = n_layers 277 | self.gin_channels = gin_channels 278 | 279 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 280 | self.enc = modules.WN( 281 | hidden_channels, 282 | kernel_size, 283 | dilation_rate, 284 | n_layers, 285 | gin_channels=gin_channels, 286 | ) 287 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 288 | 289 | def forward(self, x, x_lengths, g=None): 290 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( 291 | x.dtype 292 | ) 293 | x = self.pre(x) * x_mask 294 | x = self.enc(x, x_mask, g=g) 295 | stats = self.proj(x) * x_mask 296 | m, logs = torch.split(stats, self.out_channels, dim=1) 297 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 298 | return z, m, logs, x_mask 299 | 300 | 301 | class Generator(torch.nn.Module): 302 | def __init__( 303 | self, 304 | initial_channel, 305 | resblock, 306 | resblock_kernel_sizes, 307 | resblock_dilation_sizes, 308 | upsample_rates, 309 | upsample_initial_channel, 310 | upsample_kernel_sizes, 311 | gin_channels=0, 312 | ): 313 | super(Generator, self).__init__() 314 | self.num_kernels = len(resblock_kernel_sizes) 315 | self.num_upsamples = len(upsample_rates) 316 | self.conv_pre = Conv1d( 317 | initial_channel, upsample_initial_channel, 7, 1, padding=3 318 | ) 319 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 320 | 321 | self.ups = nn.ModuleList() 322 | for i, (u, k) in 
enumerate(zip(upsample_rates, upsample_kernel_sizes)): 323 | self.ups.append( 324 | weight_norm( 325 | ConvTranspose1d( 326 | upsample_initial_channel // (2**i), 327 | upsample_initial_channel // (2 ** (i + 1)), 328 | k, 329 | u, 330 | padding=(k - u) // 2, 331 | ) 332 | ) 333 | ) 334 | 335 | self.resblocks = nn.ModuleList() 336 | for i in range(len(self.ups)): 337 | ch = upsample_initial_channel // (2 ** (i + 1)) 338 | for j, (k, d) in enumerate( 339 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 340 | ): 341 | self.resblocks.append(resblock(ch, k, d)) 342 | 343 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 344 | self.ups.apply(init_weights) 345 | 346 | if gin_channels != 0: 347 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 348 | 349 | def forward(self, x, g=None): 350 | x = self.conv_pre(x) 351 | if g is not None: 352 | x = x + self.cond(g) 353 | 354 | for i in range(self.num_upsamples): 355 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 356 | x = self.ups[i](x) 357 | xs = None 358 | for j in range(self.num_kernels): 359 | if xs is None: 360 | xs = self.resblocks[i * self.num_kernels + j](x) 361 | else: 362 | xs += self.resblocks[i * self.num_kernels + j](x) 363 | x = xs / self.num_kernels 364 | x = F.leaky_relu(x) 365 | x = self.conv_post(x) 366 | x = torch.tanh(x) 367 | 368 | return x 369 | 370 | def remove_weight_norm(self): 371 | print("Removing weight norm...") 372 | for l in self.ups: 373 | remove_weight_norm(l) 374 | for l in self.resblocks: 375 | l.remove_weight_norm() 376 | 377 | 378 | class DiscriminatorP(torch.nn.Module): 379 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 380 | super(DiscriminatorP, self).__init__() 381 | self.period = period 382 | self.use_spectral_norm = use_spectral_norm 383 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 384 | self.convs = nn.ModuleList( 385 | [ 386 | norm_f( 387 | Conv2d( 388 | 1, 389 | 32, 390 | (kernel_size, 1), 391 | (stride, 1), 392 | padding=(get_padding(kernel_size, 1), 0), 393 | ) 394 | ), 395 | norm_f( 396 | Conv2d( 397 | 32, 398 | 128, 399 | (kernel_size, 1), 400 | (stride, 1), 401 | padding=(get_padding(kernel_size, 1), 0), 402 | ) 403 | ), 404 | norm_f( 405 | Conv2d( 406 | 128, 407 | 512, 408 | (kernel_size, 1), 409 | (stride, 1), 410 | padding=(get_padding(kernel_size, 1), 0), 411 | ) 412 | ), 413 | norm_f( 414 | Conv2d( 415 | 512, 416 | 1024, 417 | (kernel_size, 1), 418 | (stride, 1), 419 | padding=(get_padding(kernel_size, 1), 0), 420 | ) 421 | ), 422 | norm_f( 423 | Conv2d( 424 | 1024, 425 | 1024, 426 | (kernel_size, 1), 427 | 1, 428 | padding=(get_padding(kernel_size, 1), 0), 429 | ) 430 | ), 431 | ] 432 | ) 433 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 434 | 435 | def forward(self, x): 436 | fmap = [] 437 | 438 | # 1d to 2d 439 | b, c, t = x.shape 440 | if t % self.period != 0: # pad first 441 | n_pad = self.period - (t % self.period) 442 | x = F.pad(x, (0, n_pad), "reflect") 443 | t = t + n_pad 444 | x = x.view(b, c, t // self.period, self.period) 445 | 446 | for l in self.convs: 447 | x = l(x) 448 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 449 | fmap.append(x) 450 | x = self.conv_post(x) 451 | fmap.append(x) 452 | x = torch.flatten(x, 1, -1) 453 | 454 | return x, fmap 455 | 456 | 457 | class DiscriminatorS(torch.nn.Module): 458 | def __init__(self, use_spectral_norm=False): 459 | super(DiscriminatorS, self).__init__() 460 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 461 
| self.convs = nn.ModuleList( 462 | [ 463 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 464 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 465 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 466 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 467 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 468 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 469 | ] 470 | ) 471 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 472 | 473 | def forward(self, x): 474 | fmap = [] 475 | 476 | for l in self.convs: 477 | x = l(x) 478 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 479 | fmap.append(x) 480 | x = self.conv_post(x) 481 | fmap.append(x) 482 | x = torch.flatten(x, 1, -1) 483 | 484 | return x, fmap 485 | 486 | 487 | class MultiPeriodDiscriminator(torch.nn.Module): 488 | def __init__(self, use_spectral_norm=False): 489 | super(MultiPeriodDiscriminator, self).__init__() 490 | periods = [2, 3, 5, 7, 11] 491 | 492 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 493 | discs = discs + [ 494 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 495 | ] 496 | self.discriminators = nn.ModuleList(discs) 497 | 498 | def forward(self, y, y_hat): 499 | y_d_rs = [] 500 | y_d_gs = [] 501 | fmap_rs = [] 502 | fmap_gs = [] 503 | for i, d in enumerate(self.discriminators): 504 | y_d_r, fmap_r = d(y) 505 | y_d_g, fmap_g = d(y_hat) 506 | y_d_rs.append(y_d_r) 507 | y_d_gs.append(y_d_g) 508 | fmap_rs.append(fmap_r) 509 | fmap_gs.append(fmap_g) 510 | 511 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 512 | 513 | 514 | class SynthesizerTrn(nn.Module): 515 | """ 516 | Synthesizer for Training 517 | """ 518 | 519 | def __init__( 520 | self, 521 | n_vocab, 522 | spec_channels, 523 | segment_size, 524 | inter_channels, 525 | hidden_channels, 526 | filter_channels, 527 | n_heads, 528 | n_layers, 529 | kernel_size, 530 | p_dropout, 531 | resblock, 532 | resblock_kernel_sizes, 533 | resblock_dilation_sizes, 534 | upsample_rates, 535 | upsample_initial_channel, 536 | upsample_kernel_sizes, 537 | n_speakers=0, 538 | gin_channels=0, 539 | use_sdp=True, 540 | **kwargs 541 | ): 542 | super().__init__() 543 | self.n_vocab = n_vocab 544 | self.spec_channels = spec_channels 545 | self.inter_channels = inter_channels 546 | self.hidden_channels = hidden_channels 547 | self.filter_channels = filter_channels 548 | self.n_heads = n_heads 549 | self.n_layers = n_layers 550 | self.kernel_size = kernel_size 551 | self.p_dropout = p_dropout 552 | self.resblock = resblock 553 | self.resblock_kernel_sizes = resblock_kernel_sizes 554 | self.resblock_dilation_sizes = resblock_dilation_sizes 555 | self.upsample_rates = upsample_rates 556 | self.upsample_initial_channel = upsample_initial_channel 557 | self.upsample_kernel_sizes = upsample_kernel_sizes 558 | self.segment_size = segment_size 559 | self.n_speakers = n_speakers 560 | self.gin_channels = gin_channels 561 | 562 | self.use_sdp = use_sdp 563 | 564 | self.enc_p = TextEncoder( 565 | n_vocab, 566 | inter_channels, 567 | hidden_channels, 568 | filter_channels, 569 | n_heads, 570 | n_layers, 571 | kernel_size, 572 | p_dropout, 573 | ) 574 | self.dec = Generator( 575 | inter_channels, 576 | resblock, 577 | resblock_kernel_sizes, 578 | resblock_dilation_sizes, 579 | upsample_rates, 580 | upsample_initial_channel, 581 | upsample_kernel_sizes, 582 | gin_channels=gin_channels, 583 | ) 584 | self.enc_q = PosteriorEncoder( 585 | spec_channels, 586 | inter_channels, 587 | hidden_channels, 588 | 5, 589 | 1, 590 | 16, 591 | 
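# The positional arguments 5, 1 and 16 above are the posterior encoder's WN kernel
# size, dilation rate and number of layers; they are hard-coded here, matching the
# reference VITS implementation.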
gin_channels=gin_channels, 592 | ) 593 | self.flow = ResidualCouplingBlock( 594 | inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels 595 | ) 596 | 597 | if use_sdp: 598 | self.dp = StochasticDurationPredictor( 599 | hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels 600 | ) 601 | else: 602 | self.dp = DurationPredictor( 603 | hidden_channels, 256, 3, 0.5, gin_channels=gin_channels 604 | ) 605 | 606 | if n_speakers > 1: 607 | self.emb_g = nn.Embedding(n_speakers, gin_channels) 608 | 609 | def forward(self, x, x_lengths, y, y_lengths, sid=None): 610 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths) 611 | if self.n_speakers > 0: 612 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 613 | else: 614 | g = None 615 | 616 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) 617 | z_p = self.flow(z, y_mask, g=g) 618 | 619 | with torch.no_grad(): 620 | # negative cross-entropy 621 | s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t] 622 | neg_cent1 = torch.sum( 623 | -0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True 624 | ) # [b, 1, t_s] 625 | neg_cent2 = torch.matmul( 626 | -0.5 * (z_p**2).transpose(1, 2), s_p_sq_r 627 | ) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s] 628 | neg_cent3 = torch.matmul( 629 | z_p.transpose(1, 2), (m_p * s_p_sq_r) 630 | ) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s] 631 | neg_cent4 = torch.sum( 632 | -0.5 * (m_p**2) * s_p_sq_r, [1], keepdim=True 633 | ) # [b, 1, t_s] 634 | neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4 635 | 636 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 637 | attn = ( 638 | monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)) 639 | .unsqueeze(1) 640 | .detach() 641 | ) 642 | 643 | w = attn.sum(2) 644 | if self.use_sdp: 645 | l_length = self.dp(x, x_mask, w, g=g) 646 | l_length = l_length / torch.sum(x_mask) 647 | else: 648 | logw_ = torch.log(w + 1e-6) * x_mask 649 | logw = self.dp(x, x_mask, g=g) 650 | l_length = torch.sum((logw - logw_) ** 2, [1, 2]) / torch.sum( 651 | x_mask 652 | ) # for averaging 653 | 654 | # expand prior 655 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) 656 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) 657 | 658 | z_slice, ids_slice = commons.rand_slice_segments( 659 | z, y_lengths, self.segment_size 660 | ) 661 | o = self.dec(z_slice, g=g) 662 | return ( 663 | o, 664 | l_length, 665 | attn, 666 | ids_slice, 667 | x_mask, 668 | y_mask, 669 | (z, z_p, m_p, logs_p, m_q, logs_q), 670 | ) 671 | 672 | def infer( 673 | self, 674 | x, 675 | x_lengths, 676 | sid=None, 677 | noise_scale=1, 678 | length_scale=1, 679 | noise_scale_w=1.0, 680 | max_len=None, 681 | ): 682 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths) 683 | if self.n_speakers > 0: 684 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 685 | else: 686 | g = None 687 | 688 | if self.use_sdp: 689 | logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) 690 | else: 691 | logw = self.dp(x, x_mask, g=g) 692 | w = torch.exp(logw) * x_mask * length_scale 693 | w_ceil = torch.ceil(w) 694 | y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() 695 | y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to( 696 | x_mask.dtype 697 | ) 698 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 699 | attn = commons.generate_path(w_ceil, attn_mask) 700 | 701 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose( 702 | 1, 2 703 | ) # [b, t', t], [b, t, d] -> [b, d, t'] 704 | 
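# logs_p is expanded with the same predicted alignment below; the frame-level prior
# (m_p, logs_p) is then sampled with noise_scale, mapped through the inverse
# normalizing flow, and decoded to a waveform by the HiFi-GAN-style generator
# self.dec.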
logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose( 705 | 1, 2 706 | ) # [b, t', t], [b, t, d] -> [b, d, t'] 707 | 708 | z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale 709 | z = self.flow(z_p, y_mask, g=g, reverse=True) 710 | o = self.dec((z * y_mask)[:, :, :max_len], g=g) 711 | return o, attn, y_mask, (z, z_p, m_p, logs_p) 712 | 713 | def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): 714 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 715 | g_src = self.emb_g(sid_src).unsqueeze(-1) 716 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 717 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) 718 | z_p = self.flow(z, y_mask, g=g_src) 719 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) 720 | o_hat = self.dec(z_hat * y_mask, g=g_tgt) 721 | return o_hat, y_mask, (z, z_p, z_hat) 722 | -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from torch.nn import Conv1d 7 | from torch.nn.utils import weight_norm, remove_weight_norm 8 | 9 | import commons 10 | from commons import init_weights, get_padding 11 | from transforms import piecewise_rational_quadratic_transform 12 | 13 | 14 | LRELU_SLOPE = 0.1 15 | 16 | 17 | class LayerNorm(nn.Module): 18 | def __init__(self, channels, eps=1e-5): 19 | super().__init__() 20 | self.channels = channels 21 | self.eps = eps 22 | 23 | self.gamma = nn.Parameter(torch.ones(channels)) 24 | self.beta = nn.Parameter(torch.zeros(channels)) 25 | 26 | def forward(self, x): 27 | x = x.transpose(1, -1) 28 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 29 | return x.transpose(1, -1) 30 | 31 | 32 | class ConvReluNorm(nn.Module): 33 | def __init__( 34 | self, 35 | in_channels, 36 | hidden_channels, 37 | out_channels, 38 | kernel_size, 39 | n_layers, 40 | p_dropout, 41 | ): 42 | super().__init__() 43 | self.in_channels = in_channels 44 | self.hidden_channels = hidden_channels 45 | self.out_channels = out_channels 46 | self.kernel_size = kernel_size 47 | self.n_layers = n_layers 48 | self.p_dropout = p_dropout 49 | assert n_layers > 1, "Number of layers should be larger than 0." 
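# The loop below builds one input convolution plus n_layers - 1 hidden convolutions,
# each followed by LayerNorm, ReLU and dropout; the final 1x1 projection is
# zero-initialized, so the block starts out as an identity residual mapping of its
# (masked) input.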
50 | 51 | self.conv_layers = nn.ModuleList() 52 | self.norm_layers = nn.ModuleList() 53 | self.conv_layers.append( 54 | nn.Conv1d( 55 | in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 56 | ) 57 | ) 58 | self.norm_layers.append(LayerNorm(hidden_channels)) 59 | self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) 60 | for _ in range(n_layers - 1): 61 | self.conv_layers.append( 62 | nn.Conv1d( 63 | hidden_channels, 64 | hidden_channels, 65 | kernel_size, 66 | padding=kernel_size // 2, 67 | ) 68 | ) 69 | self.norm_layers.append(LayerNorm(hidden_channels)) 70 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 71 | self.proj.weight.data.zero_() 72 | self.proj.bias.data.zero_() 73 | 74 | def forward(self, x, x_mask): 75 | x_org = x 76 | for i in range(self.n_layers): 77 | x = self.conv_layers[i](x * x_mask) 78 | x = self.norm_layers[i](x) 79 | x = self.relu_drop(x) 80 | x = x_org + self.proj(x) 81 | return x * x_mask 82 | 83 | 84 | class DDSConv(nn.Module): 85 | """ 86 | Dialted and Depth-Separable Convolution 87 | """ 88 | 89 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): 90 | super().__init__() 91 | self.channels = channels 92 | self.kernel_size = kernel_size 93 | self.n_layers = n_layers 94 | self.p_dropout = p_dropout 95 | 96 | self.drop = nn.Dropout(p_dropout) 97 | self.convs_sep = nn.ModuleList() 98 | self.convs_1x1 = nn.ModuleList() 99 | self.norms_1 = nn.ModuleList() 100 | self.norms_2 = nn.ModuleList() 101 | for i in range(n_layers): 102 | dilation = kernel_size**i 103 | padding = (kernel_size * dilation - dilation) // 2 104 | self.convs_sep.append( 105 | nn.Conv1d( 106 | channels, 107 | channels, 108 | kernel_size, 109 | groups=channels, 110 | dilation=dilation, 111 | padding=padding, 112 | ) 113 | ) 114 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 115 | self.norms_1.append(LayerNorm(channels)) 116 | self.norms_2.append(LayerNorm(channels)) 117 | 118 | def forward(self, x, x_mask, g=None): 119 | if g is not None: 120 | x = x + g 121 | for i in range(self.n_layers): 122 | y = self.convs_sep[i](x * x_mask) 123 | y = self.norms_1[i](y) 124 | y = F.gelu(y) 125 | y = self.convs_1x1[i](y) 126 | y = self.norms_2[i](y) 127 | y = F.gelu(y) 128 | y = self.drop(y) 129 | x = x + y 130 | return x * x_mask 131 | 132 | 133 | class WN(torch.nn.Module): 134 | def __init__( 135 | self, 136 | hidden_channels, 137 | kernel_size, 138 | dilation_rate, 139 | n_layers, 140 | gin_channels=0, 141 | p_dropout=0, 142 | ): 143 | super(WN, self).__init__() 144 | assert kernel_size % 2 == 1 145 | self.hidden_channels = hidden_channels 146 | self.kernel_size = (kernel_size,) 147 | self.dilation_rate = dilation_rate 148 | self.n_layers = n_layers 149 | self.gin_channels = gin_channels 150 | self.p_dropout = p_dropout 151 | 152 | self.in_layers = torch.nn.ModuleList() 153 | self.res_skip_layers = torch.nn.ModuleList() 154 | self.drop = nn.Dropout(p_dropout) 155 | 156 | if gin_channels != 0: 157 | cond_layer = torch.nn.Conv1d( 158 | gin_channels, 2 * hidden_channels * n_layers, 1 159 | ) 160 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") 161 | 162 | for i in range(n_layers): 163 | dilation = dilation_rate**i 164 | padding = int((kernel_size * dilation - dilation) / 2) 165 | in_layer = torch.nn.Conv1d( 166 | hidden_channels, 167 | 2 * hidden_channels, 168 | kernel_size, 169 | dilation=dilation, 170 | padding=padding, 171 | ) 172 | in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") 173 | 
self.in_layers.append(in_layer) 174 | 175 | # last one is not necessary 176 | if i < n_layers - 1: 177 | res_skip_channels = 2 * hidden_channels 178 | else: 179 | res_skip_channels = hidden_channels 180 | 181 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 182 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") 183 | self.res_skip_layers.append(res_skip_layer) 184 | 185 | def forward(self, x, x_mask, g=None, **kwargs): 186 | output = torch.zeros_like(x) 187 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 188 | 189 | if g is not None: 190 | g = self.cond_layer(g) 191 | 192 | for i in range(self.n_layers): 193 | x_in = self.in_layers[i](x) 194 | if g is not None: 195 | cond_offset = i * 2 * self.hidden_channels 196 | g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] 197 | else: 198 | g_l = torch.zeros_like(x_in) 199 | 200 | acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) 201 | acts = self.drop(acts) 202 | 203 | res_skip_acts = self.res_skip_layers[i](acts) 204 | if i < self.n_layers - 1: 205 | res_acts = res_skip_acts[:, : self.hidden_channels, :] 206 | x = (x + res_acts) * x_mask 207 | output = output + res_skip_acts[:, self.hidden_channels :, :] 208 | else: 209 | output = output + res_skip_acts 210 | return output * x_mask 211 | 212 | def remove_weight_norm(self): 213 | if self.gin_channels != 0: 214 | torch.nn.utils.remove_weight_norm(self.cond_layer) 215 | for l in self.in_layers: 216 | torch.nn.utils.remove_weight_norm(l) 217 | for l in self.res_skip_layers: 218 | torch.nn.utils.remove_weight_norm(l) 219 | 220 | 221 | class ResBlock1(torch.nn.Module): 222 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 223 | super(ResBlock1, self).__init__() 224 | self.convs1 = nn.ModuleList( 225 | [ 226 | weight_norm( 227 | Conv1d( 228 | channels, 229 | channels, 230 | kernel_size, 231 | 1, 232 | dilation=dilation[0], 233 | padding=get_padding(kernel_size, dilation[0]), 234 | ) 235 | ), 236 | weight_norm( 237 | Conv1d( 238 | channels, 239 | channels, 240 | kernel_size, 241 | 1, 242 | dilation=dilation[1], 243 | padding=get_padding(kernel_size, dilation[1]), 244 | ) 245 | ), 246 | weight_norm( 247 | Conv1d( 248 | channels, 249 | channels, 250 | kernel_size, 251 | 1, 252 | dilation=dilation[2], 253 | padding=get_padding(kernel_size, dilation[2]), 254 | ) 255 | ), 256 | ] 257 | ) 258 | self.convs1.apply(init_weights) 259 | 260 | self.convs2 = nn.ModuleList( 261 | [ 262 | weight_norm( 263 | Conv1d( 264 | channels, 265 | channels, 266 | kernel_size, 267 | 1, 268 | dilation=1, 269 | padding=get_padding(kernel_size, 1), 270 | ) 271 | ), 272 | weight_norm( 273 | Conv1d( 274 | channels, 275 | channels, 276 | kernel_size, 277 | 1, 278 | dilation=1, 279 | padding=get_padding(kernel_size, 1), 280 | ) 281 | ), 282 | weight_norm( 283 | Conv1d( 284 | channels, 285 | channels, 286 | kernel_size, 287 | 1, 288 | dilation=1, 289 | padding=get_padding(kernel_size, 1), 290 | ) 291 | ), 292 | ] 293 | ) 294 | self.convs2.apply(init_weights) 295 | 296 | def forward(self, x, x_mask=None): 297 | for c1, c2 in zip(self.convs1, self.convs2): 298 | xt = F.leaky_relu(x, LRELU_SLOPE) 299 | if x_mask is not None: 300 | xt = xt * x_mask 301 | xt = c1(xt) 302 | xt = F.leaky_relu(xt, LRELU_SLOPE) 303 | if x_mask is not None: 304 | xt = xt * x_mask 305 | xt = c2(xt) 306 | x = xt + x 307 | if x_mask is not None: 308 | x = x * x_mask 309 | return x 310 | 311 | def remove_weight_norm(self): 312 | for l in 
self.convs1: 313 | remove_weight_norm(l) 314 | for l in self.convs2: 315 | remove_weight_norm(l) 316 | 317 | 318 | class ResBlock2(torch.nn.Module): 319 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 320 | super(ResBlock2, self).__init__() 321 | self.convs = nn.ModuleList( 322 | [ 323 | weight_norm( 324 | Conv1d( 325 | channels, 326 | channels, 327 | kernel_size, 328 | 1, 329 | dilation=dilation[0], 330 | padding=get_padding(kernel_size, dilation[0]), 331 | ) 332 | ), 333 | weight_norm( 334 | Conv1d( 335 | channels, 336 | channels, 337 | kernel_size, 338 | 1, 339 | dilation=dilation[1], 340 | padding=get_padding(kernel_size, dilation[1]), 341 | ) 342 | ), 343 | ] 344 | ) 345 | self.convs.apply(init_weights) 346 | 347 | def forward(self, x, x_mask=None): 348 | for c in self.convs: 349 | xt = F.leaky_relu(x, LRELU_SLOPE) 350 | if x_mask is not None: 351 | xt = xt * x_mask 352 | xt = c(xt) 353 | x = xt + x 354 | if x_mask is not None: 355 | x = x * x_mask 356 | return x 357 | 358 | def remove_weight_norm(self): 359 | for l in self.convs: 360 | remove_weight_norm(l) 361 | 362 | 363 | class Log(nn.Module): 364 | def forward(self, x, x_mask, reverse=False, **kwargs): 365 | if not reverse: 366 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 367 | logdet = torch.sum(-y, [1, 2]) 368 | return y, logdet 369 | else: 370 | x = torch.exp(x) * x_mask 371 | return x 372 | 373 | 374 | class Flip(nn.Module): 375 | def forward(self, x, *args, reverse=False, **kwargs): 376 | x = torch.flip(x, [1]) 377 | if not reverse: 378 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 379 | return x, logdet 380 | else: 381 | return x 382 | 383 | 384 | class ElementwiseAffine(nn.Module): 385 | def __init__(self, channels): 386 | super().__init__() 387 | self.channels = channels 388 | self.m = nn.Parameter(torch.zeros(channels, 1)) 389 | self.logs = nn.Parameter(torch.zeros(channels, 1)) 390 | 391 | def forward(self, x, x_mask, reverse=False, **kwargs): 392 | if not reverse: 393 | y = self.m + torch.exp(self.logs) * x 394 | y = y * x_mask 395 | logdet = torch.sum(self.logs * x_mask, [1, 2]) 396 | return y, logdet 397 | else: 398 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 399 | return x 400 | 401 | 402 | class ResidualCouplingLayer(nn.Module): 403 | def __init__( 404 | self, 405 | channels, 406 | hidden_channels, 407 | kernel_size, 408 | dilation_rate, 409 | n_layers, 410 | p_dropout=0, 411 | gin_channels=0, 412 | mean_only=False, 413 | ): 414 | assert channels % 2 == 0, "channels should be divisible by 2" 415 | super().__init__() 416 | self.channels = channels 417 | self.hidden_channels = hidden_channels 418 | self.kernel_size = kernel_size 419 | self.dilation_rate = dilation_rate 420 | self.n_layers = n_layers 421 | self.half_channels = channels // 2 422 | self.mean_only = mean_only 423 | 424 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 425 | self.enc = WN( 426 | hidden_channels, 427 | kernel_size, 428 | dilation_rate, 429 | n_layers, 430 | p_dropout=p_dropout, 431 | gin_channels=gin_channels, 432 | ) 433 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 434 | self.post.weight.data.zero_() 435 | self.post.bias.data.zero_() 436 | 437 | def forward(self, x, x_mask, g=None, reverse=False): 438 | x0, x1 = torch.split(x, [self.half_channels] * 2, 1) 439 | h = self.pre(x0) * x_mask 440 | h = self.enc(h, x_mask, g=g) 441 | stats = self.post(h) * x_mask 442 | if not self.mean_only: 443 | m, logs = torch.split(stats, 
[self.half_channels] * 2, 1) 444 | else: 445 | m = stats 446 | logs = torch.zeros_like(m) 447 | 448 | if not reverse: 449 | x1 = m + x1 * torch.exp(logs) * x_mask 450 | x = torch.cat([x0, x1], 1) 451 | logdet = torch.sum(logs, [1, 2]) 452 | return x, logdet 453 | else: 454 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 455 | x = torch.cat([x0, x1], 1) 456 | return x 457 | 458 | 459 | class ConvFlow(nn.Module): 460 | def __init__( 461 | self, 462 | in_channels, 463 | filter_channels, 464 | kernel_size, 465 | n_layers, 466 | num_bins=10, 467 | tail_bound=5.0, 468 | ): 469 | super().__init__() 470 | self.in_channels = in_channels 471 | self.filter_channels = filter_channels 472 | self.kernel_size = kernel_size 473 | self.n_layers = n_layers 474 | self.num_bins = num_bins 475 | self.tail_bound = tail_bound 476 | self.half_channels = in_channels // 2 477 | 478 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 479 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) 480 | self.proj = nn.Conv1d( 481 | filter_channels, self.half_channels * (num_bins * 3 - 1), 1 482 | ) 483 | self.proj.weight.data.zero_() 484 | self.proj.bias.data.zero_() 485 | 486 | def forward(self, x, x_mask, g=None, reverse=False): 487 | x0, x1 = torch.split(x, [self.half_channels] * 2, 1) 488 | h = self.pre(x0) 489 | h = self.convs(h, x_mask, g=g) 490 | h = self.proj(h) * x_mask 491 | 492 | b, c, t = x0.shape 493 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 494 | 495 | unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) 496 | unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( 497 | self.filter_channels 498 | ) 499 | unnormalized_derivatives = h[..., 2 * self.num_bins :] 500 | 501 | x1, logabsdet = piecewise_rational_quadratic_transform( 502 | x1, 503 | unnormalized_widths, 504 | unnormalized_heights, 505 | unnormalized_derivatives, 506 | inverse=reverse, 507 | tails="linear", 508 | tail_bound=self.tail_bound, 509 | ) 510 | 511 | x = torch.cat([x0, x1], 1) * x_mask 512 | logdet = torch.sum(logabsdet * x_mask, [1, 2]) 513 | if not reverse: 514 | return x, logdet 515 | else: 516 | return x 517 | -------------------------------------------------------------------------------- /monotonic_align.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def maximum_path(neg_cent, mask): 6 | """Pure Python implementation. 
7 | neg_cent: [b, t_t, t_s] 8 | mask: [b, t_t, t_s] 9 | """ 10 | device = neg_cent.device 11 | dtype = neg_cent.dtype 12 | neg_cent = neg_cent.detach().cpu().numpy().astype(np.float32) 13 | path = np.zeros(neg_cent.shape, dtype=np.int32) 14 | 15 | for i in range(neg_cent.shape[0]): 16 | t_t_max = int(mask[i].sum(1)[0]) 17 | t_s_max = int(mask[i].sum(2)[0]) 18 | maximum_path_each(path[i], neg_cent[i], t_t_max, t_s_max) 19 | 20 | return torch.from_numpy(path).to(device=device, dtype=dtype) 21 | 22 | 23 | def maximum_path_each(path, value, t_y, t_x, max_neg_val=-1e9): 24 | for y in range(t_y): 25 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 26 | if x == y: 27 | v_cur = max_neg_val 28 | else: 29 | v_cur = value[y - 1, x] 30 | if x == 0: 31 | if y == 0: 32 | v_prev = 0.0 33 | else: 34 | v_prev = max_neg_val 35 | else: 36 | v_prev = value[y - 1, x - 1] 37 | value[y, x] += max(v_prev, v_cur) 38 | 39 | index = t_x - 1 40 | for y in range(t_y - 1, -1, -1): 41 | path[y, index] = 1 42 | if index != 0 and (index == y or value[y - 1, index] < value[y - 1, index - 1]): 43 | index = index - 1 44 | -------------------------------------------------------------------------------- /openai_api.py: -------------------------------------------------------------------------------- 1 | import config 2 | import openai 3 | from io import BytesIO 4 | 5 | openai.api_key = config.OPENAI_API_KEY 6 | 7 | 8 | def chat_complete(context: list, text: str) -> tuple[list, str]: 9 | new_context = context.copy() 10 | new_context.append({"role": "user", "content": config.ENHANCE_PROMPT + text}) 11 | try: 12 | completion = openai.ChatCompletion.create( 13 | model="gpt-3.5-turbo", messages=config.INITIAL_CONTEXT + new_context 14 | ) 15 | message = completion["choices"][0]["message"]["content"] 16 | new_context.append({"role": "assistant", "content": message}) 17 | return (new_context, message) 18 | except Exception: 19 | return context, "" 20 | 21 | 22 | def transcript(audio: BytesIO, language: str) -> dict: 23 | try: 24 | if language == "ja": 25 | result = openai.Audio.transcribe( 26 | "whisper-1", 27 | audio, 28 | language=language, 29 | prompt=config.TRANSCRIPT_PROMPT.get(language, ""), 30 | ) 31 | elif language == "zh": 32 | result = openai.Audio.transcribe( 33 | "whisper-1", 34 | audio, 35 | language=language, 36 | prompt=config.TRANSCRIPT_PROMPT.get(language, ""), 37 | ) 38 | else: 39 | result = openai.Audio.transcribe("whisper-1", audio) 40 | except openai.InvalidRequestError: 41 | return {"code": 1} 42 | except Exception: 43 | return {"code": -1} 44 | return {"code": 0, "result": result["text"]} 45 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | librosa 2 | matplotlib 3 | numpy 4 | phonemizer 5 | scipy 6 | torch 7 | torchvision 8 | Unidecode 9 | pyopenjtalk 10 | flask 11 | waitress 12 | openai -------------------------------------------------------------------------------- /static/css/index.css: -------------------------------------------------------------------------------- 1 | html, 2 | body { 3 | margin: 0; 4 | overflow: hidden; 5 | touch-action: none; 6 | } 7 | 8 | textarea { 9 | font-size: 24px; 10 | border: none; 11 | outline: 0; 12 | border-radius: 32px; 13 | padding: 32px; 14 | } 15 | 16 | input { 17 | font-size: 23px; 18 | border: none; 19 | outline: 0; 20 | border-radius: 32px; 21 | padding: 32px; 22 | } 23 | 24 | .center { 25 | position: absolute; 26 | left: 50%; 27 | transform:
translateX(-50%); 28 | } 29 | 30 | .button-img { 31 | max-width: 100%; 32 | max-height: 100%; 33 | width: auto; 34 | height: auto; 35 | } 36 | 37 | .button-img:active { 38 | opacity: 0.5; 39 | } 40 | 41 | .top-button { 42 | background-color: transparent; 43 | border: none; 44 | border-radius: 5px; 45 | } 46 | 47 | #header { 48 | position: fixed; 49 | top: 0; 50 | right: 0; 51 | } 52 | 53 | #live2d-canvas { 54 | position: absolute; 55 | width: 100%; 56 | height: 100%; 57 | z-index: -1; 58 | } 59 | 60 | #textarea-div { 61 | position: absolute; 62 | bottom: 5%; 63 | width: 100%; 64 | display: flex; 65 | flex-direction: column; 66 | align-items: center; 67 | justify-items: flex-end; 68 | } 69 | 70 | #captions-textarea { 71 | flex-grow: 2; 72 | background-color: rgba(0, 0, 0, 0.1); 73 | max-width: 1000px; 74 | width: 80%; 75 | height: 80%; 76 | } 77 | 78 | #message-input { 79 | flex-grow: 1; 80 | background-color: rgba(0, 0, 0, 0.1); 81 | max-width: 1000px; 82 | height: 20%; 83 | width: 80%; 84 | } 85 | 86 | #reload-button { 87 | animation: load 1s; 88 | } 89 | 90 | #reload-button:active { 91 | animation: none; 92 | } 93 | 94 | @media screen and (orientation: landscape) { 95 | #header { 96 | height: 8%; 97 | } 98 | 99 | #textarea-div { 100 | height: 40%; 101 | } 102 | 103 | .top-button { 104 | height: 100%; 105 | width: auto; 106 | } 107 | } 108 | 109 | @media screen and (orientation: portrait) { 110 | #header { 111 | width: 15%; 112 | } 113 | 114 | #textarea-div { 115 | height: 25%; 116 | } 117 | 118 | .top-button { 119 | width: 100%; 120 | height: auto; 121 | } 122 | } 123 | 124 | @keyframes load { 125 | 0% { 126 | transform: rotate(0deg); 127 | } 128 | 129 | 100% { 130 | transform: rotate(360deg); 131 | } 132 | } -------------------------------------------------------------------------------- /static/images/ja.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mirrorange/chatvits/76ef67c6d56662ca2af7313304c60f29c1c0f47f/static/images/ja.png -------------------------------------------------------------------------------- /static/images/record.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mirrorange/chatvits/76ef67c6d56662ca2af7313304c60f29c1c0f47f/static/images/record.png -------------------------------------------------------------------------------- /static/images/recording.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mirrorange/chatvits/76ef67c6d56662ca2af7313304c60f29c1c0f47f/static/images/recording.png -------------------------------------------------------------------------------- /static/images/reload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mirrorange/chatvits/76ef67c6d56662ca2af7313304c60f29c1c0f47f/static/images/reload.png -------------------------------------------------------------------------------- /static/images/text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mirrorange/chatvits/76ef67c6d56662ca2af7313304c60f29c1c0f47f/static/images/text.png -------------------------------------------------------------------------------- /static/images/zh.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mirrorange/chatvits/76ef67c6d56662ca2af7313304c60f29c1c0f47f/static/images/zh.png -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 |
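
A quick way to sanity-check the invertible flow blocks defined in modules.py above (ResidualCouplingLayer and Flip) is to push a tensor through the forward direction and then reverse it. The sketch below is not part of the repository: the channel sizes, sequence length, and the round-trip test itself are illustrative assumptions, and it assumes modules.py and its own imports resolve from the project root.

import torch
from modules import ResidualCouplingLayer, Flip

# Illustrative sizes (assumptions, not values the project actually uses).
channels, hidden_channels, t = 4, 16, 50

coupling = ResidualCouplingLayer(
    channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=2
)
flip = Flip()

x = torch.randn(1, channels, t)
x_mask = torch.ones(1, 1, t)  # all frames valid

# Forward direction: each flow returns the transformed tensor and a log-determinant.
y, logdet_coupling = coupling(x, x_mask)
y, logdet_flip = flip(y, x_mask)

# Reverse direction: undo the flows in the opposite order; only the tensor is returned.
z = flip(y, x_mask, reverse=True)
x_rec = coupling(z, x_mask, reverse=True)

print(torch.allclose(x, x_rec, atol=1e-5))  # expected: True

Because the coupling layer's `post` convolution is zero-initialized, the freshly constructed layer is close to an identity transform, so the reconstruction check passes trivially at initialization; the same round trip also holds for trained weights, which is what makes these blocks usable as normalizing-flow steps.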