├── .gitignore ├── LICENSE ├── README.md ├── attentions.py ├── baidu_translate.py ├── commons.py ├── config_sample.py ├── inference.py ├── main.py ├── models.py ├── modules.py ├── monotonic_align.py ├── openai_api.py ├── requirements.txt ├── static ├── css │ └── index.css ├── images │ ├── ja.png │ ├── record.png │ ├── recording.png │ ├── reload.png │ ├── text.png │ └── zh.png └── js │ ├── bundle.js │ ├── core-js.js │ └── live2dcubismcore.js ├── templates └── index.html ├── text ├── __init__.py ├── cleaners.py └── symbols.py ├── transforms.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | DUMMY1 2 | DUMMY2 3 | DUMMY3 4 | logs 5 | __pycache__ 6 | .ipynb_checkpoints 7 | .*.swp 8 | 9 | build 10 | *.c 11 | monotonic_align/monotonic_align 12 | 13 | /output 14 | 15 | /config.py 16 | /venv 17 | .env 18 | 19 | *.pth 20 | /model/*/config.json 21 | /static/atri 22 | 23 | /test.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 orange 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ChatVITS 2 | 3 | This is a voice chat web page based on ChatGPT, Whisper, VITS, and Live2D. 4 | 5 | ## Usage 6 | - Install Python 3.6 or later 7 | - Install CMake 8 | - Install the dependencies: `pip install -r requirements.txt` 9 | - Download the VITS model files, name them model.pth and config.json, and put them in the model folder. 10 | - Download the Live2D model files and put them in the static folder. 11 | - Copy config_sample.py to config.py and fill in the settings. 12 | - Run the server: `python main.py` 13 | -------------------------------------------------------------------------------- /attentions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | from modules import LayerNorm 8 | 9 | 10 | class Encoder(nn.Module): 11 | def __init__( 12 | self, 13 | hidden_channels, 14 | filter_channels, 15 | n_heads, 16 | n_layers, 17 | kernel_size=1, 18 | p_dropout=0.0, 19 | window_size=4, 20 | **kwargs 21 | ): 22 | super().__init__() 23 | self.hidden_channels = hidden_channels 24 | self.filter_channels = filter_channels 25 | self.n_heads = n_heads 26 | self.n_layers = n_layers 27 | self.kernel_size = kernel_size 28 | self.p_dropout = p_dropout 29 | self.window_size = window_size 30 | 31 | self.drop = nn.Dropout(p_dropout) 32 | self.attn_layers = nn.ModuleList() 33 | self.norm_layers_1 = nn.ModuleList() 34 | self.ffn_layers = nn.ModuleList() 35 | self.norm_layers_2 = nn.ModuleList() 36 | for i in range(self.n_layers): 37 | self.attn_layers.append( 38 | MultiHeadAttention( 39 | hidden_channels, 40 | hidden_channels, 41 | n_heads, 42 | p_dropout=p_dropout, 43 | window_size=window_size, 44 | ) 45 | ) 46 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 47 | self.ffn_layers.append( 48 | FFN( 49 | hidden_channels, 50 | hidden_channels, 51 | filter_channels, 52 | kernel_size, 53 | p_dropout=p_dropout, 54 | ) 55 | ) 56 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 57 | 58 | def forward(self, x, x_mask): 59 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 60 | x = x * x_mask 61 | for i in range(self.n_layers): 62 | y = self.attn_layers[i](x, x, attn_mask) 63 | y = self.drop(y) 64 | x = self.norm_layers_1[i](x + y) 65 | 66 | y = self.ffn_layers[i](x, x_mask) 67 | y = self.drop(y) 68 | x = self.norm_layers_2[i](x + y) 69 | x = x * x_mask 70 | return x 71 | 72 | 73 | class Decoder(nn.Module): 74 | def __init__( 75 | self, 76 | hidden_channels, 77 | filter_channels, 78 | n_heads, 79 | n_layers, 80 | kernel_size=1, 81 | p_dropout=0.0, 82 | proximal_bias=False, 83 | proximal_init=True, 84 | **kwargs 85 | ): 86 | super().__init__() 87 | self.hidden_channels = hidden_channels 88 | self.filter_channels = filter_channels 89 | self.n_heads = n_heads 90 | self.n_layers = n_layers 91 | self.kernel_size = kernel_size 92 | self.p_dropout = p_dropout 93 | self.proximal_bias = proximal_bias 94 | self.proximal_init = proximal_init 95 | 96 | self.drop = nn.Dropout(p_dropout) 97 | self.self_attn_layers = nn.ModuleList() 98 | self.norm_layers_0 = nn.ModuleList() 99 | self.encdec_attn_layers = nn.ModuleList() 100 | self.norm_layers_1 = nn.ModuleList() 101 | self.ffn_layers = nn.ModuleList() 102 | self.norm_layers_2 = nn.ModuleList() 103 | for i in range(self.n_layers): 104 | self.self_attn_layers.append( 105 | MultiHeadAttention( 106 | hidden_channels, 107 | hidden_channels, 108 | n_heads, 109 | p_dropout=p_dropout, 110 |
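# Descriptive note on the two flags passed just below: proximal_bias adds a
# -log(1 + |i - j|) penalty that biases decoder self-attention toward nearby
# positions, and proximal_init copies the query projection weights into the key
# projection at initialization (see MultiHeadAttention further down in this file).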
proximal_bias=proximal_bias, 111 | proximal_init=proximal_init, 112 | ) 113 | ) 114 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 115 | self.encdec_attn_layers.append( 116 | MultiHeadAttention( 117 | hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout 118 | ) 119 | ) 120 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 121 | self.ffn_layers.append( 122 | FFN( 123 | hidden_channels, 124 | hidden_channels, 125 | filter_channels, 126 | kernel_size, 127 | p_dropout=p_dropout, 128 | causal=True, 129 | ) 130 | ) 131 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 132 | 133 | def forward(self, x, x_mask, h, h_mask): 134 | """ 135 | x: decoder input 136 | h: encoder output 137 | """ 138 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( 139 | device=x.device, dtype=x.dtype 140 | ) 141 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 142 | x = x * x_mask 143 | for i in range(self.n_layers): 144 | y = self.self_attn_layers[i](x, x, self_attn_mask) 145 | y = self.drop(y) 146 | x = self.norm_layers_0[i](x + y) 147 | 148 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 149 | y = self.drop(y) 150 | x = self.norm_layers_1[i](x + y) 151 | 152 | y = self.ffn_layers[i](x, x_mask) 153 | y = self.drop(y) 154 | x = self.norm_layers_2[i](x + y) 155 | x = x * x_mask 156 | return x 157 | 158 | 159 | class MultiHeadAttention(nn.Module): 160 | def __init__( 161 | self, 162 | channels, 163 | out_channels, 164 | n_heads, 165 | p_dropout=0.0, 166 | window_size=None, 167 | heads_share=True, 168 | block_length=None, 169 | proximal_bias=False, 170 | proximal_init=False, 171 | ): 172 | super().__init__() 173 | assert channels % n_heads == 0 174 | 175 | self.channels = channels 176 | self.out_channels = out_channels 177 | self.n_heads = n_heads 178 | self.p_dropout = p_dropout 179 | self.window_size = window_size 180 | self.heads_share = heads_share 181 | self.block_length = block_length 182 | self.proximal_bias = proximal_bias 183 | self.proximal_init = proximal_init 184 | self.attn = None 185 | 186 | self.k_channels = channels // n_heads 187 | self.conv_q = nn.Conv1d(channels, channels, 1) 188 | self.conv_k = nn.Conv1d(channels, channels, 1) 189 | self.conv_v = nn.Conv1d(channels, channels, 1) 190 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 191 | self.drop = nn.Dropout(p_dropout) 192 | 193 | if window_size is not None: 194 | n_heads_rel = 1 if heads_share else n_heads 195 | rel_stddev = self.k_channels**-0.5 196 | self.emb_rel_k = nn.Parameter( 197 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) 198 | * rel_stddev 199 | ) 200 | self.emb_rel_v = nn.Parameter( 201 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) 202 | * rel_stddev 203 | ) 204 | 205 | nn.init.xavier_uniform_(self.conv_q.weight) 206 | nn.init.xavier_uniform_(self.conv_k.weight) 207 | nn.init.xavier_uniform_(self.conv_v.weight) 208 | if proximal_init: 209 | with torch.no_grad(): 210 | self.conv_k.weight.copy_(self.conv_q.weight) 211 | self.conv_k.bias.copy_(self.conv_q.bias) 212 | 213 | def forward(self, x, c, attn_mask=None): 214 | q = self.conv_q(x) 215 | k = self.conv_k(c) 216 | v = self.conv_v(c) 217 | 218 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 219 | 220 | x = self.conv_o(x) 221 | return x 222 | 223 | def attention(self, query, key, value, mask=None): 224 | # reshape [b, d, t] -> [b, n_h, t, d_k] 225 | b, d, t_s, t_t = (*key.size(), query.size(2)) 226 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 
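# key and value below get the same [b, d, t] -> [b, n_h, t_s, d_k] reshape, so the
# content scores Q K^T / sqrt(d_k) come out as [b, n_h, t_t, t_s]; when window_size
# is set, relative-position logits built from emb_rel_k are added to these scores
# before the softmax (windowed relative attention in the style of Shaw et al.).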
227 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 228 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 229 | 230 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 231 | if self.window_size is not None: 232 | assert ( 233 | t_s == t_t 234 | ), "Relative attention is only available for self-attention." 235 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 236 | rel_logits = self._matmul_with_relative_keys( 237 | query / math.sqrt(self.k_channels), key_relative_embeddings 238 | ) 239 | scores_local = self._relative_position_to_absolute_position(rel_logits) 240 | scores = scores + scores_local 241 | if self.proximal_bias: 242 | assert t_s == t_t, "Proximal bias is only available for self-attention." 243 | scores = scores + self._attention_bias_proximal(t_s).to( 244 | device=scores.device, dtype=scores.dtype 245 | ) 246 | if mask is not None: 247 | scores = scores.masked_fill(mask == 0, -1e4) 248 | if self.block_length is not None: 249 | assert ( 250 | t_s == t_t 251 | ), "Local attention is only available for self-attention." 252 | block_mask = ( 253 | torch.ones_like(scores) 254 | .triu(-self.block_length) 255 | .tril(self.block_length) 256 | ) 257 | scores = scores.masked_fill(block_mask == 0, -1e4) 258 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 259 | p_attn = self.drop(p_attn) 260 | output = torch.matmul(p_attn, value) 261 | if self.window_size is not None: 262 | relative_weights = self._absolute_position_to_relative_position(p_attn) 263 | value_relative_embeddings = self._get_relative_embeddings( 264 | self.emb_rel_v, t_s 265 | ) 266 | output = output + self._matmul_with_relative_values( 267 | relative_weights, value_relative_embeddings 268 | ) 269 | output = ( 270 | output.transpose(2, 3).contiguous().view(b, d, t_t) 271 | ) # [b, n_h, t_t, d_k] -> [b, d, t_t] 272 | return output, p_attn 273 | 274 | def _matmul_with_relative_values(self, x, y): 275 | """ 276 | x: [b, h, l, m] 277 | y: [h or 1, m, d] 278 | ret: [b, h, l, d] 279 | """ 280 | ret = torch.matmul(x, y.unsqueeze(0)) 281 | return ret 282 | 283 | def _matmul_with_relative_keys(self, x, y): 284 | """ 285 | x: [b, h, l, d] 286 | y: [h or 1, m, d] 287 | ret: [b, h, l, m] 288 | """ 289 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 290 | return ret 291 | 292 | def _get_relative_embeddings(self, relative_embeddings, length): 293 | max_relative_position = 2 * self.window_size + 1 294 | # Pad first before slice to avoid using cond ops. 295 | pad_length = max(length - (self.window_size + 1), 0) 296 | slice_start_position = max((self.window_size + 1) - length, 0) 297 | slice_end_position = slice_start_position + 2 * length - 1 298 | if pad_length > 0: 299 | padded_relative_embeddings = F.pad( 300 | relative_embeddings, 301 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), 302 | ) 303 | else: 304 | padded_relative_embeddings = relative_embeddings 305 | used_relative_embeddings = padded_relative_embeddings[ 306 | :, slice_start_position:slice_end_position 307 | ] 308 | return used_relative_embeddings 309 | 310 | def _relative_position_to_absolute_position(self, x): 311 | """ 312 | x: [b, h, l, 2*l-1] 313 | ret: [b, h, l, l] 314 | """ 315 | batch, heads, length, _ = x.size() 316 | # Concat columns of pad to shift from relative to absolute indexing. 
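# The pad/reshape/slice below converts the [b, h, l, 2l-1] relative logits to
# absolute [b, h, l, l] positions without gather ops: pad one column on the right,
# flatten, pad l - 1 more entries, view as [b, h, l+1, 2l-1], then keep the first l
# rows and the columns from l - 1 onward; the kept entry at (i, j) is exactly the
# relative logit stored at column (j - i) + (l - 1).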
317 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) 318 | 319 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 320 | x_flat = x.view([batch, heads, length * 2 * length]) 321 | x_flat = F.pad( 322 | x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) 323 | ) 324 | 325 | # Reshape and slice out the padded elements. 326 | x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ 327 | :, :, :length, length - 1 : 328 | ] 329 | return x_final 330 | 331 | def _absolute_position_to_relative_position(self, x): 332 | """ 333 | x: [b, h, l, l] 334 | ret: [b, h, l, 2*l-1] 335 | """ 336 | batch, heads, length, _ = x.size() 337 | # padd along column 338 | x = F.pad( 339 | x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) 340 | ) 341 | x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) 342 | # add 0's in the beginning that will skew the elements after reshape 343 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 344 | x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] 345 | return x_final 346 | 347 | def _attention_bias_proximal(self, length): 348 | """Bias for self-attention to encourage attention to close positions. 349 | Args: 350 | length: an integer scalar. 351 | Returns: 352 | a Tensor with shape [1, 1, length, length] 353 | """ 354 | r = torch.arange(length, dtype=torch.float32) 355 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 356 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 357 | 358 | 359 | class FFN(nn.Module): 360 | def __init__( 361 | self, 362 | in_channels, 363 | out_channels, 364 | filter_channels, 365 | kernel_size, 366 | p_dropout=0.0, 367 | activation=None, 368 | causal=False, 369 | ): 370 | super().__init__() 371 | self.in_channels = in_channels 372 | self.out_channels = out_channels 373 | self.filter_channels = filter_channels 374 | self.kernel_size = kernel_size 375 | self.p_dropout = p_dropout 376 | self.activation = activation 377 | self.causal = causal 378 | 379 | if causal: 380 | self.padding = self._causal_padding 381 | else: 382 | self.padding = self._same_padding 383 | 384 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 385 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 386 | self.drop = nn.Dropout(p_dropout) 387 | 388 | def forward(self, x, x_mask): 389 | x = self.conv_1(self.padding(x * x_mask)) 390 | if self.activation == "gelu": 391 | x = x * torch.sigmoid(1.702 * x) 392 | else: 393 | x = torch.relu(x) 394 | x = self.drop(x) 395 | x = self.conv_2(self.padding(x * x_mask)) 396 | return x * x_mask 397 | 398 | def _causal_padding(self, x): 399 | if self.kernel_size == 1: 400 | return x 401 | pad_l = self.kernel_size - 1 402 | pad_r = 0 403 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 404 | x = F.pad(x, commons.convert_pad_shape(padding)) 405 | return x 406 | 407 | def _same_padding(self, x): 408 | if self.kernel_size == 1: 409 | return x 410 | pad_l = (self.kernel_size - 1) // 2 411 | pad_r = self.kernel_size // 2 412 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 413 | x = F.pad(x, commons.convert_pad_shape(padding)) 414 | return x 415 | -------------------------------------------------------------------------------- /baidu_translate.py: -------------------------------------------------------------------------------- 1 | import config 2 | import requests 3 | import hashlib 4 | from uuid import uuid4 5 | 6 | 7 | def translate(text: 
str): 8 | text = text.replace("\n", "") 9 | salt = str(uuid4()) 10 | sign = hashlib.md5( 11 | ( 12 | config.BAIDU_TRANSLATE_APPID + text + salt + config.BAIDU_TRANSLATE_KEY 13 | ).encode("utf-8") 14 | ).hexdigest() 15 | params = { 16 | "q": text, 17 | "from": "auto", 18 | "to": "zh", 19 | "appid": config.BAIDU_TRANSLATE_APPID, 20 | "salt": salt, 21 | "sign": sign, 22 | } 23 | response = requests.get(config.BAIDU_TRANSLATE_URL, params=params) 24 | result = response.json() 25 | if "error_code" in result: 26 | return {"code": result["error_code"]} 27 | else: 28 | return {"code": 0, "result": replace_words(result["trans_result"][0]["dst"])} 29 | 30 | 31 | def replace_words(text: str): 32 | for key, value in config.REPLACEMENTS.items(): 33 | text = text.replace(key, value) 34 | return text 35 | -------------------------------------------------------------------------------- /commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.nn import functional as F 4 | 5 | 6 | def init_weights(m, mean=0.0, std=0.01): 7 | classname = m.__class__.__name__ 8 | if classname.find("Conv") != -1: 9 | m.weight.data.normal_(mean, std) 10 | 11 | 12 | def get_padding(kernel_size, dilation=1): 13 | return int((kernel_size * dilation - dilation) / 2) 14 | 15 | 16 | def convert_pad_shape(pad_shape): 17 | l = pad_shape[::-1] 18 | pad_shape = [item for sublist in l for item in sublist] 19 | return pad_shape 20 | 21 | 22 | def intersperse(lst, item): 23 | result = [item] * (len(lst) * 2 + 1) 24 | result[1::2] = lst 25 | return result 26 | 27 | 28 | def kl_divergence(m_p, logs_p, m_q, logs_q): 29 | """KL(P||Q)""" 30 | kl = (logs_q - logs_p) - 0.5 31 | kl += ( 32 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 33 | ) 34 | return kl 35 | 36 | 37 | def rand_gumbel(shape): 38 | """Sample from the Gumbel distribution, protect from overflows.""" 39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 40 | return -torch.log(-torch.log(uniform_samples)) 41 | 42 | 43 | def rand_gumbel_like(x): 44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 45 | return g 46 | 47 | 48 | def slice_segments(x, ids_str, segment_size=4): 49 | ret = torch.zeros_like(x[:, :, :segment_size]) 50 | for i in range(x.size(0)): 51 | idx_str = ids_str[i] 52 | idx_end = idx_str + segment_size 53 | ret[i] = x[i, :, idx_str:idx_end] 54 | return ret 55 | 56 | 57 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 58 | b, d, t = x.size() 59 | if x_lengths is None: 60 | x_lengths = t 61 | ids_str_max = x_lengths - segment_size + 1 62 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 63 | ret = slice_segments(x, ids_str, segment_size) 64 | return ret, ids_str 65 | 66 | 67 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 68 | position = torch.arange(length, dtype=torch.float) 69 | num_timescales = channels // 2 70 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 71 | num_timescales - 1 72 | ) 73 | inv_timescales = min_timescale * torch.exp( 74 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 75 | ) 76 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 77 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 78 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 79 | signal = signal.view(1, channels, length) 80 | return signal 81 | 82 | 83 | def 
add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 84 | b, channels, length = x.size() 85 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 86 | return x + signal.to(dtype=x.dtype, device=x.device) 87 | 88 | 89 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 90 | b, channels, length = x.size() 91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 92 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 93 | 94 | 95 | def subsequent_mask(length): 96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 97 | return mask 98 | 99 | 100 | @torch.jit.script 101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 102 | n_channels_int = n_channels[0] 103 | in_act = input_a + input_b 104 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 106 | acts = t_act * s_act 107 | return acts 108 | 109 | 110 | def convert_pad_shape(pad_shape): 111 | l = pad_shape[::-1] 112 | pad_shape = [item for sublist in l for item in sublist] 113 | return pad_shape 114 | 115 | 116 | def shift_1d(x): 117 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 118 | return x 119 | 120 | 121 | def sequence_mask(length, max_length=None): 122 | if max_length is None: 123 | max_length = length.max() 124 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 125 | return x.unsqueeze(0) < length.unsqueeze(1) 126 | 127 | 128 | def generate_path(duration, mask): 129 | """ 130 | duration: [b, 1, t_x] 131 | mask: [b, 1, t_y, t_x] 132 | """ 133 | device = duration.device 134 | 135 | b, _, t_y, t_x = mask.shape 136 | cum_duration = torch.cumsum(duration, -1) 137 | 138 | cum_duration_flat = cum_duration.view(b * t_x) 139 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 140 | path = path.view(b, t_x, t_y) 141 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 142 | path = path.unsqueeze(1).transpose(2, 3) * mask 143 | return path 144 | 145 | 146 | def clip_grad_value_(parameters, clip_value, norm_type=2): 147 | if isinstance(parameters, torch.Tensor): 148 | parameters = [parameters] 149 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 150 | norm_type = float(norm_type) 151 | if clip_value is not None: 152 | clip_value = float(clip_value) 153 | 154 | total_norm = 0 155 | for p in parameters: 156 | param_norm = p.grad.data.norm(norm_type) 157 | total_norm += param_norm.item() ** norm_type 158 | if clip_value is not None: 159 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 160 | total_norm = total_norm ** (1.0 / norm_type) 161 | return total_norm 162 | -------------------------------------------------------------------------------- /config_sample.py: -------------------------------------------------------------------------------- 1 | SECRET_KEY = "" 2 | OPENAI_API_KEY = "" 3 | BAIDU_TRANSLATE_URL = "https://fanyi-api.baidu.com/api/trans/vip/translate" 4 | BAIDU_TRANSLATE_APPID = "" 5 | BAIDU_TRANSLATE_KEY = "" 6 | REPLACEMENTS = {} 7 | TRANSCRIPT_PROMPT = {"zh": "", "ja": ""} 8 | ENHANCE_PROMPT = "" 9 | INITIAL_CONTEXT = [] 10 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pyexpat import model 3 | import torch 4 | 5 | import commons 6 | import utils 7 | from models import 
SynthesizerTrn 8 | 9 | from scipy.io.wavfile import write 10 | 11 | import text as text_utils 12 | 13 | 14 | class vits_inference: 15 | def __init__(self, model_name): 16 | self.load_model( 17 | os.path.join("model", model_name, "config.json"), 18 | os.path.join("model", model_name, "model.pth"), 19 | ) 20 | 21 | def get_text(self, text, hps): 22 | text_norm = text_utils.text_to_sequence(text, hps.data.text_cleaners) 23 | if hps.data.add_blank: 24 | text_norm = commons.intersperse(text_norm, 0) 25 | text_norm = torch.LongTensor(text_norm) 26 | return text_norm 27 | 28 | def load_model(self, config_file, model_file): 29 | self.hps = utils.get_hparams_from_file(config_file) 30 | self.net_g = SynthesizerTrn( 31 | len(text_utils.symbols), 32 | self.hps.data.filter_length // 2 + 1, 33 | self.hps.train.segment_size // self.hps.data.hop_length, 34 | **self.hps.model 35 | ).cuda() 36 | _ = self.net_g.eval() 37 | _ = utils.load_checkpoint(model_file, self.net_g, None) 38 | 39 | def synthesis(self, output_file, target_text, speaker_id=-1): 40 | stn_tst = self.get_text(target_text, self.hps) 41 | with torch.no_grad(): 42 | x_tst = stn_tst.cuda().unsqueeze(0) 43 | x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda() 44 | if speaker_id != -1: 45 | sid = torch.LongTensor([int(speaker_id)]).cuda() 46 | audio = ( 47 | self.net_g.infer( 48 | x_tst, 49 | x_tst_lengths, 50 | sid=sid, 51 | noise_scale=0.667, 52 | noise_scale_w=0.8, 53 | length_scale=1, 54 | )[0][0, 0] 55 | .data.cpu() 56 | .float() 57 | .numpy() 58 | ) 59 | else: 60 | audio = ( 61 | self.net_g.infer( 62 | x_tst, 63 | x_tst_lengths, 64 | noise_scale=0.667, 65 | noise_scale_w=0.8, 66 | length_scale=1, 67 | )[0][0, 0] 68 | .data.cpu() 69 | .float() 70 | .numpy() 71 | ) 72 | audio = audio * 32768.0 73 | audio = audio.squeeze() 74 | audio = audio.astype("int16") 75 | write(output_file, 22050, audio) 76 | 77 | 78 | if __name__ == "__main__": 79 | model_name = input("模型名称:") 80 | output_dir = input("输出目录:") 81 | model = vits_inference(model_name) 82 | while True: 83 | target_text = input("生成文本:") 84 | output_file = os.path.join( 85 | output_dir, "{}_{}.wav".format(model_name, target_text.replace(" ", "_")) 86 | ) 87 | model.synthesis(output_file, target_text) 88 | print("生成完成,输出文件:{}".format(output_file)) 89 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, render_template, send_file, abort, jsonify, session 2 | from inference import vits_inference 3 | import baidu_translate 4 | import openai_api 5 | from io import BytesIO 6 | import waitress 7 | 8 | vits_model = vits_inference("atri") 9 | 10 | app = Flask(__name__) 11 | app.config.from_object("config") 12 | 13 | 14 | @app.route("/") 15 | def index(): 16 | return render_template("index.html") 17 | 18 | 19 | @app.route("/synthesis", methods=["GET"]) 20 | def synthesis(): 21 | text = request.args.get("text", type=str) 22 | try: 23 | output_audio = BytesIO() 24 | vits_model.synthesis(output_audio, text, -1) 25 | return send_file(output_audio, mimetype="audio/wav") 26 | except: 27 | abort(500) 28 | 29 | 30 | @app.route("/translate", methods=["GET"]) 31 | def translate(): 32 | text = request.args.get("text", type=str) 33 | return jsonify(baidu_translate.translate(text)) 34 | 35 | 36 | @app.route("/transcript", methods=["POST"]) 37 | def transcript(): 38 | language = request.args.get("language", default="", type=str) 39 | audio = 
BytesIO(request.files["audio"].stream.read()) 40 | audio.name = "audio.wav" 41 | return jsonify(openai_api.transcript(audio, language)) 42 | 43 | 44 | @app.route("/chat_complete", methods=["GET"]) 45 | def chat_complete(): 46 | context = session.get("context", []) 47 | text = request.args.get("text", type=str) 48 | if text.strip(): 49 | new_context, message = openai_api.chat_complete(context, text) 50 | if message: 51 | session["context"] = new_context 52 | return jsonify(code=0, message=message) 53 | else: 54 | return jsonify(code=500) 55 | else: 56 | return jsonify(code=400) 57 | 58 | 59 | @app.route("/reset_context", methods=["GET"]) 60 | def reset_context(): 61 | session["context"] = [] 62 | return jsonify(code=0) 63 | 64 | 65 | if __name__ == "__main__": 66 | # app.run(debug=False, host="0.0.0.0", port=8080) 67 | waitress.serve(app, host="0.0.0.0", port=8080) 68 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | import modules 8 | import attentions 9 | import monotonic_align 10 | 11 | from torch.nn import Conv1d, ConvTranspose1d, Conv2d 12 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 13 | from commons import init_weights, get_padding 14 | 15 | 16 | class StochasticDurationPredictor(nn.Module): 17 | def __init__( 18 | self, 19 | in_channels, 20 | filter_channels, 21 | kernel_size, 22 | p_dropout, 23 | n_flows=4, 24 | gin_channels=0, 25 | ): 26 | super().__init__() 27 | filter_channels = in_channels # it needs to be removed from future version. 28 | self.in_channels = in_channels 29 | self.filter_channels = filter_channels 30 | self.kernel_size = kernel_size 31 | self.p_dropout = p_dropout 32 | self.n_flows = n_flows 33 | self.gin_channels = gin_channels 34 | 35 | self.log_flow = modules.Log() 36 | self.flows = nn.ModuleList() 37 | self.flows.append(modules.ElementwiseAffine(2)) 38 | for i in range(n_flows): 39 | self.flows.append( 40 | modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) 41 | ) 42 | self.flows.append(modules.Flip()) 43 | 44 | self.post_pre = nn.Conv1d(1, filter_channels, 1) 45 | self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) 46 | self.post_convs = modules.DDSConv( 47 | filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout 48 | ) 49 | self.post_flows = nn.ModuleList() 50 | self.post_flows.append(modules.ElementwiseAffine(2)) 51 | for i in range(4): 52 | self.post_flows.append( 53 | modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) 54 | ) 55 | self.post_flows.append(modules.Flip()) 56 | 57 | self.pre = nn.Conv1d(in_channels, filter_channels, 1) 58 | self.proj = nn.Conv1d(filter_channels, filter_channels, 1) 59 | self.convs = modules.DDSConv( 60 | filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout 61 | ) 62 | if gin_channels != 0: 63 | self.cond = nn.Conv1d(gin_channels, filter_channels, 1) 64 | 65 | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): 66 | x = torch.detach(x) 67 | x = self.pre(x) 68 | if g is not None: 69 | g = torch.detach(g) 70 | x = x + self.cond(g) 71 | x = self.convs(x, x_mask) 72 | x = self.proj(x) * x_mask 73 | 74 | if not reverse: 75 | flows = self.flows 76 | assert w is not None 77 | 78 | logdet_tot_q = 0 79 | h_w = self.post_pre(w) 80 | h_w = self.post_convs(h_w, x_mask) 81 | h_w = 
self.post_proj(h_w) * x_mask 82 | e_q = ( 83 | torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) 84 | * x_mask 85 | ) 86 | z_q = e_q 87 | for flow in self.post_flows: 88 | z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) 89 | logdet_tot_q += logdet_q 90 | z_u, z1 = torch.split(z_q, [1, 1], 1) 91 | u = torch.sigmoid(z_u) * x_mask 92 | z0 = (w - u) * x_mask 93 | logdet_tot_q += torch.sum( 94 | (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2] 95 | ) 96 | logq = ( 97 | torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) 98 | - logdet_tot_q 99 | ) 100 | 101 | logdet_tot = 0 102 | z0, logdet = self.log_flow(z0, x_mask) 103 | logdet_tot += logdet 104 | z = torch.cat([z0, z1], 1) 105 | for flow in flows: 106 | z, logdet = flow(z, x_mask, g=x, reverse=reverse) 107 | logdet_tot = logdet_tot + logdet 108 | nll = ( 109 | torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) 110 | - logdet_tot 111 | ) 112 | return nll + logq # [b] 113 | else: 114 | flows = list(reversed(self.flows)) 115 | flows = flows[:-2] + [flows[-1]] # remove a useless vflow 116 | z = ( 117 | torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) 118 | * noise_scale 119 | ) 120 | for flow in flows: 121 | z = flow(z, x_mask, g=x, reverse=reverse) 122 | z0, z1 = torch.split(z, [1, 1], 1) 123 | logw = z0 124 | return logw 125 | 126 | 127 | class DurationPredictor(nn.Module): 128 | def __init__( 129 | self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 130 | ): 131 | super().__init__() 132 | 133 | self.in_channels = in_channels 134 | self.filter_channels = filter_channels 135 | self.kernel_size = kernel_size 136 | self.p_dropout = p_dropout 137 | self.gin_channels = gin_channels 138 | 139 | self.drop = nn.Dropout(p_dropout) 140 | self.conv_1 = nn.Conv1d( 141 | in_channels, filter_channels, kernel_size, padding=kernel_size // 2 142 | ) 143 | self.norm_1 = modules.LayerNorm(filter_channels) 144 | self.conv_2 = nn.Conv1d( 145 | filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 146 | ) 147 | self.norm_2 = modules.LayerNorm(filter_channels) 148 | self.proj = nn.Conv1d(filter_channels, 1, 1) 149 | 150 | if gin_channels != 0: 151 | self.cond = nn.Conv1d(gin_channels, in_channels, 1) 152 | 153 | def forward(self, x, x_mask, g=None): 154 | x = torch.detach(x) 155 | if g is not None: 156 | g = torch.detach(g) 157 | x = x + self.cond(g) 158 | x = self.conv_1(x * x_mask) 159 | x = torch.relu(x) 160 | x = self.norm_1(x) 161 | x = self.drop(x) 162 | x = self.conv_2(x * x_mask) 163 | x = torch.relu(x) 164 | x = self.norm_2(x) 165 | x = self.drop(x) 166 | x = self.proj(x * x_mask) 167 | return x * x_mask 168 | 169 | 170 | class TextEncoder(nn.Module): 171 | def __init__( 172 | self, 173 | n_vocab, 174 | out_channels, 175 | hidden_channels, 176 | filter_channels, 177 | n_heads, 178 | n_layers, 179 | kernel_size, 180 | p_dropout, 181 | ): 182 | super().__init__() 183 | self.n_vocab = n_vocab 184 | self.out_channels = out_channels 185 | self.hidden_channels = hidden_channels 186 | self.filter_channels = filter_channels 187 | self.n_heads = n_heads 188 | self.n_layers = n_layers 189 | self.kernel_size = kernel_size 190 | self.p_dropout = p_dropout 191 | 192 | self.emb = nn.Embedding(n_vocab, hidden_channels) 193 | nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) 194 | 195 | self.encoder = attentions.Encoder( 196 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 197 | ) 198 | self.proj = 
nn.Conv1d(hidden_channels, out_channels * 2, 1) 199 | 200 | def forward(self, x, x_lengths): 201 | x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] 202 | x = torch.transpose(x, 1, -1) # [b, h, t] 203 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( 204 | x.dtype 205 | ) 206 | 207 | x = self.encoder(x * x_mask, x_mask) 208 | stats = self.proj(x) * x_mask 209 | 210 | m, logs = torch.split(stats, self.out_channels, dim=1) 211 | return x, m, logs, x_mask 212 | 213 | 214 | class ResidualCouplingBlock(nn.Module): 215 | def __init__( 216 | self, 217 | channels, 218 | hidden_channels, 219 | kernel_size, 220 | dilation_rate, 221 | n_layers, 222 | n_flows=4, 223 | gin_channels=0, 224 | ): 225 | super().__init__() 226 | self.channels = channels 227 | self.hidden_channels = hidden_channels 228 | self.kernel_size = kernel_size 229 | self.dilation_rate = dilation_rate 230 | self.n_layers = n_layers 231 | self.n_flows = n_flows 232 | self.gin_channels = gin_channels 233 | 234 | self.flows = nn.ModuleList() 235 | for i in range(n_flows): 236 | self.flows.append( 237 | modules.ResidualCouplingLayer( 238 | channels, 239 | hidden_channels, 240 | kernel_size, 241 | dilation_rate, 242 | n_layers, 243 | gin_channels=gin_channels, 244 | mean_only=True, 245 | ) 246 | ) 247 | self.flows.append(modules.Flip()) 248 | 249 | def forward(self, x, x_mask, g=None, reverse=False): 250 | if not reverse: 251 | for flow in self.flows: 252 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 253 | else: 254 | for flow in reversed(self.flows): 255 | x = flow(x, x_mask, g=g, reverse=reverse) 256 | return x 257 | 258 | 259 | class PosteriorEncoder(nn.Module): 260 | def __init__( 261 | self, 262 | in_channels, 263 | out_channels, 264 | hidden_channels, 265 | kernel_size, 266 | dilation_rate, 267 | n_layers, 268 | gin_channels=0, 269 | ): 270 | super().__init__() 271 | self.in_channels = in_channels 272 | self.out_channels = out_channels 273 | self.hidden_channels = hidden_channels 274 | self.kernel_size = kernel_size 275 | self.dilation_rate = dilation_rate 276 | self.n_layers = n_layers 277 | self.gin_channels = gin_channels 278 | 279 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 280 | self.enc = modules.WN( 281 | hidden_channels, 282 | kernel_size, 283 | dilation_rate, 284 | n_layers, 285 | gin_channels=gin_channels, 286 | ) 287 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 288 | 289 | def forward(self, x, x_lengths, g=None): 290 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( 291 | x.dtype 292 | ) 293 | x = self.pre(x) * x_mask 294 | x = self.enc(x, x_mask, g=g) 295 | stats = self.proj(x) * x_mask 296 | m, logs = torch.split(stats, self.out_channels, dim=1) 297 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 298 | return z, m, logs, x_mask 299 | 300 | 301 | class Generator(torch.nn.Module): 302 | def __init__( 303 | self, 304 | initial_channel, 305 | resblock, 306 | resblock_kernel_sizes, 307 | resblock_dilation_sizes, 308 | upsample_rates, 309 | upsample_initial_channel, 310 | upsample_kernel_sizes, 311 | gin_channels=0, 312 | ): 313 | super(Generator, self).__init__() 314 | self.num_kernels = len(resblock_kernel_sizes) 315 | self.num_upsamples = len(upsample_rates) 316 | self.conv_pre = Conv1d( 317 | initial_channel, upsample_initial_channel, 7, 1, padding=3 318 | ) 319 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 320 | 321 | self.ups = nn.ModuleList() 322 | for i, (u, k) in 
enumerate(zip(upsample_rates, upsample_kernel_sizes)): 323 | self.ups.append( 324 | weight_norm( 325 | ConvTranspose1d( 326 | upsample_initial_channel // (2**i), 327 | upsample_initial_channel // (2 ** (i + 1)), 328 | k, 329 | u, 330 | padding=(k - u) // 2, 331 | ) 332 | ) 333 | ) 334 | 335 | self.resblocks = nn.ModuleList() 336 | for i in range(len(self.ups)): 337 | ch = upsample_initial_channel // (2 ** (i + 1)) 338 | for j, (k, d) in enumerate( 339 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 340 | ): 341 | self.resblocks.append(resblock(ch, k, d)) 342 | 343 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 344 | self.ups.apply(init_weights) 345 | 346 | if gin_channels != 0: 347 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 348 | 349 | def forward(self, x, g=None): 350 | x = self.conv_pre(x) 351 | if g is not None: 352 | x = x + self.cond(g) 353 | 354 | for i in range(self.num_upsamples): 355 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 356 | x = self.ups[i](x) 357 | xs = None 358 | for j in range(self.num_kernels): 359 | if xs is None: 360 | xs = self.resblocks[i * self.num_kernels + j](x) 361 | else: 362 | xs += self.resblocks[i * self.num_kernels + j](x) 363 | x = xs / self.num_kernels 364 | x = F.leaky_relu(x) 365 | x = self.conv_post(x) 366 | x = torch.tanh(x) 367 | 368 | return x 369 | 370 | def remove_weight_norm(self): 371 | print("Removing weight norm...") 372 | for l in self.ups: 373 | remove_weight_norm(l) 374 | for l in self.resblocks: 375 | l.remove_weight_norm() 376 | 377 | 378 | class DiscriminatorP(torch.nn.Module): 379 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 380 | super(DiscriminatorP, self).__init__() 381 | self.period = period 382 | self.use_spectral_norm = use_spectral_norm 383 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 384 | self.convs = nn.ModuleList( 385 | [ 386 | norm_f( 387 | Conv2d( 388 | 1, 389 | 32, 390 | (kernel_size, 1), 391 | (stride, 1), 392 | padding=(get_padding(kernel_size, 1), 0), 393 | ) 394 | ), 395 | norm_f( 396 | Conv2d( 397 | 32, 398 | 128, 399 | (kernel_size, 1), 400 | (stride, 1), 401 | padding=(get_padding(kernel_size, 1), 0), 402 | ) 403 | ), 404 | norm_f( 405 | Conv2d( 406 | 128, 407 | 512, 408 | (kernel_size, 1), 409 | (stride, 1), 410 | padding=(get_padding(kernel_size, 1), 0), 411 | ) 412 | ), 413 | norm_f( 414 | Conv2d( 415 | 512, 416 | 1024, 417 | (kernel_size, 1), 418 | (stride, 1), 419 | padding=(get_padding(kernel_size, 1), 0), 420 | ) 421 | ), 422 | norm_f( 423 | Conv2d( 424 | 1024, 425 | 1024, 426 | (kernel_size, 1), 427 | 1, 428 | padding=(get_padding(kernel_size, 1), 0), 429 | ) 430 | ), 431 | ] 432 | ) 433 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 434 | 435 | def forward(self, x): 436 | fmap = [] 437 | 438 | # 1d to 2d 439 | b, c, t = x.shape 440 | if t % self.period != 0: # pad first 441 | n_pad = self.period - (t % self.period) 442 | x = F.pad(x, (0, n_pad), "reflect") 443 | t = t + n_pad 444 | x = x.view(b, c, t // self.period, self.period) 445 | 446 | for l in self.convs: 447 | x = l(x) 448 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 449 | fmap.append(x) 450 | x = self.conv_post(x) 451 | fmap.append(x) 452 | x = torch.flatten(x, 1, -1) 453 | 454 | return x, fmap 455 | 456 | 457 | class DiscriminatorS(torch.nn.Module): 458 | def __init__(self, use_spectral_norm=False): 459 | super(DiscriminatorS, self).__init__() 460 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 461 
| self.convs = nn.ModuleList( 462 | [ 463 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 464 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 465 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 466 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 467 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 468 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 469 | ] 470 | ) 471 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 472 | 473 | def forward(self, x): 474 | fmap = [] 475 | 476 | for l in self.convs: 477 | x = l(x) 478 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 479 | fmap.append(x) 480 | x = self.conv_post(x) 481 | fmap.append(x) 482 | x = torch.flatten(x, 1, -1) 483 | 484 | return x, fmap 485 | 486 | 487 | class MultiPeriodDiscriminator(torch.nn.Module): 488 | def __init__(self, use_spectral_norm=False): 489 | super(MultiPeriodDiscriminator, self).__init__() 490 | periods = [2, 3, 5, 7, 11] 491 | 492 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 493 | discs = discs + [ 494 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 495 | ] 496 | self.discriminators = nn.ModuleList(discs) 497 | 498 | def forward(self, y, y_hat): 499 | y_d_rs = [] 500 | y_d_gs = [] 501 | fmap_rs = [] 502 | fmap_gs = [] 503 | for i, d in enumerate(self.discriminators): 504 | y_d_r, fmap_r = d(y) 505 | y_d_g, fmap_g = d(y_hat) 506 | y_d_rs.append(y_d_r) 507 | y_d_gs.append(y_d_g) 508 | fmap_rs.append(fmap_r) 509 | fmap_gs.append(fmap_g) 510 | 511 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 512 | 513 | 514 | class SynthesizerTrn(nn.Module): 515 | """ 516 | Synthesizer for Training 517 | """ 518 | 519 | def __init__( 520 | self, 521 | n_vocab, 522 | spec_channels, 523 | segment_size, 524 | inter_channels, 525 | hidden_channels, 526 | filter_channels, 527 | n_heads, 528 | n_layers, 529 | kernel_size, 530 | p_dropout, 531 | resblock, 532 | resblock_kernel_sizes, 533 | resblock_dilation_sizes, 534 | upsample_rates, 535 | upsample_initial_channel, 536 | upsample_kernel_sizes, 537 | n_speakers=0, 538 | gin_channels=0, 539 | use_sdp=True, 540 | **kwargs 541 | ): 542 | super().__init__() 543 | self.n_vocab = n_vocab 544 | self.spec_channels = spec_channels 545 | self.inter_channels = inter_channels 546 | self.hidden_channels = hidden_channels 547 | self.filter_channels = filter_channels 548 | self.n_heads = n_heads 549 | self.n_layers = n_layers 550 | self.kernel_size = kernel_size 551 | self.p_dropout = p_dropout 552 | self.resblock = resblock 553 | self.resblock_kernel_sizes = resblock_kernel_sizes 554 | self.resblock_dilation_sizes = resblock_dilation_sizes 555 | self.upsample_rates = upsample_rates 556 | self.upsample_initial_channel = upsample_initial_channel 557 | self.upsample_kernel_sizes = upsample_kernel_sizes 558 | self.segment_size = segment_size 559 | self.n_speakers = n_speakers 560 | self.gin_channels = gin_channels 561 | 562 | self.use_sdp = use_sdp 563 | 564 | self.enc_p = TextEncoder( 565 | n_vocab, 566 | inter_channels, 567 | hidden_channels, 568 | filter_channels, 569 | n_heads, 570 | n_layers, 571 | kernel_size, 572 | p_dropout, 573 | ) 574 | self.dec = Generator( 575 | inter_channels, 576 | resblock, 577 | resblock_kernel_sizes, 578 | resblock_dilation_sizes, 579 | upsample_rates, 580 | upsample_initial_channel, 581 | upsample_kernel_sizes, 582 | gin_channels=gin_channels, 583 | ) 584 | self.enc_q = PosteriorEncoder( 585 | spec_channels, 586 | inter_channels, 587 | hidden_channels, 588 | 5, 589 | 1, 590 | 16, 591 | 
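# The positional arguments 5, 1 and 16 above are the posterior encoder's WN kernel
# size, dilation rate and number of layers; they are hard-coded here, matching the
# reference VITS implementation.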
gin_channels=gin_channels, 592 | ) 593 | self.flow = ResidualCouplingBlock( 594 | inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels 595 | ) 596 | 597 | if use_sdp: 598 | self.dp = StochasticDurationPredictor( 599 | hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels 600 | ) 601 | else: 602 | self.dp = DurationPredictor( 603 | hidden_channels, 256, 3, 0.5, gin_channels=gin_channels 604 | ) 605 | 606 | if n_speakers > 1: 607 | self.emb_g = nn.Embedding(n_speakers, gin_channels) 608 | 609 | def forward(self, x, x_lengths, y, y_lengths, sid=None): 610 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths) 611 | if self.n_speakers > 0: 612 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 613 | else: 614 | g = None 615 | 616 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) 617 | z_p = self.flow(z, y_mask, g=g) 618 | 619 | with torch.no_grad(): 620 | # negative cross-entropy 621 | s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t] 622 | neg_cent1 = torch.sum( 623 | -0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True 624 | ) # [b, 1, t_s] 625 | neg_cent2 = torch.matmul( 626 | -0.5 * (z_p**2).transpose(1, 2), s_p_sq_r 627 | ) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s] 628 | neg_cent3 = torch.matmul( 629 | z_p.transpose(1, 2), (m_p * s_p_sq_r) 630 | ) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s] 631 | neg_cent4 = torch.sum( 632 | -0.5 * (m_p**2) * s_p_sq_r, [1], keepdim=True 633 | ) # [b, 1, t_s] 634 | neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4 635 | 636 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 637 | attn = ( 638 | monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)) 639 | .unsqueeze(1) 640 | .detach() 641 | ) 642 | 643 | w = attn.sum(2) 644 | if self.use_sdp: 645 | l_length = self.dp(x, x_mask, w, g=g) 646 | l_length = l_length / torch.sum(x_mask) 647 | else: 648 | logw_ = torch.log(w + 1e-6) * x_mask 649 | logw = self.dp(x, x_mask, g=g) 650 | l_length = torch.sum((logw - logw_) ** 2, [1, 2]) / torch.sum( 651 | x_mask 652 | ) # for averaging 653 | 654 | # expand prior 655 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) 656 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) 657 | 658 | z_slice, ids_slice = commons.rand_slice_segments( 659 | z, y_lengths, self.segment_size 660 | ) 661 | o = self.dec(z_slice, g=g) 662 | return ( 663 | o, 664 | l_length, 665 | attn, 666 | ids_slice, 667 | x_mask, 668 | y_mask, 669 | (z, z_p, m_p, logs_p, m_q, logs_q), 670 | ) 671 | 672 | def infer( 673 | self, 674 | x, 675 | x_lengths, 676 | sid=None, 677 | noise_scale=1, 678 | length_scale=1, 679 | noise_scale_w=1.0, 680 | max_len=None, 681 | ): 682 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths) 683 | if self.n_speakers > 0: 684 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 685 | else: 686 | g = None 687 | 688 | if self.use_sdp: 689 | logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) 690 | else: 691 | logw = self.dp(x, x_mask, g=g) 692 | w = torch.exp(logw) * x_mask * length_scale 693 | w_ceil = torch.ceil(w) 694 | y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() 695 | y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to( 696 | x_mask.dtype 697 | ) 698 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 699 | attn = commons.generate_path(w_ceil, attn_mask) 700 | 701 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose( 702 | 1, 2 703 | ) # [b, t', t], [b, t, d] -> [b, d, t'] 704 | 
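# logs_p is expanded with the same predicted alignment below; the frame-level prior
# (m_p, logs_p) is then sampled with noise_scale, mapped through the inverse
# normalizing flow, and decoded to a waveform by the HiFi-GAN-style generator
# self.dec.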
logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose( 705 | 1, 2 706 | ) # [b, t', t], [b, t, d] -> [b, d, t'] 707 | 708 | z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale 709 | z = self.flow(z_p, y_mask, g=g, reverse=True) 710 | o = self.dec((z * y_mask)[:, :, :max_len], g=g) 711 | return o, attn, y_mask, (z, z_p, m_p, logs_p) 712 | 713 | def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): 714 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 715 | g_src = self.emb_g(sid_src).unsqueeze(-1) 716 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 717 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) 718 | z_p = self.flow(z, y_mask, g=g_src) 719 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) 720 | o_hat = self.dec(z_hat * y_mask, g=g_tgt) 721 | return o_hat, y_mask, (z, z_p, z_hat) 722 | -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from torch.nn import Conv1d 7 | from torch.nn.utils import weight_norm, remove_weight_norm 8 | 9 | import commons 10 | from commons import init_weights, get_padding 11 | from transforms import piecewise_rational_quadratic_transform 12 | 13 | 14 | LRELU_SLOPE = 0.1 15 | 16 | 17 | class LayerNorm(nn.Module): 18 | def __init__(self, channels, eps=1e-5): 19 | super().__init__() 20 | self.channels = channels 21 | self.eps = eps 22 | 23 | self.gamma = nn.Parameter(torch.ones(channels)) 24 | self.beta = nn.Parameter(torch.zeros(channels)) 25 | 26 | def forward(self, x): 27 | x = x.transpose(1, -1) 28 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 29 | return x.transpose(1, -1) 30 | 31 | 32 | class ConvReluNorm(nn.Module): 33 | def __init__( 34 | self, 35 | in_channels, 36 | hidden_channels, 37 | out_channels, 38 | kernel_size, 39 | n_layers, 40 | p_dropout, 41 | ): 42 | super().__init__() 43 | self.in_channels = in_channels 44 | self.hidden_channels = hidden_channels 45 | self.out_channels = out_channels 46 | self.kernel_size = kernel_size 47 | self.n_layers = n_layers 48 | self.p_dropout = p_dropout 49 | assert n_layers > 1, "Number of layers should be larger than 0." 
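# The loop below builds one input convolution plus n_layers - 1 hidden convolutions,
# each followed by LayerNorm, ReLU and dropout; the final 1x1 projection is
# zero-initialized, so the block starts out as an identity residual mapping of its
# (masked) input.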
50 | 51 | self.conv_layers = nn.ModuleList() 52 | self.norm_layers = nn.ModuleList() 53 | self.conv_layers.append( 54 | nn.Conv1d( 55 | in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 56 | ) 57 | ) 58 | self.norm_layers.append(LayerNorm(hidden_channels)) 59 | self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) 60 | for _ in range(n_layers - 1): 61 | self.conv_layers.append( 62 | nn.Conv1d( 63 | hidden_channels, 64 | hidden_channels, 65 | kernel_size, 66 | padding=kernel_size // 2, 67 | ) 68 | ) 69 | self.norm_layers.append(LayerNorm(hidden_channels)) 70 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 71 | self.proj.weight.data.zero_() 72 | self.proj.bias.data.zero_() 73 | 74 | def forward(self, x, x_mask): 75 | x_org = x 76 | for i in range(self.n_layers): 77 | x = self.conv_layers[i](x * x_mask) 78 | x = self.norm_layers[i](x) 79 | x = self.relu_drop(x) 80 | x = x_org + self.proj(x) 81 | return x * x_mask 82 | 83 | 84 | class DDSConv(nn.Module): 85 | """ 86 | Dialted and Depth-Separable Convolution 87 | """ 88 | 89 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): 90 | super().__init__() 91 | self.channels = channels 92 | self.kernel_size = kernel_size 93 | self.n_layers = n_layers 94 | self.p_dropout = p_dropout 95 | 96 | self.drop = nn.Dropout(p_dropout) 97 | self.convs_sep = nn.ModuleList() 98 | self.convs_1x1 = nn.ModuleList() 99 | self.norms_1 = nn.ModuleList() 100 | self.norms_2 = nn.ModuleList() 101 | for i in range(n_layers): 102 | dilation = kernel_size**i 103 | padding = (kernel_size * dilation - dilation) // 2 104 | self.convs_sep.append( 105 | nn.Conv1d( 106 | channels, 107 | channels, 108 | kernel_size, 109 | groups=channels, 110 | dilation=dilation, 111 | padding=padding, 112 | ) 113 | ) 114 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 115 | self.norms_1.append(LayerNorm(channels)) 116 | self.norms_2.append(LayerNorm(channels)) 117 | 118 | def forward(self, x, x_mask, g=None): 119 | if g is not None: 120 | x = x + g 121 | for i in range(self.n_layers): 122 | y = self.convs_sep[i](x * x_mask) 123 | y = self.norms_1[i](y) 124 | y = F.gelu(y) 125 | y = self.convs_1x1[i](y) 126 | y = self.norms_2[i](y) 127 | y = F.gelu(y) 128 | y = self.drop(y) 129 | x = x + y 130 | return x * x_mask 131 | 132 | 133 | class WN(torch.nn.Module): 134 | def __init__( 135 | self, 136 | hidden_channels, 137 | kernel_size, 138 | dilation_rate, 139 | n_layers, 140 | gin_channels=0, 141 | p_dropout=0, 142 | ): 143 | super(WN, self).__init__() 144 | assert kernel_size % 2 == 1 145 | self.hidden_channels = hidden_channels 146 | self.kernel_size = (kernel_size,) 147 | self.dilation_rate = dilation_rate 148 | self.n_layers = n_layers 149 | self.gin_channels = gin_channels 150 | self.p_dropout = p_dropout 151 | 152 | self.in_layers = torch.nn.ModuleList() 153 | self.res_skip_layers = torch.nn.ModuleList() 154 | self.drop = nn.Dropout(p_dropout) 155 | 156 | if gin_channels != 0: 157 | cond_layer = torch.nn.Conv1d( 158 | gin_channels, 2 * hidden_channels * n_layers, 1 159 | ) 160 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") 161 | 162 | for i in range(n_layers): 163 | dilation = dilation_rate**i 164 | padding = int((kernel_size * dilation - dilation) / 2) 165 | in_layer = torch.nn.Conv1d( 166 | hidden_channels, 167 | 2 * hidden_channels, 168 | kernel_size, 169 | dilation=dilation, 170 | padding=padding, 171 | ) 172 | in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") 173 | 
self.in_layers.append(in_layer) 174 | 175 | # last one is not necessary 176 | if i < n_layers - 1: 177 | res_skip_channels = 2 * hidden_channels 178 | else: 179 | res_skip_channels = hidden_channels 180 | 181 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 182 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") 183 | self.res_skip_layers.append(res_skip_layer) 184 | 185 | def forward(self, x, x_mask, g=None, **kwargs): 186 | output = torch.zeros_like(x) 187 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 188 | 189 | if g is not None: 190 | g = self.cond_layer(g) 191 | 192 | for i in range(self.n_layers): 193 | x_in = self.in_layers[i](x) 194 | if g is not None: 195 | cond_offset = i * 2 * self.hidden_channels 196 | g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] 197 | else: 198 | g_l = torch.zeros_like(x_in) 199 | 200 | acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) 201 | acts = self.drop(acts) 202 | 203 | res_skip_acts = self.res_skip_layers[i](acts) 204 | if i < self.n_layers - 1: 205 | res_acts = res_skip_acts[:, : self.hidden_channels, :] 206 | x = (x + res_acts) * x_mask 207 | output = output + res_skip_acts[:, self.hidden_channels :, :] 208 | else: 209 | output = output + res_skip_acts 210 | return output * x_mask 211 | 212 | def remove_weight_norm(self): 213 | if self.gin_channels != 0: 214 | torch.nn.utils.remove_weight_norm(self.cond_layer) 215 | for l in self.in_layers: 216 | torch.nn.utils.remove_weight_norm(l) 217 | for l in self.res_skip_layers: 218 | torch.nn.utils.remove_weight_norm(l) 219 | 220 | 221 | class ResBlock1(torch.nn.Module): 222 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 223 | super(ResBlock1, self).__init__() 224 | self.convs1 = nn.ModuleList( 225 | [ 226 | weight_norm( 227 | Conv1d( 228 | channels, 229 | channels, 230 | kernel_size, 231 | 1, 232 | dilation=dilation[0], 233 | padding=get_padding(kernel_size, dilation[0]), 234 | ) 235 | ), 236 | weight_norm( 237 | Conv1d( 238 | channels, 239 | channels, 240 | kernel_size, 241 | 1, 242 | dilation=dilation[1], 243 | padding=get_padding(kernel_size, dilation[1]), 244 | ) 245 | ), 246 | weight_norm( 247 | Conv1d( 248 | channels, 249 | channels, 250 | kernel_size, 251 | 1, 252 | dilation=dilation[2], 253 | padding=get_padding(kernel_size, dilation[2]), 254 | ) 255 | ), 256 | ] 257 | ) 258 | self.convs1.apply(init_weights) 259 | 260 | self.convs2 = nn.ModuleList( 261 | [ 262 | weight_norm( 263 | Conv1d( 264 | channels, 265 | channels, 266 | kernel_size, 267 | 1, 268 | dilation=1, 269 | padding=get_padding(kernel_size, 1), 270 | ) 271 | ), 272 | weight_norm( 273 | Conv1d( 274 | channels, 275 | channels, 276 | kernel_size, 277 | 1, 278 | dilation=1, 279 | padding=get_padding(kernel_size, 1), 280 | ) 281 | ), 282 | weight_norm( 283 | Conv1d( 284 | channels, 285 | channels, 286 | kernel_size, 287 | 1, 288 | dilation=1, 289 | padding=get_padding(kernel_size, 1), 290 | ) 291 | ), 292 | ] 293 | ) 294 | self.convs2.apply(init_weights) 295 | 296 | def forward(self, x, x_mask=None): 297 | for c1, c2 in zip(self.convs1, self.convs2): 298 | xt = F.leaky_relu(x, LRELU_SLOPE) 299 | if x_mask is not None: 300 | xt = xt * x_mask 301 | xt = c1(xt) 302 | xt = F.leaky_relu(xt, LRELU_SLOPE) 303 | if x_mask is not None: 304 | xt = xt * x_mask 305 | xt = c2(xt) 306 | x = xt + x 307 | if x_mask is not None: 308 | x = x * x_mask 309 | return x 310 | 311 | def remove_weight_norm(self): 312 | for l in 
self.convs1: 313 | remove_weight_norm(l) 314 | for l in self.convs2: 315 | remove_weight_norm(l) 316 | 317 | 318 | class ResBlock2(torch.nn.Module): 319 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 320 | super(ResBlock2, self).__init__() 321 | self.convs = nn.ModuleList( 322 | [ 323 | weight_norm( 324 | Conv1d( 325 | channels, 326 | channels, 327 | kernel_size, 328 | 1, 329 | dilation=dilation[0], 330 | padding=get_padding(kernel_size, dilation[0]), 331 | ) 332 | ), 333 | weight_norm( 334 | Conv1d( 335 | channels, 336 | channels, 337 | kernel_size, 338 | 1, 339 | dilation=dilation[1], 340 | padding=get_padding(kernel_size, dilation[1]), 341 | ) 342 | ), 343 | ] 344 | ) 345 | self.convs.apply(init_weights) 346 | 347 | def forward(self, x, x_mask=None): 348 | for c in self.convs: 349 | xt = F.leaky_relu(x, LRELU_SLOPE) 350 | if x_mask is not None: 351 | xt = xt * x_mask 352 | xt = c(xt) 353 | x = xt + x 354 | if x_mask is not None: 355 | x = x * x_mask 356 | return x 357 | 358 | def remove_weight_norm(self): 359 | for l in self.convs: 360 | remove_weight_norm(l) 361 | 362 | 363 | class Log(nn.Module): 364 | def forward(self, x, x_mask, reverse=False, **kwargs): 365 | if not reverse: 366 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 367 | logdet = torch.sum(-y, [1, 2]) 368 | return y, logdet 369 | else: 370 | x = torch.exp(x) * x_mask 371 | return x 372 | 373 | 374 | class Flip(nn.Module): 375 | def forward(self, x, *args, reverse=False, **kwargs): 376 | x = torch.flip(x, [1]) 377 | if not reverse: 378 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 379 | return x, logdet 380 | else: 381 | return x 382 | 383 | 384 | class ElementwiseAffine(nn.Module): 385 | def __init__(self, channels): 386 | super().__init__() 387 | self.channels = channels 388 | self.m = nn.Parameter(torch.zeros(channels, 1)) 389 | self.logs = nn.Parameter(torch.zeros(channels, 1)) 390 | 391 | def forward(self, x, x_mask, reverse=False, **kwargs): 392 | if not reverse: 393 | y = self.m + torch.exp(self.logs) * x 394 | y = y * x_mask 395 | logdet = torch.sum(self.logs * x_mask, [1, 2]) 396 | return y, logdet 397 | else: 398 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 399 | return x 400 | 401 | 402 | class ResidualCouplingLayer(nn.Module): 403 | def __init__( 404 | self, 405 | channels, 406 | hidden_channels, 407 | kernel_size, 408 | dilation_rate, 409 | n_layers, 410 | p_dropout=0, 411 | gin_channels=0, 412 | mean_only=False, 413 | ): 414 | assert channels % 2 == 0, "channels should be divisible by 2" 415 | super().__init__() 416 | self.channels = channels 417 | self.hidden_channels = hidden_channels 418 | self.kernel_size = kernel_size 419 | self.dilation_rate = dilation_rate 420 | self.n_layers = n_layers 421 | self.half_channels = channels // 2 422 | self.mean_only = mean_only 423 | 424 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 425 | self.enc = WN( 426 | hidden_channels, 427 | kernel_size, 428 | dilation_rate, 429 | n_layers, 430 | p_dropout=p_dropout, 431 | gin_channels=gin_channels, 432 | ) 433 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 434 | self.post.weight.data.zero_() 435 | self.post.bias.data.zero_() 436 | 437 | def forward(self, x, x_mask, g=None, reverse=False): 438 | x0, x1 = torch.split(x, [self.half_channels] * 2, 1) 439 | h = self.pre(x0) * x_mask 440 | h = self.enc(h, x_mask, g=g) 441 | stats = self.post(h) * x_mask 442 | if not self.mean_only: 443 | m, logs = torch.split(stats, 
[self.half_channels] * 2, 1) 444 | else: 445 | m = stats 446 | logs = torch.zeros_like(m) 447 | 448 | if not reverse: 449 | x1 = m + x1 * torch.exp(logs) * x_mask 450 | x = torch.cat([x0, x1], 1) 451 | logdet = torch.sum(logs, [1, 2]) 452 | return x, logdet 453 | else: 454 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 455 | x = torch.cat([x0, x1], 1) 456 | return x 457 | 458 | 459 | class ConvFlow(nn.Module): 460 | def __init__( 461 | self, 462 | in_channels, 463 | filter_channels, 464 | kernel_size, 465 | n_layers, 466 | num_bins=10, 467 | tail_bound=5.0, 468 | ): 469 | super().__init__() 470 | self.in_channels = in_channels 471 | self.filter_channels = filter_channels 472 | self.kernel_size = kernel_size 473 | self.n_layers = n_layers 474 | self.num_bins = num_bins 475 | self.tail_bound = tail_bound 476 | self.half_channels = in_channels // 2 477 | 478 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 479 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) 480 | self.proj = nn.Conv1d( 481 | filter_channels, self.half_channels * (num_bins * 3 - 1), 1 482 | ) 483 | self.proj.weight.data.zero_() 484 | self.proj.bias.data.zero_() 485 | 486 | def forward(self, x, x_mask, g=None, reverse=False): 487 | x0, x1 = torch.split(x, [self.half_channels] * 2, 1) 488 | h = self.pre(x0) 489 | h = self.convs(h, x_mask, g=g) 490 | h = self.proj(h) * x_mask 491 | 492 | b, c, t = x0.shape 493 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 494 | 495 | unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) 496 | unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( 497 | self.filter_channels 498 | ) 499 | unnormalized_derivatives = h[..., 2 * self.num_bins :] 500 | 501 | x1, logabsdet = piecewise_rational_quadratic_transform( 502 | x1, 503 | unnormalized_widths, 504 | unnormalized_heights, 505 | unnormalized_derivatives, 506 | inverse=reverse, 507 | tails="linear", 508 | tail_bound=self.tail_bound, 509 | ) 510 | 511 | x = torch.cat([x0, x1], 1) * x_mask 512 | logdet = torch.sum(logabsdet * x_mask, [1, 2]) 513 | if not reverse: 514 | return x, logdet 515 | else: 516 | return x 517 | -------------------------------------------------------------------------------- /monotonic_align.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def maximum_path(neg_cent, mask): 6 | """Pure Python implementation. 
7 | neg_cent: [b, t_t, t_s] 8 | mask: [b, t_t, t_s] 9 | """ 10 | device = neg_cent.device 11 | dtype = neg_cent.dtype 12 | neg_cent = neg_cent.detach().cpu().numpy().astype(np.float32) 13 | path = np.zeros(neg_cent.shape, dtype=np.int32) 14 | 15 | for i in range(neg_cent.shape[0]): 16 | t_t_max = int(mask[i].sum(1)[0]) 17 | t_s_max = int(mask[i].sum(2)[0]) 18 | maximum_path_each(path[i], neg_cent[i], t_t_max, t_s_max) 19 | 20 | return torch.from_numpy(path).to(device=device, dtype=dtype) 21 | 22 | 23 | def maximum_path_each(path, value, t_y, t_x, max_neg_val=-1e9): 24 | for y in range(t_y): 25 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 26 | if x == y: 27 | v_cur = max_neg_val 28 | else: 29 | v_cur = value[y - 1, x] 30 | if x == 0: 31 | if y == 0: 32 | v_prev = 0.0 33 | else: 34 | v_prev = max_neg_val 35 | else: 36 | v_prev = value[y - 1, x - 1] 37 | value[y, x] += max(v_prev, v_cur) 38 | 39 | index = t_x - 1 40 | for y in range(t_y - 1, -1, -1): 41 | path[y, index] = 1 42 | if index != 0 and (index == y or value[y - 1, index] < value[y - 1, index - 1]): 43 | index = index - 1 44 | -------------------------------------------------------------------------------- /openai_api.py: -------------------------------------------------------------------------------- 1 | import config 2 | import openai 3 | from io import BytesIO 4 | 5 | openai.api_key = config.OPENAI_API_KEY 6 | 7 | 8 | def chat_complete(context: list, text: str) -> tuple[list, str]: 9 | new_context = context.copy() 10 | new_context.append({"role": "user", "content": config.ENHANCE_PROMPT + text}) 11 | try: 12 | completion = openai.ChatCompletion.create( 13 | model="gpt-3.5-turbo", messages=config.INITIAL_CONTEXT + new_context 14 | ) 15 | message = completion["choices"][0]["message"]["content"] 16 | new_context.append({"role": "assistant", "content": message}) 17 | return (new_context, message) 18 | except Exception: 19 | return context, "" 20 | 21 | 22 | def transcript(audio: BytesIO, language: str) -> dict: 23 | try: 24 | if language == "ja": 25 | result = openai.Audio.transcribe( 26 | "whisper-1", 27 | audio, 28 | language=language, 29 | prompt=config.TRANSCRIPT_PROMPT.get(language, ""), 30 | ) 31 | elif language == "zh": 32 | result = openai.Audio.transcribe( 33 | "whisper-1", 34 | audio, 35 | language=language, 36 | prompt=config.TRANSCRIPT_PROMPT.get(language, ""), 37 | ) 38 | else: 39 | result = openai.Audio.transcribe("whisper-1", audio) 40 | except openai.InvalidRequestError: 41 | return {"code": 1} 42 | except Exception: 43 | return {"code": -1} 44 | return {"code": 0, "result": result["text"]} 45 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | librosa 2 | matplotlib 3 | numpy 4 | phonemizer 5 | scipy 6 | torch 7 | torchvision 8 | Unidecode 9 | pyopenjtalk 10 | flask 11 | waitress 12 | openai -------------------------------------------------------------------------------- /static/css/index.css: -------------------------------------------------------------------------------- 1 | html, 2 | body { 3 | margin: 0; 4 | overflow: hidden; 5 | touch-action: none; 6 | } 7 | 8 | textarea { 9 | font-size: 24px; 10 | border: none; 11 | outline: 0; 12 | border-radius: 32px; 13 | padding: 32px; 14 | } 15 | 16 | input { 17 | font-size: 23px; 18 | border: none; 19 | outline: 0; 20 | border-radius: 32px; 21 | padding: 32px; 22 | } 23 | 24 | .center { 25 | position: absolute; 26 | left: 50%; 27 | transform:
translateX(-50%); 28 | } 29 | 30 | .button-img { 31 | max-width: 100%; 32 | max-height: 100%; 33 | width: auto; 34 | height: auto; 35 | } 36 | 37 | .button-img:active { 38 | opacity: 0.5; 39 | } 40 | 41 | .top-button { 42 | background-color: transparent; 43 | border: none; 44 | border-radius: 5px; 45 | } 46 | 47 | #header { 48 | position: fixed; 49 | top: 0; 50 | right: 0; 51 | } 52 | 53 | #live2d-canvas { 54 | position: absolute; 55 | width: 100%; 56 | height: 100%; 57 | z-index: -1; 58 | } 59 | 60 | #textarea-div { 61 | position: absolute; 62 | bottom: 5%; 63 | width: 100%; 64 | display: flex; 65 | flex-direction: column; 66 | align-items: center; 67 | justify-items: flex-end; 68 | } 69 | 70 | #captions-textarea { 71 | flex-grow: 2; 72 | background-color: rgba(0, 0, 0, 0.1); 73 | max-width: 1000px; 74 | width: 80%; 75 | height: 80%; 76 | } 77 | 78 | #message-input { 79 | flex-grow: 1; 80 | background-color: rgba(0, 0, 0, 0.1); 81 | max-width: 1000px; 82 | height: 20%; 83 | width: 80%; 84 | } 85 | 86 | #reload-button { 87 | animation: load 1s; 88 | } 89 | 90 | #reload-button:active { 91 | animation: none; 92 | } 93 | 94 | @media screen and (orientation: landscape) { 95 | #header { 96 | height: 8%; 97 | } 98 | 99 | #textarea-div { 100 | height: 40%; 101 | } 102 | 103 | .top-button { 104 | height: 100%; 105 | width: auto; 106 | } 107 | } 108 | 109 | @media screen and (orientation: portrait) { 110 | #header { 111 | width: 15%; 112 | } 113 | 114 | #textarea-div { 115 | height: 25%; 116 | } 117 | 118 | .top-button { 119 | width: 100%; 120 | height: auto; 121 | } 122 | } 123 | 124 | @keyframes load { 125 | 0% { 126 | transform: rotate(0deg); 127 | } 128 | 129 | 100% { 130 | transform: rotate(360deg); 131 | } 132 | } -------------------------------------------------------------------------------- /static/images/ja.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mirrorange/chatvits/76ef67c6d56662ca2af7313304c60f29c1c0f47f/static/images/ja.png -------------------------------------------------------------------------------- /static/images/record.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mirrorange/chatvits/76ef67c6d56662ca2af7313304c60f29c1c0f47f/static/images/record.png -------------------------------------------------------------------------------- /static/images/recording.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mirrorange/chatvits/76ef67c6d56662ca2af7313304c60f29c1c0f47f/static/images/recording.png -------------------------------------------------------------------------------- /static/images/reload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mirrorange/chatvits/76ef67c6d56662ca2af7313304c60f29c1c0f47f/static/images/reload.png -------------------------------------------------------------------------------- /static/images/text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mirrorange/chatvits/76ef67c6d56662ca2af7313304c60f29c1c0f47f/static/images/text.png -------------------------------------------------------------------------------- /static/images/zh.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mirrorange/chatvits/76ef67c6d56662ca2af7313304c60f29c1c0f47f/static/images/zh.png -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 |
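
A quick way to sanity-check the invertible flow blocks defined in modules.py above (ResidualCouplingLayer and Flip) is to push a tensor through the forward direction and then reverse it. The sketch below is not part of the repository: the channel sizes, sequence length, and the round-trip test itself are illustrative assumptions, and it assumes modules.py and its own imports resolve from the project root.

import torch
from modules import ResidualCouplingLayer, Flip

# Illustrative sizes (assumptions, not values the project actually uses).
channels, hidden_channels, t = 4, 16, 50

coupling = ResidualCouplingLayer(
    channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=2
)
flip = Flip()

x = torch.randn(1, channels, t)
x_mask = torch.ones(1, 1, t)  # all frames valid

# Forward direction: each flow returns the transformed tensor and a log-determinant.
y, logdet_coupling = coupling(x, x_mask)
y, logdet_flip = flip(y, x_mask)

# Reverse direction: undo the flows in the opposite order; only the tensor is returned.
z = flip(y, x_mask, reverse=True)
x_rec = coupling(z, x_mask, reverse=True)

print(torch.allclose(x, x_rec, atol=1e-5))  # expected: True

Because the coupling layer's `post` convolution is zero-initialized, the freshly constructed layer is close to an identity transform, so the reconstruction check passes trivially at initialization; the same round trip also holds for trained weights, which is what makes these blocks usable as normalizing-flow steps.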