├── .gitignore
├── 1key.jpg
├── GPT_SoVITS
    ├── AR
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   └── __init__.cpython-310.pyc
    │   ├── data
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │   │   ├── __init__.cpython-310.pyc
    │   │   │   ├── bucket_sampler.cpython-310.pyc
    │   │   │   ├── data_module.cpython-310.pyc
    │   │   │   └── dataset.cpython-310.pyc
    │   │   ├── bucket_sampler.py
    │   │   ├── data_module.py
    │   │   └── dataset.py
    │   ├── models
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │   │   ├── __init__.cpython-310.pyc
    │   │   │   ├── t2s_lightning_module.cpython-310.pyc
    │   │   │   ├── t2s_model.cpython-310.pyc
    │   │   │   └── utils.cpython-310.pyc
    │   │   ├── t2s_lightning_module.py
    │   │   ├── t2s_lightning_module_onnx.py
    │   │   ├── t2s_model.py
    │   │   ├── t2s_model_onnx.py
    │   │   └── utils.py
    │   ├── modules
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │   │   ├── __init__.cpython-310.pyc
    │   │   │   ├── activation.cpython-310.pyc
    │   │   │   ├── embedding.cpython-310.pyc
    │   │   │   ├── lr_schedulers.cpython-310.pyc
    │   │   │   ├── optim.cpython-310.pyc
    │   │   │   ├── patched_mha_with_cache.cpython-310.pyc
    │   │   │   ├── scaling.cpython-310.pyc
    │   │   │   └── transformer.cpython-310.pyc
    │   │   ├── activation.py
    │   │   ├── activation_onnx.py
    │   │   ├── embedding.py
    │   │   ├── embedding_onnx.py
    │   │   ├── lr_schedulers.py
    │   │   ├── optim.py
    │   │   ├── patched_mha_with_cache.py
    │   │   ├── patched_mha_with_cache_onnx.py
    │   │   ├── scaling.py
    │   │   ├── transformer.py
    │   │   └── transformer_onnx.py
    │   ├── text_processing
    │   │   ├── __init__.py
    │   │   ├── phonemizer.py
    │   │   └── symbols.py
    │   └── utils
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │       ├── __init__.cpython-310.pyc
    │   │       └── io.cpython-310.pyc
    │   │   ├── initialize.py
    │   │   └── io.py
    ├── __pycache__
    │   ├── my_utils.cpython-310.pyc
    │   ├── process_ckpt.cpython-310.pyc
    │   └── utils.cpython-310.pyc
    ├── configs
    │   ├── s1.yaml
    │   ├── s1big.yaml
    │   ├── s1big2.yaml
    │   ├── s1longer.yaml
    │   ├── s1mq.yaml
    │   ├── s2.json
    │   └── train.yaml
    ├── feature_extractor
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-310.pyc
    │   │   ├── cnhubert.cpython-310.pyc
    │   │   └── whisper_enc.cpython-310.pyc
    │   ├── cnhubert.py
    │   └── whisper_enc.py
    ├── inference_gui.py
    ├── inference_webui.py
    ├── module
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-310.pyc
    │   │   ├── attentions.cpython-310.pyc
    │   │   ├── commons.cpython-310.pyc
    │   │   ├── core_vq.cpython-310.pyc
    │   │   ├── data_utils.cpython-310.pyc
    │   │   ├── losses.cpython-310.pyc
    │   │   ├── mel_processing.cpython-310.pyc
    │   │   ├── models.cpython-310.pyc
    │   │   ├── modules.cpython-310.pyc
    │   │   ├── mrte_model.cpython-310.pyc
    │   │   ├── quantize.cpython-310.pyc
    │   │   └── transforms.cpython-310.pyc
    │   ├── attentions.py
    │   ├── attentions_onnx.py
    │   ├── commons.py
    │   ├── core_vq.py
    │   ├── data_utils.py
    │   ├── losses.py
    │   ├── mel_processing.py
    │   ├── models.py
    │   ├── models_onnx.py
    │   ├── modules.py
    │   ├── mrte_model.py
    │   ├── quantize.py
    │   └── transforms.py
    ├── my_utils.py
    ├── onnx_export.py
    ├── prepare_datasets
    │   ├── 1-get-text.py
    │   ├── 2-get-hubert-wav32k.py
    │   └── 3-get-semantic.py
    ├── process_ckpt.py
    ├── s1_train.py
    ├── s2_train.py
    ├── text
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-310.pyc
    │   │   ├── chinese.cpython-310.pyc
    │   │   ├── cleaner.cpython-310.pyc
    │   │   ├── english.cpython-310.pyc
    │   │   ├── japanese.cpython-310.pyc
    │   │   ├── symbols.cpython-310.pyc
    │   │   └── tone_sandhi.cpython-310.pyc
    │   ├── chinese.py
    │   ├── cleaner.py
    │   ├── cmudict-fast.rep
    │   ├── cmudict.rep
    │   ├── engdict-hot.rep
    │   ├── engdict_cache.pickle
    │   ├── english.py
    │   ├── japanese.py
    │   ├── namedict_cache.pickle
    │   ├── opencpop-strict.txt
    │   ├── symbols.py
    │   ├── tone_sandhi.py
    │   └── zh_normalization
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │       ├── __init__.cpython-310.pyc
    │   │       ├── char_convert.cpython-310.pyc
    │   │       ├── chronology.cpython-310.pyc
    │   │       ├── constants.cpython-310.pyc
    │   │       ├── num.cpython-310.pyc
    │   │       ├── phonecode.cpython-310.pyc
    │   │       ├── quantifier.cpython-310.pyc
    │   │       └── text_normlization.cpython-310.pyc
    │   │   ├── char_convert.py
    │   │   ├── chronology.py
    │   │   ├── constants.py
    │   │   ├── num.py
    │   │   ├── phonecode.py
    │   │   ├── quantifier.py
    │   │   └── text_normlization.py
    └── utils.py
├── LICENSE
├── README.md
├── __init__.py
├── config.py
├── donate.jpg
├── finetune.py
├── inference.py
├── nodes.py
├── note.txt
├── requirements.txt
├── tools
    ├── __pycache__
    │   └── my_utils.cpython-310.pyc
    ├── i18n
    │   ├── __pycache__
    │   │   └── i18n.cpython-310.pyc
    │   ├── i18n.py
    │   ├── locale
    │   │   ├── en_US.json
    │   │   ├── es_ES.json
    │   │   ├── fr_FR.json
    │   │   ├── it_IT.json
    │   │   ├── ja_JP.json
    │   │   ├── ko_KR.json
    │   │   ├── ru_RU.json
    │   │   ├── tr_TR.json
    │   │   ├── zh_CN.json
    │   │   ├── zh_HK.json
    │   │   ├── zh_SG.json
    │   │   └── zh_TW.json
    │   ├── locale_diff.py
    │   └── scan_i18n.py
    └── my_utils.py
├── web.png
├── web
    └── js
    │   ├── alertMSG.js
    │   ├── previewAudio.js
    │   ├── refreshPath.js
    │   ├── uploadAudio.js
    │   └── uploadSRT.js
└── wechat.jpg


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | /pretrained_models
3 | /logs
4 | 


--------------------------------------------------------------------------------
/1key.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/1key.jpg


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/__init__.py


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/data/__init__.py


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/data/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/data/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/data/__pycache__/bucket_sampler.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/data/__pycache__/bucket_sampler.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/data/__pycache__/data_module.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/data/__pycache__/data_module.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/data/__pycache__/dataset.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/data/__pycache__/dataset.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/data/bucket_sampler.py:
--------------------------------------------------------------------------------
  1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/bucket_sampler.py
  2 | # reference: https://github.com/lifeiteng/vall-e
  3 | import itertools
  4 | import math
  5 | import random
  6 | from random import shuffle
  7 | from typing import Iterator
  8 | from typing import Optional
  9 | from typing import TypeVar
 10 | 
 11 | import torch
 12 | import torch.distributed as dist
 13 | from torch.utils.data import Dataset
 14 | from torch.utils.data import Sampler
 15 | 
 16 | __all__ = [
 17 |     "DistributedBucketSampler",
 18 | ]
 19 | 
 20 | T_co = TypeVar("T_co", covariant=True)
 21 | 
 22 | 
 23 | class DistributedBucketSampler(Sampler[T_co]):
 24 |     r"""
 25 |     sort the dataset wrt. input length
 26 |     divide samples into buckets
 27 |     sort within buckets
 28 |     divide buckets into batches
 29 |     sort batches
 30 |     """
 31 | 
 32 |     def __init__(
 33 |         self,
 34 |         dataset: Dataset,
 35 |         num_replicas: Optional[int] = None,
 36 |         rank: Optional[int] = None,
 37 |         shuffle: bool = True,
 38 |         seed: int = 0,
 39 |         drop_last: bool = False,
 40 |         batch_size: int = 32,
 41 |     ) -> None:
 42 |         if num_replicas is None:
 43 |             if not dist.is_available():
 44 |                 raise RuntimeError("Requires distributed package to be available")
 45 |             num_replicas = dist.get_world_size() if torch.cuda.is_available() else 1
 46 |         if rank is None:
 47 |             if not dist.is_available():
 48 |                 raise RuntimeError("Requires distributed package to be available")
 49 |             rank = dist.get_rank() if torch.cuda.is_available() else 0
 50 |             if torch.cuda.is_available():
 51 |                 torch.cuda.set_device(rank)
 52 |         if rank >= num_replicas or rank < 0:
 53 |             raise ValueError(
 54 |                 "Invalid rank {}, rank should be in the interval"
 55 |                 " [0, {}]".format(rank, num_replicas - 1)
 56 |             )
 57 |         self.dataset = dataset
 58 |         self.num_replicas = num_replicas
 59 |         self.rank = rank
 60 |         self.epoch = 0
 61 |         self.drop_last = drop_last
 62 |         # If the dataset length is evenly divisible by # of replicas, then there
 63 |         # is no need to drop any data, since the dataset will be split equally.
 64 |         if (
 65 |             self.drop_last and len(self.dataset) % self.num_replicas != 0
 66 |         ):  # type: ignore[arg-type]
 67 |             # Split to nearest available length that is evenly divisible.
 68 |             # This is to ensure each rank receives the same amount of data when
 69 |             # using this Sampler.
 70 |             self.num_samples = math.ceil(
 71 |                 (len(self.dataset) - self.num_replicas)
 72 |                 / self.num_replicas  # type: ignore[arg-type]
 73 |             )
 74 |         else:
 75 |             self.num_samples = math.ceil(
 76 |                 len(self.dataset) / self.num_replicas
 77 |             )  # type: ignore[arg-type]
 78 |         self.total_size = self.num_samples * self.num_replicas
 79 |         self.shuffle = shuffle
 80 |         self.seed = seed
 81 |         self.batch_size = batch_size
 82 |         self.id_with_length = self._get_sample_lengths()
 83 |         self.id_buckets = self.make_buckets(bucket_width=2.0)
 84 | 
 85 |     def _get_sample_lengths(self):
 86 |         id_with_lengths = []
 87 |         for i in range(len(self.dataset)):
 88 |             id_with_lengths.append((i, self.dataset.get_sample_length(i)))
 89 |         id_with_lengths.sort(key=lambda x: x[1])
 90 |         return id_with_lengths
 91 | 
 92 |     def make_buckets(self, bucket_width: float = 2.0):
 93 |         buckets = []
 94 |         cur = []
 95 |         max_sec = bucket_width
 96 |         for id, sec in self.id_with_length:
 97 |             if sec < max_sec:
 98 |                 cur.append(id)
 99 |             else:
100 |                 buckets.append(cur)
101 |                 cur = [id]
102 |                 max_sec += bucket_width
103 |         if len(cur) > 0:
104 |             buckets.append(cur)
105 |         return buckets
106 | 
107 |     def __iter__(self) -> Iterator[T_co]:
108 |         if self.shuffle:
109 |             # deterministically shuffle based on epoch and seed
110 |             g = torch.Generator()
111 |             g.manual_seed(self.seed + self.epoch)
112 |             random.seed(self.epoch + self.seed)
113 |             shuffled_bucket = []
114 |             for buc in self.id_buckets:
115 |                 buc_copy = buc.copy()
116 |                 shuffle(buc_copy)
117 |                 shuffled_bucket.append(buc_copy)
118 |             grouped_batch_size = self.batch_size * self.num_replicas
119 |             shuffled_bucket = list(itertools.chain(*shuffled_bucket))
120 |             n_batch = int(math.ceil(len(shuffled_bucket) / grouped_batch_size))
121 |             batches = [
122 |                 shuffled_bucket[b * grouped_batch_size : (b + 1) * grouped_batch_size]
123 |                 for b in range(n_batch)
124 |             ]
125 |             shuffle(batches)
126 |             indices = list(itertools.chain(*batches))
127 |         else:
128 |             # type: ignore[arg-type]
129 |             indices = list(range(len(self.dataset)))
130 | 
131 |         if not self.drop_last:
132 |             # add extra samples to make it evenly divisible
133 |             padding_size = self.total_size - len(indices)
134 |             if padding_size <= len(indices):
135 |                 indices += indices[:padding_size]
136 |             else:
137 |                 indices += (indices * math.ceil(padding_size / len(indices)))[
138 |                     :padding_size
139 |                 ]
140 |         else:
141 |             # remove tail of data to make it evenly divisible.
142 |             indices = indices[: self.total_size]
143 |         assert len(indices) == self.total_size
144 | 
145 |         # subsample
146 |         indices = indices[self.rank : self.total_size : self.num_replicas]
147 |         assert len(indices) == self.num_samples
148 | 
149 |         return iter(indices)
150 | 
151 |     def __len__(self) -> int:
152 |         return self.num_samples
153 | 
154 |     def set_epoch(self, epoch: int) -> None:
155 |         r"""
156 |         Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas
157 |         use a different random ordering for each epoch. Otherwise, the next iteration of this
158 |         sampler will yield the same ordering.
159 | 
160 |         Args:
161 |             epoch (int): Epoch number.
162 |         """
163 |         self.epoch = epoch
164 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/data/data_module.py:
--------------------------------------------------------------------------------
 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py
 2 | # reference: https://github.com/lifeiteng/vall-e
 3 | from pytorch_lightning import LightningDataModule
 4 | from AR.data.bucket_sampler import DistributedBucketSampler
 5 | from AR.data.dataset import Text2SemanticDataset
 6 | from torch.utils.data import DataLoader
 7 | 
 8 | 
 9 | class Text2SemanticDataModule(LightningDataModule):
10 |     def __init__(
11 |         self,
12 |         config,
13 |         train_semantic_path,
14 |         train_phoneme_path,
15 |         dev_semantic_path=None,
16 |         dev_phoneme_path=None,
17 |     ):
18 |         super().__init__()
19 |         self.config = config
20 |         self.train_semantic_path = train_semantic_path
21 |         self.train_phoneme_path = train_phoneme_path
22 |         self.dev_semantic_path = dev_semantic_path
23 |         self.dev_phoneme_path = dev_phoneme_path
24 |         self.num_workers = self.config["data"]["num_workers"]
25 | 
26 |     def prepare_data(self):
27 |         pass
28 | 
29 |     def setup(self, stage=None, output_logs=False):
30 |         self._train_dataset = Text2SemanticDataset(
31 |             phoneme_path=self.train_phoneme_path,
32 |             semantic_path=self.train_semantic_path,
33 |             max_sec=self.config["data"]["max_sec"],
34 |             pad_val=self.config["data"]["pad_val"],
35 |         )
36 |         self._dev_dataset = self._train_dataset
37 |         # self._dev_dataset = Text2SemanticDataset(
38 |         #     phoneme_path=self.dev_phoneme_path,
39 |         #     semantic_path=self.dev_semantic_path,
40 |         #     max_sample=self.config['data']['max_eval_sample'],
41 |         #     max_sec=self.config['data']['max_sec'],
42 |         #     pad_val=self.config['data']['pad_val'])
43 | 
44 |     def train_dataloader(self):
45 |         batch_size=self.config["train"]["batch_size"]//2 if self.config["train"].get("if_dpo",False)==True else self.config["train"]["batch_size"]
46 |         batch_size = max(min(batch_size,len(self._train_dataset)//4),1)#防止不保存
47 |         sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size)
48 |         return DataLoader(
49 |             self._train_dataset,
50 |             batch_size=batch_size,
51 |             sampler=sampler,
52 |             collate_fn=self._train_dataset.collate,
53 |             num_workers=self.num_workers,
54 |             persistent_workers=True,
55 |             prefetch_factor=16,
56 |         )
57 | 
58 |     def val_dataloader(self):
59 |         return DataLoader(
60 |             self._dev_dataset,
61 |             batch_size=1,
62 |             shuffle=False,
63 |             collate_fn=self._train_dataset.collate,
64 |             num_workers=max(self.num_workers, 12),
65 |             persistent_workers=True,
66 |             prefetch_factor=16,
67 |         )
68 | 
69 |     # 这个会使用到嘛？
70 |     def test_dataloader(self):
71 |         return DataLoader(
72 |             self._dev_dataset,
73 |             batch_size=1,
74 |             shuffle=False,
75 |             collate_fn=self._train_dataset.collate,
76 |         )
77 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/models/__init__.py


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/models/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/models/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/models/__pycache__/t2s_lightning_module.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/models/__pycache__/t2s_lightning_module.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/models/__pycache__/t2s_model.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/models/__pycache__/t2s_model.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/models/__pycache__/utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/models/__pycache__/utils.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/models/t2s_lightning_module.py:
--------------------------------------------------------------------------------
  1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py
  2 | # reference: https://github.com/lifeiteng/vall-e
  3 | import os, sys
  4 | 
  5 | now_dir = os.getcwd()
  6 | sys.path.append(now_dir)
  7 | from typing import Dict
  8 | 
  9 | import torch
 10 | from pytorch_lightning import LightningModule
 11 | from AR.models.t2s_model import Text2SemanticDecoder
 12 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule
 13 | from AR.modules.optim import ScaledAdam
 14 | 
 15 | class Text2SemanticLightningModule(LightningModule):
 16 |     def __init__(self, config, output_dir, is_train=True):
 17 |         super().__init__()
 18 |         self.config = config
 19 |         self.top_k = 3
 20 |         self.model = Text2SemanticDecoder(config=config, top_k=self.top_k)
 21 |         pretrained_s1 = config.get("pretrained_s1")
 22 |         if pretrained_s1 and is_train:
 23 |             # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"]))
 24 |             print(
 25 |                 self.load_state_dict(
 26 |                     torch.load(pretrained_s1, map_location="cpu")["weight"]
 27 |                 )
 28 |             )
 29 |         if is_train:
 30 |             self.automatic_optimization = False
 31 |             self.save_hyperparameters()
 32 |             self.eval_dir = output_dir / "eval"
 33 |             self.eval_dir.mkdir(parents=True, exist_ok=True)
 34 | 
 35 |     def training_step(self, batch: Dict, batch_idx: int):
 36 |         opt = self.optimizers()
 37 |         scheduler = self.lr_schedulers()
 38 |         forward=self.model.forward if self.config["train"].get("if_dpo",False)==True else self.model.forward_old
 39 |         loss, acc = forward(
 40 |             batch["phoneme_ids"],
 41 |             batch["phoneme_ids_len"],
 42 |             batch["semantic_ids"],
 43 |             batch["semantic_ids_len"],
 44 |             batch["bert_feature"],
 45 |         )
 46 |         self.manual_backward(loss)
 47 |         if batch_idx > 0 and batch_idx % 4 == 0:
 48 |             opt.step()
 49 |             opt.zero_grad()
 50 |             scheduler.step()
 51 | 
 52 |         self.log(
 53 |             "total_loss",
 54 |             loss,
 55 |             on_step=True,
 56 |             on_epoch=True,
 57 |             prog_bar=True,
 58 |             sync_dist=True,
 59 |         )
 60 |         self.log(
 61 |             "lr",
 62 |             scheduler.get_last_lr()[0],
 63 |             on_epoch=True,
 64 |             prog_bar=True,
 65 |             sync_dist=True,
 66 |         )
 67 |         self.log(
 68 |             f"top_{self.top_k}_acc",
 69 |             acc,
 70 |             on_step=True,
 71 |             on_epoch=True,
 72 |             prog_bar=True,
 73 |             sync_dist=True,
 74 |         )
 75 | 
 76 |     def validation_step(self, batch: Dict, batch_idx: int):
 77 |         return
 78 | 
 79 |     # # get loss
 80 |     # loss, acc = self.model.forward(
 81 |     #     batch['phoneme_ids'], batch['phoneme_ids_len'],
 82 |     #     batch['semantic_ids'], batch['semantic_ids_len'],
 83 |     #     batch['bert_feature']
 84 |     # )
 85 |     #
 86 |     # self.log(
 87 |     #     "val_total_loss",
 88 |     #     loss,
 89 |     #     on_step=True,
 90 |     #     on_epoch=True,
 91 |     #     prog_bar=True,
 92 |     #     sync_dist=True)
 93 |     # self.log(
 94 |     #     f"val_top_{self.top_k}_acc",
 95 |     #     acc,
 96 |     #     on_step=True,
 97 |     #     on_epoch=True,
 98 |     #     prog_bar=True,
 99 |     #     sync_dist=True)
100 |     #
101 |     # # get infer output
102 |     # semantic_len = batch['semantic_ids'].size(1)
103 |     # prompt_len = min(int(semantic_len * 0.5), 150)
104 |     # prompt = batch['semantic_ids'][:, :prompt_len]
105 |     # pred_semantic = self.model.infer(batch['phoneme_ids'],
106 |     #                                  batch['phoneme_ids_len'], prompt,
107 |     #                                  batch['bert_feature']
108 |     #                                  )
109 |     # save_name = f'semantic_toks_{batch_idx}.pt'
110 |     # save_path = os.path.join(self.eval_dir, save_name)
111 |     # torch.save(pred_semantic.detach().cpu(), save_path)
112 | 
113 |     def configure_optimizers(self):
114 |         model_parameters = self.model.parameters()
115 |         parameters_names = []
116 |         parameters_names.append(
117 |             [name_param_pair[0] for name_param_pair in self.model.named_parameters()]
118 |         )
119 |         lm_opt = ScaledAdam(
120 |             model_parameters,
121 |             lr=0.01,
122 |             betas=(0.9, 0.95),
123 |             clipping_scale=2.0,
124 |             parameters_names=parameters_names,
125 |             show_dominant_parameters=False,
126 |             clipping_update_period=1000,
127 |         )
128 | 
129 |         return {
130 |             "optimizer": lm_opt,
131 |             "lr_scheduler": {
132 |                 "scheduler": WarmupCosineLRSchedule(
133 |                     lm_opt,
134 |                     init_lr=self.config["optimizer"]["lr_init"],
135 |                     peak_lr=self.config["optimizer"]["lr"],
136 |                     end_lr=self.config["optimizer"]["lr_end"],
137 |                     warmup_steps=self.config["optimizer"]["warmup_steps"],
138 |                     total_steps=self.config["optimizer"]["decay_steps"],
139 |                 )
140 |             },
141 |         }
142 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py:
--------------------------------------------------------------------------------
  1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py
  2 | # reference: https://github.com/lifeiteng/vall-e
  3 | import os, sys
  4 | 
  5 | now_dir = os.getcwd()
  6 | sys.path.append(now_dir)
  7 | from typing import Dict
  8 | 
  9 | import torch
 10 | from pytorch_lightning import LightningModule
 11 | from AR.models.t2s_model_onnx import Text2SemanticDecoder
 12 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule
 13 | from AR.modules.optim import ScaledAdam
 14 | 
 15 | 
 16 | class Text2SemanticLightningModule(LightningModule):
 17 |     def __init__(self, config, output_dir, is_train=True):
 18 |         super().__init__()
 19 |         self.config = config
 20 |         self.top_k = 3
 21 |         self.model = Text2SemanticDecoder(config=config, top_k=self.top_k)
 22 |         pretrained_s1 = config.get("pretrained_s1")
 23 |         if pretrained_s1 and is_train:
 24 |             # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"]))
 25 |             print(
 26 |                 self.load_state_dict(
 27 |                     torch.load(pretrained_s1, map_location="cpu")["weight"]
 28 |                 )
 29 |             )
 30 |         if is_train:
 31 |             self.automatic_optimization = False
 32 |             self.save_hyperparameters()
 33 |             self.eval_dir = output_dir / "eval"
 34 |             self.eval_dir.mkdir(parents=True, exist_ok=True)
 35 | 
 36 |     def training_step(self, batch: Dict, batch_idx: int):
 37 |         opt = self.optimizers()
 38 |         scheduler = self.lr_schedulers()
 39 |         loss, acc = self.model.forward(
 40 |             batch["phoneme_ids"],
 41 |             batch["phoneme_ids_len"],
 42 |             batch["semantic_ids"],
 43 |             batch["semantic_ids_len"],
 44 |             batch["bert_feature"],
 45 |         )
 46 |         self.manual_backward(loss)
 47 |         if batch_idx > 0 and batch_idx % 4 == 0:
 48 |             opt.step()
 49 |             opt.zero_grad()
 50 |             scheduler.step()
 51 | 
 52 |         self.log(
 53 |             "total_loss",
 54 |             loss,
 55 |             on_step=True,
 56 |             on_epoch=True,
 57 |             prog_bar=True,
 58 |             sync_dist=True,
 59 |         )
 60 |         self.log(
 61 |             "lr",
 62 |             scheduler.get_last_lr()[0],
 63 |             on_epoch=True,
 64 |             prog_bar=True,
 65 |             sync_dist=True,
 66 |         )
 67 |         self.log(
 68 |             f"top_{self.top_k}_acc",
 69 |             acc,
 70 |             on_step=True,
 71 |             on_epoch=True,
 72 |             prog_bar=True,
 73 |             sync_dist=True,
 74 |         )
 75 | 
 76 |     def validation_step(self, batch: Dict, batch_idx: int):
 77 |         return
 78 | 
 79 |     def configure_optimizers(self):
 80 |         model_parameters = self.model.parameters()
 81 |         parameters_names = []
 82 |         parameters_names.append(
 83 |             [name_param_pair[0] for name_param_pair in self.model.named_parameters()]
 84 |         )
 85 |         lm_opt = ScaledAdam(
 86 |             model_parameters,
 87 |             lr=0.01,
 88 |             betas=(0.9, 0.95),
 89 |             clipping_scale=2.0,
 90 |             parameters_names=parameters_names,
 91 |             show_dominant_parameters=False,
 92 |             clipping_update_period=1000,
 93 |         )
 94 | 
 95 |         return {
 96 |             "optimizer": lm_opt,
 97 |             "lr_scheduler": {
 98 |                 "scheduler": WarmupCosineLRSchedule(
 99 |                     lm_opt,
100 |                     init_lr=self.config["optimizer"]["lr_init"],
101 |                     peak_lr=self.config["optimizer"]["lr"],
102 |                     end_lr=self.config["optimizer"]["lr_end"],
103 |                     warmup_steps=self.config["optimizer"]["warmup_steps"],
104 |                     total_steps=self.config["optimizer"]["decay_steps"],
105 |                 )
106 |             },
107 |         }
108 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__init__.py


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/__pycache__/activation.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__pycache__/activation.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/__pycache__/embedding.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__pycache__/embedding.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/__pycache__/lr_schedulers.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__pycache__/lr_schedulers.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/__pycache__/optim.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__pycache__/optim.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/__pycache__/patched_mha_with_cache.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__pycache__/patched_mha_with_cache.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/__pycache__/scaling.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__pycache__/scaling.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/__pycache__/transformer.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__pycache__/transformer.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/activation_onnx.py:
--------------------------------------------------------------------------------
  1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py
  2 | from typing import Optional
  3 | from typing import Tuple
  4 | import torch
  5 | from torch import Tensor
  6 | from torch.nn import Linear
  7 | from torch.nn import Module
  8 | from torch.nn.init import constant_
  9 | from torch.nn.init import xavier_normal_
 10 | from torch.nn.init import xavier_uniform_
 11 | from torch.nn.modules.linear import NonDynamicallyQuantizableLinear
 12 | from torch.nn.parameter import Parameter
 13 | 
 14 | from torch.nn import functional as F
 15 | from AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched
 16 | 
 17 | 
 18 | class MultiheadAttention(Module):
 19 |     __constants__ = ["batch_first"]
 20 |     bias_k: Optional[torch.Tensor]
 21 |     bias_v: Optional[torch.Tensor]
 22 | 
 23 |     def __init__(
 24 |         self,
 25 |         embed_dim,
 26 |         num_heads,
 27 |         dropout=0.0,
 28 |         bias=True,
 29 |         add_bias_kv=False,
 30 |         add_zero_attn=False,
 31 |         kdim=None,
 32 |         vdim=None,
 33 |         batch_first=False,
 34 |         linear1_cls=Linear,
 35 |         linear2_cls=Linear,
 36 |         device=None,
 37 |         dtype=None,
 38 |     ) -> None:
 39 |         factory_kwargs = {"device": device, "dtype": dtype}
 40 |         super(MultiheadAttention, self).__init__()
 41 |         self.embed_dim = embed_dim
 42 |         self.kdim = kdim if kdim is not None else embed_dim
 43 |         self.vdim = vdim if vdim is not None else embed_dim
 44 |         self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
 45 | 
 46 |         self.num_heads = num_heads
 47 |         self.dropout = dropout
 48 |         self.batch_first = batch_first
 49 |         self.head_dim = embed_dim // num_heads
 50 |         assert (
 51 |             self.head_dim * num_heads == self.embed_dim
 52 |         ), "embed_dim must be divisible by num_heads"
 53 | 
 54 |         if add_bias_kv:
 55 |             self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
 56 |             self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
 57 |         else:
 58 |             self.bias_k = self.bias_v = None
 59 | 
 60 |         if linear1_cls == Linear:
 61 |             if not self._qkv_same_embed_dim:
 62 |                 self.q_proj_weight = Parameter(
 63 |                     torch.empty((embed_dim, embed_dim), **factory_kwargs)
 64 |                 )
 65 |                 self.k_proj_weight = Parameter(
 66 |                     torch.empty((embed_dim, self.kdim), **factory_kwargs)
 67 |                 )
 68 |                 self.v_proj_weight = Parameter(
 69 |                     torch.empty((embed_dim, self.vdim), **factory_kwargs)
 70 |                 )
 71 |                 self.register_parameter("in_proj_weight", None)
 72 |             else:
 73 |                 self.in_proj_weight = Parameter(
 74 |                     torch.empty((3 * embed_dim, embed_dim), **factory_kwargs)
 75 |                 )
 76 |                 self.register_parameter("q_proj_weight", None)
 77 |                 self.register_parameter("k_proj_weight", None)
 78 |                 self.register_parameter("v_proj_weight", None)
 79 | 
 80 |             if bias:
 81 |                 self.in_proj_bias = Parameter(
 82 |                     torch.empty(3 * embed_dim, **factory_kwargs)
 83 |                 )
 84 |             else:
 85 |                 self.register_parameter("in_proj_bias", None)
 86 |             self.out_proj = NonDynamicallyQuantizableLinear(
 87 |                 embed_dim, embed_dim, bias=bias, **factory_kwargs
 88 |             )
 89 | 
 90 |             self._reset_parameters()
 91 |         else:
 92 |             if not self._qkv_same_embed_dim:
 93 |                 raise NotImplementedError
 94 |             else:
 95 |                 self.in_proj_linear = linear1_cls(
 96 |                     embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs
 97 |                 )
 98 |                 self.in_proj_weight = self.in_proj_linear.weight
 99 | 
100 |                 self.register_parameter("q_proj_weight", None)
101 |                 self.register_parameter("k_proj_weight", None)
102 |                 self.register_parameter("v_proj_weight", None)
103 | 
104 |                 if bias:
105 |                     self.in_proj_bias = self.in_proj_linear.bias
106 |                 else:
107 |                     self.register_parameter("in_proj_bias", None)
108 | 
109 |             self.out_proj = linear2_cls(
110 |                 embed_dim, embed_dim, bias=bias, **factory_kwargs
111 |             )
112 | 
113 |             if self.bias_k is not None:
114 |                 xavier_normal_(self.bias_k)
115 |             if self.bias_v is not None:
116 |                 xavier_normal_(self.bias_v)
117 | 
118 |         self.add_zero_attn = add_zero_attn
119 | 
120 |     def _reset_parameters(self):
121 |         if self._qkv_same_embed_dim:
122 |             xavier_uniform_(self.in_proj_weight)
123 |         else:
124 |             xavier_uniform_(self.q_proj_weight)
125 |             xavier_uniform_(self.k_proj_weight)
126 |             xavier_uniform_(self.v_proj_weight)
127 | 
128 |         if self.in_proj_bias is not None:
129 |             constant_(self.in_proj_bias, 0.0)
130 |             constant_(self.out_proj.bias, 0.0)
131 | 
132 |         if self.bias_k is not None:
133 |             xavier_normal_(self.bias_k)
134 |         if self.bias_v is not None:
135 |             xavier_normal_(self.bias_v)
136 | 
137 |     def __setstate__(self, state):
138 |         # Support loading old MultiheadAttention checkpoints generated by v1.1.0
139 |         if "_qkv_same_embed_dim" not in state:
140 |             state["_qkv_same_embed_dim"] = True
141 | 
142 |         super(MultiheadAttention, self).__setstate__(state)
143 | 
144 |     def forward(
145 |         self,
146 |         query: Tensor,
147 |         key: Tensor,
148 |         value: Tensor,
149 |         key_padding_mask: Optional[Tensor] = None,
150 |         need_weights: bool = True,
151 |         attn_mask: Optional[Tensor] = None,
152 |         average_attn_weights: bool = True,
153 |         cache=None,
154 |     ) -> Tuple[Tensor, Optional[Tensor]]:
155 |         any_nested = query.is_nested or key.is_nested or value.is_nested
156 |         query = key = value = query.transpose(1, 0)
157 |         attn_output = multi_head_attention_forward_patched(
158 |             query,
159 |             key,
160 |             value,
161 |             self.embed_dim,
162 |             self.num_heads,
163 |             self.in_proj_weight,
164 |             self.in_proj_bias,
165 |             self.bias_k,
166 |             self.bias_v,
167 |             self.add_zero_attn,
168 |             self.dropout,
169 |             self.out_proj.weight,
170 |             self.out_proj.bias,
171 |             training=self.training,
172 |             key_padding_mask=key_padding_mask,
173 |             need_weights=need_weights,
174 |             attn_mask=attn_mask,
175 |             average_attn_weights=average_attn_weights,
176 |             cache=cache,
177 |         )
178 |         return attn_output.transpose(1, 0)
179 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/embedding.py:
--------------------------------------------------------------------------------
 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py
 2 | import math
 3 | 
 4 | import torch
 5 | from torch import nn
 6 | 
 7 | 
 8 | class TokenEmbedding(nn.Module):
 9 |     def __init__(
10 |         self,
11 |         embedding_dim: int,
12 |         vocab_size: int,
13 |         dropout: float = 0.0,
14 |     ):
15 |         super().__init__()
16 | 
17 |         self.vocab_size = vocab_size
18 |         self.embedding_dim = embedding_dim
19 | 
20 |         self.dropout = torch.nn.Dropout(p=dropout)
21 |         self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
22 | 
23 |     @property
24 |     def weight(self) -> torch.Tensor:
25 |         return self.word_embeddings.weight
26 | 
27 |     def embedding(self, index: int) -> torch.Tensor:
28 |         return self.word_embeddings.weight[index : index + 1]
29 | 
30 |     def forward(self, x: torch.Tensor):
31 |         x = self.word_embeddings(x)
32 |         x = self.dropout(x)
33 |         return x
34 | 
35 | 
36 | class SinePositionalEmbedding(nn.Module):
37 |     def __init__(
38 |         self,
39 |         embedding_dim: int,
40 |         dropout: float = 0.0,
41 |         scale: bool = False,
42 |         alpha: bool = False,
43 |     ):
44 |         super().__init__()
45 |         self.embedding_dim = embedding_dim
46 |         self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
47 |         self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
48 |         self.dropout = torch.nn.Dropout(p=dropout)
49 | 
50 |         self.reverse = False
51 |         self.pe = None
52 |         self.extend_pe(torch.tensor(0.0).expand(1, 4000))
53 | 
54 |     def extend_pe(self, x):
55 |         """Reset the positional encodings."""
56 |         if self.pe is not None:
57 |             if self.pe.size(1) >= x.size(1):
58 |                 if self.pe.dtype != x.dtype or self.pe.device != x.device:
59 |                     self.pe = self.pe.to(dtype=x.dtype, device=x.device)
60 |                 return
61 |         pe = torch.zeros(x.size(1), self.embedding_dim)
62 |         if self.reverse:
63 |             position = torch.arange(
64 |                 x.size(1) - 1, -1, -1.0, dtype=torch.float32
65 |             ).unsqueeze(1)
66 |         else:
67 |             position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
68 |         div_term = torch.exp(
69 |             torch.arange(0, self.embedding_dim, 2, dtype=torch.float32)
70 |             * -(math.log(10000.0) / self.embedding_dim)
71 |         )
72 |         pe[:, 0::2] = torch.sin(position * div_term)
73 |         pe[:, 1::2] = torch.cos(position * div_term)
74 |         pe = pe.unsqueeze(0)
75 |         self.pe = pe.to(device=x.device, dtype=x.dtype).detach()
76 | 
77 |     def forward(self, x: torch.Tensor) -> torch.Tensor:
78 |         self.extend_pe(x)
79 |         output = x.unsqueeze(-1) if x.ndim == 2 else x
80 |         output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)]
81 |         return self.dropout(output)
82 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/embedding_onnx.py:
--------------------------------------------------------------------------------
 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py
 2 | import math
 3 | 
 4 | import torch
 5 | from torch import nn
 6 | 
 7 | 
 8 | class TokenEmbedding(nn.Module):
 9 |     def __init__(
10 |         self,
11 |         embedding_dim: int,
12 |         vocab_size: int,
13 |         dropout: float = 0.0,
14 |     ):
15 |         super().__init__()
16 | 
17 |         self.vocab_size = vocab_size
18 |         self.embedding_dim = embedding_dim
19 | 
20 |         self.dropout = torch.nn.Dropout(p=dropout)
21 |         self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
22 | 
23 |     @property
24 |     def weight(self) -> torch.Tensor:
25 |         return self.word_embeddings.weight
26 | 
27 |     def embedding(self, index: int) -> torch.Tensor:
28 |         return self.word_embeddings.weight[index : index + 1]
29 | 
30 |     def forward(self, x: torch.Tensor):
31 |         x = self.word_embeddings(x)
32 |         x = self.dropout(x)
33 |         return x
34 | 
35 | 
36 | class SinePositionalEmbedding(nn.Module):
37 |     def __init__(
38 |         self,
39 |         embedding_dim: int,
40 |         dropout: float = 0.0,
41 |         scale: bool = False,
42 |         alpha: bool = False,
43 |     ):
44 |         super().__init__()
45 |         self.embedding_dim = embedding_dim
46 |         self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
47 |         self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
48 |         self.dropout = torch.nn.Dropout(p=dropout)
49 |         self.reverse = False
50 |         self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim))
51 | 
52 |     def extend_pe(self, x):
53 |         position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1)
54 |         scpe = (position * self.div_term).unsqueeze(0)
55 |         pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0)
56 |         pe = pe.contiguous().view(1, -1, self.embedding_dim)
57 |         return pe
58 | 
59 |     def forward(self, x: torch.Tensor) -> torch.Tensor:
60 |         pe = self.extend_pe(x)
61 |         output = x.unsqueeze(-1) if x.ndim == 2 else x
62 |         output = output * self.x_scale + self.alpha * pe
63 |         return self.dropout(output)
64 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/lr_schedulers.py:
--------------------------------------------------------------------------------
 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py
 2 | # reference: https://github.com/lifeiteng/vall-e
 3 | import math
 4 | 
 5 | import torch
 6 | from matplotlib import pyplot as plt
 7 | from torch import nn
 8 | from torch.optim import Adam
 9 | 
10 | 
11 | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler):
12 |     """
13 |     Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers.
14 |     """
15 | 
16 |     def __init__(
17 |         self,
18 |         optimizer,
19 |         init_lr,
20 |         peak_lr,
21 |         end_lr,
22 |         warmup_steps=10000,
23 |         total_steps=400000,
24 |         current_step=0,
25 |     ):
26 |         self.init_lr = init_lr
27 |         self.peak_lr = peak_lr
28 |         self.end_lr = end_lr
29 |         self.optimizer = optimizer
30 |         self._warmup_rate = (peak_lr - init_lr) / warmup_steps
31 |         self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps)
32 |         self._current_step = current_step
33 |         self.lr = init_lr
34 |         self.warmup_steps = warmup_steps
35 |         self.total_steps = total_steps
36 |         self._last_lr = [self.lr]
37 | 
38 |     def set_lr(self, lr):
39 |         self._last_lr = [g["lr"] for g in self.optimizer.param_groups]
40 |         for g in self.optimizer.param_groups:
41 |             # g['lr'] = lr
42 |             g["lr"] = self.end_lr  ###锁定用线性
43 | 
44 |     def step(self):
45 |         if self._current_step < self.warmup_steps:
46 |             lr = self.init_lr + self._warmup_rate * self._current_step
47 | 
48 |         elif self._current_step > self.total_steps:
49 |             lr = self.end_lr
50 | 
51 |         else:
52 |             decay_ratio = (self._current_step - self.warmup_steps) / (
53 |                 self.total_steps - self.warmup_steps
54 |             )
55 |             if decay_ratio < 0.0 or decay_ratio > 1.0:
56 |                 raise RuntimeError(
57 |                     "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings."
58 |                 )
59 |             coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
60 |             lr = self.end_lr + coeff * (self.peak_lr - self.end_lr)
61 | 
62 |         self.lr = lr = self.end_lr = 0.002  ###锁定用线性###不听话，直接锁定！
63 |         self.set_lr(lr)
64 |         self.lr = lr
65 |         self._current_step += 1
66 |         return self.lr
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     m = nn.Linear(10, 10)
71 |     opt = Adam(m.parameters(), lr=1e-4)
72 |     s = WarmupCosineLRSchedule(
73 |         opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0
74 |     )
75 |     lrs = []
76 |     for i in range(25000):
77 |         s.step()
78 |         lrs.append(s.lr)
79 |         print(s.lr)
80 | 
81 |     plt.plot(lrs)
82 |     plt.plot(range(0, 25000), lrs)
83 |     plt.show()
84 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py:
--------------------------------------------------------------------------------
 1 | from torch.nn.functional import *
 2 | from torch.nn.functional import (
 3 |     _mha_shape_check,
 4 |     _canonical_mask,
 5 |     _none_or_dtype,
 6 |     _in_projection_packed,
 7 | )
 8 | 
 9 | def multi_head_attention_forward_patched(
10 |     query,
11 |     key,
12 |     value,
13 |     embed_dim_to_check: int,
14 |     num_heads: int,
15 |     in_proj_weight,
16 |     in_proj_bias: Optional[Tensor],
17 |     bias_k: Optional[Tensor],
18 |     bias_v: Optional[Tensor],
19 |     add_zero_attn: bool,
20 |     dropout_p: float,
21 |     out_proj_weight: Tensor,
22 |     out_proj_bias: Optional[Tensor],
23 |     training: bool = True,
24 |     key_padding_mask: Optional[Tensor] = None,
25 |     need_weights: bool = True,
26 |     attn_mask: Optional[Tensor] = None,
27 |     use_separate_proj_weight: bool = False,
28 |     q_proj_weight: Optional[Tensor] = None,
29 |     k_proj_weight: Optional[Tensor] = None,
30 |     v_proj_weight: Optional[Tensor] = None,
31 |     static_k: Optional[Tensor] = None,
32 |     static_v: Optional[Tensor] = None,
33 |     average_attn_weights: bool = True,
34 |     is_causal: bool = False,
35 |     cache=None,
36 | ) -> Tuple[Tensor, Optional[Tensor]]:
37 | 
38 |     # set up shape vars
39 |     _, _, embed_dim = query.shape
40 |     attn_mask = _canonical_mask(
41 |         mask=attn_mask,
42 |         mask_name="attn_mask",
43 |         other_type=None,
44 |         other_name="",
45 |         target_type=query.dtype,
46 |         check_other=False,
47 |     )
48 |     head_dim = embed_dim // num_heads
49 | 
50 |     proj_qkv = linear(query, in_proj_weight, in_proj_bias)
51 |     proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous()
52 |     q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2]
53 | 
54 |     if cache["first_infer"] == 1:
55 |         cache["k"][cache["stage"]] = k
56 |         cache["v"][cache["stage"]] = v
57 |     else:
58 |         cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0)
59 |         cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0)
60 |         k = cache["k"][cache["stage"]]
61 |         v = cache["v"][cache["stage"]]
62 |     cache["stage"] = (cache["stage"] + 1) % cache["all_stage"]
63 | 
64 |     attn_mask = _canonical_mask(
65 |         mask=attn_mask,
66 |         mask_name="attn_mask",
67 |         other_type=None,
68 |         other_name="",
69 |         target_type=q.dtype,
70 |         check_other=False,
71 |     )
72 |     attn_mask = attn_mask.unsqueeze(0)
73 | 
74 |     q = q.view(-1, num_heads, head_dim).transpose(0, 1)
75 |     k = k.view(-1, num_heads, head_dim).transpose(0, 1)
76 |     v = v.view(-1, num_heads, head_dim).transpose(0, 1)
77 | 
78 |     dropout_p = 0.0
79 |     attn_mask = attn_mask.unsqueeze(0)
80 |     q = q.view(num_heads, -1, head_dim).unsqueeze(0)
81 |     k = k.view(num_heads, -1, head_dim).unsqueeze(0)
82 |     v = v.view(num_heads, -1, head_dim).unsqueeze(0)
83 |     attn_output = scaled_dot_product_attention(
84 |         q, k, v, attn_mask, dropout_p, is_causal
85 |     )
86 |     attn_output = (
87 |         attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim)
88 |     )
89 |     attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
90 |     attn_output = attn_output.view(-1, 1, attn_output.size(1))
91 | 
92 |     return attn_output
93 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/text_processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/text_processing/__init__.py


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/text_processing/phonemizer.py:
--------------------------------------------------------------------------------
 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py
 2 | # reference: https://github.com/lifeiteng/vall-e
 3 | import itertools
 4 | import re
 5 | from typing import Dict
 6 | from typing import List
 7 | 
 8 | import regex
 9 | from gruut import sentences
10 | from gruut.const import Sentence
11 | from gruut.const import Word
12 | from AR.text_processing.symbols import SYMBOL_TO_ID
13 | 
14 | 
15 | class GruutPhonemizer:
16 |     def __init__(self, language: str):
17 |         self._phonemizer = sentences
18 |         self.lang = language
19 |         self.symbol_to_id = SYMBOL_TO_ID
20 |         self._special_cases_dict: Dict[str] = {
21 |             r"\.\.\.": "... ",
22 |             ";": "; ",
23 |             ":": ": ",
24 |             ",": ", ",
25 |             r"\.": ". ",
26 |             "!": "! ",
27 |             r"\?": "? ",
28 |             "—": "—",
29 |             "…": "… ",
30 |             "«": "«",
31 |             "»": "»",
32 |         }
33 |         self._punctuation_regexp: str = (
34 |             rf"([{''.join(self._special_cases_dict.keys())}])"
35 |         )
36 | 
37 |     def _normalize_punctuation(self, text: str) -> str:
38 |         text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text)
39 |         text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text)
40 |         text = regex.sub(r"\pZ+", r" ", text)
41 |         return text.strip()
42 | 
43 |     def _convert_punctuation(self, word: Word) -> str:
44 |         if not word.phonemes:
45 |             return ""
46 |         if word.phonemes[0] in ["‖", "|"]:
47 |             return word.text.strip()
48 | 
49 |         phonemes = "".join(word.phonemes)
50 |         # remove modifier characters ˈˌː with regex
51 |         phonemes = re.sub(r"[ˈˌː͡]", "", phonemes)
52 |         return phonemes.strip()
53 | 
54 |     def phonemize(self, text: str, espeak: bool = False) -> str:
55 |         text_to_phonemize: str = self._normalize_punctuation(text)
56 |         sents: List[Sentence] = [
57 |             sent
58 |             for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak)
59 |         ]
60 |         words: List[str] = [
61 |             self._convert_punctuation(word) for word in itertools.chain(*sents)
62 |         ]
63 |         return " ".join(words)
64 | 
65 |     def transform(self, phonemes):
66 |         # convert phonemes to ids
67 |         # dictionary is in symbols.py
68 |         return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()]
69 | 
70 | 
71 | if __name__ == "__main__":
72 |     phonemizer = GruutPhonemizer("en-us")
73 |     # text -> IPA
74 |     phonemes = phonemizer.phonemize("Hello, wor-ld ?")
75 |     print("phonemes:", phonemes)
76 |     print("len(phonemes):", len(phonemes))
77 |     phoneme_ids = phonemizer.transform(phonemes)
78 |     print("phoneme_ids:", phoneme_ids)
79 |     print("len(phoneme_ids):", len(phoneme_ids))
80 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/text_processing/symbols.py:
--------------------------------------------------------------------------------
 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py
 2 | # reference: https://github.com/lifeiteng/vall-e
 3 | PAD = "_"
 4 | PUNCTUATION = ';:,.!?¡¿—…"«»“” '
 5 | LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
 6 | IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
 7 | SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS)
 8 | SPACE_ID = SYMBOLS.index(" ")
 9 | SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)}
10 | ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)}
11 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | 
 4 | def str2bool(str):
 5 |     return True if str.lower() == 'true' else False
 6 | 
 7 | 
 8 | def get_newest_ckpt(string_list):
 9 |     # 定义一个正则表达式模式，用于匹配字符串中的数字
10 |     pattern = r'epoch=(\d+)-step=(\d+)\.ckpt'
11 | 
12 |     # 使用正则表达式提取每个字符串中的数字信息，并创建一个包含元组的列表
13 |     extracted_info = []
14 |     for string in string_list:
15 |         match = re.match(pattern, string)
16 |         if match:
17 |             epoch = int(match.group(1))
18 |             step = int(match.group(2))
19 |             extracted_info.append((epoch, step, string))
20 |     # 按照 epoch 后面的数字和 step 后面的数字进行排序
21 |     sorted_info = sorted(
22 |         extracted_info, key=lambda x: (x[0], x[1]), reverse=True)
23 |     # 获取最新的 ckpt 文件名
24 |     newest_ckpt = sorted_info[0][2]
25 |     return newest_ckpt
26 | 
27 | 
28 | # 文本存在且不为空时 return True
29 | def check_txt_file(file_path):
30 |     try:
31 |         with open(file_path, 'r') as file:
32 |             text = file.readline().strip()
33 |         assert text.strip() != ''
34 |         return text
35 |     except Exception:
36 |         return False
37 |     return False
38 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/utils/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/utils/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/utils/__pycache__/io.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/utils/__pycache__/io.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/utils/initialize.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Initialize modules for espnet2 neural networks."""
 3 | import torch
 4 | from typeguard import check_argument_types
 5 | 
 6 | 
 7 | def initialize(model: torch.nn.Module, init: str):
 8 |     """Initialize weights of a neural network module.
 9 | 
10 |     Parameters are initialized using the given method or distribution.
11 | 
12 |     Custom initialization routines can be implemented into submodules
13 |     as function `espnet_initialization_fn` within the custom module.
14 | 
15 |     Args:
16 |         model: Target.
17 |         init: Method of initialization.
18 |     """
19 |     assert check_argument_types()
20 |     print("init with", init)
21 | 
22 |     # weight init
23 |     for p in model.parameters():
24 |         if p.dim() > 1:
25 |             if init == "xavier_uniform":
26 |                 torch.nn.init.xavier_uniform_(p.data)
27 |             elif init == "xavier_normal":
28 |                 torch.nn.init.xavier_normal_(p.data)
29 |             elif init == "kaiming_uniform":
30 |                 torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu")
31 |             elif init == "kaiming_normal":
32 |                 torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu")
33 |             else:
34 |                 raise ValueError("Unknown initialization: " + init)
35 |     # bias init
36 |     for name, p in model.named_parameters():
37 |         if ".bias" in name and p.dim() == 1:
38 |             p.data.zero_()
39 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/AR/utils/io.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | import torch
 4 | import yaml
 5 | 
 6 | 
 7 | def load_yaml_config(path):
 8 |     with open(path) as f:
 9 |         config = yaml.full_load(f)
10 |     return config
11 | 
12 | 
13 | def save_config_to_yaml(config, path):
14 |     assert path.endswith(".yaml")
15 |     with open(path, "w") as f:
16 |         f.write(yaml.dump(config))
17 |         f.close()
18 | 
19 | 
20 | def write_args(args, path):
21 |     args_dict = dict(
22 |         (name, getattr(args, name)) for name in dir(args) if not name.startswith("_")
23 |     )
24 |     with open(path, "a") as args_file:
25 |         args_file.write("==> torch version: {}\n".format(torch.__version__))
26 |         args_file.write(
27 |             "==> cudnn version: {}\n".format(torch.backends.cudnn.version())
28 |         )
29 |         args_file.write("==> Cmd:\n")
30 |         args_file.write(str(sys.argv))
31 |         args_file.write("\n==> args:\n")
32 |         for k, v in sorted(args_dict.items()):
33 |             args_file.write("  %s: %s\n" % (str(k), str(v)))
34 |         args_file.close()
35 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/__pycache__/my_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/__pycache__/my_utils.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/__pycache__/process_ckpt.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/__pycache__/process_ckpt.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/__pycache__/utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/__pycache__/utils.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1.yaml:
--------------------------------------------------------------------------------
 1 | train:
 2 |   seed: 1234
 3 |   epochs: 300
 4 |   batch_size: 8
 5 |   gradient_accumulation: 4
 6 |   save_every_n_epoch: 1
 7 |   precision: 16
 8 |   gradient_clip: 1.0
 9 | optimizer:
10 |   lr: 0.01
11 |   lr_init: 0.00001
12 |   lr_end: 0.0001
13 |   warmup_steps: 2000
14 |   decay_steps: 40000
15 | data:
16 |   max_eval_sample: 8
17 |   max_sec: 54
18 |   num_workers: 1
19 |   pad_val: 1024 # same with EOS in model
20 | model:
21 |   vocab_size: 1025
22 |   phoneme_vocab_size: 512
23 |   embedding_dim: 512
24 |   hidden_dim: 512
25 |   head: 16
26 |   linear_units: 2048
27 |   n_layer: 12
28 |   dropout: 0
29 |   EOS: 1024
30 | inference:
31 |   top_k: 5
32 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1big.yaml:
--------------------------------------------------------------------------------
 1 | train:
 2 |   seed: 1234
 3 |   epochs: 300
 4 |   batch_size: 8
 5 |   gradient_accumulation: 4
 6 |   save_every_n_epoch: 1
 7 |   precision: 16-mixed
 8 |   gradient_clip: 1.0
 9 | optimizer:
10 |   lr: 0.01
11 |   lr_init: 0.00001
12 |   lr_end: 0.0001
13 |   warmup_steps: 2000
14 |   decay_steps: 40000
15 | data:
16 |   max_eval_sample: 8
17 |   max_sec: 54
18 |   num_workers: 1
19 |   pad_val: 1024 # same with EOS in model
20 | model:
21 |   vocab_size: 1025
22 |   phoneme_vocab_size: 512
23 |   embedding_dim: 1024
24 |   hidden_dim: 1024
25 |   head: 16
26 |   linear_units: 2048
27 |   n_layer: 16
28 |   dropout: 0
29 |   EOS: 1024
30 | inference:
31 |   top_k: 5
32 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1big2.yaml:
--------------------------------------------------------------------------------
 1 | train:
 2 |   seed: 1234
 3 |   epochs: 300
 4 |   batch_size: 12
 5 |   gradient_accumulation: 4
 6 |   save_every_n_epoch: 1
 7 |   precision: 16-mixed
 8 |   gradient_clip: 1.0
 9 | optimizer:
10 |   lr: 0.01
11 |   lr_init: 0.00001
12 |   lr_end: 0.0001
13 |   warmup_steps: 2000
14 |   decay_steps: 40000
15 | data:
16 |   max_eval_sample: 8
17 |   max_sec: 54
18 |   num_workers: 1
19 |   pad_val: 1024 # same with EOS in model
20 | model:
21 |   vocab_size: 1025
22 |   phoneme_vocab_size: 512
23 |   embedding_dim: 1024
24 |   hidden_dim: 1024
25 |   head: 16
26 |   linear_units: 2048
27 |   n_layer: 6
28 |   dropout: 0
29 |   EOS: 1024
30 | inference:
31 |   top_k: 5
32 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1longer.yaml:
--------------------------------------------------------------------------------
 1 | train:
 2 |   seed: 1234
 3 |   epochs: 20
 4 |   batch_size: 8
 5 |   save_every_n_epoch: 1
 6 |   precision: 16-mixed
 7 |   gradient_clip: 1.0
 8 | optimizer:
 9 |   lr: 0.01
10 |   lr_init: 0.00001
11 |   lr_end: 0.0001
12 |   warmup_steps: 2000
13 |   decay_steps: 40000
14 | data:
15 |   max_eval_sample: 8
16 |   max_sec: 54
17 |   num_workers: 4
18 |   pad_val: 1024 # same with EOS in model
19 | model:
20 |   vocab_size: 1025
21 |   phoneme_vocab_size: 512
22 |   embedding_dim: 512
23 |   hidden_dim: 512
24 |   head: 16
25 |   linear_units: 2048
26 |   n_layer: 24
27 |   dropout: 0
28 |   EOS: 1024
29 |   random_bert: 0
30 | inference:
31 |   top_k: 5
32 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1mq.yaml:
--------------------------------------------------------------------------------
 1 | train:
 2 |   seed: 1234
 3 |   epochs: 100
 4 |   batch_size: 6
 5 |   gradient_accumulation: 4
 6 |   save_every_n_epoch: 1
 7 |   precision: 32
 8 |   gradient_clip: 1.0
 9 | optimizer:
10 |   lr: 0.01
11 |   lr_init: 0.00001
12 |   lr_end: 0.0001
13 |   warmup_steps: 2000
14 |   decay_steps: 40000
15 | data:
16 |   max_eval_sample: 8
17 |   max_sec: 40
18 |   num_workers: 1
19 |   pad_val: 1024 # same with EOS in model
20 | model:
21 |   saving_path: "ckpt/"
22 |   resume_checkpoint: null
23 |   vocoder_config_path: "quantizer/new_ckpt/config.json"
24 |   vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000"
25 |   datadir: "/home/liweiche/GigaSpeech/wavs"
26 |   metapath: "/home/liweiche/GigaSpeech/train2.json"
27 |   val_metapath: "/home/liweiche/GigaSpeech/dev2.json"
28 |   sampledir: "logs/"
29 |   pretrained_path: null
30 |   lr: 0.0001
31 |   batch_size: 200.0
32 |   train_bucket_size: 8192
33 |   training_step: 800000
34 |   optim_flat_percent: 0.0
35 |   warmup_step: 50
36 |   adam_beta1: 0.9
37 |   adam_beta2: 0.98
38 |   ffd_size: 3072
39 |   hidden_size: 768
40 |   enc_nlayers: 6
41 |   dec_nlayers: 6
42 |   nheads: 12
43 |   ar_layer: 4
44 |   ar_ffd_size: 1024
45 |   ar_hidden_size: 256
46 |   ar_nheads: 4
47 |   aligner_softmax_temp: 1.0
48 |   layer_norm_eps: 0.00001
49 |   speaker_embed_dropout: 0.05
50 |   label_smoothing: 0.0
51 |   val_check_interval: 5000
52 |   check_val_every_n_epoch: 1
53 |   precision: "fp16"
54 |   nworkers: 16
55 |   distributed: true
56 |   accelerator: "ddp"
57 |   version: null
58 |   accumulate_grad_batches: 1
59 |   use_repetition_token: true
60 |   use_repetition_gating: false
61 |   repetition_penalty: 1.0
62 |   sampling_temperature: 1.0
63 |   top_k: -1
64 |   min_top_k: 3
65 |   top_p: 0.8
66 |   sample_num: 4
67 |   length_penalty_max_length: 15000
68 |   length_penalty_max_prob: 0.95
69 |   max_input_length: 2048
70 |   max_output_length: 2000
71 |   sample_rate: 16000
72 |   n_codes: 1024
73 |   n_cluster_groups: 1
74 |   phone_context_window: 4
75 |   phoneset_size: 1000
76 | inference:
77 |   top_k: 5
78 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "train": {
 3 |     "log_interval": 100,
 4 |     "eval_interval": 500,
 5 |     "seed": 1234,
 6 |     "epochs": 100,
 7 |     "learning_rate": 0.0001,
 8 |     "betas": [
 9 |       0.8,
10 |       0.99
11 |     ],
12 |     "eps": 1e-09,
13 |     "batch_size": 32,
14 |     "fp16_run": true,
15 |     "lr_decay": 0.999875,
16 |     "segment_size": 20480,
17 |     "init_lr_ratio": 1,
18 |     "warmup_epochs": 0,
19 |     "c_mel": 45,
20 |     "c_kl": 1.0,
21 |     "text_low_lr_rate": 0.4
22 |   },
23 |   "data": {
24 |     "max_wav_value": 32768.0,
25 |     "sampling_rate": 32000,
26 |     "filter_length": 2048,
27 |     "hop_length": 640,
28 |     "win_length": 2048,
29 |     "n_mel_channels": 128,
30 |     "mel_fmin": 0.0,
31 |     "mel_fmax": null,
32 |     "add_blank": true,
33 |     "n_speakers": 300,
34 |     "cleaned_text": true
35 |   },
36 |   "model": {
37 |     "inter_channels": 192,
38 |     "hidden_channels": 192,
39 |     "filter_channels": 768,
40 |     "n_heads": 2,
41 |     "n_layers": 6,
42 |     "kernel_size": 3,
43 |     "p_dropout": 0.1,
44 |     "resblock": "1",
45 |     "resblock_kernel_sizes": [
46 |       3,
47 |       7,
48 |       11
49 |     ],
50 |     "resblock_dilation_sizes": [
51 |       [
52 |         1,
53 |         3,
54 |         5
55 |       ],
56 |       [
57 |         1,
58 |         3,
59 |         5
60 |       ],
61 |       [
62 |         1,
63 |         3,
64 |         5
65 |       ]
66 |     ],
67 |     "upsample_rates": [
68 |       10,
69 |       8,
70 |       2,
71 |       2,
72 |       2
73 |     ],
74 |     "upsample_initial_channel": 512,
75 |     "upsample_kernel_sizes": [
76 |       16,
77 |       16,
78 |       8,
79 |       2,
80 |       2
81 |     ],
82 |     "n_layers_q": 3,
83 |     "use_spectral_norm": false,
84 |     "gin_channels": 512,
85 |     "semantic_frame_rate": "25hz",
86 |     "freeze_quantizer": true
87 |   },
88 |   "s2_ckpt_dir": "logs/s2/big2k1",
89 |   "content_module": "cnhubert"
90 | }


--------------------------------------------------------------------------------
/GPT_SoVITS/configs/train.yaml:
--------------------------------------------------------------------------------
 1 | gpu:
 2 |   n_card: 1
 3 |   n_process_per_card: 2
 4 | io:
 5 |   text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS
 6 |   save_every_n_epoch: 1
 7 |   precision: 16-mixed
 8 |   gradient_clip: 1.0
 9 | optimizer:
10 |   lr: 0.01
11 |   lr_init: 0.00001
12 |   lr_end: 0.0001
13 |   warmup_steps: 2000
14 |   decay_steps: 40000
15 | data:
16 |   max_eval_sample: 8
17 |   max_sec: 54
18 |   num_workers: 1
19 |   pad_val: 1024 # same with EOS in model
20 | model:
21 |   vocab_size: 1025
22 |   phoneme_vocab_size: 512
23 |   embedding_dim: 512
24 |   hidden_dim: 512
25 |   head: 16
26 |   linear_units: 2048
27 |   n_layer: 24
28 |   dropout: 0
29 |   EOS: 1024
30 |   random_bert: 0
31 | inference:
32 |   top_k: 5
33 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/__init__.py:
--------------------------------------------------------------------------------
1 | from . import cnhubert, whisper_enc
2 | 
3 | content_module_map = {
4 |     'cnhubert': cnhubert,
5 |     'whisper': whisper_enc
6 | }


--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/feature_extractor/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/__pycache__/cnhubert.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/feature_extractor/__pycache__/cnhubert.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/__pycache__/whisper_enc.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/feature_extractor/__pycache__/whisper_enc.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/cnhubert.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | 
  3 | import librosa
  4 | import torch
  5 | import torch.nn.functional as F
  6 | import soundfile as sf
  7 | import logging
  8 | 
  9 | logging.getLogger("numba").setLevel(logging.WARNING)
 10 | 
 11 | from transformers import (
 12 |     Wav2Vec2FeatureExtractor,
 13 |     HubertModel,
 14 | )
 15 | 
 16 | import utils
 17 | import torch.nn as nn
 18 | 
 19 | cnhubert_base_path = None
 20 | 
 21 | 
 22 | class CNHubert(nn.Module):
 23 |     def __init__(self):
 24 |         super().__init__()
 25 |         self.model = HubertModel.from_pretrained(cnhubert_base_path)
 26 |         self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
 27 |             cnhubert_base_path
 28 |         )
 29 | 
 30 |     def forward(self, x):
 31 |         input_values = self.feature_extractor(
 32 |             x, return_tensors="pt", sampling_rate=16000
 33 |         ).input_values.to(x.device)
 34 |         feats = self.model(input_values)["last_hidden_state"]
 35 |         return feats
 36 | 
 37 | 
 38 | # class CNHubertLarge(nn.Module):
 39 | #     def __init__(self):
 40 | #         super().__init__()
 41 | #         self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
 42 | #         self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
 43 | #     def forward(self, x):
 44 | #         input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
 45 | #         feats = self.model(input_values)["last_hidden_state"]
 46 | #         return feats
 47 | #
 48 | # class CVec(nn.Module):
 49 | #     def __init__(self):
 50 | #         super().__init__()
 51 | #         self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
 52 | #         self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
 53 | #     def forward(self, x):
 54 | #         input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
 55 | #         feats = self.model(input_values)["last_hidden_state"]
 56 | #         return feats
 57 | #
 58 | # class cnw2v2base(nn.Module):
 59 | #     def __init__(self):
 60 | #         super().__init__()
 61 | #         self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
 62 | #         self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
 63 | #     def forward(self, x):
 64 | #         input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
 65 | #         feats = self.model(input_values)["last_hidden_state"]
 66 | #         return feats
 67 | 
 68 | 
 69 | def get_model():
 70 |     model = CNHubert()
 71 |     model.eval()
 72 |     return model
 73 | 
 74 | 
 75 | # def get_large_model():
 76 | #     model = CNHubertLarge()
 77 | #     model.eval()
 78 | #     return model
 79 | #
 80 | # def get_model_cvec():
 81 | #     model = CVec()
 82 | #     model.eval()
 83 | #     return model
 84 | #
 85 | # def get_model_cnw2v2base():
 86 | #     model = cnw2v2base()
 87 | #     model.eval()
 88 | #     return model
 89 | 
 90 | 
 91 | def get_content(hmodel, wav_16k_tensor):
 92 |     with torch.no_grad():
 93 |         feats = hmodel(wav_16k_tensor)
 94 |     return feats.transpose(1, 2)
 95 | 
 96 | 
 97 | if __name__ == "__main__":
 98 |     model = get_model()
 99 |     src_path = "/Users/Shared/原音频2.wav"
100 |     wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000)
101 |     model = model
102 |     wav_16k_tensor = wav_16k_tensor
103 |     feats = get_content(model, wav_16k_tensor)
104 |     print(feats.shape)
105 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/whisper_enc.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | 
 4 | def get_model():
 5 |     import whisper
 6 | 
 7 |     model = whisper.load_model("small", device="cpu")
 8 | 
 9 |     return model.encoder
10 | 
11 | 
12 | def get_content(model=None, wav_16k_tensor=None):
13 |     from whisper import log_mel_spectrogram, pad_or_trim
14 | 
15 |     dev = next(model.parameters()).device
16 |     mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000]
17 |     # if torch.cuda.is_available():
18 |     #     mel = mel.to(torch.float16)
19 |     feature_len = mel.shape[-1] // 2
20 |     assert mel.shape[-1] < 3000, "输入音频过长，只允许输入30以内音频"
21 |     with torch.no_grad():
22 |         feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[
23 |             :1, :feature_len, :
24 |         ].transpose(1, 2)
25 |     return feature
26 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/module/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__init__.py


--------------------------------------------------------------------------------
/GPT_SoVITS/module/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/module/__pycache__/attentions.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/attentions.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/module/__pycache__/commons.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/commons.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/module/__pycache__/core_vq.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/core_vq.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/module/__pycache__/data_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/data_utils.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/module/__pycache__/losses.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/losses.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/module/__pycache__/mel_processing.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/mel_processing.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/module/__pycache__/models.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/models.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/module/__pycache__/modules.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/modules.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/module/__pycache__/mrte_model.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/mrte_model.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/module/__pycache__/quantize.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/quantize.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/module/__pycache__/transforms.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/transforms.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/module/commons.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import torch
  3 | from torch.nn import functional as F
  4 | 
  5 | 
  6 | def init_weights(m, mean=0.0, std=0.01):
  7 |     classname = m.__class__.__name__
  8 |     if classname.find("Conv") != -1:
  9 |         m.weight.data.normal_(mean, std)
 10 | 
 11 | 
 12 | def get_padding(kernel_size, dilation=1):
 13 |     return int((kernel_size * dilation - dilation) / 2)
 14 | 
 15 | 
 16 | def convert_pad_shape(pad_shape):
 17 |     l = pad_shape[::-1]
 18 |     pad_shape = [item for sublist in l for item in sublist]
 19 |     return pad_shape
 20 | 
 21 | 
 22 | def intersperse(lst, item):
 23 |     result = [item] * (len(lst) * 2 + 1)
 24 |     result[1::2] = lst
 25 |     return result
 26 | 
 27 | 
 28 | def kl_divergence(m_p, logs_p, m_q, logs_q):
 29 |     """KL(P||Q)"""
 30 |     kl = (logs_q - logs_p) - 0.5
 31 |     kl += (
 32 |         0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
 33 |     )
 34 |     return kl
 35 | 
 36 | 
 37 | def rand_gumbel(shape):
 38 |     """Sample from the Gumbel distribution, protect from overflows."""
 39 |     uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
 40 |     return -torch.log(-torch.log(uniform_samples))
 41 | 
 42 | 
 43 | def rand_gumbel_like(x):
 44 |     g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
 45 |     return g
 46 | 
 47 | 
 48 | def slice_segments(x, ids_str, segment_size=4):
 49 |     ret = torch.zeros_like(x[:, :, :segment_size])
 50 |     for i in range(x.size(0)):
 51 |         idx_str = ids_str[i]
 52 |         idx_end = idx_str + segment_size
 53 |         ret[i] = x[i, :, idx_str:idx_end]
 54 |     return ret
 55 | 
 56 | 
 57 | def rand_slice_segments(x, x_lengths=None, segment_size=4):
 58 |     b, d, t = x.size()
 59 |     if x_lengths is None:
 60 |         x_lengths = t
 61 |     ids_str_max = x_lengths - segment_size + 1
 62 |     ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
 63 |     ret = slice_segments(x, ids_str, segment_size)
 64 |     return ret, ids_str
 65 | 
 66 | 
 67 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
 68 |     position = torch.arange(length, dtype=torch.float)
 69 |     num_timescales = channels // 2
 70 |     log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
 71 |         num_timescales - 1
 72 |     )
 73 |     inv_timescales = min_timescale * torch.exp(
 74 |         torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
 75 |     )
 76 |     scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
 77 |     signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
 78 |     signal = F.pad(signal, [0, 0, 0, channels % 2])
 79 |     signal = signal.view(1, channels, length)
 80 |     return signal
 81 | 
 82 | 
 83 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
 84 |     b, channels, length = x.size()
 85 |     signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
 86 |     return x + signal.to(dtype=x.dtype, device=x.device)
 87 | 
 88 | 
 89 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
 90 |     b, channels, length = x.size()
 91 |     signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
 92 |     return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
 93 | 
 94 | 
 95 | def subsequent_mask(length):
 96 |     mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
 97 |     return mask
 98 | 
 99 | 
100 | @torch.jit.script
101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
102 |     n_channels_int = n_channels[0]
103 |     in_act = input_a + input_b
104 |     t_act = torch.tanh(in_act[:, :n_channels_int, :])
105 |     s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
106 |     acts = t_act * s_act
107 |     return acts
108 | 
109 | 
110 | def convert_pad_shape(pad_shape):
111 |     l = pad_shape[::-1]
112 |     pad_shape = [item for sublist in l for item in sublist]
113 |     return pad_shape
114 | 
115 | 
116 | def shift_1d(x):
117 |     x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
118 |     return x
119 | 
120 | 
121 | def sequence_mask(length, max_length=None):
122 |     if max_length is None:
123 |         max_length = length.max()
124 |     x = torch.arange(max_length, dtype=length.dtype, device=length.device)
125 |     return x.unsqueeze(0) < length.unsqueeze(1)
126 | 
127 | 
128 | def generate_path(duration, mask):
129 |     """
130 |     duration: [b, 1, t_x]
131 |     mask: [b, 1, t_y, t_x]
132 |     """
133 |     device = duration.device
134 | 
135 |     b, _, t_y, t_x = mask.shape
136 |     cum_duration = torch.cumsum(duration, -1)
137 | 
138 |     cum_duration_flat = cum_duration.view(b * t_x)
139 |     path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
140 |     path = path.view(b, t_x, t_y)
141 |     path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
142 |     path = path.unsqueeze(1).transpose(2, 3) * mask
143 |     return path
144 | 
145 | 
146 | def clip_grad_value_(parameters, clip_value, norm_type=2):
147 |     if isinstance(parameters, torch.Tensor):
148 |         parameters = [parameters]
149 |     parameters = list(filter(lambda p: p.grad is not None, parameters))
150 |     norm_type = float(norm_type)
151 |     if clip_value is not None:
152 |         clip_value = float(clip_value)
153 | 
154 |     total_norm = 0
155 |     for p in parameters:
156 |         param_norm = p.grad.data.norm(norm_type)
157 |         total_norm += param_norm.item() ** norm_type
158 |         if clip_value is not None:
159 |             p.grad.data.clamp_(min=-clip_value, max=clip_value)
160 |     total_norm = total_norm ** (1.0 / norm_type)
161 |     return total_norm
162 | 
163 | 
164 | def squeeze(x, x_mask=None, n_sqz=2):
165 |     b, c, t = x.size()
166 | 
167 |     t = (t // n_sqz) * n_sqz
168 |     x = x[:, :, :t]
169 |     x_sqz = x.view(b, c, t // n_sqz, n_sqz)
170 |     x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz)
171 | 
172 |     if x_mask is not None:
173 |         x_mask = x_mask[:, :, n_sqz - 1 :: n_sqz]
174 |     else:
175 |         x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype)
176 |     return x_sqz * x_mask, x_mask
177 | 
178 | 
179 | def unsqueeze(x, x_mask=None, n_sqz=2):
180 |     b, c, t = x.size()
181 | 
182 |     x_unsqz = x.view(b, n_sqz, c // n_sqz, t)
183 |     x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz)
184 | 
185 |     if x_mask is not None:
186 |         x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz)
187 |     else:
188 |         x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype)
189 |     return x_unsqz * x_mask, x_mask
190 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/module/losses.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | 
 3 | import torch
 4 | from torch.nn import functional as F
 5 | 
 6 | 
 7 | def feature_loss(fmap_r, fmap_g):
 8 |     loss = 0
 9 |     for dr, dg in zip(fmap_r, fmap_g):
10 |         for rl, gl in zip(dr, dg):
11 |             rl = rl.float().detach()
12 |             gl = gl.float()
13 |             loss += torch.mean(torch.abs(rl - gl))
14 | 
15 |     return loss * 2
16 | 
17 | 
18 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
19 |     loss = 0
20 |     r_losses = []
21 |     g_losses = []
22 |     for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
23 |         dr = dr.float()
24 |         dg = dg.float()
25 |         r_loss = torch.mean((1 - dr) ** 2)
26 |         g_loss = torch.mean(dg**2)
27 |         loss += r_loss + g_loss
28 |         r_losses.append(r_loss.item())
29 |         g_losses.append(g_loss.item())
30 | 
31 |     return loss, r_losses, g_losses
32 | 
33 | 
34 | def generator_loss(disc_outputs):
35 |     loss = 0
36 |     gen_losses = []
37 |     for dg in disc_outputs:
38 |         dg = dg.float()
39 |         l = torch.mean((1 - dg) ** 2)
40 |         gen_losses.append(l)
41 |         loss += l
42 | 
43 |     return loss, gen_losses
44 | 
45 | 
46 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
47 |     """
48 |     z_p, logs_q: [b, h, t_t]
49 |     m_p, logs_p: [b, h, t_t]
50 |     """
51 |     z_p = z_p.float()
52 |     logs_q = logs_q.float()
53 |     m_p = m_p.float()
54 |     logs_p = logs_p.float()
55 |     z_mask = z_mask.float()
56 | 
57 |     kl = logs_p - logs_q - 0.5
58 |     kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
59 |     kl = torch.sum(kl * z_mask)
60 |     l = kl / torch.sum(z_mask)
61 |     return l
62 | 
63 | 
64 | def mle_loss(z, m, logs, logdet, mask):
65 |     l = torch.sum(logs) + 0.5 * torch.sum(
66 |         torch.exp(-2 * logs) * ((z - m) ** 2)
67 |     )  # neg normal likelihood w/o the constant term
68 |     l = l - torch.sum(logdet)  # log jacobian determinant
69 |     l = l / torch.sum(
70 |         torch.ones_like(z) * mask
71 |     )  # averaging across batch, channel and time axes
72 |     l = l + 0.5 * math.log(2 * math.pi)  # add the remaining constant term
73 |     return l
74 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/module/mel_processing.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import os
  3 | import random
  4 | import torch
  5 | from torch import nn
  6 | import torch.nn.functional as F
  7 | import torch.utils.data
  8 | import numpy as np
  9 | import librosa
 10 | import librosa.util as librosa_util
 11 | from librosa.util import normalize, pad_center, tiny
 12 | from scipy.signal import get_window
 13 | from scipy.io.wavfile import read
 14 | from librosa.filters import mel as librosa_mel_fn
 15 | 
 16 | MAX_WAV_VALUE = 32768.0
 17 | 
 18 | 
 19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
 20 |     """
 21 |     PARAMS
 22 |     ------
 23 |     C: compression factor
 24 |     """
 25 |     return torch.log(torch.clamp(x, min=clip_val) * C)
 26 | 
 27 | 
 28 | def dynamic_range_decompression_torch(x, C=1):
 29 |     """
 30 |     PARAMS
 31 |     ------
 32 |     C: compression factor used to compress
 33 |     """
 34 |     return torch.exp(x) / C
 35 | 
 36 | 
 37 | def spectral_normalize_torch(magnitudes):
 38 |     output = dynamic_range_compression_torch(magnitudes)
 39 |     return output
 40 | 
 41 | 
 42 | def spectral_de_normalize_torch(magnitudes):
 43 |     output = dynamic_range_decompression_torch(magnitudes)
 44 |     return output
 45 | 
 46 | 
 47 | mel_basis = {}
 48 | hann_window = {}
 49 | 
 50 | 
 51 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
 52 |     if torch.min(y) < -1.0:
 53 |         print("min value is ", torch.min(y))
 54 |     if torch.max(y) > 1.0:
 55 |         print("max value is ", torch.max(y))
 56 | 
 57 |     global hann_window
 58 |     dtype_device = str(y.dtype) + "_" + str(y.device)
 59 |     wnsize_dtype_device = str(win_size) + "_" + dtype_device
 60 |     if wnsize_dtype_device not in hann_window:
 61 |         hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
 62 |             dtype=y.dtype, device=y.device
 63 |         )
 64 | 
 65 |     y = torch.nn.functional.pad(
 66 |         y.unsqueeze(1),
 67 |         (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
 68 |         mode="reflect",
 69 |     )
 70 |     y = y.squeeze(1)
 71 |     spec = torch.stft(
 72 |         y,
 73 |         n_fft,
 74 |         hop_length=hop_size,
 75 |         win_length=win_size,
 76 |         window=hann_window[wnsize_dtype_device],
 77 |         center=center,
 78 |         pad_mode="reflect",
 79 |         normalized=False,
 80 |         onesided=True,
 81 |         return_complex=False,
 82 |     )
 83 | 
 84 |     spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
 85 |     return spec
 86 | 
 87 | 
 88 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
 89 |     global mel_basis
 90 |     dtype_device = str(spec.dtype) + "_" + str(spec.device)
 91 |     fmax_dtype_device = str(fmax) + "_" + dtype_device
 92 |     if fmax_dtype_device not in mel_basis:
 93 |         mel = librosa_mel_fn(
 94 |             sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
 95 |         )
 96 |         mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
 97 |             dtype=spec.dtype, device=spec.device
 98 |         )
 99 |     spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
100 |     spec = spectral_normalize_torch(spec)
101 |     return spec
102 | 
103 | 
104 | def mel_spectrogram_torch(
105 |     y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
106 | ):
107 |     if torch.min(y) < -1.0:
108 |         print("min value is ", torch.min(y))
109 |     if torch.max(y) > 1.0:
110 |         print("max value is ", torch.max(y))
111 | 
112 |     global mel_basis, hann_window
113 |     dtype_device = str(y.dtype) + "_" + str(y.device)
114 |     fmax_dtype_device = str(fmax) + "_" + dtype_device
115 |     wnsize_dtype_device = str(win_size) + "_" + dtype_device
116 |     if fmax_dtype_device not in mel_basis:
117 |         mel = librosa_mel_fn(
118 |             sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
119 |         )
120 |         mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
121 |             dtype=y.dtype, device=y.device
122 |         )
123 |     if wnsize_dtype_device not in hann_window:
124 |         hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
125 |             dtype=y.dtype, device=y.device
126 |         )
127 | 
128 |     y = torch.nn.functional.pad(
129 |         y.unsqueeze(1),
130 |         (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
131 |         mode="reflect",
132 |     )
133 |     y = y.squeeze(1)
134 | 
135 |     spec = torch.stft(
136 |         y,
137 |         n_fft,
138 |         hop_length=hop_size,
139 |         win_length=win_size,
140 |         window=hann_window[wnsize_dtype_device],
141 |         center=center,
142 |         pad_mode="reflect",
143 |         normalized=False,
144 |         onesided=True,
145 |         return_complex=False,
146 |     )
147 | 
148 |     spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
149 | 
150 |     spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
151 |     spec = spectral_normalize_torch(spec)
152 | 
153 |     return spec
154 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/module/mrte_model.py:
--------------------------------------------------------------------------------
  1 | # This is Multi-reference timbre encoder
  2 | 
  3 | import torch
  4 | from torch import nn
  5 | from torch.nn.utils import remove_weight_norm, weight_norm
  6 | from module.attentions import MultiHeadAttention
  7 | 
  8 | 
  9 | class MRTE(nn.Module):
 10 |     def __init__(
 11 |         self,
 12 |         content_enc_channels=192,
 13 |         hidden_size=512,
 14 |         out_channels=192,
 15 |         kernel_size=5,
 16 |         n_heads=4,
 17 |         ge_layer=2,
 18 |     ):
 19 |         super(MRTE, self).__init__()
 20 |         self.cross_attention = MultiHeadAttention(hidden_size, hidden_size, n_heads)
 21 |         self.c_pre = nn.Conv1d(content_enc_channels, hidden_size, 1)
 22 |         self.text_pre = nn.Conv1d(content_enc_channels, hidden_size, 1)
 23 |         self.c_post = nn.Conv1d(hidden_size, out_channels, 1)
 24 | 
 25 |     def forward(self, ssl_enc, ssl_mask, text, text_mask, ge, test=None):
 26 |         if ge == None:
 27 |             ge = 0
 28 |         attn_mask = text_mask.unsqueeze(2) * ssl_mask.unsqueeze(-1)
 29 | 
 30 |         ssl_enc = self.c_pre(ssl_enc * ssl_mask)
 31 |         text_enc = self.text_pre(text * text_mask)
 32 |         if test != None:
 33 |             if test == 0:
 34 |                 x = (
 35 |                     self.cross_attention(
 36 |                         ssl_enc * ssl_mask, text_enc * text_mask, attn_mask
 37 |                     )
 38 |                     + ssl_enc
 39 |                     + ge
 40 |                 )
 41 |             elif test == 1:
 42 |                 x = ssl_enc + ge
 43 |             elif test == 2:
 44 |                 x = (
 45 |                     self.cross_attention(
 46 |                         ssl_enc * 0 * ssl_mask, text_enc * text_mask, attn_mask
 47 |                     )
 48 |                     + ge
 49 |                 )
 50 |             else:
 51 |                 raise ValueError("test should be 0,1,2")
 52 |         else:
 53 |             x = (
 54 |                 self.cross_attention(
 55 |                     ssl_enc * ssl_mask, text_enc * text_mask, attn_mask
 56 |                 )
 57 |                 + ssl_enc
 58 |                 + ge
 59 |             )
 60 |         x = self.c_post(x * ssl_mask)
 61 |         return x
 62 | 
 63 | 
 64 | class SpeakerEncoder(torch.nn.Module):
 65 |     def __init__(
 66 |         self,
 67 |         mel_n_channels=80,
 68 |         model_num_layers=2,
 69 |         model_hidden_size=256,
 70 |         model_embedding_size=256,
 71 |     ):
 72 |         super(SpeakerEncoder, self).__init__()
 73 |         self.lstm = nn.LSTM(
 74 |             mel_n_channels, model_hidden_size, model_num_layers, batch_first=True
 75 |         )
 76 |         self.linear = nn.Linear(model_hidden_size, model_embedding_size)
 77 |         self.relu = nn.ReLU()
 78 | 
 79 |     def forward(self, mels):
 80 |         self.lstm.flatten_parameters()
 81 |         _, (hidden, _) = self.lstm(mels.transpose(-1, -2))
 82 |         embeds_raw = self.relu(self.linear(hidden[-1]))
 83 |         return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
 84 | 
 85 | 
 86 | class MELEncoder(nn.Module):
 87 |     def __init__(
 88 |         self,
 89 |         in_channels,
 90 |         out_channels,
 91 |         hidden_channels,
 92 |         kernel_size,
 93 |         dilation_rate,
 94 |         n_layers,
 95 |     ):
 96 |         super().__init__()
 97 |         self.in_channels = in_channels
 98 |         self.out_channels = out_channels
 99 |         self.hidden_channels = hidden_channels
100 |         self.kernel_size = kernel_size
101 |         self.dilation_rate = dilation_rate
102 |         self.n_layers = n_layers
103 | 
104 |         self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
105 |         self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers)
106 |         self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
107 | 
108 |     def forward(self, x):
109 |         # print(x.shape,x_lengths.shape)
110 |         x = self.pre(x)
111 |         x = self.enc(x)
112 |         x = self.proj(x)
113 |         return x
114 | 
115 | 
116 | class WN(torch.nn.Module):
117 |     def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers):
118 |         super(WN, self).__init__()
119 |         assert kernel_size % 2 == 1
120 |         self.hidden_channels = hidden_channels
121 |         self.kernel_size = kernel_size
122 |         self.dilation_rate = dilation_rate
123 |         self.n_layers = n_layers
124 | 
125 |         self.in_layers = torch.nn.ModuleList()
126 |         self.res_skip_layers = torch.nn.ModuleList()
127 | 
128 |         for i in range(n_layers):
129 |             dilation = dilation_rate**i
130 |             padding = int((kernel_size * dilation - dilation) / 2)
131 |             in_layer = nn.Conv1d(
132 |                 hidden_channels,
133 |                 2 * hidden_channels,
134 |                 kernel_size,
135 |                 dilation=dilation,
136 |                 padding=padding,
137 |             )
138 |             in_layer = weight_norm(in_layer)
139 |             self.in_layers.append(in_layer)
140 | 
141 |             # last one is not necessary
142 |             if i < n_layers - 1:
143 |                 res_skip_channels = 2 * hidden_channels
144 |             else:
145 |                 res_skip_channels = hidden_channels
146 | 
147 |             res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
148 |             res_skip_layer = weight_norm(res_skip_layer, name="weight")
149 |             self.res_skip_layers.append(res_skip_layer)
150 | 
151 |     def forward(self, x):
152 |         output = torch.zeros_like(x)
153 |         n_channels_tensor = torch.IntTensor([self.hidden_channels])
154 | 
155 |         for i in range(self.n_layers):
156 |             x_in = self.in_layers[i](x)
157 | 
158 |             acts = fused_add_tanh_sigmoid_multiply(x_in, n_channels_tensor)
159 | 
160 |             res_skip_acts = self.res_skip_layers[i](acts)
161 |             if i < self.n_layers - 1:
162 |                 res_acts = res_skip_acts[:, : self.hidden_channels, :]
163 |                 x = x + res_acts
164 |                 output = output + res_skip_acts[:, self.hidden_channels :, :]
165 |             else:
166 |                 output = output + res_skip_acts
167 |         return output
168 | 
169 |     def remove_weight_norm(self):
170 |         for l in self.in_layers:
171 |             remove_weight_norm(l)
172 |         for l in self.res_skip_layers:
173 |             remove_weight_norm(l)
174 | 
175 | 
176 | @torch.jit.script
177 | def fused_add_tanh_sigmoid_multiply(input, n_channels):
178 |     n_channels_int = n_channels[0]
179 |     t_act = torch.tanh(input[:, :n_channels_int, :])
180 |     s_act = torch.sigmoid(input[:, n_channels_int:, :])
181 |     acts = t_act * s_act
182 |     return acts
183 | 
184 | 
185 | if __name__ == "__main__":
186 |     content_enc = torch.randn(3, 192, 100)
187 |     content_mask = torch.ones(3, 1, 100)
188 |     ref_mel = torch.randn(3, 128, 30)
189 |     ref_mask = torch.ones(3, 1, 30)
190 |     model = MRTE()
191 |     out = model(content_enc, content_mask, ref_mel, ref_mask)
192 |     print(out.shape)
193 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/module/quantize.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
  2 | # All rights reserved.
  3 | #
  4 | # This source code is licensed under the license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | """Residual vector quantizer implementation."""
  8 | 
  9 | from dataclasses import dataclass, field
 10 | import math
 11 | import typing as tp
 12 | 
 13 | import torch
 14 | from torch import nn
 15 | 
 16 | from module.core_vq import ResidualVectorQuantization
 17 | 
 18 | 
 19 | @dataclass
 20 | class QuantizedResult:
 21 |     quantized: torch.Tensor
 22 |     codes: torch.Tensor
 23 |     bandwidth: torch.Tensor  # bandwidth in kb/s used, per batch item.
 24 |     penalty: tp.Optional[torch.Tensor] = None
 25 |     metrics: dict = field(default_factory=dict)
 26 | 
 27 | 
 28 | class ResidualVectorQuantizer(nn.Module):
 29 |     """Residual Vector Quantizer.
 30 |     Args:
 31 |         dimension (int): Dimension of the codebooks.
 32 |         n_q (int): Number of residual vector quantizers used.
 33 |         bins (int): Codebook size.
 34 |         decay (float): Decay for exponential moving average over the codebooks.
 35 |         kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
 36 |         kmeans_iters (int): Number of iterations used for kmeans initialization.
 37 |         threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
 38 |             that have an exponential moving average cluster size less than the specified threshold with
 39 |             randomly selected vector from the current batch.
 40 |     """
 41 | 
 42 |     def __init__(
 43 |         self,
 44 |         dimension: int = 256,
 45 |         n_q: int = 8,
 46 |         bins: int = 1024,
 47 |         decay: float = 0.99,
 48 |         kmeans_init: bool = True,
 49 |         kmeans_iters: int = 50,
 50 |         threshold_ema_dead_code: int = 2,
 51 |     ):
 52 |         super().__init__()
 53 |         self.n_q = n_q
 54 |         self.dimension = dimension
 55 |         self.bins = bins
 56 |         self.decay = decay
 57 |         self.kmeans_init = kmeans_init
 58 |         self.kmeans_iters = kmeans_iters
 59 |         self.threshold_ema_dead_code = threshold_ema_dead_code
 60 |         self.vq = ResidualVectorQuantization(
 61 |             dim=self.dimension,
 62 |             codebook_size=self.bins,
 63 |             num_quantizers=self.n_q,
 64 |             decay=self.decay,
 65 |             kmeans_init=self.kmeans_init,
 66 |             kmeans_iters=self.kmeans_iters,
 67 |             threshold_ema_dead_code=self.threshold_ema_dead_code,
 68 |         )
 69 | 
 70 |     def forward(
 71 |         self,
 72 |         x: torch.Tensor,
 73 |         n_q: tp.Optional[int] = None,
 74 |         layers: tp.Optional[list] = None,
 75 |     ) -> QuantizedResult:
 76 |         """Residual vector quantization on the given input tensor.
 77 |         Args:
 78 |             x (torch.Tensor): Input tensor.
 79 |             n_q (int): Number of quantizer used to quantize. Default: All quantizers.
 80 |             layers (list): Layer that need to return quantized. Defalt: None.
 81 |         Returns:
 82 |             QuantizedResult:
 83 |                 The quantized (or approximately quantized) representation with
 84 |                 the associated numbert quantizers and layer quantized required to return.
 85 |         """
 86 |         n_q = n_q if n_q else self.n_q
 87 |         if layers and max(layers) >= n_q:
 88 |             raise ValueError(
 89 |                 f"Last layer index in layers: A {max(layers)}. Number of quantizers in RVQ: B {self.n_q}. A must less than B."
 90 |             )
 91 |         quantized, codes, commit_loss, quantized_list = self.vq(
 92 |             x, n_q=n_q, layers=layers
 93 |         )
 94 |         return quantized, codes, torch.mean(commit_loss), quantized_list
 95 | 
 96 |     def encode(
 97 |         self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None
 98 |     ) -> torch.Tensor:
 99 |         """Encode a given input tensor with the specified sample rate at the given bandwidth.
100 |         The RVQ encode method sets the appropriate number of quantizer to use
101 |         and returns indices for each quantizer.
102 |         Args:
103 |             x (torch.Tensor): Input tensor.
104 |             n_q (int): Number of quantizer used to quantize. Default: All quantizers.
105 |             st (int): Start to encode input from which layers. Default: 0.
106 |         """
107 |         n_q = n_q if n_q else self.n_q
108 |         st = st or 0
109 |         codes = self.vq.encode(x, n_q=n_q, st=st)
110 |         return codes
111 | 
112 |     def decode(self, codes: torch.Tensor, st: int = 0) -> torch.Tensor:
113 |         """Decode the given codes to the quantized representation.
114 |         Args:
115 |             codes (torch.Tensor): Input indices for each quantizer.
116 |             st (int): Start to decode input codes from which layers. Default: 0.
117 |         """
118 |         quantized = self.vq.decode(codes, st=st)
119 |         return quantized
120 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/my_utils.py:
--------------------------------------------------------------------------------
 1 | import ffmpeg
 2 | import numpy as np
 3 | 
 4 | 
 5 | def load_audio(file, sr):
 6 |     try:
 7 |         # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
 8 |         # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
 9 |         # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
10 |         file = (
11 |             file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
12 |         )  # 防止小白拷路径头尾带了空格和"和回车
13 |         out, _ = (
14 |             ffmpeg.input(file, threads=0)
15 |             .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
16 |             .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
17 |         )
18 |     except Exception as e:
19 |         raise RuntimeError(f"Failed to load audio: {e}")
20 | 
21 |     return np.frombuffer(out, np.float32).flatten()
22 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/prepare_datasets/1-get-text.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import os
  4 | 
  5 | inp_text = os.environ.get("inp_text")
  6 | inp_wav_dir = os.environ.get("inp_wav_dir")
  7 | exp_name = os.environ.get("exp_name")
  8 | i_part = os.environ.get("i_part")
  9 | all_parts = os.environ.get("all_parts")
 10 | os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES")
 11 | opt_dir = os.environ.get("opt_dir")
 12 | bert_pretrained_dir = os.environ.get("bert_pretrained_dir")
 13 | is_half = eval(os.environ.get("is_half", "True"))
 14 | import sys, numpy as np, traceback, pdb
 15 | import os.path
 16 | from glob import glob
 17 | from tqdm import tqdm
 18 | from text.cleaner import clean_text
 19 | import torch
 20 | from transformers import AutoModelForMaskedLM, AutoTokenizer
 21 | import numpy as np
 22 | 
 23 | # inp_text=sys.argv[1]
 24 | # inp_wav_dir=sys.argv[2]
 25 | # exp_name=sys.argv[3]
 26 | # i_part=sys.argv[4]
 27 | # all_parts=sys.argv[5]
 28 | # os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]#i_gpu
 29 | # opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
 30 | # bert_pretrained_dir="/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large"
 31 | 
 32 | from time import time as ttime
 33 | import shutil
 34 | 
 35 | 
 36 | def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path
 37 |     dir=os.path.dirname(path)
 38 |     name=os.path.basename(path)
 39 |     # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part)
 40 |     tmp_path="%s%s.pth"%(ttime(),i_part)
 41 |     torch.save(fea,tmp_path)
 42 |     shutil.move(tmp_path,"%s/%s"%(dir,name))
 43 | 
 44 | 
 45 | txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
 46 | if os.path.exists(txt_path) == False:
 47 |     bert_dir = "%s/3-bert" % (opt_dir)
 48 |     os.makedirs(opt_dir, exist_ok=True)
 49 |     os.makedirs(bert_dir, exist_ok=True)
 50 |     if torch.cuda.is_available():
 51 |         device = "cuda:0"
 52 |     # elif torch.backends.mps.is_available():
 53 |     #     device = "mps"
 54 |     else:
 55 |         device = "cpu"
 56 |     tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir)
 57 |     bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir)
 58 |     if is_half == True:
 59 |         bert_model = bert_model.half().to(device)
 60 |     else:
 61 |         bert_model = bert_model.to(device)
 62 | 
 63 |     def get_bert_feature(text, word2ph):
 64 |         with torch.no_grad():
 65 |             inputs = tokenizer(text, return_tensors="pt")
 66 |             for i in inputs:
 67 |                 inputs[i] = inputs[i].to(device)
 68 |             res = bert_model(**inputs, output_hidden_states=True)
 69 |             res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
 70 | 
 71 |         assert len(word2ph) == len(text)
 72 |         phone_level_feature = []
 73 |         for i in range(len(word2ph)):
 74 |             repeat_feature = res[i].repeat(word2ph[i], 1)
 75 |             phone_level_feature.append(repeat_feature)
 76 | 
 77 |         phone_level_feature = torch.cat(phone_level_feature, dim=0)
 78 | 
 79 |         return phone_level_feature.T
 80 | 
 81 |     def process(data, res):
 82 |         for name, text, lan in data:
 83 |             try:
 84 |                 name = os.path.basename(name)
 85 |                 phones, word2ph, norm_text = clean_text(
 86 |                     text.replace("%", "-").replace("￥", ","), lan
 87 |                 )
 88 |                 path_bert = "%s/%s.pt" % (bert_dir, name)
 89 |                 if os.path.exists(path_bert) == False and lan == "zh":
 90 |                     bert_feature = get_bert_feature(norm_text, word2ph)
 91 |                     assert bert_feature.shape[-1] == len(phones)
 92 |                     # torch.save(bert_feature, path_bert)
 93 |                     my_save(bert_feature, path_bert)
 94 |                 phones = " ".join(phones)
 95 |                 # res.append([name,phones])
 96 |                 res.append([name, phones, word2ph, norm_text])
 97 |             except:
 98 |                 print(name, text, traceback.format_exc())
 99 | 
100 |     todo = []
101 |     res = []
102 |     with open(inp_text, "r", encoding="utf8") as f:
103 |         lines = f.read().strip("\n").split("\n")
104 | 
105 |     language_v1_to_language_v2 = {
106 |         "ZH": "zh",
107 |         "zh": "zh",
108 |         "JP": "ja",
109 |         "jp": "ja",
110 |         "JA": "ja",
111 |         "ja": "ja",
112 |         "EN": "en",
113 |         "en": "en",
114 |         "En": "en",
115 |     }
116 |     for line in lines[int(i_part) :: int(all_parts)]:
117 |         try:
118 |             wav_name, spk_name, language, text = line.split("|")
119 |             # todo.append([name,text,"zh"])
120 |             todo.append(
121 |                 [wav_name, text, language_v1_to_language_v2.get(language, language)]
122 |             )
123 |         except:
124 |             print(line, traceback.format_exc())
125 | 
126 |     process(todo, res)
127 |     opt = []
128 |     for name, phones, word2ph, norm_text in res:
129 |         opt.append("%s\t%s\t%s\t%s" % (name, phones, word2ph, norm_text))
130 |     with open(txt_path, "w", encoding="utf8") as f:
131 |         f.write("\n".join(opt) + "\n")
132 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import sys,os
  4 | inp_text=                           os.environ.get("inp_text")
  5 | inp_wav_dir=                        os.environ.get("inp_wav_dir")
  6 | exp_name=                           os.environ.get("exp_name")
  7 | i_part=                             os.environ.get("i_part")
  8 | all_parts=                          os.environ.get("all_parts")
  9 | os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES")
 10 | from feature_extractor import cnhubert
 11 | opt_dir=                            os.environ.get("opt_dir")
 12 | cnhubert.cnhubert_base_path=                os.environ.get("cnhubert_base_dir")
 13 | is_half=eval(os.environ.get("is_half","True"))
 14 | 
 15 | import pdb,traceback,numpy as np,logging
 16 | from scipy.io import wavfile
 17 | import librosa,torch
 18 | now_dir = os.getcwd()
 19 | sys.path.append(now_dir)
 20 | from my_utils import load_audio
 21 | 
 22 | # from config import cnhubert_base_path
 23 | # cnhubert.cnhubert_base_path=cnhubert_base_path
 24 | # inp_text=sys.argv[1]
 25 | # inp_wav_dir=sys.argv[2]
 26 | # exp_name=sys.argv[3]
 27 | # i_part=sys.argv[4]
 28 | # all_parts=sys.argv[5]
 29 | # os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]
 30 | # cnhubert.cnhubert_base_path=sys.argv[7]
 31 | # opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
 32 | 
 33 | from time import time as ttime
 34 | import shutil
 35 | def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path
 36 |     dir=os.path.dirname(path)
 37 |     name=os.path.basename(path)
 38 |     # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part)
 39 |     tmp_path="%s%s.pth"%(ttime(),i_part)
 40 |     torch.save(fea,tmp_path)
 41 |     shutil.move(tmp_path,"%s/%s"%(dir,name))
 42 | 
 43 | hubert_dir="%s/4-cnhubert"%(opt_dir)
 44 | wav32dir="%s/5-wav32k"%(opt_dir)
 45 | os.makedirs(opt_dir,exist_ok=True)
 46 | os.makedirs(hubert_dir,exist_ok=True)
 47 | os.makedirs(wav32dir,exist_ok=True)
 48 | 
 49 | maxx=0.95
 50 | alpha=0.5
 51 | if torch.cuda.is_available():
 52 |     device = "cuda:0"
 53 | # elif torch.backends.mps.is_available():
 54 | #     device = "mps"
 55 | else:
 56 |     device = "cpu"
 57 | model=cnhubert.get_model()
 58 | # is_half=False
 59 | if(is_half==True):
 60 |     model=model.half().to(device)
 61 | else:
 62 |     model = model.to(device)
 63 | 
 64 | nan_fails=[]
 65 | def name2go(wav_name,wav_path):
 66 |     hubert_path="%s/%s.pt"%(hubert_dir,wav_name)
 67 |     if(os.path.exists(hubert_path)):return
 68 |     tmp_audio = load_audio(wav_path, 32000)
 69 |     tmp_max = np.abs(tmp_audio).max()
 70 |     if tmp_max > 2.2:
 71 |         print("%s-filtered,%s" % (wav_name, tmp_max))
 72 |         return
 73 |     tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha*32768)) + ((1 - alpha)*32768) * tmp_audio
 74 |     tmp_audio32b = (tmp_audio / tmp_max * (maxx * alpha*1145.14)) + ((1 - alpha)*1145.14) * tmp_audio
 75 |     tmp_audio = librosa.resample(
 76 |         tmp_audio32b, orig_sr=32000, target_sr=16000
 77 |     )#不是重采样问题
 78 |     tensor_wav16 = torch.from_numpy(tmp_audio)
 79 |     if (is_half == True):
 80 |         tensor_wav16=tensor_wav16.half().to(device)
 81 |     else:
 82 |         tensor_wav16 = tensor_wav16.to(device)
 83 |     ssl=model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1,2).cpu()#torch.Size([1, 768, 215])
 84 |     if np.isnan(ssl.detach().numpy()).sum()!= 0:
 85 |         nan_fails.append(wav_name)
 86 |         print("nan filtered:%s"%wav_name)
 87 |         return
 88 |     wavfile.write(
 89 |         "%s/%s"%(wav32dir,wav_name),
 90 |         32000,
 91 |         tmp_audio32.astype("int16"),
 92 |     )
 93 |     my_save(ssl,hubert_path )
 94 | 
 95 | with open(inp_text,"r",encoding="utf8")as f:
 96 |     lines=f.read().strip("\n").split("\n")
 97 | 
 98 | for line in lines[int(i_part)::int(all_parts)]:
 99 |     try:
100 |         # wav_name,text=line.split("\t")
101 |         wav_name, spk_name, language, text = line.split("|")
102 |         if (inp_wav_dir != "" and inp_wav_dir != None):
103 |             wav_name = os.path.basename(wav_name)
104 |             wav_path = "%s/%s"%(inp_wav_dir, wav_name)
105 | 
106 |         else:
107 |             wav_path=wav_name
108 |             wav_name = os.path.basename(wav_name)
109 |         name2go(wav_name,wav_path)
110 |     except:
111 |         print(line,traceback.format_exc())
112 | 
113 | if(len(nan_fails)>0 and is_half==True):
114 |     is_half=False
115 |     model=model.float()
116 |     for wav_name in nan_fails:
117 |         try:
118 |             name2go(wav_name)
119 |         except:
120 |             print(wav_name,traceback.format_exc())
121 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/prepare_datasets/3-get-semantic.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | inp_text = os.environ.get("inp_text")
 4 | exp_name = os.environ.get("exp_name")
 5 | i_part = os.environ.get("i_part")
 6 | all_parts = os.environ.get("all_parts")
 7 | os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES")
 8 | opt_dir = os.environ.get("opt_dir")
 9 | pretrained_s2G = os.environ.get("pretrained_s2G")
10 | s2config_path = os.environ.get("s2config_path")
11 | is_half = eval(os.environ.get("is_half", "True"))
12 | import math, traceback
13 | import multiprocessing
14 | import sys, pdb
15 | 
16 | now_dir = os.getcwd()
17 | sys.path.append(now_dir)
18 | from random import shuffle
19 | import torch.multiprocessing as mp
20 | from glob import glob
21 | from tqdm import tqdm
22 | import logging, librosa, utils, torch
23 | from module.models import SynthesizerTrn
24 | 
25 | logging.getLogger("numba").setLevel(logging.WARNING)
26 | # from config import pretrained_s2G
27 | 
28 | # inp_text=sys.argv[1]
29 | # exp_name=sys.argv[2]
30 | # i_part=sys.argv[3]
31 | # all_parts=sys.argv[4]
32 | # os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[5]
33 | # opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
34 | 
35 | 
36 | hubert_dir = "%s/4-cnhubert" % (opt_dir)
37 | semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
38 | if os.path.exists(semantic_path) == False:
39 |     os.makedirs(opt_dir, exist_ok=True)
40 | 
41 |     if torch.cuda.is_available():
42 |         device = "cuda"
43 |     # elif torch.backends.mps.is_available():
44 |     #     device = "mps"
45 |     else:
46 |         device = "cpu"
47 |     hps = utils.get_hparams_from_file(s2config_path)
48 |     vq_model = SynthesizerTrn(
49 |         hps.data.filter_length // 2 + 1,
50 |         hps.train.segment_size // hps.data.hop_length,
51 |         n_speakers=hps.data.n_speakers,
52 |         **hps.model
53 |     )
54 |     if is_half == True:
55 |         vq_model = vq_model.half().to(device)
56 |     else:
57 |         vq_model = vq_model.to(device)
58 |     vq_model.eval()
59 |     # utils.load_checkpoint(utils.latest_checkpoint_path(hps.s2_ckpt_dir, "G_*.pth"), vq_model, None, True)
60 |     # utils.load_checkpoint(pretrained_s2G, vq_model, None, True)
61 |     print(
62 |         vq_model.load_state_dict(
63 |             torch.load(pretrained_s2G, map_location="cpu")["weight"], strict=False
64 |         )
65 |     )
66 | 
67 |     def name2go(wav_name, lines):
68 |         hubert_path = "%s/%s.pt" % (hubert_dir, wav_name)
69 |         if os.path.exists(hubert_path) == False:
70 |             return
71 |         ssl_content = torch.load(hubert_path, map_location="cpu")
72 |         if is_half == True:
73 |             ssl_content = ssl_content.half().to(device)
74 |         else:
75 |             ssl_content = ssl_content.to(device)
76 |         codes = vq_model.extract_latent(ssl_content)
77 |         semantic = " ".join([str(i) for i in codes[0, 0, :].tolist()])
78 |         lines.append("%s\t%s" % (wav_name, semantic))
79 | 
80 |     with open(inp_text, "r", encoding="utf8") as f:
81 |         lines = f.read().strip("\n").split("\n")
82 | 
83 |     lines1 = []
84 |     for line in lines[int(i_part) :: int(all_parts)]:
85 |         # print(line)
86 |         try:
87 |             # wav_name,text=line.split("\t")
88 |             wav_name, spk_name, language, text = line.split("|")
89 |             wav_name = os.path.basename(wav_name)
90 |             # name2go(name,lines1)
91 |             name2go(wav_name, lines1)
92 |         except:
93 |             print(line, traceback.format_exc())
94 |     with open(semantic_path, "w", encoding="utf8") as f:
95 |         f.write("\n".join(lines1))
96 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/process_ckpt.py:
--------------------------------------------------------------------------------
 1 | import traceback
 2 | from collections import OrderedDict
 3 | from time import time as ttime
 4 | import shutil,os
 5 | import torch
 6 | from tools.i18n.i18n import I18nAuto
 7 | 
 8 | i18n = I18nAuto()
 9 | 
10 | def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path
11 |     dir=os.path.dirname(path)
12 |     name=os.path.basename(path)
13 |     tmp_path="%s.pth"%(ttime())
14 |     torch.save(fea,tmp_path)
15 |     shutil.move(tmp_path,"%s/%s"%(dir,name))
16 | 
17 | def savee(ckpt, name, epoch, steps, hps):
18 |     try:
19 |         opt = OrderedDict()
20 |         opt["weight"] = {}
21 |         for key in ckpt.keys():
22 |             if "enc_q" in key:
23 |                 continue
24 |             opt["weight"][key] = ckpt[key].half()
25 |         opt["config"] = hps
26 |         opt["info"] = "%sepoch_%siteration" % (epoch, steps)
27 |         # torch.save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
28 |         my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
29 |         return "Success."
30 |     except:
31 |         return traceback.format_exc()
32 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/text/__init__.py:
--------------------------------------------------------------------------------
 1 | from text.symbols import *
 2 | 
 3 | 
 4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
 5 | 
 6 | def cleaned_text_to_sequence(cleaned_text):
 7 |   '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
 8 |     Args:
 9 |       text: string to convert to a sequence
10 |     Returns:
11 |       List of integers corresponding to the symbols in the text
12 |   '''
13 |   phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14 |   return phones
15 | 
16 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/text/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/text/__pycache__/chinese.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/__pycache__/chinese.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/text/__pycache__/cleaner.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/__pycache__/cleaner.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/text/__pycache__/english.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/__pycache__/english.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/text/__pycache__/japanese.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/__pycache__/japanese.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/text/__pycache__/symbols.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/__pycache__/symbols.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/text/__pycache__/tone_sandhi.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/__pycache__/tone_sandhi.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/text/chinese.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import pdb
  3 | import re
  4 | 
  5 | import cn2an
  6 | from pypinyin import lazy_pinyin, Style
  7 | 
  8 | from text.symbols import punctuation
  9 | from text.tone_sandhi import ToneSandhi
 10 | from text.zh_normalization.text_normlization import TextNormalizer
 11 | 
 12 | normalizer = lambda x: cn2an.transform(x, "an2cn")
 13 | 
 14 | current_file_path = os.path.dirname(__file__)
 15 | pinyin_to_symbol_map = {
 16 |     line.split("\t")[0]: line.strip().split("\t")[1]
 17 |     for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
 18 | }
 19 | 
 20 | import jieba_fast.posseg as psg
 21 | 
 22 | 
 23 | rep_map = {
 24 |     "：": ",",
 25 |     "；": ",",
 26 |     "，": ",",
 27 |     "。": ".",
 28 |     "！": "!",
 29 |     "？": "?",
 30 |     "\n": ".",
 31 |     "·": ",",
 32 |     "、": ",",
 33 |     "...": "…",
 34 |     "$": ".",
 35 |     "/": ",",
 36 |     "—": "-",
 37 |     "~": "…",
 38 |     "～":"…",
 39 | }
 40 | 
 41 | tone_modifier = ToneSandhi()
 42 | 
 43 | 
 44 | def replace_punctuation(text):
 45 |     text = text.replace("嗯", "恩").replace("呣", "母")
 46 |     pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
 47 | 
 48 |     replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
 49 | 
 50 |     replaced_text = re.sub(
 51 |         r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
 52 |     )
 53 | 
 54 |     return replaced_text
 55 | 
 56 | 
 57 | def g2p(text):
 58 |     pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
 59 |     sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
 60 |     phones, word2ph = _g2p(sentences)
 61 |     return phones, word2ph
 62 | 
 63 | 
 64 | def _get_initials_finals(word):
 65 |     initials = []
 66 |     finals = []
 67 |     orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
 68 |     orig_finals = lazy_pinyin(
 69 |         word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
 70 |     )
 71 |     for c, v in zip(orig_initials, orig_finals):
 72 |         initials.append(c)
 73 |         finals.append(v)
 74 |     return initials, finals
 75 | 
 76 | 
 77 | def _g2p(segments):
 78 |     phones_list = []
 79 |     word2ph = []
 80 |     for seg in segments:
 81 |         pinyins = []
 82 |         # Replace all English words in the sentence
 83 |         seg = re.sub("[a-zA-Z]+", "", seg)
 84 |         seg_cut = psg.lcut(seg)
 85 |         initials = []
 86 |         finals = []
 87 |         seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
 88 |         for word, pos in seg_cut:
 89 |             if pos == "eng":
 90 |                 continue
 91 |             sub_initials, sub_finals = _get_initials_finals(word)
 92 |             sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
 93 |             initials.append(sub_initials)
 94 |             finals.append(sub_finals)
 95 | 
 96 |             # assert len(sub_initials) == len(sub_finals) == len(word)
 97 |         initials = sum(initials, [])
 98 |         finals = sum(finals, [])
 99 |         #
100 |         for c, v in zip(initials, finals):
101 |             raw_pinyin = c + v
102 |             # NOTE: post process for pypinyin outputs
103 |             # we discriminate i, ii and iii
104 |             if c == v:
105 |                 assert c in punctuation
106 |                 phone = [c]
107 |                 word2ph.append(1)
108 |             else:
109 |                 v_without_tone = v[:-1]
110 |                 tone = v[-1]
111 | 
112 |                 pinyin = c + v_without_tone
113 |                 assert tone in "12345"
114 | 
115 |                 if c:
116 |                     # 多音节
117 |                     v_rep_map = {
118 |                         "uei": "ui",
119 |                         "iou": "iu",
120 |                         "uen": "un",
121 |                     }
122 |                     if v_without_tone in v_rep_map.keys():
123 |                         pinyin = c + v_rep_map[v_without_tone]
124 |                 else:
125 |                     # 单音节
126 |                     pinyin_rep_map = {
127 |                         "ing": "ying",
128 |                         "i": "yi",
129 |                         "in": "yin",
130 |                         "u": "wu",
131 |                     }
132 |                     if pinyin in pinyin_rep_map.keys():
133 |                         pinyin = pinyin_rep_map[pinyin]
134 |                     else:
135 |                         single_rep_map = {
136 |                             "v": "yu",
137 |                             "e": "e",
138 |                             "i": "y",
139 |                             "u": "w",
140 |                         }
141 |                         if pinyin[0] in single_rep_map.keys():
142 |                             pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
143 | 
144 |                 assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
145 |                 new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ")
146 |                 new_v = new_v + tone
147 |                 phone = [new_c, new_v]
148 |                 word2ph.append(len(phone))
149 | 
150 |             phones_list += phone
151 |     return phones_list, word2ph
152 | 
153 | 
154 | def text_normalize(text):
155 |     # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
156 |     tx = TextNormalizer()
157 |     sentences = tx.normalize(text)
158 |     dest_text = ""
159 |     for sentence in sentences:
160 |         dest_text += replace_punctuation(sentence)
161 |     return dest_text
162 | 
163 | 
164 | if __name__ == "__main__":
165 |     text = "啊——但是《原神》是由,米哈\游自主，研发的一款全.新开放世界.冒险游戏"
166 |     text = "呣呣呣～就是…大人的鼹鼠党吧？"
167 |     text = "你好"
168 |     text = text_normalize(text)
169 |     print(g2p(text))
170 | 
171 | 
172 | # # 示例用法
173 | # text = "这是一个示例文本：,你好！这是一个测试..."
174 | # print(g2p_paddle(text))  # 输出: 这是一个示例文本你好这是一个测试
175 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/text/cleaner.py:
--------------------------------------------------------------------------------
 1 | from text import chinese, japanese, cleaned_text_to_sequence, symbols, english
 2 | 
 3 | language_module_map = {"zh": chinese, "ja": japanese, "en": english}
 4 | special = [
 5 |     # ("%", "zh", "SP"),
 6 |     ("￥", "zh", "SP2"),
 7 |     ("^", "zh", "SP3"),
 8 |     # ('@', 'zh', "SP4")#不搞鬼畜了，和第二版保持一致吧
 9 | ]
10 | 
11 | 
12 | def clean_text(text, language):
13 |     if(language not in language_module_map):
14 |         language="en"
15 |         text=" "
16 |     for special_s, special_l, target_symbol in special:
17 |         if special_s in text and language == special_l:
18 |             return clean_special(text, language, special_s, target_symbol)
19 |     language_module = language_module_map[language]
20 |     norm_text = language_module.text_normalize(text)
21 |     if language == "zh":
22 |         phones, word2ph = language_module.g2p(norm_text)
23 |         assert len(phones) == sum(word2ph)
24 |         assert len(norm_text) == len(word2ph)
25 |     else:
26 |         phones = language_module.g2p(norm_text)
27 |         word2ph = None
28 | 
29 |     for ph in phones:
30 |         assert ph in symbols
31 |     return phones, word2ph, norm_text
32 | 
33 | 
34 | def clean_special(text, language, special_s, target_symbol):
35 |     """
36 |     特殊静音段sp符号处理
37 |     """
38 |     text = text.replace(special_s, ",")
39 |     language_module = language_module_map[language]
40 |     norm_text = language_module.text_normalize(text)
41 |     phones = language_module.g2p(norm_text)
42 |     new_ph = []
43 |     for ph in phones[0]:
44 |         assert ph in symbols
45 |         if ph == ",":
46 |             new_ph.append(target_symbol)
47 |         else:
48 |             new_ph.append(ph)
49 |     return new_ph, phones[1], norm_text
50 | 
51 | 
52 | def text_to_sequence(text, language):
53 |     phones = clean_text(text)
54 |     return cleaned_text_to_sequence(phones)
55 | 
56 | 
57 | if __name__ == "__main__":
58 |     print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh"))
59 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/text/engdict-hot.rep:
--------------------------------------------------------------------------------
1 | CHATGPT CH AE1 T JH IY1 P IY1 T IY1
2 | JSON JH EY1 S AH0 N


--------------------------------------------------------------------------------
/GPT_SoVITS/text/engdict_cache.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/engdict_cache.pickle


--------------------------------------------------------------------------------
/GPT_SoVITS/text/japanese.py:
--------------------------------------------------------------------------------
  1 | # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
  2 | import re
  3 | import sys
  4 | 
  5 | import pyopenjtalk
  6 | 
  7 | 
  8 | from text import symbols
  9 | # Regular expression matching Japanese without punctuation marks:
 10 | _japanese_characters = re.compile(
 11 |     r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
 12 | )
 13 | 
 14 | # Regular expression matching non-Japanese characters or punctuation marks:
 15 | _japanese_marks = re.compile(
 16 |     r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
 17 | )
 18 | 
 19 | # List of (symbol, Japanese) pairs for marks:
 20 | _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("％", "パーセント")]]
 21 | 
 22 | 
 23 | # List of (consonant, sokuon) pairs:
 24 | _real_sokuon = [
 25 |     (re.compile("%s" % x[0]), x[1])
 26 |     for x in [
 27 |         (r"Q([↑↓]*[kg])", r"k#\1"),
 28 |         (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
 29 |         (r"Q([↑↓]*[sʃ])", r"s\1"),
 30 |         (r"Q([↑↓]*[pb])", r"p#\1"),
 31 |     ]
 32 | ]
 33 | 
 34 | # List of (consonant, hatsuon) pairs:
 35 | _real_hatsuon = [
 36 |     (re.compile("%s" % x[0]), x[1])
 37 |     for x in [
 38 |         (r"N([↑↓]*[pbm])", r"m\1"),
 39 |         (r"N([↑↓]*[ʧʥj])", r"n^\1"),
 40 |         (r"N([↑↓]*[tdn])", r"n\1"),
 41 |         (r"N([↑↓]*[kg])", r"ŋ\1"),
 42 |     ]
 43 | ]
 44 | 
 45 | 
 46 | def post_replace_ph(ph):
 47 |     rep_map = {
 48 |         "：": ",",
 49 |         "；": ",",
 50 |         "，": ",",
 51 |         "。": ".",
 52 |         "！": "!",
 53 |         "？": "?",
 54 |         "\n": ".",
 55 |         "·": ",",
 56 |         "、": ",",
 57 |         "...": "…",
 58 |     }
 59 |     if ph in rep_map.keys():
 60 |         ph = rep_map[ph]
 61 |     if ph in symbols:
 62 |         return ph
 63 |     if ph not in symbols:
 64 |         ph = "UNK"
 65 |     return ph
 66 | 
 67 | 
 68 | def symbols_to_japanese(text):
 69 |     for regex, replacement in _symbols_to_japanese:
 70 |         text = re.sub(regex, replacement, text)
 71 |     return text
 72 | 
 73 | 
 74 | def preprocess_jap(text, with_prosody=False):
 75 |     """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
 76 |     text = symbols_to_japanese(text)
 77 |     sentences = re.split(_japanese_marks, text)
 78 |     marks = re.findall(_japanese_marks, text)
 79 |     text = []
 80 |     for i, sentence in enumerate(sentences):
 81 |         if re.match(_japanese_characters, sentence):
 82 |             if with_prosody:
 83 |                 text += pyopenjtalk_g2p_prosody(sentence)[1:-1]
 84 |             else:
 85 |                 p = pyopenjtalk.g2p(sentence)
 86 |                 text += p.split(" ")
 87 | 
 88 |         if i < len(marks):
 89 |             if marks[i] == " ":# 防止意外的UNK
 90 |                 continue
 91 |             text += [marks[i].replace(" ", "")]
 92 |     return text
 93 | 
 94 | 
 95 | def text_normalize(text):
 96 |     # todo: jap text normalize
 97 |     return text
 98 | 
 99 | # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
100 | def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
101 |     """Extract phoneme + prosoody symbol sequence from input full-context labels.
102 | 
103 |     The algorithm is based on `Prosodic features control by symbols as input of
104 |     sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.
105 | 
106 |     Args:
107 |         text (str): Input text.
108 |         drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.
109 | 
110 |     Returns:
111 |         List[str]: List of phoneme + prosody symbols.
112 | 
113 |     Examples:
114 |         >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
115 |         >>> pyopenjtalk_g2p_prosody("こんにちは。")
116 |         ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
117 | 
118 |     .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
119 |         modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
120 | 
121 |     """
122 |     labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
123 |     N = len(labels)
124 | 
125 |     phones = []
126 |     for n in range(N):
127 |         lab_curr = labels[n]
128 | 
129 |         # current phoneme
130 |         p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
131 |         # deal unvoiced vowels as normal vowels
132 |         if drop_unvoiced_vowels and p3 in "AEIOU":
133 |             p3 = p3.lower()
134 | 
135 |         # deal with sil at the beginning and the end of text
136 |         if p3 == "sil":
137 |             assert n == 0 or n == N - 1
138 |             if n == 0:
139 |                 phones.append("^")
140 |             elif n == N - 1:
141 |                 # check question form or not
142 |                 e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
143 |                 if e3 == 0:
144 |                     phones.append("$")
145 |                 elif e3 == 1:
146 |                     phones.append("?")
147 |             continue
148 |         elif p3 == "pau":
149 |             phones.append("_")
150 |             continue
151 |         else:
152 |             phones.append(p3)
153 | 
154 |         # accent type and position info (forward or backward)
155 |         a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
156 |         a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
157 |         a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
158 | 
159 |         # number of mora in accent phrase
160 |         f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
161 | 
162 |         a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
163 |         # accent phrase border
164 |         if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
165 |             phones.append("#")
166 |         # pitch falling
167 |         elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
168 |             phones.append("]")
169 |         # pitch rising
170 |         elif a2 == 1 and a2_next == 2:
171 |             phones.append("[")
172 | 
173 |     return phones
174 | 
175 | # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
176 | def _numeric_feature_by_regex(regex, s):
177 |     match = re.search(regex, s)
178 |     if match is None:
179 |         return -50
180 |     return int(match.group(1))
181 | 
182 | def g2p(norm_text, with_prosody=False):
183 |     phones = preprocess_jap(norm_text, with_prosody)
184 |     phones = [post_replace_ph(i) for i in phones]
185 |     # todo: implement tones and word2ph
186 |     return phones
187 | 
188 | 
189 | if __name__ == "__main__":
190 |     phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね！")
191 |     print(phones)


--------------------------------------------------------------------------------
/GPT_SoVITS/text/namedict_cache.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/namedict_cache.pickle


--------------------------------------------------------------------------------
/GPT_SoVITS/text/opencpop-strict.txt:
--------------------------------------------------------------------------------
  1 | a	AA a
  2 | ai	AA ai
  3 | an	AA an
  4 | ang	AA ang
  5 | ao	AA ao
  6 | ba	b a
  7 | bai	b ai
  8 | ban	b an
  9 | bang	b ang
 10 | bao	b ao
 11 | bei	b ei
 12 | ben	b en
 13 | beng	b eng
 14 | bi	b i
 15 | bian	b ian
 16 | biao	b iao
 17 | bie	b ie
 18 | bin	b in
 19 | bing	b ing
 20 | bo	b o
 21 | bu	b u
 22 | ca	c a
 23 | cai	c ai
 24 | can	c an
 25 | cang	c ang
 26 | cao	c ao
 27 | ce	c e
 28 | cei	c ei
 29 | cen	c en
 30 | ceng	c eng
 31 | cha	ch a
 32 | chai	ch ai
 33 | chan	ch an
 34 | chang	ch ang
 35 | chao	ch ao
 36 | che	ch e
 37 | chen	ch en
 38 | cheng	ch eng
 39 | chi	ch ir
 40 | chong	ch ong
 41 | chou	ch ou
 42 | chu	ch u
 43 | chua	ch ua
 44 | chuai	ch uai
 45 | chuan	ch uan
 46 | chuang	ch uang
 47 | chui	ch ui
 48 | chun	ch un
 49 | chuo	ch uo
 50 | ci	c i0
 51 | cong	c ong
 52 | cou	c ou
 53 | cu	c u
 54 | cuan	c uan
 55 | cui	c ui
 56 | cun	c un
 57 | cuo	c uo
 58 | da	d a
 59 | dai	d ai
 60 | dan	d an
 61 | dang	d ang
 62 | dao	d ao
 63 | de	d e
 64 | dei	d ei
 65 | den	d en
 66 | deng	d eng
 67 | di	d i
 68 | dia	d ia
 69 | dian	d ian
 70 | diao	d iao
 71 | die	d ie
 72 | ding	d ing
 73 | diu	d iu
 74 | dong	d ong
 75 | dou	d ou
 76 | du	d u
 77 | duan	d uan
 78 | dui	d ui
 79 | dun	d un
 80 | duo	d uo
 81 | e	EE e
 82 | ei	EE ei
 83 | en	EE en
 84 | eng	EE eng
 85 | er	EE er
 86 | fa	f a
 87 | fan	f an
 88 | fang	f ang
 89 | fei	f ei
 90 | fen	f en
 91 | feng	f eng
 92 | fo	f o
 93 | fou	f ou
 94 | fu	f u
 95 | ga	g a
 96 | gai	g ai
 97 | gan	g an
 98 | gang	g ang
 99 | gao	g ao
100 | ge	g e
101 | gei	g ei
102 | gen	g en
103 | geng	g eng
104 | gong	g ong
105 | gou	g ou
106 | gu	g u
107 | gua	g ua
108 | guai	g uai
109 | guan	g uan
110 | guang	g uang
111 | gui	g ui
112 | gun	g un
113 | guo	g uo
114 | ha	h a
115 | hai	h ai
116 | han	h an
117 | hang	h ang
118 | hao	h ao
119 | he	h e
120 | hei	h ei
121 | hen	h en
122 | heng	h eng
123 | hong	h ong
124 | hou	h ou
125 | hu	h u
126 | hua	h ua
127 | huai	h uai
128 | huan	h uan
129 | huang	h uang
130 | hui	h ui
131 | hun	h un
132 | huo	h uo
133 | ji	j i
134 | jia	j ia
135 | jian	j ian
136 | jiang	j iang
137 | jiao	j iao
138 | jie	j ie
139 | jin	j in
140 | jing	j ing
141 | jiong	j iong
142 | jiu	j iu
143 | ju	j v
144 | jv	j v
145 | juan	j van
146 | jvan	j van
147 | jue	j ve
148 | jve	j ve
149 | jun	j vn
150 | jvn	j vn
151 | ka	k a
152 | kai	k ai
153 | kan	k an
154 | kang	k ang
155 | kao	k ao
156 | ke	k e
157 | kei	k ei
158 | ken	k en
159 | keng	k eng
160 | kong	k ong
161 | kou	k ou
162 | ku	k u
163 | kua	k ua
164 | kuai	k uai
165 | kuan	k uan
166 | kuang	k uang
167 | kui	k ui
168 | kun	k un
169 | kuo	k uo
170 | la	l a
171 | lai	l ai
172 | lan	l an
173 | lang	l ang
174 | lao	l ao
175 | le	l e
176 | lei	l ei
177 | leng	l eng
178 | li	l i
179 | lia	l ia
180 | lian	l ian
181 | liang	l iang
182 | liao	l iao
183 | lie	l ie
184 | lin	l in
185 | ling	l ing
186 | liu	l iu
187 | lo	l o
188 | long	l ong
189 | lou	l ou
190 | lu	l u
191 | luan	l uan
192 | lun	l un
193 | luo	l uo
194 | lv	l v
195 | lve	l ve
196 | ma	m a
197 | mai	m ai
198 | man	m an
199 | mang	m ang
200 | mao	m ao
201 | me	m e
202 | mei	m ei
203 | men	m en
204 | meng	m eng
205 | mi	m i
206 | mian	m ian
207 | miao	m iao
208 | mie	m ie
209 | min	m in
210 | ming	m ing
211 | miu	m iu
212 | mo	m o
213 | mou	m ou
214 | mu	m u
215 | na	n a
216 | nai	n ai
217 | nan	n an
218 | nang	n ang
219 | nao	n ao
220 | ne	n e
221 | nei	n ei
222 | nen	n en
223 | neng	n eng
224 | ni	n i
225 | nian	n ian
226 | niang	n iang
227 | niao	n iao
228 | nie	n ie
229 | nin	n in
230 | ning	n ing
231 | niu	n iu
232 | nong	n ong
233 | nou	n ou
234 | nu	n u
235 | nuan	n uan
236 | nun	n un
237 | nuo	n uo
238 | nv	n v
239 | nve	n ve
240 | o	OO o
241 | ou	OO ou
242 | pa	p a
243 | pai	p ai
244 | pan	p an
245 | pang	p ang
246 | pao	p ao
247 | pei	p ei
248 | pen	p en
249 | peng	p eng
250 | pi	p i
251 | pian	p ian
252 | piao	p iao
253 | pie	p ie
254 | pin	p in
255 | ping	p ing
256 | po	p o
257 | pou	p ou
258 | pu	p u
259 | qi	q i
260 | qia	q ia
261 | qian	q ian
262 | qiang	q iang
263 | qiao	q iao
264 | qie	q ie
265 | qin	q in
266 | qing	q ing
267 | qiong	q iong
268 | qiu	q iu
269 | qu	q v
270 | qv	q v
271 | quan	q van
272 | qvan	q van
273 | que	q ve
274 | qve	q ve
275 | qun	q vn
276 | qvn	q vn
277 | ran	r an
278 | rang	r ang
279 | rao	r ao
280 | re	r e
281 | ren	r en
282 | reng	r eng
283 | ri	r ir
284 | rong	r ong
285 | rou	r ou
286 | ru	r u
287 | rua	r ua
288 | ruan	r uan
289 | rui	r ui
290 | run	r un
291 | ruo	r uo
292 | sa	s a
293 | sai	s ai
294 | san	s an
295 | sang	s ang
296 | sao	s ao
297 | se	s e
298 | sen	s en
299 | seng	s eng
300 | sha	sh a
301 | shai	sh ai
302 | shan	sh an
303 | shang	sh ang
304 | shao	sh ao
305 | she	sh e
306 | shei	sh ei
307 | shen	sh en
308 | sheng	sh eng
309 | shi	sh ir
310 | shou	sh ou
311 | shu	sh u
312 | shua	sh ua
313 | shuai	sh uai
314 | shuan	sh uan
315 | shuang	sh uang
316 | shui	sh ui
317 | shun	sh un
318 | shuo	sh uo
319 | si	s i0
320 | song	s ong
321 | sou	s ou
322 | su	s u
323 | suan	s uan
324 | sui	s ui
325 | sun	s un
326 | suo	s uo
327 | ta	t a
328 | tai	t ai
329 | tan	t an
330 | tang	t ang
331 | tao	t ao
332 | te	t e
333 | tei	t ei
334 | teng	t eng
335 | ti	t i
336 | tian	t ian
337 | tiao	t iao
338 | tie	t ie
339 | ting	t ing
340 | tong	t ong
341 | tou	t ou
342 | tu	t u
343 | tuan	t uan
344 | tui	t ui
345 | tun	t un
346 | tuo	t uo
347 | wa	w a
348 | wai	w ai
349 | wan	w an
350 | wang	w ang
351 | wei	w ei
352 | wen	w en
353 | weng	w eng
354 | wo	w o
355 | wu	w u
356 | xi	x i
357 | xia	x ia
358 | xian	x ian
359 | xiang	x iang
360 | xiao	x iao
361 | xie	x ie
362 | xin	x in
363 | xing	x ing
364 | xiong	x iong
365 | xiu	x iu
366 | xu	x v
367 | xv	x v
368 | xuan	x van
369 | xvan	x van
370 | xue	x ve
371 | xve	x ve
372 | xun	x vn
373 | xvn	x vn
374 | ya	y a
375 | yan	y En
376 | yang	y ang
377 | yao	y ao
378 | ye	y E
379 | yi	y i
380 | yin	y in
381 | ying	y ing
382 | yo	y o
383 | yong	y ong
384 | you	y ou
385 | yu	y v
386 | yv	y v
387 | yuan	y van
388 | yvan	y van
389 | yue	y ve
390 | yve	y ve
391 | yun	y vn
392 | yvn	y vn
393 | za	z a
394 | zai	z ai
395 | zan	z an
396 | zang	z ang
397 | zao	z ao
398 | ze	z e
399 | zei	z ei
400 | zen	z en
401 | zeng	z eng
402 | zha	zh a
403 | zhai	zh ai
404 | zhan	zh an
405 | zhang	zh ang
406 | zhao	zh ao
407 | zhe	zh e
408 | zhei	zh ei
409 | zhen	zh en
410 | zheng	zh eng
411 | zhi	zh ir
412 | zhong	zh ong
413 | zhou	zh ou
414 | zhu	zh u
415 | zhua	zh ua
416 | zhuai	zh uai
417 | zhuan	zh uan
418 | zhuang	zh uang
419 | zhui	zh ui
420 | zhun	zh un
421 | zhuo	zh uo
422 | zi	z i0
423 | zong	z ong
424 | zou	z ou
425 | zu	z u
426 | zuan	z uan
427 | zui	z ui
428 | zun	z un
429 | zuo	z uo
430 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/text/symbols.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | # punctuation = ['!', '?', '…', ",", ".","@"]#@是SP停顿
  4 | punctuation = ["!", "?", "…", ",", "."]  # @是SP停顿
  5 | punctuation.append("-")
  6 | pu_symbols = punctuation + ["SP", "SP2", "SP3", "UNK"]
  7 | # pu_symbols = punctuation + ["SP", 'SP2', 'SP3','SP4', "UNK"]
  8 | pad = "_"
  9 | 
 10 | c = [
 11 |     "AA",
 12 |     "EE",
 13 |     "OO",
 14 |     "b",
 15 |     "c",
 16 |     "ch",
 17 |     "d",
 18 |     "f",
 19 |     "g",
 20 |     "h",
 21 |     "j",
 22 |     "k",
 23 |     "l",
 24 |     "m",
 25 |     "n",
 26 |     "p",
 27 |     "q",
 28 |     "r",
 29 |     "s",
 30 |     "sh",
 31 |     "t",
 32 |     "w",
 33 |     "x",
 34 |     "y",
 35 |     "z",
 36 |     "zh",
 37 | ]
 38 | v = [
 39 |     "E1",
 40 |     "En1",
 41 |     "a1",
 42 |     "ai1",
 43 |     "an1",
 44 |     "ang1",
 45 |     "ao1",
 46 |     "e1",
 47 |     "ei1",
 48 |     "en1",
 49 |     "eng1",
 50 |     "er1",
 51 |     "i1",
 52 |     "i01",
 53 |     "ia1",
 54 |     "ian1",
 55 |     "iang1",
 56 |     "iao1",
 57 |     "ie1",
 58 |     "in1",
 59 |     "ing1",
 60 |     "iong1",
 61 |     "ir1",
 62 |     "iu1",
 63 |     "o1",
 64 |     "ong1",
 65 |     "ou1",
 66 |     "u1",
 67 |     "ua1",
 68 |     "uai1",
 69 |     "uan1",
 70 |     "uang1",
 71 |     "ui1",
 72 |     "un1",
 73 |     "uo1",
 74 |     "v1",
 75 |     "van1",
 76 |     "ve1",
 77 |     "vn1",
 78 |     "E2",
 79 |     "En2",
 80 |     "a2",
 81 |     "ai2",
 82 |     "an2",
 83 |     "ang2",
 84 |     "ao2",
 85 |     "e2",
 86 |     "ei2",
 87 |     "en2",
 88 |     "eng2",
 89 |     "er2",
 90 |     "i2",
 91 |     "i02",
 92 |     "ia2",
 93 |     "ian2",
 94 |     "iang2",
 95 |     "iao2",
 96 |     "ie2",
 97 |     "in2",
 98 |     "ing2",
 99 |     "iong2",
100 |     "ir2",
101 |     "iu2",
102 |     "o2",
103 |     "ong2",
104 |     "ou2",
105 |     "u2",
106 |     "ua2",
107 |     "uai2",
108 |     "uan2",
109 |     "uang2",
110 |     "ui2",
111 |     "un2",
112 |     "uo2",
113 |     "v2",
114 |     "van2",
115 |     "ve2",
116 |     "vn2",
117 |     "E3",
118 |     "En3",
119 |     "a3",
120 |     "ai3",
121 |     "an3",
122 |     "ang3",
123 |     "ao3",
124 |     "e3",
125 |     "ei3",
126 |     "en3",
127 |     "eng3",
128 |     "er3",
129 |     "i3",
130 |     "i03",
131 |     "ia3",
132 |     "ian3",
133 |     "iang3",
134 |     "iao3",
135 |     "ie3",
136 |     "in3",
137 |     "ing3",
138 |     "iong3",
139 |     "ir3",
140 |     "iu3",
141 |     "o3",
142 |     "ong3",
143 |     "ou3",
144 |     "u3",
145 |     "ua3",
146 |     "uai3",
147 |     "uan3",
148 |     "uang3",
149 |     "ui3",
150 |     "un3",
151 |     "uo3",
152 |     "v3",
153 |     "van3",
154 |     "ve3",
155 |     "vn3",
156 |     "E4",
157 |     "En4",
158 |     "a4",
159 |     "ai4",
160 |     "an4",
161 |     "ang4",
162 |     "ao4",
163 |     "e4",
164 |     "ei4",
165 |     "en4",
166 |     "eng4",
167 |     "er4",
168 |     "i4",
169 |     "i04",
170 |     "ia4",
171 |     "ian4",
172 |     "iang4",
173 |     "iao4",
174 |     "ie4",
175 |     "in4",
176 |     "ing4",
177 |     "iong4",
178 |     "ir4",
179 |     "iu4",
180 |     "o4",
181 |     "ong4",
182 |     "ou4",
183 |     "u4",
184 |     "ua4",
185 |     "uai4",
186 |     "uan4",
187 |     "uang4",
188 |     "ui4",
189 |     "un4",
190 |     "uo4",
191 |     "v4",
192 |     "van4",
193 |     "ve4",
194 |     "vn4",
195 |     "E5",
196 |     "En5",
197 |     "a5",
198 |     "ai5",
199 |     "an5",
200 |     "ang5",
201 |     "ao5",
202 |     "e5",
203 |     "ei5",
204 |     "en5",
205 |     "eng5",
206 |     "er5",
207 |     "i5",
208 |     "i05",
209 |     "ia5",
210 |     "ian5",
211 |     "iang5",
212 |     "iao5",
213 |     "ie5",
214 |     "in5",
215 |     "ing5",
216 |     "iong5",
217 |     "ir5",
218 |     "iu5",
219 |     "o5",
220 |     "ong5",
221 |     "ou5",
222 |     "u5",
223 |     "ua5",
224 |     "uai5",
225 |     "uan5",
226 |     "uang5",
227 |     "ui5",
228 |     "un5",
229 |     "uo5",
230 |     "v5",
231 |     "van5",
232 |     "ve5",
233 |     "vn5",
234 | ]
235 | 
236 | v_without_tone = [
237 |     "E",
238 |     "En",
239 |     "a",
240 |     "ai",
241 |     "an",
242 |     "ang",
243 |     "ao",
244 |     "e",
245 |     "ei",
246 |     "en",
247 |     "eng",
248 |     "er",
249 |     "i",
250 |     "i0",
251 |     "ia",
252 |     "ian",
253 |     "iang",
254 |     "iao",
255 |     "ie",
256 |     "in",
257 |     "ing",
258 |     "iong",
259 |     "ir",
260 |     "iu",
261 |     "o",
262 |     "ong",
263 |     "ou",
264 |     "u",
265 |     "ua",
266 |     "uai",
267 |     "uan",
268 |     "uang",
269 |     "ui",
270 |     "un",
271 |     "uo",
272 |     "v",
273 |     "van",
274 |     "ve",
275 |     "vn",
276 | ]
277 | 
278 | # japanese
279 | ja_symbols = [
280 |     "I",
281 |     "N",
282 |     "U",
283 |     "a",
284 |     "b",
285 |     "by",
286 |     "ch",
287 |     "cl",
288 |     "d",
289 |     "dy",
290 |     "e",
291 |     "f",
292 |     "g",
293 |     "gy",
294 |     "h",
295 |     "hy",
296 |     "i",
297 |     "j",
298 |     "k",
299 |     "ky",
300 |     "m",
301 |     "my",
302 |     "n",
303 |     "ny",
304 |     "o",
305 |     "p",
306 |     "py",
307 |     "r",
308 |     "ry",
309 |     "s",
310 |     "sh",
311 |     "t",
312 |     "ts",
313 |     "u",
314 |     "v",
315 |     "w",
316 |     "y",
317 |     "z",
318 |     # "[", #上升调型
319 |     # "]", #下降调型
320 |     # "$", #结束符
321 |     # "^", #开始符
322 | ]
323 | 
324 | arpa = {
325 |     "AH0",
326 |     "S",
327 |     "AH1",
328 |     "EY2",
329 |     "AE2",
330 |     "EH0",
331 |     "OW2",
332 |     "UH0",
333 |     "NG",
334 |     "B",
335 |     "G",
336 |     "AY0",
337 |     "M",
338 |     "AA0",
339 |     "F",
340 |     "AO0",
341 |     "ER2",
342 |     "UH1",
343 |     "IY1",
344 |     "AH2",
345 |     "DH",
346 |     "IY0",
347 |     "EY1",
348 |     "IH0",
349 |     "K",
350 |     "N",
351 |     "W",
352 |     "IY2",
353 |     "T",
354 |     "AA1",
355 |     "ER1",
356 |     "EH2",
357 |     "OY0",
358 |     "UH2",
359 |     "UW1",
360 |     "Z",
361 |     "AW2",
362 |     "AW1",
363 |     "V",
364 |     "UW2",
365 |     "AA2",
366 |     "ER",
367 |     "AW0",
368 |     "UW0",
369 |     "R",
370 |     "OW1",
371 |     "EH1",
372 |     "ZH",
373 |     "AE0",
374 |     "IH2",
375 |     "IH",
376 |     "Y",
377 |     "JH",
378 |     "P",
379 |     "AY1",
380 |     "EY0",
381 |     "OY2",
382 |     "TH",
383 |     "HH",
384 |     "D",
385 |     "ER0",
386 |     "CH",
387 |     "AO1",
388 |     "AE1",
389 |     "AO2",
390 |     "OY1",
391 |     "AY2",
392 |     "IH1",
393 |     "OW0",
394 |     "L",
395 |     "SH",
396 | }
397 | 
398 | symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
399 | symbols = sorted(set(symbols))
400 | if __name__ == "__main__":
401 |     print(len(symbols))
402 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/README.md:
--------------------------------------------------------------------------------
 1 | ## Supported NSW (Non-Standard-Word) Normalization
 2 | 
 3 | |NSW type|raw|normalized|
 4 | |:--|:-|:-|
 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九|
 6 | |cardinal|这块黄金重达324.75克<br>我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分|
 7 | |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二|
 8 | |date|她出生于86年8月18日，她弟弟出生于1995年3月1日|她出生于八六年八月十八日， 她弟弟出生于一九九五年三月一日|
 9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我
10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度
11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票|
12 | |percentage|明天有62％的概率降雨|明天有百分之六十二的概率降雨|
13 | |money|随便来几个价格12块5，34.5元，20.1万|随便来几个价格十二块五，三十四点五元，二十点一万|
14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>这是手机八六一八五四四一三九一二一|
15 | ## References
16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files)
17 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from text.zh_normalization.text_normlization import *
15 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/zh_normalization/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/__pycache__/char_convert.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/zh_normalization/__pycache__/char_convert.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/__pycache__/chronology.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/zh_normalization/__pycache__/chronology.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/__pycache__/constants.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/zh_normalization/__pycache__/constants.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/__pycache__/num.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/zh_normalization/__pycache__/num.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/__pycache__/phonecode.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/zh_normalization/__pycache__/phonecode.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/__pycache__/quantifier.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/zh_normalization/__pycache__/quantifier.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/__pycache__/text_normlization.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/zh_normalization/__pycache__/text_normlization.cpython-310.pyc


--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/chronology.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | import re
 15 | 
 16 | from .num import DIGITS
 17 | from .num import num2str
 18 | from .num import verbalize_cardinal
 19 | from .num import verbalize_digit
 20 | 
 21 | 
 22 | def _time_num2str(num_string: str) -> str:
 23 |     """A special case for verbalizing number in time."""
 24 |     result = num2str(num_string.lstrip('0'))
 25 |     if num_string.startswith('0'):
 26 |         result = DIGITS['0'] + result
 27 |     return result
 28 | 
 29 | 
 30 | # 时刻表达式
 31 | RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])'
 32 |                      r':([0-5][0-9])'
 33 |                      r'(:([0-5][0-9]))?')
 34 | 
 35 | # 时间范围，如8:30-12:30
 36 | RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
 37 |                            r':([0-5][0-9])'
 38 |                            r'(:([0-5][0-9]))?'
 39 |                            r'(~|-)'
 40 |                            r'([0-1]?[0-9]|2[0-3])'
 41 |                            r':([0-5][0-9])'
 42 |                            r'(:([0-5][0-9]))?')
 43 | 
 44 | 
 45 | def replace_time(match) -> str:
 46 |     """
 47 |     Args:
 48 |         match (re.Match)
 49 |     Returns:
 50 |         str
 51 |     """
 52 | 
 53 |     is_range = len(match.groups()) > 5
 54 | 
 55 |     hour = match.group(1)
 56 |     minute = match.group(2)
 57 |     second = match.group(4)
 58 | 
 59 |     if is_range:
 60 |         hour_2 = match.group(6)
 61 |         minute_2 = match.group(7)
 62 |         second_2 = match.group(9)
 63 | 
 64 |     result = f"{num2str(hour)}点"
 65 |     if minute.lstrip('0'):
 66 |         if int(minute) == 30:
 67 |             result += "半"
 68 |         else:
 69 |             result += f"{_time_num2str(minute)}分"
 70 |     if second and second.lstrip('0'):
 71 |         result += f"{_time_num2str(second)}秒"
 72 | 
 73 |     if is_range:
 74 |         result += "至"
 75 |         result += f"{num2str(hour_2)}点"
 76 |         if minute_2.lstrip('0'):
 77 |             if int(minute) == 30:
 78 |                 result += "半"
 79 |             else:
 80 |                 result += f"{_time_num2str(minute_2)}分"
 81 |         if second_2 and second_2.lstrip('0'):
 82 |             result += f"{_time_num2str(second_2)}秒"
 83 | 
 84 |     return result
 85 | 
 86 | 
 87 | RE_DATE = re.compile(r'(\d{4}|\d{2})年'
 88 |                      r'((0?[1-9]|1[0-2])月)?'
 89 |                      r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?')
 90 | 
 91 | 
 92 | def replace_date(match) -> str:
 93 |     """
 94 |     Args:
 95 |         match (re.Match)
 96 |     Returns:
 97 |         str
 98 |     """
 99 |     year = match.group(1)
100 |     month = match.group(3)
101 |     day = match.group(5)
102 |     result = ""
103 |     if year:
104 |         result += f"{verbalize_digit(year)}年"
105 |     if month:
106 |         result += f"{verbalize_cardinal(month)}月"
107 |     if day:
108 |         result += f"{verbalize_cardinal(day)}{match.group(9)}"
109 |     return result
110 | 
111 | 
112 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期
113 | RE_DATE2 = re.compile(
114 |     r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])')
115 | 
116 | 
117 | def replace_date2(match) -> str:
118 |     """
119 |     Args:
120 |         match (re.Match)
121 |     Returns:
122 |         str
123 |     """
124 |     year = match.group(1)
125 |     month = match.group(3)
126 |     day = match.group(4)
127 |     result = ""
128 |     if year:
129 |         result += f"{verbalize_digit(year)}年"
130 |     if month:
131 |         result += f"{verbalize_cardinal(month)}月"
132 |     if day:
133 |         result += f"{verbalize_cardinal(day)}日"
134 |     return result
135 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/constants.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 | import string
16 | 
17 | from pypinyin.constants import SUPPORT_UCS4
18 | 
19 | # 全角半角转换
20 | # 英文字符全角 -> 半角映射表 (num: 52)
21 | F2H_ASCII_LETTERS = {
22 |     ord(char) + 65248: ord(char)
23 |     for char in string.ascii_letters
24 | }
25 | 
26 | # 英文字符半角 -> 全角映射表
27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()}
28 | 
29 | # 数字字符全角 -> 半角映射表 (num: 10)
30 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits}
31 | # 数字字符半角 -> 全角映射表
32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()}
33 | 
34 | # 标点符号全角 -> 半角映射表 (num: 32)
35 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation}
36 | # 标点符号半角 -> 全角映射表
37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}
38 | 
39 | # 空格 (num: 1)
40 | F2H_SPACE = {'\u3000': ' '}
41 | H2F_SPACE = {' ': '\u3000'}
42 | 
43 | # 非"有拼音的汉字"的字符串，可用于NSW提取
44 | if SUPPORT_UCS4:
45 |     RE_NSW = re.compile(r'(?:[^'
46 |                         r'\u3007'  # 〇
47 |                         r'\u3400-\u4dbf'  # CJK扩展A:[3400-4DBF]
48 |                         r'\u4e00-\u9fff'  # CJK基本:[4E00-9FFF]
49 |                         r'\uf900-\ufaff'  # CJK兼容:[F900-FAFF]
50 |                         r'\U00020000-\U0002A6DF'  # CJK扩展B:[20000-2A6DF]
51 |                         r'\U0002A703-\U0002B73F'  # CJK扩展C:[2A700-2B73F]
52 |                         r'\U0002B740-\U0002B81D'  # CJK扩展D:[2B740-2B81D]
53 |                         r'\U0002F80A-\U0002FA1F'  # CJK兼容扩展:[2F800-2FA1F]
54 |                         r'])+')
55 | else:
56 |     RE_NSW = re.compile(  # pragma: no cover
57 |         r'(?:[^'
58 |         r'\u3007'  # 〇
59 |         r'\u3400-\u4dbf'  # CJK扩展A:[3400-4DBF]
60 |         r'\u4e00-\u9fff'  # CJK基本:[4E00-9FFF]
61 |         r'\uf900-\ufaff'  # CJK兼容:[F900-FAFF]
62 |         r'])+')
63 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/phonecode.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 | 
16 | from .num import verbalize_digit
17 | 
18 | # 规范化固话/手机号码
19 | # 手机
20 | # http://www.jihaoba.com/news/show/13680
21 | # 移动：139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
22 | # 联通：130、131、132、156、155、186、185、176
23 | # 电信：133、153、189、180、181、177
24 | RE_MOBILE_PHONE = re.compile(
25 |     r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
26 | RE_TELEPHONE = re.compile(
27 |     r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
28 | 
29 | # 全国统一的号码400开头
30 | RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}")
31 | 
32 | 
33 | def phone2str(phone_string: str, mobile=True) -> str:
34 |     if mobile:
35 |         sp_parts = phone_string.strip('+').split()
36 |         result = '，'.join(
37 |             [verbalize_digit(part, alt_one=True) for part in sp_parts])
38 |         return result
39 |     else:
40 |         sil_parts = phone_string.split('-')
41 |         result = '，'.join(
42 |             [verbalize_digit(part, alt_one=True) for part in sil_parts])
43 |         return result
44 | 
45 | 
46 | def replace_phone(match) -> str:
47 |     """
48 |     Args:
49 |         match (re.Match)
50 |     Returns:
51 |         str
52 |     """
53 |     return phone2str(match.group(0), mobile=False)
54 | 
55 | 
56 | def replace_mobile(match) -> str:
57 |     """
58 |     Args:
59 |         match (re.Match)
60 |     Returns:
61 |         str
62 |     """
63 |     return phone2str(match.group(0))
64 | 


--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/quantifier.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 | 
16 | from .num import num2str
17 | 
18 | # 温度表达式，温度会影响负号的读法
19 | # -3°C 零下三度
20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)')
21 | measure_dict = {
22 |     "cm2": "平方厘米",
23 |     "cm²": "平方厘米",
24 |     "cm3": "立方厘米",
25 |     "cm³": "立方厘米",
26 |     "cm": "厘米",
27 |     "db": "分贝",
28 |     "ds": "毫秒",
29 |     "kg": "千克",
30 |     "km": "千米",
31 |     "m2": "平方米",
32 |     "m²": "平方米",
33 |     "m³": "立方米",
34 |     "m3": "立方米",
35 |     "ml": "毫升",
36 |     "m": "米",
37 |     "mm": "毫米",
38 |     "s": "秒"
39 | }
40 | 
41 | 
42 | def replace_temperature(match) -> str:
43 |     """
44 |     Args:
45 |         match (re.Match)
46 |     Returns:
47 |         str
48 |     """
49 |     sign = match.group(1)
50 |     temperature = match.group(2)
51 |     unit = match.group(3)
52 |     sign: str = "零下" if sign else ""
53 |     temperature: str = num2str(temperature)
54 |     unit: str = "摄氏度" if unit == "摄氏度" else "度"
55 |     result = f"{sign}{temperature}{unit}"
56 |     return result
57 | 
58 | 
59 | def replace_measure(sentence) -> str:
60 |     for q_notation in measure_dict:
61 |         if q_notation in sentence:
62 |             sentence = sentence.replace(q_notation, measure_dict[q_notation])
63 |     return sentence
64 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | 本软件及其相关代码以MIT协议开源，作者不对软件具备任何控制力，使用软件者、传播软件导出的声音者自负全责。
 2 | 如不认可该条款，则不能使用或引用软件包内任何代码和文件。
 3 | 
 4 | 特此授予任何获得本软件和相关文档文件（以下简称“软件”）副本的人免费使用、复制、修改、合并、出版、分发、再授权和/或销售本软件的权利，以及授予本软件所提供的人使用本软件的权利，但须符合以下条件：
 5 | 上述版权声明和本许可声明应包含在软件的所有副本或实质部分中。
 6 | 软件是“按原样”提供的，没有任何明示或暗示的保证，包括但不限于适销性、适用于特定目的和不侵权的保证。在任何情况下，作者或版权持有人均不承担因软件或软件的使用或其他交易而产生、产生或与之相关的任何索赔、损害赔偿或其他责任，无论是在合同诉讼、侵权诉讼还是其他诉讼中。
 7 | 
 8 | 
 9 | MIT License
10 | 
11 | Copyright (c) 2024 AIFSH
12 | 
13 | Permission is hereby granted, free of charge, to any person obtaining a copy
14 | of this software and associated documentation files (the "Software"), to deal
15 | in the Software without restriction, including without limitation the rights
16 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17 | copies of the Software, and to permit persons to whom the Software is
18 | furnished to do so, subject to the following conditions:
19 | 
20 | The above copyright notice and this permission notice shall be included in all
21 | copies or substantial portions of the Software.
22 | 
23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29 | SOFTWARE.
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | new repo https://github.com/AIFSH/GSTTS-ComfyUI
 2 | # ComfyUI-GPT_SoVITS
 3 | a comfyui custom node for [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)! you can voice cloning and tts in comfyui now
 4 | <div>
 5 |   <figure>
 6 |   <img alt='webpage' src="web.png?raw=true" width="600px"/>
 7 |   <figure>
 8 | </div>
 9 | 
10 | # Disclaimer  / 免责声明
11 | We do not hold any responsibility for any illegal usage of the codebase. Please refer to your local laws about DMCA and other related laws.
12 | 我们不对代码库的任何非法使用承担任何责任. 请参阅您当地关于 DMCA (数字千年法案) 和其他相关法律法规.
13 | 
14 | ## Features
15 | - `srt` file for subtitle was supported
16 | - mutiple speaker was supported in finetune and inference by `srt`
17 | - huge comfyui custom nodes can merge in gpt_sovits
18 | 
19 | ## How to use
20 | make sure `ffmpeg` is worked in your commandline
21 | for Linux
22 | ```
23 | apt update
24 | apt install ffmpeg
25 | ```
26 | for Windows,you can install `ffmpeg` by [WingetUI](https://github.com/marticliment/WingetUI) automatically
27 | 
28 | then!
29 | ```
30 | git clone https://github.com/AIFSH/ComfyUI-GPT_SoVITS.git
31 | cd ComfyUI-GPT_SoVITS
32 | pip install -r requirements.txt
33 | ```
34 | `weights` will be downloaded from huggingface automatically! if you in china,make sure your internet attach the huggingface
35 | or if you still struggle with huggingface, you may try follow [hf-mirror](https://hf-mirror.com/) to config your env.
36 | 
37 | 或者下载[权重文件](https://pan.quark.cn/s/e5057be01087)解压后把`pretrained_models`整个文件夹放进`ComfyUI-GPT_SoVITS`目录
38 | 
39 | ## Windows
40 | There is a portable standalone build for Windows that should work for running on Nvidia GPUs and cuda>=11.8,
41 | click [the link](https://www.bilibili.com/video/BV1qx4y1h7T2) to download
42 | <div>
43 |   <figure>
44 |   <img alt='Wechat' src="1key.jpg?raw=true" width="300px"/>
45 |   <figure>
46 | </div>
47 |     
48 | ## Tutorial
49 | - [Demo](https://www.bilibili.com/video/BV1yC411G7NJ)
50 | - [Demo for mutiple speaker](https://www.bilibili.com/video/BV1QC41137Wq/)
51 | - [FULL WorkFLOW](https://www.bilibili.com/video/BV1pp421D7qa)
52 | ## My other nodes you may need
53 | - [ComfyUI-UVR5](https://github.com/AIFSH/ComfyUI-UVR5)
54 | - [ComfyUI-IP_LAP](https://github.com/AIFSH/ComfyUI-IP_LAP)
55 | 
56 | ## WeChat Group && Donate
57 | <div>
58 |   <figure>
59 |   <img alt='Wechat' src="wechat.jpg?raw=true" width="300px"/>
60 |   <img alt='donate' src="donate.jpg?raw=true" width="300px"/>
61 |   <figure>
62 | </div>
63 | 
64 | ## Thanks
65 | - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
66 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
 1 | import site
 2 | import os,sys
 3 | import logging
 4 | from server import PromptServer
 5 | 
 6 | now_dir = os.path.dirname(os.path.abspath(__file__))
 7 | site_packages_roots = []
 8 | for path in site.getsitepackages():
 9 |     if "packages" in path:
10 |         site_packages_roots.append(path)
11 | if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" % now_dir]
12 | #os.environ["OPENBLAS_NUM_THREADS"] = "4"
13 | for site_packages_root in site_packages_roots:
14 |     if os.path.exists(site_packages_root):
15 |         try:
16 |             with open("%s/GPT_SoVITS.pth" % (site_packages_root), "w") as f:
17 |                 f.write(
18 |                     "%s\n%s/GPT_SoVITS\n%s/GPT_SoVITS/text\n"
19 |                     % (now_dir,now_dir,now_dir)
20 |                 )
21 |             break
22 |         except PermissionError:
23 |             raise PermissionError
24 | 
25 | if os.path.isfile("%s/GPT_SoVITS.pth" % (site_packages_root)):
26 |     print("!!!GPT_SoVITS path was added to " + "%s/GPT_SoVITS.pth" % (site_packages_root) 
27 |     + "\n if meet `No module` error,try `python main.py` again, don't be foolish to pip install tools")
28 | 
29 | from huggingface_hub import snapshot_download
30 | model_path = os.path.join(now_dir,"pretrained_models")
31 | if not os.path.isfile(os.path.join(model_path,"s2G488k.pth")):
32 |     snapshot_download(repo_id="lj1995/GPT-SoVITS",local_dir=model_path)
33 | else:
34 |     print("GPT_SoVITS use cache models,make sure your 'pretrained_models' complete")
35 | 
36 | WEB_DIRECTORY = "./web"
37 | from .nodes import LoadSRT,LoadAudio, GPT_SOVITS_INFER, PreViewAudio,GPT_SOVITS_FT, GPT_SOVITS_TTS
38 | 
39 | # Set the web directory, any .js file in that directory will be loaded by the frontend as a frontend extension
40 | # WEB_DIRECTORY = "./somejs"
41 | 
42 | # A dictionary that contains all nodes you want to export with their names
43 | # NOTE: names should be globally unique
44 | NODE_CLASS_MAPPINGS = {
45 |     "GPT_SOVITS_FT": GPT_SOVITS_FT,
46 |     "LoadAudio": LoadAudio,
47 |     "PreViewAudio": PreViewAudio,
48 |     "LoadSRT": LoadSRT,
49 |     "GPT_SOVITS_INFER": GPT_SOVITS_INFER,
50 |     "GPT_SOVITS_TTS": GPT_SOVITS_TTS
51 | }
52 | 
53 | # A dictionary that contains the friendly/humanly readable titles for the nodes
54 | NODE_DISPLAY_NAME_MAPPINGS = {
55 |     "GPT_SOVITS_FT": "GPT_SOVITS Finetune",
56 |     "LoadAudio": "AudioLoader",
57 |     "PreViewAudio": "PreView Audio",
58 |     "LoadSRT": "SRT FILE Loader",
59 |     "GPT_SOVITS_INFER": "GPT_SOVITS Inference",
60 |     "GPT_SOVITS_TTS": "GPT_SOVITS TTS"
61 | }
62 | 
63 | @PromptServer.instance.routes.get("/gpt_sovits/reboot")
64 | def restart(self):
65 |     try:
66 |         sys.stdout.close_log()
67 |     except Exception as e:
68 |         pass
69 | 
70 |     return os.execv(sys.executable, [sys.executable] + sys.argv)
71 | 


--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
 1 | import sys,os
 2 | 
 3 | import torch
 4 | 
 5 | # 推理用的指定模型
 6 | sovits_path = ""
 7 | gpt_path = ""
 8 | is_half_str = os.environ.get("is_half", "True")
 9 | is_half = True if is_half_str.lower() == 'true' else False
10 | is_share_str = os.environ.get("is_share","False")
11 | is_share= True if is_share_str.lower() == 'true' else False
12 | 
13 | cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
14 | bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
15 | pretrained_sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
16 | pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
17 | 
18 | exp_root = "logs"
19 | python_exec = sys.executable or "python"
20 | if torch.cuda.is_available():
21 |     infer_device = "cuda"
22 | else:
23 |     infer_device = "cpu"
24 | 
25 | webui_port_main = 9874
26 | webui_port_uvr5 = 9873
27 | webui_port_infer_tts = 9872
28 | webui_port_subfix = 9871
29 | 
30 | api_port = 9880
31 | 
32 | if infer_device == "cuda":
33 |     gpu_name = torch.cuda.get_device_name(0)
34 |     if (
35 |             ("16" in gpu_name and "V100" not in gpu_name.upper())
36 |             or "P40" in gpu_name.upper()
37 |             or "P10" in gpu_name.upper()
38 |             or "1060" in gpu_name
39 |             or "1070" in gpu_name
40 |             or "1080" in gpu_name
41 |     ):
42 |         is_half=False
43 | 
44 | if(infer_device=="cpu"):is_half=False
45 | 
46 | class Config:
47 |     def __init__(self):
48 |         self.sovits_path = sovits_path
49 |         self.gpt_path = gpt_path
50 |         self.is_half = is_half
51 | 
52 |         self.cnhubert_path = cnhubert_path
53 |         self.bert_path = bert_path
54 |         self.pretrained_sovits_path = pretrained_sovits_path
55 |         self.pretrained_gpt_path = pretrained_gpt_path
56 | 
57 |         self.exp_root = exp_root
58 |         self.python_exec = python_exec
59 |         self.infer_device = infer_device
60 | 
61 |         self.webui_port_main = webui_port_main
62 |         self.webui_port_uvr5 = webui_port_uvr5
63 |         self.webui_port_infer_tts = webui_port_infer_tts
64 |         self.webui_port_subfix = webui_port_subfix
65 | 
66 |         self.api_port = api_port
67 | 


--------------------------------------------------------------------------------
/donate.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/donate.jpg


--------------------------------------------------------------------------------
/note.txt:
--------------------------------------------------------------------------------
1 | when update GPT_SoVITS
2 | rember make change s2_train.py
3 | from 
4 | import utils
5 | to
6 | from GPT_SoVITS import utils


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | pydub
 2 | srt
 3 | librosa
 4 | LangSegment
 5 | transformers
 6 | cn2an
 7 | pypinyin
 8 | jieba_fast
 9 | pyopenjtalk
10 | wordsegment
11 | g2p_en
12 | pytorch_lightning
13 | audiotsm
14 | ffmpeg-python


--------------------------------------------------------------------------------
/tools/__pycache__/my_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/tools/__pycache__/my_utils.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/i18n/__pycache__/i18n.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/tools/i18n/__pycache__/i18n.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/i18n/i18n.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import locale
 3 | import os
 4 | 
 5 | parent_directory = os.path.dirname(os.path.abspath(__file__))
 6 | def load_language_list(language):
 7 |     with open(os.path.join(parent_directory,f"locale/{language}.json"), "r", encoding="utf-8") as f:
 8 |         language_list = json.load(f)
 9 |     return language_list
10 | 
11 | 
12 | class I18nAuto:
13 |     def __init__(self, language=None):
14 |         if language in ["Auto", None]:
15 |             language = locale.getdefaultlocale()[
16 |                 0
17 |             ]  # getlocale can't identify the system's language ((None, None))
18 |         if not os.path.exists(os.path.join(parent_directory,f"locale/{language}.json")):
19 |             language = "en_US"
20 |         self.language = language
21 |         self.language_map = load_language_list(language)
22 | 
23 |     def __call__(self, key):
24 |         return self.language_map.get(key, key)
25 | 
26 |     def __repr__(self):
27 |         return "Use Language: " + self.language
28 | 


--------------------------------------------------------------------------------
/tools/i18n/locale/zh_CN.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     ">=3则使用对harvest音高识别的结果使用中值滤波，数值为滤波半径，使用可以削弱哑音": ">=3则使用对harvest音高识别的结果使用中值滤波，数值为滤波半径，使用可以削弱哑音",
  3 |     "A模型权重": "A模型权重",
  4 |     "A模型路径": "A模型路径",
  5 |     "B模型路径": "B模型路径",
  6 |     "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src",
  7 |     "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调",
  8 |     "Index Rate": "Index Rate",
  9 |     "Onnx导出": "Onnx导出",
 10 |     "Onnx输出路径": "Onnx输出路径",
 11 |     "RVC模型路径": "RVC模型路径",
 12 |     "ckpt处理": "ckpt处理",
 13 |     "harvest进程数": "harvest进程数",
 14 |     "index文件路径不可包含中文": "index文件路径不可包含中文",
 15 |     "pth文件路径不可包含中文": "pth文件路径不可包含中文",
 16 |     "rmvpe卡号配置：以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置：以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程",
 17 |     "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ",
 18 |     "step1:正在处理数据": "step1:正在处理数据",
 19 |     "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征",
 20 |     "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ",
 21 |     "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)",
 22 |     "step3: 填写训练设置, 开始训练模型和索引": "step3: 填写训练设置, 开始训练模型和索引",
 23 |     "step3a:正在训练模型": "step3a:正在训练模型",
 24 |     "一键训练": "一键训练",
 25 |     "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹",
 26 |     "人声伴奏分离批量处理， 使用UVR5模型。 <br>合格的文件夹路径格式举例： E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类： <br>1、保留人声：不带和声的音频选这个，对主人声保留比HP5更好。内置HP2和HP3两个模型，HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点； <br>2、仅保留主人声：带和声的音频选这个，对主人声可能有削弱。内置HP5一个模型； <br> 3、去混响、去延迟模型（by FoxJoy）：<br>  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择，不能去除单通道混响；<br>&emsp;(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底，DeReverb额外去除混响，可去除单声道混响，但是对高频重的板式混响去不干净。<br>去混响/去延迟，附：<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍；<br>2、MDX-Net-Dereverb模型挺慢的；<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "人声伴奏分离批量处理， 使用UVR5模型。 <br>合格的文件夹路径格式举例： E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类： <br>1、保留人声：不带和声的音频选这个，对主人声保留比HP5更好。内置HP2和HP3两个模型，HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点； <br>2、仅保留主人声：带和声的音频选这个，对主人声可能有削弱。内置HP5一个模型； <br> 3、去混响、去延迟模型（by FoxJoy）：<br>  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择，不能去除单通道混响；<br>&emsp;(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底，DeReverb额外去除混响，可去除单声道混响，但是对高频重的板式混响去不干净。<br>去混响/去延迟，附：<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍；<br>2、MDX-Net-Dereverb模型挺慢的；<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。",
 27 |     "以-分隔输入使用的卡号, 例如   0-1-2   使用卡0和卡1和卡2": "以-分隔输入使用的卡号, 例如   0-1-2   使用卡0和卡1和卡2",
 28 |     "伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声",
 29 |     "使用模型采样率": "使用模型采样率",
 30 |     "使用设备采样率": "使用设备采样率",
 31 |     "保存名": "保存名",
 32 |     "保存的文件名, 默认空为和源文件同名": "保存的文件名, 默认空为和源文件同名",
 33 |     "保存的模型名不带后缀": "保存的模型名不带后缀",
 34 |     "保存频率save_every_epoch": "保存频率save_every_epoch",
 35 |     "保护清辅音和呼吸声，防止电音撕裂等artifact，拉满0.5不开启，调低加大保护力度但可能降低索引效果": "保护清辅音和呼吸声，防止电音撕裂等artifact，拉满0.5不开启，调低加大保护力度但可能降低索引效果",
 36 |     "修改": "修改",
 37 |     "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型信息(仅支持weights文件夹下提取的小模型文件)",
 38 |     "停止音频转换": "停止音频转换",
 39 |     "全流程结束！": "全流程结束！",
 40 |     "刷新音色列表和索引路径": "刷新音色列表和索引路径",
 41 |     "加载模型": "加载模型",
 42 |     "加载预训练底模D路径": "加载预训练底模D路径",
 43 |     "加载预训练底模G路径": "加载预训练底模G路径",
 44 |     "单次推理": "单次推理",
 45 |     "卸载音色省显存": "卸载音色省显存",
 46 |     "变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)",
 47 |     "后处理重采样至最终采样率，0为不进行重采样": "后处理重采样至最终采样率，0为不进行重采样",
 48 |     "否": "否",
 49 |     "启用相位声码器": "启用相位声码器",
 50 |     "响应阈值": "响应阈值",
 51 |     "响度因子": "响度因子",
 52 |     "处理数据": "处理数据",
 53 |     "导出Onnx模型": "导出Onnx模型",
 54 |     "导出文件格式": "导出文件格式",
 55 |     "常见问题解答": "常见问题解答",
 56 |     "常规设置": "常规设置",
 57 |     "开始音频转换": "开始音频转换",
 58 |     "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
 59 |     "性能设置": "性能设置",
 60 |     "总训练轮数total_epoch": "总训练轮数total_epoch",
 61 |     "批量推理": "批量推理",
 62 |     "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ",
 63 |     "指定输出主人声文件夹": "指定输出主人声文件夹",
 64 |     "指定输出文件夹": "指定输出文件夹",
 65 |     "指定输出非主人声文件夹": "指定输出非主人声文件夹",
 66 |     "推理时间(ms):": "推理时间(ms):",
 67 |     "推理音色": "推理音色",
 68 |     "提取": "提取",
 69 |     "提取音高和处理数据使用的CPU进程数": "提取音高和处理数据使用的CPU进程数",
 70 |     "是": "是",
 71 |     "是否仅保存最新的ckpt文件以节省硬盘空间": "是否仅保存最新的ckpt文件以节省硬盘空间",
 72 |     "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存时间点将最终小模型保存至weights文件夹",
 73 |     "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速",
 74 |     "显卡信息": "显卡信息",
 75 |     "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.": "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.",
 76 |     "查看": "查看",
 77 |     "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型信息(仅支持weights文件夹下提取的小模型文件)",
 78 |     "检索特征占比": "检索特征占比",
 79 |     "模型": "模型",
 80 |     "模型推理": "模型推理",
 81 |     "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况",
 82 |     "模型是否带音高指导": "模型是否带音高指导",
 83 |     "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否带音高指导(唱歌一定要, 语音可以不要)",
 84 |     "模型是否带音高指导,1是0否": "模型是否带音高指导,1是0否",
 85 |     "模型版本型号": "模型版本型号",
 86 |     "模型融合, 可用于测试音色融合": "模型融合, 可用于测试音色融合",
 87 |     "模型路径": "模型路径",
 88 |     "每张显卡的batch_size": "每张显卡的batch_size",
 89 |     "淡入淡出长度": "淡入淡出长度",
 90 |     "版本": "版本",
 91 |     "特征提取": "特征提取",
 92 |     "特征检索库文件路径,为空则使用下拉的选择结果": "特征检索库文件路径,为空则使用下拉的选择结果",
 93 |     "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ",
 94 |     "目标采样率": "目标采样率",
 95 |     "算法延迟(ms):": "算法延迟(ms):",
 96 |     "自动检测index路径,下拉式选择(dropdown)": "自动检测index路径,下拉式选择(dropdown)",
 97 |     "融合": "融合",
 98 |     "要改的模型信息": "要改的模型信息",
 99 |     "要置入的模型信息": "要置入的模型信息",
100 |     "训练": "训练",
101 |     "训练模型": "训练模型",
102 |     "训练特征索引": "训练特征索引",
103 |     "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log",
104 |     "请指定说话人id": "请指定说话人id",
105 |     "请选择index文件": "请选择index文件",
106 |     "请选择pth文件": "请选择pth文件",
107 |     "请选择说话人id": "请选择说话人id",
108 |     "转换": "转换",
109 |     "输入实验名": "输入实验名",
110 |     "输入待处理音频文件夹路径": "输入待处理音频文件夹路径",
111 |     "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)",
112 |     "输入待处理音频文件路径(默认是正确格式示例)": "输入待处理音频文件路径(默认是正确格式示例)",
113 |     "输入源音量包络替换输出音量包络融合比例，越靠近1越使用输出包络": "输入源音量包络替换输出音量包络融合比例，越靠近1越使用输出包络",
114 |     "输入监听": "输入监听",
115 |     "输入训练文件夹路径": "输入训练文件夹路径",
116 |     "输入设备": "输入设备",
117 |     "输入降噪": "输入降噪",
118 |     "输出信息": "输出信息",
119 |     "输出变声": "输出变声",
120 |     "输出设备": "输出设备",
121 |     "输出降噪": "输出降噪",
122 |     "输出音频(右下角三个点,点了可以下载)": "输出音频(右下角三个点,点了可以下载)",
123 |     "选择.index文件": "选择.index文件",
124 |     "选择.pth文件": "选择.pth文件",
125 |     "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU",
126 |     "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU",
127 |     "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU",
128 |     "采样率:": "采样率:",
129 |     "采样长度": "采样长度",
130 |     "重载设备列表": "重载设备列表",
131 |     "音调设置": "音调设置",
132 |     "音频设备(请使用同种类驱动)": "音频设备(请使用同种类驱动)",
133 |     "音高算法": "音高算法",
134 |     "额外推理时长": "额外推理时长"
135 | }
136 | 


--------------------------------------------------------------------------------
/tools/i18n/locale/zh_HK.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     ">=3则使用对harvest音高识别的结果使用中值滤波，数值为滤波半径，使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波，數值為濾波半徑，使用可以削弱啞音",
  3 |     "A模型权重": "A模型權重",
  4 |     "A模型路径": "A模型路徑",
  5 |     "B模型路径": "B模型路徑",
  6 |     "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src",
  7 |     "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案，可選，一行一個音高，代替預設的F0及升降調",
  8 |     "Index Rate": "Index Rate",
  9 |     "Onnx导出": "Onnx导出",
 10 |     "Onnx输出路径": "Onnx输出路径",
 11 |     "RVC模型路径": "RVC模型路径",
 12 |     "ckpt处理": "ckpt處理",
 13 |     "harvest进程数": "harvest進程數",
 14 |     "index文件路径不可包含中文": "index文件路径不可包含中文",
 15 |     "pth文件路径不可包含中文": "pth文件路径不可包含中文",
 16 |     "rmvpe卡号配置：以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置：以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程",
 17 |     "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1：填寫實驗配置。實驗數據放在logs下，每個實驗一個資料夾，需手動輸入實驗名路徑，內含實驗配置、日誌、訓練得到的模型檔案。",
 18 |     "step1:正在处理数据": "step1:正在处理数据",
 19 |     "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征",
 20 |     "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a：自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化，在實驗目錄下生成2個wav資料夾；暫時只支援單人訓練。",
 21 |     "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)",
 22 |     "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引",
 23 |     "step3a:正在训练模型": "step3a:正在训练模型",
 24 |     "一键训练": "一鍵訓練",
 25 |     "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹",
 26 |     "人声伴奏分离批量处理， 使用UVR5模型。 <br>合格的文件夹路径格式举例： E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类： <br>1、保留人声：不带和声的音频选这个，对主人声保留比HP5更好。内置HP2和HP3两个模型，HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点； <br>2、仅保留主人声：带和声的音频选这个，对主人声可能有削弱。内置HP5一个模型； <br> 3、去混响、去延迟模型（by FoxJoy）：<br>  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择，不能去除单通道混响；<br>&emsp;(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底，DeReverb额外去除混响，可去除单声道混响，但是对高频重的板式混响去不干净。<br>去混响/去延迟，附：<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍；<br>2、MDX-Net-Dereverb模型挺慢的；<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。<br>有效資料夾路徑格式的例子：D:\\path\\to\\input\\folder（從檔案管理員地址欄複製）。<br>模型分為三類：<br>1. 保留人聲：選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型：HP2和HP3。HP3可能輕微漏出伴奏，但比HP2更好地保留了人聲；<br>2. 僅保留主人聲：選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型：HP5。<br>3. 消除混響和延遲模型（由FoxJoy提供）：<br>  (1) MDX-Net：對於立體聲混響的移除是最好的選擇，但不能移除單聲道混響；<br>&emsp;(234) DeEcho：移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響，可以移除單聲道混響，但對於高頻重的板式混響移除不乾淨。<br>消除混響/延遲注意事項：<br>1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍；<br>2. MDX-Net-Dereverb模型相當慢；<br>3. 個人推薦的最乾淨配置是先使用MDX-Net，然後使用DeEcho-Aggressive。",
 27 |     "以-分隔输入使用的卡号, 例如   0-1-2   使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2",
 28 |     "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲",
 29 |     "使用模型采样率": "使用模型采样率",
 30 |     "使用设备采样率": "使用设备采样率",
 31 |     "保存名": "儲存名",
 32 |     "保存的文件名, 默认空为和源文件同名": "儲存的檔案名，預設空為與來源檔案同名",
 33 |     "保存的模型名不带后缀": "儲存的模型名不帶副檔名",
 34 |     "保存频率save_every_epoch": "保存頻率save_every_epoch",
 35 |     "保护清辅音和呼吸声，防止电音撕裂等artifact，拉满0.5不开启，调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲，防止電音撕裂等artifact，拉滿0.5不開啟，調低加大保護力度但可能降低索引效果",
 36 |     "修改": "修改",
 37 |     "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)",
 38 |     "停止音频转换": "停止音訊轉換",
 39 |     "全流程结束！": "全流程结束！",
 40 |     "刷新音色列表和索引路径": "刷新音色列表和索引路徑",
 41 |     "加载模型": "載入模型",
 42 |     "加载预训练底模D路径": "加載預訓練底模D路徑",
 43 |     "加载预训练底模G路径": "加載預訓練底模G路徑",
 44 |     "单次推理": "单次推理",
 45 |     "卸载音色省显存": "卸載音色節省 VRAM",
 46 |     "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
 47 |     "后处理重采样至最终采样率，0为不进行重采样": "後處理重採樣至最終採樣率，0為不進行重採樣",
 48 |     "否": "否",
 49 |     "启用相位声码器": "启用相位声码器",
 50 |     "响应阈值": "響應閾值",
 51 |     "响度因子": "響度因子",
 52 |     "处理数据": "處理資料",
 53 |     "导出Onnx模型": "导出Onnx模型",
 54 |     "导出文件格式": "導出檔格式",
 55 |     "常见问题解答": "常見問題解答",
 56 |     "常规设置": "一般設定",
 57 |     "开始音频转换": "開始音訊轉換",
 58 |     "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
 59 |     "性能设置": "效能設定",
 60 |     "总训练轮数total_epoch": "總訓練輪數total_epoch",
 61 |     "批量推理": "批量推理",
 62 |     "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換，輸入待轉換音頻資料夾，或上傳多個音頻檔案，在指定資料夾(默認opt)下輸出轉換的音頻。",
 63 |     "指定输出主人声文件夹": "指定输出主人声文件夹",
 64 |     "指定输出文件夹": "指定輸出資料夾",
 65 |     "指定输出非主人声文件夹": "指定输出非主人声文件夹",
 66 |     "推理时间(ms):": "推理時間(ms):",
 67 |     "推理音色": "推理音色",
 68 |     "提取": "提取",
 69 |     "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數",
 70 |     "是": "是",
 71 |     "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間",
 72 |     "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾",
 73 |     "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練，大數據緩存會爆 VRAM 也加不了多少速度",
 74 |     "显卡信息": "顯示卡資訊",
 75 |     "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.": "本軟體以MIT協議開源，作者不對軟體具備任何控制力，使用軟體者、傳播軟體導出的聲音者自負全責。<br>如不認可該條款，則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄<b>使用需遵守的協議-LICENSE.txt</b>。",
 76 |     "查看": "查看",
 77 |     "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)",
 78 |     "检索特征占比": "檢索特徵佔比",
 79 |     "模型": "模型",
 80 |     "模型推理": "模型推理",
 81 |     "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑)，適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型，或者想測試中間模型的情況",
 82 |     "模型是否带音高指导": "模型是否帶音高指導",
 83 |     "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導（唱歌一定要，語音可以不要）",
 84 |     "模型是否带音高指导,1是0否": "模型是否帶音高指導，1是0否",
 85 |     "模型版本型号": "模型版本型號",
 86 |     "模型融合, 可用于测试音色融合": "模型融合，可用於測試音色融合",
 87 |     "模型路径": "模型路徑",
 88 |     "每张显卡的batch_size": "每张显卡的batch_size",
 89 |     "淡入淡出长度": "淡入淡出長度",
 90 |     "版本": "版本",
 91 |     "特征提取": "特徵提取",
 92 |     "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果",
 93 |     "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key，女性轉男性推薦-12key，如果音域爆炸導致音色失真也可以自己調整到合適音域。",
 94 |     "目标采样率": "目標取樣率",
 95 |     "算法延迟(ms):": "算法延迟(ms):",
 96 |     "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)",
 97 |     "融合": "融合",
 98 |     "要改的模型信息": "要改的模型資訊",
 99 |     "要置入的模型信息": "要置入的模型資訊",
100 |     "训练": "訓練",
101 |     "训练模型": "訓練模型",
102 |     "训练特征索引": "訓練特徵索引",
103 |     "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log",
104 |     "请指定说话人id": "請指定說話人id",
105 |     "请选择index文件": "请选择index文件",
106 |     "请选择pth文件": "请选择pth文件",
107 |     "请选择说话人id": "請選擇說話人ID",
108 |     "转换": "轉換",
109 |     "输入实验名": "輸入實驗名稱",
110 |     "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑",
111 |     "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)",
112 |     "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑（預設是正確格式示例）",
113 |     "输入源音量包络替换输出音量包络融合比例，越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例，越靠近1越使用輸出包絡",
114 |     "输入监听": "输入监听",
115 |     "输入训练文件夹路径": "輸入訓練檔案夾路徑",
116 |     "输入设备": "輸入設備",
117 |     "输入降噪": "輸入降噪",
118 |     "输出信息": "輸出訊息",
119 |     "输出变声": "输出变声",
120 |     "输出设备": "輸出設備",
121 |     "输出降噪": "輸出降噪",
122 |     "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點，點了可以下載)",
123 |     "选择.index文件": "選擇 .index 檔案",
124 |     "选择.pth文件": "選擇 .pth 檔案",
125 |     "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU",
126 |     "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU",
127 |     "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU",
128 |     "采样率:": "采样率:",
129 |     "采样长度": "取樣長度",
130 |     "重载设备列表": "重載設備列表",
131 |     "音调设置": "音調設定",
132 |     "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)",
133 |     "音高算法": "音高演算法",
134 |     "额外推理时长": "額外推理時長"
135 | }
136 | 


--------------------------------------------------------------------------------
/tools/i18n/locale/zh_SG.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     ">=3则使用对harvest音高识别的结果使用中值滤波，数值为滤波半径，使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波，數值為濾波半徑，使用可以削弱啞音",
  3 |     "A模型权重": "A模型權重",
  4 |     "A模型路径": "A模型路徑",
  5 |     "B模型路径": "B模型路徑",
  6 |     "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src",
  7 |     "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案，可選，一行一個音高，代替預設的F0及升降調",
  8 |     "Index Rate": "Index Rate",
  9 |     "Onnx导出": "Onnx导出",
 10 |     "Onnx输出路径": "Onnx输出路径",
 11 |     "RVC模型路径": "RVC模型路径",
 12 |     "ckpt处理": "ckpt處理",
 13 |     "harvest进程数": "harvest進程數",
 14 |     "index文件路径不可包含中文": "index文件路径不可包含中文",
 15 |     "pth文件路径不可包含中文": "pth文件路径不可包含中文",
 16 |     "rmvpe卡号配置：以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置：以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程",
 17 |     "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1：填寫實驗配置。實驗數據放在logs下，每個實驗一個資料夾，需手動輸入實驗名路徑，內含實驗配置、日誌、訓練得到的模型檔案。",
 18 |     "step1:正在处理数据": "step1:正在处理数据",
 19 |     "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征",
 20 |     "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a：自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化，在實驗目錄下生成2個wav資料夾；暫時只支援單人訓練。",
 21 |     "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)",
 22 |     "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引",
 23 |     "step3a:正在训练模型": "step3a:正在训练模型",
 24 |     "一键训练": "一鍵訓練",
 25 |     "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹",
 26 |     "人声伴奏分离批量处理， 使用UVR5模型。 <br>合格的文件夹路径格式举例： E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类： <br>1、保留人声：不带和声的音频选这个，对主人声保留比HP5更好。内置HP2和HP3两个模型，HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点； <br>2、仅保留主人声：带和声的音频选这个，对主人声可能有削弱。内置HP5一个模型； <br> 3、去混响、去延迟模型（by FoxJoy）：<br>  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择，不能去除单通道混响；<br>&emsp;(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底，DeReverb额外去除混响，可去除单声道混响，但是对高频重的板式混响去不干净。<br>去混响/去延迟，附：<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍；<br>2、MDX-Net-Dereverb模型挺慢的；<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。<br>有效資料夾路徑格式的例子：D:\\path\\to\\input\\folder（從檔案管理員地址欄複製）。<br>模型分為三類：<br>1. 保留人聲：選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型：HP2和HP3。HP3可能輕微漏出伴奏，但比HP2更好地保留了人聲；<br>2. 僅保留主人聲：選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型：HP5。<br>3. 消除混響和延遲模型（由FoxJoy提供）：<br>  (1) MDX-Net：對於立體聲混響的移除是最好的選擇，但不能移除單聲道混響；<br>&emsp;(234) DeEcho：移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響，可以移除單聲道混響，但對於高頻重的板式混響移除不乾淨。<br>消除混響/延遲注意事項：<br>1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍；<br>2. MDX-Net-Dereverb模型相當慢；<br>3. 個人推薦的最乾淨配置是先使用MDX-Net，然後使用DeEcho-Aggressive。",
 27 |     "以-分隔输入使用的卡号, 例如   0-1-2   使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2",
 28 |     "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲",
 29 |     "使用模型采样率": "使用模型采样率",
 30 |     "使用设备采样率": "使用设备采样率",
 31 |     "保存名": "儲存名",
 32 |     "保存的文件名, 默认空为和源文件同名": "儲存的檔案名，預設空為與來源檔案同名",
 33 |     "保存的模型名不带后缀": "儲存的模型名不帶副檔名",
 34 |     "保存频率save_every_epoch": "保存頻率save_every_epoch",
 35 |     "保护清辅音和呼吸声，防止电音撕裂等artifact，拉满0.5不开启，调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲，防止電音撕裂等artifact，拉滿0.5不開啟，調低加大保護力度但可能降低索引效果",
 36 |     "修改": "修改",
 37 |     "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)",
 38 |     "停止音频转换": "停止音訊轉換",
 39 |     "全流程结束！": "全流程结束！",
 40 |     "刷新音色列表和索引路径": "刷新音色列表和索引路徑",
 41 |     "加载模型": "載入模型",
 42 |     "加载预训练底模D路径": "加載預訓練底模D路徑",
 43 |     "加载预训练底模G路径": "加載預訓練底模G路徑",
 44 |     "单次推理": "单次推理",
 45 |     "卸载音色省显存": "卸載音色節省 VRAM",
 46 |     "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
 47 |     "后处理重采样至最终采样率，0为不进行重采样": "後處理重採樣至最終採樣率，0為不進行重採樣",
 48 |     "否": "否",
 49 |     "启用相位声码器": "启用相位声码器",
 50 |     "响应阈值": "響應閾值",
 51 |     "响度因子": "響度因子",
 52 |     "处理数据": "處理資料",
 53 |     "导出Onnx模型": "导出Onnx模型",
 54 |     "导出文件格式": "導出檔格式",
 55 |     "常见问题解答": "常見問題解答",
 56 |     "常规设置": "一般設定",
 57 |     "开始音频转换": "開始音訊轉換",
 58 |     "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
 59 |     "性能设置": "效能設定",
 60 |     "总训练轮数total_epoch": "總訓練輪數total_epoch",
 61 |     "批量推理": "批量推理",
 62 |     "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換，輸入待轉換音頻資料夾，或上傳多個音頻檔案，在指定資料夾(默認opt)下輸出轉換的音頻。",
 63 |     "指定输出主人声文件夹": "指定输出主人声文件夹",
 64 |     "指定输出文件夹": "指定輸出資料夾",
 65 |     "指定输出非主人声文件夹": "指定输出非主人声文件夹",
 66 |     "推理时间(ms):": "推理時間(ms):",
 67 |     "推理音色": "推理音色",
 68 |     "提取": "提取",
 69 |     "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數",
 70 |     "是": "是",
 71 |     "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間",
 72 |     "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾",
 73 |     "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練，大數據緩存會爆 VRAM 也加不了多少速度",
 74 |     "显卡信息": "顯示卡資訊",
 75 |     "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.": "本軟體以MIT協議開源，作者不對軟體具備任何控制力，使用軟體者、傳播軟體導出的聲音者自負全責。<br>如不認可該條款，則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄<b>使用需遵守的協議-LICENSE.txt</b>。",
 76 |     "查看": "查看",
 77 |     "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)",
 78 |     "检索特征占比": "檢索特徵佔比",
 79 |     "模型": "模型",
 80 |     "模型推理": "模型推理",
 81 |     "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑)，適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型，或者想測試中間模型的情況",
 82 |     "模型是否带音高指导": "模型是否帶音高指導",
 83 |     "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導（唱歌一定要，語音可以不要）",
 84 |     "模型是否带音高指导,1是0否": "模型是否帶音高指導，1是0否",
 85 |     "模型版本型号": "模型版本型號",
 86 |     "模型融合, 可用于测试音色融合": "模型融合，可用於測試音色融合",
 87 |     "模型路径": "模型路徑",
 88 |     "每张显卡的batch_size": "每张显卡的batch_size",
 89 |     "淡入淡出长度": "淡入淡出長度",
 90 |     "版本": "版本",
 91 |     "特征提取": "特徵提取",
 92 |     "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果",
 93 |     "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key，女性轉男性推薦-12key，如果音域爆炸導致音色失真也可以自己調整到合適音域。",
 94 |     "目标采样率": "目標取樣率",
 95 |     "算法延迟(ms):": "算法延迟(ms):",
 96 |     "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)",
 97 |     "融合": "融合",
 98 |     "要改的模型信息": "要改的模型資訊",
 99 |     "要置入的模型信息": "要置入的模型資訊",
100 |     "训练": "訓練",
101 |     "训练模型": "訓練模型",
102 |     "训练特征索引": "訓練特徵索引",
103 |     "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log",
104 |     "请指定说话人id": "請指定說話人id",
105 |     "请选择index文件": "请选择index文件",
106 |     "请选择pth文件": "请选择pth文件",
107 |     "请选择说话人id": "請選擇說話人ID",
108 |     "转换": "轉換",
109 |     "输入实验名": "輸入實驗名稱",
110 |     "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑",
111 |     "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)",
112 |     "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑（預設是正確格式示例）",
113 |     "输入源音量包络替换输出音量包络融合比例，越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例，越靠近1越使用輸出包絡",
114 |     "输入监听": "输入监听",
115 |     "输入训练文件夹路径": "輸入訓練檔案夾路徑",
116 |     "输入设备": "輸入設備",
117 |     "输入降噪": "輸入降噪",
118 |     "输出信息": "輸出訊息",
119 |     "输出变声": "输出变声",
120 |     "输出设备": "輸出設備",
121 |     "输出降噪": "輸出降噪",
122 |     "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點，點了可以下載)",
123 |     "选择.index文件": "選擇 .index 檔案",
124 |     "选择.pth文件": "選擇 .pth 檔案",
125 |     "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU",
126 |     "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU",
127 |     "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU",
128 |     "采样率:": "采样率:",
129 |     "采样长度": "取樣長度",
130 |     "重载设备列表": "重載設備列表",
131 |     "音调设置": "音調設定",
132 |     "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)",
133 |     "音高算法": "音高演算法",
134 |     "额外推理时长": "額外推理時長"
135 | }
136 | 


--------------------------------------------------------------------------------
/tools/i18n/locale/zh_TW.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     ">=3则使用对harvest音高识别的结果使用中值滤波，数值为滤波半径，使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波，數值為濾波半徑，使用可以削弱啞音",
  3 |     "A模型权重": "A模型權重",
  4 |     "A模型路径": "A模型路徑",
  5 |     "B模型路径": "B模型路徑",
  6 |     "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src",
  7 |     "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案，可選，一行一個音高，代替預設的F0及升降調",
  8 |     "Index Rate": "Index Rate",
  9 |     "Onnx导出": "Onnx导出",
 10 |     "Onnx输出路径": "Onnx输出路径",
 11 |     "RVC模型路径": "RVC模型路径",
 12 |     "ckpt处理": "ckpt處理",
 13 |     "harvest进程数": "harvest進程數",
 14 |     "index文件路径不可包含中文": "index文件路径不可包含中文",
 15 |     "pth文件路径不可包含中文": "pth文件路径不可包含中文",
 16 |     "rmvpe卡号配置：以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置：以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程",
 17 |     "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1：填寫實驗配置。實驗數據放在logs下，每個實驗一個資料夾，需手動輸入實驗名路徑，內含實驗配置、日誌、訓練得到的模型檔案。",
 18 |     "step1:正在处理数据": "step1:正在处理数据",
 19 |     "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征",
 20 |     "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a：自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化，在實驗目錄下生成2個wav資料夾；暫時只支援單人訓練。",
 21 |     "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)",
 22 |     "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引",
 23 |     "step3a:正在训练模型": "step3a:正在训练模型",
 24 |     "一键训练": "一鍵訓練",
 25 |     "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹",
 26 |     "人声伴奏分离批量处理， 使用UVR5模型。 <br>合格的文件夹路径格式举例： E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类： <br>1、保留人声：不带和声的音频选这个，对主人声保留比HP5更好。内置HP2和HP3两个模型，HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点； <br>2、仅保留主人声：带和声的音频选这个，对主人声可能有削弱。内置HP5一个模型； <br> 3、去混响、去延迟模型（by FoxJoy）：<br>  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择，不能去除单通道混响；<br>&emsp;(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底，DeReverb额外去除混响，可去除单声道混响，但是对高频重的板式混响去不干净。<br>去混响/去延迟，附：<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍；<br>2、MDX-Net-Dereverb模型挺慢的；<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。<br>有效資料夾路徑格式的例子：D:\\path\\to\\input\\folder（從檔案管理員地址欄複製）。<br>模型分為三類：<br>1. 保留人聲：選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型：HP2和HP3。HP3可能輕微漏出伴奏，但比HP2更好地保留了人聲；<br>2. 僅保留主人聲：選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型：HP5。<br>3. 消除混響和延遲模型（由FoxJoy提供）：<br>  (1) MDX-Net：對於立體聲混響的移除是最好的選擇，但不能移除單聲道混響；<br>&emsp;(234) DeEcho：移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響，可以移除單聲道混響，但對於高頻重的板式混響移除不乾淨。<br>消除混響/延遲注意事項：<br>1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍；<br>2. MDX-Net-Dereverb模型相當慢；<br>3. 個人推薦的最乾淨配置是先使用MDX-Net，然後使用DeEcho-Aggressive。",
 27 |     "以-分隔输入使用的卡号, 例如   0-1-2   使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2",
 28 |     "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲",
 29 |     "使用模型采样率": "使用模型采样率",
 30 |     "使用设备采样率": "使用设备采样率",
 31 |     "保存名": "儲存名",
 32 |     "保存的文件名, 默认空为和源文件同名": "儲存的檔案名，預設空為與來源檔案同名",
 33 |     "保存的模型名不带后缀": "儲存的模型名不帶副檔名",
 34 |     "保存频率save_every_epoch": "保存頻率save_every_epoch",
 35 |     "保护清辅音和呼吸声，防止电音撕裂等artifact，拉满0.5不开启，调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲，防止電音撕裂等artifact，拉滿0.5不開啟，調低加大保護力度但可能降低索引效果",
 36 |     "修改": "修改",
 37 |     "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)",
 38 |     "停止音频转换": "停止音訊轉換",
 39 |     "全流程结束！": "全流程结束！",
 40 |     "刷新音色列表和索引路径": "刷新音色列表和索引路徑",
 41 |     "加载模型": "載入模型",
 42 |     "加载预训练底模D路径": "加載預訓練底模D路徑",
 43 |     "加载预训练底模G路径": "加載預訓練底模G路徑",
 44 |     "单次推理": "单次推理",
 45 |     "卸载音色省显存": "卸載音色節省 VRAM",
 46 |     "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
 47 |     "后处理重采样至最终采样率，0为不进行重采样": "後處理重採樣至最終採樣率，0為不進行重採樣",
 48 |     "否": "否",
 49 |     "启用相位声码器": "启用相位声码器",
 50 |     "响应阈值": "響應閾值",
 51 |     "响度因子": "響度因子",
 52 |     "处理数据": "處理資料",
 53 |     "导出Onnx模型": "导出Onnx模型",
 54 |     "导出文件格式": "導出檔格式",
 55 |     "常见问题解答": "常見問題解答",
 56 |     "常规设置": "一般設定",
 57 |     "开始音频转换": "開始音訊轉換",
 58 |     "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
 59 |     "性能设置": "效能設定",
 60 |     "总训练轮数total_epoch": "總訓練輪數total_epoch",
 61 |     "批量推理": "批量推理",
 62 |     "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換，輸入待轉換音頻資料夾，或上傳多個音頻檔案，在指定資料夾(默認opt)下輸出轉換的音頻。",
 63 |     "指定输出主人声文件夹": "指定输出主人声文件夹",
 64 |     "指定输出文件夹": "指定輸出資料夾",
 65 |     "指定输出非主人声文件夹": "指定输出非主人声文件夹",
 66 |     "推理时间(ms):": "推理時間(ms):",
 67 |     "推理音色": "推理音色",
 68 |     "提取": "提取",
 69 |     "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數",
 70 |     "是": "是",
 71 |     "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間",
 72 |     "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾",
 73 |     "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練，大數據緩存會爆 VRAM 也加不了多少速度",
 74 |     "显卡信息": "顯示卡資訊",
 75 |     "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.": "本軟體以MIT協議開源，作者不對軟體具備任何控制力，使用軟體者、傳播軟體導出的聲音者自負全責。<br>如不認可該條款，則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄<b>使用需遵守的協議-LICENSE.txt</b>。",
 76 |     "查看": "查看",
 77 |     "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)",
 78 |     "检索特征占比": "檢索特徵佔比",
 79 |     "模型": "模型",
 80 |     "模型推理": "模型推理",
 81 |     "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑)，適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型，或者想測試中間模型的情況",
 82 |     "模型是否带音高指导": "模型是否帶音高指導",
 83 |     "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導（唱歌一定要，語音可以不要）",
 84 |     "模型是否带音高指导,1是0否": "模型是否帶音高指導，1是0否",
 85 |     "模型版本型号": "模型版本型號",
 86 |     "模型融合, 可用于测试音色融合": "模型融合，可用於測試音色融合",
 87 |     "模型路径": "模型路徑",
 88 |     "每张显卡的batch_size": "每张显卡的batch_size",
 89 |     "淡入淡出长度": "淡入淡出長度",
 90 |     "版本": "版本",
 91 |     "特征提取": "特徵提取",
 92 |     "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果",
 93 |     "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key，女性轉男性推薦-12key，如果音域爆炸導致音色失真也可以自己調整到合適音域。",
 94 |     "目标采样率": "目標取樣率",
 95 |     "算法延迟(ms):": "算法延迟(ms):",
 96 |     "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)",
 97 |     "融合": "融合",
 98 |     "要改的模型信息": "要改的模型資訊",
 99 |     "要置入的模型信息": "要置入的模型資訊",
100 |     "训练": "訓練",
101 |     "训练模型": "訓練模型",
102 |     "训练特征索引": "訓練特徵索引",
103 |     "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log",
104 |     "请指定说话人id": "請指定說話人id",
105 |     "请选择index文件": "请选择index文件",
106 |     "请选择pth文件": "请选择pth文件",
107 |     "请选择说话人id": "請選擇說話人ID",
108 |     "转换": "轉換",
109 |     "输入实验名": "輸入實驗名稱",
110 |     "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑",
111 |     "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)",
112 |     "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑（預設是正確格式示例）",
113 |     "输入源音量包络替换输出音量包络融合比例，越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例，越靠近1越使用輸出包絡",
114 |     "输入监听": "输入监听",
115 |     "输入训练文件夹路径": "輸入訓練檔案夾路徑",
116 |     "输入设备": "輸入設備",
117 |     "输入降噪": "輸入降噪",
118 |     "输出信息": "輸出訊息",
119 |     "输出变声": "输出变声",
120 |     "输出设备": "輸出設備",
121 |     "输出降噪": "輸出降噪",
122 |     "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點，點了可以下載)",
123 |     "选择.index文件": "選擇 .index 檔案",
124 |     "选择.pth文件": "選擇 .pth 檔案",
125 |     "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU",
126 |     "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU",
127 |     "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU",
128 |     "采样率:": "采样率:",
129 |     "采样长度": "取樣長度",
130 |     "重载设备列表": "重載設備列表",
131 |     "音调设置": "音調設定",
132 |     "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)",
133 |     "音高算法": "音高演算法",
134 |     "额外推理时长": "額外推理時長"
135 | }
136 | 


--------------------------------------------------------------------------------
/tools/i18n/locale_diff.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | from collections import OrderedDict
 4 | 
 5 | # Define the standard file name
 6 | standard_file = "locale/zh_CN.json"
 7 | 
 8 | # Find all JSON files in the directory
 9 | dir_path = "locale/"
10 | languages = [
11 |     os.path.join(dir_path, f)
12 |     for f in os.listdir(dir_path)
13 |     if f.endswith(".json") and f != standard_file
14 | ]
15 | 
16 | # Load the standard file
17 | with open(standard_file, "r", encoding="utf-8") as f:
18 |     standard_data = json.load(f, object_pairs_hook=OrderedDict)
19 | 
20 | # Loop through each language file
21 | for lang_file in languages:
22 |     # Load the language file
23 |     with open(lang_file, "r", encoding="utf-8") as f:
24 |         lang_data = json.load(f, object_pairs_hook=OrderedDict)
25 | 
26 |     # Find the difference between the language file and the standard file
27 |     diff = set(standard_data.keys()) - set(lang_data.keys())
28 | 
29 |     miss = set(lang_data.keys()) - set(standard_data.keys())
30 | 
31 |     # Add any missing keys to the language file
32 |     for key in diff:
33 |         lang_data[key] = key
34 | 
35 |     # Del any extra keys to the language file
36 |     for key in miss:
37 |         del lang_data[key]
38 | 
39 |     # Sort the keys of the language file to match the order of the standard file
40 |     lang_data = OrderedDict(
41 |         sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))
42 |     )
43 | 
44 |     # Save the updated language file
45 |     with open(lang_file, "w", encoding="utf-8") as f:
46 |         json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True)
47 |         f.write("\n")
48 | 


--------------------------------------------------------------------------------
/tools/i18n/scan_i18n.py:
--------------------------------------------------------------------------------
 1 | import ast
 2 | import glob
 3 | import json
 4 | from collections import OrderedDict
 5 | 
 6 | 
 7 | def extract_i18n_strings(node):
 8 |     i18n_strings = []
 9 | 
10 |     if (
11 |         isinstance(node, ast.Call)
12 |         and isinstance(node.func, ast.Name)
13 |         and node.func.id == "i18n"
14 |     ):
15 |         for arg in node.args:
16 |             if isinstance(arg, ast.Str):
17 |                 i18n_strings.append(arg.s)
18 | 
19 |     for child_node in ast.iter_child_nodes(node):
20 |         i18n_strings.extend(extract_i18n_strings(child_node))
21 | 
22 |     return i18n_strings
23 | 
24 | 
25 | # scan the directory for all .py files (recursively)
26 | # for each file, parse the code into an AST
27 | # for each AST, extract the i18n strings
28 | 
29 | strings = []
30 | for filename in glob.iglob("**/*.py", recursive=True):
31 |     with open(filename, "r") as f:
32 |         code = f.read()
33 |         if "I18nAuto" in code:
34 |             tree = ast.parse(code)
35 |             i18n_strings = extract_i18n_strings(tree)
36 |             print(filename, len(i18n_strings))
37 |             strings.extend(i18n_strings)
38 | code_keys = set(strings)
39 | """
40 | n_i18n.py
41 | gui_v1.py 26
42 | app.py 16
43 | infer-web.py 147
44 | scan_i18n.py 0
45 | i18n.py 0
46 | lib/train/process_ckpt.py 1
47 | """
48 | print()
49 | print("Total unique:", len(code_keys))
50 | 
51 | 
52 | standard_file = "i18n/locale/zh_CN.json"
53 | with open(standard_file, "r", encoding="utf-8") as f:
54 |     standard_data = json.load(f, object_pairs_hook=OrderedDict)
55 | standard_keys = set(standard_data.keys())
56 | 
57 | # Define the standard file name
58 | unused_keys = standard_keys - code_keys
59 | print("Unused keys:", len(unused_keys))
60 | for unused_key in unused_keys:
61 |     print("\t", unused_key)
62 | 
63 | missing_keys = code_keys - standard_keys
64 | print("Missing keys:", len(missing_keys))
65 | for missing_key in missing_keys:
66 |     print("\t", missing_key)
67 | 
68 | code_keys_dict = OrderedDict()
69 | for s in strings:
70 |     code_keys_dict[s] = s
71 | 
72 | # write back
73 | with open(standard_file, "w", encoding="utf-8") as f:
74 |     json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True)
75 |     f.write("\n")
76 | 


--------------------------------------------------------------------------------
/tools/my_utils.py:
--------------------------------------------------------------------------------
 1 | import platform,os,traceback
 2 | import ffmpeg
 3 | import numpy as np
 4 | 
 5 | 
 6 | def load_audio(file, sr):
 7 |     try:
 8 |         # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
 9 |         # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
10 |         # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
11 |         file = clean_path(file)  # 防止小白拷路径头尾带了空格和"和回车
12 |         if os.path.exists(file) == False:
13 |             raise RuntimeError(
14 |                 "You input a wrong audio path that does not exists, please fix it!"
15 |             )
16 |         out, _ = (
17 |             ffmpeg.input(file, threads=0)
18 |             .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
19 |             .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
20 |         )
21 |     except Exception as e:
22 |         traceback.print_exc()
23 |         raise RuntimeError(f"Failed to load audio: {e}")
24 | 
25 |     return np.frombuffer(out, np.float32).flatten()
26 | 
27 | 
28 | def clean_path(path_str):
29 |     if platform.system() == 'Windows':
30 |         path_str = path_str.replace('/', '\\')
31 |     return path_str.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
32 | 


--------------------------------------------------------------------------------
/web.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/web.png


--------------------------------------------------------------------------------
/web/js/alertMSG.js:
--------------------------------------------------------------------------------
 1 | import { app } from "../../../scripts/app.js";
 2 | 
 3 | app.registerExtension({
 4 | 	name: "GPT_SOVITS.alertMSG",
 5 | 	async beforeRegisterNodeDef(nodeType, nodeData, app) {
 6 | 		if (nodeData?.name == "GPT_SOVITS_FT") {
 7 | 			nodeType.prototype.onExecuted = function (data) {
 8 | 				// alert("Success!you can find weights in:\n" + data.finetune[0] + "\n" + data.finetune[1] + "\n Now you can tts or inference");
 9 | 				let msg = "Success! you can find weights in:\n" + data.finetune[0] + "\n" + data.finetune[1] + "\n you'd like to reboot the server to use tts and inference?"
10 | 				if (confirm(msg)) {
11 | 					try {
12 | 						api.fetchApi("/gpt_sovits/reboot");
13 | 					}
14 | 					catch(exception) {
15 | 					}
16 | 				}
17 | 			}
18 | 		}
19 | 	},
20 | });


--------------------------------------------------------------------------------
/web/js/previewAudio.js:
--------------------------------------------------------------------------------
  1 | import { app } from "../../../scripts/app.js";
  2 | import { api } from '../../../scripts/api.js'
  3 | 
  4 | function fitHeight(node) {
  5 |     node.setSize([node.size[0], node.computeSize([node.size[0], node.size[1]])[1]])
  6 |     node?.graph?.setDirtyCanvas(true);
  7 | }
  8 | function chainCallback(object, property, callback) {
  9 |     if (object == undefined) {
 10 |         //This should not happen.
 11 |         console.error("Tried to add callback to non-existant object")
 12 |         return;
 13 |     }
 14 |     if (property in object) {
 15 |         const callback_orig = object[property]
 16 |         object[property] = function () {
 17 |             const r = callback_orig.apply(this, arguments);
 18 |             callback.apply(this, arguments);
 19 |             return r
 20 |         };
 21 |     } else {
 22 |         object[property] = callback;
 23 |     }
 24 | }
 25 | 
 26 | function addPreviewOptions(nodeType) {
 27 |     chainCallback(nodeType.prototype, "getExtraMenuOptions", function(_, options) {
 28 |         // The intended way of appending options is returning a list of extra options,
 29 |         // but this isn't used in widgetInputs.js and would require
 30 |         // less generalization of chainCallback
 31 |         let optNew = []
 32 |         try {
 33 |             const previewWidget = this.widgets.find((w) => w.name === "audiopreview");
 34 | 
 35 |             let url = null
 36 |             if (previewWidget.audioEl?.hidden == false && previewWidget.audioEl.src) {
 37 |                 //Use full quality audio
 38 |                 //url = api.apiURL('/view?' + new URLSearchParams(previewWidget.value.params));
 39 |                 url = previewWidget.audioEl.src
 40 |             }
 41 |             if (url) {
 42 |                 optNew.push(
 43 |                     {
 44 |                         content: "Open preview",
 45 |                         callback: () => {
 46 |                             window.open(url, "_blank")
 47 |                         },
 48 |                     },
 49 |                     {
 50 |                         content: "Save preview",
 51 |                         callback: () => {
 52 |                             const a = document.createElement("a");
 53 |                             a.href = url;
 54 |                             a.setAttribute("download", new URLSearchParams(previewWidget.value.params).get("filename"));
 55 |                             document.body.append(a);
 56 |                             a.click();
 57 |                             requestAnimationFrame(() => a.remove());
 58 |                         },
 59 |                     }
 60 |                 );
 61 |             }
 62 |             if(options.length > 0 && options[0] != null && optNew.length > 0) {
 63 |                 optNew.push(null);
 64 |             }
 65 |             options.unshift(...optNew);
 66 |             
 67 |         } catch (error) {
 68 |             console.log(error);
 69 |         }
 70 |         
 71 |     });
 72 | }
 73 | function previewAudio(node,file,type){
 74 |     var element = document.createElement("div");
 75 |     const previewNode = node;
 76 |     var previewWidget = node.addDOMWidget("audiopreview", "preview", element, {
 77 |         serialize: false,
 78 |         hideOnZoom: false,
 79 |         getValue() {
 80 |             return element.value;
 81 |         },
 82 |         setValue(v) {
 83 |             element.value = v;
 84 |         },
 85 |     });
 86 |     previewWidget.computeSize = function(width) {
 87 |         if (this.aspectRatio && !this.parentEl.hidden) {
 88 |             let height = (previewNode.size[0]-20)/ this.aspectRatio + 10;
 89 |             if (!(height > 0)) {
 90 |                 height = 0;
 91 |             }
 92 |             this.computedHeight = height + 10;
 93 |             return [width, height];
 94 |         }
 95 |         return [width, -4];//no loaded src, widget should not display
 96 |     }
 97 |     // element.style['pointer-events'] = "none"
 98 |     previewWidget.value = {hidden: false, paused: false, params: {}}
 99 |     previewWidget.parentEl = document.createElement("div");
100 |     previewWidget.parentEl.className = "audio_preview";
101 |     previewWidget.parentEl.style['width'] = "100%"
102 |     element.appendChild(previewWidget.parentEl);
103 |     previewWidget.audioEl = document.createElement("audio");
104 |     previewWidget.audioEl.controls = true;
105 |     previewWidget.audioEl.loop = false;
106 |     previewWidget.audioEl.muted = false;
107 |     previewWidget.audioEl.style['width'] = "100%"
108 |     previewWidget.audioEl.addEventListener("loadedmetadata", () => {
109 | 
110 |         previewWidget.aspectRatio = previewWidget.audioEl.audioWidth / previewWidget.audioEl.audioHeight;
111 |         fitHeight(this);
112 |     });
113 |     previewWidget.audioEl.addEventListener("error", () => {
114 |         //TODO: consider a way to properly notify the user why a preview isn't shown.
115 |         previewWidget.parentEl.hidden = true;
116 |         fitHeight(this);
117 |     });
118 | 
119 |     let params =  {
120 |         "filename": file,
121 |         "type": type,
122 |     }
123 |     
124 |     previewWidget.parentEl.hidden = previewWidget.value.hidden;
125 |     previewWidget.audioEl.autoplay = !previewWidget.value.paused && !previewWidget.value.hidden;
126 |     let target_width = 256
127 |     if (element.style?.width) {
128 |         //overscale to allow scrolling. Endpoint won't return higher than native
129 |         target_width = element.style.width.slice(0,-2)*2;
130 |     }
131 |     if (!params.force_size || params.force_size.includes("?") || params.force_size == "Disabled") {
132 |         params.force_size = target_width+"x?"
133 |     } else {
134 |         let size = params.force_size.split("x")
135 |         let ar = parseInt(size[0])/parseInt(size[1])
136 |         params.force_size = target_width+"x"+(target_width/ar)
137 |     }
138 |     
139 |     previewWidget.audioEl.src = api.apiURL('/view?' + new URLSearchParams(params));
140 | 
141 |     previewWidget.audioEl.hidden = false;
142 |     previewWidget.parentEl.appendChild(previewWidget.audioEl)
143 | }
144 | 
145 | app.registerExtension({
146 | 	name: "GPT_SOVITS.AudioPreviewer",
147 | 	async beforeRegisterNodeDef(nodeType, nodeData, app) {
148 | 		if (nodeData?.name == "PreViewAudio") {
149 | 			nodeType.prototype.onExecuted = function (data) {
150 | 				previewAudio(this, data.audio[0], data.audio[1]);
151 | 			}
152 |             addPreviewOptions(nodeType)
153 | 		}
154 | 	}
155 | });


--------------------------------------------------------------------------------
/web/js/refreshPath.js:
--------------------------------------------------------------------------------
  1 | import { app } from "../../../scripts/app.js";
  2 | import { api } from '../../../scripts/api.js'
  3 | import { ComfyWidgets } from "../../../scripts/widgets.js"
  4 | function rebootAPI() {
  5 | 	if (confirm("Are you sure you'd like to reboot the server to refresh weights path?")) {
  6 | 		try {
  7 | 			api.fetchApi("/gpt_sovits/reboot");
  8 | 		}
  9 | 		catch(exception) {
 10 | 
 11 | 		}
 12 | 		return true;
 13 | 	}
 14 | 
 15 | 	return false;
 16 | }
 17 | function pathRefresh(node, inputName, inputData, app) {
 18 |     const gptWidget = node.widgets.find((w) => w.name === "gpt_weight")
 19 |     const sovitsWidget = node.widgets.find((w) => w.name === "sovits_weight")
 20 |     /* 
 21 |     A method that returns the required style for the html 
 22 |     */
 23 |     var default_gpt_value = gptWidget.value;
 24 |     Object.defineProperty(gptWidget, "value", {
 25 |         set : function(value) {
 26 |             this._real_value = value;
 27 |         },
 28 | 
 29 |         get : function() {
 30 |             let value = "";
 31 |             if (this._real_value) {
 32 |                 value = this._real_value;
 33 |             } else {
 34 |                 return default_gpt_value;
 35 |             }
 36 | 
 37 |             if (value.filename) {
 38 |                 let real_value = value;
 39 |                 value = "";
 40 |                 if (real_value.subfolder) {
 41 |                     value = real_value.subfolder + "/";
 42 |                 }
 43 | 
 44 |                 value += real_value.filename;
 45 | 
 46 |                 if(real_value.type && real_value.type !== "input")
 47 |                     value += ` [${real_value.type}]`;
 48 |             }
 49 |             return value;
 50 |         }
 51 |     });
 52 | 
 53 |     var default_sovits_value = sovitsWidget.value;
 54 |     Object.defineProperty(sovitsWidget, "value", {
 55 |         set : function(value) {
 56 |             this._real_value = value;
 57 |         },
 58 | 
 59 |         get : function() {
 60 |             let value = "";
 61 |             if (this._real_value) {
 62 |                 value = this._real_value;
 63 |             } else {
 64 |                 return default_sovits_value;
 65 |             }
 66 | 
 67 |             if (value.filename) {
 68 |                 let real_value = value;
 69 |                 value = "";
 70 |                 if (real_value.subfolder) {
 71 |                     value = real_value.subfolder + "/";
 72 |                 }
 73 | 
 74 |                 value += real_value.filename;
 75 | 
 76 |                 if(real_value.type && real_value.type !== "input")
 77 |                     value += ` [${real_value.type}]`;
 78 |             }
 79 |             return value;
 80 |         }
 81 |     });
 82 | 
 83 |     // Create the button widget for selecting the files
 84 |     let refreshWidget = node.addWidget("button", "REBOOT TO REFRESH WEIGHTS LIST", "refresh", () => {
 85 |         rebootAPI()
 86 |     });
 87 | 
 88 |     refreshWidget.serialize = false;
 89 | 
 90 |     const cb = node.callback;
 91 |     gptWidget.callback = function () {
 92 |         if (cb) {
 93 |             return cb.apply(this, arguments);
 94 |         }
 95 |     };
 96 |     sovitsWidget.callback = function () {
 97 |         if (cb) {
 98 |             return cb.apply(this, arguments);
 99 |         }
100 |     };
101 | 
102 |     return { widget: refreshWidget };
103 | }
104 | ComfyWidgets.PATHREFRESH = pathRefresh;
105 | 
106 | app.registerExtension({
107 | 	name: "GPT_SOVITS.RefreshPath",
108 | 	async beforeRegisterNodeDef(nodeType, nodeData, app) {
109 | 		if (nodeData?.name == "GPT_SOVITS_TTS") {
110 | 			nodeData.input.required.upload = ["PATHREFRESH"];
111 | 		}
112 | 
113 |         if (nodeData?.name == "GPT_SOVITS_INFER") {
114 | 			nodeData.input.required.upload = ["PATHREFRESH"];
115 | 		}
116 | 	},
117 | });


--------------------------------------------------------------------------------
/web/js/uploadSRT.js:
--------------------------------------------------------------------------------
  1 | import { app } from "../../../scripts/app.js";
  2 | import { api } from '../../../scripts/api.js'
  3 | import { ComfyWidgets } from "../../../scripts/widgets.js"
  4 | 
  5 | function srtUpload(node, inputName, inputData, app) {
  6 |     const srtWidget = node.widgets.find((w) => w.name === "srt");
  7 |     let uploadWidget;
  8 |     /* 
  9 |     A method that returns the required style for the html 
 10 |     */
 11 |     var default_value = srtWidget.value;
 12 |     Object.defineProperty(srtWidget, "value", {
 13 |         set : function(value) {
 14 |             this._real_value = value;
 15 |         },
 16 | 
 17 |         get : function() {
 18 |             let value = "";
 19 |             if (this._real_value) {
 20 |                 value = this._real_value;
 21 |             } else {
 22 |                 return default_value;
 23 |             }
 24 | 
 25 |             if (value.filename) {
 26 |                 let real_value = value;
 27 |                 value = "";
 28 |                 if (real_value.subfolder) {
 29 |                     value = real_value.subfolder + "/";
 30 |                 }
 31 | 
 32 |                 value += real_value.filename;
 33 | 
 34 |                 if(real_value.type && real_value.type !== "input")
 35 |                     value += ` [${real_value.type}]`;
 36 |             }
 37 |             return value;
 38 |         }
 39 |     });
 40 |     async function uploadFile(file, updateNode, pasted = false) {
 41 |         try {
 42 |             // Wrap file in formdata so it includes filename
 43 |             const body = new FormData();
 44 |             body.append("image", file);
 45 |             if (pasted) body.append("subfolder", "pasted");
 46 |             const resp = await api.fetchApi("/upload/image", {
 47 |                 method: "POST",
 48 |                 body,
 49 |             });
 50 | 
 51 |             if (resp.status === 200) {
 52 |                 const data = await resp.json();
 53 |                 // Add the file to the dropdown list and update the widget value
 54 |                 let path = data.name;
 55 |                 if (data.subfolder) path = data.subfolder + "/" + path;
 56 | 
 57 |                 if (!srtWidget.options.values.includes(path)) {
 58 |                     srtWidget.options.values.push(path);
 59 |                 }
 60 | 
 61 |                 if (updateNode) {
 62 |                     srtWidget.value = path;
 63 |                 }
 64 |             } else {
 65 |                 alert(resp.status + " - " + resp.statusText);
 66 |             }
 67 |         } catch (error) {
 68 |             alert(error);
 69 |         }
 70 |     }
 71 | 
 72 |     const fileInput = document.createElement("input");
 73 |     Object.assign(fileInput, {
 74 |         type: "file",
 75 |         accept: "file/srt,file/txt",
 76 |         style: "display: none",
 77 |         onchange: async () => {
 78 |             if (fileInput.files.length) {
 79 |                 await uploadFile(fileInput.files[0], true);
 80 |             }
 81 |         },
 82 |     });
 83 |     document.body.append(fileInput);
 84 | 
 85 |     // Create the button widget for selecting the files
 86 |     uploadWidget = node.addWidget("button", "choose srt file to upload", "Audio", () => {
 87 |         fileInput.click();
 88 |     });
 89 | 
 90 |     uploadWidget.serialize = false;
 91 | 
 92 |     const cb = node.callback;
 93 |     srtWidget.callback = function () {
 94 |         if (cb) {
 95 |             return cb.apply(this, arguments);
 96 |         }
 97 |     };
 98 | 
 99 |     return { widget: uploadWidget };
100 | }
101 | 
102 | ComfyWidgets.SRTPLOAD = srtUpload;
103 | 
104 | app.registerExtension({
105 | 	name: "GPT_SOVITS.UploadSRT",
106 | 	async beforeRegisterNodeDef(nodeType, nodeData, app) {
107 | 		if (nodeData?.name == "LoadSRT") {
108 | 			nodeData.input.required.upload = ["SRTPLOAD"];
109 | 		}
110 | 	},
111 | });
112 | 
113 | 


--------------------------------------------------------------------------------
/wechat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/wechat.jpg


--------------------------------------------------------------------------------