├── .gitignore ├── 1key.jpg ├── GPT_SoVITS ├── AR │ ├── __init__.py │ ├── __pycache__ │ │ └── __init__.cpython-310.pyc │ ├── data │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── bucket_sampler.cpython-310.pyc │ │ │ ├── data_module.cpython-310.pyc │ │ │ └── dataset.cpython-310.pyc │ │ ├── bucket_sampler.py │ │ ├── data_module.py │ │ └── dataset.py │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── t2s_lightning_module.cpython-310.pyc │ │ │ ├── t2s_model.cpython-310.pyc │ │ │ └── utils.cpython-310.pyc │ │ ├── t2s_lightning_module.py │ │ ├── t2s_lightning_module_onnx.py │ │ ├── t2s_model.py │ │ ├── t2s_model_onnx.py │ │ └── utils.py │ ├── modules │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── activation.cpython-310.pyc │ │ │ ├── embedding.cpython-310.pyc │ │ │ ├── lr_schedulers.cpython-310.pyc │ │ │ ├── optim.cpython-310.pyc │ │ │ ├── patched_mha_with_cache.cpython-310.pyc │ │ │ ├── scaling.cpython-310.pyc │ │ │ └── transformer.cpython-310.pyc │ │ ├── activation.py │ │ ├── activation_onnx.py │ │ ├── embedding.py │ │ ├── embedding_onnx.py │ │ ├── lr_schedulers.py │ │ ├── optim.py │ │ ├── patched_mha_with_cache.py │ │ ├── patched_mha_with_cache_onnx.py │ │ ├── scaling.py │ │ ├── transformer.py │ │ └── transformer_onnx.py │ ├── text_processing │ │ ├── __init__.py │ │ ├── phonemizer.py │ │ └── symbols.py │ └── utils │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ └── io.cpython-310.pyc │ │ ├── initialize.py │ │ └── io.py ├── __pycache__ │ ├── my_utils.cpython-310.pyc │ ├── process_ckpt.cpython-310.pyc │ └── utils.cpython-310.pyc ├── configs │ ├── s1.yaml │ ├── s1big.yaml │ ├── s1big2.yaml │ ├── s1longer.yaml │ ├── s1mq.yaml │ ├── s2.json │ └── train.yaml ├── feature_extractor │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── cnhubert.cpython-310.pyc │ │ └── whisper_enc.cpython-310.pyc │ ├── cnhubert.py │ └── whisper_enc.py ├── inference_gui.py ├── inference_webui.py ├── module │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── attentions.cpython-310.pyc │ │ ├── commons.cpython-310.pyc │ │ ├── core_vq.cpython-310.pyc │ │ ├── data_utils.cpython-310.pyc │ │ ├── losses.cpython-310.pyc │ │ ├── mel_processing.cpython-310.pyc │ │ ├── models.cpython-310.pyc │ │ ├── modules.cpython-310.pyc │ │ ├── mrte_model.cpython-310.pyc │ │ ├── quantize.cpython-310.pyc │ │ └── transforms.cpython-310.pyc │ ├── attentions.py │ ├── attentions_onnx.py │ ├── commons.py │ ├── core_vq.py │ ├── data_utils.py │ ├── losses.py │ ├── mel_processing.py │ ├── models.py │ ├── models_onnx.py │ ├── modules.py │ ├── mrte_model.py │ ├── quantize.py │ └── transforms.py ├── my_utils.py ├── onnx_export.py ├── prepare_datasets │ ├── 1-get-text.py │ ├── 2-get-hubert-wav32k.py │ └── 3-get-semantic.py ├── process_ckpt.py ├── s1_train.py ├── s2_train.py ├── text │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── chinese.cpython-310.pyc │ │ ├── cleaner.cpython-310.pyc │ │ ├── english.cpython-310.pyc │ │ ├── japanese.cpython-310.pyc │ │ ├── symbols.cpython-310.pyc │ │ └── tone_sandhi.cpython-310.pyc │ ├── chinese.py │ ├── cleaner.py │ ├── cmudict-fast.rep │ ├── cmudict.rep │ ├── engdict-hot.rep │ ├── engdict_cache.pickle │ ├── english.py │ ├── japanese.py │ ├── namedict_cache.pickle │ ├── opencpop-strict.txt │ ├── symbols.py │ ├── tone_sandhi.py │ └── zh_normalization │ │ ├── README.md │ │ ├── __init__.py │ │ ├── 
__pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── char_convert.cpython-310.pyc │ │ ├── chronology.cpython-310.pyc │ │ ├── constants.cpython-310.pyc │ │ ├── num.cpython-310.pyc │ │ ├── phonecode.cpython-310.pyc │ │ ├── quantifier.cpython-310.pyc │ │ └── text_normlization.cpython-310.pyc │ │ ├── char_convert.py │ │ ├── chronology.py │ │ ├── constants.py │ │ ├── num.py │ │ ├── phonecode.py │ │ ├── quantifier.py │ │ └── text_normlization.py └── utils.py ├── LICENSE ├── README.md ├── __init__.py ├── config.py ├── donate.jpg ├── finetune.py ├── inference.py ├── nodes.py ├── note.txt ├── requirements.txt ├── tools ├── __pycache__ │ └── my_utils.cpython-310.pyc ├── i18n │ ├── __pycache__ │ │ └── i18n.cpython-310.pyc │ ├── i18n.py │ ├── locale │ │ ├── en_US.json │ │ ├── es_ES.json │ │ ├── fr_FR.json │ │ ├── it_IT.json │ │ ├── ja_JP.json │ │ ├── ko_KR.json │ │ ├── ru_RU.json │ │ ├── tr_TR.json │ │ ├── zh_CN.json │ │ ├── zh_HK.json │ │ ├── zh_SG.json │ │ └── zh_TW.json │ ├── locale_diff.py │ └── scan_i18n.py └── my_utils.py ├── web.png ├── web └── js │ ├── alertMSG.js │ ├── previewAudio.js │ ├── refreshPath.js │ ├── uploadAudio.js │ └── uploadSRT.js └── wechat.jpg /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | /pretrained_models 3 | /logs 4 | -------------------------------------------------------------------------------- /1key.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/1key.jpg -------------------------------------------------------------------------------- /GPT_SoVITS/AR/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/data/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/data/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/__pycache__/bucket_sampler.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/data/__pycache__/bucket_sampler.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/__pycache__/data_module.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/data/__pycache__/data_module.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/__pycache__/dataset.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/data/__pycache__/dataset.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/bucket_sampler.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/bucket_sampler.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import itertools 4 | import math 5 | import random 6 | from random import shuffle 7 | from typing import Iterator 8 | from typing import Optional 9 | from typing import TypeVar 10 | 11 | import torch 12 | import torch.distributed as dist 13 | from torch.utils.data import Dataset 14 | from torch.utils.data import Sampler 15 | 16 | __all__ = [ 17 | "DistributedBucketSampler", 18 | ] 19 | 20 | T_co = TypeVar("T_co", covariant=True) 21 | 22 | 23 | class DistributedBucketSampler(Sampler[T_co]): 24 | r""" 25 | sort the dataset wrt. input length 26 | divide samples into buckets 27 | sort within buckets 28 | divide buckets into batches 29 | sort batches 30 | """ 31 | 32 | def __init__( 33 | self, 34 | dataset: Dataset, 35 | num_replicas: Optional[int] = None, 36 | rank: Optional[int] = None, 37 | shuffle: bool = True, 38 | seed: int = 0, 39 | drop_last: bool = False, 40 | batch_size: int = 32, 41 | ) -> None: 42 | if num_replicas is None: 43 | if not dist.is_available(): 44 | raise RuntimeError("Requires distributed package to be available") 45 | num_replicas = dist.get_world_size() if torch.cuda.is_available() else 1 46 | if rank is None: 47 | if not dist.is_available(): 48 | raise RuntimeError("Requires distributed package to be available") 49 | rank = dist.get_rank() if torch.cuda.is_available() else 0 50 | if torch.cuda.is_available(): 51 | torch.cuda.set_device(rank) 52 | if rank >= num_replicas or rank < 0: 53 | raise ValueError( 54 | "Invalid rank {}, rank should be in the interval" 55 | " [0, {}]".format(rank, num_replicas - 1) 56 | ) 57 | self.dataset = dataset 58 | self.num_replicas = num_replicas 59 | self.rank = rank 60 | self.epoch = 0 61 | self.drop_last = drop_last 62 | # If the dataset length is evenly divisible by # of replicas, then there 63 | # is no need to drop any data, since the dataset will be split equally. 64 | if ( 65 | self.drop_last and len(self.dataset) % self.num_replicas != 0 66 | ): # type: ignore[arg-type] 67 | # Split to nearest available length that is evenly divisible. 68 | # This is to ensure each rank receives the same amount of data when 69 | # using this Sampler. 
70 | self.num_samples = math.ceil( 71 | (len(self.dataset) - self.num_replicas) 72 | / self.num_replicas # type: ignore[arg-type] 73 | ) 74 | else: 75 | self.num_samples = math.ceil( 76 | len(self.dataset) / self.num_replicas 77 | ) # type: ignore[arg-type] 78 | self.total_size = self.num_samples * self.num_replicas 79 | self.shuffle = shuffle 80 | self.seed = seed 81 | self.batch_size = batch_size 82 | self.id_with_length = self._get_sample_lengths() 83 | self.id_buckets = self.make_buckets(bucket_width=2.0) 84 | 85 | def _get_sample_lengths(self): 86 | id_with_lengths = [] 87 | for i in range(len(self.dataset)): 88 | id_with_lengths.append((i, self.dataset.get_sample_length(i))) 89 | id_with_lengths.sort(key=lambda x: x[1]) 90 | return id_with_lengths 91 | 92 | def make_buckets(self, bucket_width: float = 2.0): 93 | buckets = [] 94 | cur = [] 95 | max_sec = bucket_width 96 | for id, sec in self.id_with_length: 97 | if sec < max_sec: 98 | cur.append(id) 99 | else: 100 | buckets.append(cur) 101 | cur = [id] 102 | max_sec += bucket_width 103 | if len(cur) > 0: 104 | buckets.append(cur) 105 | return buckets 106 | 107 | def __iter__(self) -> Iterator[T_co]: 108 | if self.shuffle: 109 | # deterministically shuffle based on epoch and seed 110 | g = torch.Generator() 111 | g.manual_seed(self.seed + self.epoch) 112 | random.seed(self.epoch + self.seed) 113 | shuffled_bucket = [] 114 | for buc in self.id_buckets: 115 | buc_copy = buc.copy() 116 | shuffle(buc_copy) 117 | shuffled_bucket.append(buc_copy) 118 | grouped_batch_size = self.batch_size * self.num_replicas 119 | shuffled_bucket = list(itertools.chain(*shuffled_bucket)) 120 | n_batch = int(math.ceil(len(shuffled_bucket) / grouped_batch_size)) 121 | batches = [ 122 | shuffled_bucket[b * grouped_batch_size : (b + 1) * grouped_batch_size] 123 | for b in range(n_batch) 124 | ] 125 | shuffle(batches) 126 | indices = list(itertools.chain(*batches)) 127 | else: 128 | # type: ignore[arg-type] 129 | indices = list(range(len(self.dataset))) 130 | 131 | if not self.drop_last: 132 | # add extra samples to make it evenly divisible 133 | padding_size = self.total_size - len(indices) 134 | if padding_size <= len(indices): 135 | indices += indices[:padding_size] 136 | else: 137 | indices += (indices * math.ceil(padding_size / len(indices)))[ 138 | :padding_size 139 | ] 140 | else: 141 | # remove tail of data to make it evenly divisible. 142 | indices = indices[: self.total_size] 143 | assert len(indices) == self.total_size 144 | 145 | # subsample 146 | indices = indices[self.rank : self.total_size : self.num_replicas] 147 | assert len(indices) == self.num_samples 148 | 149 | return iter(indices) 150 | 151 | def __len__(self) -> int: 152 | return self.num_samples 153 | 154 | def set_epoch(self, epoch: int) -> None: 155 | r""" 156 | Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas 157 | use a different random ordering for each epoch. Otherwise, the next iteration of this 158 | sampler will yield the same ordering. 159 | 160 | Args: 161 | epoch (int): Epoch number. 
162 | """ 163 | self.epoch = epoch 164 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/data_module.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | from pytorch_lightning import LightningDataModule 4 | from AR.data.bucket_sampler import DistributedBucketSampler 5 | from AR.data.dataset import Text2SemanticDataset 6 | from torch.utils.data import DataLoader 7 | 8 | 9 | class Text2SemanticDataModule(LightningDataModule): 10 | def __init__( 11 | self, 12 | config, 13 | train_semantic_path, 14 | train_phoneme_path, 15 | dev_semantic_path=None, 16 | dev_phoneme_path=None, 17 | ): 18 | super().__init__() 19 | self.config = config 20 | self.train_semantic_path = train_semantic_path 21 | self.train_phoneme_path = train_phoneme_path 22 | self.dev_semantic_path = dev_semantic_path 23 | self.dev_phoneme_path = dev_phoneme_path 24 | self.num_workers = self.config["data"]["num_workers"] 25 | 26 | def prepare_data(self): 27 | pass 28 | 29 | def setup(self, stage=None, output_logs=False): 30 | self._train_dataset = Text2SemanticDataset( 31 | phoneme_path=self.train_phoneme_path, 32 | semantic_path=self.train_semantic_path, 33 | max_sec=self.config["data"]["max_sec"], 34 | pad_val=self.config["data"]["pad_val"], 35 | ) 36 | self._dev_dataset = self._train_dataset 37 | # self._dev_dataset = Text2SemanticDataset( 38 | # phoneme_path=self.dev_phoneme_path, 39 | # semantic_path=self.dev_semantic_path, 40 | # max_sample=self.config['data']['max_eval_sample'], 41 | # max_sec=self.config['data']['max_sec'], 42 | # pad_val=self.config['data']['pad_val']) 43 | 44 | def train_dataloader(self): 45 | batch_size=self.config["train"]["batch_size"]//2 if self.config["train"].get("if_dpo",False)==True else self.config["train"]["batch_size"] 46 | batch_size = max(min(batch_size,len(self._train_dataset)//4),1)#防止不保存 47 | sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) 48 | return DataLoader( 49 | self._train_dataset, 50 | batch_size=batch_size, 51 | sampler=sampler, 52 | collate_fn=self._train_dataset.collate, 53 | num_workers=self.num_workers, 54 | persistent_workers=True, 55 | prefetch_factor=16, 56 | ) 57 | 58 | def val_dataloader(self): 59 | return DataLoader( 60 | self._dev_dataset, 61 | batch_size=1, 62 | shuffle=False, 63 | collate_fn=self._train_dataset.collate, 64 | num_workers=max(self.num_workers, 12), 65 | persistent_workers=True, 66 | prefetch_factor=16, 67 | ) 68 | 69 | # 这个会使用到嘛? 
70 | def test_dataloader(self): 71 | return DataLoader( 72 | self._dev_dataset, 73 | batch_size=1, 74 | shuffle=False, 75 | collate_fn=self._train_dataset.collate, 76 | ) 77 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/models/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/models/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/__pycache__/t2s_lightning_module.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/models/__pycache__/t2s_lightning_module.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/__pycache__/t2s_model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/models/__pycache__/t2s_model.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/models/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/t2s_lightning_module.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import os, sys 4 | 5 | now_dir = os.getcwd() 6 | sys.path.append(now_dir) 7 | from typing import Dict 8 | 9 | import torch 10 | from pytorch_lightning import LightningModule 11 | from AR.models.t2s_model import Text2SemanticDecoder 12 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule 13 | from AR.modules.optim import ScaledAdam 14 | 15 | class Text2SemanticLightningModule(LightningModule): 16 | def __init__(self, config, output_dir, is_train=True): 17 | super().__init__() 18 | self.config = config 19 | self.top_k = 3 20 | self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) 21 | pretrained_s1 = config.get("pretrained_s1") 22 | if pretrained_s1 and is_train: 23 | # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) 24 | print( 25 | self.load_state_dict( 26 | torch.load(pretrained_s1, map_location="cpu")["weight"] 27 | ) 28 | ) 29 | if is_train: 30 | self.automatic_optimization = False 31 | self.save_hyperparameters() 32 | self.eval_dir = output_dir / "eval" 33 | self.eval_dir.mkdir(parents=True, exist_ok=True) 34 | 35 | def 
training_step(self, batch: Dict, batch_idx: int): 36 | opt = self.optimizers() 37 | scheduler = self.lr_schedulers() 38 | forward=self.model.forward if self.config["train"].get("if_dpo",False)==True else self.model.forward_old 39 | loss, acc = forward( 40 | batch["phoneme_ids"], 41 | batch["phoneme_ids_len"], 42 | batch["semantic_ids"], 43 | batch["semantic_ids_len"], 44 | batch["bert_feature"], 45 | ) 46 | self.manual_backward(loss) 47 | if batch_idx > 0 and batch_idx % 4 == 0: 48 | opt.step() 49 | opt.zero_grad() 50 | scheduler.step() 51 | 52 | self.log( 53 | "total_loss", 54 | loss, 55 | on_step=True, 56 | on_epoch=True, 57 | prog_bar=True, 58 | sync_dist=True, 59 | ) 60 | self.log( 61 | "lr", 62 | scheduler.get_last_lr()[0], 63 | on_epoch=True, 64 | prog_bar=True, 65 | sync_dist=True, 66 | ) 67 | self.log( 68 | f"top_{self.top_k}_acc", 69 | acc, 70 | on_step=True, 71 | on_epoch=True, 72 | prog_bar=True, 73 | sync_dist=True, 74 | ) 75 | 76 | def validation_step(self, batch: Dict, batch_idx: int): 77 | return 78 | 79 | # # get loss 80 | # loss, acc = self.model.forward( 81 | # batch['phoneme_ids'], batch['phoneme_ids_len'], 82 | # batch['semantic_ids'], batch['semantic_ids_len'], 83 | # batch['bert_feature'] 84 | # ) 85 | # 86 | # self.log( 87 | # "val_total_loss", 88 | # loss, 89 | # on_step=True, 90 | # on_epoch=True, 91 | # prog_bar=True, 92 | # sync_dist=True) 93 | # self.log( 94 | # f"val_top_{self.top_k}_acc", 95 | # acc, 96 | # on_step=True, 97 | # on_epoch=True, 98 | # prog_bar=True, 99 | # sync_dist=True) 100 | # 101 | # # get infer output 102 | # semantic_len = batch['semantic_ids'].size(1) 103 | # prompt_len = min(int(semantic_len * 0.5), 150) 104 | # prompt = batch['semantic_ids'][:, :prompt_len] 105 | # pred_semantic = self.model.infer(batch['phoneme_ids'], 106 | # batch['phoneme_ids_len'], prompt, 107 | # batch['bert_feature'] 108 | # ) 109 | # save_name = f'semantic_toks_{batch_idx}.pt' 110 | # save_path = os.path.join(self.eval_dir, save_name) 111 | # torch.save(pred_semantic.detach().cpu(), save_path) 112 | 113 | def configure_optimizers(self): 114 | model_parameters = self.model.parameters() 115 | parameters_names = [] 116 | parameters_names.append( 117 | [name_param_pair[0] for name_param_pair in self.model.named_parameters()] 118 | ) 119 | lm_opt = ScaledAdam( 120 | model_parameters, 121 | lr=0.01, 122 | betas=(0.9, 0.95), 123 | clipping_scale=2.0, 124 | parameters_names=parameters_names, 125 | show_dominant_parameters=False, 126 | clipping_update_period=1000, 127 | ) 128 | 129 | return { 130 | "optimizer": lm_opt, 131 | "lr_scheduler": { 132 | "scheduler": WarmupCosineLRSchedule( 133 | lm_opt, 134 | init_lr=self.config["optimizer"]["lr_init"], 135 | peak_lr=self.config["optimizer"]["lr"], 136 | end_lr=self.config["optimizer"]["lr_end"], 137 | warmup_steps=self.config["optimizer"]["warmup_steps"], 138 | total_steps=self.config["optimizer"]["decay_steps"], 139 | ) 140 | }, 141 | } 142 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import os, sys 4 | 5 | now_dir = os.getcwd() 6 | sys.path.append(now_dir) 7 | from typing import Dict 8 | 9 | import torch 10 | from pytorch_lightning import LightningModule 11 | from 
AR.models.t2s_model_onnx import Text2SemanticDecoder 12 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule 13 | from AR.modules.optim import ScaledAdam 14 | 15 | 16 | class Text2SemanticLightningModule(LightningModule): 17 | def __init__(self, config, output_dir, is_train=True): 18 | super().__init__() 19 | self.config = config 20 | self.top_k = 3 21 | self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) 22 | pretrained_s1 = config.get("pretrained_s1") 23 | if pretrained_s1 and is_train: 24 | # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) 25 | print( 26 | self.load_state_dict( 27 | torch.load(pretrained_s1, map_location="cpu")["weight"] 28 | ) 29 | ) 30 | if is_train: 31 | self.automatic_optimization = False 32 | self.save_hyperparameters() 33 | self.eval_dir = output_dir / "eval" 34 | self.eval_dir.mkdir(parents=True, exist_ok=True) 35 | 36 | def training_step(self, batch: Dict, batch_idx: int): 37 | opt = self.optimizers() 38 | scheduler = self.lr_schedulers() 39 | loss, acc = self.model.forward( 40 | batch["phoneme_ids"], 41 | batch["phoneme_ids_len"], 42 | batch["semantic_ids"], 43 | batch["semantic_ids_len"], 44 | batch["bert_feature"], 45 | ) 46 | self.manual_backward(loss) 47 | if batch_idx > 0 and batch_idx % 4 == 0: 48 | opt.step() 49 | opt.zero_grad() 50 | scheduler.step() 51 | 52 | self.log( 53 | "total_loss", 54 | loss, 55 | on_step=True, 56 | on_epoch=True, 57 | prog_bar=True, 58 | sync_dist=True, 59 | ) 60 | self.log( 61 | "lr", 62 | scheduler.get_last_lr()[0], 63 | on_epoch=True, 64 | prog_bar=True, 65 | sync_dist=True, 66 | ) 67 | self.log( 68 | f"top_{self.top_k}_acc", 69 | acc, 70 | on_step=True, 71 | on_epoch=True, 72 | prog_bar=True, 73 | sync_dist=True, 74 | ) 75 | 76 | def validation_step(self, batch: Dict, batch_idx: int): 77 | return 78 | 79 | def configure_optimizers(self): 80 | model_parameters = self.model.parameters() 81 | parameters_names = [] 82 | parameters_names.append( 83 | [name_param_pair[0] for name_param_pair in self.model.named_parameters()] 84 | ) 85 | lm_opt = ScaledAdam( 86 | model_parameters, 87 | lr=0.01, 88 | betas=(0.9, 0.95), 89 | clipping_scale=2.0, 90 | parameters_names=parameters_names, 91 | show_dominant_parameters=False, 92 | clipping_update_period=1000, 93 | ) 94 | 95 | return { 96 | "optimizer": lm_opt, 97 | "lr_scheduler": { 98 | "scheduler": WarmupCosineLRSchedule( 99 | lm_opt, 100 | init_lr=self.config["optimizer"]["lr_init"], 101 | peak_lr=self.config["optimizer"]["lr"], 102 | end_lr=self.config["optimizer"]["lr_end"], 103 | warmup_steps=self.config["optimizer"]["warmup_steps"], 104 | total_steps=self.config["optimizer"]["decay_steps"], 105 | ) 106 | }, 107 | } 108 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- 
/GPT_SoVITS/AR/modules/__pycache__/activation.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__pycache__/activation.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/__pycache__/embedding.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__pycache__/embedding.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/__pycache__/lr_schedulers.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__pycache__/lr_schedulers.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/__pycache__/optim.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__pycache__/optim.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/__pycache__/patched_mha_with_cache.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__pycache__/patched_mha_with_cache.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/__pycache__/scaling.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__pycache__/scaling.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/__pycache__/transformer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/modules/__pycache__/transformer.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/activation_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py 2 | from typing import Optional 3 | from typing import Tuple 4 | import torch 5 | from torch import Tensor 6 | from torch.nn import Linear 7 | from torch.nn import Module 8 | from torch.nn.init import constant_ 9 | from torch.nn.init import xavier_normal_ 10 | from torch.nn.init import xavier_uniform_ 11 | from torch.nn.modules.linear import NonDynamicallyQuantizableLinear 12 | from torch.nn.parameter import Parameter 13 | 14 | from torch.nn import functional as F 15 | from AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched 16 | 17 | 18 | class MultiheadAttention(Module): 19 | 
__constants__ = ["batch_first"] 20 | bias_k: Optional[torch.Tensor] 21 | bias_v: Optional[torch.Tensor] 22 | 23 | def __init__( 24 | self, 25 | embed_dim, 26 | num_heads, 27 | dropout=0.0, 28 | bias=True, 29 | add_bias_kv=False, 30 | add_zero_attn=False, 31 | kdim=None, 32 | vdim=None, 33 | batch_first=False, 34 | linear1_cls=Linear, 35 | linear2_cls=Linear, 36 | device=None, 37 | dtype=None, 38 | ) -> None: 39 | factory_kwargs = {"device": device, "dtype": dtype} 40 | super(MultiheadAttention, self).__init__() 41 | self.embed_dim = embed_dim 42 | self.kdim = kdim if kdim is not None else embed_dim 43 | self.vdim = vdim if vdim is not None else embed_dim 44 | self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim 45 | 46 | self.num_heads = num_heads 47 | self.dropout = dropout 48 | self.batch_first = batch_first 49 | self.head_dim = embed_dim // num_heads 50 | assert ( 51 | self.head_dim * num_heads == self.embed_dim 52 | ), "embed_dim must be divisible by num_heads" 53 | 54 | if add_bias_kv: 55 | self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) 56 | self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) 57 | else: 58 | self.bias_k = self.bias_v = None 59 | 60 | if linear1_cls == Linear: 61 | if not self._qkv_same_embed_dim: 62 | self.q_proj_weight = Parameter( 63 | torch.empty((embed_dim, embed_dim), **factory_kwargs) 64 | ) 65 | self.k_proj_weight = Parameter( 66 | torch.empty((embed_dim, self.kdim), **factory_kwargs) 67 | ) 68 | self.v_proj_weight = Parameter( 69 | torch.empty((embed_dim, self.vdim), **factory_kwargs) 70 | ) 71 | self.register_parameter("in_proj_weight", None) 72 | else: 73 | self.in_proj_weight = Parameter( 74 | torch.empty((3 * embed_dim, embed_dim), **factory_kwargs) 75 | ) 76 | self.register_parameter("q_proj_weight", None) 77 | self.register_parameter("k_proj_weight", None) 78 | self.register_parameter("v_proj_weight", None) 79 | 80 | if bias: 81 | self.in_proj_bias = Parameter( 82 | torch.empty(3 * embed_dim, **factory_kwargs) 83 | ) 84 | else: 85 | self.register_parameter("in_proj_bias", None) 86 | self.out_proj = NonDynamicallyQuantizableLinear( 87 | embed_dim, embed_dim, bias=bias, **factory_kwargs 88 | ) 89 | 90 | self._reset_parameters() 91 | else: 92 | if not self._qkv_same_embed_dim: 93 | raise NotImplementedError 94 | else: 95 | self.in_proj_linear = linear1_cls( 96 | embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs 97 | ) 98 | self.in_proj_weight = self.in_proj_linear.weight 99 | 100 | self.register_parameter("q_proj_weight", None) 101 | self.register_parameter("k_proj_weight", None) 102 | self.register_parameter("v_proj_weight", None) 103 | 104 | if bias: 105 | self.in_proj_bias = self.in_proj_linear.bias 106 | else: 107 | self.register_parameter("in_proj_bias", None) 108 | 109 | self.out_proj = linear2_cls( 110 | embed_dim, embed_dim, bias=bias, **factory_kwargs 111 | ) 112 | 113 | if self.bias_k is not None: 114 | xavier_normal_(self.bias_k) 115 | if self.bias_v is not None: 116 | xavier_normal_(self.bias_v) 117 | 118 | self.add_zero_attn = add_zero_attn 119 | 120 | def _reset_parameters(self): 121 | if self._qkv_same_embed_dim: 122 | xavier_uniform_(self.in_proj_weight) 123 | else: 124 | xavier_uniform_(self.q_proj_weight) 125 | xavier_uniform_(self.k_proj_weight) 126 | xavier_uniform_(self.v_proj_weight) 127 | 128 | if self.in_proj_bias is not None: 129 | constant_(self.in_proj_bias, 0.0) 130 | constant_(self.out_proj.bias, 0.0) 131 | 132 | if self.bias_k is not None: 133 | 
xavier_normal_(self.bias_k) 134 | if self.bias_v is not None: 135 | xavier_normal_(self.bias_v) 136 | 137 | def __setstate__(self, state): 138 | # Support loading old MultiheadAttention checkpoints generated by v1.1.0 139 | if "_qkv_same_embed_dim" not in state: 140 | state["_qkv_same_embed_dim"] = True 141 | 142 | super(MultiheadAttention, self).__setstate__(state) 143 | 144 | def forward( 145 | self, 146 | query: Tensor, 147 | key: Tensor, 148 | value: Tensor, 149 | key_padding_mask: Optional[Tensor] = None, 150 | need_weights: bool = True, 151 | attn_mask: Optional[Tensor] = None, 152 | average_attn_weights: bool = True, 153 | cache=None, 154 | ) -> Tuple[Tensor, Optional[Tensor]]: 155 | any_nested = query.is_nested or key.is_nested or value.is_nested 156 | query = key = value = query.transpose(1, 0) 157 | attn_output = multi_head_attention_forward_patched( 158 | query, 159 | key, 160 | value, 161 | self.embed_dim, 162 | self.num_heads, 163 | self.in_proj_weight, 164 | self.in_proj_bias, 165 | self.bias_k, 166 | self.bias_v, 167 | self.add_zero_attn, 168 | self.dropout, 169 | self.out_proj.weight, 170 | self.out_proj.bias, 171 | training=self.training, 172 | key_padding_mask=key_padding_mask, 173 | need_weights=need_weights, 174 | attn_mask=attn_mask, 175 | average_attn_weights=average_attn_weights, 176 | cache=cache, 177 | ) 178 | return attn_output.transpose(1, 0) 179 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/embedding.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | 50 | self.reverse = False 51 | self.pe = None 52 | self.extend_pe(torch.tensor(0.0).expand(1, 4000)) 53 | 54 | def extend_pe(self, x): 55 | """Reset the positional encodings.""" 56 | if self.pe is not None: 57 | if self.pe.size(1) >= x.size(1): 58 | if self.pe.dtype != x.dtype or self.pe.device != x.device: 59 | self.pe = self.pe.to(dtype=x.dtype, device=x.device) 60 | return 61 | pe = torch.zeros(x.size(1), self.embedding_dim) 62 | if self.reverse: 63 | position = torch.arange( 64 | x.size(1) - 1, -1, -1.0, dtype=torch.float32 65 | ).unsqueeze(1) 66 | else: 67 | position = torch.arange(0, x.size(1), 
dtype=torch.float32).unsqueeze(1) 68 | div_term = torch.exp( 69 | torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) 70 | * -(math.log(10000.0) / self.embedding_dim) 71 | ) 72 | pe[:, 0::2] = torch.sin(position * div_term) 73 | pe[:, 1::2] = torch.cos(position * div_term) 74 | pe = pe.unsqueeze(0) 75 | self.pe = pe.to(device=x.device, dtype=x.dtype).detach() 76 | 77 | def forward(self, x: torch.Tensor) -> torch.Tensor: 78 | self.extend_pe(x) 79 | output = x.unsqueeze(-1) if x.ndim == 2 else x 80 | output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)] 81 | return self.dropout(output) 82 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/embedding_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | self.reverse = False 50 | self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim)) 51 | 52 | def extend_pe(self, x): 53 | position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1) 54 | scpe = (position * self.div_term).unsqueeze(0) 55 | pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0) 56 | pe = pe.contiguous().view(1, -1, self.embedding_dim) 57 | return pe 58 | 59 | def forward(self, x: torch.Tensor) -> torch.Tensor: 60 | pe = self.extend_pe(x) 61 | output = x.unsqueeze(-1) if x.ndim == 2 else x 62 | output = output * self.x_scale + self.alpha * pe 63 | return self.dropout(output) 64 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/lr_schedulers.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import math 4 | 5 | import torch 6 | from matplotlib import pyplot as plt 7 | from torch import nn 8 | from torch.optim import Adam 9 | 10 | 11 | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): 12 | """ 13 | Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple 
optimizers. 14 | """ 15 | 16 | def __init__( 17 | self, 18 | optimizer, 19 | init_lr, 20 | peak_lr, 21 | end_lr, 22 | warmup_steps=10000, 23 | total_steps=400000, 24 | current_step=0, 25 | ): 26 | self.init_lr = init_lr 27 | self.peak_lr = peak_lr 28 | self.end_lr = end_lr 29 | self.optimizer = optimizer 30 | self._warmup_rate = (peak_lr - init_lr) / warmup_steps 31 | self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps) 32 | self._current_step = current_step 33 | self.lr = init_lr 34 | self.warmup_steps = warmup_steps 35 | self.total_steps = total_steps 36 | self._last_lr = [self.lr] 37 | 38 | def set_lr(self, lr): 39 | self._last_lr = [g["lr"] for g in self.optimizer.param_groups] 40 | for g in self.optimizer.param_groups: 41 | # g['lr'] = lr 42 | g["lr"] = self.end_lr ###锁定用线性 43 | 44 | def step(self): 45 | if self._current_step < self.warmup_steps: 46 | lr = self.init_lr + self._warmup_rate * self._current_step 47 | 48 | elif self._current_step > self.total_steps: 49 | lr = self.end_lr 50 | 51 | else: 52 | decay_ratio = (self._current_step - self.warmup_steps) / ( 53 | self.total_steps - self.warmup_steps 54 | ) 55 | if decay_ratio < 0.0 or decay_ratio > 1.0: 56 | raise RuntimeError( 57 | "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings." 58 | ) 59 | coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) 60 | lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) 61 | 62 | self.lr = lr = self.end_lr = 0.002 ###锁定用线性###不听话,直接锁定! 63 | self.set_lr(lr) 64 | self.lr = lr 65 | self._current_step += 1 66 | return self.lr 67 | 68 | 69 | if __name__ == "__main__": 70 | m = nn.Linear(10, 10) 71 | opt = Adam(m.parameters(), lr=1e-4) 72 | s = WarmupCosineLRSchedule( 73 | opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0 74 | ) 75 | lrs = [] 76 | for i in range(25000): 77 | s.step() 78 | lrs.append(s.lr) 79 | print(s.lr) 80 | 81 | plt.plot(lrs) 82 | plt.plot(range(0, 25000), lrs) 83 | plt.show() 84 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py: -------------------------------------------------------------------------------- 1 | from torch.nn.functional import * 2 | from torch.nn.functional import ( 3 | _mha_shape_check, 4 | _canonical_mask, 5 | _none_or_dtype, 6 | _in_projection_packed, 7 | ) 8 | 9 | def multi_head_attention_forward_patched( 10 | query, 11 | key, 12 | value, 13 | embed_dim_to_check: int, 14 | num_heads: int, 15 | in_proj_weight, 16 | in_proj_bias: Optional[Tensor], 17 | bias_k: Optional[Tensor], 18 | bias_v: Optional[Tensor], 19 | add_zero_attn: bool, 20 | dropout_p: float, 21 | out_proj_weight: Tensor, 22 | out_proj_bias: Optional[Tensor], 23 | training: bool = True, 24 | key_padding_mask: Optional[Tensor] = None, 25 | need_weights: bool = True, 26 | attn_mask: Optional[Tensor] = None, 27 | use_separate_proj_weight: bool = False, 28 | q_proj_weight: Optional[Tensor] = None, 29 | k_proj_weight: Optional[Tensor] = None, 30 | v_proj_weight: Optional[Tensor] = None, 31 | static_k: Optional[Tensor] = None, 32 | static_v: Optional[Tensor] = None, 33 | average_attn_weights: bool = True, 34 | is_causal: bool = False, 35 | cache=None, 36 | ) -> Tuple[Tensor, Optional[Tensor]]: 37 | 38 | # set up shape vars 39 | _, _, embed_dim = query.shape 40 | attn_mask = _canonical_mask( 41 | mask=attn_mask, 42 | mask_name="attn_mask", 43 | other_type=None, 44 | other_name="", 45 | target_type=query.dtype, 46 | check_other=False, 47 | ) 48 | head_dim = 
embed_dim // num_heads 49 | 50 | proj_qkv = linear(query, in_proj_weight, in_proj_bias) 51 | proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() 52 | q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2] 53 | 54 | if cache["first_infer"] == 1: 55 | cache["k"][cache["stage"]] = k 56 | cache["v"][cache["stage"]] = v 57 | else: 58 | cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0) 59 | cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0) 60 | k = cache["k"][cache["stage"]] 61 | v = cache["v"][cache["stage"]] 62 | cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] 63 | 64 | attn_mask = _canonical_mask( 65 | mask=attn_mask, 66 | mask_name="attn_mask", 67 | other_type=None, 68 | other_name="", 69 | target_type=q.dtype, 70 | check_other=False, 71 | ) 72 | attn_mask = attn_mask.unsqueeze(0) 73 | 74 | q = q.view(-1, num_heads, head_dim).transpose(0, 1) 75 | k = k.view(-1, num_heads, head_dim).transpose(0, 1) 76 | v = v.view(-1, num_heads, head_dim).transpose(0, 1) 77 | 78 | dropout_p = 0.0 79 | attn_mask = attn_mask.unsqueeze(0) 80 | q = q.view(num_heads, -1, head_dim).unsqueeze(0) 81 | k = k.view(num_heads, -1, head_dim).unsqueeze(0) 82 | v = v.view(num_heads, -1, head_dim).unsqueeze(0) 83 | attn_output = scaled_dot_product_attention( 84 | q, k, v, attn_mask, dropout_p, is_causal 85 | ) 86 | attn_output = ( 87 | attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) 88 | ) 89 | attn_output = linear(attn_output, out_proj_weight, out_proj_bias) 90 | attn_output = attn_output.view(-1, 1, attn_output.size(1)) 91 | 92 | return attn_output 93 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/text_processing/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/phonemizer.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import itertools 4 | import re 5 | from typing import Dict 6 | from typing import List 7 | 8 | import regex 9 | from gruut import sentences 10 | from gruut.const import Sentence 11 | from gruut.const import Word 12 | from AR.text_processing.symbols import SYMBOL_TO_ID 13 | 14 | 15 | class GruutPhonemizer: 16 | def __init__(self, language: str): 17 | self._phonemizer = sentences 18 | self.lang = language 19 | self.symbol_to_id = SYMBOL_TO_ID 20 | self._special_cases_dict: Dict[str] = { 21 | r"\.\.\.": "... ", 22 | ";": "; ", 23 | ":": ": ", 24 | ",": ", ", 25 | r"\.": ". ", 26 | "!": "! ", 27 | r"\?": "? 
", 28 | "—": "—", 29 | "…": "… ", 30 | "«": "«", 31 | "»": "»", 32 | } 33 | self._punctuation_regexp: str = ( 34 | rf"([{''.join(self._special_cases_dict.keys())}])" 35 | ) 36 | 37 | def _normalize_punctuation(self, text: str) -> str: 38 | text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) 39 | text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text) 40 | text = regex.sub(r"\pZ+", r" ", text) 41 | return text.strip() 42 | 43 | def _convert_punctuation(self, word: Word) -> str: 44 | if not word.phonemes: 45 | return "" 46 | if word.phonemes[0] in ["‖", "|"]: 47 | return word.text.strip() 48 | 49 | phonemes = "".join(word.phonemes) 50 | # remove modifier characters ˈˌː with regex 51 | phonemes = re.sub(r"[ˈˌː͡]", "", phonemes) 52 | return phonemes.strip() 53 | 54 | def phonemize(self, text: str, espeak: bool = False) -> str: 55 | text_to_phonemize: str = self._normalize_punctuation(text) 56 | sents: List[Sentence] = [ 57 | sent 58 | for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak) 59 | ] 60 | words: List[str] = [ 61 | self._convert_punctuation(word) for word in itertools.chain(*sents) 62 | ] 63 | return " ".join(words) 64 | 65 | def transform(self, phonemes): 66 | # convert phonemes to ids 67 | # dictionary is in symbols.py 68 | return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()] 69 | 70 | 71 | if __name__ == "__main__": 72 | phonemizer = GruutPhonemizer("en-us") 73 | # text -> IPA 74 | phonemes = phonemizer.phonemize("Hello, wor-ld ?") 75 | print("phonemes:", phonemes) 76 | print("len(phonemes):", len(phonemes)) 77 | phoneme_ids = phonemizer.transform(phonemes) 78 | print("phoneme_ids:", phoneme_ids) 79 | print("len(phoneme_ids):", len(phoneme_ids)) 80 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/symbols.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | PAD = "_" 4 | PUNCTUATION = ';:,.!?¡¿—…"«»“” ' 5 | LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 6 | IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 7 | SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) 8 | SPACE_ID = SYMBOLS.index(" ") 9 | SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)} 10 | ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)} 11 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def str2bool(str): 5 | return True if str.lower() == 'true' else False 6 | 7 | 8 | def get_newest_ckpt(string_list): 9 | # 定义一个正则表达式模式,用于匹配字符串中的数字 10 | pattern = r'epoch=(\d+)-step=(\d+)\.ckpt' 11 | 12 | # 使用正则表达式提取每个字符串中的数字信息,并创建一个包含元组的列表 13 | extracted_info = [] 14 | for string in string_list: 15 | match = re.match(pattern, string) 16 | if match: 17 | epoch = int(match.group(1)) 18 | step = int(match.group(2)) 19 | extracted_info.append((epoch, step, string)) 20 | # 按照 epoch 后面的数字和 step 后面的数字进行排序 21 | sorted_info = sorted( 22 | extracted_info, key=lambda x: (x[0], x[1]), reverse=True) 23 | # 获取最新的 ckpt 文件名 24 | newest_ckpt = sorted_info[0][2] 25 | return newest_ckpt 26 | 27 | 28 | # 文本存在且不为空时 
return True 29 | def check_txt_file(file_path): 30 | try: 31 | with open(file_path, 'r') as file: 32 | text = file.readline().strip() 33 | assert text.strip() != '' 34 | return text 35 | except Exception: 36 | return False 37 | return False 38 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/utils/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/__pycache__/io.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/AR/utils/__pycache__/io.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/initialize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Initialize modules for espnet2 neural networks.""" 3 | import torch 4 | from typeguard import check_argument_types 5 | 6 | 7 | def initialize(model: torch.nn.Module, init: str): 8 | """Initialize weights of a neural network module. 9 | 10 | Parameters are initialized using the given method or distribution. 11 | 12 | Custom initialization routines can be implemented into submodules 13 | as function `espnet_initialization_fn` within the custom module. 14 | 15 | Args: 16 | model: Target. 17 | init: Method of initialization. 18 | """ 19 | assert check_argument_types() 20 | print("init with", init) 21 | 22 | # weight init 23 | for p in model.parameters(): 24 | if p.dim() > 1: 25 | if init == "xavier_uniform": 26 | torch.nn.init.xavier_uniform_(p.data) 27 | elif init == "xavier_normal": 28 | torch.nn.init.xavier_normal_(p.data) 29 | elif init == "kaiming_uniform": 30 | torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu") 31 | elif init == "kaiming_normal": 32 | torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu") 33 | else: 34 | raise ValueError("Unknown initialization: " + init) 35 | # bias init 36 | for name, p in model.named_parameters(): 37 | if ".bias" in name and p.dim() == 1: 38 | p.data.zero_() 39 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/io.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | import yaml 5 | 6 | 7 | def load_yaml_config(path): 8 | with open(path) as f: 9 | config = yaml.full_load(f) 10 | return config 11 | 12 | 13 | def save_config_to_yaml(config, path): 14 | assert path.endswith(".yaml") 15 | with open(path, "w") as f: 16 | f.write(yaml.dump(config)) 17 | f.close() 18 | 19 | 20 | def write_args(args, path): 21 | args_dict = dict( 22 | (name, getattr(args, name)) for name in dir(args) if not name.startswith("_") 23 | ) 24 | with open(path, "a") as args_file: 25 | args_file.write("==> torch version: {}\n".format(torch.__version__)) 26 | args_file.write( 27 | "==> cudnn version: {}\n".format(torch.backends.cudnn.version()) 28 | ) 29 | args_file.write("==> Cmd:\n") 30 | args_file.write(str(sys.argv)) 31 | args_file.write("\n==> args:\n") 32 | for k, v in sorted(args_dict.items()): 33 | args_file.write(" %s: %s\n" % (str(k), 
str(v))) 34 | args_file.close() 35 | -------------------------------------------------------------------------------- /GPT_SoVITS/__pycache__/my_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/__pycache__/my_utils.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/__pycache__/process_ckpt.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/__pycache__/process_ckpt.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 8 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 512 24 | hidden_dim: 512 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 12 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1big.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 8 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 1024 24 | hidden_dim: 1024 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 16 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1big2.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 12 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 1024 24 | hidden_dim: 1024 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 6 28 | dropout: 0 29 | EOS: 1024 30 | 
inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1longer.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 20 4 | batch_size: 8 5 | save_every_n_epoch: 1 6 | precision: 16-mixed 7 | gradient_clip: 1.0 8 | optimizer: 9 | lr: 0.01 10 | lr_init: 0.00001 11 | lr_end: 0.0001 12 | warmup_steps: 2000 13 | decay_steps: 40000 14 | data: 15 | max_eval_sample: 8 16 | max_sec: 54 17 | num_workers: 4 18 | pad_val: 1024 # same with EOS in model 19 | model: 20 | vocab_size: 1025 21 | phoneme_vocab_size: 512 22 | embedding_dim: 512 23 | hidden_dim: 512 24 | head: 16 25 | linear_units: 2048 26 | n_layer: 24 27 | dropout: 0 28 | EOS: 1024 29 | random_bert: 0 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1mq.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 100 4 | batch_size: 6 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 32 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 40 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | saving_path: "ckpt/" 22 | resume_checkpoint: null 23 | vocoder_config_path: "quantizer/new_ckpt/config.json" 24 | vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000" 25 | datadir: "/home/liweiche/GigaSpeech/wavs" 26 | metapath: "/home/liweiche/GigaSpeech/train2.json" 27 | val_metapath: "/home/liweiche/GigaSpeech/dev2.json" 28 | sampledir: "logs/" 29 | pretrained_path: null 30 | lr: 0.0001 31 | batch_size: 200.0 32 | train_bucket_size: 8192 33 | training_step: 800000 34 | optim_flat_percent: 0.0 35 | warmup_step: 50 36 | adam_beta1: 0.9 37 | adam_beta2: 0.98 38 | ffd_size: 3072 39 | hidden_size: 768 40 | enc_nlayers: 6 41 | dec_nlayers: 6 42 | nheads: 12 43 | ar_layer: 4 44 | ar_ffd_size: 1024 45 | ar_hidden_size: 256 46 | ar_nheads: 4 47 | aligner_softmax_temp: 1.0 48 | layer_norm_eps: 0.00001 49 | speaker_embed_dropout: 0.05 50 | label_smoothing: 0.0 51 | val_check_interval: 5000 52 | check_val_every_n_epoch: 1 53 | precision: "fp16" 54 | nworkers: 16 55 | distributed: true 56 | accelerator: "ddp" 57 | version: null 58 | accumulate_grad_batches: 1 59 | use_repetition_token: true 60 | use_repetition_gating: false 61 | repetition_penalty: 1.0 62 | sampling_temperature: 1.0 63 | top_k: -1 64 | min_top_k: 3 65 | top_p: 0.8 66 | sample_num: 4 67 | length_penalty_max_length: 15000 68 | length_penalty_max_prob: 0.95 69 | max_input_length: 2048 70 | max_output_length: 2000 71 | sample_rate: 16000 72 | n_codes: 1024 73 | n_cluster_groups: 1 74 | phone_context_window: 4 75 | phoneset_size: 1000 76 | inference: 77 | top_k: 5 78 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 100, 4 | "eval_interval": 500, 5 | "seed": 1234, 6 | "epochs": 100, 7 | "learning_rate": 0.0001, 8 | "betas": [ 9 | 0.8, 10 | 0.99 11 | ], 12 | "eps": 1e-09, 13 | "batch_size": 32, 14 | "fp16_run": true, 15 | "lr_decay": 0.999875, 16 | "segment_size": 20480, 17 | "init_lr_ratio": 1, 18 | "warmup_epochs": 0, 19 | "c_mel": 45, 20 | 
"c_kl": 1.0, 21 | "text_low_lr_rate": 0.4 22 | }, 23 | "data": { 24 | "max_wav_value": 32768.0, 25 | "sampling_rate": 32000, 26 | "filter_length": 2048, 27 | "hop_length": 640, 28 | "win_length": 2048, 29 | "n_mel_channels": 128, 30 | "mel_fmin": 0.0, 31 | "mel_fmax": null, 32 | "add_blank": true, 33 | "n_speakers": 300, 34 | "cleaned_text": true 35 | }, 36 | "model": { 37 | "inter_channels": 192, 38 | "hidden_channels": 192, 39 | "filter_channels": 768, 40 | "n_heads": 2, 41 | "n_layers": 6, 42 | "kernel_size": 3, 43 | "p_dropout": 0.1, 44 | "resblock": "1", 45 | "resblock_kernel_sizes": [ 46 | 3, 47 | 7, 48 | 11 49 | ], 50 | "resblock_dilation_sizes": [ 51 | [ 52 | 1, 53 | 3, 54 | 5 55 | ], 56 | [ 57 | 1, 58 | 3, 59 | 5 60 | ], 61 | [ 62 | 1, 63 | 3, 64 | 5 65 | ] 66 | ], 67 | "upsample_rates": [ 68 | 10, 69 | 8, 70 | 2, 71 | 2, 72 | 2 73 | ], 74 | "upsample_initial_channel": 512, 75 | "upsample_kernel_sizes": [ 76 | 16, 77 | 16, 78 | 8, 79 | 2, 80 | 2 81 | ], 82 | "n_layers_q": 3, 83 | "use_spectral_norm": false, 84 | "gin_channels": 512, 85 | "semantic_frame_rate": "25hz", 86 | "freeze_quantizer": true 87 | }, 88 | "s2_ckpt_dir": "logs/s2/big2k1", 89 | "content_module": "cnhubert" 90 | } -------------------------------------------------------------------------------- /GPT_SoVITS/configs/train.yaml: -------------------------------------------------------------------------------- 1 | gpu: 2 | n_card: 1 3 | n_process_per_card: 2 4 | io: 5 | text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 512 24 | hidden_dim: 512 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 24 28 | dropout: 0 29 | EOS: 1024 30 | random_bert: 0 31 | inference: 32 | top_k: 5 33 | -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import cnhubert, whisper_enc 2 | 3 | content_module_map = { 4 | 'cnhubert': cnhubert, 5 | 'whisper': whisper_enc 6 | } -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/feature_extractor/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/__pycache__/cnhubert.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/feature_extractor/__pycache__/cnhubert.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/__pycache__/whisper_enc.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/feature_extractor/__pycache__/whisper_enc.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/cnhubert.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import librosa 4 | import torch 5 | import torch.nn.functional as F 6 | import soundfile as sf 7 | import logging 8 | 9 | logging.getLogger("numba").setLevel(logging.WARNING) 10 | 11 | from transformers import ( 12 | Wav2Vec2FeatureExtractor, 13 | HubertModel, 14 | ) 15 | 16 | import utils 17 | import torch.nn as nn 18 | 19 | cnhubert_base_path = None 20 | 21 | 22 | class CNHubert(nn.Module): 23 | def __init__(self): 24 | super().__init__() 25 | self.model = HubertModel.from_pretrained(cnhubert_base_path) 26 | self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( 27 | cnhubert_base_path 28 | ) 29 | 30 | def forward(self, x): 31 | input_values = self.feature_extractor( 32 | x, return_tensors="pt", sampling_rate=16000 33 | ).input_values.to(x.device) 34 | feats = self.model(input_values)["last_hidden_state"] 35 | return feats 36 | 37 | 38 | # class CNHubertLarge(nn.Module): 39 | # def __init__(self): 40 | # super().__init__() 41 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 42 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 43 | # def forward(self, x): 44 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 45 | # feats = self.model(input_values)["last_hidden_state"] 46 | # return feats 47 | # 48 | # class CVec(nn.Module): 49 | # def __init__(self): 50 | # super().__init__() 51 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 52 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 53 | # def forward(self, x): 54 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 55 | # feats = self.model(input_values)["last_hidden_state"] 56 | # return feats 57 | # 58 | # class cnw2v2base(nn.Module): 59 | # def 
__init__(self): 60 | # super().__init__() 61 | # self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 62 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 63 | # def forward(self, x): 64 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 65 | # feats = self.model(input_values)["last_hidden_state"] 66 | # return feats 67 | 68 | 69 | def get_model(): 70 | model = CNHubert() 71 | model.eval() 72 | return model 73 | 74 | 75 | # def get_large_model(): 76 | # model = CNHubertLarge() 77 | # model.eval() 78 | # return model 79 | # 80 | # def get_model_cvec(): 81 | # model = CVec() 82 | # model.eval() 83 | # return model 84 | # 85 | # def get_model_cnw2v2base(): 86 | # model = cnw2v2base() 87 | # model.eval() 88 | # return model 89 | 90 | 91 | def get_content(hmodel, wav_16k_tensor): 92 | with torch.no_grad(): 93 | feats = hmodel(wav_16k_tensor) 94 | return feats.transpose(1, 2) 95 | 96 | 97 | if __name__ == "__main__": 98 | model = get_model() 99 | src_path = "/Users/Shared/原音频2.wav" 100 | wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000) 101 | model = model 102 | wav_16k_tensor = wav_16k_tensor 103 | feats = get_content(model, wav_16k_tensor) 104 | print(feats.shape) 105 | -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/whisper_enc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_model(): 5 | import whisper 6 | 7 | model = whisper.load_model("small", device="cpu") 8 | 9 | return model.encoder 10 | 11 | 12 | def get_content(model=None, wav_16k_tensor=None): 13 | from whisper import log_mel_spectrogram, pad_or_trim 14 | 15 | dev = next(model.parameters()).device 16 | mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000] 17 | # if torch.cuda.is_available(): 18 | # mel = mel.to(torch.float16) 19 | feature_len = mel.shape[-1] // 2 20 | assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频" 21 | with torch.no_grad(): 22 | feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[ 23 | :1, :feature_len, : 24 | ].transpose(1, 2) 25 | return feature 26 | -------------------------------------------------------------------------------- /GPT_SoVITS/module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/module/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/module/__pycache__/attentions.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/attentions.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/module/__pycache__/commons.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/commons.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/module/__pycache__/core_vq.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/core_vq.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/module/__pycache__/data_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/data_utils.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/module/__pycache__/losses.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/losses.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/module/__pycache__/mel_processing.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/mel_processing.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/module/__pycache__/models.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/models.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/module/__pycache__/modules.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/modules.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/module/__pycache__/mrte_model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/mrte_model.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/module/__pycache__/quantize.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/quantize.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/module/__pycache__/transforms.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/module/__pycache__/transforms.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/module/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.nn import functional as F 4 | 5 | 6 | def init_weights(m, mean=0.0, std=0.01): 7 | classname = m.__class__.__name__ 8 | if classname.find("Conv") != -1: 9 | m.weight.data.normal_(mean, std) 10 | 11 | 12 | def get_padding(kernel_size, dilation=1): 13 | return int((kernel_size * dilation - dilation) / 2) 14 | 15 | 16 | def convert_pad_shape(pad_shape): 17 | l = pad_shape[::-1] 18 | pad_shape = [item for sublist in l for item in sublist] 19 | return pad_shape 20 | 21 | 22 | def intersperse(lst, item): 23 | result = [item] * (len(lst) * 2 + 1) 24 | result[1::2] = lst 25 | return result 26 | 27 | 28 | def kl_divergence(m_p, logs_p, m_q, logs_q): 29 | """KL(P||Q)""" 30 | kl = (logs_q - logs_p) - 0.5 31 | kl += ( 32 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 33 | ) 34 | return kl 35 | 36 | 37 | def rand_gumbel(shape): 38 | """Sample from the Gumbel distribution, protect from overflows.""" 39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 40 | return -torch.log(-torch.log(uniform_samples)) 41 | 42 | 43 | def rand_gumbel_like(x): 44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 45 | return g 46 | 47 | 48 | def slice_segments(x, ids_str, segment_size=4): 49 | ret = torch.zeros_like(x[:, :, :segment_size]) 50 | for i in range(x.size(0)): 51 | idx_str = ids_str[i] 52 | idx_end = idx_str + segment_size 53 | ret[i] = x[i, :, idx_str:idx_end] 54 | return ret 55 | 56 | 57 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 58 | b, d, t = x.size() 59 | if x_lengths is None: 60 | x_lengths = t 61 | ids_str_max = x_lengths - segment_size + 1 62 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 63 | ret = slice_segments(x, ids_str, segment_size) 64 | return ret, ids_str 65 | 66 | 67 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 68 | position = torch.arange(length, dtype=torch.float) 69 | num_timescales = channels // 2 70 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 71 | num_timescales - 1 72 | ) 73 | inv_timescales = min_timescale * torch.exp( 74 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 75 | ) 76 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 77 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 78 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 79 | signal = signal.view(1, channels, length) 80 | return signal 81 | 82 | 83 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 84 | b, channels, length = x.size() 85 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 86 | return x + signal.to(dtype=x.dtype, device=x.device) 87 | 88 | 89 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 90 | b, channels, length = x.size() 91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 92 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 93 | 94 | 95 | def subsequent_mask(length): 96 | mask = torch.tril(torch.ones(length, 
length)).unsqueeze(0).unsqueeze(0) 97 | return mask 98 | 99 | 100 | @torch.jit.script 101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 102 | n_channels_int = n_channels[0] 103 | in_act = input_a + input_b 104 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 106 | acts = t_act * s_act 107 | return acts 108 | 109 | 110 | def convert_pad_shape(pad_shape): 111 | l = pad_shape[::-1] 112 | pad_shape = [item for sublist in l for item in sublist] 113 | return pad_shape 114 | 115 | 116 | def shift_1d(x): 117 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 118 | return x 119 | 120 | 121 | def sequence_mask(length, max_length=None): 122 | if max_length is None: 123 | max_length = length.max() 124 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 125 | return x.unsqueeze(0) < length.unsqueeze(1) 126 | 127 | 128 | def generate_path(duration, mask): 129 | """ 130 | duration: [b, 1, t_x] 131 | mask: [b, 1, t_y, t_x] 132 | """ 133 | device = duration.device 134 | 135 | b, _, t_y, t_x = mask.shape 136 | cum_duration = torch.cumsum(duration, -1) 137 | 138 | cum_duration_flat = cum_duration.view(b * t_x) 139 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 140 | path = path.view(b, t_x, t_y) 141 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 142 | path = path.unsqueeze(1).transpose(2, 3) * mask 143 | return path 144 | 145 | 146 | def clip_grad_value_(parameters, clip_value, norm_type=2): 147 | if isinstance(parameters, torch.Tensor): 148 | parameters = [parameters] 149 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 150 | norm_type = float(norm_type) 151 | if clip_value is not None: 152 | clip_value = float(clip_value) 153 | 154 | total_norm = 0 155 | for p in parameters: 156 | param_norm = p.grad.data.norm(norm_type) 157 | total_norm += param_norm.item() ** norm_type 158 | if clip_value is not None: 159 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 160 | total_norm = total_norm ** (1.0 / norm_type) 161 | return total_norm 162 | 163 | 164 | def squeeze(x, x_mask=None, n_sqz=2): 165 | b, c, t = x.size() 166 | 167 | t = (t // n_sqz) * n_sqz 168 | x = x[:, :, :t] 169 | x_sqz = x.view(b, c, t // n_sqz, n_sqz) 170 | x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) 171 | 172 | if x_mask is not None: 173 | x_mask = x_mask[:, :, n_sqz - 1 :: n_sqz] 174 | else: 175 | x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) 176 | return x_sqz * x_mask, x_mask 177 | 178 | 179 | def unsqueeze(x, x_mask=None, n_sqz=2): 180 | b, c, t = x.size() 181 | 182 | x_unsqz = x.view(b, n_sqz, c // n_sqz, t) 183 | x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) 184 | 185 | if x_mask is not None: 186 | x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) 187 | else: 188 | x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) 189 | return x_unsqz * x_mask, x_mask 190 | -------------------------------------------------------------------------------- /GPT_SoVITS/module/losses.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch.nn import functional as F 5 | 6 | 7 | def feature_loss(fmap_r, fmap_g): 8 | loss = 0 9 | for dr, dg in zip(fmap_r, fmap_g): 10 | for rl, gl in zip(dr, dg): 11 | rl = rl.float().detach() 12 | gl = gl.float() 
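# feature-matching term: accumulate the mean absolute (L1) difference between real (detached) and generated feature maps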
13 | loss += torch.mean(torch.abs(rl - gl)) 14 | 15 | return loss * 2 16 | 17 | 18 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 19 | loss = 0 20 | r_losses = [] 21 | g_losses = [] 22 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 23 | dr = dr.float() 24 | dg = dg.float() 25 | r_loss = torch.mean((1 - dr) ** 2) 26 | g_loss = torch.mean(dg**2) 27 | loss += r_loss + g_loss 28 | r_losses.append(r_loss.item()) 29 | g_losses.append(g_loss.item()) 30 | 31 | return loss, r_losses, g_losses 32 | 33 | 34 | def generator_loss(disc_outputs): 35 | loss = 0 36 | gen_losses = [] 37 | for dg in disc_outputs: 38 | dg = dg.float() 39 | l = torch.mean((1 - dg) ** 2) 40 | gen_losses.append(l) 41 | loss += l 42 | 43 | return loss, gen_losses 44 | 45 | 46 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 47 | """ 48 | z_p, logs_q: [b, h, t_t] 49 | m_p, logs_p: [b, h, t_t] 50 | """ 51 | z_p = z_p.float() 52 | logs_q = logs_q.float() 53 | m_p = m_p.float() 54 | logs_p = logs_p.float() 55 | z_mask = z_mask.float() 56 | 57 | kl = logs_p - logs_q - 0.5 58 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 59 | kl = torch.sum(kl * z_mask) 60 | l = kl / torch.sum(z_mask) 61 | return l 62 | 63 | 64 | def mle_loss(z, m, logs, logdet, mask): 65 | l = torch.sum(logs) + 0.5 * torch.sum( 66 | torch.exp(-2 * logs) * ((z - m) ** 2) 67 | ) # neg normal likelihood w/o the constant term 68 | l = l - torch.sum(logdet) # log jacobian determinant 69 | l = l / torch.sum( 70 | torch.ones_like(z) * mask 71 | ) # averaging across batch, channel and time axes 72 | l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term 73 | return l 74 | -------------------------------------------------------------------------------- /GPT_SoVITS/module/mel_processing.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import torch 5 | from torch import nn 6 | import torch.nn.functional as F 7 | import torch.utils.data 8 | import numpy as np 9 | import librosa 10 | import librosa.util as librosa_util 11 | from librosa.util import normalize, pad_center, tiny 12 | from scipy.signal import get_window 13 | from scipy.io.wavfile import read 14 | from librosa.filters import mel as librosa_mel_fn 15 | 16 | MAX_WAV_VALUE = 32768.0 17 | 18 | 19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 20 | """ 21 | PARAMS 22 | ------ 23 | C: compression factor 24 | """ 25 | return torch.log(torch.clamp(x, min=clip_val) * C) 26 | 27 | 28 | def dynamic_range_decompression_torch(x, C=1): 29 | """ 30 | PARAMS 31 | ------ 32 | C: compression factor used to compress 33 | """ 34 | return torch.exp(x) / C 35 | 36 | 37 | def spectral_normalize_torch(magnitudes): 38 | output = dynamic_range_compression_torch(magnitudes) 39 | return output 40 | 41 | 42 | def spectral_de_normalize_torch(magnitudes): 43 | output = dynamic_range_decompression_torch(magnitudes) 44 | return output 45 | 46 | 47 | mel_basis = {} 48 | hann_window = {} 49 | 50 | 51 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 52 | if torch.min(y) < -1.0: 53 | print("min value is ", torch.min(y)) 54 | if torch.max(y) > 1.0: 55 | print("max value is ", torch.max(y)) 56 | 57 | global hann_window 58 | dtype_device = str(y.dtype) + "_" + str(y.device) 59 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 60 | if wnsize_dtype_device not in hann_window: 61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 62 
| dtype=y.dtype, device=y.device 63 | ) 64 | 65 | y = torch.nn.functional.pad( 66 | y.unsqueeze(1), 67 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 68 | mode="reflect", 69 | ) 70 | y = y.squeeze(1) 71 | spec = torch.stft( 72 | y, 73 | n_fft, 74 | hop_length=hop_size, 75 | win_length=win_size, 76 | window=hann_window[wnsize_dtype_device], 77 | center=center, 78 | pad_mode="reflect", 79 | normalized=False, 80 | onesided=True, 81 | return_complex=False, 82 | ) 83 | 84 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 85 | return spec 86 | 87 | 88 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 89 | global mel_basis 90 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 91 | fmax_dtype_device = str(fmax) + "_" + dtype_device 92 | if fmax_dtype_device not in mel_basis: 93 | mel = librosa_mel_fn( 94 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 95 | ) 96 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 97 | dtype=spec.dtype, device=spec.device 98 | ) 99 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 100 | spec = spectral_normalize_torch(spec) 101 | return spec 102 | 103 | 104 | def mel_spectrogram_torch( 105 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 106 | ): 107 | if torch.min(y) < -1.0: 108 | print("min value is ", torch.min(y)) 109 | if torch.max(y) > 1.0: 110 | print("max value is ", torch.max(y)) 111 | 112 | global mel_basis, hann_window 113 | dtype_device = str(y.dtype) + "_" + str(y.device) 114 | fmax_dtype_device = str(fmax) + "_" + dtype_device 115 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 116 | if fmax_dtype_device not in mel_basis: 117 | mel = librosa_mel_fn( 118 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 119 | ) 120 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 121 | dtype=y.dtype, device=y.device 122 | ) 123 | if wnsize_dtype_device not in hann_window: 124 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 125 | dtype=y.dtype, device=y.device 126 | ) 127 | 128 | y = torch.nn.functional.pad( 129 | y.unsqueeze(1), 130 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 131 | mode="reflect", 132 | ) 133 | y = y.squeeze(1) 134 | 135 | spec = torch.stft( 136 | y, 137 | n_fft, 138 | hop_length=hop_size, 139 | win_length=win_size, 140 | window=hann_window[wnsize_dtype_device], 141 | center=center, 142 | pad_mode="reflect", 143 | normalized=False, 144 | onesided=True, 145 | return_complex=False, 146 | ) 147 | 148 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 149 | 150 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 151 | spec = spectral_normalize_torch(spec) 152 | 153 | return spec 154 | -------------------------------------------------------------------------------- /GPT_SoVITS/module/mrte_model.py: -------------------------------------------------------------------------------- 1 | # This is Multi-reference timbre encoder 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn.utils import remove_weight_norm, weight_norm 6 | from module.attentions import MultiHeadAttention 7 | 8 | 9 | class MRTE(nn.Module): 10 | def __init__( 11 | self, 12 | content_enc_channels=192, 13 | hidden_size=512, 14 | out_channels=192, 15 | kernel_size=5, 16 | n_heads=4, 17 | ge_layer=2, 18 | ): 19 | super(MRTE, self).__init__() 20 | self.cross_attention = MultiHeadAttention(hidden_size, hidden_size, n_heads) 21 | self.c_pre = nn.Conv1d(content_enc_channels, hidden_size, 1) 22 | 
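# like c_pre above, a 1x1 conv that projects the text branch into the hidden_size used by the cross-attention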
self.text_pre = nn.Conv1d(content_enc_channels, hidden_size, 1) 23 | self.c_post = nn.Conv1d(hidden_size, out_channels, 1) 24 | 25 | def forward(self, ssl_enc, ssl_mask, text, text_mask, ge, test=None): 26 | if ge == None: 27 | ge = 0 28 | attn_mask = text_mask.unsqueeze(2) * ssl_mask.unsqueeze(-1) 29 | 30 | ssl_enc = self.c_pre(ssl_enc * ssl_mask) 31 | text_enc = self.text_pre(text * text_mask) 32 | if test != None: 33 | if test == 0: 34 | x = ( 35 | self.cross_attention( 36 | ssl_enc * ssl_mask, text_enc * text_mask, attn_mask 37 | ) 38 | + ssl_enc 39 | + ge 40 | ) 41 | elif test == 1: 42 | x = ssl_enc + ge 43 | elif test == 2: 44 | x = ( 45 | self.cross_attention( 46 | ssl_enc * 0 * ssl_mask, text_enc * text_mask, attn_mask 47 | ) 48 | + ge 49 | ) 50 | else: 51 | raise ValueError("test should be 0,1,2") 52 | else: 53 | x = ( 54 | self.cross_attention( 55 | ssl_enc * ssl_mask, text_enc * text_mask, attn_mask 56 | ) 57 | + ssl_enc 58 | + ge 59 | ) 60 | x = self.c_post(x * ssl_mask) 61 | return x 62 | 63 | 64 | class SpeakerEncoder(torch.nn.Module): 65 | def __init__( 66 | self, 67 | mel_n_channels=80, 68 | model_num_layers=2, 69 | model_hidden_size=256, 70 | model_embedding_size=256, 71 | ): 72 | super(SpeakerEncoder, self).__init__() 73 | self.lstm = nn.LSTM( 74 | mel_n_channels, model_hidden_size, model_num_layers, batch_first=True 75 | ) 76 | self.linear = nn.Linear(model_hidden_size, model_embedding_size) 77 | self.relu = nn.ReLU() 78 | 79 | def forward(self, mels): 80 | self.lstm.flatten_parameters() 81 | _, (hidden, _) = self.lstm(mels.transpose(-1, -2)) 82 | embeds_raw = self.relu(self.linear(hidden[-1])) 83 | return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) 84 | 85 | 86 | class MELEncoder(nn.Module): 87 | def __init__( 88 | self, 89 | in_channels, 90 | out_channels, 91 | hidden_channels, 92 | kernel_size, 93 | dilation_rate, 94 | n_layers, 95 | ): 96 | super().__init__() 97 | self.in_channels = in_channels 98 | self.out_channels = out_channels 99 | self.hidden_channels = hidden_channels 100 | self.kernel_size = kernel_size 101 | self.dilation_rate = dilation_rate 102 | self.n_layers = n_layers 103 | 104 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 105 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers) 106 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 107 | 108 | def forward(self, x): 109 | # print(x.shape,x_lengths.shape) 110 | x = self.pre(x) 111 | x = self.enc(x) 112 | x = self.proj(x) 113 | return x 114 | 115 | 116 | class WN(torch.nn.Module): 117 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers): 118 | super(WN, self).__init__() 119 | assert kernel_size % 2 == 1 120 | self.hidden_channels = hidden_channels 121 | self.kernel_size = kernel_size 122 | self.dilation_rate = dilation_rate 123 | self.n_layers = n_layers 124 | 125 | self.in_layers = torch.nn.ModuleList() 126 | self.res_skip_layers = torch.nn.ModuleList() 127 | 128 | for i in range(n_layers): 129 | dilation = dilation_rate**i 130 | padding = int((kernel_size * dilation - dilation) / 2) 131 | in_layer = nn.Conv1d( 132 | hidden_channels, 133 | 2 * hidden_channels, 134 | kernel_size, 135 | dilation=dilation, 136 | padding=padding, 137 | ) 138 | in_layer = weight_norm(in_layer) 139 | self.in_layers.append(in_layer) 140 | 141 | # last one is not necessary 142 | if i < n_layers - 1: 143 | res_skip_channels = 2 * hidden_channels 144 | else: 145 | res_skip_channels = hidden_channels 146 | 147 | res_skip_layer = torch.nn.Conv1d(hidden_channels, 
res_skip_channels, 1) 148 | res_skip_layer = weight_norm(res_skip_layer, name="weight") 149 | self.res_skip_layers.append(res_skip_layer) 150 | 151 | def forward(self, x): 152 | output = torch.zeros_like(x) 153 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 154 | 155 | for i in range(self.n_layers): 156 | x_in = self.in_layers[i](x) 157 | 158 | acts = fused_add_tanh_sigmoid_multiply(x_in, n_channels_tensor) 159 | 160 | res_skip_acts = self.res_skip_layers[i](acts) 161 | if i < self.n_layers - 1: 162 | res_acts = res_skip_acts[:, : self.hidden_channels, :] 163 | x = x + res_acts 164 | output = output + res_skip_acts[:, self.hidden_channels :, :] 165 | else: 166 | output = output + res_skip_acts 167 | return output 168 | 169 | def remove_weight_norm(self): 170 | for l in self.in_layers: 171 | remove_weight_norm(l) 172 | for l in self.res_skip_layers: 173 | remove_weight_norm(l) 174 | 175 | 176 | @torch.jit.script 177 | def fused_add_tanh_sigmoid_multiply(input, n_channels): 178 | n_channels_int = n_channels[0] 179 | t_act = torch.tanh(input[:, :n_channels_int, :]) 180 | s_act = torch.sigmoid(input[:, n_channels_int:, :]) 181 | acts = t_act * s_act 182 | return acts 183 | 184 | 185 | if __name__ == "__main__": 186 | content_enc = torch.randn(3, 192, 100) 187 | content_mask = torch.ones(3, 1, 100) 188 | ref_mel = torch.randn(3, 128, 30) 189 | ref_mask = torch.ones(3, 1, 30) 190 | model = MRTE() 191 | out = model(content_enc, content_mask, ref_mel, ref_mask) 192 | print(out.shape) 193 | -------------------------------------------------------------------------------- /GPT_SoVITS/module/quantize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """Residual vector quantizer implementation.""" 8 | 9 | from dataclasses import dataclass, field 10 | import math 11 | import typing as tp 12 | 13 | import torch 14 | from torch import nn 15 | 16 | from module.core_vq import ResidualVectorQuantization 17 | 18 | 19 | @dataclass 20 | class QuantizedResult: 21 | quantized: torch.Tensor 22 | codes: torch.Tensor 23 | bandwidth: torch.Tensor # bandwidth in kb/s used, per batch item. 24 | penalty: tp.Optional[torch.Tensor] = None 25 | metrics: dict = field(default_factory=dict) 26 | 27 | 28 | class ResidualVectorQuantizer(nn.Module): 29 | """Residual Vector Quantizer. 30 | Args: 31 | dimension (int): Dimension of the codebooks. 32 | n_q (int): Number of residual vector quantizers used. 33 | bins (int): Codebook size. 34 | decay (float): Decay for exponential moving average over the codebooks. 35 | kmeans_init (bool): Whether to use kmeans to initialize the codebooks. 36 | kmeans_iters (int): Number of iterations used for kmeans initialization. 37 | threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes 38 | that have an exponential moving average cluster size less than the specified threshold with 39 | randomly selected vector from the current batch. 
40 | """ 41 | 42 | def __init__( 43 | self, 44 | dimension: int = 256, 45 | n_q: int = 8, 46 | bins: int = 1024, 47 | decay: float = 0.99, 48 | kmeans_init: bool = True, 49 | kmeans_iters: int = 50, 50 | threshold_ema_dead_code: int = 2, 51 | ): 52 | super().__init__() 53 | self.n_q = n_q 54 | self.dimension = dimension 55 | self.bins = bins 56 | self.decay = decay 57 | self.kmeans_init = kmeans_init 58 | self.kmeans_iters = kmeans_iters 59 | self.threshold_ema_dead_code = threshold_ema_dead_code 60 | self.vq = ResidualVectorQuantization( 61 | dim=self.dimension, 62 | codebook_size=self.bins, 63 | num_quantizers=self.n_q, 64 | decay=self.decay, 65 | kmeans_init=self.kmeans_init, 66 | kmeans_iters=self.kmeans_iters, 67 | threshold_ema_dead_code=self.threshold_ema_dead_code, 68 | ) 69 | 70 | def forward( 71 | self, 72 | x: torch.Tensor, 73 | n_q: tp.Optional[int] = None, 74 | layers: tp.Optional[list] = None, 75 | ) -> QuantizedResult: 76 | """Residual vector quantization on the given input tensor. 77 | Args: 78 | x (torch.Tensor): Input tensor. 79 | n_q (int): Number of quantizer used to quantize. Default: All quantizers. 80 | layers (list): Layer that need to return quantized. Defalt: None. 81 | Returns: 82 | QuantizedResult: 83 | The quantized (or approximately quantized) representation with 84 | the associated numbert quantizers and layer quantized required to return. 85 | """ 86 | n_q = n_q if n_q else self.n_q 87 | if layers and max(layers) >= n_q: 88 | raise ValueError( 89 | f"Last layer index in layers: A {max(layers)}. Number of quantizers in RVQ: B {self.n_q}. A must less than B." 90 | ) 91 | quantized, codes, commit_loss, quantized_list = self.vq( 92 | x, n_q=n_q, layers=layers 93 | ) 94 | return quantized, codes, torch.mean(commit_loss), quantized_list 95 | 96 | def encode( 97 | self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None 98 | ) -> torch.Tensor: 99 | """Encode a given input tensor with the specified sample rate at the given bandwidth. 100 | The RVQ encode method sets the appropriate number of quantizer to use 101 | and returns indices for each quantizer. 102 | Args: 103 | x (torch.Tensor): Input tensor. 104 | n_q (int): Number of quantizer used to quantize. Default: All quantizers. 105 | st (int): Start to encode input from which layers. Default: 0. 106 | """ 107 | n_q = n_q if n_q else self.n_q 108 | st = st or 0 109 | codes = self.vq.encode(x, n_q=n_q, st=st) 110 | return codes 111 | 112 | def decode(self, codes: torch.Tensor, st: int = 0) -> torch.Tensor: 113 | """Decode the given codes to the quantized representation. 114 | Args: 115 | codes (torch.Tensor): Input indices for each quantizer. 116 | st (int): Start to decode input codes from which layers. Default: 0. 117 | """ 118 | quantized = self.vq.decode(codes, st=st) 119 | return quantized 120 | -------------------------------------------------------------------------------- /GPT_SoVITS/my_utils.py: -------------------------------------------------------------------------------- 1 | import ffmpeg 2 | import numpy as np 3 | 4 | 5 | def load_audio(file, sr): 6 | try: 7 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 8 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 9 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
10 | file = ( 11 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 12 | ) # 防止小白拷路径头尾带了空格和"和回车 13 | out, _ = ( 14 | ffmpeg.input(file, threads=0) 15 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) 16 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 17 | ) 18 | except Exception as e: 19 | raise RuntimeError(f"Failed to load audio: {e}") 20 | 21 | return np.frombuffer(out, np.float32).flatten() 22 | -------------------------------------------------------------------------------- /GPT_SoVITS/prepare_datasets/1-get-text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | 5 | inp_text = os.environ.get("inp_text") 6 | inp_wav_dir = os.environ.get("inp_wav_dir") 7 | exp_name = os.environ.get("exp_name") 8 | i_part = os.environ.get("i_part") 9 | all_parts = os.environ.get("all_parts") 10 | os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES") 11 | opt_dir = os.environ.get("opt_dir") 12 | bert_pretrained_dir = os.environ.get("bert_pretrained_dir") 13 | is_half = eval(os.environ.get("is_half", "True")) 14 | import sys, numpy as np, traceback, pdb 15 | import os.path 16 | from glob import glob 17 | from tqdm import tqdm 18 | from text.cleaner import clean_text 19 | import torch 20 | from transformers import AutoModelForMaskedLM, AutoTokenizer 21 | import numpy as np 22 | 23 | # inp_text=sys.argv[1] 24 | # inp_wav_dir=sys.argv[2] 25 | # exp_name=sys.argv[3] 26 | # i_part=sys.argv[4] 27 | # all_parts=sys.argv[5] 28 | # os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]#i_gpu 29 | # opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name 30 | # bert_pretrained_dir="/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large" 31 | 32 | from time import time as ttime 33 | import shutil 34 | 35 | 36 | def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path 37 | dir=os.path.dirname(path) 38 | name=os.path.basename(path) 39 | # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part) 40 | tmp_path="%s%s.pth"%(ttime(),i_part) 41 | torch.save(fea,tmp_path) 42 | shutil.move(tmp_path,"%s/%s"%(dir,name)) 43 | 44 | 45 | txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part) 46 | if os.path.exists(txt_path) == False: 47 | bert_dir = "%s/3-bert" % (opt_dir) 48 | os.makedirs(opt_dir, exist_ok=True) 49 | os.makedirs(bert_dir, exist_ok=True) 50 | if torch.cuda.is_available(): 51 | device = "cuda:0" 52 | # elif torch.backends.mps.is_available(): 53 | # device = "mps" 54 | else: 55 | device = "cpu" 56 | tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir) 57 | bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir) 58 | if is_half == True: 59 | bert_model = bert_model.half().to(device) 60 | else: 61 | bert_model = bert_model.to(device) 62 | 63 | def get_bert_feature(text, word2ph): 64 | with torch.no_grad(): 65 | inputs = tokenizer(text, return_tensors="pt") 66 | for i in inputs: 67 | inputs[i] = inputs[i].to(device) 68 | res = bert_model(**inputs, output_hidden_states=True) 69 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1] 70 | 71 | assert len(word2ph) == len(text) 72 | phone_level_feature = [] 73 | for i in range(len(word2ph)): 74 | repeat_feature = res[i].repeat(word2ph[i], 1) 75 | phone_level_feature.append(repeat_feature) 76 | 77 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 78 | 79 | return phone_level_feature.T 80 | 81 | def process(data, 
res): 82 | for name, text, lan in data: 83 | try: 84 | name = os.path.basename(name) 85 | phones, word2ph, norm_text = clean_text( 86 | text.replace("%", "-").replace("¥", ","), lan 87 | ) 88 | path_bert = "%s/%s.pt" % (bert_dir, name) 89 | if os.path.exists(path_bert) == False and lan == "zh": 90 | bert_feature = get_bert_feature(norm_text, word2ph) 91 | assert bert_feature.shape[-1] == len(phones) 92 | # torch.save(bert_feature, path_bert) 93 | my_save(bert_feature, path_bert) 94 | phones = " ".join(phones) 95 | # res.append([name,phones]) 96 | res.append([name, phones, word2ph, norm_text]) 97 | except: 98 | print(name, text, traceback.format_exc()) 99 | 100 | todo = [] 101 | res = [] 102 | with open(inp_text, "r", encoding="utf8") as f: 103 | lines = f.read().strip("\n").split("\n") 104 | 105 | language_v1_to_language_v2 = { 106 | "ZH": "zh", 107 | "zh": "zh", 108 | "JP": "ja", 109 | "jp": "ja", 110 | "JA": "ja", 111 | "ja": "ja", 112 | "EN": "en", 113 | "en": "en", 114 | "En": "en", 115 | } 116 | for line in lines[int(i_part) :: int(all_parts)]: 117 | try: 118 | wav_name, spk_name, language, text = line.split("|") 119 | # todo.append([name,text,"zh"]) 120 | todo.append( 121 | [wav_name, text, language_v1_to_language_v2.get(language, language)] 122 | ) 123 | except: 124 | print(line, traceback.format_exc()) 125 | 126 | process(todo, res) 127 | opt = [] 128 | for name, phones, word2ph, norm_text in res: 129 | opt.append("%s\t%s\t%s\t%s" % (name, phones, word2ph, norm_text)) 130 | with open(txt_path, "w", encoding="utf8") as f: 131 | f.write("\n".join(opt) + "\n") 132 | -------------------------------------------------------------------------------- /GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys,os 4 | inp_text= os.environ.get("inp_text") 5 | inp_wav_dir= os.environ.get("inp_wav_dir") 6 | exp_name= os.environ.get("exp_name") 7 | i_part= os.environ.get("i_part") 8 | all_parts= os.environ.get("all_parts") 9 | os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES") 10 | from feature_extractor import cnhubert 11 | opt_dir= os.environ.get("opt_dir") 12 | cnhubert.cnhubert_base_path= os.environ.get("cnhubert_base_dir") 13 | is_half=eval(os.environ.get("is_half","True")) 14 | 15 | import pdb,traceback,numpy as np,logging 16 | from scipy.io import wavfile 17 | import librosa,torch 18 | now_dir = os.getcwd() 19 | sys.path.append(now_dir) 20 | from my_utils import load_audio 21 | 22 | # from config import cnhubert_base_path 23 | # cnhubert.cnhubert_base_path=cnhubert_base_path 24 | # inp_text=sys.argv[1] 25 | # inp_wav_dir=sys.argv[2] 26 | # exp_name=sys.argv[3] 27 | # i_part=sys.argv[4] 28 | # all_parts=sys.argv[5] 29 | # os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6] 30 | # cnhubert.cnhubert_base_path=sys.argv[7] 31 | # opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name 32 | 33 | from time import time as ttime 34 | import shutil 35 | def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path 36 | dir=os.path.dirname(path) 37 | name=os.path.basename(path) 38 | # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part) 39 | tmp_path="%s%s.pth"%(ttime(),i_part) 40 | torch.save(fea,tmp_path) 41 | shutil.move(tmp_path,"%s/%s"%(dir,name)) 42 | 43 | hubert_dir="%s/4-cnhubert"%(opt_dir) 44 | wav32dir="%s/5-wav32k"%(opt_dir) 45 | os.makedirs(opt_dir,exist_ok=True) 46 | os.makedirs(hubert_dir,exist_ok=True) 47 | 
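# 4-cnhubert stores the per-utterance HuBERT feature tensors (.pt); 5-wav32k stores the rescaled 32 kHz wavs written below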
os.makedirs(wav32dir,exist_ok=True) 48 | 49 | maxx=0.95 50 | alpha=0.5 51 | if torch.cuda.is_available(): 52 | device = "cuda:0" 53 | # elif torch.backends.mps.is_available(): 54 | # device = "mps" 55 | else: 56 | device = "cpu" 57 | model=cnhubert.get_model() 58 | # is_half=False 59 | if(is_half==True): 60 | model=model.half().to(device) 61 | else: 62 | model = model.to(device) 63 | 64 | nan_fails=[] 65 | def name2go(wav_name,wav_path): 66 | hubert_path="%s/%s.pt"%(hubert_dir,wav_name) 67 | if(os.path.exists(hubert_path)):return 68 | tmp_audio = load_audio(wav_path, 32000) 69 | tmp_max = np.abs(tmp_audio).max() 70 | if tmp_max > 2.2: 71 | print("%s-filtered,%s" % (wav_name, tmp_max)) 72 | return 73 | tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha*32768)) + ((1 - alpha)*32768) * tmp_audio 74 | tmp_audio32b = (tmp_audio / tmp_max * (maxx * alpha*1145.14)) + ((1 - alpha)*1145.14) * tmp_audio 75 | tmp_audio = librosa.resample( 76 | tmp_audio32b, orig_sr=32000, target_sr=16000 77 | )#不是重采样问题 78 | tensor_wav16 = torch.from_numpy(tmp_audio) 79 | if (is_half == True): 80 | tensor_wav16=tensor_wav16.half().to(device) 81 | else: 82 | tensor_wav16 = tensor_wav16.to(device) 83 | ssl=model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1,2).cpu()#torch.Size([1, 768, 215]) 84 | if np.isnan(ssl.detach().numpy()).sum()!= 0: 85 | nan_fails.append(wav_name) 86 | print("nan filtered:%s"%wav_name) 87 | return 88 | wavfile.write( 89 | "%s/%s"%(wav32dir,wav_name), 90 | 32000, 91 | tmp_audio32.astype("int16"), 92 | ) 93 | my_save(ssl,hubert_path ) 94 | 95 | with open(inp_text,"r",encoding="utf8")as f: 96 | lines=f.read().strip("\n").split("\n") 97 | 98 | for line in lines[int(i_part)::int(all_parts)]: 99 | try: 100 | # wav_name,text=line.split("\t") 101 | wav_name, spk_name, language, text = line.split("|") 102 | if (inp_wav_dir != "" and inp_wav_dir != None): 103 | wav_name = os.path.basename(wav_name) 104 | wav_path = "%s/%s"%(inp_wav_dir, wav_name) 105 | 106 | else: 107 | wav_path=wav_name 108 | wav_name = os.path.basename(wav_name) 109 | name2go(wav_name,wav_path) 110 | except: 111 | print(line,traceback.format_exc()) 112 | 113 | if(len(nan_fails)>0 and is_half==True): 114 | is_half=False 115 | model=model.float() 116 | for wav_name in nan_fails: 117 | try: 118 | name2go(wav_name) 119 | except: 120 | print(wav_name,traceback.format_exc()) 121 | -------------------------------------------------------------------------------- /GPT_SoVITS/prepare_datasets/3-get-semantic.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | inp_text = os.environ.get("inp_text") 4 | exp_name = os.environ.get("exp_name") 5 | i_part = os.environ.get("i_part") 6 | all_parts = os.environ.get("all_parts") 7 | os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES") 8 | opt_dir = os.environ.get("opt_dir") 9 | pretrained_s2G = os.environ.get("pretrained_s2G") 10 | s2config_path = os.environ.get("s2config_path") 11 | is_half = eval(os.environ.get("is_half", "True")) 12 | import math, traceback 13 | import multiprocessing 14 | import sys, pdb 15 | 16 | now_dir = os.getcwd() 17 | sys.path.append(now_dir) 18 | from random import shuffle 19 | import torch.multiprocessing as mp 20 | from glob import glob 21 | from tqdm import tqdm 22 | import logging, librosa, utils, torch 23 | from module.models import SynthesizerTrn 24 | 25 | logging.getLogger("numba").setLevel(logging.WARNING) 26 | # from config import pretrained_s2G 27 | 28 | # 
inp_text=sys.argv[1] 29 | # exp_name=sys.argv[2] 30 | # i_part=sys.argv[3] 31 | # all_parts=sys.argv[4] 32 | # os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[5] 33 | # opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name 34 | 35 | 36 | hubert_dir = "%s/4-cnhubert" % (opt_dir) 37 | semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) 38 | if os.path.exists(semantic_path) == False: 39 | os.makedirs(opt_dir, exist_ok=True) 40 | 41 | if torch.cuda.is_available(): 42 | device = "cuda" 43 | # elif torch.backends.mps.is_available(): 44 | # device = "mps" 45 | else: 46 | device = "cpu" 47 | hps = utils.get_hparams_from_file(s2config_path) 48 | vq_model = SynthesizerTrn( 49 | hps.data.filter_length // 2 + 1, 50 | hps.train.segment_size // hps.data.hop_length, 51 | n_speakers=hps.data.n_speakers, 52 | **hps.model 53 | ) 54 | if is_half == True: 55 | vq_model = vq_model.half().to(device) 56 | else: 57 | vq_model = vq_model.to(device) 58 | vq_model.eval() 59 | # utils.load_checkpoint(utils.latest_checkpoint_path(hps.s2_ckpt_dir, "G_*.pth"), vq_model, None, True) 60 | # utils.load_checkpoint(pretrained_s2G, vq_model, None, True) 61 | print( 62 | vq_model.load_state_dict( 63 | torch.load(pretrained_s2G, map_location="cpu")["weight"], strict=False 64 | ) 65 | ) 66 | 67 | def name2go(wav_name, lines): 68 | hubert_path = "%s/%s.pt" % (hubert_dir, wav_name) 69 | if os.path.exists(hubert_path) == False: 70 | return 71 | ssl_content = torch.load(hubert_path, map_location="cpu") 72 | if is_half == True: 73 | ssl_content = ssl_content.half().to(device) 74 | else: 75 | ssl_content = ssl_content.to(device) 76 | codes = vq_model.extract_latent(ssl_content) 77 | semantic = " ".join([str(i) for i in codes[0, 0, :].tolist()]) 78 | lines.append("%s\t%s" % (wav_name, semantic)) 79 | 80 | with open(inp_text, "r", encoding="utf8") as f: 81 | lines = f.read().strip("\n").split("\n") 82 | 83 | lines1 = [] 84 | for line in lines[int(i_part) :: int(all_parts)]: 85 | # print(line) 86 | try: 87 | # wav_name,text=line.split("\t") 88 | wav_name, spk_name, language, text = line.split("|") 89 | wav_name = os.path.basename(wav_name) 90 | # name2go(name,lines1) 91 | name2go(wav_name, lines1) 92 | except: 93 | print(line, traceback.format_exc()) 94 | with open(semantic_path, "w", encoding="utf8") as f: 95 | f.write("\n".join(lines1)) 96 | -------------------------------------------------------------------------------- /GPT_SoVITS/process_ckpt.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | from collections import OrderedDict 3 | from time import time as ttime 4 | import shutil,os 5 | import torch 6 | from tools.i18n.i18n import I18nAuto 7 | 8 | i18n = I18nAuto() 9 | 10 | def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path 11 | dir=os.path.dirname(path) 12 | name=os.path.basename(path) 13 | tmp_path="%s.pth"%(ttime()) 14 | torch.save(fea,tmp_path) 15 | shutil.move(tmp_path,"%s/%s"%(dir,name)) 16 | 17 | def savee(ckpt, name, epoch, steps, hps): 18 | try: 19 | opt = OrderedDict() 20 | opt["weight"] = {} 21 | for key in ckpt.keys(): 22 | if "enc_q" in key: 23 | continue 24 | opt["weight"][key] = ckpt[key].half() 25 | opt["config"] = hps 26 | opt["info"] = "%sepoch_%siteration" % (epoch, steps) 27 | # torch.save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) 28 | my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) 29 | return "Success." 
30 | except: 31 | return traceback.format_exc() 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/__init__.py: -------------------------------------------------------------------------------- 1 | from text.symbols import * 2 | 3 | 4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 5 | 6 | def cleaned_text_to_sequence(cleaned_text): 7 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 8 | Args: 9 | text: string to convert to a sequence 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | ''' 13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 14 | return phones 15 | 16 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/text/__pycache__/chinese.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/__pycache__/chinese.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/text/__pycache__/cleaner.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/__pycache__/cleaner.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/text/__pycache__/english.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/__pycache__/english.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/text/__pycache__/japanese.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/__pycache__/japanese.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/text/__pycache__/symbols.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/__pycache__/symbols.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/text/__pycache__/tone_sandhi.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/__pycache__/tone_sandhi.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/text/chinese.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pdb 3 | import re 4 | 5 | 
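# cn2an handles Arabic/Chinese numeral conversion; pypinyin supplies tone-marked pinyin for the g2p pipeline below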
import cn2an 6 | from pypinyin import lazy_pinyin, Style 7 | 8 | from text.symbols import punctuation 9 | from text.tone_sandhi import ToneSandhi 10 | from text.zh_normalization.text_normlization import TextNormalizer 11 | 12 | normalizer = lambda x: cn2an.transform(x, "an2cn") 13 | 14 | current_file_path = os.path.dirname(__file__) 15 | pinyin_to_symbol_map = { 16 | line.split("\t")[0]: line.strip().split("\t")[1] 17 | for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines() 18 | } 19 | 20 | import jieba_fast.posseg as psg 21 | 22 | 23 | rep_map = { 24 | ":": ",", 25 | ";": ",", 26 | ",": ",", 27 | "。": ".", 28 | "!": "!", 29 | "?": "?", 30 | "\n": ".", 31 | "·": ",", 32 | "、": ",", 33 | "...": "…", 34 | "$": ".", 35 | "/": ",", 36 | "—": "-", 37 | "~": "…", 38 | "~":"…", 39 | } 40 | 41 | tone_modifier = ToneSandhi() 42 | 43 | 44 | def replace_punctuation(text): 45 | text = text.replace("嗯", "恩").replace("呣", "母") 46 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 47 | 48 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 49 | 50 | replaced_text = re.sub( 51 | r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text 52 | ) 53 | 54 | return replaced_text 55 | 56 | 57 | def g2p(text): 58 | pattern = r"(?<=[{0}])\s*".format("".join(punctuation)) 59 | sentences = [i for i in re.split(pattern, text) if i.strip() != ""] 60 | phones, word2ph = _g2p(sentences) 61 | return phones, word2ph 62 | 63 | 64 | def _get_initials_finals(word): 65 | initials = [] 66 | finals = [] 67 | orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS) 68 | orig_finals = lazy_pinyin( 69 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3 70 | ) 71 | for c, v in zip(orig_initials, orig_finals): 72 | initials.append(c) 73 | finals.append(v) 74 | return initials, finals 75 | 76 | 77 | def _g2p(segments): 78 | phones_list = [] 79 | word2ph = [] 80 | for seg in segments: 81 | pinyins = [] 82 | # Replace all English words in the sentence 83 | seg = re.sub("[a-zA-Z]+", "", seg) 84 | seg_cut = psg.lcut(seg) 85 | initials = [] 86 | finals = [] 87 | seg_cut = tone_modifier.pre_merge_for_modify(seg_cut) 88 | for word, pos in seg_cut: 89 | if pos == "eng": 90 | continue 91 | sub_initials, sub_finals = _get_initials_finals(word) 92 | sub_finals = tone_modifier.modified_tone(word, pos, sub_finals) 93 | initials.append(sub_initials) 94 | finals.append(sub_finals) 95 | 96 | # assert len(sub_initials) == len(sub_finals) == len(word) 97 | initials = sum(initials, []) 98 | finals = sum(finals, []) 99 | # 100 | for c, v in zip(initials, finals): 101 | raw_pinyin = c + v 102 | # NOTE: post process for pypinyin outputs 103 | # we discriminate i, ii and iii 104 | if c == v: 105 | assert c in punctuation 106 | phone = [c] 107 | word2ph.append(1) 108 | else: 109 | v_without_tone = v[:-1] 110 | tone = v[-1] 111 | 112 | pinyin = c + v_without_tone 113 | assert tone in "12345" 114 | 115 | if c: 116 | # 多音节 117 | v_rep_map = { 118 | "uei": "ui", 119 | "iou": "iu", 120 | "uen": "un", 121 | } 122 | if v_without_tone in v_rep_map.keys(): 123 | pinyin = c + v_rep_map[v_without_tone] 124 | else: 125 | # 单音节 126 | pinyin_rep_map = { 127 | "ing": "ying", 128 | "i": "yi", 129 | "in": "yin", 130 | "u": "wu", 131 | } 132 | if pinyin in pinyin_rep_map.keys(): 133 | pinyin = pinyin_rep_map[pinyin] 134 | else: 135 | single_rep_map = { 136 | "v": "yu", 137 | "e": "e", 138 | "i": "y", 139 | "u": "w", 140 | } 141 | if pinyin[0] in 
single_rep_map.keys(): 142 | pinyin = single_rep_map[pinyin[0]] + pinyin[1:] 143 | 144 | assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin) 145 | new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ") 146 | new_v = new_v + tone 147 | phone = [new_c, new_v] 148 | word2ph.append(len(phone)) 149 | 150 | phones_list += phone 151 | return phones_list, word2ph 152 | 153 | 154 | def text_normalize(text): 155 | # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization 156 | tx = TextNormalizer() 157 | sentences = tx.normalize(text) 158 | dest_text = "" 159 | for sentence in sentences: 160 | dest_text += replace_punctuation(sentence) 161 | return dest_text 162 | 163 | 164 | if __name__ == "__main__": 165 | text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏" 166 | text = "呣呣呣~就是…大人的鼹鼠党吧?" 167 | text = "你好" 168 | text = text_normalize(text) 169 | print(g2p(text)) 170 | 171 | 172 | # # 示例用法 173 | # text = "这是一个示例文本:,你好!这是一个测试..." 174 | # print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试 175 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from text import chinese, japanese, cleaned_text_to_sequence, symbols, english 2 | 3 | language_module_map = {"zh": chinese, "ja": japanese, "en": english} 4 | special = [ 5 | # ("%", "zh", "SP"), 6 | ("¥", "zh", "SP2"), 7 | ("^", "zh", "SP3"), 8 | # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧 9 | ] 10 | 11 | 12 | def clean_text(text, language): 13 | if(language not in language_module_map): 14 | language="en" 15 | text=" " 16 | for special_s, special_l, target_symbol in special: 17 | if special_s in text and language == special_l: 18 | return clean_special(text, language, special_s, target_symbol) 19 | language_module = language_module_map[language] 20 | norm_text = language_module.text_normalize(text) 21 | if language == "zh": 22 | phones, word2ph = language_module.g2p(norm_text) 23 | assert len(phones) == sum(word2ph) 24 | assert len(norm_text) == len(word2ph) 25 | else: 26 | phones = language_module.g2p(norm_text) 27 | word2ph = None 28 | 29 | for ph in phones: 30 | assert ph in symbols 31 | return phones, word2ph, norm_text 32 | 33 | 34 | def clean_special(text, language, special_s, target_symbol): 35 | """ 36 | 特殊静音段sp符号处理 37 | """ 38 | text = text.replace(special_s, ",") 39 | language_module = language_module_map[language] 40 | norm_text = language_module.text_normalize(text) 41 | phones = language_module.g2p(norm_text) 42 | new_ph = [] 43 | for ph in phones[0]: 44 | assert ph in symbols 45 | if ph == ",": 46 | new_ph.append(target_symbol) 47 | else: 48 | new_ph.append(ph) 49 | return new_ph, phones[1], norm_text 50 | 51 | 52 | def text_to_sequence(text, language): 53 | phones = clean_text(text) 54 | return cleaned_text_to_sequence(phones) 55 | 56 | 57 | if __name__ == "__main__": 58 | print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh")) 59 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/engdict-hot.rep: -------------------------------------------------------------------------------- 1 | CHATGPT CH AE1 T JH IY1 P IY1 T IY1 2 | JSON JH EY1 S AH0 N -------------------------------------------------------------------------------- /GPT_SoVITS/text/engdict_cache.pickle: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/engdict_cache.pickle -------------------------------------------------------------------------------- /GPT_SoVITS/text/japanese.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py 2 | import re 3 | import sys 4 | 5 | import pyopenjtalk 6 | 7 | 8 | from text import symbols 9 | # Regular expression matching Japanese without punctuation marks: 10 | _japanese_characters = re.compile( 11 | r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" 12 | ) 13 | 14 | # Regular expression matching non-Japanese characters or punctuation marks: 15 | _japanese_marks = re.compile( 16 | r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" 17 | ) 18 | 19 | # List of (symbol, Japanese) pairs for marks: 20 | _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]] 21 | 22 | 23 | # List of (consonant, sokuon) pairs: 24 | _real_sokuon = [ 25 | (re.compile("%s" % x[0]), x[1]) 26 | for x in [ 27 | (r"Q([↑↓]*[kg])", r"k#\1"), 28 | (r"Q([↑↓]*[tdjʧ])", r"t#\1"), 29 | (r"Q([↑↓]*[sʃ])", r"s\1"), 30 | (r"Q([↑↓]*[pb])", r"p#\1"), 31 | ] 32 | ] 33 | 34 | # List of (consonant, hatsuon) pairs: 35 | _real_hatsuon = [ 36 | (re.compile("%s" % x[0]), x[1]) 37 | for x in [ 38 | (r"N([↑↓]*[pbm])", r"m\1"), 39 | (r"N([↑↓]*[ʧʥj])", r"n^\1"), 40 | (r"N([↑↓]*[tdn])", r"n\1"), 41 | (r"N([↑↓]*[kg])", r"ŋ\1"), 42 | ] 43 | ] 44 | 45 | 46 | def post_replace_ph(ph): 47 | rep_map = { 48 | ":": ",", 49 | ";": ",", 50 | ",": ",", 51 | "。": ".", 52 | "!": "!", 53 | "?": "?", 54 | "\n": ".", 55 | "·": ",", 56 | "、": ",", 57 | "...": "…", 58 | } 59 | if ph in rep_map.keys(): 60 | ph = rep_map[ph] 61 | if ph in symbols: 62 | return ph 63 | if ph not in symbols: 64 | ph = "UNK" 65 | return ph 66 | 67 | 68 | def symbols_to_japanese(text): 69 | for regex, replacement in _symbols_to_japanese: 70 | text = re.sub(regex, replacement, text) 71 | return text 72 | 73 | 74 | def preprocess_jap(text, with_prosody=False): 75 | """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html""" 76 | text = symbols_to_japanese(text) 77 | sentences = re.split(_japanese_marks, text) 78 | marks = re.findall(_japanese_marks, text) 79 | text = [] 80 | for i, sentence in enumerate(sentences): 81 | if re.match(_japanese_characters, sentence): 82 | if with_prosody: 83 | text += pyopenjtalk_g2p_prosody(sentence)[1:-1] 84 | else: 85 | p = pyopenjtalk.g2p(sentence) 86 | text += p.split(" ") 87 | 88 | if i < len(marks): 89 | if marks[i] == " ":# 防止意外的UNK 90 | continue 91 | text += [marks[i].replace(" ", "")] 92 | return text 93 | 94 | 95 | def text_normalize(text): 96 | # todo: jap text normalize 97 | return text 98 | 99 | # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py 100 | def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True): 101 | """Extract phoneme + prosoody symbol sequence from input full-context labels. 102 | 103 | The algorithm is based on `Prosodic features control by symbols as input of 104 | sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks. 105 | 106 | Args: 107 | text (str): Input text. 108 | drop_unvoiced_vowels (bool): whether to drop unvoiced vowels. 
109 | 110 | Returns: 111 | List[str]: List of phoneme + prosody symbols. 112 | 113 | Examples: 114 | >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody 115 | >>> pyopenjtalk_g2p_prosody("こんにちは。") 116 | ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$'] 117 | 118 | .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic 119 | modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104 120 | 121 | """ 122 | labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text)) 123 | N = len(labels) 124 | 125 | phones = [] 126 | for n in range(N): 127 | lab_curr = labels[n] 128 | 129 | # current phoneme 130 | p3 = re.search(r"\-(.*?)\+", lab_curr).group(1) 131 | # deal unvoiced vowels as normal vowels 132 | if drop_unvoiced_vowels and p3 in "AEIOU": 133 | p3 = p3.lower() 134 | 135 | # deal with sil at the beginning and the end of text 136 | if p3 == "sil": 137 | assert n == 0 or n == N - 1 138 | if n == 0: 139 | phones.append("^") 140 | elif n == N - 1: 141 | # check question form or not 142 | e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr) 143 | if e3 == 0: 144 | phones.append("$") 145 | elif e3 == 1: 146 | phones.append("?") 147 | continue 148 | elif p3 == "pau": 149 | phones.append("_") 150 | continue 151 | else: 152 | phones.append(p3) 153 | 154 | # accent type and position info (forward or backward) 155 | a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr) 156 | a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr) 157 | a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr) 158 | 159 | # number of mora in accent phrase 160 | f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr) 161 | 162 | a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1]) 163 | # accent phrase border 164 | if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl": 165 | phones.append("#") 166 | # pitch falling 167 | elif a1 == 0 and a2_next == a2 + 1 and a2 != f1: 168 | phones.append("]") 169 | # pitch rising 170 | elif a2 == 1 and a2_next == 2: 171 | phones.append("[") 172 | 173 | return phones 174 | 175 | # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py 176 | def _numeric_feature_by_regex(regex, s): 177 | match = re.search(regex, s) 178 | if match is None: 179 | return -50 180 | return int(match.group(1)) 181 | 182 | def g2p(norm_text, with_prosody=False): 183 | phones = preprocess_jap(norm_text, with_prosody) 184 | phones = [post_replace_ph(i) for i in phones] 185 | # todo: implement tones and word2ph 186 | return phones 187 | 188 | 189 | if __name__ == "__main__": 190 | phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね!") 191 | print(phones) -------------------------------------------------------------------------------- /GPT_SoVITS/text/namedict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/namedict_cache.pickle -------------------------------------------------------------------------------- /GPT_SoVITS/text/opencpop-strict.txt: -------------------------------------------------------------------------------- 1 | a AA a 2 | ai AA ai 3 | an AA an 4 | ang AA ang 5 | ao AA ao 6 | ba b a 7 | bai b ai 8 | ban b an 9 | bang b ang 10 | bao b ao 11 | bei b ei 12 | ben b en 13 | beng b eng 14 | bi b i 15 | bian b ian 16 | biao b iao 17 | bie b ie 18 | bin b in 19 | bing b ing 20 | bo b o 21 | bu b u 22 | ca c a 23 | cai c ai 24 
| can c an 25 | cang c ang 26 | cao c ao 27 | ce c e 28 | cei c ei 29 | cen c en 30 | ceng c eng 31 | cha ch a 32 | chai ch ai 33 | chan ch an 34 | chang ch ang 35 | chao ch ao 36 | che ch e 37 | chen ch en 38 | cheng ch eng 39 | chi ch ir 40 | chong ch ong 41 | chou ch ou 42 | chu ch u 43 | chua ch ua 44 | chuai ch uai 45 | chuan ch uan 46 | chuang ch uang 47 | chui ch ui 48 | chun ch un 49 | chuo ch uo 50 | ci c i0 51 | cong c ong 52 | cou c ou 53 | cu c u 54 | cuan c uan 55 | cui c ui 56 | cun c un 57 | cuo c uo 58 | da d a 59 | dai d ai 60 | dan d an 61 | dang d ang 62 | dao d ao 63 | de d e 64 | dei d ei 65 | den d en 66 | deng d eng 67 | di d i 68 | dia d ia 69 | dian d ian 70 | diao d iao 71 | die d ie 72 | ding d ing 73 | diu d iu 74 | dong d ong 75 | dou d ou 76 | du d u 77 | duan d uan 78 | dui d ui 79 | dun d un 80 | duo d uo 81 | e EE e 82 | ei EE ei 83 | en EE en 84 | eng EE eng 85 | er EE er 86 | fa f a 87 | fan f an 88 | fang f ang 89 | fei f ei 90 | fen f en 91 | feng f eng 92 | fo f o 93 | fou f ou 94 | fu f u 95 | ga g a 96 | gai g ai 97 | gan g an 98 | gang g ang 99 | gao g ao 100 | ge g e 101 | gei g ei 102 | gen g en 103 | geng g eng 104 | gong g ong 105 | gou g ou 106 | gu g u 107 | gua g ua 108 | guai g uai 109 | guan g uan 110 | guang g uang 111 | gui g ui 112 | gun g un 113 | guo g uo 114 | ha h a 115 | hai h ai 116 | han h an 117 | hang h ang 118 | hao h ao 119 | he h e 120 | hei h ei 121 | hen h en 122 | heng h eng 123 | hong h ong 124 | hou h ou 125 | hu h u 126 | hua h ua 127 | huai h uai 128 | huan h uan 129 | huang h uang 130 | hui h ui 131 | hun h un 132 | huo h uo 133 | ji j i 134 | jia j ia 135 | jian j ian 136 | jiang j iang 137 | jiao j iao 138 | jie j ie 139 | jin j in 140 | jing j ing 141 | jiong j iong 142 | jiu j iu 143 | ju j v 144 | jv j v 145 | juan j van 146 | jvan j van 147 | jue j ve 148 | jve j ve 149 | jun j vn 150 | jvn j vn 151 | ka k a 152 | kai k ai 153 | kan k an 154 | kang k ang 155 | kao k ao 156 | ke k e 157 | kei k ei 158 | ken k en 159 | keng k eng 160 | kong k ong 161 | kou k ou 162 | ku k u 163 | kua k ua 164 | kuai k uai 165 | kuan k uan 166 | kuang k uang 167 | kui k ui 168 | kun k un 169 | kuo k uo 170 | la l a 171 | lai l ai 172 | lan l an 173 | lang l ang 174 | lao l ao 175 | le l e 176 | lei l ei 177 | leng l eng 178 | li l i 179 | lia l ia 180 | lian l ian 181 | liang l iang 182 | liao l iao 183 | lie l ie 184 | lin l in 185 | ling l ing 186 | liu l iu 187 | lo l o 188 | long l ong 189 | lou l ou 190 | lu l u 191 | luan l uan 192 | lun l un 193 | luo l uo 194 | lv l v 195 | lve l ve 196 | ma m a 197 | mai m ai 198 | man m an 199 | mang m ang 200 | mao m ao 201 | me m e 202 | mei m ei 203 | men m en 204 | meng m eng 205 | mi m i 206 | mian m ian 207 | miao m iao 208 | mie m ie 209 | min m in 210 | ming m ing 211 | miu m iu 212 | mo m o 213 | mou m ou 214 | mu m u 215 | na n a 216 | nai n ai 217 | nan n an 218 | nang n ang 219 | nao n ao 220 | ne n e 221 | nei n ei 222 | nen n en 223 | neng n eng 224 | ni n i 225 | nian n ian 226 | niang n iang 227 | niao n iao 228 | nie n ie 229 | nin n in 230 | ning n ing 231 | niu n iu 232 | nong n ong 233 | nou n ou 234 | nu n u 235 | nuan n uan 236 | nun n un 237 | nuo n uo 238 | nv n v 239 | nve n ve 240 | o OO o 241 | ou OO ou 242 | pa p a 243 | pai p ai 244 | pan p an 245 | pang p ang 246 | pao p ao 247 | pei p ei 248 | pen p en 249 | peng p eng 250 | pi p i 251 | pian p ian 252 | piao p iao 253 | pie p ie 254 | pin p in 255 | ping p ing 256 | po p o 257 | pou p ou 258 | pu p u 259 | 
qi q i 260 | qia q ia 261 | qian q ian 262 | qiang q iang 263 | qiao q iao 264 | qie q ie 265 | qin q in 266 | qing q ing 267 | qiong q iong 268 | qiu q iu 269 | qu q v 270 | qv q v 271 | quan q van 272 | qvan q van 273 | que q ve 274 | qve q ve 275 | qun q vn 276 | qvn q vn 277 | ran r an 278 | rang r ang 279 | rao r ao 280 | re r e 281 | ren r en 282 | reng r eng 283 | ri r ir 284 | rong r ong 285 | rou r ou 286 | ru r u 287 | rua r ua 288 | ruan r uan 289 | rui r ui 290 | run r un 291 | ruo r uo 292 | sa s a 293 | sai s ai 294 | san s an 295 | sang s ang 296 | sao s ao 297 | se s e 298 | sen s en 299 | seng s eng 300 | sha sh a 301 | shai sh ai 302 | shan sh an 303 | shang sh ang 304 | shao sh ao 305 | she sh e 306 | shei sh ei 307 | shen sh en 308 | sheng sh eng 309 | shi sh ir 310 | shou sh ou 311 | shu sh u 312 | shua sh ua 313 | shuai sh uai 314 | shuan sh uan 315 | shuang sh uang 316 | shui sh ui 317 | shun sh un 318 | shuo sh uo 319 | si s i0 320 | song s ong 321 | sou s ou 322 | su s u 323 | suan s uan 324 | sui s ui 325 | sun s un 326 | suo s uo 327 | ta t a 328 | tai t ai 329 | tan t an 330 | tang t ang 331 | tao t ao 332 | te t e 333 | tei t ei 334 | teng t eng 335 | ti t i 336 | tian t ian 337 | tiao t iao 338 | tie t ie 339 | ting t ing 340 | tong t ong 341 | tou t ou 342 | tu t u 343 | tuan t uan 344 | tui t ui 345 | tun t un 346 | tuo t uo 347 | wa w a 348 | wai w ai 349 | wan w an 350 | wang w ang 351 | wei w ei 352 | wen w en 353 | weng w eng 354 | wo w o 355 | wu w u 356 | xi x i 357 | xia x ia 358 | xian x ian 359 | xiang x iang 360 | xiao x iao 361 | xie x ie 362 | xin x in 363 | xing x ing 364 | xiong x iong 365 | xiu x iu 366 | xu x v 367 | xv x v 368 | xuan x van 369 | xvan x van 370 | xue x ve 371 | xve x ve 372 | xun x vn 373 | xvn x vn 374 | ya y a 375 | yan y En 376 | yang y ang 377 | yao y ao 378 | ye y E 379 | yi y i 380 | yin y in 381 | ying y ing 382 | yo y o 383 | yong y ong 384 | you y ou 385 | yu y v 386 | yv y v 387 | yuan y van 388 | yvan y van 389 | yue y ve 390 | yve y ve 391 | yun y vn 392 | yvn y vn 393 | za z a 394 | zai z ai 395 | zan z an 396 | zang z ang 397 | zao z ao 398 | ze z e 399 | zei z ei 400 | zen z en 401 | zeng z eng 402 | zha zh a 403 | zhai zh ai 404 | zhan zh an 405 | zhang zh ang 406 | zhao zh ao 407 | zhe zh e 408 | zhei zh ei 409 | zhen zh en 410 | zheng zh eng 411 | zhi zh ir 412 | zhong zh ong 413 | zhou zh ou 414 | zhu zh u 415 | zhua zh ua 416 | zhuai zh uai 417 | zhuan zh uan 418 | zhuang zh uang 419 | zhui zh ui 420 | zhun zh un 421 | zhuo zh uo 422 | zi z i0 423 | zong z ong 424 | zou z ou 425 | zu z u 426 | zuan z uan 427 | zui z ui 428 | zun z un 429 | zuo z uo 430 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/symbols.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # punctuation = ['!', '?', '…', ",", ".","@"]#@是SP停顿 4 | punctuation = ["!", "?", "…", ",", "."] # @是SP停顿 5 | punctuation.append("-") 6 | pu_symbols = punctuation + ["SP", "SP2", "SP3", "UNK"] 7 | # pu_symbols = punctuation + ["SP", 'SP2', 'SP3','SP4', "UNK"] 8 | pad = "_" 9 | 10 | c = [ 11 | "AA", 12 | "EE", 13 | "OO", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "f", 19 | "g", 20 | "h", 21 | "j", 22 | "k", 23 | "l", 24 | "m", 25 | "n", 26 | "p", 27 | "q", 28 | "r", 29 | "s", 30 | "sh", 31 | "t", 32 | "w", 33 | "x", 34 | "y", 35 | "z", 36 | "zh", 37 | ] 38 | v = [ 39 | "E1", 40 | "En1", 41 | "a1", 42 | "ai1", 43 | "an1", 44 | "ang1", 45 | "ao1", 
46 | "e1", 47 | "ei1", 48 | "en1", 49 | "eng1", 50 | "er1", 51 | "i1", 52 | "i01", 53 | "ia1", 54 | "ian1", 55 | "iang1", 56 | "iao1", 57 | "ie1", 58 | "in1", 59 | "ing1", 60 | "iong1", 61 | "ir1", 62 | "iu1", 63 | "o1", 64 | "ong1", 65 | "ou1", 66 | "u1", 67 | "ua1", 68 | "uai1", 69 | "uan1", 70 | "uang1", 71 | "ui1", 72 | "un1", 73 | "uo1", 74 | "v1", 75 | "van1", 76 | "ve1", 77 | "vn1", 78 | "E2", 79 | "En2", 80 | "a2", 81 | "ai2", 82 | "an2", 83 | "ang2", 84 | "ao2", 85 | "e2", 86 | "ei2", 87 | "en2", 88 | "eng2", 89 | "er2", 90 | "i2", 91 | "i02", 92 | "ia2", 93 | "ian2", 94 | "iang2", 95 | "iao2", 96 | "ie2", 97 | "in2", 98 | "ing2", 99 | "iong2", 100 | "ir2", 101 | "iu2", 102 | "o2", 103 | "ong2", 104 | "ou2", 105 | "u2", 106 | "ua2", 107 | "uai2", 108 | "uan2", 109 | "uang2", 110 | "ui2", 111 | "un2", 112 | "uo2", 113 | "v2", 114 | "van2", 115 | "ve2", 116 | "vn2", 117 | "E3", 118 | "En3", 119 | "a3", 120 | "ai3", 121 | "an3", 122 | "ang3", 123 | "ao3", 124 | "e3", 125 | "ei3", 126 | "en3", 127 | "eng3", 128 | "er3", 129 | "i3", 130 | "i03", 131 | "ia3", 132 | "ian3", 133 | "iang3", 134 | "iao3", 135 | "ie3", 136 | "in3", 137 | "ing3", 138 | "iong3", 139 | "ir3", 140 | "iu3", 141 | "o3", 142 | "ong3", 143 | "ou3", 144 | "u3", 145 | "ua3", 146 | "uai3", 147 | "uan3", 148 | "uang3", 149 | "ui3", 150 | "un3", 151 | "uo3", 152 | "v3", 153 | "van3", 154 | "ve3", 155 | "vn3", 156 | "E4", 157 | "En4", 158 | "a4", 159 | "ai4", 160 | "an4", 161 | "ang4", 162 | "ao4", 163 | "e4", 164 | "ei4", 165 | "en4", 166 | "eng4", 167 | "er4", 168 | "i4", 169 | "i04", 170 | "ia4", 171 | "ian4", 172 | "iang4", 173 | "iao4", 174 | "ie4", 175 | "in4", 176 | "ing4", 177 | "iong4", 178 | "ir4", 179 | "iu4", 180 | "o4", 181 | "ong4", 182 | "ou4", 183 | "u4", 184 | "ua4", 185 | "uai4", 186 | "uan4", 187 | "uang4", 188 | "ui4", 189 | "un4", 190 | "uo4", 191 | "v4", 192 | "van4", 193 | "ve4", 194 | "vn4", 195 | "E5", 196 | "En5", 197 | "a5", 198 | "ai5", 199 | "an5", 200 | "ang5", 201 | "ao5", 202 | "e5", 203 | "ei5", 204 | "en5", 205 | "eng5", 206 | "er5", 207 | "i5", 208 | "i05", 209 | "ia5", 210 | "ian5", 211 | "iang5", 212 | "iao5", 213 | "ie5", 214 | "in5", 215 | "ing5", 216 | "iong5", 217 | "ir5", 218 | "iu5", 219 | "o5", 220 | "ong5", 221 | "ou5", 222 | "u5", 223 | "ua5", 224 | "uai5", 225 | "uan5", 226 | "uang5", 227 | "ui5", 228 | "un5", 229 | "uo5", 230 | "v5", 231 | "van5", 232 | "ve5", 233 | "vn5", 234 | ] 235 | 236 | v_without_tone = [ 237 | "E", 238 | "En", 239 | "a", 240 | "ai", 241 | "an", 242 | "ang", 243 | "ao", 244 | "e", 245 | "ei", 246 | "en", 247 | "eng", 248 | "er", 249 | "i", 250 | "i0", 251 | "ia", 252 | "ian", 253 | "iang", 254 | "iao", 255 | "ie", 256 | "in", 257 | "ing", 258 | "iong", 259 | "ir", 260 | "iu", 261 | "o", 262 | "ong", 263 | "ou", 264 | "u", 265 | "ua", 266 | "uai", 267 | "uan", 268 | "uang", 269 | "ui", 270 | "un", 271 | "uo", 272 | "v", 273 | "van", 274 | "ve", 275 | "vn", 276 | ] 277 | 278 | # japanese 279 | ja_symbols = [ 280 | "I", 281 | "N", 282 | "U", 283 | "a", 284 | "b", 285 | "by", 286 | "ch", 287 | "cl", 288 | "d", 289 | "dy", 290 | "e", 291 | "f", 292 | "g", 293 | "gy", 294 | "h", 295 | "hy", 296 | "i", 297 | "j", 298 | "k", 299 | "ky", 300 | "m", 301 | "my", 302 | "n", 303 | "ny", 304 | "o", 305 | "p", 306 | "py", 307 | "r", 308 | "ry", 309 | "s", 310 | "sh", 311 | "t", 312 | "ts", 313 | "u", 314 | "v", 315 | "w", 316 | "y", 317 | "z", 318 | # "[", #上升调型 319 | # "]", #下降调型 320 | # "$", #结束符 321 | # "^", #开始符 322 | ] 323 | 324 | arpa = { 325 | "AH0", 326 | 
"S", 327 | "AH1", 328 | "EY2", 329 | "AE2", 330 | "EH0", 331 | "OW2", 332 | "UH0", 333 | "NG", 334 | "B", 335 | "G", 336 | "AY0", 337 | "M", 338 | "AA0", 339 | "F", 340 | "AO0", 341 | "ER2", 342 | "UH1", 343 | "IY1", 344 | "AH2", 345 | "DH", 346 | "IY0", 347 | "EY1", 348 | "IH0", 349 | "K", 350 | "N", 351 | "W", 352 | "IY2", 353 | "T", 354 | "AA1", 355 | "ER1", 356 | "EH2", 357 | "OY0", 358 | "UH2", 359 | "UW1", 360 | "Z", 361 | "AW2", 362 | "AW1", 363 | "V", 364 | "UW2", 365 | "AA2", 366 | "ER", 367 | "AW0", 368 | "UW0", 369 | "R", 370 | "OW1", 371 | "EH1", 372 | "ZH", 373 | "AE0", 374 | "IH2", 375 | "IH", 376 | "Y", 377 | "JH", 378 | "P", 379 | "AY1", 380 | "EY0", 381 | "OY2", 382 | "TH", 383 | "HH", 384 | "D", 385 | "ER0", 386 | "CH", 387 | "AO1", 388 | "AE1", 389 | "AO2", 390 | "OY1", 391 | "AY2", 392 | "IH1", 393 | "OW0", 394 | "L", 395 | "SH", 396 | } 397 | 398 | symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa) 399 | symbols = sorted(set(symbols)) 400 | if __name__ == "__main__": 401 | print(len(symbols)) 402 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/README.md: -------------------------------------------------------------------------------- 1 | ## Supported NSW (Non-Standard-Word) Normalization 2 | 3 | |NSW type|raw|normalized| 4 | |:--|:-|:-| 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| 6 | |cardinal|这块黄金重达324.75克
我们班的最高总分为583分|这块黄金重达三百二十四点七五克
我们班的最高总分为五百八十三分| 7 | |numeric range |12\~23
-1.5\~2|十二到二十三
负一点五到二| 8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日| 9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我 10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度 11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| 12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| 13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万| 14 | |telephone|这是固话0421-33441122
这是手机+86 18544139121|这是固话零四二一三三四四一一二二
这是手机八六一八五四四一三九一二一| 15 | ## References 16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) 17 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from text.zh_normalization.text_normlization import * 15 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/zh_normalization/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/__pycache__/char_convert.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/zh_normalization/__pycache__/char_convert.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/__pycache__/chronology.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/zh_normalization/__pycache__/chronology.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/__pycache__/constants.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/zh_normalization/__pycache__/constants.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/__pycache__/num.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/zh_normalization/__pycache__/num.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/__pycache__/phonecode.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/zh_normalization/__pycache__/phonecode.cpython-310.pyc 
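A minimal usage sketch for the zh_normalization package listed above, following the same call pattern that chinese.py's text_normalize() uses (TextNormalizer.normalize() returns a list of already-normalized sentences); with the cardinal example from the README table, the digits come back verbalized:

```
from text.zh_normalization.text_normlization import TextNormalizer

tx = TextNormalizer()
for sentence in tx.normalize("这块黄金重达324.75克"):
    print(sentence)  # expected per the table above: 这块黄金重达三百二十四点七五克
```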
-------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/__pycache__/quantifier.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/zh_normalization/__pycache__/quantifier.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/__pycache__/text_normlization.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/GPT_SoVITS/text/zh_normalization/__pycache__/text_normlization.cpython-310.pyc -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/chronology.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import DIGITS 17 | from .num import num2str 18 | from .num import verbalize_cardinal 19 | from .num import verbalize_digit 20 | 21 | 22 | def _time_num2str(num_string: str) -> str: 23 | """A special case for verbalizing number in time.""" 24 | result = num2str(num_string.lstrip('0')) 25 | if num_string.startswith('0'): 26 | result = DIGITS['0'] + result 27 | return result 28 | 29 | 30 | # 时刻表达式 31 | RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' 32 | r':([0-5][0-9])' 33 | r'(:([0-5][0-9]))?') 34 | 35 | # 时间范围,如8:30-12:30 36 | RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' 37 | r':([0-5][0-9])' 38 | r'(:([0-5][0-9]))?' 39 | r'(~|-)' 40 | r'([0-1]?[0-9]|2[0-3])' 41 | r':([0-5][0-9])' 42 | r'(:([0-5][0-9]))?') 43 | 44 | 45 | def replace_time(match) -> str: 46 | """ 47 | Args: 48 | match (re.Match) 49 | Returns: 50 | str 51 | """ 52 | 53 | is_range = len(match.groups()) > 5 54 | 55 | hour = match.group(1) 56 | minute = match.group(2) 57 | second = match.group(4) 58 | 59 | if is_range: 60 | hour_2 = match.group(6) 61 | minute_2 = match.group(7) 62 | second_2 = match.group(9) 63 | 64 | result = f"{num2str(hour)}点" 65 | if minute.lstrip('0'): 66 | if int(minute) == 30: 67 | result += "半" 68 | else: 69 | result += f"{_time_num2str(minute)}分" 70 | if second and second.lstrip('0'): 71 | result += f"{_time_num2str(second)}秒" 72 | 73 | if is_range: 74 | result += "至" 75 | result += f"{num2str(hour_2)}点" 76 | if minute_2.lstrip('0'): 77 | if int(minute) == 30: 78 | result += "半" 79 | else: 80 | result += f"{_time_num2str(minute_2)}分" 81 | if second_2 and second_2.lstrip('0'): 82 | result += f"{_time_num2str(second_2)}秒" 83 | 84 | return result 85 | 86 | 87 | RE_DATE = re.compile(r'(\d{4}|\d{2})年' 88 | r'((0?[1-9]|1[0-2])月)?' 
89 | r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?') 90 | 91 | 92 | def replace_date(match) -> str: 93 | """ 94 | Args: 95 | match (re.Match) 96 | Returns: 97 | str 98 | """ 99 | year = match.group(1) 100 | month = match.group(3) 101 | day = match.group(5) 102 | result = "" 103 | if year: 104 | result += f"{verbalize_digit(year)}年" 105 | if month: 106 | result += f"{verbalize_cardinal(month)}月" 107 | if day: 108 | result += f"{verbalize_cardinal(day)}{match.group(9)}" 109 | return result 110 | 111 | 112 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 113 | RE_DATE2 = re.compile( 114 | r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])') 115 | 116 | 117 | def replace_date2(match) -> str: 118 | """ 119 | Args: 120 | match (re.Match) 121 | Returns: 122 | str 123 | """ 124 | year = match.group(1) 125 | month = match.group(3) 126 | day = match.group(4) 127 | result = "" 128 | if year: 129 | result += f"{verbalize_digit(year)}年" 130 | if month: 131 | result += f"{verbalize_cardinal(month)}月" 132 | if day: 133 | result += f"{verbalize_cardinal(day)}日" 134 | return result 135 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
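# The F2H_*/H2F_* tables defined below are ordinal-to-ordinal maps, so they plug
# directly into str.translate; a quick sketch of that (assumed) usage:
#     "ABC123".translate(F2H_ASCII_LETTERS).translate(F2H_DIGITS) -> "ABC123"
# RE_NSW further down matches runs of characters that are not pinyin-bearing
# Chinese, which per its own comment is meant for extracting non-standard words (NSW).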
14 | import re 15 | import string 16 | 17 | from pypinyin.constants import SUPPORT_UCS4 18 | 19 | # 全角半角转换 20 | # 英文字符全角 -> 半角映射表 (num: 52) 21 | F2H_ASCII_LETTERS = { 22 | ord(char) + 65248: ord(char) 23 | for char in string.ascii_letters 24 | } 25 | 26 | # 英文字符半角 -> 全角映射表 27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} 28 | 29 | # 数字字符全角 -> 半角映射表 (num: 10) 30 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits} 31 | # 数字字符半角 -> 全角映射表 32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} 33 | 34 | # 标点符号全角 -> 半角映射表 (num: 32) 35 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} 36 | # 标点符号半角 -> 全角映射表 37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} 38 | 39 | # 空格 (num: 1) 40 | F2H_SPACE = {'\u3000': ' '} 41 | H2F_SPACE = {' ': '\u3000'} 42 | 43 | # 非"有拼音的汉字"的字符串,可用于NSW提取 44 | if SUPPORT_UCS4: 45 | RE_NSW = re.compile(r'(?:[^' 46 | r'\u3007' # 〇 47 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 48 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 49 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 50 | r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] 51 | r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] 52 | r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] 53 | r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] 54 | r'])+') 55 | else: 56 | RE_NSW = re.compile( # pragma: no cover 57 | r'(?:[^' 58 | r'\u3007' # 〇 59 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 60 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 61 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 62 | r'])+') 63 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/phonecode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import verbalize_digit 17 | 18 | # 规范化固话/手机号码 19 | # 手机 20 | # http://www.jihaoba.com/news/show/13680 21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 22 | # 联通:130、131、132、156、155、186、185、176 23 | # 电信:133、153、189、180、181、177 24 | RE_MOBILE_PHONE = re.compile( 25 | r"(? 
str: 34 | if mobile: 35 | sp_parts = phone_string.strip('+').split() 36 | result = ','.join( 37 | [verbalize_digit(part, alt_one=True) for part in sp_parts]) 38 | return result 39 | else: 40 | sil_parts = phone_string.split('-') 41 | result = ','.join( 42 | [verbalize_digit(part, alt_one=True) for part in sil_parts]) 43 | return result 44 | 45 | 46 | def replace_phone(match) -> str: 47 | """ 48 | Args: 49 | match (re.Match) 50 | Returns: 51 | str 52 | """ 53 | return phone2str(match.group(0), mobile=False) 54 | 55 | 56 | def replace_mobile(match) -> str: 57 | """ 58 | Args: 59 | match (re.Match) 60 | Returns: 61 | str 62 | """ 63 | return phone2str(match.group(0)) 64 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/quantifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import num2str 17 | 18 | # 温度表达式,温度会影响负号的读法 19 | # -3°C 零下三度 20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') 21 | measure_dict = { 22 | "cm2": "平方厘米", 23 | "cm²": "平方厘米", 24 | "cm3": "立方厘米", 25 | "cm³": "立方厘米", 26 | "cm": "厘米", 27 | "db": "分贝", 28 | "ds": "毫秒", 29 | "kg": "千克", 30 | "km": "千米", 31 | "m2": "平方米", 32 | "m²": "平方米", 33 | "m³": "立方米", 34 | "m3": "立方米", 35 | "ml": "毫升", 36 | "m": "米", 37 | "mm": "毫米", 38 | "s": "秒" 39 | } 40 | 41 | 42 | def replace_temperature(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | sign = match.group(1) 50 | temperature = match.group(2) 51 | unit = match.group(3) 52 | sign: str = "零下" if sign else "" 53 | temperature: str = num2str(temperature) 54 | unit: str = "摄氏度" if unit == "摄氏度" else "度" 55 | result = f"{sign}{temperature}{unit}" 56 | return result 57 | 58 | 59 | def replace_measure(sentence) -> str: 60 | for q_notation in measure_dict: 61 | if q_notation in sentence: 62 | sentence = sentence.replace(q_notation, measure_dict[q_notation]) 63 | return sentence 64 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 本软件及其相关代码以MIT协议开源,作者不对软件具备任何控制力,使用软件者、传播软件导出的声音者自负全责。 2 | 如不认可该条款,则不能使用或引用软件包内任何代码和文件。 3 | 4 | 特此授予任何获得本软件和相关文档文件(以下简称“软件”)副本的人免费使用、复制、修改、合并、出版、分发、再授权和/或销售本软件的权利,以及授予本软件所提供的人使用本软件的权利,但须符合以下条件: 5 | 上述版权声明和本许可声明应包含在软件的所有副本或实质部分中。 6 | 软件是“按原样”提供的,没有任何明示或暗示的保证,包括但不限于适销性、适用于特定目的和不侵权的保证。在任何情况下,作者或版权持有人均不承担因软件或软件的使用或其他交易而产生、产生或与之相关的任何索赔、损害赔偿或其他责任,无论是在合同诉讼、侵权诉讼还是其他诉讼中。 7 | 8 | 9 | MIT License 10 | 11 | Copyright (c) 2024 AIFSH 12 | 13 | Permission is hereby granted, free of charge, to any person obtaining a copy 14 | of this software and associated documentation files (the "Software"), to deal 15 | in the Software without restriction, including without limitation the rights 16 | to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell 17 | copies of the Software, and to permit persons to whom the Software is 18 | furnished to do so, subject to the following conditions: 19 | 20 | The above copyright notice and this permission notice shall be included in all 21 | copies or substantial portions of the Software. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 24 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 25 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 26 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 27 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 28 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 29 | SOFTWARE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | new repo https://github.com/AIFSH/GSTTS-ComfyUI 2 | # ComfyUI-GPT_SoVITS 3 | a comfyui custom node for [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)! you can voice cloning and tts in comfyui now 4 |
5 |
6 | webpage 7 |
8 |
9 | 10 | # Disclaimer / 免责声明 11 | We do not hold any responsibility for any illegal usage of the codebase. Please refer to your local laws about DMCA and other related laws. 12 | 我们不对代码库的任何非法使用承担任何责任. 请参阅您当地关于 DMCA (数字千年法案) 和其他相关法律法规. 13 | 14 | ## Features 15 | - `srt` file for subtitle was supported 16 | - mutiple speaker was supported in finetune and inference by `srt` 17 | - huge comfyui custom nodes can merge in gpt_sovits 18 | 19 | ## How to use 20 | make sure `ffmpeg` is worked in your commandline 21 | for Linux 22 | ``` 23 | apt update 24 | apt install ffmpeg 25 | ``` 26 | for Windows,you can install `ffmpeg` by [WingetUI](https://github.com/marticliment/WingetUI) automatically 27 | 28 | then! 29 | ``` 30 | git clone https://github.com/AIFSH/ComfyUI-GPT_SoVITS.git 31 | cd ComfyUI-GPT_SoVITS 32 | pip install -r requirements.txt 33 | ``` 34 | `weights` will be downloaded from huggingface automatically! if you in china,make sure your internet attach the huggingface 35 | or if you still struggle with huggingface, you may try follow [hf-mirror](https://hf-mirror.com/) to config your env. 36 | 37 | 或者下载[权重文件](https://pan.quark.cn/s/e5057be01087)解压后把`pretrained_models`整个文件夹放进`ComfyUI-GPT_SoVITS`目录 38 | 39 | ## Windows 40 | There is a portable standalone build for Windows that should work for running on Nvidia GPUs and cuda>=11.8, 41 | click [the link](https://www.bilibili.com/video/BV1qx4y1h7T2) to download 42 |
43 |
44 | Wechat 45 |
46 |
47 | 48 | ## Tutorial 49 | - [Demo](https://www.bilibili.com/video/BV1yC411G7NJ) 50 | - [Demo for mutiple speaker](https://www.bilibili.com/video/BV1QC41137Wq/) 51 | - [FULL WorkFLOW](https://www.bilibili.com/video/BV1pp421D7qa) 52 | ## My other nodes you may need 53 | - [ComfyUI-UVR5](https://github.com/AIFSH/ComfyUI-UVR5) 54 | - [ComfyUI-IP_LAP](https://github.com/AIFSH/ComfyUI-IP_LAP) 55 | 56 | ## WeChat Group && Donate 57 |
58 |
59 | Wechat 60 | donate 61 |
62 |
63 | 64 | ## Thanks 65 | - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS) 66 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | import site 2 | import os,sys 3 | import logging 4 | from server import PromptServer 5 | 6 | now_dir = os.path.dirname(os.path.abspath(__file__)) 7 | site_packages_roots = [] 8 | for path in site.getsitepackages(): 9 | if "packages" in path: 10 | site_packages_roots.append(path) 11 | if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" % now_dir] 12 | #os.environ["OPENBLAS_NUM_THREADS"] = "4" 13 | for site_packages_root in site_packages_roots: 14 | if os.path.exists(site_packages_root): 15 | try: 16 | with open("%s/GPT_SoVITS.pth" % (site_packages_root), "w") as f: 17 | f.write( 18 | "%s\n%s/GPT_SoVITS\n%s/GPT_SoVITS/text\n" 19 | % (now_dir,now_dir,now_dir) 20 | ) 21 | break 22 | except PermissionError: 23 | raise PermissionError 24 | 25 | if os.path.isfile("%s/GPT_SoVITS.pth" % (site_packages_root)): 26 | print("!!!GPT_SoVITS path was added to " + "%s/GPT_SoVITS.pth" % (site_packages_root) 27 | + "\n if meet `No module` error,try `python main.py` again, don't be foolish to pip install tools") 28 | 29 | from huggingface_hub import snapshot_download 30 | model_path = os.path.join(now_dir,"pretrained_models") 31 | if not os.path.isfile(os.path.join(model_path,"s2G488k.pth")): 32 | snapshot_download(repo_id="lj1995/GPT-SoVITS",local_dir=model_path) 33 | else: 34 | print("GPT_SoVITS use cache models,make sure your 'pretrained_models' complete") 35 | 36 | WEB_DIRECTORY = "./web" 37 | from .nodes import LoadSRT,LoadAudio, GPT_SOVITS_INFER, PreViewAudio,GPT_SOVITS_FT, GPT_SOVITS_TTS 38 | 39 | # Set the web directory, any .js file in that directory will be loaded by the frontend as a frontend extension 40 | # WEB_DIRECTORY = "./somejs" 41 | 42 | # A dictionary that contains all nodes you want to export with their names 43 | # NOTE: names should be globally unique 44 | NODE_CLASS_MAPPINGS = { 45 | "GPT_SOVITS_FT": GPT_SOVITS_FT, 46 | "LoadAudio": LoadAudio, 47 | "PreViewAudio": PreViewAudio, 48 | "LoadSRT": LoadSRT, 49 | "GPT_SOVITS_INFER": GPT_SOVITS_INFER, 50 | "GPT_SOVITS_TTS": GPT_SOVITS_TTS 51 | } 52 | 53 | # A dictionary that contains the friendly/humanly readable titles for the nodes 54 | NODE_DISPLAY_NAME_MAPPINGS = { 55 | "GPT_SOVITS_FT": "GPT_SOVITS Finetune", 56 | "LoadAudio": "AudioLoader", 57 | "PreViewAudio": "PreView Audio", 58 | "LoadSRT": "SRT FILE Loader", 59 | "GPT_SOVITS_INFER": "GPT_SOVITS Inference", 60 | "GPT_SOVITS_TTS": "GPT_SOVITS TTS" 61 | } 62 | 63 | @PromptServer.instance.routes.get("/gpt_sovits/reboot") 64 | def restart(self): 65 | try: 66 | sys.stdout.close_log() 67 | except Exception as e: 68 | pass 69 | 70 | return os.execv(sys.executable, [sys.executable] + sys.argv) 71 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | 3 | import torch 4 | 5 | # 推理用的指定模型 6 | sovits_path = "" 7 | gpt_path = "" 8 | is_half_str = os.environ.get("is_half", "True") 9 | is_half = True if is_half_str.lower() == 'true' else False 10 | is_share_str = os.environ.get("is_share","False") 11 | is_share= True if is_share_str.lower() == 'true' else False 12 | 13 | cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" 14 | bert_path = 
"GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" 15 | pretrained_sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth" 16 | pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" 17 | 18 | exp_root = "logs" 19 | python_exec = sys.executable or "python" 20 | if torch.cuda.is_available(): 21 | infer_device = "cuda" 22 | else: 23 | infer_device = "cpu" 24 | 25 | webui_port_main = 9874 26 | webui_port_uvr5 = 9873 27 | webui_port_infer_tts = 9872 28 | webui_port_subfix = 9871 29 | 30 | api_port = 9880 31 | 32 | if infer_device == "cuda": 33 | gpu_name = torch.cuda.get_device_name(0) 34 | if ( 35 | ("16" in gpu_name and "V100" not in gpu_name.upper()) 36 | or "P40" in gpu_name.upper() 37 | or "P10" in gpu_name.upper() 38 | or "1060" in gpu_name 39 | or "1070" in gpu_name 40 | or "1080" in gpu_name 41 | ): 42 | is_half=False 43 | 44 | if(infer_device=="cpu"):is_half=False 45 | 46 | class Config: 47 | def __init__(self): 48 | self.sovits_path = sovits_path 49 | self.gpt_path = gpt_path 50 | self.is_half = is_half 51 | 52 | self.cnhubert_path = cnhubert_path 53 | self.bert_path = bert_path 54 | self.pretrained_sovits_path = pretrained_sovits_path 55 | self.pretrained_gpt_path = pretrained_gpt_path 56 | 57 | self.exp_root = exp_root 58 | self.python_exec = python_exec 59 | self.infer_device = infer_device 60 | 61 | self.webui_port_main = webui_port_main 62 | self.webui_port_uvr5 = webui_port_uvr5 63 | self.webui_port_infer_tts = webui_port_infer_tts 64 | self.webui_port_subfix = webui_port_subfix 65 | 66 | self.api_port = api_port 67 | -------------------------------------------------------------------------------- /donate.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/donate.jpg -------------------------------------------------------------------------------- /note.txt: -------------------------------------------------------------------------------- 1 | when update GPT_SoVITS 2 | rember make change s2_train.py 3 | from 4 | import utils 5 | to 6 | from GPT_SoVITS import utils -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pydub 2 | srt 3 | librosa 4 | LangSegment 5 | transformers 6 | cn2an 7 | pypinyin 8 | jieba_fast 9 | pyopenjtalk 10 | wordsegment 11 | g2p_en 12 | pytorch_lightning 13 | audiotsm 14 | ffmpeg-python -------------------------------------------------------------------------------- /tools/__pycache__/my_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/tools/__pycache__/my_utils.cpython-310.pyc -------------------------------------------------------------------------------- /tools/i18n/__pycache__/i18n.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/tools/i18n/__pycache__/i18n.cpython-310.pyc -------------------------------------------------------------------------------- /tools/i18n/i18n.py: -------------------------------------------------------------------------------- 1 | import json 2 | import locale 3 | import os 4 | 5 | parent_directory = 
os.path.dirname(os.path.abspath(__file__)) 6 | def load_language_list(language): 7 | with open(os.path.join(parent_directory,f"locale/{language}.json"), "r", encoding="utf-8") as f: 8 | language_list = json.load(f) 9 | return language_list 10 | 11 | 12 | class I18nAuto: 13 | def __init__(self, language=None): 14 | if language in ["Auto", None]: 15 | language = locale.getdefaultlocale()[ 16 | 0 17 | ] # getlocale can't identify the system's language ((None, None)) 18 | if not os.path.exists(os.path.join(parent_directory,f"locale/{language}.json")): 19 | language = "en_US" 20 | self.language = language 21 | self.language_map = load_language_list(language) 22 | 23 | def __call__(self, key): 24 | return self.language_map.get(key, key) 25 | 26 | def __repr__(self): 27 | return "Use Language: " + self.language 28 | -------------------------------------------------------------------------------- /tools/i18n/locale/zh_CN.json: -------------------------------------------------------------------------------- 1 | { 2 | ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音", 3 | "A模型权重": "A模型权重", 4 | "A模型路径": "A模型路径", 5 | "B模型路径": "B模型路径", 6 | "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", 7 | "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调", 8 | "Index Rate": "Index Rate", 9 | "Onnx导出": "Onnx导出", 10 | "Onnx输出路径": "Onnx输出路径", 11 | "RVC模型路径": "RVC模型路径", 12 | "ckpt处理": "ckpt处理", 13 | "harvest进程数": "harvest进程数", 14 | "index文件路径不可包含中文": "index文件路径不可包含中文", 15 | "pth文件路径不可包含中文": "pth文件路径不可包含中文", 16 | "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", 17 | "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ", 18 | "step1:正在处理数据": "step1:正在处理数据", 19 | "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", 20 | "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ", 21 | "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)", 22 | "step3: 填写训练设置, 开始训练模型和索引": "step3: 填写训练设置, 开始训练模型和索引", 23 | "step3a:正在训练模型": "step3a:正在训练模型", 24 | "一键训练": "一键训练", 25 | "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", 26 | "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。", 27 | "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2", 28 | "伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声", 29 | "使用模型采样率": "使用模型采样率", 30 | "使用设备采样率": "使用设备采样率", 31 | "保存名": "保存名", 32 | "保存的文件名, 默认空为和源文件同名": "保存的文件名, 默认空为和源文件同名", 33 | "保存的模型名不带后缀": "保存的模型名不带后缀", 34 | "保存频率save_every_epoch": "保存频率save_every_epoch", 35 | "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果", 36 | "修改": "修改", 37 | "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型信息(仅支持weights文件夹下提取的小模型文件)", 38 | "停止音频转换": "停止音频转换", 39 | "全流程结束!": "全流程结束!", 40 | "刷新音色列表和索引路径": "刷新音色列表和索引路径", 41 | "加载模型": "加载模型", 42 | "加载预训练底模D路径": "加载预训练底模D路径", 43 | "加载预训练底模G路径": "加载预训练底模G路径", 44 | "单次推理": "单次推理", 45 | "卸载音色省显存": "卸载音色省显存", 46 | "变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)", 47 | "后处理重采样至最终采样率,0为不进行重采样": "后处理重采样至最终采样率,0为不进行重采样", 48 | "否": "否", 49 | "启用相位声码器": "启用相位声码器", 50 | "响应阈值": "响应阈值", 51 | "响度因子": "响度因子", 52 | "处理数据": "处理数据", 53 | "导出Onnx模型": "导出Onnx模型", 54 | "导出文件格式": "导出文件格式", 55 | "常见问题解答": "常见问题解答", 56 | "常规设置": "常规设置", 57 | "开始音频转换": "开始音频转换", 58 | "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", 59 | "性能设置": "性能设置", 60 | "总训练轮数total_epoch": "总训练轮数total_epoch", 61 | "批量推理": "批量推理", 62 | "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ", 63 | "指定输出主人声文件夹": "指定输出主人声文件夹", 64 | "指定输出文件夹": "指定输出文件夹", 65 | "指定输出非主人声文件夹": "指定输出非主人声文件夹", 66 | "推理时间(ms):": "推理时间(ms):", 67 | "推理音色": "推理音色", 68 | "提取": "提取", 69 | "提取音高和处理数据使用的CPU进程数": "提取音高和处理数据使用的CPU进程数", 70 | "是": "是", 71 | "是否仅保存最新的ckpt文件以节省硬盘空间": "是否仅保存最新的ckpt文件以节省硬盘空间", 72 | "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存时间点将最终小模型保存至weights文件夹", 73 | "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速", 74 | "显卡信息": "显卡信息", 75 | "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.", 76 | "查看": "查看", 77 | "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型信息(仅支持weights文件夹下提取的小模型文件)", 78 | "检索特征占比": "检索特征占比", 79 | "模型": "模型", 80 | "模型推理": "模型推理", 81 | "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况", 82 | "模型是否带音高指导": "模型是否带音高指导", 83 | "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否带音高指导(唱歌一定要, 语音可以不要)", 84 | "模型是否带音高指导,1是0否": "模型是否带音高指导,1是0否", 85 | "模型版本型号": "模型版本型号", 86 | "模型融合, 可用于测试音色融合": "模型融合, 可用于测试音色融合", 87 | "模型路径": "模型路径", 88 | "每张显卡的batch_size": "每张显卡的batch_size", 89 | "淡入淡出长度": "淡入淡出长度", 90 | "版本": "版本", 91 | "特征提取": "特征提取", 92 | "特征检索库文件路径,为空则使用下拉的选择结果": "特征检索库文件路径,为空则使用下拉的选择结果", 93 | "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ", 94 | "目标采样率": "目标采样率", 95 | "算法延迟(ms):": "算法延迟(ms):", 96 | "自动检测index路径,下拉式选择(dropdown)": "自动检测index路径,下拉式选择(dropdown)", 97 | "融合": "融合", 98 | "要改的模型信息": "要改的模型信息", 99 | "要置入的模型信息": "要置入的模型信息", 100 | "训练": "训练", 101 | "训练模型": "训练模型", 102 | "训练特征索引": "训练特征索引", 103 | "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", 104 | "请指定说话人id": "请指定说话人id", 105 | "请选择index文件": "请选择index文件", 106 | "请选择pth文件": "请选择pth文件", 107 | "请选择说话人id": "请选择说话人id", 108 | "转换": "转换", 109 | "输入实验名": "输入实验名", 110 | "输入待处理音频文件夹路径": "输入待处理音频文件夹路径", 111 | "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)", 112 | "输入待处理音频文件路径(默认是正确格式示例)": "输入待处理音频文件路径(默认是正确格式示例)", 113 | "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络", 114 | "输入监听": "输入监听", 115 | "输入训练文件夹路径": "输入训练文件夹路径", 116 | "输入设备": "输入设备", 117 | "输入降噪": "输入降噪", 118 | "输出信息": "输出信息", 119 | "输出变声": "输出变声", 120 | "输出设备": "输出设备", 121 | "输出降噪": "输出降噪", 122 | "输出音频(右下角三个点,点了可以下载)": "输出音频(右下角三个点,点了可以下载)", 123 | "选择.index文件": "选择.index文件", 124 | "选择.pth文件": "选择.pth文件", 125 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", 126 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", 127 | "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", 128 | "采样率:": "采样率:", 129 | "采样长度": "采样长度", 130 | "重载设备列表": "重载设备列表", 131 | "音调设置": "音调设置", 132 | "音频设备(请使用同种类驱动)": "音频设备(请使用同种类驱动)", 133 | "音高算法": "音高算法", 134 | "额外推理时长": "额外推理时长" 135 | } 136 | -------------------------------------------------------------------------------- /tools/i18n/locale/zh_HK.json: -------------------------------------------------------------------------------- 1 | { 2 | ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音", 3 | "A模型权重": "A模型權重", 4 | "A模型路径": "A模型路徑", 5 | "B模型路径": "B模型路徑", 6 | "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", 7 | "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", 8 | "Index Rate": "Index Rate", 9 | "Onnx导出": "Onnx导出", 10 | "Onnx输出路径": "Onnx输出路径", 11 | "RVC模型路径": "RVC模型路径", 12 | "ckpt处理": "ckpt處理", 13 | "harvest进程数": "harvest進程數", 14 | "index文件路径不可包含中文": "index文件路径不可包含中文", 15 | "pth文件路径不可包含中文": "pth文件路径不可包含中文", 16 | "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", 17 | "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. 
": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", 18 | "step1:正在处理数据": "step1:正在处理数据", 19 | "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", 20 | "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", 21 | "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", 22 | "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引", 23 | "step3a:正在训练模型": "step3a:正在训练模型", 24 | "一键训练": "一鍵訓練", 25 | "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", 26 | "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", 27 | "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", 28 | "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", 29 | "使用模型采样率": "使用模型采样率", 30 | "使用设备采样率": "使用设备采样率", 31 | "保存名": "儲存名", 32 | "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", 33 | "保存的模型名不带后缀": "儲存的模型名不帶副檔名", 34 | "保存频率save_every_epoch": "保存頻率save_every_epoch", 35 | "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果", 36 | "修改": "修改", 37 | "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", 38 | "停止音频转换": "停止音訊轉換", 39 | "全流程结束!": "全流程结束!", 40 | "刷新音色列表和索引路径": "刷新音色列表和索引路徑", 41 | "加载模型": "載入模型", 42 | "加载预训练底模D路径": "加載預訓練底模D路徑", 43 | "加载预训练底模G路径": "加載預訓練底模G路徑", 44 | "单次推理": "单次推理", 45 | "卸载音色省显存": "卸載音色節省 VRAM", 46 | "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", 47 | "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", 48 | "否": "否", 49 | "启用相位声码器": "启用相位声码器", 50 | "响应阈值": "響應閾值", 51 | "响度因子": "響度因子", 52 | "处理数据": "處理資料", 53 | "导出Onnx模型": "导出Onnx模型", 54 | "导出文件格式": "導出檔格式", 55 | "常见问题解答": "常見問題解答", 56 | "常规设置": "一般設定", 57 | "开始音频转换": "開始音訊轉換", 58 | "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", 59 | "性能设置": "效能設定", 60 | "总训练轮数total_epoch": "總訓練輪數total_epoch", 61 | "批量推理": "批量推理", 62 | "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", 63 | "指定输出主人声文件夹": "指定输出主人声文件夹", 64 | "指定输出文件夹": "指定輸出資料夾", 65 | "指定输出非主人声文件夹": "指定输出非主人声文件夹", 66 | "推理时间(ms):": "推理時間(ms):", 67 | "推理音色": "推理音色", 68 | "提取": "提取", 69 | "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數", 70 | "是": "是", 71 | "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間", 72 | "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾", 73 | "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度", 74 | "显卡信息": "顯示卡資訊", 75 | "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", 76 | "查看": "查看", 77 | "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)", 78 | "检索特征占比": "檢索特徵佔比", 79 | "模型": "模型", 80 | "模型推理": "模型推理", 81 | "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況", 82 | "模型是否带音高指导": "模型是否帶音高指導", 83 | "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)", 84 | "模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否", 85 | "模型版本型号": "模型版本型號", 86 | "模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合", 87 | "模型路径": "模型路徑", 88 | "每张显卡的batch_size": "每张显卡的batch_size", 89 | "淡入淡出长度": "淡入淡出長度", 90 | "版本": "版本", 91 | "特征提取": "特徵提取", 92 | "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", 93 | "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", 94 | "目标采样率": "目標取樣率", 95 | "算法延迟(ms):": "算法延迟(ms):", 96 | "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", 97 | "融合": "融合", 98 | "要改的模型信息": "要改的模型資訊", 99 | "要置入的模型信息": "要置入的模型資訊", 100 | "训练": "訓練", 101 | "训练模型": "訓練模型", 102 | "训练特征索引": "訓練特徵索引", 103 | "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", 104 | "请指定说话人id": "請指定說話人id", 105 | "请选择index文件": "请选择index文件", 106 | "请选择pth文件": "请选择pth文件", 107 | "请选择说话人id": "請選擇說話人ID", 108 | "转换": "轉換", 109 | "输入实验名": "輸入實驗名稱", 110 | "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", 111 | "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", 112 | "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", 113 | "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", 114 | "输入监听": "输入监听", 115 | "输入训练文件夹路径": "輸入訓練檔案夾路徑", 116 | "输入设备": "輸入設備", 117 | "输入降噪": "輸入降噪", 118 | "输出信息": "輸出訊息", 119 | "输出变声": "输出变声", 120 | "输出设备": "輸出設備", 121 | "输出降噪": "輸出降噪", 122 | "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", 123 | "选择.index文件": "選擇 .index 檔案", 124 | "选择.pth文件": "選擇 .pth 檔案", 125 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", 126 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", 127 | "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", 128 | "采样率:": "采样率:", 129 | "采样长度": "取樣長度", 130 | "重载设备列表": "重載設備列表", 131 | "音调设置": "音調設定", 132 | "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)", 133 | "音高算法": "音高演算法", 134 | "额外推理时长": "額外推理時長" 135 | } 136 | -------------------------------------------------------------------------------- /tools/i18n/locale/zh_SG.json: -------------------------------------------------------------------------------- 1 | { 2 | ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音", 3 | "A模型权重": "A模型權重", 4 | "A模型路径": "A模型路徑", 5 | "B模型路径": "B模型路徑", 6 | "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", 7 | "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", 8 | "Index Rate": "Index Rate", 9 | "Onnx导出": "Onnx导出", 10 | "Onnx输出路径": "Onnx输出路径", 11 | "RVC模型路径": "RVC模型路径", 12 | "ckpt处理": "ckpt處理", 13 | "harvest进程数": "harvest進程數", 14 | "index文件路径不可包含中文": "index文件路径不可包含中文", 15 | "pth文件路径不可包含中文": "pth文件路径不可包含中文", 16 | "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", 17 | "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. 
": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", 18 | "step1:正在处理数据": "step1:正在处理数据", 19 | "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", 20 | "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", 21 | "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", 22 | "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引", 23 | "step3a:正在训练模型": "step3a:正在训练模型", 24 | "一键训练": "一鍵訓練", 25 | "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", 26 | "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", 27 | "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", 28 | "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", 29 | "使用模型采样率": "使用模型采样率", 30 | "使用设备采样率": "使用设备采样率", 31 | "保存名": "儲存名", 32 | "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", 33 | "保存的模型名不带后缀": "儲存的模型名不帶副檔名", 34 | "保存频率save_every_epoch": "保存頻率save_every_epoch", 35 | "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果", 36 | "修改": "修改", 37 | "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", 38 | "停止音频转换": "停止音訊轉換", 39 | "全流程结束!": "全流程结束!", 40 | "刷新音色列表和索引路径": "刷新音色列表和索引路徑", 41 | "加载模型": "載入模型", 42 | "加载预训练底模D路径": "加載預訓練底模D路徑", 43 | "加载预训练底模G路径": "加載預訓練底模G路徑", 44 | "单次推理": "单次推理", 45 | "卸载音色省显存": "卸載音色節省 VRAM", 46 | "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", 47 | "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", 48 | "否": "否", 49 | "启用相位声码器": "启用相位声码器", 50 | "响应阈值": "響應閾值", 51 | "响度因子": "響度因子", 52 | "处理数据": "處理資料", 53 | "导出Onnx模型": "导出Onnx模型", 54 | "导出文件格式": "導出檔格式", 55 | "常见问题解答": "常見問題解答", 56 | "常规设置": "一般設定", 57 | "开始音频转换": "開始音訊轉換", 58 | "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", 59 | "性能设置": "效能設定", 60 | "总训练轮数total_epoch": "總訓練輪數total_epoch", 61 | "批量推理": "批量推理", 62 | "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", 63 | "指定输出主人声文件夹": "指定输出主人声文件夹", 64 | "指定输出文件夹": "指定輸出資料夾", 65 | "指定输出非主人声文件夹": "指定输出非主人声文件夹", 66 | "推理时间(ms):": "推理時間(ms):", 67 | "推理音色": "推理音色", 68 | "提取": "提取", 69 | "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數", 70 | "是": "是", 71 | "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間", 72 | "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾", 73 | "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度", 74 | "显卡信息": "顯示卡資訊", 75 | "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", 76 | "查看": "查看", 77 | "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)", 78 | "检索特征占比": "檢索特徵佔比", 79 | "模型": "模型", 80 | "模型推理": "模型推理", 81 | "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況", 82 | "模型是否带音高指导": "模型是否帶音高指導", 83 | "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)", 84 | "模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否", 85 | "模型版本型号": "模型版本型號", 86 | "模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合", 87 | "模型路径": "模型路徑", 88 | "每张显卡的batch_size": "每张显卡的batch_size", 89 | "淡入淡出长度": "淡入淡出長度", 90 | "版本": "版本", 91 | "特征提取": "特徵提取", 92 | "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", 93 | "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", 94 | "目标采样率": "目標取樣率", 95 | "算法延迟(ms):": "算法延迟(ms):", 96 | "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", 97 | "融合": "融合", 98 | "要改的模型信息": "要改的模型資訊", 99 | "要置入的模型信息": "要置入的模型資訊", 100 | "训练": "訓練", 101 | "训练模型": "訓練模型", 102 | "训练特征索引": "訓練特徵索引", 103 | "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", 104 | "请指定说话人id": "請指定說話人id", 105 | "请选择index文件": "请选择index文件", 106 | "请选择pth文件": "请选择pth文件", 107 | "请选择说话人id": "請選擇說話人ID", 108 | "转换": "轉換", 109 | "输入实验名": "輸入實驗名稱", 110 | "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", 111 | "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", 112 | "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", 113 | "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", 114 | "输入监听": "输入监听", 115 | "输入训练文件夹路径": "輸入訓練檔案夾路徑", 116 | "输入设备": "輸入設備", 117 | "输入降噪": "輸入降噪", 118 | "输出信息": "輸出訊息", 119 | "输出变声": "输出变声", 120 | "输出设备": "輸出設備", 121 | "输出降噪": "輸出降噪", 122 | "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", 123 | "选择.index文件": "選擇 .index 檔案", 124 | "选择.pth文件": "選擇 .pth 檔案", 125 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", 126 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", 127 | "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", 128 | "采样率:": "采样率:", 129 | "采样长度": "取樣長度", 130 | "重载设备列表": "重載設備列表", 131 | "音调设置": "音調設定", 132 | "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)", 133 | "音高算法": "音高演算法", 134 | "额外推理时长": "額外推理時長" 135 | } 136 | -------------------------------------------------------------------------------- /tools/i18n/locale/zh_TW.json: -------------------------------------------------------------------------------- 1 | { 2 | ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音", 3 | "A模型权重": "A模型權重", 4 | "A模型路径": "A模型路徑", 5 | "B模型路径": "B模型路徑", 6 | "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", 7 | "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", 8 | "Index Rate": "Index Rate", 9 | "Onnx导出": "Onnx导出", 10 | "Onnx输出路径": "Onnx输出路径", 11 | "RVC模型路径": "RVC模型路径", 12 | "ckpt处理": "ckpt處理", 13 | "harvest进程数": "harvest進程數", 14 | "index文件路径不可包含中文": "index文件路径不可包含中文", 15 | "pth文件路径不可包含中文": "pth文件路径不可包含中文", 16 | "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", 17 | "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. 
": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", 18 | "step1:正在处理数据": "step1:正在处理数据", 19 | "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", 20 | "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", 21 | "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", 22 | "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引", 23 | "step3a:正在训练模型": "step3a:正在训练模型", 24 | "一键训练": "一鍵訓練", 25 | "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", 26 | "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", 27 | "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", 28 | "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", 29 | "使用模型采样率": "使用模型采样率", 30 | "使用设备采样率": "使用设备采样率", 31 | "保存名": "儲存名", 32 | "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", 33 | "保存的模型名不带后缀": "儲存的模型名不帶副檔名", 34 | "保存频率save_every_epoch": "保存頻率save_every_epoch", 35 | "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果", 36 | "修改": "修改", 37 | "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", 38 | "停止音频转换": "停止音訊轉換", 39 | "全流程结束!": "全流程结束!", 40 | "刷新音色列表和索引路径": "刷新音色列表和索引路徑", 41 | "加载模型": "載入模型", 42 | "加载预训练底模D路径": "加載預訓練底模D路徑", 43 | "加载预训练底模G路径": "加載預訓練底模G路徑", 44 | "单次推理": "单次推理", 45 | "卸载音色省显存": "卸載音色節省 VRAM", 46 | "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", 47 | "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", 48 | "否": "否", 49 | "启用相位声码器": "启用相位声码器", 50 | "响应阈值": "響應閾值", 51 | "响度因子": "響度因子", 52 | "处理数据": "處理資料", 53 | "导出Onnx模型": "导出Onnx模型", 54 | "导出文件格式": "導出檔格式", 55 | "常见问题解答": "常見問題解答", 56 | "常规设置": "一般設定", 57 | "开始音频转换": "開始音訊轉換", 58 | "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", 59 | "性能设置": "效能設定", 60 | "总训练轮数total_epoch": "總訓練輪數total_epoch", 61 | "批量推理": "批量推理", 62 | "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", 63 | "指定输出主人声文件夹": "指定输出主人声文件夹", 64 | "指定输出文件夹": "指定輸出資料夾", 65 | "指定输出非主人声文件夹": "指定输出非主人声文件夹", 66 | "推理时间(ms):": "推理時間(ms):", 67 | "推理音色": "推理音色", 68 | "提取": "提取", 69 | "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數", 70 | "是": "是", 71 | "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間", 72 | "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾", 73 | "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度", 74 | "显卡信息": "顯示卡資訊", 75 | "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", 76 | "查看": "查看", 77 | "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)", 78 | "检索特征占比": "檢索特徵佔比", 79 | "模型": "模型", 80 | "模型推理": "模型推理", 81 | "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況", 82 | "模型是否带音高指导": "模型是否帶音高指導", 83 | "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)", 84 | "模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否", 85 | "模型版本型号": "模型版本型號", 86 | "模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合", 87 | "模型路径": "模型路徑", 88 | "每张显卡的batch_size": "每张显卡的batch_size", 89 | "淡入淡出长度": "淡入淡出長度", 90 | "版本": "版本", 91 | "特征提取": "特徵提取", 92 | "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", 93 | "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", 94 | "目标采样率": "目標取樣率", 95 | "算法延迟(ms):": "算法延迟(ms):", 96 | "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", 97 | "融合": "融合", 98 | "要改的模型信息": "要改的模型資訊", 99 | "要置入的模型信息": "要置入的模型資訊", 100 | "训练": "訓練", 101 | "训练模型": "訓練模型", 102 | "训练特征索引": "訓練特徵索引", 103 | "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", 104 | "请指定说话人id": "請指定說話人id", 105 | "请选择index文件": "请选择index文件", 106 | "请选择pth文件": "请选择pth文件", 107 | "请选择说话人id": "請選擇說話人ID", 108 | "转换": "轉換", 109 | "输入实验名": "輸入實驗名稱", 110 | "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", 111 | "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", 112 | "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", 113 | "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", 114 | "输入监听": "输入监听", 115 | "输入训练文件夹路径": "輸入訓練檔案夾路徑", 116 | "输入设备": "輸入設備", 117 | "输入降噪": "輸入降噪", 118 | "输出信息": "輸出訊息", 119 | "输出变声": "输出变声", 120 | "输出设备": "輸出設備", 121 | "输出降噪": "輸出降噪", 122 | "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", 123 | "选择.index文件": "選擇 .index 檔案", 124 | "选择.pth文件": "選擇 .pth 檔案", 125 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", 126 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", 127 | "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", 128 | "采样率:": "采样率:", 129 | "采样长度": "取樣長度", 130 | "重载设备列表": "重載設備列表", 131 | "音调设置": "音調設定", 132 | "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)", 133 | "音高算法": "音高演算法", 134 | "额外推理时长": "額外推理時長" 135 | } 136 | -------------------------------------------------------------------------------- /tools/i18n/locale_diff.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import OrderedDict 4 | 5 | # Define the standard file name 6 | standard_file = "locale/zh_CN.json" 7 | 8 | # Find all JSON files in the directory 9 | dir_path = "locale/" 10 | languages = [ 11 | os.path.join(dir_path, f) 12 | for f in os.listdir(dir_path) 13 | if f.endswith(".json") and f != standard_file 14 | ] 15 | 16 | # Load the standard file 17 | with open(standard_file, "r", encoding="utf-8") as f: 18 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 19 | 20 | # Loop through each language file 21 | for lang_file in languages: 22 | # Load the language file 23 | with open(lang_file, "r", encoding="utf-8") as f: 24 | lang_data = json.load(f, object_pairs_hook=OrderedDict) 25 | 26 | # Find the difference between the language file and the standard file 27 | diff = 
set(standard_data.keys()) - set(lang_data.keys()) 28 | 29 | miss = set(lang_data.keys()) - set(standard_data.keys()) 30 | 31 | # Add any missing keys to the language file 32 | for key in diff: 33 | lang_data[key] = key 34 | 35 | # Del any extra keys to the language file 36 | for key in miss: 37 | del lang_data[key] 38 | 39 | # Sort the keys of the language file to match the order of the standard file 40 | lang_data = OrderedDict( 41 | sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0])) 42 | ) 43 | 44 | # Save the updated language file 45 | with open(lang_file, "w", encoding="utf-8") as f: 46 | json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True) 47 | f.write("\n") 48 | -------------------------------------------------------------------------------- /tools/i18n/scan_i18n.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import glob 3 | import json 4 | from collections import OrderedDict 5 | 6 | 7 | def extract_i18n_strings(node): 8 | i18n_strings = [] 9 | 10 | if ( 11 | isinstance(node, ast.Call) 12 | and isinstance(node.func, ast.Name) 13 | and node.func.id == "i18n" 14 | ): 15 | for arg in node.args: 16 | if isinstance(arg, ast.Str): 17 | i18n_strings.append(arg.s) 18 | 19 | for child_node in ast.iter_child_nodes(node): 20 | i18n_strings.extend(extract_i18n_strings(child_node)) 21 | 22 | return i18n_strings 23 | 24 | 25 | # scan the directory for all .py files (recursively) 26 | # for each file, parse the code into an AST 27 | # for each AST, extract the i18n strings 28 | 29 | strings = [] 30 | for filename in glob.iglob("**/*.py", recursive=True): 31 | with open(filename, "r") as f: 32 | code = f.read() 33 | if "I18nAuto" in code: 34 | tree = ast.parse(code) 35 | i18n_strings = extract_i18n_strings(tree) 36 | print(filename, len(i18n_strings)) 37 | strings.extend(i18n_strings) 38 | code_keys = set(strings) 39 | """ 40 | n_i18n.py 41 | gui_v1.py 26 42 | app.py 16 43 | infer-web.py 147 44 | scan_i18n.py 0 45 | i18n.py 0 46 | lib/train/process_ckpt.py 1 47 | """ 48 | print() 49 | print("Total unique:", len(code_keys)) 50 | 51 | 52 | standard_file = "i18n/locale/zh_CN.json" 53 | with open(standard_file, "r", encoding="utf-8") as f: 54 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 55 | standard_keys = set(standard_data.keys()) 56 | 57 | # Define the standard file name 58 | unused_keys = standard_keys - code_keys 59 | print("Unused keys:", len(unused_keys)) 60 | for unused_key in unused_keys: 61 | print("\t", unused_key) 62 | 63 | missing_keys = code_keys - standard_keys 64 | print("Missing keys:", len(missing_keys)) 65 | for missing_key in missing_keys: 66 | print("\t", missing_key) 67 | 68 | code_keys_dict = OrderedDict() 69 | for s in strings: 70 | code_keys_dict[s] = s 71 | 72 | # write back 73 | with open(standard_file, "w", encoding="utf-8") as f: 74 | json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True) 75 | f.write("\n") 76 | -------------------------------------------------------------------------------- /tools/my_utils.py: -------------------------------------------------------------------------------- 1 | import platform,os,traceback 2 | import ffmpeg 3 | import numpy as np 4 | 5 | 6 | def load_audio(file, sr): 7 | try: 8 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 9 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 
10 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 11 | file = clean_path(file) # guard against users pasting a path with stray spaces, quotes or newlines at either end 12 | if not os.path.exists(file): 13 | raise RuntimeError( 14 | "You input an audio path that does not exist, please fix it!" 15 | ) 16 | out, _ = ( 17 | ffmpeg.input(file, threads=0) 18 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) 19 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 20 | ) 21 | except Exception as e: 22 | traceback.print_exc() 23 | raise RuntimeError(f"Failed to load audio: {e}") 24 | 25 | return np.frombuffer(out, np.float32).flatten() 26 | 27 | 28 | def clean_path(path_str): 29 | if platform.system() == 'Windows': 30 | path_str = path_str.replace('/', '\\') 31 | return path_str.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 32 | -------------------------------------------------------------------------------- /web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/web.png -------------------------------------------------------------------------------- /web/js/alertMSG.js: -------------------------------------------------------------------------------- 1 | import { app } from "../../../scripts/app.js"; 2 | import { api } from '../../../scripts/api.js' 3 | app.registerExtension({ 4 | name: "GPT_SOVITS.alertMSG", 5 | async beforeRegisterNodeDef(nodeType, nodeData, app) { 6 | if (nodeData?.name == "GPT_SOVITS_FT") { 7 | nodeType.prototype.onExecuted = function (data) { 8 | // alert("Success!you can find weights in:\n" + data.finetune[0] + "\n" + data.finetune[1] + "\n Now you can tts or inference"); 9 | let msg = "Success! You can find the weights in:\n" + data.finetune[0] + "\n" + data.finetune[1] + "\nWould you like to reboot the server to enable TTS and inference?" 10 | if (confirm(msg)) { 11 | try { 12 | api.fetchApi("/gpt_sovits/reboot"); 13 | } 14 | catch(exception) { 15 | } 16 | } 17 | } 18 | } 19 | }, 20 | }); -------------------------------------------------------------------------------- /web/js/previewAudio.js: -------------------------------------------------------------------------------- 1 | import { app } from "../../../scripts/app.js"; 2 | import { api } from '../../../scripts/api.js' 3 | 4 | function fitHeight(node) { 5 | node.setSize([node.size[0], node.computeSize([node.size[0], node.size[1]])[1]]) 6 | node?.graph?.setDirtyCanvas(true); 7 | } 8 | function chainCallback(object, property, callback) { 9 | if (object == undefined) { 10 | //This should not happen.
11 | console.error("Tried to add callback to non-existant object") 12 | return; 13 | } 14 | if (property in object) { 15 | const callback_orig = object[property] 16 | object[property] = function () { 17 | const r = callback_orig.apply(this, arguments); 18 | callback.apply(this, arguments); 19 | return r 20 | }; 21 | } else { 22 | object[property] = callback; 23 | } 24 | } 25 | 26 | function addPreviewOptions(nodeType) { 27 | chainCallback(nodeType.prototype, "getExtraMenuOptions", function(_, options) { 28 | // The intended way of appending options is returning a list of extra options, 29 | // but this isn't used in widgetInputs.js and would require 30 | // less generalization of chainCallback 31 | let optNew = [] 32 | try { 33 | const previewWidget = this.widgets.find((w) => w.name === "audiopreview"); 34 | 35 | let url = null 36 | if (previewWidget.audioEl?.hidden == false && previewWidget.audioEl.src) { 37 | //Use full quality audio 38 | //url = api.apiURL('/view?' + new URLSearchParams(previewWidget.value.params)); 39 | url = previewWidget.audioEl.src 40 | } 41 | if (url) { 42 | optNew.push( 43 | { 44 | content: "Open preview", 45 | callback: () => { 46 | window.open(url, "_blank") 47 | }, 48 | }, 49 | { 50 | content: "Save preview", 51 | callback: () => { 52 | const a = document.createElement("a"); 53 | a.href = url; 54 | a.setAttribute("download", new URLSearchParams(previewWidget.value.params).get("filename")); 55 | document.body.append(a); 56 | a.click(); 57 | requestAnimationFrame(() => a.remove()); 58 | }, 59 | } 60 | ); 61 | } 62 | if(options.length > 0 && options[0] != null && optNew.length > 0) { 63 | optNew.push(null); 64 | } 65 | options.unshift(...optNew); 66 | 67 | } catch (error) { 68 | console.log(error); 69 | } 70 | 71 | }); 72 | } 73 | function previewAudio(node,file,type){ 74 | var element = document.createElement("div"); 75 | const previewNode = node; 76 | var previewWidget = node.addDOMWidget("audiopreview", "preview", element, { 77 | serialize: false, 78 | hideOnZoom: false, 79 | getValue() { 80 | return element.value; 81 | }, 82 | setValue(v) { 83 | element.value = v; 84 | }, 85 | }); 86 | previewWidget.computeSize = function(width) { 87 | if (this.aspectRatio && !this.parentEl.hidden) { 88 | let height = (previewNode.size[0]-20)/ this.aspectRatio + 10; 89 | if (!(height > 0)) { 90 | height = 0; 91 | } 92 | this.computedHeight = height + 10; 93 | return [width, height]; 94 | } 95 | return [width, -4];//no loaded src, widget should not display 96 | } 97 | // element.style['pointer-events'] = "none" 98 | previewWidget.value = {hidden: false, paused: false, params: {}} 99 | previewWidget.parentEl = document.createElement("div"); 100 | previewWidget.parentEl.className = "audio_preview"; 101 | previewWidget.parentEl.style['width'] = "100%" 102 | element.appendChild(previewWidget.parentEl); 103 | previewWidget.audioEl = document.createElement("audio"); 104 | previewWidget.audioEl.controls = true; 105 | previewWidget.audioEl.loop = false; 106 | previewWidget.audioEl.muted = false; 107 | previewWidget.audioEl.style['width'] = "100%" 108 | previewWidget.audioEl.addEventListener("loadedmetadata", () => { 109 | 110 | previewWidget.aspectRatio = previewWidget.audioEl.audioWidth / previewWidget.audioEl.audioHeight; 111 | fitHeight(this); 112 | }); 113 | previewWidget.audioEl.addEventListener("error", () => { 114 | //TODO: consider a way to properly notify the user why a preview isn't shown. 
115 | previewWidget.parentEl.hidden = true; 116 | fitHeight(this); 117 | }); 118 | 119 | let params = { 120 | "filename": file, 121 | "type": type, 122 | } 123 | 124 | previewWidget.parentEl.hidden = previewWidget.value.hidden; 125 | previewWidget.audioEl.autoplay = !previewWidget.value.paused && !previewWidget.value.hidden; 126 | let target_width = 256 127 | if (element.style?.width) { 128 | //overscale to allow scrolling. Endpoint won't return higher than native 129 | target_width = element.style.width.slice(0,-2)*2; 130 | } 131 | if (!params.force_size || params.force_size.includes("?") || params.force_size == "Disabled") { 132 | params.force_size = target_width+"x?" 133 | } else { 134 | let size = params.force_size.split("x") 135 | let ar = parseInt(size[0])/parseInt(size[1]) 136 | params.force_size = target_width+"x"+(target_width/ar) 137 | } 138 | 139 | previewWidget.audioEl.src = api.apiURL('/view?' + new URLSearchParams(params)); 140 | 141 | previewWidget.audioEl.hidden = false; 142 | previewWidget.parentEl.appendChild(previewWidget.audioEl) 143 | } 144 | 145 | app.registerExtension({ 146 | name: "GPT_SOVITS.AudioPreviewer", 147 | async beforeRegisterNodeDef(nodeType, nodeData, app) { 148 | if (nodeData?.name == "PreViewAudio") { 149 | nodeType.prototype.onExecuted = function (data) { 150 | previewAudio(this, data.audio[0], data.audio[1]); 151 | } 152 | addPreviewOptions(nodeType) 153 | } 154 | } 155 | }); -------------------------------------------------------------------------------- /web/js/refreshPath.js: -------------------------------------------------------------------------------- 1 | import { app } from "../../../scripts/app.js"; 2 | import { api } from '../../../scripts/api.js' 3 | import { ComfyWidgets } from "../../../scripts/widgets.js" 4 | function rebootAPI() { 5 | if (confirm("Are you sure you'd like to reboot the server to refresh weights path?")) { 6 | try { 7 | api.fetchApi("/gpt_sovits/reboot"); 8 | } 9 | catch(exception) { 10 | 11 | } 12 | return true; 13 | } 14 | 15 | return false; 16 | } 17 | function pathRefresh(node, inputName, inputData, app) { 18 | const gptWidget = node.widgets.find((w) => w.name === "gpt_weight") 19 | const sovitsWidget = node.widgets.find((w) => w.name === "sovits_weight") 20 | /* 21 | A method that returns the required style for the html 22 | */ 23 | var default_gpt_value = gptWidget.value; 24 | Object.defineProperty(gptWidget, "value", { 25 | set : function(value) { 26 | this._real_value = value; 27 | }, 28 | 29 | get : function() { 30 | let value = ""; 31 | if (this._real_value) { 32 | value = this._real_value; 33 | } else { 34 | return default_gpt_value; 35 | } 36 | 37 | if (value.filename) { 38 | let real_value = value; 39 | value = ""; 40 | if (real_value.subfolder) { 41 | value = real_value.subfolder + "/"; 42 | } 43 | 44 | value += real_value.filename; 45 | 46 | if(real_value.type && real_value.type !== "input") 47 | value += ` [${real_value.type}]`; 48 | } 49 | return value; 50 | } 51 | }); 52 | 53 | var default_sovits_value = sovitsWidget.value; 54 | Object.defineProperty(sovitsWidget, "value", { 55 | set : function(value) { 56 | this._real_value = value; 57 | }, 58 | 59 | get : function() { 60 | let value = ""; 61 | if (this._real_value) { 62 | value = this._real_value; 63 | } else { 64 | return default_sovits_value; 65 | } 66 | 67 | if (value.filename) { 68 | let real_value = value; 69 | value = ""; 70 | if (real_value.subfolder) { 71 | value = real_value.subfolder + "/"; 72 | } 73 | 74 | value += real_value.filename; 75 | 
76 | if(real_value.type && real_value.type !== "input") 77 | value += ` [${real_value.type}]`; 78 | } 79 | return value; 80 | } 81 | }); 82 | 83 | // Create the button widget for selecting the files 84 | let refreshWidget = node.addWidget("button", "REBOOT TO REFRESH WEIGHTS LIST", "refresh", () => { 85 | rebootAPI() 86 | }); 87 | 88 | refreshWidget.serialize = false; 89 | 90 | const cb = node.callback; 91 | gptWidget.callback = function () { 92 | if (cb) { 93 | return cb.apply(this, arguments); 94 | } 95 | }; 96 | sovitsWidget.callback = function () { 97 | if (cb) { 98 | return cb.apply(this, arguments); 99 | } 100 | }; 101 | 102 | return { widget: refreshWidget }; 103 | } 104 | ComfyWidgets.PATHREFRESH = pathRefresh; 105 | 106 | app.registerExtension({ 107 | name: "GPT_SOVITS.RefreshPath", 108 | async beforeRegisterNodeDef(nodeType, nodeData, app) { 109 | if (nodeData?.name == "GPT_SOVITS_TTS") { 110 | nodeData.input.required.upload = ["PATHREFRESH"]; 111 | } 112 | 113 | if (nodeData?.name == "GPT_SOVITS_INFER") { 114 | nodeData.input.required.upload = ["PATHREFRESH"]; 115 | } 116 | }, 117 | }); -------------------------------------------------------------------------------- /web/js/uploadSRT.js: -------------------------------------------------------------------------------- 1 | import { app } from "../../../scripts/app.js"; 2 | import { api } from '../../../scripts/api.js' 3 | import { ComfyWidgets } from "../../../scripts/widgets.js" 4 | 5 | function srtUpload(node, inputName, inputData, app) { 6 | const srtWidget = node.widgets.find((w) => w.name === "srt"); 7 | let uploadWidget; 8 | /* 9 | A method that returns the required style for the html 10 | */ 11 | var default_value = srtWidget.value; 12 | Object.defineProperty(srtWidget, "value", { 13 | set : function(value) { 14 | this._real_value = value; 15 | }, 16 | 17 | get : function() { 18 | let value = ""; 19 | if (this._real_value) { 20 | value = this._real_value; 21 | } else { 22 | return default_value; 23 | } 24 | 25 | if (value.filename) { 26 | let real_value = value; 27 | value = ""; 28 | if (real_value.subfolder) { 29 | value = real_value.subfolder + "/"; 30 | } 31 | 32 | value += real_value.filename; 33 | 34 | if(real_value.type && real_value.type !== "input") 35 | value += ` [${real_value.type}]`; 36 | } 37 | return value; 38 | } 39 | }); 40 | async function uploadFile(file, updateNode, pasted = false) { 41 | try { 42 | // Wrap file in formdata so it includes filename 43 | const body = new FormData(); 44 | body.append("image", file); 45 | if (pasted) body.append("subfolder", "pasted"); 46 | const resp = await api.fetchApi("/upload/image", { 47 | method: "POST", 48 | body, 49 | }); 50 | 51 | if (resp.status === 200) { 52 | const data = await resp.json(); 53 | // Add the file to the dropdown list and update the widget value 54 | let path = data.name; 55 | if (data.subfolder) path = data.subfolder + "/" + path; 56 | 57 | if (!srtWidget.options.values.includes(path)) { 58 | srtWidget.options.values.push(path); 59 | } 60 | 61 | if (updateNode) { 62 | srtWidget.value = path; 63 | } 64 | } else { 65 | alert(resp.status + " - " + resp.statusText); 66 | } 67 | } catch (error) { 68 | alert(error); 69 | } 70 | } 71 | 72 | const fileInput = document.createElement("input"); 73 | Object.assign(fileInput, { 74 | type: "file", 75 | accept: "file/srt,file/txt", 76 | style: "display: none", 77 | onchange: async () => { 78 | if (fileInput.files.length) { 79 | await uploadFile(fileInput.files[0], true); 80 | } 81 | }, 82 | }); 83 | 
document.body.append(fileInput); 84 | 85 | // Create the button widget for selecting the files 86 | uploadWidget = node.addWidget("button", "choose srt file to upload", "Audio", () => { 87 | fileInput.click(); 88 | }); 89 | 90 | uploadWidget.serialize = false; 91 | 92 | const cb = node.callback; 93 | srtWidget.callback = function () { 94 | if (cb) { 95 | return cb.apply(this, arguments); 96 | } 97 | }; 98 | 99 | return { widget: uploadWidget }; 100 | } 101 | 102 | ComfyWidgets.SRTPLOAD = srtUpload; 103 | 104 | app.registerExtension({ 105 | name: "GPT_SOVITS.UploadSRT", 106 | async beforeRegisterNodeDef(nodeType, nodeData, app) { 107 | if (nodeData?.name == "LoadSRT") { 108 | nodeData.input.required.upload = ["SRTPLOAD"]; 109 | } 110 | }, 111 | }); 112 | 113 | -------------------------------------------------------------------------------- /wechat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-GPT_SoVITS/3a9e57e2d2ce19e017e48aff3616c144da6145d6/wechat.jpg --------------------------------------------------------------------------------
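Illustrative usage sketch (not a file in this repository): the snippet below shows how the two Python helpers dumped above, I18nAuto from tools/i18n/i18n.py and load_audio from tools/my_utils.py, would typically be called. It assumes the code is run from the repository root, that the ffmpeg CLI and the `ffmpeg-python` package are installed, and that "samples/ref.wav" is a placeholder path you supply.

from tools.i18n.i18n import I18nAuto
from tools.my_utils import load_audio

# Resolve UI strings: I18nAuto auto-detects the system locale and falls back to
# en_US when no matching JSON file exists under tools/i18n/locale/.
i18n = I18nAuto()
print(i18n)                    # e.g. "Use Language: en_US"
print(i18n("开始音频转换"))      # localized string, or the key itself if unmapped

# Decode, down-mix and resample an audio file to a mono float32 array at 32 kHz via ffmpeg.
wav = load_audio("samples/ref.wav", 32000)
print(wav.shape, wav.dtype)

Note that the locale maintenance scripts resolve the reference dictionary through relative paths — locale_diff.py via "locale/zh_CN.json" and scan_i18n.py via "i18n/locale/zh_CN.json" — so each is meant to be run with the working directory set accordingly (tools/i18n and tools, respectively).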