├── .gitignore ├── Adapters ├── base │ ├── Base_TTS_Instance.py │ ├── Base_TTS_Task.py │ └── __init__.py └── gsv_fast │ ├── TTS_infer_pack │ ├── AR │ │ ├── __init__.py │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── bucket_sampler.py │ │ │ ├── data_module.py │ │ │ └── dataset.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── t2s_lightning_module.py │ │ │ ├── t2s_lightning_module_onnx.py │ │ │ ├── t2s_model.py │ │ │ ├── t2s_model_batch_only.py │ │ │ ├── t2s_model_onnx.py │ │ │ └── utils.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── activation.py │ │ │ ├── activation_onnx.py │ │ │ ├── embedding.py │ │ │ ├── embedding_onnx.py │ │ │ ├── lr_schedulers.py │ │ │ ├── optim.py │ │ │ ├── patched_mha_with_cache.py │ │ │ ├── patched_mha_with_cache_onnx.py │ │ │ ├── scaling.py │ │ │ ├── transformer.py │ │ │ └── transformer_onnx.py │ │ ├── text_processing │ │ │ ├── __init__.py │ │ │ ├── phonemizer.py │ │ │ └── symbols.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── initialize.py │ │ │ └── io.py │ ├── TTS.py │ ├── TextPreprocessor.py │ ├── __init__.py │ ├── feature_extractor │ │ ├── __init__.py │ │ ├── cnhubert.py │ │ └── whisper_enc.py │ ├── i18n │ │ └── locale │ │ │ ├── en_US.json │ │ │ ├── es_ES.json │ │ │ ├── fr_FR.json │ │ │ ├── it_IT.json │ │ │ ├── ja_JP.json │ │ │ ├── ko_KR.json │ │ │ ├── pt_BR.json │ │ │ ├── ru_RU.json │ │ │ ├── tr_TR.json │ │ │ ├── zh_CN.json │ │ │ ├── zh_HK.json │ │ │ ├── zh_SG.json │ │ │ └── zh_TW.json │ ├── module │ │ ├── __init__.py │ │ ├── attentions.py │ │ ├── attentions_onnx.py │ │ ├── commons.py │ │ ├── core_vq.py │ │ ├── data_utils.py │ │ ├── losses.py │ │ ├── mel_processing.py │ │ ├── models.py │ │ ├── models_onnx.py │ │ ├── modules.py │ │ ├── mrte_model.py │ │ ├── quantize.py │ │ └── transforms.py │ ├── text │ │ ├── __init__.py │ │ ├── chinese.py │ │ ├── cleaner.py │ │ ├── cmudict-fast.rep │ │ ├── cmudict.rep │ │ ├── engdict-hot.rep │ │ ├── engdict_cache.pickle │ │ ├── english.py │ │ ├── japanese.py │ │ ├── opencpop-strict.txt │ │ ├── symbols.py │ │ ├── tone_sandhi.py │ │ └── zh_normalization │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── char_convert.py │ │ │ ├── chronology.py │ │ │ ├── constants.py │ │ │ ├── num.py │ │ │ ├── phonecode.py │ │ │ ├── quantifier.py │ │ │ └── text_normlization.py │ ├── text_segmentation_method.py │ └── utils.py │ ├── __init__.py │ ├── config_manager.py │ ├── gsv_adapter.py │ ├── gsv_task.py │ └── ssml_dealer.py ├── LICENSE ├── README.md ├── WebUIs └── GSVI │ ├── Character_Manager.py │ ├── Post_Webui.py │ └── i18n │ └── locale │ ├── en_US.json │ ├── zh_CN.json │ └── zh_TW.json ├── __init__.py ├── api.py ├── api_doc.md ├── configs ├── api_config.json └── gsv_fast │ ├── config.json │ └── params_config.json ├── install.sh ├── models ├── gsv │ └── .gitignore └── pretrained_models │ └── gsv │ └── .gitignore ├── requirements.txt ├── src └── api_config_manager.py ├── test ├── Model_Test.py ├── test_Concurrency.py ├── test_refer.ipynb └── test_refer.py └── tools ├── __init__.py ├── cmd-denoise.py ├── i18n ├── i18n.py ├── locale │ ├── en_US.json │ ├── es_ES.json │ ├── fr_FR.json │ ├── it_IT.json │ ├── ja_JP.json │ ├── ko_KR.json │ ├── ru_RU.json │ ├── tr_TR.json │ ├── zh_CN.json │ ├── zh_HK.json │ ├── zh_SG.json │ └── zh_TW.json ├── locale_diff.py └── scan_i18n.py ├── my_utils.py ├── normalize_loudness.py ├── slice_audio.py ├── slicer2.py └── subfix_webui.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | *.pyc 4 | env 5 | runtime 6 | .idea 7 | output 8 | logs 
9 | reference 10 | GPT_weights 11 | SoVITS_weights 12 | TEMP 13 | PortableGit 14 | cache 15 | 16 | ffmpeg.exe 17 | ffprobe.exe 18 | 19 | -------------------------------------------------------------------------------- /Adapters/base/Base_TTS_Instance.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | from .Base_TTS_Task import Base_TTS_Task as TTS_Task 4 | 5 | class Base_TTS_Instance(ABC): 6 | @abstractmethod 7 | def __init__(self, models_path=None, **kwargs): 8 | pass 9 | 10 | @abstractmethod 11 | def generate(self, task: TTS_Task): 12 | if task.task_type == "text": 13 | print("Generate text task") 14 | return self.generate_from_text(task) 15 | elif task.task_type == "ssml": 16 | print("Generate SSML task") 17 | return self.generate_from_ssml(task) 18 | else: 19 | print("Unknown task type") 20 | return None 21 | 22 | @abstractmethod 23 | def generate_from_text(self, task: TTS_Task): 24 | print(f"Text: {task.text}") 25 | return "Generate text task" 26 | 27 | @abstractmethod 28 | def generate_from_ssml(self, task: TTS_Task): 29 | print(f"SSML: {task.ssml}") 30 | return "Generate SSML task" 31 | 32 | @abstractmethod 33 | def get_characters(self): 34 | return {"character": ["emotion1", "emotion2", "emotion3"]} 35 | 36 | @abstractmethod 37 | def params_analyser(self, data)->TTS_Task: 38 | pass 39 | 40 | @abstractmethod 41 | def ms_like_analyser(self, data)->TTS_Task: 42 | pass 43 | -------------------------------------------------------------------------------- /Adapters/base/Base_TTS_Task.py: -------------------------------------------------------------------------------- 1 | 2 | import os, json, sys 3 | 4 | from uuid import uuid4 5 | from typing import Literal 6 | import urllib.parse 7 | import hashlib 8 | 9 | 10 | class Base_TTS_Task: 11 | """ 12 | Represents a basic Text-to-Speech (TTS) task. 13 | 14 | Attributes: 15 | uuid (str): The unique identifier for the task. 16 | task_type (Literal["audio", "ssml", "text"]): The type of the TTS task. 17 | audio_path (str): The path to the audio file. 18 | 19 | src (str): The source of the audio file. 20 | ssml (str): The SSML content. 21 | text (str): The text content. 22 | variation (str): The variation of the text content. 23 | 24 | params_config (dict): The parameter configuration. 25 | disabled_features (list): The list of disabled features. 26 | format (str): The audio format. 27 | stream (bool): Indicates if the audio should be streamed. 28 | loudness (float): The loudness of the audio. 29 | speed (float): The speed of the audio. 30 | 31 | Methods: 32 | get_param_value(param_name, data, return_default=True, special_dict={}): Returns the value of a parameter. 33 | update_from_param(param_name, data, special_dict={}): Updates a parameter value. 34 | 35 | Methods to be overridden by subclasses: 36 | load_from_dict(data: dict={}): Loads the task from a dictionary. 37 | md5(): Returns the MD5 hash of the task. 38 | to_dict(): Returns the task as a dictionary. 39 | __str__(): Returns a string representation of the task. 
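    Example:
        Illustrative sketch only, not taken from the repository: it assumes a
        concrete subclass (such as the task class defined in gsv_task.py) that
        fills in params_config before load_from_dict is called, and that
        params_config contains the usual "text", "format" and float-typed
        "speed" entries.

            task = SomeConcreteTTSTask()   # hypothetical subclass name
            task.load_from_dict({"text": "Hello world", "format": "wav", "speed": "120%"})
            # get_param_value parses a float given as "120%" to 1.2
            print(task.text, task.format, task.speed)   # -> Hello world wav 1.2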
40 | """ 41 | def __init__(self, other_task=None): 42 | self.uuid: str = str(uuid4()) 43 | 44 | self.task_type: Literal["audio", "ssml", "text"] = "text" 45 | self.audio_path: str = "" 46 | 47 | # 任务类型为音频时的属性 48 | self.src: str = "" 49 | 50 | # 任务类型为SSML时的属性 51 | self.ssml: str = "" 52 | 53 | # 任务类型为文本时的属性 54 | self.text: str = "" 55 | self.variation: str = None 56 | 57 | # 从文件可以读取参数配置与别名 58 | self.params_config: dict = None 59 | self.disabled_features: list = [] 60 | 61 | # 通用属性 62 | self.format: str = "wav" if other_task is None else other_task.format 63 | self.stream: bool = False if other_task is None else other_task.stream 64 | self.loudness: float = None if other_task is None else other_task.loudness 65 | self.speed: float = 1.0 if other_task is None else other_task.speed 66 | self.save_temp: bool = False if other_task is None else other_task.save_temp 67 | self.sample_rate: int = 32000 if other_task is None else other_task.sample_rate 68 | 69 | def get_param_value(self, param_name, data, return_default=True, special_dict={}): 70 | # ban disabled features 71 | param_config = self.params_config[param_name] 72 | if param_name not in self.disabled_features: 73 | for alias in param_config['alias']: 74 | if data.get(alias) is not None: 75 | if special_dict.get(data.get(alias)) is not None: 76 | return special_dict[data.get(alias)] 77 | elif param_config['type'] == 'int': 78 | return int(data.get(alias)) 79 | elif param_config['type'] == 'float': 80 | x = data.get(alias) 81 | if isinstance(x, str) and x[-1] == "%": 82 | return float(x[:-1]) / 100 83 | return float(x) 84 | elif param_config['type'] == 'bool': 85 | return str(data.get(alias)).lower() in ('true', '1', 't', 'y', 'yes', "allow", "allowed") 86 | else: # 默认为字符串 87 | return urllib.parse.unquote(data.get(alias)) 88 | if return_default: 89 | return param_config['default'] 90 | else: 91 | return None 92 | 93 | def update_from_param(self, param_name, data, special_dict={}): 94 | value = self.get_param_value(param_name, data, return_default=False, special_dict=special_dict) 95 | if value is not None: 96 | setattr(self, param_name, value) 97 | 98 | def load_from_dict(self, data: dict={}): 99 | 100 | assert self.params_config is not None, "params_config.json not found." 
101 | 102 | task_type = self.get_param_value('task_type', data) 103 | self.task_type = "ssml" if "ssml" in task_type.lower() else "text" 104 | if self.task_type == "text" and data.get("ssml") not in [None, ""]: 105 | self.task_type = "ssml" 106 | # 参数提取 107 | if self.task_type == "text": 108 | self.text = self.get_param_value('text', data).strip() 109 | else: 110 | self.ssml = self.get_param_value('ssml', data).strip() 111 | 112 | self.format = self.get_param_value('format', data) 113 | self.stream = self.get_param_value('stream', data) 114 | self.loudness = self.get_param_value('loudness', data) 115 | self.speed = self.get_param_value('speed', data) 116 | 117 | 118 | def md5(self): 119 | m = hashlib.md5() 120 | if self.task_type == "audio": 121 | m.update(self.src.encode()) 122 | elif self.task_type == "ssml": 123 | m.update(self.ssml.encode()) 124 | elif self.task_type == "text": 125 | m.update(self.text.encode()) 126 | m.update(self.variation.encode()) 127 | return m.hexdigest() 128 | 129 | def to_dict(self): 130 | return { 131 | "text": self.text, 132 | "text_language": self.text_language, 133 | "character_emotion": self.emotion, 134 | "batch_size": self.batch_size, 135 | "speed": self.speed, 136 | "top_k": self.top_k, 137 | "top_p": self.top_p, 138 | "temperature": self.temperature, 139 | "cut_method": self.cut_method, 140 | "format": self.format, 141 | "seed": self.seed, 142 | 143 | "stream": self.stream, 144 | "loudness": self.loudness, 145 | "save_temp": self.save_temp, 146 | 147 | } 148 | 149 | def __str__(self): 150 | json_content = json.dumps(self.to_dict(), ensure_ascii=False) # ensure_ascii=False to properly display non-ASCII characters 151 | return f"----------------TTS Task--------------\n content: {json_content}\n--------------------------------------" 152 | 153 | 154 | -------------------------------------------------------------------------------- /Adapters/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .Base_TTS_Task import Base_TTS_Task 2 | from .Base_TTS_Instance import Base_TTS_Instance -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-T-E-R/Uni-TTS/3eb56ecaa40759aaa537719e3dee117813fea7f7/Adapters/gsv_fast/TTS_infer_pack/AR/__init__.py -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-T-E-R/Uni-TTS/3eb56ecaa40759aaa537719e3dee117813fea7f7/Adapters/gsv_fast/TTS_infer_pack/AR/data/__init__.py -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/data/bucket_sampler.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/bucket_sampler.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import itertools 4 | import math 5 | import random 6 | from random import shuffle 7 | from typing import Iterator 8 | from typing import Optional 9 | from typing import TypeVar 10 | 11 | import torch 12 | import torch.distributed as dist 13 | from torch.utils.data import Dataset 14 | from torch.utils.data import Sampler 15 | 16 | __all__ = 
[ 17 | "DistributedBucketSampler", 18 | ] 19 | 20 | T_co = TypeVar("T_co", covariant=True) 21 | 22 | 23 | class DistributedBucketSampler(Sampler[T_co]): 24 | r""" 25 | sort the dataset wrt. input length 26 | divide samples into buckets 27 | sort within buckets 28 | divide buckets into batches 29 | sort batches 30 | """ 31 | 32 | def __init__( 33 | self, 34 | dataset: Dataset, 35 | num_replicas: Optional[int] = None, 36 | rank: Optional[int] = None, 37 | shuffle: bool = True, 38 | seed: int = 0, 39 | drop_last: bool = False, 40 | batch_size: int = 32, 41 | ) -> None: 42 | if num_replicas is None: 43 | if not dist.is_available(): 44 | raise RuntimeError("Requires distributed package to be available") 45 | num_replicas = dist.get_world_size() if torch.cuda.is_available() else 1 46 | if rank is None: 47 | if not dist.is_available(): 48 | raise RuntimeError("Requires distributed package to be available") 49 | rank = dist.get_rank() if torch.cuda.is_available() else 0 50 | if torch.cuda.is_available(): 51 | torch.cuda.set_device(rank) 52 | if rank >= num_replicas or rank < 0: 53 | raise ValueError( 54 | "Invalid rank {}, rank should be in the interval" 55 | " [0, {}]".format(rank, num_replicas - 1) 56 | ) 57 | self.dataset = dataset 58 | self.num_replicas = num_replicas 59 | self.rank = rank 60 | self.epoch = 0 61 | self.drop_last = drop_last 62 | # If the dataset length is evenly divisible by # of replicas, then there 63 | # is no need to drop any data, since the dataset will be split equally. 64 | if ( 65 | self.drop_last and len(self.dataset) % self.num_replicas != 0 66 | ): # type: ignore[arg-type] 67 | # Split to nearest available length that is evenly divisible. 68 | # This is to ensure each rank receives the same amount of data when 69 | # using this Sampler. 
70 | self.num_samples = math.ceil( 71 | (len(self.dataset) - self.num_replicas) 72 | / self.num_replicas # type: ignore[arg-type] 73 | ) 74 | else: 75 | self.num_samples = math.ceil( 76 | len(self.dataset) / self.num_replicas 77 | ) # type: ignore[arg-type] 78 | self.total_size = self.num_samples * self.num_replicas 79 | self.shuffle = shuffle 80 | self.seed = seed 81 | self.batch_size = batch_size 82 | self.id_with_length = self._get_sample_lengths() 83 | self.id_buckets = self.make_buckets(bucket_width=2.0) 84 | 85 | def _get_sample_lengths(self): 86 | id_with_lengths = [] 87 | for i in range(len(self.dataset)): 88 | id_with_lengths.append((i, self.dataset.get_sample_length(i))) 89 | id_with_lengths.sort(key=lambda x: x[1]) 90 | return id_with_lengths 91 | 92 | def make_buckets(self, bucket_width: float = 2.0): 93 | buckets = [] 94 | cur = [] 95 | max_sec = bucket_width 96 | for id, sec in self.id_with_length: 97 | if sec < max_sec: 98 | cur.append(id) 99 | else: 100 | buckets.append(cur) 101 | cur = [id] 102 | max_sec += bucket_width 103 | if len(cur) > 0: 104 | buckets.append(cur) 105 | return buckets 106 | 107 | def __iter__(self) -> Iterator[T_co]: 108 | if self.shuffle: 109 | # deterministically shuffle based on epoch and seed 110 | g = torch.Generator() 111 | g.manual_seed(self.seed + self.epoch) 112 | random.seed(self.epoch + self.seed) 113 | shuffled_bucket = [] 114 | for buc in self.id_buckets: 115 | buc_copy = buc.copy() 116 | shuffle(buc_copy) 117 | shuffled_bucket.append(buc_copy) 118 | grouped_batch_size = self.batch_size * self.num_replicas 119 | shuffled_bucket = list(itertools.chain(*shuffled_bucket)) 120 | n_batch = int(math.ceil(len(shuffled_bucket) / grouped_batch_size)) 121 | batches = [ 122 | shuffled_bucket[b * grouped_batch_size : (b + 1) * grouped_batch_size] 123 | for b in range(n_batch) 124 | ] 125 | shuffle(batches) 126 | indices = list(itertools.chain(*batches)) 127 | else: 128 | # type: ignore[arg-type] 129 | indices = list(range(len(self.dataset))) 130 | 131 | if not self.drop_last: 132 | # add extra samples to make it evenly divisible 133 | padding_size = self.total_size - len(indices) 134 | if padding_size <= len(indices): 135 | indices += indices[:padding_size] 136 | else: 137 | indices += (indices * math.ceil(padding_size / len(indices)))[ 138 | :padding_size 139 | ] 140 | else: 141 | # remove tail of data to make it evenly divisible. 142 | indices = indices[: self.total_size] 143 | assert len(indices) == self.total_size 144 | 145 | # subsample 146 | indices = indices[self.rank : self.total_size : self.num_replicas] 147 | assert len(indices) == self.num_samples 148 | 149 | return iter(indices) 150 | 151 | def __len__(self) -> int: 152 | return self.num_samples 153 | 154 | def set_epoch(self, epoch: int) -> None: 155 | r""" 156 | Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas 157 | use a different random ordering for each epoch. Otherwise, the next iteration of this 158 | sampler will yield the same ordering. 159 | 160 | Args: 161 | epoch (int): Epoch number. 
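        Example:
            Sketch of typical usage (the `sampler`, `dataloader` and `num_epochs`
            names are assumed here, not taken from this repository):

                for epoch in range(num_epochs):
                    sampler.set_epoch(epoch)   # re-seeds the shuffle for this epoch
                    for batch in dataloader:
                        ...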
162 | """ 163 | self.epoch = epoch 164 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/data/data_module.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | from pytorch_lightning import LightningDataModule 4 | from AR.data.bucket_sampler import DistributedBucketSampler 5 | from AR.data.dataset import Text2SemanticDataset 6 | from torch.utils.data import DataLoader 7 | 8 | 9 | class Text2SemanticDataModule(LightningDataModule): 10 | def __init__( 11 | self, 12 | config, 13 | train_semantic_path, 14 | train_phoneme_path, 15 | dev_semantic_path=None, 16 | dev_phoneme_path=None, 17 | ): 18 | super().__init__() 19 | self.config = config 20 | self.train_semantic_path = train_semantic_path 21 | self.train_phoneme_path = train_phoneme_path 22 | self.dev_semantic_path = dev_semantic_path 23 | self.dev_phoneme_path = dev_phoneme_path 24 | self.num_workers = self.config["data"]["num_workers"] 25 | 26 | def prepare_data(self): 27 | pass 28 | 29 | def setup(self, stage=None, output_logs=False): 30 | self._train_dataset = Text2SemanticDataset( 31 | phoneme_path=self.train_phoneme_path, 32 | semantic_path=self.train_semantic_path, 33 | max_sec=self.config["data"]["max_sec"], 34 | pad_val=self.config["data"]["pad_val"], 35 | ) 36 | self._dev_dataset = self._train_dataset 37 | # self._dev_dataset = Text2SemanticDataset( 38 | # phoneme_path=self.dev_phoneme_path, 39 | # semantic_path=self.dev_semantic_path, 40 | # max_sample=self.config['data']['max_eval_sample'], 41 | # max_sec=self.config['data']['max_sec'], 42 | # pad_val=self.config['data']['pad_val']) 43 | 44 | def train_dataloader(self): 45 | batch_size=self.config["train"]["batch_size"]//2 if self.config["train"].get("if_dpo",False)==True else self.config["train"]["batch_size"] 46 | batch_size = max(min(batch_size,len(self._train_dataset)//4),1)#防止不保存 47 | sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) 48 | return DataLoader( 49 | self._train_dataset, 50 | batch_size=batch_size, 51 | sampler=sampler, 52 | collate_fn=self._train_dataset.collate, 53 | num_workers=self.num_workers, 54 | persistent_workers=True, 55 | prefetch_factor=16, 56 | ) 57 | 58 | def val_dataloader(self): 59 | return DataLoader( 60 | self._dev_dataset, 61 | batch_size=1, 62 | shuffle=False, 63 | collate_fn=self._train_dataset.collate, 64 | num_workers=max(self.num_workers, 12), 65 | persistent_workers=True, 66 | prefetch_factor=16, 67 | ) 68 | 69 | # 这个会使用到嘛? 
70 | def test_dataloader(self): 71 | return DataLoader( 72 | self._dev_dataset, 73 | batch_size=1, 74 | shuffle=False, 75 | collate_fn=self._train_dataset.collate, 76 | ) 77 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-T-E-R/Uni-TTS/3eb56ecaa40759aaa537719e3dee117813fea7f7/Adapters/gsv_fast/TTS_infer_pack/AR/models/__init__.py -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/models/t2s_lightning_module.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import os, sys 4 | 5 | now_dir = os.getcwd() 6 | sys.path.append(now_dir) 7 | from typing import Dict 8 | 9 | import torch 10 | from pytorch_lightning import LightningModule 11 | from AR.models.t2s_model import Text2SemanticDecoder 12 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule 13 | from AR.modules.optim import ScaledAdam 14 | 15 | class Text2SemanticLightningModule(LightningModule): 16 | def __init__(self, config, output_dir, is_train=True, flash_attn_enabled:bool = False): 17 | super().__init__() 18 | self.config = config 19 | self.top_k = 3 20 | self.model = Text2SemanticDecoder(config=config, top_k=self.top_k,flash_attn_enabled=flash_attn_enabled) 21 | pretrained_s1 = config.get("pretrained_s1") 22 | if pretrained_s1 and is_train: 23 | # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) 24 | print( 25 | self.load_state_dict( 26 | torch.load(pretrained_s1, map_location="cpu")["weight"] 27 | ) 28 | ) 29 | if is_train: 30 | self.automatic_optimization = False 31 | self.save_hyperparameters() 32 | self.eval_dir = output_dir / "eval" 33 | self.eval_dir.mkdir(parents=True, exist_ok=True) 34 | 35 | def training_step(self, batch: Dict, batch_idx: int): 36 | opt = self.optimizers() 37 | scheduler = self.lr_schedulers() 38 | forward=self.model.forward if self.config["train"].get("if_dpo",False)==True else self.model.forward_old 39 | loss, acc = forward( 40 | batch["phoneme_ids"], 41 | batch["phoneme_ids_len"], 42 | batch["semantic_ids"], 43 | batch["semantic_ids_len"], 44 | batch["bert_feature"], 45 | ) 46 | self.manual_backward(loss) 47 | if batch_idx > 0 and batch_idx % 4 == 0: 48 | opt.step() 49 | opt.zero_grad() 50 | scheduler.step() 51 | 52 | self.log( 53 | "total_loss", 54 | loss, 55 | on_step=True, 56 | on_epoch=True, 57 | prog_bar=True, 58 | sync_dist=True, 59 | ) 60 | self.log( 61 | "lr", 62 | scheduler.get_last_lr()[0], 63 | on_epoch=True, 64 | prog_bar=True, 65 | sync_dist=True, 66 | ) 67 | self.log( 68 | f"top_{self.top_k}_acc", 69 | acc, 70 | on_step=True, 71 | on_epoch=True, 72 | prog_bar=True, 73 | sync_dist=True, 74 | ) 75 | 76 | def validation_step(self, batch: Dict, batch_idx: int): 77 | return 78 | 79 | # # get loss 80 | # loss, acc = self.model.forward( 81 | # batch['phoneme_ids'], batch['phoneme_ids_len'], 82 | # batch['semantic_ids'], batch['semantic_ids_len'], 83 | # batch['bert_feature'] 84 | # ) 85 | # 86 | # self.log( 87 | # "val_total_loss", 88 | # loss, 89 | # on_step=True, 90 | # on_epoch=True, 91 | # prog_bar=True, 92 | # sync_dist=True) 93 | # self.log( 94 | # 
f"val_top_{self.top_k}_acc", 95 | # acc, 96 | # on_step=True, 97 | # on_epoch=True, 98 | # prog_bar=True, 99 | # sync_dist=True) 100 | # 101 | # # get infer output 102 | # semantic_len = batch['semantic_ids'].size(1) 103 | # prompt_len = min(int(semantic_len * 0.5), 150) 104 | # prompt = batch['semantic_ids'][:, :prompt_len] 105 | # pred_semantic = self.model.infer(batch['phoneme_ids'], 106 | # batch['phoneme_ids_len'], prompt, 107 | # batch['bert_feature'] 108 | # ) 109 | # save_name = f'semantic_toks_{batch_idx}.pt' 110 | # save_path = os.path.join(self.eval_dir, save_name) 111 | # torch.save(pred_semantic.detach().cpu(), save_path) 112 | 113 | def configure_optimizers(self): 114 | model_parameters = self.model.parameters() 115 | parameters_names = [] 116 | parameters_names.append( 117 | [name_param_pair[0] for name_param_pair in self.model.named_parameters()] 118 | ) 119 | lm_opt = ScaledAdam( 120 | model_parameters, 121 | lr=0.01, 122 | betas=(0.9, 0.95), 123 | clipping_scale=2.0, 124 | parameters_names=parameters_names, 125 | show_dominant_parameters=False, 126 | clipping_update_period=1000, 127 | ) 128 | 129 | return { 130 | "optimizer": lm_opt, 131 | "lr_scheduler": { 132 | "scheduler": WarmupCosineLRSchedule( 133 | lm_opt, 134 | init_lr=self.config["optimizer"]["lr_init"], 135 | peak_lr=self.config["optimizer"]["lr"], 136 | end_lr=self.config["optimizer"]["lr_end"], 137 | warmup_steps=self.config["optimizer"]["warmup_steps"], 138 | total_steps=self.config["optimizer"]["decay_steps"], 139 | ) 140 | }, 141 | } 142 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/models/t2s_lightning_module_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import os, sys 4 | 5 | now_dir = os.getcwd() 6 | sys.path.append(now_dir) 7 | from typing import Dict 8 | 9 | import torch 10 | from pytorch_lightning import LightningModule 11 | from AR.models.t2s_model_onnx import Text2SemanticDecoder 12 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule 13 | from AR.modules.optim import ScaledAdam 14 | 15 | 16 | class Text2SemanticLightningModule(LightningModule): 17 | def __init__(self, config, output_dir, is_train=True): 18 | super().__init__() 19 | self.config = config 20 | self.top_k = 3 21 | self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) 22 | pretrained_s1 = config.get("pretrained_s1") 23 | if pretrained_s1 and is_train: 24 | # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) 25 | print( 26 | self.load_state_dict( 27 | torch.load(pretrained_s1, map_location="cpu")["weight"] 28 | ) 29 | ) 30 | if is_train: 31 | self.automatic_optimization = False 32 | self.save_hyperparameters() 33 | self.eval_dir = output_dir / "eval" 34 | self.eval_dir.mkdir(parents=True, exist_ok=True) 35 | 36 | def training_step(self, batch: Dict, batch_idx: int): 37 | opt = self.optimizers() 38 | scheduler = self.lr_schedulers() 39 | loss, acc = self.model.forward( 40 | batch["phoneme_ids"], 41 | batch["phoneme_ids_len"], 42 | batch["semantic_ids"], 43 | batch["semantic_ids_len"], 44 | batch["bert_feature"], 45 | ) 46 | self.manual_backward(loss) 47 | if batch_idx > 0 and batch_idx % 4 == 0: 48 | opt.step() 49 | opt.zero_grad() 50 | scheduler.step() 51 | 52 | self.log( 53 | 
"total_loss", 54 | loss, 55 | on_step=True, 56 | on_epoch=True, 57 | prog_bar=True, 58 | sync_dist=True, 59 | ) 60 | self.log( 61 | "lr", 62 | scheduler.get_last_lr()[0], 63 | on_epoch=True, 64 | prog_bar=True, 65 | sync_dist=True, 66 | ) 67 | self.log( 68 | f"top_{self.top_k}_acc", 69 | acc, 70 | on_step=True, 71 | on_epoch=True, 72 | prog_bar=True, 73 | sync_dist=True, 74 | ) 75 | 76 | def validation_step(self, batch: Dict, batch_idx: int): 77 | return 78 | 79 | def configure_optimizers(self): 80 | model_parameters = self.model.parameters() 81 | parameters_names = [] 82 | parameters_names.append( 83 | [name_param_pair[0] for name_param_pair in self.model.named_parameters()] 84 | ) 85 | lm_opt = ScaledAdam( 86 | model_parameters, 87 | lr=0.01, 88 | betas=(0.9, 0.95), 89 | clipping_scale=2.0, 90 | parameters_names=parameters_names, 91 | show_dominant_parameters=False, 92 | clipping_update_period=1000, 93 | ) 94 | 95 | return { 96 | "optimizer": lm_opt, 97 | "lr_scheduler": { 98 | "scheduler": WarmupCosineLRSchedule( 99 | lm_opt, 100 | init_lr=self.config["optimizer"]["lr_init"], 101 | peak_lr=self.config["optimizer"]["lr"], 102 | end_lr=self.config["optimizer"]["lr_end"], 103 | warmup_steps=self.config["optimizer"]["warmup_steps"], 104 | total_steps=self.config["optimizer"]["decay_steps"], 105 | ) 106 | }, 107 | } 108 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-T-E-R/Uni-TTS/3eb56ecaa40759aaa537719e3dee117813fea7f7/Adapters/gsv_fast/TTS_infer_pack/AR/modules/__init__.py -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/modules/activation_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py 2 | from typing import Optional 3 | from typing import Tuple 4 | import torch 5 | from torch import Tensor 6 | from torch.nn import Linear 7 | from torch.nn import Module 8 | from torch.nn.init import constant_ 9 | from torch.nn.init import xavier_normal_ 10 | from torch.nn.init import xavier_uniform_ 11 | from torch.nn.modules.linear import NonDynamicallyQuantizableLinear 12 | from torch.nn.parameter import Parameter 13 | 14 | from torch.nn import functional as F 15 | from AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched 16 | 17 | 18 | class MultiheadAttention(Module): 19 | __constants__ = ["batch_first"] 20 | bias_k: Optional[torch.Tensor] 21 | bias_v: Optional[torch.Tensor] 22 | 23 | def __init__( 24 | self, 25 | embed_dim, 26 | num_heads, 27 | dropout=0.0, 28 | bias=True, 29 | add_bias_kv=False, 30 | add_zero_attn=False, 31 | kdim=None, 32 | vdim=None, 33 | batch_first=False, 34 | linear1_cls=Linear, 35 | linear2_cls=Linear, 36 | device=None, 37 | dtype=None, 38 | ) -> None: 39 | factory_kwargs = {"device": device, "dtype": dtype} 40 | super(MultiheadAttention, self).__init__() 41 | self.embed_dim = embed_dim 42 | self.kdim = kdim if kdim is not None else embed_dim 43 | self.vdim = vdim if vdim is not None else embed_dim 44 | self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim 45 | 46 | self.num_heads = num_heads 47 | self.dropout = dropout 48 | self.batch_first = batch_first 49 | self.head_dim = embed_dim // num_heads 50 | assert ( 51 | 
self.head_dim * num_heads == self.embed_dim 52 | ), "embed_dim must be divisible by num_heads" 53 | 54 | if add_bias_kv: 55 | self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) 56 | self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) 57 | else: 58 | self.bias_k = self.bias_v = None 59 | 60 | if linear1_cls == Linear: 61 | if not self._qkv_same_embed_dim: 62 | self.q_proj_weight = Parameter( 63 | torch.empty((embed_dim, embed_dim), **factory_kwargs) 64 | ) 65 | self.k_proj_weight = Parameter( 66 | torch.empty((embed_dim, self.kdim), **factory_kwargs) 67 | ) 68 | self.v_proj_weight = Parameter( 69 | torch.empty((embed_dim, self.vdim), **factory_kwargs) 70 | ) 71 | self.register_parameter("in_proj_weight", None) 72 | else: 73 | self.in_proj_weight = Parameter( 74 | torch.empty((3 * embed_dim, embed_dim), **factory_kwargs) 75 | ) 76 | self.register_parameter("q_proj_weight", None) 77 | self.register_parameter("k_proj_weight", None) 78 | self.register_parameter("v_proj_weight", None) 79 | 80 | if bias: 81 | self.in_proj_bias = Parameter( 82 | torch.empty(3 * embed_dim, **factory_kwargs) 83 | ) 84 | else: 85 | self.register_parameter("in_proj_bias", None) 86 | self.out_proj = NonDynamicallyQuantizableLinear( 87 | embed_dim, embed_dim, bias=bias, **factory_kwargs 88 | ) 89 | 90 | self._reset_parameters() 91 | else: 92 | if not self._qkv_same_embed_dim: 93 | raise NotImplementedError 94 | else: 95 | self.in_proj_linear = linear1_cls( 96 | embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs 97 | ) 98 | self.in_proj_weight = self.in_proj_linear.weight 99 | 100 | self.register_parameter("q_proj_weight", None) 101 | self.register_parameter("k_proj_weight", None) 102 | self.register_parameter("v_proj_weight", None) 103 | 104 | if bias: 105 | self.in_proj_bias = self.in_proj_linear.bias 106 | else: 107 | self.register_parameter("in_proj_bias", None) 108 | 109 | self.out_proj = linear2_cls( 110 | embed_dim, embed_dim, bias=bias, **factory_kwargs 111 | ) 112 | 113 | if self.bias_k is not None: 114 | xavier_normal_(self.bias_k) 115 | if self.bias_v is not None: 116 | xavier_normal_(self.bias_v) 117 | 118 | self.add_zero_attn = add_zero_attn 119 | 120 | def _reset_parameters(self): 121 | if self._qkv_same_embed_dim: 122 | xavier_uniform_(self.in_proj_weight) 123 | else: 124 | xavier_uniform_(self.q_proj_weight) 125 | xavier_uniform_(self.k_proj_weight) 126 | xavier_uniform_(self.v_proj_weight) 127 | 128 | if self.in_proj_bias is not None: 129 | constant_(self.in_proj_bias, 0.0) 130 | constant_(self.out_proj.bias, 0.0) 131 | 132 | if self.bias_k is not None: 133 | xavier_normal_(self.bias_k) 134 | if self.bias_v is not None: 135 | xavier_normal_(self.bias_v) 136 | 137 | def __setstate__(self, state): 138 | # Support loading old MultiheadAttention checkpoints generated by v1.1.0 139 | if "_qkv_same_embed_dim" not in state: 140 | state["_qkv_same_embed_dim"] = True 141 | 142 | super(MultiheadAttention, self).__setstate__(state) 143 | 144 | def forward( 145 | self, 146 | query: Tensor, 147 | key: Tensor, 148 | value: Tensor, 149 | key_padding_mask: Optional[Tensor] = None, 150 | need_weights: bool = True, 151 | attn_mask: Optional[Tensor] = None, 152 | average_attn_weights: bool = True, 153 | cache=None, 154 | ) -> Tuple[Tensor, Optional[Tensor]]: 155 | any_nested = query.is_nested or key.is_nested or value.is_nested 156 | query = key = value = query.transpose(1, 0) 157 | attn_output = multi_head_attention_forward_patched( 158 | query, 159 | key, 160 | value, 161 | 
self.embed_dim, 162 | self.num_heads, 163 | self.in_proj_weight, 164 | self.in_proj_bias, 165 | self.bias_k, 166 | self.bias_v, 167 | self.add_zero_attn, 168 | self.dropout, 169 | self.out_proj.weight, 170 | self.out_proj.bias, 171 | training=self.training, 172 | key_padding_mask=key_padding_mask, 173 | need_weights=need_weights, 174 | attn_mask=attn_mask, 175 | average_attn_weights=average_attn_weights, 176 | cache=cache, 177 | ) 178 | return attn_output.transpose(1, 0) 179 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/modules/embedding.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | 50 | self.reverse = False 51 | self.pe = None 52 | self.extend_pe(torch.tensor(0.0).expand(1, 4000)) 53 | 54 | def extend_pe(self, x): 55 | """Reset the positional encodings.""" 56 | if self.pe is not None: 57 | if self.pe.size(1) >= x.size(1): 58 | if self.pe.dtype != x.dtype or self.pe.device != x.device: 59 | self.pe = self.pe.to(dtype=x.dtype, device=x.device) 60 | return 61 | pe = torch.zeros(x.size(1), self.embedding_dim) 62 | if self.reverse: 63 | position = torch.arange( 64 | x.size(1) - 1, -1, -1.0, dtype=torch.float32 65 | ).unsqueeze(1) 66 | else: 67 | position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) 68 | div_term = torch.exp( 69 | torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) 70 | * -(math.log(10000.0) / self.embedding_dim) 71 | ) 72 | pe[:, 0::2] = torch.sin(position * div_term) 73 | pe[:, 1::2] = torch.cos(position * div_term) 74 | pe = pe.unsqueeze(0) 75 | self.pe = pe.to(device=x.device, dtype=x.dtype).detach() 76 | 77 | def forward(self, x: torch.Tensor) -> torch.Tensor: 78 | self.extend_pe(x) 79 | output = x.unsqueeze(-1) if x.ndim == 2 else x 80 | output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)] 81 | return self.dropout(output) 82 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/modules/embedding_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from 
https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | self.reverse = False 50 | self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim)) 51 | 52 | def extend_pe(self, x): 53 | position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1) 54 | scpe = (position * self.div_term).unsqueeze(0) 55 | pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0) 56 | pe = pe.contiguous().view(1, -1, self.embedding_dim) 57 | return pe 58 | 59 | def forward(self, x: torch.Tensor) -> torch.Tensor: 60 | pe = self.extend_pe(x) 61 | output = x.unsqueeze(-1) if x.ndim == 2 else x 62 | output = output * self.x_scale + self.alpha * pe 63 | return self.dropout(output) 64 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/modules/lr_schedulers.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import math 4 | 5 | import torch 6 | from matplotlib import pyplot as plt 7 | from torch import nn 8 | from torch.optim import Adam 9 | 10 | 11 | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): 12 | """ 13 | Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers. 
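    As implemented in step() below: while current_step < warmup_steps the rate
    grows linearly,
        lr = init_lr + (peak_lr - init_lr) * current_step / warmup_steps,
    and afterwards it decays towards end_lr along a cosine curve,
        lr = end_lr + 0.5 * (1 + cos(pi * ratio)) * (peak_lr - end_lr),
        ratio = (current_step - warmup_steps) / (total_steps - warmup_steps),
    falling back to end_lr once current_step exceeds total_steps. Note that the
    current implementation then overrides this value with the hard-coded constant
    0.002 at the end of step().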
14 | """ 15 | 16 | def __init__( 17 | self, 18 | optimizer, 19 | init_lr, 20 | peak_lr, 21 | end_lr, 22 | warmup_steps=10000, 23 | total_steps=400000, 24 | current_step=0, 25 | ): 26 | self.init_lr = init_lr 27 | self.peak_lr = peak_lr 28 | self.end_lr = end_lr 29 | self.optimizer = optimizer 30 | self._warmup_rate = (peak_lr - init_lr) / warmup_steps 31 | self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps) 32 | self._current_step = current_step 33 | self.lr = init_lr 34 | self.warmup_steps = warmup_steps 35 | self.total_steps = total_steps 36 | self._last_lr = [self.lr] 37 | 38 | def set_lr(self, lr): 39 | self._last_lr = [g["lr"] for g in self.optimizer.param_groups] 40 | for g in self.optimizer.param_groups: 41 | # g['lr'] = lr 42 | g["lr"] = self.end_lr ###锁定用线性 43 | 44 | def step(self): 45 | if self._current_step < self.warmup_steps: 46 | lr = self.init_lr + self._warmup_rate * self._current_step 47 | 48 | elif self._current_step > self.total_steps: 49 | lr = self.end_lr 50 | 51 | else: 52 | decay_ratio = (self._current_step - self.warmup_steps) / ( 53 | self.total_steps - self.warmup_steps 54 | ) 55 | if decay_ratio < 0.0 or decay_ratio > 1.0: 56 | raise RuntimeError( 57 | "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings." 58 | ) 59 | coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) 60 | lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) 61 | 62 | self.lr = lr = self.end_lr = 0.002 ###锁定用线性###不听话,直接锁定! 63 | self.set_lr(lr) 64 | self.lr = lr 65 | self._current_step += 1 66 | return self.lr 67 | 68 | 69 | if __name__ == "__main__": 70 | m = nn.Linear(10, 10) 71 | opt = Adam(m.parameters(), lr=1e-4) 72 | s = WarmupCosineLRSchedule( 73 | opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0 74 | ) 75 | lrs = [] 76 | for i in range(25000): 77 | s.step() 78 | lrs.append(s.lr) 79 | print(s.lr) 80 | 81 | plt.plot(lrs) 82 | plt.plot(range(0, 25000), lrs) 83 | plt.show() 84 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/modules/patched_mha_with_cache_onnx.py: -------------------------------------------------------------------------------- 1 | from torch.nn.functional import * 2 | from torch.nn.functional import ( 3 | _mha_shape_check, 4 | _canonical_mask, 5 | _none_or_dtype, 6 | _in_projection_packed, 7 | ) 8 | 9 | def multi_head_attention_forward_patched( 10 | query, 11 | key, 12 | value, 13 | embed_dim_to_check: int, 14 | num_heads: int, 15 | in_proj_weight, 16 | in_proj_bias: Optional[Tensor], 17 | bias_k: Optional[Tensor], 18 | bias_v: Optional[Tensor], 19 | add_zero_attn: bool, 20 | dropout_p: float, 21 | out_proj_weight: Tensor, 22 | out_proj_bias: Optional[Tensor], 23 | training: bool = True, 24 | key_padding_mask: Optional[Tensor] = None, 25 | need_weights: bool = True, 26 | attn_mask: Optional[Tensor] = None, 27 | use_separate_proj_weight: bool = False, 28 | q_proj_weight: Optional[Tensor] = None, 29 | k_proj_weight: Optional[Tensor] = None, 30 | v_proj_weight: Optional[Tensor] = None, 31 | static_k: Optional[Tensor] = None, 32 | static_v: Optional[Tensor] = None, 33 | average_attn_weights: bool = True, 34 | is_causal: bool = False, 35 | cache=None, 36 | ) -> Tuple[Tensor, Optional[Tensor]]: 37 | 38 | # set up shape vars 39 | _, _, embed_dim = query.shape 40 | attn_mask = _canonical_mask( 41 | mask=attn_mask, 42 | mask_name="attn_mask", 43 | other_type=None, 44 | other_name="", 45 | target_type=query.dtype, 46 | check_other=False, 47 | ) 48 | head_dim 
= embed_dim // num_heads 49 | 50 | proj_qkv = linear(query, in_proj_weight, in_proj_bias) 51 | proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() 52 | q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2] 53 | 54 | if cache["first_infer"] == 1: 55 | cache["k"][cache["stage"]] = k 56 | cache["v"][cache["stage"]] = v 57 | else: 58 | cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0) 59 | cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0) 60 | k = cache["k"][cache["stage"]] 61 | v = cache["v"][cache["stage"]] 62 | cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] 63 | 64 | attn_mask = _canonical_mask( 65 | mask=attn_mask, 66 | mask_name="attn_mask", 67 | other_type=None, 68 | other_name="", 69 | target_type=q.dtype, 70 | check_other=False, 71 | ) 72 | attn_mask = attn_mask.unsqueeze(0) 73 | 74 | q = q.view(-1, num_heads, head_dim).transpose(0, 1) 75 | k = k.view(-1, num_heads, head_dim).transpose(0, 1) 76 | v = v.view(-1, num_heads, head_dim).transpose(0, 1) 77 | 78 | dropout_p = 0.0 79 | attn_mask = attn_mask.unsqueeze(0) 80 | q = q.view(num_heads, -1, head_dim).unsqueeze(0) 81 | k = k.view(num_heads, -1, head_dim).unsqueeze(0) 82 | v = v.view(num_heads, -1, head_dim).unsqueeze(0) 83 | attn_output = scaled_dot_product_attention( 84 | q, k, v, attn_mask, dropout_p, is_causal 85 | ) 86 | attn_output = ( 87 | attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) 88 | ) 89 | attn_output = linear(attn_output, out_proj_weight, out_proj_bias) 90 | attn_output = attn_output.view(-1, 1, attn_output.size(1)) 91 | 92 | return attn_output 93 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/text_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-T-E-R/Uni-TTS/3eb56ecaa40759aaa537719e3dee117813fea7f7/Adapters/gsv_fast/TTS_infer_pack/AR/text_processing/__init__.py -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/text_processing/phonemizer.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import itertools 4 | import re 5 | from typing import Dict 6 | from typing import List 7 | 8 | import regex 9 | from gruut import sentences 10 | from gruut.const import Sentence 11 | from gruut.const import Word 12 | from AR.text_processing.symbols import SYMBOL_TO_ID 13 | 14 | 15 | class GruutPhonemizer: 16 | def __init__(self, language: str): 17 | self._phonemizer = sentences 18 | self.lang = language 19 | self.symbol_to_id = SYMBOL_TO_ID 20 | self._special_cases_dict: Dict[str] = { 21 | r"\.\.\.": "... ", 22 | ";": "; ", 23 | ":": ": ", 24 | ",": ", ", 25 | r"\.": ". ", 26 | "!": "! ", 27 | r"\?": "? 
", 28 | "—": "—", 29 | "…": "… ", 30 | "«": "«", 31 | "»": "»", 32 | } 33 | self._punctuation_regexp: str = ( 34 | rf"([{''.join(self._special_cases_dict.keys())}])" 35 | ) 36 | 37 | def _normalize_punctuation(self, text: str) -> str: 38 | text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) 39 | text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text) 40 | text = regex.sub(r"\pZ+", r" ", text) 41 | return text.strip() 42 | 43 | def _convert_punctuation(self, word: Word) -> str: 44 | if not word.phonemes: 45 | return "" 46 | if word.phonemes[0] in ["‖", "|"]: 47 | return word.text.strip() 48 | 49 | phonemes = "".join(word.phonemes) 50 | # remove modifier characters ˈˌː with regex 51 | phonemes = re.sub(r"[ˈˌː͡]", "", phonemes) 52 | return phonemes.strip() 53 | 54 | def phonemize(self, text: str, espeak: bool = False) -> str: 55 | text_to_phonemize: str = self._normalize_punctuation(text) 56 | sents: List[Sentence] = [ 57 | sent 58 | for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak) 59 | ] 60 | words: List[str] = [ 61 | self._convert_punctuation(word) for word in itertools.chain(*sents) 62 | ] 63 | return " ".join(words) 64 | 65 | def transform(self, phonemes): 66 | # convert phonemes to ids 67 | # dictionary is in symbols.py 68 | return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()] 69 | 70 | 71 | if __name__ == "__main__": 72 | phonemizer = GruutPhonemizer("en-us") 73 | # text -> IPA 74 | phonemes = phonemizer.phonemize("Hello, wor-ld ?") 75 | print("phonemes:", phonemes) 76 | print("len(phonemes):", len(phonemes)) 77 | phoneme_ids = phonemizer.transform(phonemes) 78 | print("phoneme_ids:", phoneme_ids) 79 | print("len(phoneme_ids):", len(phoneme_ids)) 80 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/text_processing/symbols.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | PAD = "_" 4 | PUNCTUATION = ';:,.!?¡¿—…"«»“” ' 5 | LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 6 | IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 7 | SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) 8 | SPACE_ID = SYMBOLS.index(" ") 9 | SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)} 10 | ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)} 11 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def str2bool(str): 5 | return True if str.lower() == 'true' else False 6 | 7 | 8 | def get_newest_ckpt(string_list): 9 | # 定义一个正则表达式模式,用于匹配字符串中的数字 10 | pattern = r'epoch=(\d+)-step=(\d+)\.ckpt' 11 | 12 | # 使用正则表达式提取每个字符串中的数字信息,并创建一个包含元组的列表 13 | extracted_info = [] 14 | for string in string_list: 15 | match = re.match(pattern, string) 16 | if match: 17 | epoch = int(match.group(1)) 18 | step = int(match.group(2)) 19 | extracted_info.append((epoch, step, string)) 20 | # 按照 epoch 后面的数字和 step 后面的数字进行排序 21 | sorted_info = sorted( 22 | extracted_info, key=lambda x: (x[0], x[1]), reverse=True) 23 | # 获取最新的 ckpt 文件名 24 | newest_ckpt = sorted_info[0][2] 25 | return 
newest_ckpt 26 | 27 | 28 | # 文本存在且不为空时 return True 29 | def check_txt_file(file_path): 30 | try: 31 | with open(file_path, 'r') as file: 32 | text = file.readline().strip() 33 | assert text.strip() != '' 34 | return text 35 | except Exception: 36 | return False 37 | return False 38 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/utils/initialize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Initialize modules for espnet2 neural networks.""" 3 | import torch 4 | from typeguard import check_argument_types 5 | 6 | 7 | def initialize(model: torch.nn.Module, init: str): 8 | """Initialize weights of a neural network module. 9 | 10 | Parameters are initialized using the given method or distribution. 11 | 12 | Custom initialization routines can be implemented into submodules 13 | as function `espnet_initialization_fn` within the custom module. 14 | 15 | Args: 16 | model: Target. 17 | init: Method of initialization. 18 | """ 19 | assert check_argument_types() 20 | print("init with", init) 21 | 22 | # weight init 23 | for p in model.parameters(): 24 | if p.dim() > 1: 25 | if init == "xavier_uniform": 26 | torch.nn.init.xavier_uniform_(p.data) 27 | elif init == "xavier_normal": 28 | torch.nn.init.xavier_normal_(p.data) 29 | elif init == "kaiming_uniform": 30 | torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu") 31 | elif init == "kaiming_normal": 32 | torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu") 33 | else: 34 | raise ValueError("Unknown initialization: " + init) 35 | # bias init 36 | for name, p in model.named_parameters(): 37 | if ".bias" in name and p.dim() == 1: 38 | p.data.zero_() 39 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/AR/utils/io.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | import yaml 5 | 6 | 7 | def load_yaml_config(path): 8 | with open(path) as f: 9 | config = yaml.full_load(f) 10 | return config 11 | 12 | 13 | def save_config_to_yaml(config, path): 14 | assert path.endswith(".yaml") 15 | with open(path, "w") as f: 16 | f.write(yaml.dump(config)) 17 | f.close() 18 | 19 | 20 | def write_args(args, path): 21 | args_dict = dict( 22 | (name, getattr(args, name)) for name in dir(args) if not name.startswith("_") 23 | ) 24 | with open(path, "a") as args_file: 25 | args_file.write("==> torch version: {}\n".format(torch.__version__)) 26 | args_file.write( 27 | "==> cudnn version: {}\n".format(torch.backends.cudnn.version()) 28 | ) 29 | args_file.write("==> Cmd:\n") 30 | args_file.write(str(sys.argv)) 31 | args_file.write("\n==> args:\n") 32 | for k, v in sorted(args_dict.items()): 33 | args_file.write(" %s: %s\n" % (str(k), str(v))) 34 | args_file.close() 35 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/__init__.py: -------------------------------------------------------------------------------- 1 | from . import TTS, text_segmentation_method -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/feature_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import cnhubert, whisper_enc 2 | 3 | content_module_map = { 4 | 'cnhubert': cnhubert, 5 | 'whisper': whisper_enc 6 | } -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/feature_extractor/cnhubert.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import librosa 4 | import torch 5 | import torch.nn.functional as F 6 | import soundfile as sf 7 | import logging 8 | 9 | logging.getLogger("numba").setLevel(logging.WARNING) 10 | 11 | from transformers import ( 12 | Wav2Vec2FeatureExtractor, 13 | HubertModel, 14 | ) 15 | 16 | 17 | import torch.nn as nn 18 | 19 | cnhubert_base_path = None 20 | 21 | 22 | class CNHubert(nn.Module): 23 | def __init__(self, base_path:str=None): 24 | super().__init__() 25 | if base_path is None: 26 | base_path = cnhubert_base_path 27 | self.model = HubertModel.from_pretrained(base_path) 28 | self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( 29 | base_path 30 | ) 31 | 32 | 33 | def forward(self, x): 34 | input_values = self.feature_extractor( 35 | x, return_tensors="pt", sampling_rate=16000 36 | ).input_values.to(x.device) 37 | feats = self.model(input_values)["last_hidden_state"] 38 | return feats 39 | 40 | 41 | # class CNHubertLarge(nn.Module): 42 | # def __init__(self): 43 | # super().__init__() 44 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 45 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 46 | # def forward(self, x): 47 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 48 | # feats = self.model(input_values)["last_hidden_state"] 49 | # return feats 50 | # 51 | # class CVec(nn.Module): 52 | # def __init__(self): 53 | # super().__init__() 54 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 55 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 56 | # def forward(self, x): 57 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 58 | # feats = self.model(input_values)["last_hidden_state"] 59 | # return feats 60 | # 61 | # class cnw2v2base(nn.Module): 62 | # def __init__(self): 63 | # super().__init__() 64 | # self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 65 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 66 | # def forward(self, x): 67 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 68 | # feats = self.model(input_values)["last_hidden_state"] 69 | # return feats 70 | 71 | 72 | def get_model(): 73 | model = CNHubert() 74 | model.eval() 75 | return model 76 | 77 | 78 | # def get_large_model(): 79 | # model = CNHubertLarge() 80 | # model.eval() 81 | # return model 82 | # 83 | # def get_model_cvec(): 84 | # model = CVec() 85 | # model.eval() 86 | # return model 87 | # 88 | # def get_model_cnw2v2base(): 89 | # model = cnw2v2base() 90 | # model.eval() 91 | # return model 92 | 93 | 94 | def get_content(hmodel, wav_16k_tensor): 95 | with torch.no_grad(): 96 | feats = hmodel(wav_16k_tensor) 97 | return feats.transpose(1, 2) 98 | 99 | 100 | # if __name__ == "__main__": 101 | # 
model = get_model() 102 | # src_path = "/Users/Shared/原音频2.wav" 103 | # wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000) 104 | # model = model 105 | # wav_16k_tensor = wav_16k_tensor 106 | # feats = get_content(model, wav_16k_tensor) 107 | # print(feats.shape) 108 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/feature_extractor/whisper_enc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_model(): 5 | import whisper 6 | 7 | model = whisper.load_model("small", device="cpu") 8 | 9 | return model.encoder 10 | 11 | 12 | def get_content(model=None, wav_16k_tensor=None): 13 | from whisper import log_mel_spectrogram, pad_or_trim 14 | 15 | dev = next(model.parameters()).device 16 | mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000] 17 | # if torch.cuda.is_available(): 18 | # mel = mel.to(torch.float16) 19 | feature_len = mel.shape[-1] // 2 20 | assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频" 21 | with torch.no_grad(): 22 | feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[ 23 | :1, :feature_len, : 24 | ].transpose(1, 2) 25 | return feature 26 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-T-E-R/Uni-TTS/3eb56ecaa40759aaa537719e3dee117813fea7f7/Adapters/gsv_fast/TTS_infer_pack/module/__init__.py -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/module/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.nn import functional as F 4 | 5 | 6 | def init_weights(m, mean=0.0, std=0.01): 7 | classname = m.__class__.__name__ 8 | if classname.find("Conv") != -1: 9 | m.weight.data.normal_(mean, std) 10 | 11 | 12 | def get_padding(kernel_size, dilation=1): 13 | return int((kernel_size * dilation - dilation) / 2) 14 | 15 | 16 | def convert_pad_shape(pad_shape): 17 | l = pad_shape[::-1] 18 | pad_shape = [item for sublist in l for item in sublist] 19 | return pad_shape 20 | 21 | 22 | def intersperse(lst, item): 23 | result = [item] * (len(lst) * 2 + 1) 24 | result[1::2] = lst 25 | return result 26 | 27 | 28 | def kl_divergence(m_p, logs_p, m_q, logs_q): 29 | """KL(P||Q)""" 30 | kl = (logs_q - logs_p) - 0.5 31 | kl += ( 32 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 33 | ) 34 | return kl 35 | 36 | 37 | def rand_gumbel(shape): 38 | """Sample from the Gumbel distribution, protect from overflows.""" 39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 40 | return -torch.log(-torch.log(uniform_samples)) 41 | 42 | 43 | def rand_gumbel_like(x): 44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 45 | return g 46 | 47 | 48 | def slice_segments(x, ids_str, segment_size=4): 49 | ret = torch.zeros_like(x[:, :, :segment_size]) 50 | for i in range(x.size(0)): 51 | idx_str = ids_str[i] 52 | idx_end = idx_str + segment_size 53 | ret[i] = x[i, :, idx_str:idx_end] 54 | return ret 55 | 56 | 57 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 58 | b, d, t = x.size() 59 | if x_lengths is None: 60 | x_lengths = t 61 | ids_str_max = x_lengths - segment_size + 1 62 | ids_str = (torch.rand([b]).to(device=x.device) * 
ids_str_max).to(dtype=torch.long) 63 | ret = slice_segments(x, ids_str, segment_size) 64 | return ret, ids_str 65 | 66 | 67 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 68 | position = torch.arange(length, dtype=torch.float) 69 | num_timescales = channels // 2 70 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 71 | num_timescales - 1 72 | ) 73 | inv_timescales = min_timescale * torch.exp( 74 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 75 | ) 76 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 77 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 78 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 79 | signal = signal.view(1, channels, length) 80 | return signal 81 | 82 | 83 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 84 | b, channels, length = x.size() 85 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 86 | return x + signal.to(dtype=x.dtype, device=x.device) 87 | 88 | 89 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 90 | b, channels, length = x.size() 91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 92 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 93 | 94 | 95 | def subsequent_mask(length): 96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 97 | return mask 98 | 99 | 100 | @torch.jit.script 101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 102 | n_channels_int = n_channels[0] 103 | in_act = input_a + input_b 104 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 106 | acts = t_act * s_act 107 | return acts 108 | 109 | 110 | def convert_pad_shape(pad_shape): 111 | l = pad_shape[::-1] 112 | pad_shape = [item for sublist in l for item in sublist] 113 | return pad_shape 114 | 115 | 116 | def shift_1d(x): 117 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 118 | return x 119 | 120 | 121 | def sequence_mask(length, max_length=None): 122 | if max_length is None: 123 | max_length = length.max() 124 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 125 | return x.unsqueeze(0) < length.unsqueeze(1) 126 | 127 | 128 | def generate_path(duration, mask): 129 | """ 130 | duration: [b, 1, t_x] 131 | mask: [b, 1, t_y, t_x] 132 | """ 133 | device = duration.device 134 | 135 | b, _, t_y, t_x = mask.shape 136 | cum_duration = torch.cumsum(duration, -1) 137 | 138 | cum_duration_flat = cum_duration.view(b * t_x) 139 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 140 | path = path.view(b, t_x, t_y) 141 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 142 | path = path.unsqueeze(1).transpose(2, 3) * mask 143 | return path 144 | 145 | 146 | def clip_grad_value_(parameters, clip_value, norm_type=2): 147 | if isinstance(parameters, torch.Tensor): 148 | parameters = [parameters] 149 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 150 | norm_type = float(norm_type) 151 | if clip_value is not None: 152 | clip_value = float(clip_value) 153 | 154 | total_norm = 0 155 | for p in parameters: 156 | param_norm = p.grad.data.norm(norm_type) 157 | total_norm += param_norm.item() ** norm_type 158 | if clip_value is not None: 159 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 160 | total_norm = 
total_norm ** (1.0 / norm_type) 161 | return total_norm 162 | 163 | 164 | def squeeze(x, x_mask=None, n_sqz=2): 165 | b, c, t = x.size() 166 | 167 | t = (t // n_sqz) * n_sqz 168 | x = x[:, :, :t] 169 | x_sqz = x.view(b, c, t // n_sqz, n_sqz) 170 | x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) 171 | 172 | if x_mask is not None: 173 | x_mask = x_mask[:, :, n_sqz - 1 :: n_sqz] 174 | else: 175 | x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) 176 | return x_sqz * x_mask, x_mask 177 | 178 | 179 | def unsqueeze(x, x_mask=None, n_sqz=2): 180 | b, c, t = x.size() 181 | 182 | x_unsqz = x.view(b, n_sqz, c // n_sqz, t) 183 | x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) 184 | 185 | if x_mask is not None: 186 | x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) 187 | else: 188 | x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) 189 | return x_unsqz * x_mask, x_mask 190 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/module/losses.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch.nn import functional as F 5 | 6 | 7 | def feature_loss(fmap_r, fmap_g): 8 | loss = 0 9 | for dr, dg in zip(fmap_r, fmap_g): 10 | for rl, gl in zip(dr, dg): 11 | rl = rl.float().detach() 12 | gl = gl.float() 13 | loss += torch.mean(torch.abs(rl - gl)) 14 | 15 | return loss * 2 16 | 17 | 18 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 19 | loss = 0 20 | r_losses = [] 21 | g_losses = [] 22 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 23 | dr = dr.float() 24 | dg = dg.float() 25 | r_loss = torch.mean((1 - dr) ** 2) 26 | g_loss = torch.mean(dg**2) 27 | loss += r_loss + g_loss 28 | r_losses.append(r_loss.item()) 29 | g_losses.append(g_loss.item()) 30 | 31 | return loss, r_losses, g_losses 32 | 33 | 34 | def generator_loss(disc_outputs): 35 | loss = 0 36 | gen_losses = [] 37 | for dg in disc_outputs: 38 | dg = dg.float() 39 | l = torch.mean((1 - dg) ** 2) 40 | gen_losses.append(l) 41 | loss += l 42 | 43 | return loss, gen_losses 44 | 45 | 46 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 47 | """ 48 | z_p, logs_q: [b, h, t_t] 49 | m_p, logs_p: [b, h, t_t] 50 | """ 51 | z_p = z_p.float() 52 | logs_q = logs_q.float() 53 | m_p = m_p.float() 54 | logs_p = logs_p.float() 55 | z_mask = z_mask.float() 56 | 57 | kl = logs_p - logs_q - 0.5 58 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 59 | kl = torch.sum(kl * z_mask) 60 | l = kl / torch.sum(z_mask) 61 | return l 62 | 63 | 64 | def mle_loss(z, m, logs, logdet, mask): 65 | l = torch.sum(logs) + 0.5 * torch.sum( 66 | torch.exp(-2 * logs) * ((z - m) ** 2) 67 | ) # neg normal likelihood w/o the constant term 68 | l = l - torch.sum(logdet) # log jacobian determinant 69 | l = l / torch.sum( 70 | torch.ones_like(z) * mask 71 | ) # averaging across batch, channel and time axes 72 | l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term 73 | return l 74 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/module/mel_processing.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import torch 5 | from torch import nn 6 | import torch.nn.functional as F 7 | import torch.utils.data 8 | import 
numpy as np 9 | import librosa 10 | import librosa.util as librosa_util 11 | from librosa.util import normalize, pad_center, tiny 12 | from scipy.signal import get_window 13 | from scipy.io.wavfile import read 14 | from librosa.filters import mel as librosa_mel_fn 15 | 16 | MAX_WAV_VALUE = 32768.0 17 | 18 | 19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 20 | """ 21 | PARAMS 22 | ------ 23 | C: compression factor 24 | """ 25 | return torch.log(torch.clamp(x, min=clip_val) * C) 26 | 27 | 28 | def dynamic_range_decompression_torch(x, C=1): 29 | """ 30 | PARAMS 31 | ------ 32 | C: compression factor used to compress 33 | """ 34 | return torch.exp(x) / C 35 | 36 | 37 | def spectral_normalize_torch(magnitudes): 38 | output = dynamic_range_compression_torch(magnitudes) 39 | return output 40 | 41 | 42 | def spectral_de_normalize_torch(magnitudes): 43 | output = dynamic_range_decompression_torch(magnitudes) 44 | return output 45 | 46 | 47 | mel_basis = {} 48 | hann_window = {} 49 | 50 | 51 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 52 | if torch.min(y) < -1.0: 53 | print("min value is ", torch.min(y)) 54 | if torch.max(y) > 1.0: 55 | print("max value is ", torch.max(y)) 56 | 57 | global hann_window 58 | dtype_device = str(y.dtype) + "_" + str(y.device) 59 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 60 | if wnsize_dtype_device not in hann_window: 61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 62 | dtype=y.dtype, device=y.device 63 | ) 64 | 65 | y = torch.nn.functional.pad( 66 | y.unsqueeze(1), 67 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 68 | mode="reflect", 69 | ) 70 | y = y.squeeze(1) 71 | spec = torch.stft( 72 | y, 73 | n_fft, 74 | hop_length=hop_size, 75 | win_length=win_size, 76 | window=hann_window[wnsize_dtype_device], 77 | center=center, 78 | pad_mode="reflect", 79 | normalized=False, 80 | onesided=True, 81 | return_complex=False, 82 | ) 83 | 84 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 85 | return spec 86 | 87 | 88 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 89 | global mel_basis 90 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 91 | fmax_dtype_device = str(fmax) + "_" + dtype_device 92 | if fmax_dtype_device not in mel_basis: 93 | mel = librosa_mel_fn( 94 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 95 | ) 96 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 97 | dtype=spec.dtype, device=spec.device 98 | ) 99 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 100 | spec = spectral_normalize_torch(spec) 101 | return spec 102 | 103 | 104 | def mel_spectrogram_torch( 105 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 106 | ): 107 | if torch.min(y) < -1.0: 108 | print("min value is ", torch.min(y)) 109 | if torch.max(y) > 1.0: 110 | print("max value is ", torch.max(y)) 111 | 112 | global mel_basis, hann_window 113 | dtype_device = str(y.dtype) + "_" + str(y.device) 114 | fmax_dtype_device = str(fmax) + "_" + dtype_device 115 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 116 | if fmax_dtype_device not in mel_basis: 117 | mel = librosa_mel_fn( 118 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 119 | ) 120 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 121 | dtype=y.dtype, device=y.device 122 | ) 123 | if wnsize_dtype_device not in hann_window: 124 | hann_window[wnsize_dtype_device] = 
torch.hann_window(win_size).to( 125 | dtype=y.dtype, device=y.device 126 | ) 127 | 128 | y = torch.nn.functional.pad( 129 | y.unsqueeze(1), 130 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 131 | mode="reflect", 132 | ) 133 | y = y.squeeze(1) 134 | 135 | spec = torch.stft( 136 | y, 137 | n_fft, 138 | hop_length=hop_size, 139 | win_length=win_size, 140 | window=hann_window[wnsize_dtype_device], 141 | center=center, 142 | pad_mode="reflect", 143 | normalized=False, 144 | onesided=True, 145 | return_complex=False, 146 | ) 147 | 148 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 149 | 150 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 151 | spec = spectral_normalize_torch(spec) 152 | 153 | return spec 154 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/module/mrte_model.py: -------------------------------------------------------------------------------- 1 | # This is Multi-reference timbre encoder 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn.utils import remove_weight_norm, weight_norm 6 | from module.attentions import MultiHeadAttention 7 | 8 | 9 | class MRTE(nn.Module): 10 | def __init__( 11 | self, 12 | content_enc_channels=192, 13 | hidden_size=512, 14 | out_channels=192, 15 | kernel_size=5, 16 | n_heads=4, 17 | ge_layer=2, 18 | ): 19 | super(MRTE, self).__init__() 20 | self.cross_attention = MultiHeadAttention(hidden_size, hidden_size, n_heads) 21 | self.c_pre = nn.Conv1d(content_enc_channels, hidden_size, 1) 22 | self.text_pre = nn.Conv1d(content_enc_channels, hidden_size, 1) 23 | self.c_post = nn.Conv1d(hidden_size, out_channels, 1) 24 | 25 | def forward(self, ssl_enc, ssl_mask, text, text_mask, ge, test=None): 26 | if ge == None: 27 | ge = 0 28 | attn_mask = text_mask.unsqueeze(2) * ssl_mask.unsqueeze(-1) 29 | 30 | ssl_enc = self.c_pre(ssl_enc * ssl_mask) 31 | text_enc = self.text_pre(text * text_mask) 32 | if test != None: 33 | if test == 0: 34 | x = ( 35 | self.cross_attention( 36 | ssl_enc * ssl_mask, text_enc * text_mask, attn_mask 37 | ) 38 | + ssl_enc 39 | + ge 40 | ) 41 | elif test == 1: 42 | x = ssl_enc + ge 43 | elif test == 2: 44 | x = ( 45 | self.cross_attention( 46 | ssl_enc * 0 * ssl_mask, text_enc * text_mask, attn_mask 47 | ) 48 | + ge 49 | ) 50 | else: 51 | raise ValueError("test should be 0,1,2") 52 | else: 53 | x = ( 54 | self.cross_attention( 55 | ssl_enc * ssl_mask, text_enc * text_mask, attn_mask 56 | ) 57 | + ssl_enc 58 | + ge 59 | ) 60 | x = self.c_post(x * ssl_mask) 61 | return x 62 | 63 | 64 | class SpeakerEncoder(torch.nn.Module): 65 | def __init__( 66 | self, 67 | mel_n_channels=80, 68 | model_num_layers=2, 69 | model_hidden_size=256, 70 | model_embedding_size=256, 71 | ): 72 | super(SpeakerEncoder, self).__init__() 73 | self.lstm = nn.LSTM( 74 | mel_n_channels, model_hidden_size, model_num_layers, batch_first=True 75 | ) 76 | self.linear = nn.Linear(model_hidden_size, model_embedding_size) 77 | self.relu = nn.ReLU() 78 | 79 | def forward(self, mels): 80 | self.lstm.flatten_parameters() 81 | _, (hidden, _) = self.lstm(mels.transpose(-1, -2)) 82 | embeds_raw = self.relu(self.linear(hidden[-1])) 83 | return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) 84 | 85 | 86 | class MELEncoder(nn.Module): 87 | def __init__( 88 | self, 89 | in_channels, 90 | out_channels, 91 | hidden_channels, 92 | kernel_size, 93 | dilation_rate, 94 | n_layers, 95 | ): 96 | super().__init__() 97 | self.in_channels = in_channels 98 | self.out_channels = 
out_channels 99 | self.hidden_channels = hidden_channels 100 | self.kernel_size = kernel_size 101 | self.dilation_rate = dilation_rate 102 | self.n_layers = n_layers 103 | 104 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 105 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers) 106 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 107 | 108 | def forward(self, x): 109 | # print(x.shape,x_lengths.shape) 110 | x = self.pre(x) 111 | x = self.enc(x) 112 | x = self.proj(x) 113 | return x 114 | 115 | 116 | class WN(torch.nn.Module): 117 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers): 118 | super(WN, self).__init__() 119 | assert kernel_size % 2 == 1 120 | self.hidden_channels = hidden_channels 121 | self.kernel_size = kernel_size 122 | self.dilation_rate = dilation_rate 123 | self.n_layers = n_layers 124 | 125 | self.in_layers = torch.nn.ModuleList() 126 | self.res_skip_layers = torch.nn.ModuleList() 127 | 128 | for i in range(n_layers): 129 | dilation = dilation_rate**i 130 | padding = int((kernel_size * dilation - dilation) / 2) 131 | in_layer = nn.Conv1d( 132 | hidden_channels, 133 | 2 * hidden_channels, 134 | kernel_size, 135 | dilation=dilation, 136 | padding=padding, 137 | ) 138 | in_layer = weight_norm(in_layer) 139 | self.in_layers.append(in_layer) 140 | 141 | # last one is not necessary 142 | if i < n_layers - 1: 143 | res_skip_channels = 2 * hidden_channels 144 | else: 145 | res_skip_channels = hidden_channels 146 | 147 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 148 | res_skip_layer = weight_norm(res_skip_layer, name="weight") 149 | self.res_skip_layers.append(res_skip_layer) 150 | 151 | def forward(self, x): 152 | output = torch.zeros_like(x) 153 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 154 | 155 | for i in range(self.n_layers): 156 | x_in = self.in_layers[i](x) 157 | 158 | acts = fused_add_tanh_sigmoid_multiply(x_in, n_channels_tensor) 159 | 160 | res_skip_acts = self.res_skip_layers[i](acts) 161 | if i < self.n_layers - 1: 162 | res_acts = res_skip_acts[:, : self.hidden_channels, :] 163 | x = x + res_acts 164 | output = output + res_skip_acts[:, self.hidden_channels :, :] 165 | else: 166 | output = output + res_skip_acts 167 | return output 168 | 169 | def remove_weight_norm(self): 170 | for l in self.in_layers: 171 | remove_weight_norm(l) 172 | for l in self.res_skip_layers: 173 | remove_weight_norm(l) 174 | 175 | 176 | @torch.jit.script 177 | def fused_add_tanh_sigmoid_multiply(input, n_channels): 178 | n_channels_int = n_channels[0] 179 | t_act = torch.tanh(input[:, :n_channels_int, :]) 180 | s_act = torch.sigmoid(input[:, n_channels_int:, :]) 181 | acts = t_act * s_act 182 | return acts 183 | 184 | 185 | if __name__ == "__main__": 186 | content_enc = torch.randn(3, 192, 100) 187 | content_mask = torch.ones(3, 1, 100) 188 | ref_mel = torch.randn(3, 128, 30) 189 | ref_mask = torch.ones(3, 1, 30) 190 | model = MRTE() 191 | out = model(content_enc, content_mask, ref_mel, ref_mask) 192 | print(out.shape) 193 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/module/quantize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | """Residual vector quantizer implementation.""" 8 | 9 | from dataclasses import dataclass, field 10 | import math 11 | import typing as tp 12 | 13 | import torch 14 | from torch import nn 15 | 16 | from module.core_vq import ResidualVectorQuantization 17 | 18 | 19 | @dataclass 20 | class QuantizedResult: 21 | quantized: torch.Tensor 22 | codes: torch.Tensor 23 | bandwidth: torch.Tensor # bandwidth in kb/s used, per batch item. 24 | penalty: tp.Optional[torch.Tensor] = None 25 | metrics: dict = field(default_factory=dict) 26 | 27 | 28 | class ResidualVectorQuantizer(nn.Module): 29 | """Residual Vector Quantizer. 30 | Args: 31 | dimension (int): Dimension of the codebooks. 32 | n_q (int): Number of residual vector quantizers used. 33 | bins (int): Codebook size. 34 | decay (float): Decay for exponential moving average over the codebooks. 35 | kmeans_init (bool): Whether to use kmeans to initialize the codebooks. 36 | kmeans_iters (int): Number of iterations used for kmeans initialization. 37 | threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes 38 | that have an exponential moving average cluster size less than the specified threshold with 39 | randomly selected vector from the current batch. 40 | """ 41 | 42 | def __init__( 43 | self, 44 | dimension: int = 256, 45 | n_q: int = 8, 46 | bins: int = 1024, 47 | decay: float = 0.99, 48 | kmeans_init: bool = True, 49 | kmeans_iters: int = 50, 50 | threshold_ema_dead_code: int = 2, 51 | ): 52 | super().__init__() 53 | self.n_q = n_q 54 | self.dimension = dimension 55 | self.bins = bins 56 | self.decay = decay 57 | self.kmeans_init = kmeans_init 58 | self.kmeans_iters = kmeans_iters 59 | self.threshold_ema_dead_code = threshold_ema_dead_code 60 | self.vq = ResidualVectorQuantization( 61 | dim=self.dimension, 62 | codebook_size=self.bins, 63 | num_quantizers=self.n_q, 64 | decay=self.decay, 65 | kmeans_init=self.kmeans_init, 66 | kmeans_iters=self.kmeans_iters, 67 | threshold_ema_dead_code=self.threshold_ema_dead_code, 68 | ) 69 | 70 | def forward( 71 | self, 72 | x: torch.Tensor, 73 | n_q: tp.Optional[int] = None, 74 | layers: tp.Optional[list] = None, 75 | ) -> QuantizedResult: 76 | """Residual vector quantization on the given input tensor. 77 | Args: 78 | x (torch.Tensor): Input tensor. 79 | n_q (int): Number of quantizer used to quantize. Default: All quantizers. 80 | layers (list): Layer that need to return quantized. Defalt: None. 81 | Returns: 82 | QuantizedResult: 83 | The quantized (or approximately quantized) representation with 84 | the associated numbert quantizers and layer quantized required to return. 85 | """ 86 | n_q = n_q if n_q else self.n_q 87 | if layers and max(layers) >= n_q: 88 | raise ValueError( 89 | f"Last layer index in layers: A {max(layers)}. Number of quantizers in RVQ: B {self.n_q}. A must less than B." 90 | ) 91 | quantized, codes, commit_loss, quantized_list = self.vq( 92 | x, n_q=n_q, layers=layers 93 | ) 94 | return quantized, codes, torch.mean(commit_loss), quantized_list 95 | 96 | def encode( 97 | self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None 98 | ) -> torch.Tensor: 99 | """Encode a given input tensor with the specified sample rate at the given bandwidth. 100 | The RVQ encode method sets the appropriate number of quantizer to use 101 | and returns indices for each quantizer. 102 | Args: 103 | x (torch.Tensor): Input tensor. 104 | n_q (int): Number of quantizer used to quantize. Default: All quantizers. 
105 | st (int): Start to encode input from which layers. Default: 0. 106 | """ 107 | n_q = n_q if n_q else self.n_q 108 | st = st or 0 109 | codes = self.vq.encode(x, n_q=n_q, st=st) 110 | return codes 111 | 112 | def decode(self, codes: torch.Tensor, st: int = 0) -> torch.Tensor: 113 | """Decode the given codes to the quantized representation. 114 | Args: 115 | codes (torch.Tensor): Input indices for each quantizer. 116 | st (int): Start to decode input codes from which layers. Default: 0. 117 | """ 118 | quantized = self.vq.decode(codes, st=st) 119 | return quantized 120 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/text/__init__.py: -------------------------------------------------------------------------------- 1 | from text.symbols import * 2 | 3 | 4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 5 | 6 | def cleaned_text_to_sequence(cleaned_text): 7 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 8 | Args: 9 | text: string to convert to a sequence 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | ''' 13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 14 | return phones 15 | 16 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/text/chinese.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pdb 3 | import re 4 | 5 | import cn2an 6 | from pypinyin import lazy_pinyin, Style 7 | 8 | from text.symbols import punctuation 9 | from text.tone_sandhi import ToneSandhi 10 | from text.zh_normalization.text_normlization import TextNormalizer 11 | 12 | normalizer = lambda x: cn2an.transform(x, "an2cn") 13 | 14 | current_file_path = os.path.dirname(__file__) 15 | pinyin_to_symbol_map = { 16 | line.split("\t")[0]: line.strip().split("\t")[1] 17 | for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines() 18 | } 19 | 20 | import jieba_fast.posseg as psg 21 | 22 | 23 | rep_map = { 24 | ":": ",", 25 | ";": ",", 26 | ",": ",", 27 | "。": ".", 28 | "!": "!", 29 | "?": "?", 30 | "\n": ".", 31 | "·": ",", 32 | "、": ",", 33 | "...": "…", 34 | "$": ".", 35 | "/": ",", 36 | "—": "-", 37 | "~": "…", 38 | "~":"…", 39 | } 40 | 41 | tone_modifier = ToneSandhi() 42 | 43 | 44 | def replace_punctuation(text): 45 | text = text.replace("嗯", "恩").replace("呣", "母") 46 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 47 | 48 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 49 | 50 | replaced_text = re.sub( 51 | r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text 52 | ) 53 | 54 | return replaced_text 55 | 56 | 57 | def g2p(text): 58 | pattern = r"(?<=[{0}])\s*".format("".join(punctuation)) 59 | sentences = [i for i in re.split(pattern, text) if i.strip() != ""] 60 | phones, word2ph = _g2p(sentences) 61 | return phones, word2ph 62 | 63 | 64 | def _get_initials_finals(word): 65 | initials = [] 66 | finals = [] 67 | orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS) 68 | orig_finals = lazy_pinyin( 69 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3 70 | ) 71 | for c, v in zip(orig_initials, orig_finals): 72 | initials.append(c) 73 | finals.append(v) 74 | return initials, finals 75 | 76 | 77 | def _g2p(segments): 78 | phones_list = [] 79 | word2ph = [] 80 | for seg in segments: 81 | pinyins = [] 82 | # Replace 
all English words in the sentence 83 | seg = re.sub("[a-zA-Z]+", "", seg) 84 | seg_cut = psg.lcut(seg) 85 | initials = [] 86 | finals = [] 87 | seg_cut = tone_modifier.pre_merge_for_modify(seg_cut) 88 | for word, pos in seg_cut: 89 | if pos == "eng": 90 | continue 91 | sub_initials, sub_finals = _get_initials_finals(word) 92 | sub_finals = tone_modifier.modified_tone(word, pos, sub_finals) 93 | initials.append(sub_initials) 94 | finals.append(sub_finals) 95 | 96 | # assert len(sub_initials) == len(sub_finals) == len(word) 97 | initials = sum(initials, []) 98 | finals = sum(finals, []) 99 | # 100 | for c, v in zip(initials, finals): 101 | raw_pinyin = c + v 102 | # NOTE: post process for pypinyin outputs 103 | # we discriminate i, ii and iii 104 | if c == v: 105 | assert c in punctuation 106 | phone = [c] 107 | word2ph.append(1) 108 | else: 109 | v_without_tone = v[:-1] 110 | tone = v[-1] 111 | 112 | pinyin = c + v_without_tone 113 | assert tone in "12345" 114 | 115 | if c: 116 | # 多音节 117 | v_rep_map = { 118 | "uei": "ui", 119 | "iou": "iu", 120 | "uen": "un", 121 | } 122 | if v_without_tone in v_rep_map.keys(): 123 | pinyin = c + v_rep_map[v_without_tone] 124 | else: 125 | # 单音节 126 | pinyin_rep_map = { 127 | "ing": "ying", 128 | "i": "yi", 129 | "in": "yin", 130 | "u": "wu", 131 | } 132 | if pinyin in pinyin_rep_map.keys(): 133 | pinyin = pinyin_rep_map[pinyin] 134 | else: 135 | single_rep_map = { 136 | "v": "yu", 137 | "e": "e", 138 | "i": "y", 139 | "u": "w", 140 | } 141 | if pinyin[0] in single_rep_map.keys(): 142 | pinyin = single_rep_map[pinyin[0]] + pinyin[1:] 143 | 144 | assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin) 145 | new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ") 146 | new_v = new_v + tone 147 | phone = [new_c, new_v] 148 | word2ph.append(len(phone)) 149 | 150 | phones_list += phone 151 | return phones_list, word2ph 152 | 153 | 154 | def text_normalize(text): 155 | # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization 156 | tx = TextNormalizer() 157 | sentences = tx.normalize(text) 158 | dest_text = "" 159 | for sentence in sentences: 160 | dest_text += replace_punctuation(sentence) 161 | return dest_text 162 | 163 | 164 | if __name__ == "__main__": 165 | text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏" 166 | text = "呣呣呣~就是…大人的鼹鼠党吧?" 167 | text = "你好" 168 | text = text_normalize(text) 169 | print(g2p(text)) 170 | 171 | 172 | # # 示例用法 173 | # text = "这是一个示例文本:,你好!这是一个测试..." 
174 | # print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试 175 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from text import chinese, japanese, cleaned_text_to_sequence, symbols, english 2 | 3 | language_module_map = {"zh": chinese, "ja": japanese, "en": english} 4 | special = [ 5 | # ("%", "zh", "SP"), 6 | ("¥", "zh", "SP2"), 7 | ("^", "zh", "SP3"), 8 | # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧 9 | ] 10 | 11 | 12 | def clean_text(text, language): 13 | if(language not in language_module_map): 14 | language="en" 15 | text=" " 16 | for special_s, special_l, target_symbol in special: 17 | if special_s in text and language == special_l: 18 | return clean_special(text, language, special_s, target_symbol) 19 | language_module = language_module_map[language] 20 | norm_text = language_module.text_normalize(text) 21 | if language == "zh": 22 | phones, word2ph = language_module.g2p(norm_text) 23 | assert len(phones) == sum(word2ph) 24 | assert len(norm_text) == len(word2ph) 25 | else: 26 | phones = language_module.g2p(norm_text) 27 | word2ph = None 28 | 29 | for ph in phones: 30 | assert ph in symbols 31 | return phones, word2ph, norm_text 32 | 33 | 34 | def clean_special(text, language, special_s, target_symbol): 35 | """ 36 | 特殊静音段sp符号处理 37 | """ 38 | text = text.replace(special_s, ",") 39 | language_module = language_module_map[language] 40 | norm_text = language_module.text_normalize(text) 41 | phones = language_module.g2p(norm_text) 42 | new_ph = [] 43 | for ph in phones[0]: 44 | assert ph in symbols 45 | if ph == ",": 46 | new_ph.append(target_symbol) 47 | else: 48 | new_ph.append(ph) 49 | return new_ph, phones[1], norm_text 50 | 51 | 52 | def text_to_sequence(text, language): 53 | phones = clean_text(text) 54 | return cleaned_text_to_sequence(phones) 55 | 56 | 57 | if __name__ == "__main__": 58 | print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh")) 59 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/text/engdict-hot.rep: -------------------------------------------------------------------------------- 1 | CHATGPT CH AE1 T JH IY1 P IY1 T IY1 -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/text/engdict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-T-E-R/Uni-TTS/3eb56ecaa40759aaa537719e3dee117813fea7f7/Adapters/gsv_fast/TTS_infer_pack/text/engdict_cache.pickle -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/text/english.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import re 4 | import wordsegment 5 | from g2p_en import G2p 6 | 7 | from string import punctuation 8 | 9 | from text import symbols 10 | 11 | current_file_path = os.path.dirname(__file__) 12 | CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep") 13 | CMU_DICT_FAST_PATH = os.path.join(current_file_path, "cmudict-fast.rep") 14 | CMU_DICT_HOT_PATH = os.path.join(current_file_path, "engdict-hot.rep") 15 | CACHE_PATH = os.path.join(current_file_path, "engdict_cache.pickle") 16 | 17 | arpa = { 18 | "AH0", 19 | "S", 20 | "AH1", 21 | "EY2", 22 | "AE2", 23 | "EH0", 24 | "OW2", 25 | "UH0", 26 | "NG", 27 | "B", 28 | "G", 29 | 
"AY0", 30 | "M", 31 | "AA0", 32 | "F", 33 | "AO0", 34 | "ER2", 35 | "UH1", 36 | "IY1", 37 | "AH2", 38 | "DH", 39 | "IY0", 40 | "EY1", 41 | "IH0", 42 | "K", 43 | "N", 44 | "W", 45 | "IY2", 46 | "T", 47 | "AA1", 48 | "ER1", 49 | "EH2", 50 | "OY0", 51 | "UH2", 52 | "UW1", 53 | "Z", 54 | "AW2", 55 | "AW1", 56 | "V", 57 | "UW2", 58 | "AA2", 59 | "ER", 60 | "AW0", 61 | "UW0", 62 | "R", 63 | "OW1", 64 | "EH1", 65 | "ZH", 66 | "AE0", 67 | "IH2", 68 | "IH", 69 | "Y", 70 | "JH", 71 | "P", 72 | "AY1", 73 | "EY0", 74 | "OY2", 75 | "TH", 76 | "HH", 77 | "D", 78 | "ER0", 79 | "CH", 80 | "AO1", 81 | "AE1", 82 | "AO2", 83 | "OY1", 84 | "AY2", 85 | "IH1", 86 | "OW0", 87 | "L", 88 | "SH", 89 | } 90 | 91 | 92 | def replace_phs(phs): 93 | rep_map = {"'": "-"} 94 | phs_new = [] 95 | for ph in phs: 96 | if ph in symbols: 97 | phs_new.append(ph) 98 | elif ph in rep_map.keys(): 99 | phs_new.append(rep_map[ph]) 100 | else: 101 | print("ph not in symbols: ", ph) 102 | return phs_new 103 | 104 | 105 | def read_dict(): 106 | g2p_dict = {} 107 | start_line = 49 108 | with open(CMU_DICT_PATH) as f: 109 | line = f.readline() 110 | line_index = 1 111 | while line: 112 | if line_index >= start_line: 113 | line = line.strip() 114 | word_split = line.split(" ") 115 | word = word_split[0].lower() 116 | 117 | syllable_split = word_split[1].split(" - ") 118 | g2p_dict[word] = [] 119 | for syllable in syllable_split: 120 | phone_split = syllable.split(" ") 121 | g2p_dict[word].append(phone_split) 122 | 123 | line_index = line_index + 1 124 | line = f.readline() 125 | 126 | return g2p_dict 127 | 128 | 129 | def read_dict_new(): 130 | g2p_dict = {} 131 | with open(CMU_DICT_PATH) as f: 132 | line = f.readline() 133 | line_index = 1 134 | while line: 135 | if line_index >= 57: 136 | line = line.strip() 137 | word_split = line.split(" ") 138 | word = word_split[0].lower() 139 | g2p_dict[word] = [word_split[1].split(" ")] 140 | 141 | line_index = line_index + 1 142 | line = f.readline() 143 | 144 | with open(CMU_DICT_FAST_PATH) as f: 145 | line = f.readline() 146 | line_index = 1 147 | while line: 148 | if line_index >= 0: 149 | line = line.strip() 150 | word_split = line.split(" ") 151 | word = word_split[0].lower() 152 | if word not in g2p_dict: 153 | g2p_dict[word] = [word_split[1:]] 154 | 155 | line_index = line_index + 1 156 | line = f.readline() 157 | 158 | with open(CMU_DICT_HOT_PATH) as f: 159 | line = f.readline() 160 | line_index = 1 161 | while line: 162 | if line_index >= 0: 163 | line = line.strip() 164 | word_split = line.split(" ") 165 | word = word_split[0].lower() 166 | # 自定义发音词直接覆盖字典 167 | g2p_dict[word] = [word_split[1:]] 168 | 169 | line_index = line_index + 1 170 | line = f.readline() 171 | 172 | return g2p_dict 173 | 174 | 175 | def cache_dict(g2p_dict, file_path): 176 | with open(file_path, "wb") as pickle_file: 177 | pickle.dump(g2p_dict, pickle_file) 178 | 179 | 180 | def get_dict(): 181 | if os.path.exists(CACHE_PATH): 182 | with open(CACHE_PATH, "rb") as pickle_file: 183 | g2p_dict = pickle.load(pickle_file) 184 | else: 185 | g2p_dict = read_dict_new() 186 | cache_dict(g2p_dict, CACHE_PATH) 187 | 188 | return g2p_dict 189 | 190 | 191 | eng_dict = get_dict() 192 | 193 | 194 | def text_normalize(text): 195 | # todo: eng text normalize 196 | # 适配中文及 g2p_en 标点 197 | rep_map = { 198 | "[;::,;]": ",", 199 | '["’]': "'", 200 | "。": ".", 201 | "!": "!", 202 | "?": "?", 203 | } 204 | for p, r in rep_map.items(): 205 | text = re.sub(p, r, text) 206 | 207 | return text 208 | 209 | 210 | class en_G2p(G2p): 211 | def 
__init__(self): 212 | super().__init__() 213 | # 分词初始化 214 | wordsegment.load() 215 | 216 | # 扩展过时字典 217 | self.cmu = get_dict() 218 | 219 | # 剔除读音错误的几个缩写 220 | for word in ["AE", "AI", "AR", "IOS", "HUD", "OS"]: 221 | del self.cmu[word.lower()] 222 | 223 | # "A" 落单不读 "AH0" 读 "EY1" 224 | self.cmu['a'] = [['EY1']] 225 | 226 | 227 | def predict(self, word): 228 | # 小写 oov 长度小于等于 3 直接读字母 229 | if (len(word) <= 3): 230 | return [phone for w in word for phone in self(w)] 231 | 232 | # 尝试分离所有格 233 | if re.match(r"^([a-z]+)('s)$", word): 234 | phone = self(word[:-2]) 235 | phone.extend(['Z']) 236 | return phone 237 | 238 | # 尝试进行分词,应对复合词 239 | comps = wordsegment.segment(word.lower()) 240 | 241 | # 无法分词的送回去预测 242 | if len(comps)==1: 243 | return super().predict(word) 244 | 245 | # 可以分词的递归处理 246 | return [phone for comp in comps for phone in self(comp)] 247 | 248 | 249 | _g2p = en_G2p() 250 | 251 | 252 | def g2p(text): 253 | # g2p_en 整段推理,剔除不存在的arpa返回 254 | phone_list = _g2p(text) 255 | phones = [ph if ph != "" else "UNK" for ph in phone_list if ph not in [" ", "", "UW", "", ""]] 256 | 257 | return replace_phs(phones) 258 | 259 | 260 | if __name__ == "__main__": 261 | # print(get_dict()) 262 | print(g2p("hello")) 263 | print(g2p("In this; paper, we propose 1 DSPGAN, a GAN-based universal vocoder.")) 264 | # all_phones = set() 265 | # for k, syllables in eng_dict.items(): 266 | # for group in syllables: 267 | # for ph in group: 268 | # all_phones.add(ph) 269 | # print(all_phones) 270 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/text/japanese.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py 2 | import re 3 | import sys 4 | 5 | import pyopenjtalk 6 | 7 | 8 | from text import symbols 9 | # Regular expression matching Japanese without punctuation marks: 10 | _japanese_characters = re.compile( 11 | r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" 12 | ) 13 | 14 | # Regular expression matching non-Japanese characters or punctuation marks: 15 | _japanese_marks = re.compile( 16 | r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" 17 | ) 18 | 19 | # List of (symbol, Japanese) pairs for marks: 20 | _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]] 21 | 22 | 23 | # List of (consonant, sokuon) pairs: 24 | _real_sokuon = [ 25 | (re.compile("%s" % x[0]), x[1]) 26 | for x in [ 27 | (r"Q([↑↓]*[kg])", r"k#\1"), 28 | (r"Q([↑↓]*[tdjʧ])", r"t#\1"), 29 | (r"Q([↑↓]*[sʃ])", r"s\1"), 30 | (r"Q([↑↓]*[pb])", r"p#\1"), 31 | ] 32 | ] 33 | 34 | # List of (consonant, hatsuon) pairs: 35 | _real_hatsuon = [ 36 | (re.compile("%s" % x[0]), x[1]) 37 | for x in [ 38 | (r"N([↑↓]*[pbm])", r"m\1"), 39 | (r"N([↑↓]*[ʧʥj])", r"n^\1"), 40 | (r"N([↑↓]*[tdn])", r"n\1"), 41 | (r"N([↑↓]*[kg])", r"ŋ\1"), 42 | ] 43 | ] 44 | 45 | 46 | def post_replace_ph(ph): 47 | rep_map = { 48 | ":": ",", 49 | ";": ",", 50 | ",": ",", 51 | "。": ".", 52 | "!": "!", 53 | "?": "?", 54 | "\n": ".", 55 | "·": ",", 56 | "、": ",", 57 | "...": "…", 58 | } 59 | if ph in rep_map.keys(): 60 | ph = rep_map[ph] 61 | if ph in symbols: 62 | return ph 63 | if ph not in symbols: 64 | ph = "UNK" 65 | return ph 66 | 67 | 68 | def symbols_to_japanese(text): 69 | for regex, replacement in _symbols_to_japanese: 70 | text = re.sub(regex, replacement, text) 71 | return 
text 72 | 73 | 74 | def preprocess_jap(text, with_prosody=False): 75 | """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html""" 76 | text = symbols_to_japanese(text) 77 | sentences = re.split(_japanese_marks, text) 78 | marks = re.findall(_japanese_marks, text) 79 | text = [] 80 | for i, sentence in enumerate(sentences): 81 | if re.match(_japanese_characters, sentence): 82 | if with_prosody: 83 | text += pyopenjtalk_g2p_prosody(sentence)[1:-1] 84 | else: 85 | p = pyopenjtalk.g2p(sentence) 86 | text += p.split(" ") 87 | 88 | if i < len(marks): 89 | if marks[i] == " ":# 防止意外的UNK 90 | continue 91 | text += [marks[i].replace(" ", "")] 92 | return text 93 | 94 | 95 | def text_normalize(text): 96 | # todo: jap text normalize 97 | return text 98 | 99 | # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py 100 | def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True): 101 | """Extract phoneme + prosoody symbol sequence from input full-context labels. 102 | 103 | The algorithm is based on `Prosodic features control by symbols as input of 104 | sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks. 105 | 106 | Args: 107 | text (str): Input text. 108 | drop_unvoiced_vowels (bool): whether to drop unvoiced vowels. 109 | 110 | Returns: 111 | List[str]: List of phoneme + prosody symbols. 112 | 113 | Examples: 114 | >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody 115 | >>> pyopenjtalk_g2p_prosody("こんにちは。") 116 | ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$'] 117 | 118 | .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic 119 | modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104 120 | 121 | """ 122 | labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text)) 123 | N = len(labels) 124 | 125 | phones = [] 126 | for n in range(N): 127 | lab_curr = labels[n] 128 | 129 | # current phoneme 130 | p3 = re.search(r"\-(.*?)\+", lab_curr).group(1) 131 | # deal unvoiced vowels as normal vowels 132 | if drop_unvoiced_vowels and p3 in "AEIOU": 133 | p3 = p3.lower() 134 | 135 | # deal with sil at the beginning and the end of text 136 | if p3 == "sil": 137 | assert n == 0 or n == N - 1 138 | if n == 0: 139 | phones.append("^") 140 | elif n == N - 1: 141 | # check question form or not 142 | e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr) 143 | if e3 == 0: 144 | phones.append("$") 145 | elif e3 == 1: 146 | phones.append("?") 147 | continue 148 | elif p3 == "pau": 149 | phones.append("_") 150 | continue 151 | else: 152 | phones.append(p3) 153 | 154 | # accent type and position info (forward or backward) 155 | a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr) 156 | a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr) 157 | a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr) 158 | 159 | # number of mora in accent phrase 160 | f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr) 161 | 162 | a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1]) 163 | # accent phrase border 164 | if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl": 165 | phones.append("#") 166 | # pitch falling 167 | elif a1 == 0 and a2_next == a2 + 1 and a2 != f1: 168 | phones.append("]") 169 | # pitch rising 170 | elif a2 == 1 and a2_next == 2: 171 | phones.append("[") 172 | 173 | return phones 174 | 175 | # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py 176 | def 
_numeric_feature_by_regex(regex, s): 177 | match = re.search(regex, s) 178 | if match is None: 179 | return -50 180 | return int(match.group(1)) 181 | 182 | def g2p(norm_text, with_prosody=False): 183 | phones = preprocess_jap(norm_text, with_prosody) 184 | phones = [post_replace_ph(i) for i in phones] 185 | # todo: implement tones and word2ph 186 | return phones 187 | 188 | 189 | if __name__ == "__main__": 190 | phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね!") 191 | print(phones) -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/text/opencpop-strict.txt: -------------------------------------------------------------------------------- 1 | a AA a 2 | ai AA ai 3 | an AA an 4 | ang AA ang 5 | ao AA ao 6 | ba b a 7 | bai b ai 8 | ban b an 9 | bang b ang 10 | bao b ao 11 | bei b ei 12 | ben b en 13 | beng b eng 14 | bi b i 15 | bian b ian 16 | biao b iao 17 | bie b ie 18 | bin b in 19 | bing b ing 20 | bo b o 21 | bu b u 22 | ca c a 23 | cai c ai 24 | can c an 25 | cang c ang 26 | cao c ao 27 | ce c e 28 | cei c ei 29 | cen c en 30 | ceng c eng 31 | cha ch a 32 | chai ch ai 33 | chan ch an 34 | chang ch ang 35 | chao ch ao 36 | che ch e 37 | chen ch en 38 | cheng ch eng 39 | chi ch ir 40 | chong ch ong 41 | chou ch ou 42 | chu ch u 43 | chua ch ua 44 | chuai ch uai 45 | chuan ch uan 46 | chuang ch uang 47 | chui ch ui 48 | chun ch un 49 | chuo ch uo 50 | ci c i0 51 | cong c ong 52 | cou c ou 53 | cu c u 54 | cuan c uan 55 | cui c ui 56 | cun c un 57 | cuo c uo 58 | da d a 59 | dai d ai 60 | dan d an 61 | dang d ang 62 | dao d ao 63 | de d e 64 | dei d ei 65 | den d en 66 | deng d eng 67 | di d i 68 | dia d ia 69 | dian d ian 70 | diao d iao 71 | die d ie 72 | ding d ing 73 | diu d iu 74 | dong d ong 75 | dou d ou 76 | du d u 77 | duan d uan 78 | dui d ui 79 | dun d un 80 | duo d uo 81 | e EE e 82 | ei EE ei 83 | en EE en 84 | eng EE eng 85 | er EE er 86 | fa f a 87 | fan f an 88 | fang f ang 89 | fei f ei 90 | fen f en 91 | feng f eng 92 | fo f o 93 | fou f ou 94 | fu f u 95 | ga g a 96 | gai g ai 97 | gan g an 98 | gang g ang 99 | gao g ao 100 | ge g e 101 | gei g ei 102 | gen g en 103 | geng g eng 104 | gong g ong 105 | gou g ou 106 | gu g u 107 | gua g ua 108 | guai g uai 109 | guan g uan 110 | guang g uang 111 | gui g ui 112 | gun g un 113 | guo g uo 114 | ha h a 115 | hai h ai 116 | han h an 117 | hang h ang 118 | hao h ao 119 | he h e 120 | hei h ei 121 | hen h en 122 | heng h eng 123 | hong h ong 124 | hou h ou 125 | hu h u 126 | hua h ua 127 | huai h uai 128 | huan h uan 129 | huang h uang 130 | hui h ui 131 | hun h un 132 | huo h uo 133 | ji j i 134 | jia j ia 135 | jian j ian 136 | jiang j iang 137 | jiao j iao 138 | jie j ie 139 | jin j in 140 | jing j ing 141 | jiong j iong 142 | jiu j iu 143 | ju j v 144 | jv j v 145 | juan j van 146 | jvan j van 147 | jue j ve 148 | jve j ve 149 | jun j vn 150 | jvn j vn 151 | ka k a 152 | kai k ai 153 | kan k an 154 | kang k ang 155 | kao k ao 156 | ke k e 157 | kei k ei 158 | ken k en 159 | keng k eng 160 | kong k ong 161 | kou k ou 162 | ku k u 163 | kua k ua 164 | kuai k uai 165 | kuan k uan 166 | kuang k uang 167 | kui k ui 168 | kun k un 169 | kuo k uo 170 | la l a 171 | lai l ai 172 | lan l an 173 | lang l ang 174 | lao l ao 175 | le l e 176 | lei l ei 177 | leng l eng 178 | li l i 179 | lia l ia 180 | lian l ian 181 | liang l iang 182 | liao l iao 183 | lie l ie 184 | lin l in 185 | ling l ing 186 | liu l iu 187 | lo l o 188 | long l ong 189 | lou l ou 190 | lu l u 191 | 
luan l uan 192 | lun l un 193 | luo l uo 194 | lv l v 195 | lve l ve 196 | ma m a 197 | mai m ai 198 | man m an 199 | mang m ang 200 | mao m ao 201 | me m e 202 | mei m ei 203 | men m en 204 | meng m eng 205 | mi m i 206 | mian m ian 207 | miao m iao 208 | mie m ie 209 | min m in 210 | ming m ing 211 | miu m iu 212 | mo m o 213 | mou m ou 214 | mu m u 215 | na n a 216 | nai n ai 217 | nan n an 218 | nang n ang 219 | nao n ao 220 | ne n e 221 | nei n ei 222 | nen n en 223 | neng n eng 224 | ni n i 225 | nian n ian 226 | niang n iang 227 | niao n iao 228 | nie n ie 229 | nin n in 230 | ning n ing 231 | niu n iu 232 | nong n ong 233 | nou n ou 234 | nu n u 235 | nuan n uan 236 | nun n un 237 | nuo n uo 238 | nv n v 239 | nve n ve 240 | o OO o 241 | ou OO ou 242 | pa p a 243 | pai p ai 244 | pan p an 245 | pang p ang 246 | pao p ao 247 | pei p ei 248 | pen p en 249 | peng p eng 250 | pi p i 251 | pian p ian 252 | piao p iao 253 | pie p ie 254 | pin p in 255 | ping p ing 256 | po p o 257 | pou p ou 258 | pu p u 259 | qi q i 260 | qia q ia 261 | qian q ian 262 | qiang q iang 263 | qiao q iao 264 | qie q ie 265 | qin q in 266 | qing q ing 267 | qiong q iong 268 | qiu q iu 269 | qu q v 270 | qv q v 271 | quan q van 272 | qvan q van 273 | que q ve 274 | qve q ve 275 | qun q vn 276 | qvn q vn 277 | ran r an 278 | rang r ang 279 | rao r ao 280 | re r e 281 | ren r en 282 | reng r eng 283 | ri r ir 284 | rong r ong 285 | rou r ou 286 | ru r u 287 | rua r ua 288 | ruan r uan 289 | rui r ui 290 | run r un 291 | ruo r uo 292 | sa s a 293 | sai s ai 294 | san s an 295 | sang s ang 296 | sao s ao 297 | se s e 298 | sen s en 299 | seng s eng 300 | sha sh a 301 | shai sh ai 302 | shan sh an 303 | shang sh ang 304 | shao sh ao 305 | she sh e 306 | shei sh ei 307 | shen sh en 308 | sheng sh eng 309 | shi sh ir 310 | shou sh ou 311 | shu sh u 312 | shua sh ua 313 | shuai sh uai 314 | shuan sh uan 315 | shuang sh uang 316 | shui sh ui 317 | shun sh un 318 | shuo sh uo 319 | si s i0 320 | song s ong 321 | sou s ou 322 | su s u 323 | suan s uan 324 | sui s ui 325 | sun s un 326 | suo s uo 327 | ta t a 328 | tai t ai 329 | tan t an 330 | tang t ang 331 | tao t ao 332 | te t e 333 | tei t ei 334 | teng t eng 335 | ti t i 336 | tian t ian 337 | tiao t iao 338 | tie t ie 339 | ting t ing 340 | tong t ong 341 | tou t ou 342 | tu t u 343 | tuan t uan 344 | tui t ui 345 | tun t un 346 | tuo t uo 347 | wa w a 348 | wai w ai 349 | wan w an 350 | wang w ang 351 | wei w ei 352 | wen w en 353 | weng w eng 354 | wo w o 355 | wu w u 356 | xi x i 357 | xia x ia 358 | xian x ian 359 | xiang x iang 360 | xiao x iao 361 | xie x ie 362 | xin x in 363 | xing x ing 364 | xiong x iong 365 | xiu x iu 366 | xu x v 367 | xv x v 368 | xuan x van 369 | xvan x van 370 | xue x ve 371 | xve x ve 372 | xun x vn 373 | xvn x vn 374 | ya y a 375 | yan y En 376 | yang y ang 377 | yao y ao 378 | ye y E 379 | yi y i 380 | yin y in 381 | ying y ing 382 | yo y o 383 | yong y ong 384 | you y ou 385 | yu y v 386 | yv y v 387 | yuan y van 388 | yvan y van 389 | yue y ve 390 | yve y ve 391 | yun y vn 392 | yvn y vn 393 | za z a 394 | zai z ai 395 | zan z an 396 | zang z ang 397 | zao z ao 398 | ze z e 399 | zei z ei 400 | zen z en 401 | zeng z eng 402 | zha zh a 403 | zhai zh ai 404 | zhan zh an 405 | zhang zh ang 406 | zhao zh ao 407 | zhe zh e 408 | zhei zh ei 409 | zhen zh en 410 | zheng zh eng 411 | zhi zh ir 412 | zhong zh ong 413 | zhou zh ou 414 | zhu zh u 415 | zhua zh ua 416 | zhuai zh uai 417 | zhuan zh uan 418 | zhuang zh uang 419 | zhui zh ui 
420 | zhun zh un 421 | zhuo zh uo 422 | zi z i0 423 | zong z ong 424 | zou z ou 425 | zu z u 426 | zuan z uan 427 | zui z ui 428 | zun z un 429 | zuo z uo 430 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/text/symbols.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # punctuation = ['!', '?', '…', ",", ".","@"]#@是SP停顿 4 | punctuation = ["!", "?", "…", ",", "."] # @是SP停顿 5 | punctuation.append("-") 6 | pu_symbols = punctuation + ["SP", "SP2", "SP3", "UNK"] 7 | # pu_symbols = punctuation + ["SP", 'SP2', 'SP3','SP4', "UNK"] 8 | pad = "_" 9 | 10 | c = [ 11 | "AA", 12 | "EE", 13 | "OO", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "f", 19 | "g", 20 | "h", 21 | "j", 22 | "k", 23 | "l", 24 | "m", 25 | "n", 26 | "p", 27 | "q", 28 | "r", 29 | "s", 30 | "sh", 31 | "t", 32 | "w", 33 | "x", 34 | "y", 35 | "z", 36 | "zh", 37 | ] 38 | v = [ 39 | "E1", 40 | "En1", 41 | "a1", 42 | "ai1", 43 | "an1", 44 | "ang1", 45 | "ao1", 46 | "e1", 47 | "ei1", 48 | "en1", 49 | "eng1", 50 | "er1", 51 | "i1", 52 | "i01", 53 | "ia1", 54 | "ian1", 55 | "iang1", 56 | "iao1", 57 | "ie1", 58 | "in1", 59 | "ing1", 60 | "iong1", 61 | "ir1", 62 | "iu1", 63 | "o1", 64 | "ong1", 65 | "ou1", 66 | "u1", 67 | "ua1", 68 | "uai1", 69 | "uan1", 70 | "uang1", 71 | "ui1", 72 | "un1", 73 | "uo1", 74 | "v1", 75 | "van1", 76 | "ve1", 77 | "vn1", 78 | "E2", 79 | "En2", 80 | "a2", 81 | "ai2", 82 | "an2", 83 | "ang2", 84 | "ao2", 85 | "e2", 86 | "ei2", 87 | "en2", 88 | "eng2", 89 | "er2", 90 | "i2", 91 | "i02", 92 | "ia2", 93 | "ian2", 94 | "iang2", 95 | "iao2", 96 | "ie2", 97 | "in2", 98 | "ing2", 99 | "iong2", 100 | "ir2", 101 | "iu2", 102 | "o2", 103 | "ong2", 104 | "ou2", 105 | "u2", 106 | "ua2", 107 | "uai2", 108 | "uan2", 109 | "uang2", 110 | "ui2", 111 | "un2", 112 | "uo2", 113 | "v2", 114 | "van2", 115 | "ve2", 116 | "vn2", 117 | "E3", 118 | "En3", 119 | "a3", 120 | "ai3", 121 | "an3", 122 | "ang3", 123 | "ao3", 124 | "e3", 125 | "ei3", 126 | "en3", 127 | "eng3", 128 | "er3", 129 | "i3", 130 | "i03", 131 | "ia3", 132 | "ian3", 133 | "iang3", 134 | "iao3", 135 | "ie3", 136 | "in3", 137 | "ing3", 138 | "iong3", 139 | "ir3", 140 | "iu3", 141 | "o3", 142 | "ong3", 143 | "ou3", 144 | "u3", 145 | "ua3", 146 | "uai3", 147 | "uan3", 148 | "uang3", 149 | "ui3", 150 | "un3", 151 | "uo3", 152 | "v3", 153 | "van3", 154 | "ve3", 155 | "vn3", 156 | "E4", 157 | "En4", 158 | "a4", 159 | "ai4", 160 | "an4", 161 | "ang4", 162 | "ao4", 163 | "e4", 164 | "ei4", 165 | "en4", 166 | "eng4", 167 | "er4", 168 | "i4", 169 | "i04", 170 | "ia4", 171 | "ian4", 172 | "iang4", 173 | "iao4", 174 | "ie4", 175 | "in4", 176 | "ing4", 177 | "iong4", 178 | "ir4", 179 | "iu4", 180 | "o4", 181 | "ong4", 182 | "ou4", 183 | "u4", 184 | "ua4", 185 | "uai4", 186 | "uan4", 187 | "uang4", 188 | "ui4", 189 | "un4", 190 | "uo4", 191 | "v4", 192 | "van4", 193 | "ve4", 194 | "vn4", 195 | "E5", 196 | "En5", 197 | "a5", 198 | "ai5", 199 | "an5", 200 | "ang5", 201 | "ao5", 202 | "e5", 203 | "ei5", 204 | "en5", 205 | "eng5", 206 | "er5", 207 | "i5", 208 | "i05", 209 | "ia5", 210 | "ian5", 211 | "iang5", 212 | "iao5", 213 | "ie5", 214 | "in5", 215 | "ing5", 216 | "iong5", 217 | "ir5", 218 | "iu5", 219 | "o5", 220 | "ong5", 221 | "ou5", 222 | "u5", 223 | "ua5", 224 | "uai5", 225 | "uan5", 226 | "uang5", 227 | "ui5", 228 | "un5", 229 | "uo5", 230 | "v5", 231 | "van5", 232 | "ve5", 233 | "vn5", 234 | ] 235 | 236 | v_without_tone = [ 237 | "E", 238 | "En", 239 | 
"a", 240 | "ai", 241 | "an", 242 | "ang", 243 | "ao", 244 | "e", 245 | "ei", 246 | "en", 247 | "eng", 248 | "er", 249 | "i", 250 | "i0", 251 | "ia", 252 | "ian", 253 | "iang", 254 | "iao", 255 | "ie", 256 | "in", 257 | "ing", 258 | "iong", 259 | "ir", 260 | "iu", 261 | "o", 262 | "ong", 263 | "ou", 264 | "u", 265 | "ua", 266 | "uai", 267 | "uan", 268 | "uang", 269 | "ui", 270 | "un", 271 | "uo", 272 | "v", 273 | "van", 274 | "ve", 275 | "vn", 276 | ] 277 | 278 | # japanese 279 | ja_symbols = [ 280 | "I", 281 | "N", 282 | "U", 283 | "a", 284 | "b", 285 | "by", 286 | "ch", 287 | "cl", 288 | "d", 289 | "dy", 290 | "e", 291 | "f", 292 | "g", 293 | "gy", 294 | "h", 295 | "hy", 296 | "i", 297 | "j", 298 | "k", 299 | "ky", 300 | "m", 301 | "my", 302 | "n", 303 | "ny", 304 | "o", 305 | "p", 306 | "py", 307 | "r", 308 | "ry", 309 | "s", 310 | "sh", 311 | "t", 312 | "ts", 313 | "u", 314 | "v", 315 | "w", 316 | "y", 317 | "z", 318 | # "[", #上升调型 319 | # "]", #下降调型 320 | # "$", #结束符 321 | # "^", #开始符 322 | ] 323 | 324 | arpa = { 325 | "AH0", 326 | "S", 327 | "AH1", 328 | "EY2", 329 | "AE2", 330 | "EH0", 331 | "OW2", 332 | "UH0", 333 | "NG", 334 | "B", 335 | "G", 336 | "AY0", 337 | "M", 338 | "AA0", 339 | "F", 340 | "AO0", 341 | "ER2", 342 | "UH1", 343 | "IY1", 344 | "AH2", 345 | "DH", 346 | "IY0", 347 | "EY1", 348 | "IH0", 349 | "K", 350 | "N", 351 | "W", 352 | "IY2", 353 | "T", 354 | "AA1", 355 | "ER1", 356 | "EH2", 357 | "OY0", 358 | "UH2", 359 | "UW1", 360 | "Z", 361 | "AW2", 362 | "AW1", 363 | "V", 364 | "UW2", 365 | "AA2", 366 | "ER", 367 | "AW0", 368 | "UW0", 369 | "R", 370 | "OW1", 371 | "EH1", 372 | "ZH", 373 | "AE0", 374 | "IH2", 375 | "IH", 376 | "Y", 377 | "JH", 378 | "P", 379 | "AY1", 380 | "EY0", 381 | "OY2", 382 | "TH", 383 | "HH", 384 | "D", 385 | "ER0", 386 | "CH", 387 | "AO1", 388 | "AE1", 389 | "AO2", 390 | "OY1", 391 | "AY2", 392 | "IH1", 393 | "OW0", 394 | "L", 395 | "SH", 396 | } 397 | 398 | symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa) 399 | symbols = sorted(set(symbols)) 400 | if __name__ == "__main__": 401 | print(len(symbols)) 402 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/text/zh_normalization/README.md: -------------------------------------------------------------------------------- 1 | ## Supported NSW (Non-Standard-Word) Normalization 2 | 3 | |NSW type|raw|normalized| 4 | |:--|:-|:-| 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| 6 | |cardinal|这块黄金重达324.75克
<br>我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分| 7 |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二| 8 |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日| 9 |time|等会请在12:05请通知我|等会请在十二点零五分请通知我 10 |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度 11 |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| 12 |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| 13 |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万| 14 |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>
这是手机八六一八五四四一三九一二一| 15 | ## References 16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) 17 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/text/zh_normalization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from text.zh_normalization.text_normlization import * 15 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/text/zh_normalization/chronology.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import DIGITS 17 | from .num import num2str 18 | from .num import verbalize_cardinal 19 | from .num import verbalize_digit 20 | 21 | 22 | def _time_num2str(num_string: str) -> str: 23 | """A special case for verbalizing number in time.""" 24 | result = num2str(num_string.lstrip('0')) 25 | if num_string.startswith('0'): 26 | result = DIGITS['0'] + result 27 | return result 28 | 29 | 30 | # 时刻表达式 31 | RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' 32 | r':([0-5][0-9])' 33 | r'(:([0-5][0-9]))?') 34 | 35 | # 时间范围,如8:30-12:30 36 | RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' 37 | r':([0-5][0-9])' 38 | r'(:([0-5][0-9]))?' 
39 | r'(~|-)' 40 | r'([0-1]?[0-9]|2[0-3])' 41 | r':([0-5][0-9])' 42 | r'(:([0-5][0-9]))?') 43 | 44 | 45 | def replace_time(match) -> str: 46 | """ 47 | Args: 48 | match (re.Match) 49 | Returns: 50 | str 51 | """ 52 | 53 | is_range = len(match.groups()) > 5 54 | 55 | hour = match.group(1) 56 | minute = match.group(2) 57 | second = match.group(4) 58 | 59 | if is_range: 60 | hour_2 = match.group(6) 61 | minute_2 = match.group(7) 62 | second_2 = match.group(9) 63 | 64 | result = f"{num2str(hour)}点" 65 | if minute.lstrip('0'): 66 | if int(minute) == 30: 67 | result += "半" 68 | else: 69 | result += f"{_time_num2str(minute)}分" 70 | if second and second.lstrip('0'): 71 | result += f"{_time_num2str(second)}秒" 72 | 73 | if is_range: 74 | result += "至" 75 | result += f"{num2str(hour_2)}点" 76 | if minute_2.lstrip('0'): 77 | if int(minute) == 30: 78 | result += "半" 79 | else: 80 | result += f"{_time_num2str(minute_2)}分" 81 | if second_2 and second_2.lstrip('0'): 82 | result += f"{_time_num2str(second_2)}秒" 83 | 84 | return result 85 | 86 | 87 | RE_DATE = re.compile(r'(\d{4}|\d{2})年' 88 | r'((0?[1-9]|1[0-2])月)?' 89 | r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?') 90 | 91 | 92 | def replace_date(match) -> str: 93 | """ 94 | Args: 95 | match (re.Match) 96 | Returns: 97 | str 98 | """ 99 | year = match.group(1) 100 | month = match.group(3) 101 | day = match.group(5) 102 | result = "" 103 | if year: 104 | result += f"{verbalize_digit(year)}年" 105 | if month: 106 | result += f"{verbalize_cardinal(month)}月" 107 | if day: 108 | result += f"{verbalize_cardinal(day)}{match.group(9)}" 109 | return result 110 | 111 | 112 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 113 | RE_DATE2 = re.compile( 114 | r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])') 115 | 116 | 117 | def replace_date2(match) -> str: 118 | """ 119 | Args: 120 | match (re.Match) 121 | Returns: 122 | str 123 | """ 124 | year = match.group(1) 125 | month = match.group(3) 126 | day = match.group(4) 127 | result = "" 128 | if year: 129 | result += f"{verbalize_digit(year)}年" 130 | if month: 131 | result += f"{verbalize_cardinal(month)}月" 132 | if day: 133 | result += f"{verbalize_cardinal(day)}日" 134 | return result 135 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/text/zh_normalization/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
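The regular expressions and `replace_*` callbacks in chronology.py above are meant to be driven through `re.sub` (this is how the package's text normalizer applies them). A minimal usage sketch; the `sys.path` line and import path are assumptions based on the repository layout, with the repository root as the working directory:

```
import sys

sys.path.append("Adapters/gsv_fast/TTS_infer_pack")  # assumption: run from the repository root

from text.zh_normalization.chronology import RE_TIME, RE_DATE, replace_time, replace_date

sentence = "等会请在12:05通知我,她出生于86年8月18日"
sentence = RE_TIME.sub(replace_time, sentence)   # 12:05       -> 十二点零五分
sentence = RE_DATE.sub(replace_date, sentence)   # 86年8月18日 -> 八六年八月十八日
print(sentence)
```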
14 | import re 15 | import string 16 | 17 | from pypinyin.constants import SUPPORT_UCS4 18 | 19 | # 全角半角转换 20 | # 英文字符全角 -> 半角映射表 (num: 52) 21 | F2H_ASCII_LETTERS = { 22 | ord(char) + 65248: ord(char) 23 | for char in string.ascii_letters 24 | } 25 | 26 | # 英文字符半角 -> 全角映射表 27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} 28 | 29 | # 数字字符全角 -> 半角映射表 (num: 10) 30 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits} 31 | # 数字字符半角 -> 全角映射表 32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} 33 | 34 | # 标点符号全角 -> 半角映射表 (num: 32) 35 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} 36 | # 标点符号半角 -> 全角映射表 37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} 38 | 39 | # 空格 (num: 1) 40 | F2H_SPACE = {'\u3000': ' '} 41 | H2F_SPACE = {' ': '\u3000'} 42 | 43 | # 非"有拼音的汉字"的字符串,可用于NSW提取 44 | if SUPPORT_UCS4: 45 | RE_NSW = re.compile(r'(?:[^' 46 | r'\u3007' # 〇 47 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 48 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 49 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 50 | r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] 51 | r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] 52 | r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] 53 | r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] 54 | r'])+') 55 | else: 56 | RE_NSW = re.compile( # pragma: no cover 57 | r'(?:[^' 58 | r'\u3007' # 〇 59 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 60 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 61 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 62 | r'])+') 63 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/text/zh_normalization/phonecode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import verbalize_digit 17 | 18 | # 规范化固话/手机号码 19 | # 手机 20 | # http://www.jihaoba.com/news/show/13680 21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 22 | # 联通:130、131、132、156、155、186、185、176 23 | # 电信:133、153、189、180、181、177 24 | RE_MOBILE_PHONE = re.compile( 25 | r"(? 
str: 34 | if mobile: 35 | sp_parts = phone_string.strip('+').split() 36 | result = ','.join( 37 | [verbalize_digit(part, alt_one=True) for part in sp_parts]) 38 | return result 39 | else: 40 | sil_parts = phone_string.split('-') 41 | result = ','.join( 42 | [verbalize_digit(part, alt_one=True) for part in sil_parts]) 43 | return result 44 | 45 | 46 | def replace_phone(match) -> str: 47 | """ 48 | Args: 49 | match (re.Match) 50 | Returns: 51 | str 52 | """ 53 | return phone2str(match.group(0), mobile=False) 54 | 55 | 56 | def replace_mobile(match) -> str: 57 | """ 58 | Args: 59 | match (re.Match) 60 | Returns: 61 | str 62 | """ 63 | return phone2str(match.group(0)) 64 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/TTS_infer_pack/text/zh_normalization/quantifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import num2str 17 | 18 | # 温度表达式,温度会影响负号的读法 19 | # -3°C 零下三度 20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') 21 | measure_dict = { 22 | "cm2": "平方厘米", 23 | "cm²": "平方厘米", 24 | "cm3": "立方厘米", 25 | "cm³": "立方厘米", 26 | "cm": "厘米", 27 | "db": "分贝", 28 | "ds": "毫秒", 29 | "kg": "千克", 30 | "km": "千米", 31 | "m2": "平方米", 32 | "m²": "平方米", 33 | "m³": "立方米", 34 | "m3": "立方米", 35 | "ml": "毫升", 36 | "m": "米", 37 | "mm": "毫米", 38 | "s": "秒" 39 | } 40 | 41 | 42 | def replace_temperature(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | sign = match.group(1) 50 | temperature = match.group(2) 51 | unit = match.group(3) 52 | sign: str = "零下" if sign else "" 53 | temperature: str = num2str(temperature) 54 | unit: str = "摄氏度" if unit == "摄氏度" else "度" 55 | result = f"{sign}{temperature}{unit}" 56 | return result 57 | 58 | 59 | def replace_measure(sentence) -> str: 60 | for q_notation in measure_dict: 61 | if q_notation in sentence: 62 | sentence = sentence.replace(q_notation, measure_dict[q_notation]) 63 | return sentence 64 | -------------------------------------------------------------------------------- /Adapters/gsv_fast/__init__.py: -------------------------------------------------------------------------------- 1 | from .gsv_adapter import GSV_Instance as TTS_Instance -------------------------------------------------------------------------------- /Adapters/gsv_fast/gsv_task.py: -------------------------------------------------------------------------------- 1 | 2 | import os, json, sys 3 | sys.path.append(".") 4 | 5 | from uuid import uuid4 6 | from typing import List, Dict, Literal 7 | import urllib.parse 8 | import hashlib 9 | 10 | 11 | def get_params_config(): 12 | try: 13 | with open(os.path.join("configs/gsv_fast", "params_config.json"), "r", encoding="utf-8") as f: 14 | return json.load(f) 15 | except: 16 | raise FileNotFoundError("params_config.json not 
found.") 17 | 18 | params_config = get_params_config() 19 | 20 | from Adapters.base import Base_TTS_Task 21 | 22 | class GSV_TTS_Task(Base_TTS_Task): 23 | 24 | def __init__(self, other_task=None): 25 | super().__init__(other_task) 26 | 27 | self.params_config:dict = params_config 28 | self.disabled_features: List[str] = [] 29 | 30 | self.character: str = self.params_config["character"]["default"] if other_task is None else other_task.character 31 | self.emotion: str = self.params_config["emotion"]["default"] if other_task is None else other_task.emotion 32 | self.text_language: str = self.params_config["text_language"]["default"] if other_task is None else other_task.text_language 33 | self.speaker_id: int = self.params_config["speaker_id"]["default"] if other_task is None else other_task.speaker_id 34 | self.batch_size: int = self.params_config["batch_size"]["default"] if other_task is None else other_task.batch_size 35 | self.top_k: int = self.params_config["top_k"]["default"] if other_task is None else other_task.top_k 36 | self.top_p: float = self.params_config["top_p"]["default"] if other_task is None else other_task.top_p 37 | self.temperature: float = self.params_config["temperature"]["default"] if other_task is None else other_task.temperature 38 | self.cut_method: str = self.params_config["cut_method"]["default"] if other_task is None else other_task.cut_method 39 | self.seed: int = self.params_config["seed"]["default"] if other_task is None else other_task.seed 40 | 41 | # 通用属性 42 | self.sample_rate: int = 32000 # 采样率, gsv底模为32000, 因此只能是32000 43 | self.format: str = self.params_config["format"]["default"] if other_task is None else other_task.format 44 | self.stream: bool = self.params_config["stream"]["default"] if other_task is None else other_task.stream 45 | self.loudness: float = self.params_config["loudness"]["default"] if other_task is None else other_task.loudness 46 | self.speed: float = self.params_config["speed"]["default"] if other_task is None else other_task.speed 47 | self.save_temp: bool = self.params_config["save_temp"]["default"] if other_task is None else other_task.save_temp 48 | 49 | 50 | def load_from_dict(self, data: dict={}): 51 | 52 | assert self.params_config is not None, "params_config.json not found." 
53 | 54 | super().load_from_dict(data) 55 | 56 | # 参数提取 57 | if self.task_type == "text": 58 | self.text = self.get_param_value('text', data).strip() 59 | 60 | self.character = self.get_param_value('character', data) 61 | self.speaker_id = self.get_param_value('speaker_id', data) 62 | 63 | self.text_language = self.get_param_value('text_language', data) 64 | self.batch_size = self.get_param_value('batch_size', data) 65 | self.speed = self.get_param_value('speed', data) 66 | self.top_k = self.get_param_value('top_k', data) 67 | self.top_p = self.get_param_value('top_p', data) 68 | self.temperature = self.get_param_value('temperature', data) 69 | self.seed = self.get_param_value('seed', data) 70 | 71 | self.emotion = self.get_param_value('emotion', data) 72 | self.cut_method = self.get_param_value('cut_method', data) 73 | if self.cut_method == "auto_cut": 74 | self.cut_method = f"auto_cut_100" 75 | else: 76 | self.ssml = self.get_param_value('ssml', data).strip() 77 | 78 | 79 | def md5(self): 80 | m = hashlib.md5() 81 | if self.task_type == "audio": 82 | m.update(self.src.encode()) 83 | elif self.task_type == "ssml": 84 | m.update(self.ssml.encode()) 85 | elif self.task_type == "text": 86 | m.update(self.variation.encode()) 87 | m.update(self.text.encode()) 88 | m.update(self.text_language.encode()) 89 | m.update(self.character.encode()) 90 | m.update(str(self.speaker_id).encode()) 91 | m.update(str(self.speed).encode()) 92 | m.update(str(self.top_k).encode()) 93 | m.update(str(self.top_p).encode()) 94 | m.update(str(self.temperature).encode()) 95 | m.update(str(self.cut_method).encode()) 96 | m.update(str(self.emotion).encode()) 97 | return m.hexdigest() 98 | 99 | def updateVariation(self): 100 | self.variation = str(uuid4()) 101 | 102 | def to_dict(self): 103 | return { 104 | "text": self.text, 105 | "text_language": self.text_language, 106 | "character_emotion": self.emotion, 107 | "batch_size": self.batch_size, 108 | "speed": self.speed, 109 | "top_k": self.top_k, 110 | "top_p": self.top_p, 111 | "temperature": self.temperature, 112 | "cut_method": self.cut_method, 113 | "format": self.format, 114 | "seed": self.seed, 115 | 116 | "stream": self.stream, 117 | "loudness": self.loudness, 118 | "save_temp": self.save_temp, 119 | 120 | } 121 | 122 | def __str__(self): 123 | character = self.character 124 | json_content = json.dumps(self.to_dict(), ensure_ascii=False) # ensure_ascii=False to properly display non-ASCII characters 125 | return f"----------------TTS Task--------------\ncharacter: {character}, content: {json_content}\n--------------------------------------" 126 | 127 | 128 | if __name__ == "__main__": 129 | sys.path.append(".") 130 | task = GSV_TTS_Task() 131 | print(task) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 XTer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 文档待更新 2 | 请暂时不要用这个项目 3 | 4 | 如果想使用GPT-soVITS的推理特化版,请直接看:https://github.com/X-T-E-R/GPT-SoVITS-Inference 5 | 6 | 本项目意图在于让使用各类语音合成引擎的方式变得统一,支持多种语音合成引擎适配器,允许直接作为模组使用或启动后端服务。 7 | 8 | 目前已经实现的部分: 9 | 1. 框架 10 | 2. 解析器:参数解析器(可指定别名、已反向兼容很多其它项目)、类微软请求的解析器 11 | 3. Adapter:GPT-soVITS Adapter 12 | 4. fastAPI的返回:File、Stream 13 | 14 | 文档待更新 15 | 请暂时不要用这个项目 16 | 17 | ## Credits 18 | 19 | ### 整段使用的代码: 20 | 1. 内部的 `Adapters/gsv_fast` 文件夹主要来自[GPT-soVITS](https://github.com/RVC-Boss/GPT-SoVITS)项目的`fast_inference_`分支 21 | 2. `tools/i18n` 魔改自 [GSVI](https://github.com/X-T-E-R/GPT-SoVITS-Inference) ,基于 [i18n](https://github.com/RVC-Boss/GPT-SoVITS/tree/main/tools/i18n) 22 | 23 | ### 感谢所有有关项目与贡献者 24 | 25 | #### Theoretical 26 | 27 | - [ar-vits](https://github.com/innnky/ar-vits) 28 | - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR) 29 | - [vits](https://github.com/jaywalnut310/vits) 30 | - [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556) 31 | - [contentvec](https://github.com/auspicious3000/contentvec/) 32 | - [hifi-gan](https://github.com/jik876/hifi-gan) 33 | - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41) 34 | 35 | #### Pretrained Models 36 | 37 | - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain) 38 | - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large) 39 | 40 | #### Text Frontend for Inference 41 | 42 | - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization) 43 | - [LangSegment](https://github.com/juntaosun/LangSegment) 44 | 45 | #### WebUI Tools 46 | 47 | - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui) 48 | - [audio-slicer](https://github.com/openvpi/audio-slicer) 49 | - [SubFix](https://github.com/cronrpc/SubFix) 50 | - [FFmpeg](https://github.com/FFmpeg/FFmpeg) 51 | - [gradio](https://github.com/gradio-app/gradio) 52 | - [faster-whisper](https://github.com/SYSTRAN/faster-whisper) 53 | - [FunASR](https://github.com/alibaba-damo-academy/FunASR) -------------------------------------------------------------------------------- /WebUIs/GSVI/i18n/locale/en_US.json: -------------------------------------------------------------------------------- 1 | { 2 | ", 返回内容:": ", Return Content:", 3 | "

这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面

若有疑问或需要进一步了解,可参考文档:点击查看详细文档

": "

This is the model management interface. It allows you to assign emotions to multiple reference audio segments. If you only have one segment, you can skip using this interface.

If you have questions or need further information, please refer to the documentation: Click to view detailed documentation.

", 4 | "Endpoint": "Endpoint", 5 | "GPT模型路径": "GPT Model Path", 6 | "Sovits模型路径": "SoVITS Model Path", 7 | "Temperature": "Temperature", 8 | "Top K": "Top K", 9 | "Top P": "Top P", 10 | "all_ja": "Japanese Only", 11 | "all_zh": "Chinese Only", 12 | "auto": "Auto Detect", 13 | "auto_cut": "Smart Split", 14 | "batch_size,1代表不并行,越大越快,但是越可能出问题": "Batch Size: 1 means no parallel processing. Larger values are faster but more prone to issues.", 15 | "cut0": "Split by Line Break Only", 16 | "cut1": "Group Four Sentences Together", 17 | "cut2": "Group 50 Characters Together", 18 | "cut3": "Split by Chinese Period", 19 | "cut4": "Split by English Period", 20 | "cut5": "Split by Punctuation", 21 | "en": "English", 22 | "https://space.bilibili.com/66633770": "https://github.com/X-T-E-R", 23 | "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp", 24 | "ja": "Japanese", 25 | "json设置(一般不动)": "JSON Settings (Do not change it unless you know what you are doing)", 26 | "zh": "Chinese", 27 | "不切": "Do Not Split", 28 | "人物情感列表网址": "Character Emotion List URL", 29 | "从json中读取": "Read from JSON", 30 | "使用前,请确认后端服务已启动。": "Before using, please ensure the backend service is running.", 31 | "保存json\n(可能不会有完成提示,没报错就是成功)": "Save JSON\n(There may not be a completion notice; no error means success)", 32 | "保存失败!": "Save Failed!", 33 | "保存成功!": "Save Successful!", 34 | "停止播放": "Stop Playback", 35 | "切句方式": "Sentence Splitting Method", 36 | "前端处理后的文本(每句):": "Front-end Processed Text (Per Sentence):", 37 | "参考音频在3~10秒范围外,请更换!": "Reference audio is outside the 3-10 second range. Please replace it!", 38 | "参考音频路径": "Reference Audio Path", 39 | "发送json格式": "Send in JSON", 40 | "发送并开始播放": "Send and Start Playback", 41 | "发送请求": "Send Request", 42 | "发送请求到": "Send Request to", 43 | "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "Missing or swallowed words are normal. Severe issues can be resolved by adding new lines or periods, or adjusting the batch size.", 44 | "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "Missing or swallowed words are normal. Severe issues can be resolved by adding new lines or periods, changing the reference audio (using the model management interface), or adjusting the batch size.", 45 | "基础选项": "Basic Options", 46 | "实际输入的参考文本:": "Actual Reference Text Input:", 47 | "实际输入的目标文本(切句后):": "Actual Target Text Input (After Splitting):", 48 | "实际输入的目标文本(每句):": "Actual Target Text Input (Per Sentence):", 49 | "实际输入的目标文本:": "Actual Target Text Input:", 50 | "密码": "Password", 51 | "当前人物": "Current Character", 52 | "当前人物变更为: ": "Current Character Changed to: ", 53 | "您在使用经典推理模式,部分选项不可用": "You are using Classic Inference Mode. Some options are unavailable.", 54 | "情感列表": "Emotion", 55 | "情感风格": "Emotion", 56 | "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "Five little monkeys jumping on the bed, one fell off and bumped his head. Mama called the doctor, and the doctor said, \"No more monkeys jumping on the bed!\"", 57 | "扫描": "Scan", 58 | "扫描人物列表": "Scan Character List", 59 | "扫描模型文件夹:": "Scan Model Folder:", 60 | "找不到模型文件!请把有效文件放置在文件夹下!!!": "Model file not found! 
Please place valid files in the folder!!!", 61 | "提供的推理特化包,当前版本:": ", Current Version: ", 62 | "提示": "Tip", 63 | "提示文本": "Prompt Text", 64 | "提示语言": "Prompt Language", 65 | "文件打开失败,保存失败!": "File Opening Failed, Save Failed!", 66 | "文本语言": "Text Language", 67 | "是否自动匹配情感": "Automatically Match Emotions", 68 | "模型文件夹路径": "Model Folder Path", 69 | "每句允许最大切分字词数": "Max Words per Split Sentence", 70 | "流式音频": "Streaming Audio", 71 | "添加情感": "Add Emotion", 72 | "点击查看详细文档": "Click to View Detailed Documentation", 73 | "版本": "Version", 74 | "用户名": "Username", 75 | "种子": "Seed", 76 | "简介": "Introduction", 77 | "缺失某些项,保存失败!": "Missing Some Items, Save Failed!", 78 | "网址设置": "URL Settings", 79 | "自动生成info": "Auto Generate Info", 80 | "若有疑问或需要进一步了解,可参考文档:": "If you have questions or need further information, please refer to the documentation: ", 81 | "认证信息": "Authentication Info", 82 | "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "Authentication is enabled. You can disable it in config.json.\nHowever, this feature is not fully implemented yet and is just for show.", 83 | "语速": "Speed", 84 | "请修改后点击下方按钮进行保存": "Please modify and click the button below to save", 85 | "请求失败,状态码:": "Request Failed, Status Code:", 86 | "请求失败,请检查URL是否正确": "Request Failed. Please check if the URL is correct.", 87 | "请求完整音频": "Request Complete Audio", 88 | "请求网址": "Request URL", 89 | "输入文本": "Input Text", 90 | "这是一个由": "This is a Inference Specialization Package provided by ", 91 | "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "This is a configuration file for https://github.com/X-T-E-R/TTS-for-GPT-soVITS, a simple and easy-to-use frontend and backend project", 92 | "这是展示页面的版本,并未使用后端服务,下面参数无效。": "This is a demonstration page version and does not utilize backend services, the parameters below are invalid.", 93 | "选择角色": "Select Character", 94 | "音频输出": "Audio Output", 95 | "音频预览": "Audio Preview", 96 | "项目开源地址:": "Github Link: ", 97 | "高级选项": "Advanced Options", 98 | "最大允许长度": "Max Length Allowed" 99 | } 100 | -------------------------------------------------------------------------------- /WebUIs/GSVI/i18n/locale/zh_CN.json: -------------------------------------------------------------------------------- 1 | { 2 | ", 返回内容:": ", 返回内容:", 3 | "

这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面

若有疑问或需要进一步了解,可参考文档:点击查看详细文档

": "

这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面

若有疑问或需要进一步了解,可参考文档:点击查看详细文档

", 4 | "Endpoint": "Endpoint", 5 | "GPT模型路径": "GPT模型路径", 6 | "Sovits模型路径": "Sovits模型路径", 7 | "Temperature": "Temperature", 8 | "Top K": "Top K", 9 | "Top P": "Top P", 10 | "all_ja": "只有日文", 11 | "all_zh": "只有中文", 12 | "auto": "自动判断", 13 | "auto_cut": "智能切分", 14 | "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不并行,越大越快,但是越可能出问题", 15 | "cut0": "仅凭换行切分", 16 | "cut1": "凑四句一切", 17 | "cut2": "凑50字一切", 18 | "cut3": "按中文句号。切", 19 | "cut4": "按英文句号.切", 20 | "cut5": "按标点符号切", 21 | "en": "英文", 22 | "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770", 23 | "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp", 24 | "ja": "日文", 25 | "json设置(一般不动)": "json设置(一般不动)", 26 | "zh": "中文", 27 | "不切": "不切", 28 | "人物情感列表网址": "人物情感列表网址", 29 | "从json中读取": "从json中读取", 30 | "使用前,请确认后端服务已启动。": "使用前,请确认后端服务已启动。", 31 | "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不会有完成提示,没报错就是成功)", 32 | "保存失败!": "保存失败!", 33 | "保存成功!": "保存成功!", 34 | "停止播放": "停止播放", 35 | "切句方式": "切句方式", 36 | "前端处理后的文本(每句):": "前端处理后的文本(每句):", 37 | "参考音频在3~10秒范围外,请更换!": "参考音频在3~10秒范围外,请更换!", 38 | "参考音频路径": "参考音频路径", 39 | "发送json格式": "发送json格式", 40 | "发送并开始播放": "发送并开始播放", 41 | "发送请求": "发送请求", 42 | "发送请求到": "发送请求到", 43 | "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。", 44 | "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。", 45 | "基础选项": "基础选项", 46 | "实际输入的参考文本:": "实际输入的参考文本:", 47 | "实际输入的目标文本(切句后):": "实际输入的目标文本(切句后):", 48 | "实际输入的目标文本(每句):": "实际输入的目标文本(每句):", 49 | "实际输入的目标文本:": "实际输入的目标文本:", 50 | "密码": "密码", 51 | "当前人物": "当前人物", 52 | "当前人物变更为: ": "当前人物变更为: ", 53 | "您在使用经典推理模式,部分选项不可用": "您在使用经典推理模式,部分选项不可用", 54 | "情感列表": "情感列表", 55 | "情感风格": "情感风格", 56 | "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。", 57 | "扫描": "扫描", 58 | "扫描人物列表": "扫描人物列表", 59 | "扫描模型文件夹:": "扫描模型文件夹:", 60 | "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!请把有效文件放置在文件夹下!!!", 61 | "提供的推理特化包,当前版本:": "提供的推理特化包,当前版本:", 62 | "提示": "提示", 63 | "提示文本": "提示文本", 64 | "提示语言": "提示语言", 65 | "文件打开失败,保存失败!": "文件打开失败,保存失败!", 66 | "文本语言": "文本语言", 67 | "是否自动匹配情感": "是否自动匹配情感", 68 | "模型文件夹路径": "模型文件夹路径", 69 | "每句允许最大切分字词数": "每句允许最大切分字词数", 70 | "流式音频": "流式音频", 71 | "添加情感": "添加情感", 72 | "点击查看详细文档": "点击查看详细文档", 73 | "版本": "版本", 74 | "用户名": "用户名", 75 | "种子": "种子", 76 | "简介": "简介", 77 | "缺失某些项,保存失败!": "缺失某些项,保存失败!", 78 | "网址设置": "网址设置", 79 | "自动生成info": "自动生成info", 80 | "若有疑问或需要进一步了解,可参考文档:": "若有疑问或需要进一步了解,可参考文档:", 81 | "认证信息": "认证信息", 82 | "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设", 83 | "语速": "语速", 84 | "请修改后点击下方按钮进行保存": "请修改后点击下方按钮进行保存", 85 | "请求失败,状态码:": "请求失败,状态码:", 86 | "请求失败,请检查URL是否正确": "请求失败,请检查URL是否正确", 87 | "请求完整音频": "请求完整音频", 88 | "请求网址": "请求网址", 89 | "输入文本": "输入文本", 90 | "这是一个由": "这是一个由", 91 | "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目", 92 | "这是展示页面的版本,并未使用后端服务,下面参数无效。": "这是展示页面的版本,并未使用后端服务,下面参数无效。", 93 | "选择角色": "选择角色", 94 | "音频输出": "音频输出", 95 | "音频预览": "音频预览", 96 | "项目开源地址:": "项目开源地址:", 97 | "高级选项": "高级选项", 98 | "最大允许长度": "最大允许长度" 99 | } 100 | -------------------------------------------------------------------------------- /WebUIs/GSVI/i18n/locale/zh_TW.json: -------------------------------------------------------------------------------- 1 | { 2 | ", 返回内容:": ", 返回內容:", 3 | "

这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面

若有疑问或需要进一步了解,可参考文档:点击查看详细文档

": "

這是模型管理介面,為了實現對多段參考音頻分配情緒設計,如果您只有一段可不使用這個介面

若有疑問或需要進一步了解,可參考文件:點擊查看詳細文件

", 4 | "Endpoint": "Endpoint", 5 | "GPT模型路径": "GPT模型路徑", 6 | "Sovits模型路径": "Sovits模型路徑", 7 | "Temperature": "Temperature", 8 | "Top K": "Top K", 9 | "Top P": "Top P", 10 | "all_ja": "僅日文", 11 | "all_zh": "僅中文", 12 | "auto": "自動判斷", 13 | "auto_cut": "智慧切分", 14 | "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不並行,越大越快,但是越可能出現問題", 15 | "cut0": "僅憑換行切分", 16 | "cut1": "湊四句一切", 17 | "cut2": "湊50字一切", 18 | "cut3": "按中文句號。切", 19 | "cut4": "按英文句號.切", 20 | "cut5": "按標點符號切", 21 | "en": "英文", 22 | "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770", 23 | "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp", 24 | "ja": "日文", 25 | "json设置(一般不动)": "json設置(一般不動)", 26 | "zh": "中文", 27 | "不切": "不切", 28 | "人物情感列表网址": "人物情緒列表網址", 29 | "从json中读取": "從json中讀取", 30 | "使用前,请确认后端服务已启动。": "使用前,請確認後端服務已啟動。", 31 | "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不會有完成提示,沒報錯就是成功)", 32 | "保存失败!": "保存失敗!", 33 | "保存成功!": "保存成功!", 34 | "停止播放": "停止播放", 35 | "切句方式": "切句方式", 36 | "前端处理后的文本(每句):": "前端處理後的文本(每句):", 37 | "参考音频在3~10秒范围外,请更换!": "參考音頻在3~10秒範圍外,請更換!", 38 | "参考音频路径": "參考音頻路徑", 39 | "发送json格式": "發送json格式", 40 | "发送并开始播放": "發送並開始播放", 41 | "发送请求": "發送請求", 42 | "发送请求到": "發送請求到", 43 | "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或調節batch size滑條。", 44 | "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或者更換參考音頻(使用模型管理介面)、調節下方batch size滑條。", 45 | "基础选项": "基礎選項", 46 | "实际输入的参考文本:": "實際輸入的參考文本:", 47 | "实际输入的目标文本(切句后):": "實際輸入的目標文本(切句後):", 48 | "实际输入的目标文本(每句):": "實際輸入的目標文本(每句):", 49 | "实际输入的目标文本:": "實際輸入的目標文本:", 50 | "密码": "密碼", 51 | "当前人物": "當前人物", 52 | "当前人物变更为: ": "當前人物變更為: ", 53 | "您在使用经典推理模式,部分选项不可用": "您在使用經典推理模式,部分選項不可用", 54 | "情感列表": "情緒列表", 55 | "情感风格": "情緒風格", 56 | "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "有時掉進黑洞,有時候爬上彩虹。在下一秒鐘,命運如何轉動,沒有人會曉得。我說希望無窮,你猜美夢成空,相信和懷疑,總要決鬥。", 57 | "扫描": "掃描", 58 | "扫描人物列表": "掃描人物列表", 59 | "扫描模型文件夹:": "掃描模型文件夾:", 60 | "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!請把有效文件放置在文件夾下!!!", 61 | "提供的推理特化包,当前版本:": "提供的推理特化包,當前版本:", 62 | "提示": "提示", 63 | "提示文本": "提示文本", 64 | "提示语言": "提示語言", 65 | "文件打开失败,保存失败!": "文件開啟失敗,保存失敗!", 66 | "文本语言": "文本語言", 67 | "是否自动匹配情感": "是否自動匹配情緒", 68 | "模型文件夹路径": "模型文件夾路徑", 69 | "每句允许最大切分字词数": "每句允許最大切分字詞數", 70 | "流式音频": "流式音頻", 71 | "添加情感": "添加情緒", 72 | "点击查看详细文档": "點擊查看詳細文件", 73 | "版本": "版本", 74 | "用户名": "使用者名稱", 75 | "种子": "種子", 76 | "简介": "簡介", 77 | "缺失某些项,保存失败!": "缺失某些項,保存失敗!", 78 | "网址设置": "網址設置", 79 | "自动生成info": "自動生成info", 80 | "若有疑问或需要进一步了解,可参考文档:": "若有疑問或需要進一步了解,可參考文件:", 81 | "认证信息": "認證信息", 82 | "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "認證信息已啟用,您可以在config.json中關閉。\n但是這個功能還沒做好,只是擺設", 83 | "语速": "語速", 84 | "请修改后点击下方按钮进行保存": "請修改後點擊下方按鈕進行保存", 85 | "请求失败,状态码:": "請求失敗,狀態碼:", 86 | "请求失败,请检查URL是否正确": "請求失敗,請檢查URL是否正確", 87 | "请求完整音频": "請求完整音頻", 88 | "请求网址": "請求網址", 89 | "输入文本": "輸入文本", 90 | "这是一个由": "這是一個由", 91 | "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "這是一個配置文件適用於https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一個簡單好用的前後端項目", 92 | "这是展示页面的版本,并未使用后端服务,下面参数无效。": "這是展示頁面的版本,並未使用後端服務,下面參數無效。", 93 | "选择角色": "選擇角色", 94 | "音频输出": "音頻輸出", 95 | "音频预览": "音頻預覽", 96 | "项目开源地址:": "Github Link:", 97 | "高级选项": "高級選項", 98 | "最大允许长度": "最大允許長度" 99 | } 100 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-T-E-R/Uni-TTS/3eb56ecaa40759aaa537719e3dee117813fea7f7/__init__.py 
-------------------------------------------------------------------------------- /api.py: -------------------------------------------------------------------------------- 1 | # 在开头加入路径 2 | import os, sys 3 | import importlib 4 | 5 | # 尝试清空含有GPT_SoVITS的路径 6 | for path in sys.path: 7 | if path.find(r"GPT_SoVITS") != -1: 8 | sys.path.remove(path) 9 | 10 | now_dir = os.getcwd() 11 | sys.path.append(now_dir) 12 | # sys.path.append(os.path.join(now_dir, "GPT_SoVITS")) 13 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 14 | 15 | __version__ = "0.1.0" 16 | 17 | print(f"Backend Version: {__version__}") 18 | 19 | import soundfile as sf 20 | from fastapi import FastAPI, Request, HTTPException 21 | from fastapi.responses import JSONResponse, FileResponse, StreamingResponse 22 | from fastapi.middleware.cors import CORSMiddleware 23 | import tempfile 24 | import uvicorn 25 | import json 26 | 27 | # 将当前文件所在的目录添加到 sys.path 28 | 29 | from src.api_config_manager import api_config 30 | from Adapters.base import Base_TTS_Task, Base_TTS_Instance 31 | 32 | enabled_adapters = api_config.enabled_adapters 33 | default_adapter = api_config.default_adapter 34 | 35 | if len(enabled_adapters) > 1: 36 | tts_instance_dict:dict[str, Base_TTS_Instance] = {} 37 | for adapter in enabled_adapters: 38 | module = importlib.import_module(f"Adapters.{adapter}") 39 | tts_instance_dict[adapter] = getattr(module, "TTS_Instance")() 40 | else: 41 | module = importlib.import_module(f"Adapters.{default_adapter}") 42 | tts_instance:Base_TTS_Instance = getattr(module, "TTS_Instance")() 43 | 44 | # 存储临时文件的字典 45 | temp_files = {} 46 | 47 | app = FastAPI() 48 | 49 | # 设置CORS 50 | app.add_middleware( 51 | CORSMiddleware, 52 | allow_origins=["*"], 53 | allow_credentials=True, 54 | allow_methods=["*"], 55 | allow_headers=["*"], 56 | ) 57 | 58 | @app.get('/character_list') 59 | async def character_list(request: Request): 60 | if len(enabled_adapters) > 1: 61 | adapter = request.query_params.get("adapter", default_adapter) 62 | tts_instance = tts_instance_dict[adapter] 63 | res = JSONResponse(tts_instance.get_characters()) 64 | return res 65 | 66 | @app.get('/voice/speakers') 67 | async def speakers(request: Request): 68 | if len(enabled_adapters) > 1: 69 | adapter = request.query_params.get("adapter", default_adapter) 70 | tts_instance = tts_instance_dict[adapter] 71 | speaker_dict = tts_instance.get_characters() 72 | name_list = list(speaker_dict.keys()) 73 | speaker_list = [{"id": i, "name": name_list[i], "lang":["zh","en","ja"]} for i in range(len(name_list))] 74 | res = { 75 | "VITS": speaker_list, 76 | "GSVI": speaker_list, 77 | "GPT-SOVITS": speaker_list 78 | } 79 | return JSONResponse(res) 80 | 81 | def generate_task(task: Base_TTS_Task): 82 | if task.task_type == "text" and task.text.strip() == "": 83 | return HTTPException(status_code=400, detail="Text is empty") 84 | elif task.task_type == "ssml" and task.ssml.strip() == "": 85 | return HTTPException(status_code=400, detail="SSML is empty") 86 | format = task.format 87 | save_temp = task.save_temp 88 | request_hash = None if not save_temp else task.md5() 89 | stream = task.stream 90 | 91 | if task.task_type == "text": 92 | gen = tts_instance.generate_from_text(task) 93 | elif task.task_type == "ssml": 94 | # 还不支持 stream 95 | audio_path = tts_instance.generate_from_ssml(task) 96 | if audio_path is None: 97 | return HTTPException(status_code=400, detail="SSML is invalid") 98 | return FileResponse(audio_path, media_type=f"audio/{format}", filename=f"audio.{format}") 99 | 
100 | if stream == False: 101 | # TODO: use SQL instead of dict 102 | if save_temp and request_hash in temp_files: 103 | return FileResponse(path=temp_files[request_hash], media_type=f'audio/{format}') 104 | else: 105 | # 假设 gen 是你的音频生成器 106 | try: 107 | sampling_rate, audio_data = next(gen) 108 | except StopIteration: 109 | raise HTTPException(status_code=404, detail="Generator is empty or error occurred") 110 | # 创建一个临时文件 111 | with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{format}') as tmp_file: 112 | # 尝试写入用户指定的格式,如果失败则回退到 WAV 格式 113 | try: 114 | sf.write(tmp_file, audio_data, sampling_rate, format=format) 115 | except Exception as e: 116 | # 如果指定的格式无法写入,则回退到 WAV 格式 117 | sf.write(tmp_file, audio_data, sampling_rate, format='wav') 118 | format = 'wav' # 更新格式为 wav 119 | 120 | tmp_file_path = tmp_file.name 121 | task.audio_path = tmp_file_path 122 | if save_temp: 123 | temp_files[request_hash] = tmp_file_path 124 | # 返回文件响应,FileResponse 会负责将文件发送给客户端 125 | return FileResponse(tmp_file_path, media_type=f"audio/{format}", filename=f"audio.{format}") 126 | else: 127 | return StreamingResponse(gen, media_type='audio/wav') 128 | 129 | 130 | # route 由 json 文件配置 131 | async def tts(request: Request): 132 | if len(enabled_adapters) > 1: 133 | adapter = request.query_params.get("adapter", default_adapter) 134 | tts_instance = tts_instance_dict[adapter] 135 | # 尝试从JSON中获取数据,如果不是JSON,则从查询参数中获取 136 | if request.method == "GET": 137 | data = request.query_params 138 | else: 139 | data = await request.json() 140 | return_type = "audio" 141 | # 认定一个请求只有一个任务 142 | if data.get("textType", None) is not None: 143 | task = tts_instance.ms_like_analyser(data) 144 | return_type = "json" 145 | else: 146 | task = tts_instance.params_analyser(data) 147 | 148 | print(task) 149 | if return_type == "audio": 150 | return generate_task(task) 151 | else: 152 | # TODO: return json 153 | return generate_task(task) 154 | pass 155 | 156 | routes = ['/tts'] 157 | 158 | # 注册路由 159 | for path in routes: 160 | app.api_route(path, methods=['GET', 'POST'])(tts) 161 | 162 | # 便于小白理解 163 | def print_ipv4_ip(host = "127.0.0.1", port = 5000): 164 | import socket 165 | 166 | def get_internal_ip(): 167 | """获取内部IP地址""" 168 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 169 | try: 170 | # 这不会发送真正的数据包 171 | s.connect(('10.253.156.219', 1)) 172 | IP = s.getsockname()[0] 173 | except Exception: 174 | IP = '127.0.0.1' 175 | finally: 176 | s.close() 177 | return IP 178 | 179 | if host == "0.0.0.0": 180 | display_hostname = get_internal_ip() 181 | if display_hostname != "127.0.0.1": 182 | print(f"Please use http://{display_hostname}:{port} to access the service.") 183 | 184 | tts_host = api_config.tts_host 185 | tts_port = api_config.tts_port 186 | 187 | if __name__ == "__main__": 188 | print_ipv4_ip(tts_host, tts_port) 189 | uvicorn.run(app, host=tts_host, port=tts_port) 190 | 191 | -------------------------------------------------------------------------------- /api_doc.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | This document aims to introduce how to use our Text-to-Speech API, including making requests via GET and POST methods. This API supports converting text into the voice of specified characters and supports different languages and emotional expressions. 
4 | 5 | ## Character and Emotion List 6 | 7 | To obtain the supported characters and their corresponding emotions, please visit the following URL: 8 | 9 | - URL: `http://127.0.0.1:5000/character_list` 10 | - Returns: A JSON format list of characters and corresponding emotions 11 | - Method: `GET` 12 | 13 | ``` 14 | { 15 | "Hanabi": [ 16 | "default", 17 | "Normal", 18 | "Yandere", 19 | ], 20 | "Hutao": [ 21 | "default" 22 | ] 23 | } 24 | ``` 25 | 26 | ## Regarding Aliases 27 | 28 | From version 2.2.4, an alias system was added. Detailed allowed aliases can be found in `Inference/params_config.json`. 29 | 30 | ## Text-to-Speech 31 | 32 | - URL: `http://127.0.0.1:5000/tts` 33 | - Returns: Audio on success. Error message on failure. 34 | - Method: `GET`/`POST` 35 | 36 | ### GET Method 37 | 38 | #### Format 39 | 40 | ``` 41 | http://127.0.0.1:5000/tts?character={{characterName}}&text={{text}} 42 | ``` 43 | 44 | - Parameter explanation: 45 | - `character`: The name of the character folder, pay attention to case sensitivity, full/half width, and language (Chinese/English). 46 | - `text`: The text to be converted, URL encoding is recommended. 47 | - Optional parameters include `text_language`, `format`, `top_k`, `top_p`, `batch_size`, `speed`, `temperature`, `emotion`, `save_temp`, and `stream`, detailed explanations are provided in the POST section below. 48 | - From version 2.2.4, an alias system was added, with detailed allowed aliases found in `Inference/params_config.json`. 49 | 50 | ### POST Method 51 | 52 | #### JSON Package Format 53 | 54 | ##### All Parameters 55 | 56 | ``` 57 | { 58 | "method": "POST", 59 | "body": { 60 | "character": "${chaName}", 61 | "emotion": "${Emotion}", 62 | "text": "${speakText}", 63 | "text_language": "${textLanguage}", 64 | "batch_size": ${batch_size}, 65 | "speed": ${speed}, 66 | "top_k": ${topK}, 67 | "top_p": ${topP}, 68 | "temperature": ${temperature}, 69 | "stream": "${stream}", 70 | "format": "${Format}", 71 | "save_temp": "${saveTemp}" 72 | } 73 | } 74 | ``` 75 | 76 | You can omit one or more items. From version 2.2.4, an alias system was introduced, detailed allowed aliases can be found in `Inference/params_config.json`. 77 | 78 | ##### Minimal Data: 79 | 80 | ``` 81 | { 82 | "method": "POST", 83 | "body": { 84 | "text": "${speakText}" 85 | } 86 | } 87 | ``` 88 | 89 | ##### Parameter Explanation 90 | 91 | - **text**: The text to be converted, URL encoding is recommended. 92 | - **character**: Character folder name, pay attention to case sensitivity, full/half width, and language. 93 | - **emotion**: Character emotion, must be an actually supported emotion of the character, otherwise, the default emotion will be used. 94 | - **text_language**: Text language (auto / zh / en / ja), default is multilingual mixed. 95 | - **top_k**, **top_p**, **temperature**: GPT model parameters, no need to modify if unfamiliar. 96 | 97 | - **batch_size**: How many batches at a time, can be increased for faster processing if you have a powerful computer, integer, default is 1. 98 | - **speed**: Speech speed, default is 1.0. 99 | - **save_temp**: Whether to save temporary files, when true, the backend will save the generated audio, and subsequent identical requests will directly return that data, default is false. 100 | - **stream**: Whether to stream, when true, audio will be returned sentence by sentence, default is false. 101 | - **format**: Format, default is WAV, allows MP3/ WAV/ OGG. 
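Putting the pieces together, a minimal Python client for the endpoints above might look like the sketch below. The host and port are the defaults from `configs/api_config.json`; the character name `Hutao` is only an assumption and must exist in your local model folder. Note that the server expects the flat parameter object as the request body — the `method`/`body` wrapper shown above is a client-side template (see `test/Model_Test.py`, which extracts `body` before posting).

```
import requests

BASE = "http://127.0.0.1:5000"  # default tts_host/tts_port from configs/api_config.json

# 1) Discover available characters and their emotions
characters = requests.get(f"{BASE}/character_list").json()
print(characters)

# 2) Synthesize speech. Only "text" is strictly required; every other field
#    falls back to the defaults in configs/gsv_fast/params_config.json.
payload = {
    "text": "你好,这是一段测试文本。",
    "character": "Hutao",        # assumption: such a character folder exists locally
    "text_language": "auto",
    "emotion": "default",
    "format": "wav",
}
response = requests.post(f"{BASE}/tts", json=payload)
response.raise_for_status()

with open("output.wav", "wb") as f:
    f.write(response.content)
```

The same parameters can also be passed as query-string arguments to a plain GET request, as described in the GET section above.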
102 | 103 | -------------------------------------------------------------------------------- /configs/api_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "locale": "auto", 3 | "tts_port": 5000, 4 | "tts_host": "0.0.0.0", 5 | "enabled_adapters": ["gsv_fast"] 6 | } 7 | -------------------------------------------------------------------------------- /configs/gsv_fast/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "locale": "auto", 3 | "tts_port": 5000, 4 | "tts_host": "0.0.0.0", 5 | "device": "auto", 6 | "half_precision": "auto", 7 | 8 | "is_share": "false", 9 | "models_path": "models/gsv", 10 | "max_word_count": 100, 11 | "max_text_length": -1, 12 | "save_prompt_cache": "true", 13 | "save_model_cache": "false", 14 | "备注0": "locale是语言环境,auto表示自动选择,如果你想要强制指定语言环境,可以填写zh_CN或者en_US等等", 15 | "备注1": "路径可以填写绝对路径或者相对路径,相对路径指的是在主项目根目录的相对路径", 16 | "备注2": "tts_port是tts服务的端口号,可以自己定义,只要不和其他服务的端口号冲突就行,默认是5000", 17 | "备注3": "half-precision可以填写true或者false,auto表示自动选择,有一些显卡不支持half-precision,就把它设置为false", 18 | "备注4": "device可以填写cpu或者cuda,auto表示自动选择,如果你的显卡不支持cuda,就把它设置为cpu,一般不用动", 19 | "备注5": "max_text_length仅是app.py输入框的最大限制文本长度,-1表示不限制", 20 | "备注6": "is_share代表是否分享你的前端为一个共享gradio,就可以分享链接给你的朋友了,默认为false", 21 | "备注7": [ 22 | "save_cache开启会将数据缓存到磁盘上", 23 | "prompt_cache每个只有约0.5mb,开启后每条请求能省0.3-1.5s左右,建议开", 24 | "model_cache每个角色有300mb,切换人物时能省0.5s左右,自行考虑" 25 | ] 26 | } -------------------------------------------------------------------------------- /configs/gsv_fast/params_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_type":{ 3 | "type": "string", 4 | "description": "Task type for the API.", 5 | "alias": ["task_type", "task", "type", "textType"], 6 | "default": "tts" 7 | }, 8 | "text": { 9 | "type": "string", 10 | "description": "The text to be synthesized.", 11 | "alias": ["text", "txt", "tex", "t"], 12 | "default": "" 13 | }, 14 | "ssml": { 15 | "type": "string", 16 | "description": "The SSML text to be synthesized.", 17 | "alias": ["ssml", "text", "txt", "tex", "t"], 18 | "default": "" 19 | }, 20 | "text_language": { 21 | "type": "string", 22 | "description": "Language of the text.", 23 | "alias": ["text_language", "lang", "language", "lan", "text_lang", "xml:lang"], 24 | "default": "auto" 25 | }, 26 | "character": { 27 | "type": "string", 28 | "description": "Character name for the model.", 29 | "alias": ["cha_name", "character", "model_name", "cha", "spk" , "speaker", "name", "role"], 30 | "default": "" 31 | }, 32 | "speaker_id": { 33 | "type": "int", 34 | "description": "Speaker ID for the model.", 35 | "alias": ["speaker_id", "id"], 36 | "default": null 37 | }, 38 | "emotion": { 39 | "type": "string", 40 | "description": "Emotion of the character.", 41 | "alias": ["character_emotion", "emotion", "style"], 42 | "default": "default" 43 | }, 44 | "batch_size": { 45 | "type": "int", 46 | "description": "Batch size for processing.", 47 | "alias": ["batch_size", "batch"], 48 | "default": 10 49 | }, 50 | "speed": { 51 | "type": "float", 52 | "description": "Speed factor for synthesis.", 53 | "alias": ["speed", "speed_factor", "spd", "rate"], 54 | "default": 1.0 55 | }, 56 | "top_k": { 57 | "type": "int", 58 | "description": "Top K parameter for sampling.", 59 | "alias": ["top_k", "topk"], 60 | "default": 1 61 | }, 62 | "top_p": { 63 | "type": "float", 64 | "description": "Top P parameter for sampling.", 65 | "alias": ["top_p", "topp"], 66 | 
"default": 0.8 67 | }, 68 | "temperature": { 69 | "type": "float", 70 | "description": "Temperature for sampling.", 71 | "alias": ["temperature"], 72 | "default": 0.8 73 | }, 74 | "seed": { 75 | "type": "int", 76 | "description": "Seed for randomness.", 77 | "alias": ["seed"], 78 | "default": -1 79 | }, 80 | "stream": { 81 | "type": "bool", 82 | "description": "Stream the audio or not.", 83 | "alias": ["stream", "streaming"], 84 | "default": false 85 | }, 86 | "save_temp": { 87 | "type": "bool", 88 | "description": "Save the output temporarily.", 89 | "alias": ["save_temp", "save"], 90 | "default": false 91 | }, 92 | "cut_method": { 93 | "type": "string", 94 | "description": "Method for text cutting.", 95 | "alias": ["cut_method", "cut"], 96 | "default": "auto_cut_100" 97 | }, 98 | "format": { 99 | "type": "string", 100 | "description": "Format of the output audio.", 101 | "alias": ["format"], 102 | "default": "wav" 103 | }, 104 | "loudness": { 105 | "type": "float", 106 | "description": "Loudness of the audio.", 107 | "alias": ["loudness", "volume", "vol"], 108 | "default": null 109 | }, 110 | "pitch": { 111 | "type": "float", 112 | "description": "Pitch of the audio.", 113 | "alias": ["pitch"], 114 | "default": null 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | conda install -c conda-forge gcc 3 | conda install -c conda-forge gxx 4 | conda install ffmpeg cmake 5 | conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia 6 | pip install -r requirements.txt 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /models/gsv/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /models/pretrained_models/gsv/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests==2.28.1 2 | pydub 3 | 4 | pydantic 5 | soundfile 6 | flash-attention 7 | numpy 8 | scipy 9 | tensorboard 10 | librosa==0.9.2 11 | numba 12 | torchmetrics==0.11.4 13 | pytorch-lightning 14 | gradio==4.19 15 | gradio_client 16 | ffmpeg-python 17 | onnxruntime 18 | tqdm 19 | funasr==1.0.0 20 | cn2an 21 | pypinyin 22 | pyopenjtalk 23 | g2p_en 24 | torchaudio 25 | modelscope==1.10.0 26 | sentencepiece 27 | transformers 28 | chardet 29 | PyYAML 30 | psutil 31 | jieba_fast 32 | jieba 33 | LangSegment>=0.3.1 34 | Faster_Whisper 35 | fastapi 36 | uvicorn 37 | wordsegment 38 | srt 39 | 40 | pyloudnorm 41 | 42 | azure-cognitiveservices-speech 43 | openai -------------------------------------------------------------------------------- /src/api_config_manager.py: -------------------------------------------------------------------------------- 1 | import os, sys, json 2 | 3 | 4 | class API_Config(): 5 | def __init__(self, config_path = None): 6 | self.config_path = config_path 7 | assert os.path.exists(self.config_path), f"配置文件不存在: {self.config_path}" 8 | if os.path.exists(self.config_path): 9 | with open(self.config_path, 'r', encoding='utf-8') as f: 10 | config:dict = json.load(f) 11 | 12 | 
self.tts_host = config.get("tts_host", "0.0.0.0") 13 | self.tts_port = config.get("tts_port", 5000) 14 | 15 | locale_language = str(config.get("locale", "auto")) 16 | self.locale_language = None if locale_language.lower() == "auto" else locale_language 17 | 18 | self.enabled_adapters = config.get("enabled_adapters", ["gsv_fast"]) 19 | self.default_adapter = self.enabled_adapters[0] 20 | 21 | api_config = API_Config(os.path.join("configs", "api_config.json")) -------------------------------------------------------------------------------- /test/Model_Test.py: -------------------------------------------------------------------------------- 1 | import requests, json, os 2 | from string import Template 3 | from urllib.parse import quote 4 | import time 5 | 6 | def send_request(endpoint, endpoint_data, text, cha_name, character_emotion, text_language, top_k=6, top_p=0.8, temperature=0.8): 7 | urlencoded_text = requests.utils.quote(text) 8 | 9 | # 使用Template填充变量 10 | endpoint_template = Template(endpoint) 11 | final_endpoint = endpoint_template.substitute(chaName=cha_name, speakText=urlencoded_text,textLanguage=text_language, topK=top_k, topP=top_p, temperature=temperature, characterEmotion=character_emotion) 12 | 13 | endpoint_data_template = Template(endpoint_data) 14 | filled_json_str = endpoint_data_template.substitute(chaName=cha_name, speakText=urlencoded_text,textLanguage=text_language, topK=top_k, topP=top_p, temperature=temperature, characterEmotion=character_emotion) 15 | # 解析填充后的JSON字符串 16 | request_data = json.loads(filled_json_str) 17 | body = request_data["body"] 18 | 19 | # 发送POST请求 20 | response = requests.post(final_endpoint, json=body) 21 | 22 | # 检查请求是否成功 23 | if response.status_code == 200: 24 | # 生成保存路径 25 | 26 | save_path = f"tmp_audio/{cha_name}/{quote(character_emotion)}.wav" 27 | 28 | # 检查保存路径是否存在 29 | if not os.path.exists(f"tmp_audio/{cha_name}/"): 30 | os.makedirs(f"tmp_audio/{cha_name}/") 31 | 32 | # 保存音频文件到本地 33 | with open(save_path, "wb") as f: 34 | f.write(response.content) 35 | 36 | 37 | 38 | 39 | else: 40 | print(f"请求失败,状态码:{response.status_code}") 41 | 42 | 43 | global models_path 44 | models_path = r"D:\123pan\Downloads\准备重新封包" 45 | 46 | def load_info_config(character_name): 47 | emotion_options = ["default"] 48 | try: 49 | with open(f"{models_path}/{character_name}/infer_config.json", "r", encoding="utf-8") as f: 50 | config = json.load(f) 51 | emotion_list=config.get('emotion_list', None) 52 | if emotion_list is not None: 53 | emotion_options = [] 54 | for emotion, details in emotion_list.items(): 55 | emotion_options.append(emotion) 56 | except: 57 | pass 58 | return emotion_options 59 | 60 | 61 | default_endpoint = "http://127.0.0.1:5000/tts" 62 | default_endpoint_data = """{ 63 | "method": "POST", 64 | "body": { 65 | "cha_name": "${chaName}", 66 | "character_emotion": "${characterEmotion}", 67 | "text": "${speakText}", 68 | "text_language": "${textLanguage}", 69 | "top_k": ${topK}, 70 | "top_p": ${topP}, 71 | "temperature": ${temperature} 72 | } 73 | } 74 | """ 75 | default_text=""" 76 | 我可太激动了!跑近一看,居然是一群小动物组成的戏团!牵着火焰小马的猴子、指挥戏团的兔子、双脚站立不停跳舞的大猫… 77 | 它们唱唱跳跳地 78 | 带我走出了森林,还让我务必收下这个面具!它们说,这个就是他们看到我时的样子!但你看,这面具明明是只狐狸,怎么可能是我呢?""" 79 | 80 | character_name = "花火" 81 | 82 | emotion_options = load_info_config(character_name) 83 | 84 | 85 | for emotion in emotion_options: 86 | print(emotion) 87 | send_request(default_endpoint, default_endpoint_data, default_text, character_name, emotion, "多语种混合") 88 | time.sleep(20) 89 | 
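`Model_Test.py` above only exercises the blocking path (`requests.post` followed by writing `response.content`). A sketch of a streaming client for the `stream` option documented in `api_doc.md` follows; the character name is an assumption, and the raw byte stream is written to disk as-is:

```
import requests

# stream=true makes the backend return audio sentence by sentence
# (StreamingResponse in api.py); iterate the chunks instead of reading .content.
params = {
    "text": "你好,这是流式合成测试。",
    "character": "Hutao",   # assumption: this character exists locally
    "stream": "true",
}

with requests.get("http://127.0.0.1:5000/tts", params=params, stream=True) as response:
    response.raise_for_status()
    with open("streamed.wav", "wb") as f:
        for chunk in response.iter_content(chunk_size=4096):
            if chunk:
                f.write(chunk)
```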
-------------------------------------------------------------------------------- /test/test_Concurrency.py: -------------------------------------------------------------------------------- 1 | # from text_utils.segmenter import SentenceSegmenter 2 | # from text_utils.tokenizer import Tokenizer 3 | #https://github.com/numb3r3/text_utils 4 | # from polyglot.text import Text 5 | import requests,re,jieba 6 | import concurrent.futures 7 | import os 8 | 9 | 10 | 11 | def send(speaker,text,index): 12 | 13 | # print(f"speaker:{speaker},text:{text},index:{index}") 14 | from time import time 15 | start = time() 16 | response = requests.get(f'http://127.0.0.1:5000/tts?text={text}&cha={speaker}&top_k=5') 17 | end = time() 18 | print(f"speaker:{speaker},index:{index},time:{end-start}") 19 | os.makedirs('tmp_audio',exist_ok=True) 20 | with open(f'tmp_audio/{str(index)}.wav','ab') as f: 21 | f.write(response.content) 22 | 23 | 24 | 25 | text = '''我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。''' 26 | speakers = ['银狼','HuTao'] 27 | count = 3 28 | 29 | 30 | 31 | SK_TEXT_INDEX = [(speakers[index%len(speakers)], text, index+1) for index in range(count)] 32 | 33 | with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: 34 | tasks = [executor.submit(send, a[0],a[1], a[2]) for a in SK_TEXT_INDEX] 35 | concurrent.futures.wait(tasks) -------------------------------------------------------------------------------- /test/test_refer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | from Adapters.gsv_fast import GSV_Instance 5 | from Adapters.gsv_fast.gsv_task import GSV_TTS_Task as TTS_Task 6 | 7 | gsv_instance = GSV_Instance() 8 | task:TTS_Task = TTS_Task() 9 | 10 | task.character = "Hutao" 11 | task.text = "你好,我是一个测试文本。" 12 | 13 | 14 | gen = gsv_instance.generate(task) 15 | sr, audio = next(gen) 16 | 17 | gen = gsv_instance.generate(task) 18 | sr, audio = next(gen) 19 | 20 | import tempfile 21 | import soundfile as sf 22 | with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: 23 | sf.write(f.name, audio, sr) 24 | print(f"Audio saved to {f.name}") -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-T-E-R/Uni-TTS/3eb56ecaa40759aaa537719e3dee117813fea7f7/tools/__init__.py -------------------------------------------------------------------------------- /tools/cmd-denoise.py: -------------------------------------------------------------------------------- 1 | import os,argparse 2 | 3 | from modelscope.pipelines import pipeline 4 | from modelscope.utils.constant import Tasks 5 | from tqdm import tqdm 6 | 7 | path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k' 8 | path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" 9 | ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise) 10 | def execute_denoise(input_folder,output_folder): 11 | os.makedirs(output_folder,exist_ok=True) 12 | # print(input_folder) 13 | # print(list(os.listdir(input_folder).sort())) 14 | for name in tqdm(os.listdir(input_folder)): 15 | ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name)) 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("-i", "--input_folder", type=str, required=True, 20 | help="Path to the folder containing WAV 
files.") 21 | parser.add_argument("-o", "--output_folder", type=str, required=True, 22 | help="Output folder to store transcriptions.") 23 | parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], 24 | help="fp16 or fp32")#还没接入 25 | cmd = parser.parse_args() 26 | execute_denoise( 27 | input_folder = cmd.input_folder, 28 | output_folder = cmd.output_folder, 29 | ) -------------------------------------------------------------------------------- /tools/i18n/i18n.py: -------------------------------------------------------------------------------- 1 | import json 2 | import locale 3 | import os 4 | 5 | 6 | def load_language_list(language, locale_path="./i18n/locale"): 7 | with open(os.path.join(locale_path, f"{language}.json"), "r", encoding="utf-8") as f: 8 | language_list = json.load(f) 9 | return language_list 10 | 11 | from Adapters.gsv_fast.config_manager import inference_config 12 | 13 | class I18nAuto: 14 | def __init__(self, language=None, locale_path="./i18n/locale"): 15 | if language in ["Auto", None]: 16 | if inference_config.locale_language in ["Auto", None, ""]: 17 | language = locale.getdefaultlocale()[0] 18 | else: 19 | language = inference_config.locale_language 20 | if not os.path.exists(os.path.join(locale_path, f"{language}.json")): 21 | language = "en_US" 22 | self.language = language 23 | self.language_map = load_language_list(language, locale_path) 24 | 25 | def __call__(self, key): 26 | return self.language_map.get(key, key) 27 | 28 | def __repr__(self): 29 | return "Use Language: " + self.language 30 | -------------------------------------------------------------------------------- /tools/i18n/locale/zh_CN.json: -------------------------------------------------------------------------------- 1 | { 2 | ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音", 3 | "A模型权重": "A模型权重", 4 | "A模型路径": "A模型路径", 5 | "B模型路径": "B模型路径", 6 | "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", 7 | "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调", 8 | "Index Rate": "Index Rate", 9 | "Onnx导出": "Onnx导出", 10 | "Onnx输出路径": "Onnx输出路径", 11 | "RVC模型路径": "RVC模型路径", 12 | "ckpt处理": "ckpt处理", 13 | "harvest进程数": "harvest进程数", 14 | "index文件路径不可包含中文": "index文件路径不可包含中文", 15 | "pth文件路径不可包含中文": "pth文件路径不可包含中文", 16 | "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", 17 | "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ", 18 | "step1:正在处理数据": "step1:正在处理数据", 19 | "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", 20 | "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ", 21 | "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)", 22 | "step3: 填写训练设置, 开始训练模型和索引": "step3: 填写训练设置, 开始训练模型和索引", 23 | "step3a:正在训练模型": "step3a:正在训练模型", 24 | "一键训练": "一键训练", 25 | "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", 26 | "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。", 27 | "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2", 28 | "伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声", 29 | "使用模型采样率": "使用模型采样率", 30 | "使用设备采样率": "使用设备采样率", 31 | "保存名": "保存名", 32 | "保存的文件名, 默认空为和源文件同名": "保存的文件名, 默认空为和源文件同名", 33 | "保存的模型名不带后缀": "保存的模型名不带后缀", 34 | "保存频率save_every_epoch": "保存频率save_every_epoch", 35 | "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果", 36 | "修改": "修改", 37 | "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型信息(仅支持weights文件夹下提取的小模型文件)", 38 | "停止音频转换": "停止音频转换", 39 | "全流程结束!": "全流程结束!", 40 | "刷新音色列表和索引路径": "刷新音色列表和索引路径", 41 | "加载模型": "加载模型", 42 | "加载预训练底模D路径": "加载预训练底模D路径", 43 | "加载预训练底模G路径": "加载预训练底模G路径", 44 | "单次推理": "单次推理", 45 | "卸载音色省显存": "卸载音色省显存", 46 | "变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)", 47 | "后处理重采样至最终采样率,0为不进行重采样": "后处理重采样至最终采样率,0为不进行重采样", 48 | "否": "否", 49 | "启用相位声码器": "启用相位声码器", 50 | "响应阈值": "响应阈值", 51 | "响度因子": "响度因子", 52 | "处理数据": "处理数据", 53 | "导出Onnx模型": "导出Onnx模型", 54 | "导出文件格式": "导出文件格式", 55 | "常见问题解答": "常见问题解答", 56 | "常规设置": "常规设置", 57 | "开始音频转换": "开始音频转换", 58 | "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", 59 | "性能设置": "性能设置", 60 | "总训练轮数total_epoch": "总训练轮数total_epoch", 61 | "批量推理": "批量推理", 62 | "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ", 63 | "指定输出主人声文件夹": "指定输出主人声文件夹", 64 | "指定输出文件夹": "指定输出文件夹", 65 | "指定输出非主人声文件夹": "指定输出非主人声文件夹", 66 | "推理时间(ms):": "推理时间(ms):", 67 | "推理音色": "推理音色", 68 | "提取": "提取", 69 | "提取音高和处理数据使用的CPU进程数": "提取音高和处理数据使用的CPU进程数", 70 | "是": "是", 71 | "是否仅保存最新的ckpt文件以节省硬盘空间": "是否仅保存最新的ckpt文件以节省硬盘空间", 72 | "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存时间点将最终小模型保存至weights文件夹", 73 | "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速", 74 | "显卡信息": "显卡信息", 75 | "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.", 76 | "查看": "查看", 77 | "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型信息(仅支持weights文件夹下提取的小模型文件)", 78 | "检索特征占比": "检索特征占比", 79 | "模型": "模型", 80 | "模型推理": "模型推理", 81 | "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况", 82 | "模型是否带音高指导": "模型是否带音高指导", 83 | "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否带音高指导(唱歌一定要, 语音可以不要)", 84 | "模型是否带音高指导,1是0否": "模型是否带音高指导,1是0否", 85 | "模型版本型号": "模型版本型号", 86 | "模型融合, 可用于测试音色融合": "模型融合, 可用于测试音色融合", 87 | "模型路径": "模型路径", 88 | "每张显卡的batch_size": "每张显卡的batch_size", 89 | "淡入淡出长度": "淡入淡出长度", 90 | "版本": "版本", 91 | "特征提取": "特征提取", 92 | "特征检索库文件路径,为空则使用下拉的选择结果": "特征检索库文件路径,为空则使用下拉的选择结果", 93 | "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ", 94 | "目标采样率": "目标采样率", 95 | "算法延迟(ms):": "算法延迟(ms):", 96 | "自动检测index路径,下拉式选择(dropdown)": "自动检测index路径,下拉式选择(dropdown)", 97 | "融合": "融合", 98 | "要改的模型信息": "要改的模型信息", 99 | "要置入的模型信息": "要置入的模型信息", 100 | "训练": "训练", 101 | "训练模型": "训练模型", 102 | "训练特征索引": "训练特征索引", 103 | "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", 104 | "请指定说话人id": "请指定说话人id", 105 | "请选择index文件": "请选择index文件", 106 | "请选择pth文件": "请选择pth文件", 107 | "请选择说话人id": "请选择说话人id", 108 | "转换": "转换", 109 | "输入实验名": "输入实验名", 110 | "输入待处理音频文件夹路径": "输入待处理音频文件夹路径", 111 | "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)", 112 | "输入待处理音频文件路径(默认是正确格式示例)": "输入待处理音频文件路径(默认是正确格式示例)", 113 | "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络", 114 | "输入监听": "输入监听", 115 | "输入训练文件夹路径": "输入训练文件夹路径", 116 | "输入设备": "输入设备", 117 | "输入降噪": "输入降噪", 118 | "输出信息": "输出信息", 119 | "输出变声": "输出变声", 120 | "输出设备": "输出设备", 121 | "输出降噪": "输出降噪", 122 | "输出音频(右下角三个点,点了可以下载)": "输出音频(右下角三个点,点了可以下载)", 123 | "选择.index文件": "选择.index文件", 124 | "选择.pth文件": "选择.pth文件", 125 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", 126 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", 127 | "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", 128 | "采样率:": "采样率:", 129 | "采样长度": "采样长度", 130 | "重载设备列表": "重载设备列表", 131 | "音调设置": "音调设置", 132 | "音频设备(请使用同种类驱动)": "音频设备(请使用同种类驱动)", 133 | "音高算法": "音高算法", 134 | "额外推理时长": "额外推理时长" 135 | } 136 | -------------------------------------------------------------------------------- /tools/i18n/locale/zh_HK.json: -------------------------------------------------------------------------------- 1 | { 2 | ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音", 3 | "A模型权重": "A模型權重", 4 | "A模型路径": "A模型路徑", 5 | "B模型路径": "B模型路徑", 6 | "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", 7 | "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", 8 | "Index Rate": "Index Rate", 9 | "Onnx导出": "Onnx导出", 10 | "Onnx输出路径": "Onnx输出路径", 11 | "RVC模型路径": "RVC模型路径", 12 | "ckpt处理": "ckpt處理", 13 | "harvest进程数": "harvest進程數", 14 | "index文件路径不可包含中文": "index文件路径不可包含中文", 15 | "pth文件路径不可包含中文": "pth文件路径不可包含中文", 16 | "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", 17 | "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. 
": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", 18 | "step1:正在处理数据": "step1:正在处理数据", 19 | "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", 20 | "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", 21 | "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", 22 | "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引", 23 | "step3a:正在训练模型": "step3a:正在训练模型", 24 | "一键训练": "一鍵訓練", 25 | "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", 26 | "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", 27 | "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", 28 | "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", 29 | "使用模型采样率": "使用模型采样率", 30 | "使用设备采样率": "使用设备采样率", 31 | "保存名": "儲存名", 32 | "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", 33 | "保存的模型名不带后缀": "儲存的模型名不帶副檔名", 34 | "保存频率save_every_epoch": "保存頻率save_every_epoch", 35 | "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果", 36 | "修改": "修改", 37 | "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", 38 | "停止音频转换": "停止音訊轉換", 39 | "全流程结束!": "全流程结束!", 40 | "刷新音色列表和索引路径": "刷新音色列表和索引路徑", 41 | "加载模型": "載入模型", 42 | "加载预训练底模D路径": "加載預訓練底模D路徑", 43 | "加载预训练底模G路径": "加載預訓練底模G路徑", 44 | "单次推理": "单次推理", 45 | "卸载音色省显存": "卸載音色節省 VRAM", 46 | "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", 47 | "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", 48 | "否": "否", 49 | "启用相位声码器": "启用相位声码器", 50 | "响应阈值": "響應閾值", 51 | "响度因子": "響度因子", 52 | "处理数据": "處理資料", 53 | "导出Onnx模型": "导出Onnx模型", 54 | "导出文件格式": "導出檔格式", 55 | "常见问题解答": "常見問題解答", 56 | "常规设置": "一般設定", 57 | "开始音频转换": "開始音訊轉換", 58 | "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", 59 | "性能设置": "效能設定", 60 | "总训练轮数total_epoch": "總訓練輪數total_epoch", 61 | "批量推理": "批量推理", 62 | "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", 63 | "指定输出主人声文件夹": "指定输出主人声文件夹", 64 | "指定输出文件夹": "指定輸出資料夾", 65 | "指定输出非主人声文件夹": "指定输出非主人声文件夹", 66 | "推理时间(ms):": "推理時間(ms):", 67 | "推理音色": "推理音色", 68 | "提取": "提取", 69 | "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數", 70 | "是": "是", 71 | "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間", 72 | "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾", 73 | "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度", 74 | "显卡信息": "顯示卡資訊", 75 | "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", 76 | "查看": "查看", 77 | "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)", 78 | "检索特征占比": "檢索特徵佔比", 79 | "模型": "模型", 80 | "模型推理": "模型推理", 81 | "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況", 82 | "模型是否带音高指导": "模型是否帶音高指導", 83 | "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)", 84 | "模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否", 85 | "模型版本型号": "模型版本型號", 86 | "模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合", 87 | "模型路径": "模型路徑", 88 | "每张显卡的batch_size": "每张显卡的batch_size", 89 | "淡入淡出长度": "淡入淡出長度", 90 | "版本": "版本", 91 | "特征提取": "特徵提取", 92 | "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", 93 | "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", 94 | "目标采样率": "目標取樣率", 95 | "算法延迟(ms):": "算法延迟(ms):", 96 | "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", 97 | "融合": "融合", 98 | "要改的模型信息": "要改的模型資訊", 99 | "要置入的模型信息": "要置入的模型資訊", 100 | "训练": "訓練", 101 | "训练模型": "訓練模型", 102 | "训练特征索引": "訓練特徵索引", 103 | "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", 104 | "请指定说话人id": "請指定說話人id", 105 | "请选择index文件": "请选择index文件", 106 | "请选择pth文件": "请选择pth文件", 107 | "请选择说话人id": "請選擇說話人ID", 108 | "转换": "轉換", 109 | "输入实验名": "輸入實驗名稱", 110 | "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", 111 | "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", 112 | "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", 113 | "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", 114 | "输入监听": "输入监听", 115 | "输入训练文件夹路径": "輸入訓練檔案夾路徑", 116 | "输入设备": "輸入設備", 117 | "输入降噪": "輸入降噪", 118 | "输出信息": "輸出訊息", 119 | "输出变声": "输出变声", 120 | "输出设备": "輸出設備", 121 | "输出降噪": "輸出降噪", 122 | "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", 123 | "选择.index文件": "選擇 .index 檔案", 124 | "选择.pth文件": "選擇 .pth 檔案", 125 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", 126 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", 127 | "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", 128 | "采样率:": "采样率:", 129 | "采样长度": "取樣長度", 130 | "重载设备列表": "重載設備列表", 131 | "音调设置": "音調設定", 132 | "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)", 133 | "音高算法": "音高演算法", 134 | "额外推理时长": "額外推理時長" 135 | } 136 | -------------------------------------------------------------------------------- /tools/i18n/locale/zh_SG.json: -------------------------------------------------------------------------------- 1 | { 2 | ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音", 3 | "A模型权重": "A模型權重", 4 | "A模型路径": "A模型路徑", 5 | "B模型路径": "B模型路徑", 6 | "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", 7 | "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", 8 | "Index Rate": "Index Rate", 9 | "Onnx导出": "Onnx导出", 10 | "Onnx输出路径": "Onnx输出路径", 11 | "RVC模型路径": "RVC模型路径", 12 | "ckpt处理": "ckpt處理", 13 | "harvest进程数": "harvest進程數", 14 | "index文件路径不可包含中文": "index文件路径不可包含中文", 15 | "pth文件路径不可包含中文": "pth文件路径不可包含中文", 16 | "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", 17 | "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. 
": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", 18 | "step1:正在处理数据": "step1:正在处理数据", 19 | "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", 20 | "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", 21 | "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", 22 | "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引", 23 | "step3a:正在训练模型": "step3a:正在训练模型", 24 | "一键训练": "一鍵訓練", 25 | "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", 26 | "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", 27 | "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", 28 | "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", 29 | "使用模型采样率": "使用模型采样率", 30 | "使用设备采样率": "使用设备采样率", 31 | "保存名": "儲存名", 32 | "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", 33 | "保存的模型名不带后缀": "儲存的模型名不帶副檔名", 34 | "保存频率save_every_epoch": "保存頻率save_every_epoch", 35 | "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果", 36 | "修改": "修改", 37 | "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", 38 | "停止音频转换": "停止音訊轉換", 39 | "全流程结束!": "全流程结束!", 40 | "刷新音色列表和索引路径": "刷新音色列表和索引路徑", 41 | "加载模型": "載入模型", 42 | "加载预训练底模D路径": "加載預訓練底模D路徑", 43 | "加载预训练底模G路径": "加載預訓練底模G路徑", 44 | "单次推理": "单次推理", 45 | "卸载音色省显存": "卸載音色節省 VRAM", 46 | "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", 47 | "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", 48 | "否": "否", 49 | "启用相位声码器": "启用相位声码器", 50 | "响应阈值": "響應閾值", 51 | "响度因子": "響度因子", 52 | "处理数据": "處理資料", 53 | "导出Onnx模型": "导出Onnx模型", 54 | "导出文件格式": "導出檔格式", 55 | "常见问题解答": "常見問題解答", 56 | "常规设置": "一般設定", 57 | "开始音频转换": "開始音訊轉換", 58 | "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", 59 | "性能设置": "效能設定", 60 | "总训练轮数total_epoch": "總訓練輪數total_epoch", 61 | "批量推理": "批量推理", 62 | "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", 63 | "指定输出主人声文件夹": "指定输出主人声文件夹", 64 | "指定输出文件夹": "指定輸出資料夾", 65 | "指定输出非主人声文件夹": "指定输出非主人声文件夹", 66 | "推理时间(ms):": "推理時間(ms):", 67 | "推理音色": "推理音色", 68 | "提取": "提取", 69 | "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數", 70 | "是": "是", 71 | "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間", 72 | "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾", 73 | "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度", 74 | "显卡信息": "顯示卡資訊", 75 | "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", 76 | "查看": "查看", 77 | "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)", 78 | "检索特征占比": "檢索特徵佔比", 79 | "模型": "模型", 80 | "模型推理": "模型推理", 81 | "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況", 82 | "模型是否带音高指导": "模型是否帶音高指導", 83 | "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)", 84 | "模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否", 85 | "模型版本型号": "模型版本型號", 86 | "模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合", 87 | "模型路径": "模型路徑", 88 | "每张显卡的batch_size": "每张显卡的batch_size", 89 | "淡入淡出长度": "淡入淡出長度", 90 | "版本": "版本", 91 | "特征提取": "特徵提取", 92 | "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", 93 | "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", 94 | "目标采样率": "目標取樣率", 95 | "算法延迟(ms):": "算法延迟(ms):", 96 | "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", 97 | "融合": "融合", 98 | "要改的模型信息": "要改的模型資訊", 99 | "要置入的模型信息": "要置入的模型資訊", 100 | "训练": "訓練", 101 | "训练模型": "訓練模型", 102 | "训练特征索引": "訓練特徵索引", 103 | "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", 104 | "请指定说话人id": "請指定說話人id", 105 | "请选择index文件": "请选择index文件", 106 | "请选择pth文件": "请选择pth文件", 107 | "请选择说话人id": "請選擇說話人ID", 108 | "转换": "轉換", 109 | "输入实验名": "輸入實驗名稱", 110 | "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", 111 | "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", 112 | "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", 113 | "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", 114 | "输入监听": "输入监听", 115 | "输入训练文件夹路径": "輸入訓練檔案夾路徑", 116 | "输入设备": "輸入設備", 117 | "输入降噪": "輸入降噪", 118 | "输出信息": "輸出訊息", 119 | "输出变声": "输出变声", 120 | "输出设备": "輸出設備", 121 | "输出降噪": "輸出降噪", 122 | "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", 123 | "选择.index文件": "選擇 .index 檔案", 124 | "选择.pth文件": "選擇 .pth 檔案", 125 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", 126 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", 127 | "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", 128 | "采样率:": "采样率:", 129 | "采样长度": "取樣長度", 130 | "重载设备列表": "重載設備列表", 131 | "音调设置": "音調設定", 132 | "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)", 133 | "音高算法": "音高演算法", 134 | "额外推理时长": "額外推理時長" 135 | } 136 | -------------------------------------------------------------------------------- /tools/i18n/locale/zh_TW.json: -------------------------------------------------------------------------------- 1 | { 2 | ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音", 3 | "A模型权重": "A模型權重", 4 | "A模型路径": "A模型路徑", 5 | "B模型路径": "B模型路徑", 6 | "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", 7 | "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", 8 | "Index Rate": "Index Rate", 9 | "Onnx导出": "Onnx导出", 10 | "Onnx输出路径": "Onnx输出路径", 11 | "RVC模型路径": "RVC模型路径", 12 | "ckpt处理": "ckpt處理", 13 | "harvest进程数": "harvest進程數", 14 | "index文件路径不可包含中文": "index文件路径不可包含中文", 15 | "pth文件路径不可包含中文": "pth文件路径不可包含中文", 16 | "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", 17 | "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. 
": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", 18 | "step1:正在处理数据": "step1:正在处理数据", 19 | "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", 20 | "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", 21 | "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", 22 | "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引", 23 | "step3a:正在训练模型": "step3a:正在训练模型", 24 | "一键训练": "一鍵訓練", 25 | "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", 26 | "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", 27 | "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", 28 | "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", 29 | "使用模型采样率": "使用模型采样率", 30 | "使用设备采样率": "使用设备采样率", 31 | "保存名": "儲存名", 32 | "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", 33 | "保存的模型名不带后缀": "儲存的模型名不帶副檔名", 34 | "保存频率save_every_epoch": "保存頻率save_every_epoch", 35 | "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果", 36 | "修改": "修改", 37 | "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", 38 | "停止音频转换": "停止音訊轉換", 39 | "全流程结束!": "全流程结束!", 40 | "刷新音色列表和索引路径": "刷新音色列表和索引路徑", 41 | "加载模型": "載入模型", 42 | "加载预训练底模D路径": "加載預訓練底模D路徑", 43 | "加载预训练底模G路径": "加載預訓練底模G路徑", 44 | "单次推理": "单次推理", 45 | "卸载音色省显存": "卸載音色節省 VRAM", 46 | "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", 47 | "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", 48 | "否": "否", 49 | "启用相位声码器": "启用相位声码器", 50 | "响应阈值": "響應閾值", 51 | "响度因子": "響度因子", 52 | "处理数据": "處理資料", 53 | "导出Onnx模型": "导出Onnx模型", 54 | "导出文件格式": "導出檔格式", 55 | "常见问题解答": "常見問題解答", 56 | "常规设置": "一般設定", 57 | "开始音频转换": "開始音訊轉換", 58 | "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", 59 | "性能设置": "效能設定", 60 | "总训练轮数total_epoch": "總訓練輪數total_epoch", 61 | "批量推理": "批量推理", 62 | "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", 63 | "指定输出主人声文件夹": "指定输出主人声文件夹", 64 | "指定输出文件夹": "指定輸出資料夾", 65 | "指定输出非主人声文件夹": "指定输出非主人声文件夹", 66 | "推理时间(ms):": "推理時間(ms):", 67 | "推理音色": "推理音色", 68 | "提取": "提取", 69 | "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數", 70 | "是": "是", 71 | "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間", 72 | "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾", 73 | "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度", 74 | "显卡信息": "顯示卡資訊", 75 | "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", 76 | "查看": "查看", 77 | "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)", 78 | "检索特征占比": "檢索特徵佔比", 79 | "模型": "模型", 80 | "模型推理": "模型推理", 81 | "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況", 82 | "模型是否带音高指导": "模型是否帶音高指導", 83 | "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)", 84 | "模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否", 85 | "模型版本型号": "模型版本型號", 86 | "模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合", 87 | "模型路径": "模型路徑", 88 | "每张显卡的batch_size": "每张显卡的batch_size", 89 | "淡入淡出长度": "淡入淡出長度", 90 | "版本": "版本", 91 | "特征提取": "特徵提取", 92 | "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", 93 | "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", 94 | "目标采样率": "目標取樣率", 95 | "算法延迟(ms):": "算法延迟(ms):", 96 | "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", 97 | "融合": "融合", 98 | "要改的模型信息": "要改的模型資訊", 99 | "要置入的模型信息": "要置入的模型資訊", 100 | "训练": "訓練", 101 | "训练模型": "訓練模型", 102 | "训练特征索引": "訓練特徵索引", 103 | "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", 104 | "请指定说话人id": "請指定說話人id", 105 | "请选择index文件": "请选择index文件", 106 | "请选择pth文件": "请选择pth文件", 107 | "请选择说话人id": "請選擇說話人ID", 108 | "转换": "轉換", 109 | "输入实验名": "輸入實驗名稱", 110 | "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", 111 | "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", 112 | "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", 113 | "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", 114 | "输入监听": "输入监听", 115 | "输入训练文件夹路径": "輸入訓練檔案夾路徑", 116 | "输入设备": "輸入設備", 117 | "输入降噪": "輸入降噪", 118 | "输出信息": "輸出訊息", 119 | "输出变声": "输出变声", 120 | "输出设备": "輸出設備", 121 | "输出降噪": "輸出降噪", 122 | "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", 123 | "选择.index文件": "選擇 .index 檔案", 124 | "选择.pth文件": "選擇 .pth 檔案", 125 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", 126 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", 127 | "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", 128 | "采样率:": "采样率:", 129 | "采样长度": "取樣長度", 130 | "重载设备列表": "重載設備列表", 131 | "音调设置": "音調設定", 132 | "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)", 133 | "音高算法": "音高演算法", 134 | "额外推理时长": "額外推理時長" 135 | } 136 | -------------------------------------------------------------------------------- /tools/i18n/locale_diff.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import OrderedDict 4 | 5 | # dir_path = "./i18n/locale" # The path to the i18n locale directory, you can change it to your own path 6 | dir_path = "./tools/srt_slicer/i18n/locale" 7 | # Define the standard file name 8 | standard_file = os.path.join(dir_path, "zh_CN.json") 9 | 10 | # Find all JSON files in the directory 11 | languages = [ 12 | os.path.join(dir_path, f) 13 | for f in os.listdir(dir_path) 14 | if f.endswith(".json") and f != standard_file 15 | ] 16 | 17 | # Load the standard file 18 | with open(standard_file, "r", encoding="utf-8") as f: 19 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 20 | 21 | # Loop through each language file 22 | for lang_file in languages: 23 | # Load the language file 24 | with open(lang_file, "r", encoding="utf-8") as f: 25 | lang_data 
= json.load(f, object_pairs_hook=OrderedDict) 26 | 27 | # Find the difference between the language file and the standard file 28 | diff = set(standard_data.keys()) - set(lang_data.keys()) 29 | 30 | miss = set(lang_data.keys()) - set(standard_data.keys()) 31 | 32 | # Add any missing keys to the language file 33 | for key in diff: 34 | lang_data[key] = standard_data[key] 35 | 36 | # Del any extra keys to the language file 37 | for key in miss: 38 | del lang_data[key] 39 | 40 | # Sort the keys of the language file to match the order of the standard file 41 | lang_data = OrderedDict( 42 | sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0])) 43 | ) 44 | 45 | # Save the updated language file 46 | with open(lang_file, "w", encoding="utf-8") as f: 47 | json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True) 48 | f.write("\n") 49 | -------------------------------------------------------------------------------- /tools/i18n/scan_i18n.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import json 3 | from collections import OrderedDict 4 | import os 5 | 6 | # locale_path = "./i18n/locale" # The path to the i18n locale directory, you can change it to your own path 7 | # scan_list = ["./", 8 | # "GPT_SoVITS/", 9 | # "tools/" 10 | # ] # The path to the directory you want to scan, you can change it to your own path 11 | # scan_subfolders = False # Whether to scan subfolders 12 | 13 | locale_path = "./tools/srt_slicer/i18n/locale" 14 | scan_list = ["./tools/srt_slicer"] # The path to the directory you want to scan, you can change it to your own path 15 | scan_subfolders = True 16 | 17 | special_words_to_keep = { 18 | "auto": "自动判断", 19 | "zh": "中文", 20 | "en": "英文", 21 | "ja": "日文", 22 | "all_zh": "只有中文", 23 | "all_ja": "只有日文", 24 | "auto_cut": "智能切分", 25 | "cut0": "仅凭换行切分", 26 | "cut1": "凑四句一切", 27 | "cut2": "凑50字一切", 28 | "cut3": "按中文句号。切", 29 | "cut4": "按英文句号.切", 30 | "cut5": "按标点符号切", 31 | 32 | } 33 | 34 | 35 | def extract_i18n_strings(node): 36 | i18n_strings = [] 37 | 38 | if ( 39 | isinstance(node, ast.Call) 40 | and isinstance(node.func, ast.Name) 41 | and node.func.id == "i18n" 42 | ): 43 | for arg in node.args: 44 | if isinstance(arg, ast.Str): 45 | i18n_strings.append(arg.s) 46 | 47 | for child_node in ast.iter_child_nodes(node): 48 | i18n_strings.extend(extract_i18n_strings(child_node)) 49 | 50 | return i18n_strings 51 | 52 | strings = [] 53 | 54 | # for each file, parse the code into an AST 55 | # for each AST, extract the i18n strings 56 | def scan_i18n_strings(filename): 57 | with open(filename, "r", encoding="utf-8") as f: 58 | code = f.read() 59 | if "I18nAuto" in code: 60 | tree = ast.parse(code) 61 | i18n_strings = extract_i18n_strings(tree) 62 | print(filename, len(i18n_strings)) 63 | strings.extend(i18n_strings) 64 | 65 | 66 | # scan the directory for all .py files (recursively) 67 | if scan_subfolders: 68 | for folder in scan_list: 69 | for dirpath, dirnames, filenames in os.walk(folder): 70 | for filename in [f for f in filenames if f.endswith(".py")]: 71 | scan_i18n_strings(os.path.join(dirpath, filename)) 72 | else: 73 | for folder in scan_list: 74 | for filename in os.listdir(folder): 75 | if filename.endswith(".py"): 76 | scan_i18n_strings(os.path.join(folder, filename)) 77 | 78 | code_keys = set(strings) 79 | """ 80 | n_i18n.py 81 | gui_v1.py 26 82 | app.py 16 83 | infer-web.py 147 84 | scan_i18n.py 0 85 | i18n.py 0 86 | lib/train/process_ckpt.py 1 87 | """ 88 | print() 89 | 
print("Total unique:", len(code_keys)) 90 | 91 | 92 | standard_file = os.path.join(locale_path, "zh_CN.json") 93 | try: 94 | with open(standard_file, "r", encoding="utf-8") as f: 95 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 96 | standard_keys = set(standard_data.keys()) 97 | except FileNotFoundError: 98 | standard_keys = set() 99 | # Define the standard file name 100 | unused_keys = standard_keys - code_keys 101 | print("Unused keys:", len(unused_keys)) 102 | for unused_key in unused_keys: 103 | print("\t", unused_key) 104 | 105 | missing_keys = code_keys - standard_keys 106 | print("Missing keys:", len(missing_keys)) 107 | for missing_key in missing_keys: 108 | print("\t", missing_key) 109 | 110 | 111 | 112 | code_keys_dict = OrderedDict() 113 | for s in strings: 114 | if s in special_words_to_keep: 115 | code_keys_dict[s] = special_words_to_keep[s] 116 | else: 117 | code_keys_dict[s] = s 118 | 119 | # write back 120 | os.makedirs(locale_path, exist_ok=True) 121 | with open(standard_file, "w", encoding="utf-8") as f: 122 | json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True) 123 | f.write("\n") 124 | -------------------------------------------------------------------------------- /tools/my_utils.py: -------------------------------------------------------------------------------- 1 | import platform,os,traceback 2 | import ffmpeg 3 | import numpy as np 4 | 5 | 6 | def load_audio(file, sr): 7 | try: 8 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 9 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 10 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 11 | file = clean_path(file) # guard against users pasting a path with stray spaces, quotes or line breaks 12 | if not os.path.exists(file): 13 | raise RuntimeError( 14 | "You input a wrong audio path that does not exist, please fix it!" 
15 | ) 16 | out, _ = ( 17 | ffmpeg.input(file, threads=0) 18 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) 19 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 20 | ) 21 | except Exception as e: 22 | traceback.print_exc() 23 | raise RuntimeError(f"Failed to load audio: {e}") 24 | 25 | return np.frombuffer(out, np.float32).flatten() 26 | 27 | 28 | def clean_path(path_str): 29 | if platform.system() == 'Windows': 30 | path_str = path_str.replace('/', '\\') 31 | return path_str.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 32 | -------------------------------------------------------------------------------- /tools/normalize_loudness.py: -------------------------------------------------------------------------------- 1 | import soundfile as sf 2 | import numpy as np 3 | from pyloudnorm import Meter, normalize 4 | import os 5 | 6 | def normalize_loudness(audio_path, target_loudness, target_path): 7 | """ 8 | Normalize the loudness of an audio file to the given target loudness. 9 | 10 | Args: 11 | audio_path (str): Path to the source audio file. 12 | target_loudness (float): Target loudness in LUFS. 13 | target_path (str): Path where the normalized audio is saved. 14 | 15 | Returns: 16 | bool: Whether the normalization succeeded. 17 | """ 18 | try: 19 | # Read the audio file 20 | data, rate = sf.read(audio_path) 21 | 22 | # Create a loudness meter based on ITU-R BS.1770 23 | meter = Meter(rate) # sample rate 24 | 25 | # Measure the integrated loudness of the audio 26 | loudness = meter.integrated_loudness(data) 27 | 28 | # Loudness normalization 29 | normalized_audio = normalize.loudness(data, loudness, target_loudness) 30 | 31 | os.makedirs(os.path.dirname(target_path), exist_ok=True) 32 | # Save the normalized audio file 33 | sf.write(target_path, normalized_audio, rate) 34 | 35 | return True 36 | except Exception as e: 37 | raise e -------------------------------------------------------------------------------- /tools/slice_audio.py: -------------------------------------------------------------------------------- 1 | import os,sys,numpy as np 2 | import traceback 3 | from scipy.io import wavfile 4 | # parent_directory = os.path.dirname(os.path.abspath(__file__)) 5 | # sys.path.append(parent_directory) 6 | from my_utils import load_audio 7 | from slicer2 import Slicer 8 | 9 | def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part): 10 | os.makedirs(opt_root,exist_ok=True) 11 | if os.path.isfile(inp): 12 | input=[inp] 13 | elif os.path.isdir(inp): 14 | input=[os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] 15 | else: 16 | return "输入路径存在但既不是文件也不是文件夹" 17 | slicer = Slicer( 18 | sr=32000, # sample rate of the long audio 19 | threshold= int(threshold), # volume below this value is treated as silence and becomes a candidate cut point 20 | min_length= int(min_length), # minimum length of each segment; a segment that is too short keeps merging with the following ones until it exceeds this value 21 | min_interval= int(min_interval), # minimum interval between cuts 22 | hop_size= int(hop_size), # hop size for computing the volume curve; smaller gives finer resolution at higher cost (finer is not necessarily better) 23 | max_sil_kept= int(max_sil_kept), # maximum silence kept after slicing 24 | ) 25 | _max=float(_max) 26 | alpha=float(alpha) 27 | for inp_path in input[int(i_part)::int(all_part)]: 28 | # print(inp_path) 29 | try: 30 | name = os.path.basename(inp_path) 31 | audio = load_audio(inp_path, 32000) 32 | # print(audio.shape) 33 | for chunk, start, end in slicer.slice(audio): # start and end are frame indices 34 | tmp_max = np.abs(chunk).max() 35 | if(tmp_max>1):chunk/=tmp_max 36 | chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk 37 | wavfile.write( 38 | "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end), 39 | 32000, 40 | # chunk.astype(np.float32), 41 | (chunk * 32767).astype(np.int16), 42 | ) 43 | except Exception: 44 | print(inp_path,"->fail->",traceback.format_exc()) 45 | return "执行完毕,请检查输出文件" 46 | 47 | print(slice(*sys.argv[1:])) 48 | 49 | 
--------------------------------------------------------------------------------
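
A minimal client sketch, for quick reference only. It distills the request shapes exercised by test/Model_Test.py and test/test_Concurrency.py; it is not part of the repository, and it assumes the api.py server is running locally on the default tts_port 5000 with a character such as "Hutao" (the one used in test/test_refer.py) already available. Parameter names are taken directly from the two test scripts.

import requests

# GET form, as in test/test_Concurrency.py: query parameters text, cha, top_k
resp = requests.get("http://127.0.0.1:5000/tts",
                    params={"text": "你好,我是一个测试文本。", "cha": "Hutao", "top_k": 5})
with open("out_get.wav", "wb") as f:
    f.write(resp.content)

# POST form, as in test/Model_Test.py: JSON body with cha_name, text, text_language, ...
body = {
    "cha_name": "Hutao",
    "character_emotion": "default",
    "text": "你好,我是一个测试文本。",
    "text_language": "多语种混合",
    "top_k": 6,
    "top_p": 0.8,
    "temperature": 0.8,
}
resp = requests.post("http://127.0.0.1:5000/tts", json=body)
with open("out_post.wav", "wb") as f:
    f.write(resp.content)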