├── GPT_SoVITS ├── AR │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── bucket_sampler.py │ │ ├── data_module.py │ │ └── dataset.py │ ├── models │ │ ├── __init__.py │ │ ├── t2s_lightning_module.py │ │ ├── t2s_lightning_module_onnx.py │ │ ├── t2s_model.py │ │ ├── t2s_model_onnx.py │ │ └── utils.py │ ├── modules │ │ ├── __init__.py │ │ ├── activation.py │ │ ├── activation_onnx.py │ │ ├── embedding.py │ │ ├── embedding_onnx.py │ │ ├── lr_schedulers.py │ │ ├── optim.py │ │ ├── patched_mha_with_cache.py │ │ ├── patched_mha_with_cache_onnx.py │ │ ├── scaling.py │ │ ├── transformer.py │ │ └── transformer_onnx.py │ ├── text_processing │ │ ├── __init__.py │ │ ├── phonemizer.py │ │ └── symbols.py │ └── utils │ │ ├── __init__.py │ │ ├── initialize.py │ │ └── io.py ├── BigVGAN │ ├── LICENSE │ ├── README.md │ ├── activations.py │ ├── alias_free_activation │ │ ├── cuda │ │ │ ├── __init__.py │ │ │ ├── activation1d.py │ │ │ ├── anti_alias_activation.cpp │ │ │ ├── anti_alias_activation_cuda.cu │ │ │ ├── build │ │ │ │ └── _ │ │ │ ├── compat.h │ │ │ ├── load.py │ │ │ └── type_shim.h │ │ └── torch │ │ │ ├── __init__.py │ │ │ ├── act.py │ │ │ ├── filter.py │ │ │ └── resample.py │ ├── bigvgan.py │ ├── configs │ │ ├── bigvgan_22khz_80band.json │ │ ├── bigvgan_24khz_100band.json │ │ ├── bigvgan_base_22khz_80band.json │ │ ├── bigvgan_base_24khz_100band.json │ │ ├── bigvgan_v2_22khz_80band_256x.json │ │ ├── bigvgan_v2_22khz_80band_fmax8k_256x.json │ │ ├── bigvgan_v2_24khz_100band_256x.json │ │ ├── bigvgan_v2_44khz_128band_256x.json │ │ └── bigvgan_v2_44khz_128band_512x.json │ ├── discriminators.py │ ├── env.py │ ├── incl_licenses │ │ ├── LICENSE_1 │ │ ├── LICENSE_2 │ │ ├── LICENSE_3 │ │ ├── LICENSE_4 │ │ ├── LICENSE_5 │ │ ├── LICENSE_6 │ │ ├── LICENSE_7 │ │ └── LICENSE_8 │ ├── inference.py │ ├── inference_e2e.py │ ├── loss.py │ ├── meldataset.py │ ├── nv-modelcard++ │ │ ├── .gitkeep │ │ ├── bias.md │ │ ├── explainability.md │ │ ├── overview.md │ │ ├── privacy.md │ │ └── safety.md │ ├── requirements.txt │ ├── tests │ │ ├── test_activation.py │ │ ├── test_activation_snake_beta.py │ │ └── test_cuda_vs_torch_model.py │ ├── train.py │ └── utils0.py ├── TTS_infer_pack │ ├── TTS.py │ ├── TextPreprocessor.py │ ├── __init__.py │ └── text_segmentation_method.py ├── configs │ ├── .gitignore │ └── s2.json ├── download.py ├── export_torch_script.py ├── export_torch_script_v3.py ├── f5_tts │ └── model │ │ ├── __init__.py │ │ ├── backbones │ │ ├── README.md │ │ ├── dit.py │ │ ├── mmdit.py │ │ └── unett.py │ │ └── modules.py ├── feature_extractor │ ├── __init__.py │ ├── cnhubert.py │ └── whisper_enc.py ├── inference_cli.py ├── inference_gui.py ├── inference_webui.py ├── inference_webui_fast.py ├── module │ ├── __init__.py │ ├── attentions.py │ ├── attentions_onnx.py │ ├── commons.py │ ├── core_vq.py │ ├── data_utils.py │ ├── losses.py │ ├── mel_processing.py │ ├── models.py │ ├── models_onnx.py │ ├── modules.py │ ├── mrte_model.py │ ├── quantize.py │ └── transforms.py ├── onnx_export.py ├── prepare_datasets │ ├── 1-get-text.py │ ├── 2-get-hubert-wav32k.py │ └── 3-get-semantic.py ├── pretrained_models │ └── .gitignore ├── process_ckpt.py ├── s1_train.py ├── s2_train.py ├── s2_train_v3.py ├── s2_train_v3_lora.py ├── text │ ├── .gitignore │ ├── LangSegmenter │ │ ├── __init__.py │ │ └── langsegmenter.py │ ├── __init__.py │ ├── cantonese.py │ ├── chinese.py │ ├── chinese2.py │ ├── cleaner.py │ ├── cmudict-fast.rep │ ├── cmudict.rep │ ├── en_normalization │ │ └── expend.py │ ├── engdict-hot.rep │ ├── engdict_cache.pickle │ ├── 
english.py │ ├── g2pw │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── g2pw.py │ │ ├── onnx_api.py │ │ ├── polyphonic-fix.rep │ │ ├── polyphonic.pickle │ │ ├── polyphonic.rep │ │ └── utils.py │ ├── ja_userdic │ │ └── userdict.csv │ ├── japanese.py │ ├── korean.py │ ├── namedict_cache.pickle │ ├── opencpop-strict.txt │ ├── symbols.py │ ├── symbols2.py │ ├── tone_sandhi.py │ └── zh_normalization │ │ ├── README.md │ │ ├── __init__.py │ │ ├── char_convert.py │ │ ├── chronology.py │ │ ├── constants.py │ │ ├── num.py │ │ ├── phonecode.py │ │ ├── quantifier.py │ │ └── text_normlization.py └── utils.py ├── LICENSE ├── README.md ├── chat_server.py ├── client-gui ├── .gitignore ├── README.md ├── pyproject.toml └── src │ ├── assets │ ├── icon.png │ └── splash_android.png │ ├── cli.py │ ├── client_gui.py │ ├── client_utils.py │ ├── frame_queue.py │ ├── pickable_session.py │ ├── pysilero.py │ ├── ui.py │ └── utils.py ├── client-requirements.txt ├── client_cli.py ├── config.yaml ├── extra-req.txt ├── extra-req2.txt ├── requirements.txt ├── screen └── img.png ├── tools ├── AP_BWE_main │ ├── 24kto48k │ │ └── readme.txt │ ├── LICENSE │ ├── README.md │ ├── datasets1 │ │ ├── __init__.py │ │ └── dataset.py │ └── models │ │ ├── __init__.py │ │ └── model.py ├── __init__.py ├── asr │ ├── config.py │ ├── fasterwhisper_asr.py │ ├── funasr_asr.py │ └── models │ │ └── .gitignore ├── audio_sr.py ├── cmd-denoise.py ├── denoise-model │ └── .gitignore ├── i18n │ ├── i18n.py │ ├── locale │ │ ├── en_US.json │ │ ├── es_ES.json │ │ ├── fr_FR.json │ │ ├── it_IT.json │ │ ├── ja_JP.json │ │ ├── ko_KR.json │ │ ├── pt_BR.json │ │ ├── ru_RU.json │ │ ├── tr_TR.json │ │ ├── zh_CN.json │ │ ├── zh_HK.json │ │ ├── zh_SG.json │ │ └── zh_TW.json │ └── scan_i18n.py ├── my_utils.py ├── slice_audio.py ├── slicer2.py ├── subfix_webui.py └── uvr5 │ ├── bs_roformer │ ├── __init__.py │ ├── attend.py │ ├── bs_roformer.py │ └── mel_band_roformer.py │ ├── bsroformer.py │ ├── lib │ ├── lib_v5 │ │ ├── dataset.py │ │ ├── layers.py │ │ ├── layers_123812KB.py │ │ ├── layers_123821KB.py │ │ ├── layers_33966KB.py │ │ ├── layers_537227KB.py │ │ ├── layers_537238KB.py │ │ ├── layers_new.py │ │ ├── model_param_init.py │ │ ├── modelparams │ │ │ ├── 1band_sr16000_hl512.json │ │ │ ├── 1band_sr32000_hl512.json │ │ │ ├── 1band_sr33075_hl384.json │ │ │ ├── 1band_sr44100_hl1024.json │ │ │ ├── 1band_sr44100_hl256.json │ │ │ ├── 1band_sr44100_hl512.json │ │ │ ├── 1band_sr44100_hl512_cut.json │ │ │ ├── 2band_32000.json │ │ │ ├── 2band_44100_lofi.json │ │ │ ├── 2band_48000.json │ │ │ ├── 3band_44100.json │ │ │ ├── 3band_44100_mid.json │ │ │ ├── 3band_44100_msb2.json │ │ │ ├── 4band_44100.json │ │ │ ├── 4band_44100_mid.json │ │ │ ├── 4band_44100_msb.json │ │ │ ├── 4band_44100_msb2.json │ │ │ ├── 4band_44100_reverse.json │ │ │ ├── 4band_44100_sw.json │ │ │ ├── 4band_v2.json │ │ │ ├── 4band_v2_sn.json │ │ │ ├── 4band_v3.json │ │ │ └── ensemble.json │ │ ├── nets.py │ │ ├── nets_123812KB.py │ │ ├── nets_123821KB.py │ │ ├── nets_33966KB.py │ │ ├── nets_537227KB.py │ │ ├── nets_537238KB.py │ │ ├── nets_61968KB.py │ │ ├── nets_new.py │ │ └── spec_utils.py │ ├── name_params.json │ └── utils.py │ ├── mdxnet.py │ ├── uvr5_weights │ └── .gitignore │ ├── vr.py │ └── webui.py └── utilss ├── __init__.py ├── __pycache__ ├── __init__.cpython-39.pyc └── sv.cpython-39.pyc └── sv.py /GPT_SoVITS/AR/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/AR/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/AR/data/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/data_module.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | from pytorch_lightning import LightningDataModule 4 | from torch.utils.data import DataLoader 5 | 6 | from AR.data.bucket_sampler import DistributedBucketSampler 7 | from AR.data.dataset import Text2SemanticDataset 8 | 9 | 10 | class Text2SemanticDataModule(LightningDataModule): 11 | def __init__( 12 | self, 13 | config, 14 | train_semantic_path, 15 | train_phoneme_path, 16 | dev_semantic_path=None, 17 | dev_phoneme_path=None, 18 | ): 19 | super().__init__() 20 | self.config = config 21 | self.train_semantic_path = train_semantic_path 22 | self.train_phoneme_path = train_phoneme_path 23 | self.dev_semantic_path = dev_semantic_path 24 | self.dev_phoneme_path = dev_phoneme_path 25 | self.num_workers = self.config["data"]["num_workers"] 26 | 27 | def prepare_data(self): 28 | pass 29 | 30 | def setup(self, stage=None, output_logs=False): 31 | self._train_dataset = Text2SemanticDataset( 32 | phoneme_path=self.train_phoneme_path, 33 | semantic_path=self.train_semantic_path, 34 | max_sec=self.config["data"]["max_sec"], 35 | pad_val=self.config["data"]["pad_val"], 36 | ) 37 | self._dev_dataset = self._train_dataset 38 | # self._dev_dataset = Text2SemanticDataset( 39 | # phoneme_path=self.dev_phoneme_path, 40 | # semantic_path=self.dev_semantic_path, 41 | # max_sample=self.config['data']['max_eval_sample'], 42 | # max_sec=self.config['data']['max_sec'], 43 | # pad_val=self.config['data']['pad_val']) 44 | 45 | def train_dataloader(self): 46 | batch_size = ( 47 | self.config["train"]["batch_size"] // 2 48 | if self.config["train"].get("if_dpo", False) is True 49 | else self.config["train"]["batch_size"] 50 | ) 51 | batch_size = max(min(batch_size, len(self._train_dataset) // 4), 1) # prevent checkpoints from never being saved 52 | sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) 53 | return DataLoader( 54 | self._train_dataset, 55 | batch_size=batch_size, 56 | sampler=sampler, 57 | collate_fn=self._train_dataset.collate, 58 | num_workers=self.num_workers, 59 | persistent_workers=True, 60 | prefetch_factor=16, 61 | ) 62 | 63 | def val_dataloader(self): 64 | return DataLoader( 65 | self._dev_dataset, 66 | batch_size=1, 67 | shuffle=False, 68 | collate_fn=self._train_dataset.collate, 69 | num_workers=max(self.num_workers, 12), 70 | persistent_workers=True, 71 | prefetch_factor=16, 72 | ) 73 | 74 | # Is this ever actually used?
75 | def test_dataloader(self): 76 | return DataLoader( 77 | self._dev_dataset, 78 | batch_size=1, 79 | shuffle=False, 80 | collate_fn=self._train_dataset.collate, 81 | ) 82 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/AR/models/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import os 4 | import sys 5 | 6 | now_dir = os.getcwd() 7 | sys.path.append(now_dir) 8 | from typing import Dict 9 | 10 | import torch 11 | from pytorch_lightning import LightningModule 12 | 13 | from AR.models.t2s_model_onnx import Text2SemanticDecoder 14 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule 15 | from AR.modules.optim import ScaledAdam 16 | 17 | 18 | class Text2SemanticLightningModule(LightningModule): 19 | def __init__(self, config, output_dir, is_train=True): 20 | super().__init__() 21 | self.config = config 22 | self.top_k = 3 23 | self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) 24 | pretrained_s1 = config.get("pretrained_s1") 25 | if pretrained_s1 and is_train: 26 | # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) 27 | print( 28 | self.load_state_dict( 29 | torch.load( 30 | pretrained_s1, 31 | map_location="cpu", 32 | )["weight"], 33 | ), 34 | ) 35 | if is_train: 36 | self.automatic_optimization = False 37 | self.save_hyperparameters() 38 | self.eval_dir = output_dir / "eval" 39 | self.eval_dir.mkdir(parents=True, exist_ok=True) 40 | 41 | def training_step(self, batch: Dict, batch_idx: int): 42 | opt = self.optimizers() 43 | scheduler = self.lr_schedulers() 44 | loss, acc = self.model.forward( 45 | batch["phoneme_ids"], 46 | batch["phoneme_ids_len"], 47 | batch["semantic_ids"], 48 | batch["semantic_ids_len"], 49 | batch["bert_feature"], 50 | ) 51 | self.manual_backward(loss) 52 | if batch_idx > 0 and batch_idx % 4 == 0: 53 | opt.step() 54 | opt.zero_grad() 55 | scheduler.step() 56 | 57 | self.log( 58 | "total_loss", 59 | loss, 60 | on_step=True, 61 | on_epoch=True, 62 | prog_bar=True, 63 | sync_dist=True, 64 | ) 65 | self.log( 66 | "lr", 67 | scheduler.get_last_lr()[0], 68 | on_epoch=True, 69 | prog_bar=True, 70 | sync_dist=True, 71 | ) 72 | self.log( 73 | f"top_{self.top_k}_acc", 74 | acc, 75 | on_step=True, 76 | on_epoch=True, 77 | prog_bar=True, 78 | sync_dist=True, 79 | ) 80 | 81 | def validation_step(self, batch: Dict, batch_idx: int): 82 | return 83 | 84 | def configure_optimizers(self): 85 | model_parameters = self.model.parameters() 86 | parameters_names = [] 87 | parameters_names.append([name_param_pair[0] for name_param_pair in self.model.named_parameters()]) 88 | lm_opt = ScaledAdam( 89 | model_parameters, 90 | lr=0.01, 91 | betas=(0.9, 0.95), 92 | clipping_scale=2.0, 93 | parameters_names=parameters_names, 94 | show_dominant_parameters=False, 95 | clipping_update_period=1000, 96 | ) 97 | 98 | return { 99 | "optimizer": lm_opt, 100 | "lr_scheduler": { 101 | "scheduler": WarmupCosineLRSchedule( 102 | lm_opt, 103 | 
init_lr=self.config["optimizer"]["lr_init"], 104 | peak_lr=self.config["optimizer"]["lr"], 105 | end_lr=self.config["optimizer"]["lr_end"], 106 | warmup_steps=self.config["optimizer"]["warmup_steps"], 107 | total_steps=self.config["optimizer"]["decay_steps"], 108 | ) 109 | }, 110 | } 111 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/AR/modules/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/embedding.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | 50 | self.reverse = False 51 | self.pe = None 52 | self.extend_pe(torch.tensor(0.0).expand(1, 4000)) 53 | 54 | def extend_pe(self, x): 55 | """Reset the positional encodings.""" 56 | if self.pe is not None: 57 | if self.pe.size(1) >= x.size(1): 58 | if self.pe.dtype != x.dtype or self.pe.device != x.device: 59 | self.pe = self.pe.to(dtype=x.dtype, device=x.device) 60 | return 61 | pe = torch.zeros(x.size(1), self.embedding_dim) 62 | if self.reverse: 63 | position = torch.arange(x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1) 64 | else: 65 | position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) 66 | div_term = torch.exp( 67 | torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) * -(math.log(10000.0) / self.embedding_dim) 68 | ) 69 | pe[:, 0::2] = torch.sin(position * div_term) 70 | pe[:, 1::2] = torch.cos(position * div_term) 71 | pe = pe.unsqueeze(0) 72 | self.pe = pe.to(device=x.device, dtype=x.dtype).detach() 73 | 74 | def forward(self, x: torch.Tensor) -> torch.Tensor: 75 | self.extend_pe(x) 76 | output = x.unsqueeze(-1) if x.ndim == 2 else x 77 | output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)] 78 | return self.dropout(output) 79 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/embedding_onnx.py: 
-------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | self.reverse = False 50 | self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim)) 51 | 52 | def extend_pe(self, x): 53 | position = torch.cumsum(torch.ones_like(x[:, :, 0]), dim=1).transpose(0, 1) 54 | scpe = (position * self.div_term).unsqueeze(0) 55 | pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0) 56 | pe = pe.contiguous().view(1, -1, self.embedding_dim) 57 | return pe 58 | 59 | def forward(self, x: torch.Tensor) -> torch.Tensor: 60 | pe = self.extend_pe(x) 61 | output = x.unsqueeze(-1) if x.ndim == 2 else x 62 | output = output * self.x_scale + self.alpha * pe 63 | return self.dropout(output) 64 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/lr_schedulers.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import math 4 | 5 | import torch 6 | from matplotlib import pyplot as plt 7 | from torch import nn 8 | from torch.optim import Adam 9 | 10 | 11 | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): 12 | """ 13 | Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers. 
14 | """ 15 | 16 | def __init__( 17 | self, 18 | optimizer, 19 | init_lr, 20 | peak_lr, 21 | end_lr, 22 | warmup_steps=10000, 23 | total_steps=400000, 24 | current_step=0, 25 | ): 26 | self.init_lr = init_lr 27 | self.peak_lr = peak_lr 28 | self.end_lr = end_lr 29 | self.optimizer = optimizer 30 | self._warmup_rate = (peak_lr - init_lr) / warmup_steps 31 | self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps) 32 | self._current_step = current_step 33 | self.lr = init_lr 34 | self.warmup_steps = warmup_steps 35 | self.total_steps = total_steps 36 | self._last_lr = [self.lr] 37 | 38 | def set_lr(self, lr): 39 | self._last_lr = [g["lr"] for g in self.optimizer.param_groups] 40 | for g in self.optimizer.param_groups: 41 | # g['lr'] = lr 42 | g["lr"] = self.end_lr ###锁定用线性 43 | 44 | def step(self): 45 | if self._current_step < self.warmup_steps: 46 | lr = self.init_lr + self._warmup_rate * self._current_step 47 | 48 | elif self._current_step > self.total_steps: 49 | lr = self.end_lr 50 | 51 | else: 52 | decay_ratio = (self._current_step - self.warmup_steps) / (self.total_steps - self.warmup_steps) 53 | if decay_ratio < 0.0 or decay_ratio > 1.0: 54 | raise RuntimeError("Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings.") 55 | coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) 56 | lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) 57 | 58 | self.lr = lr = self.end_lr = 0.002 ###锁定用线性###不听话,直接锁定! 59 | self.set_lr(lr) 60 | self.lr = lr 61 | self._current_step += 1 62 | return self.lr 63 | 64 | 65 | if __name__ == "__main__": 66 | m = nn.Linear(10, 10) 67 | opt = Adam(m.parameters(), lr=1e-4) 68 | s = WarmupCosineLRSchedule( 69 | opt, 70 | 1e-6, 71 | 2e-4, 72 | 1e-6, 73 | warmup_steps=2000, 74 | total_steps=20000, 75 | current_step=0, 76 | ) 77 | lrs = [] 78 | for i in range(25000): 79 | s.step() 80 | lrs.append(s.lr) 81 | print(s.lr) 82 | 83 | plt.plot(lrs) 84 | plt.plot(range(0, 25000), lrs) 85 | plt.show() 86 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py: -------------------------------------------------------------------------------- 1 | from torch.nn.functional import * 2 | from torch.nn.functional import ( 3 | _canonical_mask, 4 | ) 5 | 6 | 7 | def multi_head_attention_forward_patched( 8 | query, 9 | key, 10 | value, 11 | embed_dim_to_check: int, 12 | num_heads: int, 13 | in_proj_weight, 14 | in_proj_bias: Optional[Tensor], 15 | bias_k: Optional[Tensor], 16 | bias_v: Optional[Tensor], 17 | add_zero_attn: bool, 18 | dropout_p: float, 19 | out_proj_weight: Tensor, 20 | out_proj_bias: Optional[Tensor], 21 | training: bool = True, 22 | key_padding_mask: Optional[Tensor] = None, 23 | need_weights: bool = True, 24 | attn_mask: Optional[Tensor] = None, 25 | use_separate_proj_weight: bool = False, 26 | q_proj_weight: Optional[Tensor] = None, 27 | k_proj_weight: Optional[Tensor] = None, 28 | v_proj_weight: Optional[Tensor] = None, 29 | static_k: Optional[Tensor] = None, 30 | static_v: Optional[Tensor] = None, 31 | average_attn_weights: bool = True, 32 | is_causal: bool = False, 33 | cache=None, 34 | ) -> Tuple[Tensor, Optional[Tensor]]: 35 | # set up shape vars 36 | _, _, embed_dim = query.shape 37 | attn_mask = _canonical_mask( 38 | mask=attn_mask, 39 | mask_name="attn_mask", 40 | other_type=None, 41 | other_name="", 42 | target_type=query.dtype, 43 | check_other=False, 44 | ) 45 | head_dim = embed_dim // num_heads 46 | 47 | proj_qkv = linear(query, in_proj_weight, 
in_proj_bias) 48 | proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() 49 | q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2] 50 | 51 | if cache["first_infer"] == 1: 52 | cache["k"][cache["stage"]] = k 53 | cache["v"][cache["stage"]] = v 54 | else: 55 | cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0) 56 | cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0) 57 | k = cache["k"][cache["stage"]] 58 | v = cache["v"][cache["stage"]] 59 | cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] 60 | 61 | attn_mask = _canonical_mask( 62 | mask=attn_mask, 63 | mask_name="attn_mask", 64 | other_type=None, 65 | other_name="", 66 | target_type=q.dtype, 67 | check_other=False, 68 | ) 69 | attn_mask = attn_mask.unsqueeze(0) 70 | 71 | q = q.view(-1, num_heads, head_dim).transpose(0, 1) 72 | k = k.view(-1, num_heads, head_dim).transpose(0, 1) 73 | v = v.view(-1, num_heads, head_dim).transpose(0, 1) 74 | 75 | dropout_p = 0.0 76 | attn_mask = attn_mask.unsqueeze(0) 77 | q = q.view(num_heads, -1, head_dim).unsqueeze(0) 78 | k = k.view(num_heads, -1, head_dim).unsqueeze(0) 79 | v = v.view(num_heads, -1, head_dim).unsqueeze(0) 80 | attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal) 81 | attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) 82 | attn_output = linear(attn_output, out_proj_weight, out_proj_bias) 83 | attn_output = attn_output.view(-1, 1, attn_output.size(1)) 84 | 85 | return attn_output 86 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/AR/text_processing/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/phonemizer.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import itertools 4 | import re 5 | from typing import Dict 6 | from typing import List 7 | 8 | import regex 9 | from gruut import sentences 10 | from gruut.const import Sentence 11 | from gruut.const import Word 12 | from AR.text_processing.symbols import SYMBOL_TO_ID 13 | 14 | 15 | class GruutPhonemizer: 16 | def __init__(self, language: str): 17 | self._phonemizer = sentences 18 | self.lang = language 19 | self.symbol_to_id = SYMBOL_TO_ID 20 | self._special_cases_dict: Dict[str] = { 21 | r"\.\.\.": "... ", 22 | ";": "; ", 23 | ":": ": ", 24 | ",": ", ", 25 | r"\.": ". ", 26 | "!": "! ", 27 | r"\?": "? 
", 28 | "—": "—", 29 | "…": "… ", 30 | "«": "«", 31 | "»": "»", 32 | } 33 | self._punctuation_regexp: str = rf"([{''.join(self._special_cases_dict.keys())}])" 34 | 35 | def _normalize_punctuation(self, text: str) -> str: 36 | text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) 37 | text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text) 38 | text = regex.sub(r"\pZ+", r" ", text) 39 | return text.strip() 40 | 41 | def _convert_punctuation(self, word: Word) -> str: 42 | if not word.phonemes: 43 | return "" 44 | if word.phonemes[0] in ["‖", "|"]: 45 | return word.text.strip() 46 | 47 | phonemes = "".join(word.phonemes) 48 | # remove modifier characters ˈˌː with regex 49 | phonemes = re.sub(r"[ˈˌː͡]", "", phonemes) 50 | return phonemes.strip() 51 | 52 | def phonemize(self, text: str, espeak: bool = False) -> str: 53 | text_to_phonemize: str = self._normalize_punctuation(text) 54 | sents: List[Sentence] = [sent for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak)] 55 | words: List[str] = [self._convert_punctuation(word) for word in itertools.chain(*sents)] 56 | return " ".join(words) 57 | 58 | def transform(self, phonemes): 59 | # convert phonemes to ids 60 | # dictionary is in symbols.py 61 | return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()] 62 | 63 | 64 | if __name__ == "__main__": 65 | phonemizer = GruutPhonemizer("en-us") 66 | # text -> IPA 67 | phonemes = phonemizer.phonemize("Hello, wor-ld ?") 68 | print("phonemes:", phonemes) 69 | print("len(phonemes):", len(phonemes)) 70 | phoneme_ids = phonemizer.transform(phonemes) 71 | print("phoneme_ids:", phoneme_ids) 72 | print("len(phoneme_ids):", len(phoneme_ids)) 73 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/symbols.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | PAD = "_" 4 | PUNCTUATION = ';:,.!?¡¿—…"«»“” ' 5 | LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 6 | IPA_LETTERS = ( 7 | "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 8 | ) 9 | SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) 10 | SPACE_ID = SYMBOLS.index(" ") 11 | SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)} 12 | ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)} 13 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def str2bool(str): 5 | return True if str.lower() == "true" else False 6 | 7 | 8 | def get_newest_ckpt(string_list): 9 | # 定义一个正则表达式模式,用于匹配字符串中的数字 10 | pattern = r"epoch=(\d+)-step=(\d+)\.ckpt" 11 | 12 | # 使用正则表达式提取每个字符串中的数字信息,并创建一个包含元组的列表 13 | extracted_info = [] 14 | for string in string_list: 15 | match = re.match(pattern, string) 16 | if match: 17 | epoch = int(match.group(1)) 18 | step = int(match.group(2)) 19 | extracted_info.append((epoch, step, string)) 20 | # 按照 epoch 后面的数字和 step 后面的数字进行排序 21 | sorted_info = sorted(extracted_info, key=lambda x: (x[0], x[1]), reverse=True) 22 | # 获取最新的 ckpt 文件名 23 | newest_ckpt = sorted_info[0][2] 24 | return newest_ckpt 25 | 26 | 27 | # 文本存在且不为空时 return True 28 | def 
check_txt_file(file_path): 29 | try: 30 | with open(file_path, "r") as file: 31 | text = file.readline().strip() 32 | assert text.strip() != "" 33 | return text 34 | except Exception: 35 | return False 36 | return False 37 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/initialize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Initialize modules for espnet2 neural networks.""" 3 | 4 | import torch 5 | from typeguard import check_argument_types 6 | 7 | 8 | def initialize(model: torch.nn.Module, init: str): 9 | """Initialize weights of a neural network module. 10 | 11 | Parameters are initialized using the given method or distribution. 12 | 13 | Custom initialization routines can be implemented into submodules 14 | as function `espnet_initialization_fn` within the custom module. 15 | 16 | Args: 17 | model: Target. 18 | init: Method of initialization. 19 | """ 20 | assert check_argument_types() 21 | print("init with", init) 22 | 23 | # weight init 24 | for p in model.parameters(): 25 | if p.dim() > 1: 26 | if init == "xavier_uniform": 27 | torch.nn.init.xavier_uniform_(p.data) 28 | elif init == "xavier_normal": 29 | torch.nn.init.xavier_normal_(p.data) 30 | elif init == "kaiming_uniform": 31 | torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu") 32 | elif init == "kaiming_normal": 33 | torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu") 34 | else: 35 | raise ValueError("Unknown initialization: " + init) 36 | # bias init 37 | for name, p in model.named_parameters(): 38 | if ".bias" in name and p.dim() == 1: 39 | p.data.zero_() 40 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/io.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | import yaml 5 | 6 | 7 | def load_yaml_config(path): 8 | with open(path) as f: 9 | config = yaml.full_load(f) 10 | return config 11 | 12 | 13 | def save_config_to_yaml(config, path): 14 | assert path.endswith(".yaml") 15 | with open(path, "w") as f: 16 | f.write(yaml.dump(config)) 17 | f.close() 18 | 19 | 20 | def write_args(args, path): 21 | args_dict = dict((name, getattr(args, name)) for name in dir(args) if not name.startswith("_")) 22 | with open(path, "a") as args_file: 23 | args_file.write("==> torch version: {}\n".format(torch.__version__)) 24 | args_file.write("==> cudnn version: {}\n".format(torch.backends.cudnn.version())) 25 | args_file.write("==> Cmd:\n") 26 | args_file.write(str(sys.argv)) 27 | args_file.write("\n==> args:\n") 28 | for k, v in sorted(args_dict.items()): 29 | args_file.write(" %s: %s\n" % (str(k), str(v))) 30 | args_file.close() 31 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 NVIDIA CORPORATION. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import torch 5 | import torch.nn as nn 6 | from alias_free_activation.torch.resample import UpSample1d, DownSample1d 7 | 8 | # load fused CUDA kernel: this enables importing anti_alias_activation_cuda 9 | from alias_free_activation.cuda import load 10 | 11 | anti_alias_activation_cuda = load.load() 12 | 13 | 14 | class FusedAntiAliasActivation(torch.autograd.Function): 15 | """ 16 | Assumes filter size 12, replication padding on upsampling/downsampling, and logscale alpha/beta parameters as inputs. 17 | The hyperparameters are hard-coded in the kernel to maximize speed. 18 | NOTE: The fused kernel is incorrect for Activation1d with different hyperparameters.
19 | """ 20 | 21 | @staticmethod 22 | def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta): 23 | activation_results = anti_alias_activation_cuda.forward(inputs, up_ftr, down_ftr, alpha, beta) 24 | 25 | return activation_results 26 | 27 | @staticmethod 28 | def backward(ctx, output_grads): 29 | raise NotImplementedError 30 | return output_grads, None, None 31 | 32 | 33 | class Activation1d(nn.Module): 34 | def __init__( 35 | self, 36 | activation, 37 | up_ratio: int = 2, 38 | down_ratio: int = 2, 39 | up_kernel_size: int = 12, 40 | down_kernel_size: int = 12, 41 | fused: bool = True, 42 | ): 43 | super().__init__() 44 | self.up_ratio = up_ratio 45 | self.down_ratio = down_ratio 46 | self.act = activation 47 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 48 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 49 | 50 | self.fused = fused # Whether to use fused CUDA kernel or not 51 | 52 | def forward(self, x): 53 | if not self.fused: 54 | x = self.upsample(x) 55 | x = self.act(x) 56 | x = self.downsample(x) 57 | return x 58 | else: 59 | if self.act.__class__.__name__ == "Snake": 60 | beta = self.act.alpha.data # Snake uses same params for alpha and beta 61 | else: 62 | beta = self.act.beta.data # Snakebeta uses different params for alpha and beta 63 | alpha = self.act.alpha.data 64 | if not self.act.alpha_logscale: # Exp baked into cuda kernel, cancel it out with a log 65 | alpha = torch.log(alpha) 66 | beta = torch.log(beta) 67 | 68 | x = FusedAntiAliasActivation.apply(x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta) 69 | return x 70 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | 19 | extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta); 20 | 21 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 22 | m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)"); 23 | } -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/build/_: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied from NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. */ 20 | 21 | #ifndef TORCH_CHECK 22 | #define TORCH_CHECK AT_CHECK 23 | #endif 24 | 25 | #ifdef VERSION_GE_1_3 26 | #define DATA_PTR data_ptr 27 | #else 28 | #define DATA_PTR data 29 | #endif 30 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import os 5 | import pathlib 6 | import subprocess 7 | 8 | from torch.utils import cpp_extension 9 | 10 | """ 11 | Setting this param to a list has a problem of generating different compilation commands (with a different order of architectures) and leading to recompilation of fused kernels. 12 | Set it to an empty string to avoid recompilation and assign arch flags explicitly in extra_cuda_cflags below 13 | """ 14 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 15 | 16 | 17 | def load(): 18 | # Check if cuda 11 is installed for compute capability 8.0 19 | cc_flag = [] 20 | _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 21 | if int(bare_metal_major) >= 11: 22 | cc_flag.append("-gencode") 23 | cc_flag.append("arch=compute_80,code=sm_80") 24 | 25 | # Build path 26 | srcpath = pathlib.Path(__file__).parent.absolute() 27 | buildpath = srcpath / "build" 28 | _create_build_dir(buildpath) 29 | 30 | # Helper function to build the kernels.
31 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags): 32 | return cpp_extension.load( 33 | name=name, 34 | sources=sources, 35 | build_directory=buildpath, 36 | extra_cflags=[ 37 | "-O3", 38 | ], 39 | extra_cuda_cflags=[ 40 | "-O3", 41 | "-gencode", 42 | "arch=compute_70,code=sm_70", 43 | "--use_fast_math", 44 | ] 45 | + extra_cuda_flags 46 | + cc_flag, 47 | verbose=True, 48 | ) 49 | 50 | extra_cuda_flags = [ 51 | "-U__CUDA_NO_HALF_OPERATORS__", 52 | "-U__CUDA_NO_HALF_CONVERSIONS__", 53 | "--expt-relaxed-constexpr", 54 | "--expt-extended-lambda", 55 | ] 56 | 57 | sources = [ 58 | srcpath / "anti_alias_activation.cpp", 59 | srcpath / "anti_alias_activation_cuda.cu", 60 | ] 61 | anti_alias_activation_cuda = _cpp_extention_load_helper("anti_alias_activation_cuda", sources, extra_cuda_flags) 62 | 63 | return anti_alias_activation_cuda 64 | 65 | 66 | def _get_cuda_bare_metal_version(cuda_dir): 67 | raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) 68 | output = raw_output.split() 69 | release_idx = output.index("release") + 1 70 | release = output[release_idx].split(".") 71 | bare_metal_major = release[0] 72 | bare_metal_minor = release[1][0] 73 | 74 | return raw_output, bare_metal_major, bare_metal_minor 75 | 76 | 77 | def _create_build_dir(buildpath): 78 | try: 79 | os.mkdir(buildpath) 80 | except OSError: 81 | if not os.path.isdir(buildpath): 82 | print(f"Creation of the build directory {buildpath} failed") 83 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/torch/__init__.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from .filter import * 5 | from .resample import * 6 | from .act import * 7 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/torch/act.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from .resample import UpSample1d, DownSample1d 6 | 7 | 8 | class Activation1d(nn.Module): 9 | def __init__( 10 | self, 11 | activation, 12 | up_ratio: int = 2, 13 | down_ratio: int = 2, 14 | up_kernel_size: int = 12, 15 | down_kernel_size: int = 12, 16 | ): 17 | super().__init__() 18 | self.up_ratio = up_ratio 19 | self.down_ratio = down_ratio 20 | self.act = activation 21 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 22 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 23 | 24 | # x: [B,C,T] 25 | def forward(self, x): 26 | x = self.upsample(x) 27 | x = self.act(x) 28 | x = self.downsample(x) 29 | 30 | return x 31 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import math 8 | 9 | if "sinc" in dir(torch): 10 | sinc = torch.sinc 11 | else: 12 | # This code is adopted from adefossez's julius.core.sinc under the MIT License 13 | # https://adefossez.github.io/julius/julius/core.html 14 | # LICENSE is in incl_licenses directory. 15 | def sinc(x: torch.Tensor): 16 | """ 17 | Implementation of sinc, i.e. sin(pi * x) / (pi * x) 18 | __Warning__: Different to julius.sinc, the input is multiplied by `pi`! 19 | """ 20 | return torch.where( 21 | x == 0, 22 | torch.tensor(1.0, device=x.device, dtype=x.dtype), 23 | torch.sin(math.pi * x) / math.pi / x, 24 | ) 25 | 26 | 27 | # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License 28 | # https://adefossez.github.io/julius/julius/lowpass.html 29 | # LICENSE is in incl_licenses directory. 30 | def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] 31 | even = kernel_size % 2 == 0 32 | half_size = kernel_size // 2 33 | 34 | # For kaiser window 35 | delta_f = 4 * half_width 36 | A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 37 | if A > 50.0: 38 | beta = 0.1102 * (A - 8.7) 39 | elif A >= 21.0: 40 | beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0) 41 | else: 42 | beta = 0.0 43 | window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) 44 | 45 | # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio 46 | if even: 47 | time = torch.arange(-half_size, half_size) + 0.5 48 | else: 49 | time = torch.arange(kernel_size) - half_size 50 | if cutoff == 0: 51 | filter_ = torch.zeros_like(time) 52 | else: 53 | filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) 54 | """ 55 | Normalize filter to have sum = 1, otherwise we will have a small leakage of the constant component in the input signal. 56 | """ 57 | filter_ /= filter_.sum() 58 | filter = filter_.view(1, 1, kernel_size) 59 | 60 | return filter 61 | 62 | 63 | class LowPassFilter1d(nn.Module): 64 | def __init__( 65 | self, 66 | cutoff=0.5, 67 | half_width=0.6, 68 | stride: int = 1, 69 | padding: bool = True, 70 | padding_mode: str = "replicate", 71 | kernel_size: int = 12, 72 | ): 73 | """ 74 | kernel_size should be even number for stylegan3 setup, in this implementation, odd number is also possible. 
75 | """ 76 | super().__init__() 77 | if cutoff < -0.0: 78 | raise ValueError("Minimum cutoff must be larger than zero.") 79 | if cutoff > 0.5: 80 | raise ValueError("A cutoff above 0.5 does not make sense.") 81 | self.kernel_size = kernel_size 82 | self.even = kernel_size % 2 == 0 83 | self.pad_left = kernel_size // 2 - int(self.even) 84 | self.pad_right = kernel_size // 2 85 | self.stride = stride 86 | self.padding = padding 87 | self.padding_mode = padding_mode 88 | filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) 89 | self.register_buffer("filter", filter) 90 | 91 | # Input [B, C, T] 92 | def forward(self, x): 93 | _, C, _ = x.shape 94 | 95 | if self.padding: 96 | x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode) 97 | out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) 98 | 99 | return out 100 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | from .filter import LowPassFilter1d 7 | from .filter import kaiser_sinc_filter1d 8 | 9 | 10 | class UpSample1d(nn.Module): 11 | def __init__(self, ratio=2, kernel_size=None): 12 | super().__init__() 13 | self.ratio = ratio 14 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 15 | self.stride = ratio 16 | self.pad = self.kernel_size // ratio - 1 17 | self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 18 | self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 19 | filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size) 20 | self.register_buffer("filter", filter) 21 | 22 | # x: [B, C, T] 23 | def forward(self, x): 24 | _, C, _ = x.shape 25 | 26 | x = F.pad(x, (self.pad, self.pad), mode="replicate") 27 | x = self.ratio * F.conv_transpose1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) 28 | x = x[..., self.pad_left : -self.pad_right] 29 | 30 | return x 31 | 32 | 33 | class DownSample1d(nn.Module): 34 | def __init__(self, ratio=2, kernel_size=None): 35 | super().__init__() 36 | self.ratio = ratio 37 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 38 | self.lowpass = LowPassFilter1d( 39 | cutoff=0.5 / ratio, 40 | half_width=0.6 / ratio, 41 | stride=ratio, 42 | kernel_size=self.kernel_size, 43 | ) 44 | 45 | def forward(self, x): 46 | xx = self.lowpass(x) 47 | 48 | return xx 49 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_22khz_80band.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "activation": "snakebeta", 18 | "snake_logscale": true, 19 | 20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], 
[512, 50, 240]], 21 | "mpd_reshapes": [2, 3, 5, 7, 11], 22 | "use_spectral_norm": false, 23 | "discriminator_channel_mult": 1, 24 | 25 | "segment_size": 8192, 26 | "num_mels": 80, 27 | "num_freq": 1025, 28 | "n_fft": 1024, 29 | "hop_size": 256, 30 | "win_size": 1024, 31 | 32 | "sampling_rate": 22050, 33 | 34 | "fmin": 0, 35 | "fmax": 8000, 36 | "fmax_for_loss": null, 37 | 38 | "num_workers": 4, 39 | 40 | "dist_config": { 41 | "dist_backend": "nccl", 42 | "dist_url": "tcp://localhost:54321", 43 | "world_size": 1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_24khz_100band.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "activation": "snakebeta", 18 | "snake_logscale": true, 19 | 20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]], 21 | "mpd_reshapes": [2, 3, 5, 7, 11], 22 | "use_spectral_norm": false, 23 | "discriminator_channel_mult": 1, 24 | 25 | "segment_size": 8192, 26 | "num_mels": 100, 27 | "num_freq": 1025, 28 | "n_fft": 1024, 29 | "hop_size": 256, 30 | "win_size": 1024, 31 | 32 | "sampling_rate": 24000, 33 | 34 | "fmin": 0, 35 | "fmax": 12000, 36 | "fmax_for_loss": null, 37 | 38 | "num_workers": 4, 39 | 40 | "dist_config": { 41 | "dist_backend": "nccl", 42 | "dist_url": "tcp://localhost:54321", 43 | "world_size": 1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_base_22khz_80band.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,8,2,2], 12 | "upsample_kernel_sizes": [16,16,4,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "activation": "snakebeta", 18 | "snake_logscale": true, 19 | 20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]], 21 | "mpd_reshapes": [2, 3, 5, 7, 11], 22 | "use_spectral_norm": false, 23 | "discriminator_channel_mult": 1, 24 | 25 | "segment_size": 8192, 26 | "num_mels": 80, 27 | "num_freq": 1025, 28 | "n_fft": 1024, 29 | "hop_size": 256, 30 | "win_size": 1024, 31 | 32 | "sampling_rate": 22050, 33 | 34 | "fmin": 0, 35 | "fmax": 8000, 36 | "fmax_for_loss": null, 37 | 38 | "num_workers": 4, 39 | 40 | "dist_config": { 41 | "dist_backend": "nccl", 42 | "dist_url": "tcp://localhost:54321", 43 | "world_size": 1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_base_24khz_100band.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | 
"upsample_rates": [8,8,2,2], 12 | "upsample_kernel_sizes": [16,16,4,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "activation": "snakebeta", 18 | "snake_logscale": true, 19 | 20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]], 21 | "mpd_reshapes": [2, 3, 5, 7, 11], 22 | "use_spectral_norm": false, 23 | "discriminator_channel_mult": 1, 24 | 25 | "segment_size": 8192, 26 | "num_mels": 100, 27 | "num_freq": 1025, 28 | "n_fft": 1024, 29 | "hop_size": 256, 30 | "win_size": 1024, 31 | 32 | "sampling_rate": 24000, 33 | 34 | "fmin": 0, 35 | "fmax": 12000, 36 | "fmax_for_loss": null, 37 | 38 | "num_workers": 4, 39 | 40 | "dist_config": { 41 | "dist_backend": "nccl", 42 | "dist_url": "tcp://localhost:54321", 43 | "world_size": 1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_22khz_80band_256x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 80, 43 | "num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 22050, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_22khz_80band_fmax8k_256x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | 
"cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 80, 43 | "num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 22050, 49 | 50 | "fmin": 0, 51 | "fmax": 8000, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_24khz_100band_256x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 100, 43 | "num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 24000, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_44khz_128band_256x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 
1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 128, 43 | "num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 44100, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_44khz_128band_512x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,4,2,2,2,2], 12 | "upsample_kernel_sizes": [16,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 128, 43 | "num_freq": 2049, 44 | "n_fft": 2048, 45 | "hop_size": 512, 46 | "win_size": 2048, 47 | 48 | "sampling_rate": 44100, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/env.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 
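
The BigVGAN configs above all share the same schema, and two invariants tie the generator to its mel front end: the upsample_rates multiply out to hop_size (256 in every variant shown, 512 for the 44 kHz "512x" config), and num_freq equals n_fft/2 + 1. The snippet below is a minimal sketch, not part of the repository, of how such a config is typically consumed; it inlines a copy of the AttrDict helper that env.py defines just below, and the config path assumes GPT_SoVITS/BigVGAN as the working directory.

import json
import math


class AttrDict(dict):
    # Same small helper as in env.py below: attribute-style access to config keys.
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__dict__ = self


with open("configs/bigvgan_v2_24khz_100band_256x.json") as f:
    h = AttrDict(json.load(f))

# One mel frame must expand to exactly hop_size samples, so the generator's
# cumulative upsampling factor has to match the STFT hop: 4*4*2*2*2*2 = 256.
assert math.prod(h.upsample_rates) == h.hop_size
# num_freq is simply the number of rFFT bins for the configured n_fft.
assert h.num_freq == h.n_fft // 2 + 1
print(h.sampling_rate, "Hz audio,", h.num_mels, "mel bands, fmax =", h.fmax)
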
3 | 4 | import os 5 | import shutil 6 | 7 | 8 | class AttrDict(dict): 9 | def __init__(self, *args, **kwargs): 10 | super(AttrDict, self).__init__(*args, **kwargs) 11 | self.__dict__ = self 12 | 13 | 14 | def build_env(config, config_name, path): 15 | t_path = os.path.join(path, config_name) 16 | if config != t_path: 17 | os.makedirs(path, exist_ok=True) 18 | shutil.copyfile(config, os.path.join(path, config_name)) 19 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_1: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_2: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Edward Dixon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_4: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Seungwon Park 박승원 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_5: -------------------------------------------------------------------------------- 1 | Copyright 2020 Alexandre Défossez 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 4 | associated documentation files (the "Software"), to deal in the Software without restriction, 5 | including without limitation the rights to use, copy, modify, merge, publish, distribute, 6 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is 7 | furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or 10 | substantial portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT 13 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 14 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 15 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_6: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023-present, Descript 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_7: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Charactr Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
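
One more bundled license follows, and then the two BigVGAN inference entry points: inference.py (resynthesizes wav files through a mel intermediate) and inference_e2e.py (vocodes pre-computed mel .npy files). For orientation, here is a hedged sketch of the core call path both scripts wrap. It assumes it is run from GPT_SoVITS/BigVGAN with a checkpoint and its config.json already downloaded; the exp/ paths and the .npy file name are illustrative placeholders, not files shipped with the repo.

import json

import numpy as np
import torch
from scipy.io.wavfile import write

from env import AttrDict
from bigvgan import BigVGAN as Generator
from meldataset import MAX_WAV_VALUE

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with open("exp/config.json") as f:          # config shipped next to the checkpoint
    h = AttrDict(json.load(f))

generator = Generator(h, use_cuda_kernel=False).to(device)
state_dict_g = torch.load("exp/g_05000000", map_location=device)   # illustrative checkpoint name
generator.load_state_dict(state_dict_g["generator"])
generator.eval()
generator.remove_weight_norm()

mel = torch.FloatTensor(np.load("test_mel_files/sample.npy"))       # [num_mels, frames]
if mel.dim() == 2:
    mel = mel.unsqueeze(0)                                          # add batch dimension

with torch.no_grad():
    audio = generator(mel.to(device)).squeeze() * MAX_WAV_VALUE

write("sample_generated.wav", h.sampling_rate, audio.cpu().numpy().astype("int16"))
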
-------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_8: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Amphion 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/inference.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from __future__ import absolute_import, division, print_function, unicode_literals 5 | 6 | import os 7 | import argparse 8 | import json 9 | import torch 10 | import librosa 11 | from utils import load_checkpoint 12 | from meldataset import get_mel_spectrogram 13 | from scipy.io.wavfile import write 14 | from env import AttrDict 15 | from meldataset import MAX_WAV_VALUE 16 | from bigvgan import BigVGAN as Generator 17 | 18 | h = None 19 | device = None 20 | torch.backends.cudnn.benchmark = False 21 | 22 | 23 | def inference(a, h): 24 | generator = Generator(h, use_cuda_kernel=a.use_cuda_kernel).to(device) 25 | 26 | state_dict_g = load_checkpoint(a.checkpoint_file, device) 27 | generator.load_state_dict(state_dict_g["generator"]) 28 | 29 | filelist = os.listdir(a.input_wavs_dir) 30 | 31 | os.makedirs(a.output_dir, exist_ok=True) 32 | 33 | generator.eval() 34 | generator.remove_weight_norm() 35 | with torch.no_grad(): 36 | for i, filname in enumerate(filelist): 37 | # Load the ground truth audio and resample if necessary 38 | wav, sr = librosa.load(os.path.join(a.input_wavs_dir, filname), sr=h.sampling_rate, mono=True) 39 | wav = torch.FloatTensor(wav).to(device) 40 | # Compute mel spectrogram from the ground truth audio 41 | x = get_mel_spectrogram(wav.unsqueeze(0), generator.h) 42 | 43 | y_g_hat = generator(x) 44 | 45 | audio = y_g_hat.squeeze() 46 | audio = audio * MAX_WAV_VALUE 47 | audio = audio.cpu().numpy().astype("int16") 48 | 49 | output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + "_generated.wav") 50 | write(output_file, h.sampling_rate, audio) 51 | print(output_file) 52 | 53 | 54 | def main(): 55 | print("Initializing Inference Process..") 56 | 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument("--input_wavs_dir", default="test_files") 59 | parser.add_argument("--output_dir", 
default="generated_files") 60 | parser.add_argument("--checkpoint_file", required=True) 61 | parser.add_argument("--use_cuda_kernel", action="store_true", default=False) 62 | 63 | a = parser.parse_args() 64 | 65 | config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json") 66 | with open(config_file) as f: 67 | data = f.read() 68 | 69 | global h 70 | json_config = json.loads(data) 71 | h = AttrDict(json_config) 72 | 73 | torch.manual_seed(h.seed) 74 | global device 75 | if torch.cuda.is_available(): 76 | torch.cuda.manual_seed(h.seed) 77 | device = torch.device("cuda") 78 | else: 79 | device = torch.device("cpu") 80 | 81 | inference(a, h) 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/inference_e2e.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from __future__ import absolute_import, division, print_function, unicode_literals 5 | 6 | import glob 7 | import os 8 | import numpy as np 9 | import argparse 10 | import json 11 | import torch 12 | from scipy.io.wavfile import write 13 | from env import AttrDict 14 | from meldataset import MAX_WAV_VALUE 15 | from bigvgan import BigVGAN as Generator 16 | 17 | h = None 18 | device = None 19 | torch.backends.cudnn.benchmark = False 20 | 21 | 22 | def load_checkpoint(filepath, device): 23 | assert os.path.isfile(filepath) 24 | print(f"Loading '{filepath}'") 25 | checkpoint_dict = torch.load(filepath, map_location=device) 26 | print("Complete.") 27 | return checkpoint_dict 28 | 29 | 30 | def scan_checkpoint(cp_dir, prefix): 31 | pattern = os.path.join(cp_dir, prefix + "*") 32 | cp_list = glob.glob(pattern) 33 | if len(cp_list) == 0: 34 | return "" 35 | return sorted(cp_list)[-1] 36 | 37 | 38 | def inference(a, h): 39 | generator = Generator(h, use_cuda_kernel=a.use_cuda_kernel).to(device) 40 | 41 | state_dict_g = load_checkpoint(a.checkpoint_file, device) 42 | generator.load_state_dict(state_dict_g["generator"]) 43 | 44 | filelist = os.listdir(a.input_mels_dir) 45 | 46 | os.makedirs(a.output_dir, exist_ok=True) 47 | 48 | generator.eval() 49 | generator.remove_weight_norm() 50 | with torch.no_grad(): 51 | for i, filname in enumerate(filelist): 52 | # Load the mel spectrogram in .npy format 53 | x = np.load(os.path.join(a.input_mels_dir, filname)) 54 | x = torch.FloatTensor(x).to(device) 55 | if len(x.shape) == 2: 56 | x = x.unsqueeze(0) 57 | 58 | y_g_hat = generator(x) 59 | 60 | audio = y_g_hat.squeeze() 61 | audio = audio * MAX_WAV_VALUE 62 | audio = audio.cpu().numpy().astype("int16") 63 | 64 | output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + "_generated_e2e.wav") 65 | write(output_file, h.sampling_rate, audio) 66 | print(output_file) 67 | 68 | 69 | def main(): 70 | print("Initializing Inference Process..") 71 | 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument("--input_mels_dir", default="test_mel_files") 74 | parser.add_argument("--output_dir", default="generated_files_from_mel") 75 | parser.add_argument("--checkpoint_file", required=True) 76 | parser.add_argument("--use_cuda_kernel", action="store_true", default=False) 77 | 78 | a = parser.parse_args() 79 | 80 | config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json") 81 | with open(config_file) as f: 82 | data = f.read() 83 | 84 | global h 85 | 
json_config = json.loads(data) 86 | h = AttrDict(json_config) 87 | 88 | torch.manual_seed(h.seed) 89 | global device 90 | if torch.cuda.is_available(): 91 | torch.cuda.manual_seed(h.seed) 92 | device = torch.device("cuda") 93 | else: 94 | device = torch.device("cpu") 95 | 96 | inference(a, h) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/bias.md: -------------------------------------------------------------------------------- 1 | | Field | Response | 2 | | :--------------------------------------------------------------------------------------------------------- | :--------------------------------------------------- | 3 | | Participation considerations from adversely impacted groups protected classes in model design and testing: | None | 4 | | Measures taken to mitigate against unwanted bias: | No measures taken to mitigate against unwanted bias. | 5 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/explainability.md: -------------------------------------------------------------------------------- 1 | | Field | Response | 2 | | :---------------------------------------------------------------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 3 | | Intended Application & Domain: | Generating waveform from mel spectrogram. | 4 | | Model Type: | Convolutional Neural Network (CNN) | 5 | | Intended Users: | This model is intended for developers to synthesize and generate waveforms from the AI-generated mel spectrograms. | 6 | | Output: | Audio Waveform | 7 | | Describe how the model works: | Model generates audio waveform corresponding to the input mel spectrogram. | 8 | | Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | Not Applicable | 9 | | Technical Limitations: | This may not perform well on synthetically-generated mel spectrograms that deviate significantly from the profile of mel spectrograms on which this was trained. | 10 | | Verified to have met prescribed NVIDIA quality standards: | Yes | 11 | | Performance Metrics: | Perceptual Evaluation of Speech Quality (PESQ), Virtual Speech Quality Objective Listener (VISQOL), Multi-resolution STFT (MRSTFT), Mel cepstral distortion (MCD), Periodicity RMSE, Voice/Unvoiced F1 Score (V/UV F1) | 12 | | Potential Known Risks: | This model may generate low-quality or distorted soundwaves. | 13 | | Licensing: | https://github.com/NVIDIA/BigVGAN/blob/main/LICENSE | 14 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/privacy.md: -------------------------------------------------------------------------------- 1 | | Field | Response | 2 | | :------------------------------------------------------------------------------------------------------------------------------------- | :--------------------------------------------- | 3 | | Generatable or reverse engineerable personal information? 
| None | 4 | | Protected class data used to create this model? | None | 5 | | Was consent obtained for any personal data used? | Not Applicable (No Personal Data) | 6 | | How often is dataset reviewed? | Before Release | 7 | | Is a mechanism in place to honor data subject right of access or deletion of personal data? | Not Applicable | 8 | | If personal data was collected for the development of the model, was it collected directly by NVIDIA? | Not Applicable | 9 | | If personal data was collected for the development of the model by NVIDIA, do you maintain or have access to disclosures made to data subjects? | Not Applicable | 10 | | If personal data was collected for the development of this AI model, was it minimized to only what was required? | Not Applicable | 11 | | Is data in dataset traceable? | Yes | 12 | | Is there provenance for all datasets used in training? | Yes | 13 | | Does data labeling (annotation, metadata) comply with privacy laws? | Yes | 14 | | Is data compliant with data subject requests for data correction or removal, if such a request was made? | No, not possible with externally-sourced data. | 15 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/safety.md: -------------------------------------------------------------------------------- 1 | | Field | Response | 2 | | :---------------------------------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 3 | | Model Application(s): | Synthetic Audio Generation | 4 | | Describe the life critical impact (if present). | Not Applicable | 5 | | Use Case Restrictions: | None | 6 | | Model and dataset restrictions: | The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints are adhered to. | 7 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | numpy 3 | librosa>=0.8.1 4 | scipy 5 | tensorboard 6 | soundfile 7 | matplotlib 8 | pesq 9 | auraloss 10 | tqdm 11 | nnAudio 12 | ninja 13 | huggingface_hub>=0.23.4 -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/tests/test_activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import os 5 | import sys 6 | 7 | # to import modules from parent_dir 8 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(parent_dir) 10 | 11 | import torch 12 | from alias_free_activation.cuda import activation1d 13 | from activations import Snake 14 | 15 | 16 | def test_load_fused_kernels(): 17 | try: 18 | print("[Success] load_fused_kernels") 19 | except ImportError as e: 20 | print("[Fail] load_fused_kernels") 21 | raise e 22 | 23 | 24 | def test_anti_alias_activation(): 25 | data = torch.rand((10, 10, 200), device="cuda") 26 | 27 | # Check activations.Snake cuda vs.
torch 28 | fused_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=True).cuda() 29 | fused_activation_output = fused_anti_alias_activation(data) 30 | 31 | torch_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=False).cuda() 32 | torch_activation_output = torch_anti_alias_activation(data) 33 | 34 | test_result = (fused_activation_output - torch_activation_output).abs() 35 | 36 | while test_result.dim() != 1: 37 | test_result = test_result.mean(dim=-1) 38 | 39 | diff = test_result.mean(dim=-1) 40 | 41 | if diff <= 1e-3: 42 | print( 43 | f"\n[Success] test_fused_anti_alias_activation" 44 | f"\n > mean_difference={diff}" 45 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}" 46 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}" 47 | ) 48 | else: 49 | print( 50 | f"\n[Fail] test_fused_anti_alias_activation" 51 | f"\n > mean_difference={diff}, " 52 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}, " 53 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}" 54 | ) 55 | 56 | 57 | if __name__ == "__main__": 58 | from alias_free_activation.cuda import load 59 | 60 | load.load() 61 | test_load_fused_kernels() 62 | test_anti_alias_activation() 63 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import os 5 | import sys 6 | 7 | # to import modules from parent_dir 8 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(parent_dir) 10 | 11 | import torch 12 | from alias_free_activation.cuda import activation1d 13 | from activations import SnakeBeta 14 | 15 | 16 | def test_load_fused_kernels(): 17 | try: 18 | print("[Success] load_fused_kernels") 19 | except ImportError as e: 20 | print("[Fail] load_fused_kernels") 21 | raise e 22 | 23 | 24 | def test_anti_alias_activation(): 25 | data = torch.rand((10, 10, 200), device="cuda") 26 | 27 | # Check activations, Snake CUDA vs. 
Torch 28 | fused_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=True).cuda() 29 | fused_activation_output = fused_anti_alias_activation(data) 30 | 31 | torch_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=False).cuda() 32 | torch_activation_output = torch_anti_alias_activation(data) 33 | 34 | test_result = (fused_activation_output - torch_activation_output).abs() 35 | 36 | while test_result.dim() != 1: 37 | test_result = test_result.mean(dim=-1) 38 | 39 | diff = test_result.mean(dim=-1) 40 | 41 | if diff <= 1e-3: 42 | print( 43 | f"\n[Success] test_fused_anti_alias_activation" 44 | f"\n > mean_difference={diff}" 45 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}" 46 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}" 47 | ) 48 | else: 49 | print( 50 | f"\n[Fail] test_fused_anti_alias_activation" 51 | f"\n > mean_difference={diff}, " 52 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}, " 53 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}" 54 | ) 55 | 56 | 57 | if __name__ == "__main__": 58 | from alias_free_activation.cuda import load 59 | 60 | load.load() 61 | test_load_fused_kernels() 62 | test_anti_alias_activation() 63 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/utils0.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import glob 5 | import os 6 | import matplotlib 7 | import torch 8 | from torch.nn.utils import weight_norm 9 | 10 | matplotlib.use("Agg") 11 | import matplotlib.pylab as plt 12 | from .meldataset import MAX_WAV_VALUE 13 | from scipy.io.wavfile import write 14 | 15 | 16 | def plot_spectrogram(spectrogram): 17 | fig, ax = plt.subplots(figsize=(10, 2)) 18 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") 19 | plt.colorbar(im, ax=ax) 20 | 21 | fig.canvas.draw() 22 | plt.close() 23 | 24 | return fig 25 | 26 | 27 | def plot_spectrogram_clipped(spectrogram, clip_max=2.0): 28 | fig, ax = plt.subplots(figsize=(10, 2)) 29 | im = ax.imshow( 30 | spectrogram, 31 | aspect="auto", 32 | origin="lower", 33 | interpolation="none", 34 | vmin=1e-6, 35 | vmax=clip_max, 36 | ) 37 | plt.colorbar(im, ax=ax) 38 | 39 | fig.canvas.draw() 40 | plt.close() 41 | 42 | return fig 43 | 44 | 45 | def init_weights(m, mean=0.0, std=0.01): 46 | classname = m.__class__.__name__ 47 | if classname.find("Conv") != -1: 48 | m.weight.data.normal_(mean, std) 49 | 50 | 51 | def apply_weight_norm(m): 52 | classname = m.__class__.__name__ 53 | if classname.find("Conv") != -1: 54 | weight_norm(m) 55 | 56 | 57 | def get_padding(kernel_size, dilation=1): 58 | return int((kernel_size * dilation - dilation) / 2) 59 | 60 | 61 | def load_checkpoint(filepath, device): 62 | assert os.path.isfile(filepath) 63 | print(f"Loading '{filepath}'") 64 | checkpoint_dict = torch.load(filepath, map_location=device) 65 | print("Complete.") 66 | return checkpoint_dict 67 | 68 | 69 | def save_checkpoint(filepath, obj): 70 | print(f"Saving checkpoint to {filepath}") 71 | torch.save(obj, filepath) 72 | print("Complete.") 73 | 74 | 75 | def scan_checkpoint(cp_dir, prefix, renamed_file=None): 76 | # Fallback to original scanning logic first 77 | pattern = os.path.join(cp_dir, prefix + "????????") 78 | cp_list = glob.glob(pattern) 79 | 80 | if 
len(cp_list) > 0: 81 | last_checkpoint_path = sorted(cp_list)[-1] 82 | print(f"[INFO] Resuming from checkpoint: '{last_checkpoint_path}'") 83 | return last_checkpoint_path 84 | 85 | # If no pattern-based checkpoints are found, check for renamed file 86 | if renamed_file: 87 | renamed_path = os.path.join(cp_dir, renamed_file) 88 | if os.path.isfile(renamed_path): 89 | print(f"[INFO] Resuming from renamed checkpoint: '{renamed_file}'") 90 | return renamed_path 91 | 92 | return None 93 | 94 | 95 | def save_audio(audio, path, sr): 96 | # wav: torch with 1d shape 97 | audio = audio * MAX_WAV_VALUE 98 | audio = audio.cpu().numpy().astype("int16") 99 | write(path, sr, audio) 100 | -------------------------------------------------------------------------------- /GPT_SoVITS/TTS_infer_pack/__init__.py: -------------------------------------------------------------------------------- 1 | from . import TTS, text_segmentation_method 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/.gitignore: -------------------------------------------------------------------------------- 1 | *.yaml -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 100, 4 | "eval_interval": 500, 5 | "seed": 1234, 6 | "epochs": 100, 7 | "learning_rate": 0.0001, 8 | "betas": [ 9 | 0.8, 10 | 0.99 11 | ], 12 | "eps": 1e-09, 13 | "batch_size": 32, 14 | "fp16_run": true, 15 | "lr_decay": 0.999875, 16 | "segment_size": 20480, 17 | "init_lr_ratio": 1, 18 | "warmup_epochs": 0, 19 | "c_mel": 45, 20 | "c_kl": 1.0, 21 | "text_low_lr_rate": 0.4, 22 | "grad_ckpt": false 23 | }, 24 | "data": { 25 | "max_wav_value": 32768.0, 26 | "sampling_rate": 32000, 27 | "filter_length": 2048, 28 | "hop_length": 640, 29 | "win_length": 2048, 30 | "n_mel_channels": 128, 31 | "mel_fmin": 0.0, 32 | "mel_fmax": null, 33 | "add_blank": true, 34 | "n_speakers": 300, 35 | "cleaned_text": true 36 | }, 37 | "model": { 38 | "inter_channels": 192, 39 | "hidden_channels": 192, 40 | "filter_channels": 768, 41 | "n_heads": 2, 42 | "n_layers": 6, 43 | "kernel_size": 3, 44 | "p_dropout": 0.1, 45 | "resblock": "1", 46 | "resblock_kernel_sizes": [ 47 | 3, 48 | 7, 49 | 11 50 | ], 51 | "resblock_dilation_sizes": [ 52 | [ 53 | 1, 54 | 3, 55 | 5 56 | ], 57 | [ 58 | 1, 59 | 3, 60 | 5 61 | ], 62 | [ 63 | 1, 64 | 3, 65 | 5 66 | ] 67 | ], 68 | "upsample_rates": [ 69 | 10, 70 | 8, 71 | 2, 72 | 2, 73 | 2 74 | ], 75 | "upsample_initial_channel": 512, 76 | "upsample_kernel_sizes": [ 77 | 16, 78 | 16, 79 | 8, 80 | 2, 81 | 2 82 | ], 83 | "n_layers_q": 3, 84 | "use_spectral_norm": false, 85 | "gin_channels": 512, 86 | "semantic_frame_rate": "25hz", 87 | "freeze_quantizer": true 88 | }, 89 | "s2_ckpt_dir": "logs/s2/big2k1", 90 | "content_module": "cnhubert" 91 | } -------------------------------------------------------------------------------- /GPT_SoVITS/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | now_dir = os.getcwd() 5 | sys.path.insert(0, now_dir) 6 | from text.g2pw import G2PWPinyin 7 | 8 | g2pw = G2PWPinyin( 9 | model_dir="GPT_SoVITS/text/G2PWModel", 10 | model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", 11 | v_to_u=False, 12 | neutral_tone_with_five=True, 13 | ) 14 | -------------------------------------------------------------------------------- 
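
configs/s2.json above is the stage-2 (SoVITS) training config, and it encodes the same kind of coupling seen in the BigVGAN configs: the decoder's upsample_rates multiply out to the STFT hop_length (10*8*2*2*2 = 640), so one latent frame becomes 640 samples of 32 kHz audio, i.e. 50 frames per second against the configured 25 Hz semantic_frame_rate. Below is a small standalone sketch, not part of the repo, that checks these relationships; the path assumes the repository root as the working directory.

import json
import math

with open("GPT_SoVITS/configs/s2.json") as f:
    hps = json.load(f)

data, model = hps["data"], hps["model"]

# Total decoder upsampling must equal the hop length: 10*8*2*2*2 = 640 samples per frame.
assert math.prod(model["upsample_rates"]) == data["hop_length"] == 640

# 32000 Hz / 640-sample hop = 50 spectrogram frames per second; the semantic
# tokens are configured at 25 Hz, half that rate.
frames_per_second = data["sampling_rate"] / data["hop_length"]
print(frames_per_second, "frames/s, semantic_frame_rate =", model["semantic_frame_rate"])
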
/GPT_SoVITS/f5_tts/model/__init__.py: -------------------------------------------------------------------------------- 1 | # from f5_tts.model.cfm import CFM 2 | # 3 | # from f5_tts.model.backbones.unett import UNetT 4 | from GPT_SoVITS.f5_tts.model.backbones.dit import DiT 5 | # from f5_tts.model.backbones.dit import DiTNoCond 6 | # from f5_tts.model.backbones.dit import DiTNoCondNoT 7 | # from f5_tts.model.backbones.mmdit import MMDiT 8 | 9 | # from f5_tts.model.trainer import Trainer 10 | 11 | 12 | # __all__ = ["CFM", "UNetT", "DiT", "MMDiT", "Trainer"] 13 | # __all__ = ["CFM", "UNetT", "DiTNoCond","DiT", "MMDiT"] 14 | -------------------------------------------------------------------------------- /GPT_SoVITS/f5_tts/model/backbones/README.md: -------------------------------------------------------------------------------- 1 | ## Backbones quick introduction 2 | 3 | 4 | ### unett.py 5 | - flat unet transformer 6 | - structure same as in e2-tts & voicebox paper except using rotary pos emb 7 | - update: allow possible abs pos emb & convnextv2 blocks for embedded text before concat 8 | 9 | ### dit.py 10 | - adaln-zero dit 11 | - embedded timestep as condition 12 | - concatted noised_input + masked_cond + embedded_text, linear proj in 13 | - possible abs pos emb & convnextv2 blocks for embedded text before concat 14 | - possible long skip connection (first layer to last layer) 15 | 16 | ### mmdit.py 17 | - sd3 structure 18 | - timestep as condition 19 | - left stream: text embedded and applied a abs pos emb 20 | - right stream: masked_cond & noised_input concatted and with same conv pos emb as unett 21 | -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from . import cnhubert, whisper_enc 2 | 3 | content_module_map = {"cnhubert": cnhubert, "whisper": whisper_enc} 4 | -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/cnhubert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from transformers import logging as tf_logging 4 | 5 | tf_logging.set_verbosity_error() 6 | 7 | import logging 8 | 9 | logging.getLogger("numba").setLevel(logging.WARNING) 10 | 11 | from transformers import ( 12 | Wav2Vec2FeatureExtractor, 13 | HubertModel, 14 | ) 15 | 16 | import utils 17 | import torch.nn as nn 18 | 19 | cnhubert_base_path = None 20 | 21 | 22 | class CNHubert(nn.Module): 23 | def __init__(self, base_path: str = None): 24 | super().__init__() 25 | if base_path is None: 26 | base_path = cnhubert_base_path 27 | if os.path.exists(base_path): 28 | ... 
29 | else: 30 | raise FileNotFoundError(base_path) 31 | self.model = HubertModel.from_pretrained(base_path, local_files_only=True) 32 | self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(base_path, local_files_only=True) 33 | 34 | def forward(self, x): 35 | input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 36 | feats = self.model(input_values)["last_hidden_state"] 37 | return feats 38 | 39 | 40 | # class CNHubertLarge(nn.Module): 41 | # def __init__(self): 42 | # super().__init__() 43 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 44 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 45 | # def forward(self, x): 46 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 47 | # feats = self.model(input_values)["last_hidden_state"] 48 | # return feats 49 | # 50 | # class CVec(nn.Module): 51 | # def __init__(self): 52 | # super().__init__() 53 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 54 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 55 | # def forward(self, x): 56 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 57 | # feats = self.model(input_values)["last_hidden_state"] 58 | # return feats 59 | # 60 | # class cnw2v2base(nn.Module): 61 | # def __init__(self): 62 | # super().__init__() 63 | # self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 64 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 65 | # def forward(self, x): 66 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 67 | # feats = self.model(input_values)["last_hidden_state"] 68 | # return feats 69 | 70 | 71 | def get_model(): 72 | model = CNHubert() 73 | model.eval() 74 | return model 75 | 76 | 77 | # def get_large_model(): 78 | # model = CNHubertLarge() 79 | # model.eval() 80 | # return model 81 | # 82 | # def get_model_cvec(): 83 | # model = CVec() 84 | # model.eval() 85 | # return model 86 | # 87 | # def get_model_cnw2v2base(): 88 | # model = cnw2v2base() 89 | # model.eval() 90 | # return model 91 | 92 | 93 | def get_content(hmodel, wav_16k_tensor): 94 | with torch.no_grad(): 95 | feats = hmodel(wav_16k_tensor) 96 | return feats.transpose(1, 2) 97 | 98 | 99 | if __name__ == "__main__": 100 | model = get_model() 101 | src_path = "/Users/Shared/原音频2.wav" 102 | wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000) 103 | model = model 104 | wav_16k_tensor = wav_16k_tensor 105 | feats = get_content(model, wav_16k_tensor) 106 | print(feats.shape) 107 | -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/whisper_enc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_model(): 5 | import whisper 6 | 7 | model = whisper.load_model("small", device="cpu") 8 | 9 | return model.encoder 10 | 11 | 12 | def get_content(model=None, wav_16k_tensor=None): 13 | from whisper import log_mel_spectrogram, pad_or_trim 14 | 15 | dev = next(model.parameters()).device 16 | mel = 
log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000] 17 | # if torch.cuda.is_available(): 18 | # mel = mel.to(torch.float16) 19 | feature_len = mel.shape[-1] // 2 20 | assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频" 21 | with torch.no_grad(): 22 | feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[:1, :feature_len, :].transpose(1, 2) 23 | return feature 24 | -------------------------------------------------------------------------------- /GPT_SoVITS/inference_cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import soundfile as sf 4 | 5 | from tools.i18n.i18n import I18nAuto 6 | from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav 7 | 8 | i18n = I18nAuto() 9 | 10 | 11 | def synthesize( 12 | GPT_model_path, 13 | SoVITS_model_path, 14 | ref_audio_path, 15 | ref_text_path, 16 | ref_language, 17 | target_text_path, 18 | target_language, 19 | output_path, 20 | ): 21 | # Read reference text 22 | with open(ref_text_path, "r", encoding="utf-8") as file: 23 | ref_text = file.read() 24 | 25 | # Read target text 26 | with open(target_text_path, "r", encoding="utf-8") as file: 27 | target_text = file.read() 28 | 29 | # Change model weights 30 | change_gpt_weights(gpt_path=GPT_model_path) 31 | change_sovits_weights(sovits_path=SoVITS_model_path) 32 | 33 | # Synthesize audio 34 | synthesis_result = get_tts_wav( 35 | ref_wav_path=ref_audio_path, 36 | prompt_text=ref_text, 37 | prompt_language=i18n(ref_language), 38 | text=target_text, 39 | text_language=i18n(target_language), 40 | top_p=1, 41 | temperature=1, 42 | ) 43 | 44 | result_list = list(synthesis_result) 45 | 46 | if result_list: 47 | last_sampling_rate, last_audio_data = result_list[-1] 48 | output_wav_path = os.path.join(output_path, "output.wav") 49 | sf.write(output_wav_path, last_audio_data, last_sampling_rate) 50 | print(f"Audio saved to {output_wav_path}") 51 | 52 | 53 | def main(): 54 | parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") 55 | parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file") 56 | parser.add_argument("--sovits_model", required=True, help="Path to the SoVITS model file") 57 | parser.add_argument("--ref_audio", required=True, help="Path to the reference audio file") 58 | parser.add_argument("--ref_text", required=True, help="Path to the reference text file") 59 | parser.add_argument( 60 | "--ref_language", required=True, choices=["中文", "英文", "日文"], help="Language of the reference audio" 61 | ) 62 | parser.add_argument("--target_text", required=True, help="Path to the target text file") 63 | parser.add_argument( 64 | "--target_language", 65 | required=True, 66 | choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"], 67 | help="Language of the target text", 68 | ) 69 | parser.add_argument("--output_path", required=True, help="Path to the output directory") 70 | 71 | args = parser.parse_args() 72 | 73 | synthesize( 74 | args.gpt_model, 75 | args.sovits_model, 76 | args.ref_audio, 77 | args.ref_text, 78 | args.ref_language, 79 | args.target_text, 80 | args.target_language, 81 | args.output_path, 82 | ) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /GPT_SoVITS/module/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/module/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/module/losses.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | 5 | 6 | def feature_loss(fmap_r, fmap_g): 7 | loss = 0 8 | for dr, dg in zip(fmap_r, fmap_g): 9 | for rl, gl in zip(dr, dg): 10 | rl = rl.float().detach() 11 | gl = gl.float() 12 | loss += torch.mean(torch.abs(rl - gl)) 13 | 14 | return loss * 2 15 | 16 | 17 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 18 | loss = 0 19 | r_losses = [] 20 | g_losses = [] 21 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 22 | dr = dr.float() 23 | dg = dg.float() 24 | r_loss = torch.mean((1 - dr) ** 2) 25 | g_loss = torch.mean(dg**2) 26 | loss += r_loss + g_loss 27 | r_losses.append(r_loss.item()) 28 | g_losses.append(g_loss.item()) 29 | 30 | return loss, r_losses, g_losses 31 | 32 | 33 | def generator_loss(disc_outputs): 34 | loss = 0 35 | gen_losses = [] 36 | for dg in disc_outputs: 37 | dg = dg.float() 38 | l = torch.mean((1 - dg) ** 2) 39 | gen_losses.append(l) 40 | loss += l 41 | 42 | return loss, gen_losses 43 | 44 | 45 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 46 | """ 47 | z_p, logs_q: [b, h, t_t] 48 | m_p, logs_p: [b, h, t_t] 49 | """ 50 | z_p = z_p.float() 51 | logs_q = logs_q.float() 52 | m_p = m_p.float() 53 | logs_p = logs_p.float() 54 | z_mask = z_mask.float() 55 | 56 | kl = logs_p - logs_q - 0.5 57 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 58 | kl = torch.sum(kl * z_mask) 59 | l = kl / torch.sum(z_mask) 60 | return l 61 | 62 | 63 | def mle_loss(z, m, logs, logdet, mask): 64 | l = torch.sum(logs) + 0.5 * torch.sum( 65 | torch.exp(-2 * logs) * ((z - m) ** 2) 66 | ) # neg normal likelihood w/o the constant term 67 | l = l - torch.sum(logdet) # log jacobian determinant 68 | l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes 69 | l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term 70 | return l 71 | -------------------------------------------------------------------------------- /GPT_SoVITS/pretrained_models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /GPT_SoVITS/process_ckpt.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | from collections import OrderedDict 3 | from time import time as ttime 4 | import shutil 5 | import os 6 | import torch 7 | from tools.i18n.i18n import I18nAuto 8 | 9 | i18n = I18nAuto() 10 | 11 | 12 | def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path 13 | dir = os.path.dirname(path) 14 | name = os.path.basename(path) 15 | tmp_path = "%s.pth" % (ttime()) 16 | torch.save(fea, tmp_path) 17 | shutil.move(tmp_path, "%s/%s" % (dir, name)) 18 | 19 | 20 | """ 21 | 00:v1 22 | 01:v2 23 | 02:v3 24 | 03:v3lora 25 | 04:v4lora 26 | 27 | """ 28 | from io import BytesIO 29 | 30 | 31 | def my_save2(fea, path,cfm_version): 32 | bio = BytesIO() 33 | torch.save(fea, bio) 34 | bio.seek(0) 35 | data = bio.getvalue() 36 | byte=b"03" if cfm_version=="v3"else b"04" 37 | data = byte + data[2:] 38 | with open(path, "wb") as f: 39 | f.write(data) 40 | 41 | 42 | def savee(ckpt, name, epoch, 
steps, hps, cfm_version=None,lora_rank=None): 43 | try: 44 | opt = OrderedDict() 45 | opt["weight"] = {} 46 | for key in ckpt.keys(): 47 | if "enc_q" in key: 48 | continue 49 | opt["weight"][key] = ckpt[key].half() 50 | opt["config"] = hps 51 | opt["info"] = "%sepoch_%siteration" % (epoch, steps) 52 | if lora_rank: 53 | opt["lora_rank"] = lora_rank 54 | my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name),cfm_version) 55 | else: 56 | my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) 57 | return "Success." 58 | except: 59 | return traceback.format_exc() 60 | 61 | 62 | head2version = { 63 | b"00": ["v1", "v1", False], 64 | b"01": ["v2", "v2", False], 65 | b"02": ["v2", "v3", False], 66 | b"03": ["v2", "v3", True], 67 | b"04": ["v2", "v4", True], 68 | } 69 | hash_pretrained_dict = { 70 | "dc3c97e17592963677a4a1681f30c653": ["v2", "v2", False], # s2G488k.pth#sovits_v1_pretrained 71 | "43797be674a37c1c83ee81081941ed0f": ["v2", "v3", False], # s2Gv3.pth#sovits_v3_pretrained 72 | "6642b37f3dbb1f76882b69937c95a5f3": ["v2", "v2", False], # s2G2333K.pth#sovits_v2_pretrained 73 | "4f26b9476d0c5033e04162c486074374": ["v2", "v4", False], # s2Gv4.pth#sovits_v4_pretrained 74 | } 75 | import hashlib 76 | 77 | 78 | def get_hash_from_file(sovits_path): 79 | with open(sovits_path, "rb") as f: 80 | data = f.read(8192) 81 | hash_md5 = hashlib.md5() 82 | hash_md5.update(data) 83 | return hash_md5.hexdigest() 84 | 85 | 86 | def get_sovits_version_from_path_fast(sovits_path): 87 | ###1-if it is pretrained sovits models, by hash 88 | hash = get_hash_from_file(sovits_path) 89 | if hash in hash_pretrained_dict: 90 | return hash_pretrained_dict[hash] 91 | ###2-new weights, by head 92 | with open(sovits_path, "rb") as f: 93 | version = f.read(2) 94 | if version != b"PK": 95 | return head2version[version] 96 | ###3-old weights, by file size 97 | if_lora_v3 = False 98 | size = os.path.getsize(sovits_path) 99 | """ 100 | v1weights:about 82942KB 101 | half thr:82978KB 102 | v2weights:about 83014KB 103 | v3weights:about 750MB 104 | """ 105 | if size < 82978 * 1024: 106 | model_version = version = "v1" 107 | elif size < 700 * 1024 * 1024: 108 | model_version = version = "v2" 109 | else: 110 | version = "v2" 111 | model_version = "v3" 112 | return version, model_version, if_lora_v3 113 | 114 | 115 | def load_sovits_new(sovits_path): 116 | f = open(sovits_path, "rb") 117 | meta = f.read(2) 118 | if meta != "PK": 119 | data = b"PK" + f.read() 120 | bio = BytesIO() 121 | bio.write(data) 122 | bio.seek(0) 123 | return torch.load(bio, map_location="cpu", weights_only=False) 124 | return torch.load(sovits_path, map_location="cpu", weights_only=False) 125 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/.gitignore: -------------------------------------------------------------------------------- 1 | G2PWModel 2 | __pycache__ 3 | *.zip -------------------------------------------------------------------------------- /GPT_SoVITS/text/LangSegmenter/__init__.py: -------------------------------------------------------------------------------- 1 | from .langsegmenter import LangSegmenter 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | # if os.environ.get("version","v1")=="v1": 3 | # from text.symbols import symbols 4 | # else: 5 | # from text.symbols2 import symbols 6 | 7 | from text import symbols as symbols_v1 8 | 
from text import symbols2 as symbols_v2 9 | 10 | _symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)} 11 | _symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)} 12 | 13 | 14 | def cleaned_text_to_sequence(cleaned_text, version=None): 15 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 16 | Args: 17 | text: string to convert to a sequence 18 | Returns: 19 | List of integers corresponding to the symbols in the text 20 | """ 21 | if version is None: 22 | version = os.environ.get("version", "v2") 23 | if version == "v1": 24 | phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text] 25 | else: 26 | phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text] 27 | 28 | return phones 29 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from text import cleaned_text_to_sequence 2 | import os 3 | # if os.environ.get("version","v1")=="v1": 4 | # from text import chinese 5 | # from text.symbols import symbols 6 | # else: 7 | # from text import chinese2 as chinese 8 | # from text.symbols2 import symbols 9 | 10 | from text import symbols as symbols_v1 11 | from text import symbols2 as symbols_v2 12 | 13 | special = [ 14 | # ("%", "zh", "SP"), 15 | ("¥", "zh", "SP2"), 16 | ("^", "zh", "SP3"), 17 | # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧 18 | ] 19 | 20 | 21 | def clean_text(text, language, version=None): 22 | if version is None: 23 | version = os.environ.get("version", "v2") 24 | if version == "v1": 25 | symbols = symbols_v1.symbols 26 | language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"} 27 | else: 28 | symbols = symbols_v2.symbols 29 | language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"} 30 | 31 | if language not in language_module_map: 32 | language = "en" 33 | text = " " 34 | for special_s, special_l, target_symbol in special: 35 | if special_s in text and language == special_l: 36 | return clean_special(text, language, special_s, target_symbol, version) 37 | language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]]) 38 | if hasattr(language_module, "text_normalize"): 39 | norm_text = language_module.text_normalize(text) 40 | else: 41 | norm_text = text 42 | if language == "zh" or language == "yue": ########## 43 | phones, word2ph = language_module.g2p(norm_text) 44 | assert len(phones) == sum(word2ph) 45 | assert len(norm_text) == len(word2ph) 46 | elif language == "en": 47 | phones = language_module.g2p(norm_text) 48 | if len(phones) < 4: 49 | phones = [","] + phones 50 | word2ph = None 51 | else: 52 | phones = language_module.g2p(norm_text) 53 | word2ph = None 54 | phones = ["UNK" if ph not in symbols else ph for ph in phones] 55 | return phones, word2ph, norm_text 56 | 57 | 58 | def clean_special(text, language, special_s, target_symbol, version=None): 59 | if version is None: 60 | version = os.environ.get("version", "v2") 61 | if version == "v1": 62 | symbols = symbols_v1.symbols 63 | language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"} 64 | else: 65 | symbols = symbols_v2.symbols 66 | language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"} 67 | 68 | """ 69 | 特殊静音段sp符号处理 70 | """ 71 | text = text.replace(special_s, ",") 72 | language_module = __import__("text." 
+ language_module_map[language], fromlist=[language_module_map[language]]) 73 | norm_text = language_module.text_normalize(text) 74 | phones = language_module.g2p(norm_text) 75 | new_ph = [] 76 | for ph in phones[0]: 77 | assert ph in symbols 78 | if ph == ",": 79 | new_ph.append(target_symbol) 80 | else: 81 | new_ph.append(ph) 82 | return new_ph, phones[1], norm_text 83 | 84 | 85 | def text_to_sequence(text, language, version=None): 86 | version = os.environ.get("version", version) 87 | if version is None: 88 | version = "v2" 89 | phones = clean_text(text) 90 | return cleaned_text_to_sequence(phones, version) 91 | 92 | 93 | if __name__ == "__main__": 94 | print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh")) 95 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/engdict-hot.rep: -------------------------------------------------------------------------------- 1 | CHATGPT CH AE1 T JH IY1 P IY1 T IY1 2 | JSON JH EY1 S AH0 N 3 | CONDA K AA1 N D AH0 -------------------------------------------------------------------------------- /GPT_SoVITS/text/engdict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/text/engdict_cache.pickle -------------------------------------------------------------------------------- /GPT_SoVITS/text/g2pw/__init__.py: -------------------------------------------------------------------------------- 1 | from text.g2pw.g2pw import * 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/g2pw/polyphonic.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/text/g2pw/polyphonic.pickle -------------------------------------------------------------------------------- /GPT_SoVITS/text/g2pw/polyphonic.rep: -------------------------------------------------------------------------------- 1 | 湖泊: ['hu2','po1'] 2 | 地壳: ['di4','qiao4'] 3 | 柏树: ['bai3','shu4'] 4 | 曝光: ['bao4','guang1'] 5 | 弹力: ['tan2','li4'] 6 | 字帖: ['zi4','tie4'] 7 | 口吃: ['kou3','chi1'] 8 | 包扎: ['bao1','za1'] 9 | 哪吒: ['ne2','zha1'] 10 | 说服: ['shuo1','fu2'] 11 | 识字: ['shi2','zi4'] 12 | 骨头: ['gu3','tou5'] 13 | 对称: ['dui4','chen4'] 14 | 口供: ['kou3','gong4'] 15 | 抹布: ['ma1','bu4'] 16 | 露背: ['lu4','bei4'] 17 | 圈养: ['juan4', 'yang3'] 18 | 眼眶: ['yan3', 'kuang4'] 19 | 品行: ['pin3','xing2'] 20 | 颤抖: ['chan4','dou3'] 21 | 差不多: ['cha4','bu5','duo1'] 22 | 鸭绿江: ['ya1','lu4','jiang1'] 23 | 撒切尔: ['sa4','qie4','er3'] 24 | 比比皆是: ['bi3','bi3','jie1','shi4'] 25 | 身无长物: ['shen1','wu2','chang2','wu4'] 26 | 手里: ['shou2','li3'] 27 | 关卡: ['guan1','qia3'] 28 | 怀揣: ['huai2','chuai1'] 29 | 挑剔: ['tiao1','ti4'] 30 | 供称: ['gong4','cheng1'] 31 | 作坊: ['zuo1', 'fang5'] 32 | 中医: ['zhong1','yi1'] 33 | 嚷嚷: ['rang1','rang5'] 34 | 商厦: ['shang1','sha4'] 35 | 大厦: ['da4','sha4'] 36 | 刹车: ['sha1','che1'] 37 | 嘚瑟: ['de4','se5'] 38 | 朝鲜: ['chao2','xian3'] 39 | 阿房宫: ['e1','pang2','gong1'] 40 | 阿胶: ['e1','jiao1'] 41 | 咖喱: ['ga1','li5'] 42 | 时分: ['shi2','fen1'] 43 | 蚌埠: ['beng4','bu4'] 44 | 驯服: ['xun4','fu2'] 45 | 幸免于难: ['xing4','mian3','yu2','nan4'] 46 | 恶行: ['e4','xing2'] 47 | 唉: ['ai4'] 48 | 扎实: ['zha1','shi2'] 49 | 干将: ['gan4','jiang4'] 50 | 陈威行: ['chen2', 'wei1', 'hang2'] 51 | 郭晟: ['guo1', 'sheng4'] 52 | 中标: ['zhong4', 'biao1'] 53 | 抗住: ['kang2', 'zhu4'] 
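Each line above maps one polyphonic word to its intended pinyin sequence, in the form `词: ['pinyin1','pinyin2']`. As a minimal, hypothetical sketch only (the repository ships the same overrides pre-serialized as `polyphonic.pickle` in this directory, so this is not the project's actual loader), such a file could be parsed into a lookup table like this:

```python
# Hypothetical helper, not part of the repository: parse polyphonic.rep into a
# {word: [pinyin, ...]} override table. The repo ships the pre-built
# polyphonic.pickle next to this file, so this parser is illustration only.
import ast


def load_polyphonic_rep(path):
    table = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or ":" not in line:
                continue
            word, pinyins = line.split(":", 1)
            # the value part is a Python-style list literal, e.g. "['hu2','po1']"
            table[word.strip()] = ast.literal_eval(pinyins.strip())
    return table


# e.g. load_polyphonic_rep("GPT_SoVITS/text/g2pw/polyphonic.rep")["湖泊"] == ['hu2', 'po1']
```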
-------------------------------------------------------------------------------- /GPT_SoVITS/text/namedict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/text/namedict_cache.pickle -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/README.md: -------------------------------------------------------------------------------- 1 | ## Supported NSW (Non-Standard-Word) Normalization 2 | 3 | |NSW type|raw|normalized| 4 | |:--|:-|:-| 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| 6 | |cardinal|这块黄金重达324.75克
我们班的最高总分为583分|这块黄金重达三百二十四点七五克
我们班的最高总分为五百八十三分| 7 | |numeric range |12\~23
-1.5\~2|十二到二十三
负一点五到二| 8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日| 9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我 10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度 11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| 12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| 13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万| 14 | |telephone|这是固话0421-33441122
这是手机+86 18544139121|这是固话零四二一三三四四一一二二
这是手机八六一八五四四一三九一二一| 15 | ## References 16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) 17 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from text.zh_normalization.text_normlization import * 15 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/chronology.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import DIGITS 17 | from .num import num2str 18 | from .num import verbalize_cardinal 19 | from .num import verbalize_digit 20 | 21 | 22 | def _time_num2str(num_string: str) -> str: 23 | """A special case for verbalizing number in time.""" 24 | result = num2str(num_string.lstrip("0")) 25 | if num_string.startswith("0"): 26 | result = DIGITS["0"] + result 27 | return result 28 | 29 | 30 | # 时刻表达式 31 | RE_TIME = re.compile( 32 | r"([0-1]?[0-9]|2[0-3])" 33 | r":([0-5][0-9])" 34 | r"(:([0-5][0-9]))?" 35 | ) 36 | 37 | # 时间范围,如8:30-12:30 38 | RE_TIME_RANGE = re.compile( 39 | r"([0-1]?[0-9]|2[0-3])" 40 | r":([0-5][0-9])" 41 | r"(:([0-5][0-9]))?" 42 | r"(~|-)" 43 | r"([0-1]?[0-9]|2[0-3])" 44 | r":([0-5][0-9])" 45 | r"(:([0-5][0-9]))?" 
46 | ) 47 | 48 | 49 | def replace_time(match) -> str: 50 | """ 51 | Args: 52 | match (re.Match) 53 | Returns: 54 | str 55 | """ 56 | 57 | is_range = len(match.groups()) > 5 58 | 59 | hour = match.group(1) 60 | minute = match.group(2) 61 | second = match.group(4) 62 | 63 | if is_range: 64 | hour_2 = match.group(6) 65 | minute_2 = match.group(7) 66 | second_2 = match.group(9) 67 | 68 | result = f"{num2str(hour)}点" 69 | if minute.lstrip("0"): 70 | if int(minute) == 30: 71 | result += "半" 72 | else: 73 | result += f"{_time_num2str(minute)}分" 74 | if second and second.lstrip("0"): 75 | result += f"{_time_num2str(second)}秒" 76 | 77 | if is_range: 78 | result += "至" 79 | result += f"{num2str(hour_2)}点" 80 | if minute_2.lstrip("0"): 81 | if int(minute) == 30: 82 | result += "半" 83 | else: 84 | result += f"{_time_num2str(minute_2)}分" 85 | if second_2 and second_2.lstrip("0"): 86 | result += f"{_time_num2str(second_2)}秒" 87 | 88 | return result 89 | 90 | 91 | RE_DATE = re.compile( 92 | r"(\d{4}|\d{2})年" 93 | r"((0?[1-9]|1[0-2])月)?" 94 | r"(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?" 95 | ) 96 | 97 | 98 | def replace_date(match) -> str: 99 | """ 100 | Args: 101 | match (re.Match) 102 | Returns: 103 | str 104 | """ 105 | year = match.group(1) 106 | month = match.group(3) 107 | day = match.group(5) 108 | result = "" 109 | if year: 110 | result += f"{verbalize_digit(year)}年" 111 | if month: 112 | result += f"{verbalize_cardinal(month)}月" 113 | if day: 114 | result += f"{verbalize_cardinal(day)}{match.group(9)}" 115 | return result 116 | 117 | 118 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 119 | RE_DATE2 = re.compile(r"(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])") 120 | 121 | 122 | def replace_date2(match) -> str: 123 | """ 124 | Args: 125 | match (re.Match) 126 | Returns: 127 | str 128 | """ 129 | year = match.group(1) 130 | month = match.group(3) 131 | day = match.group(4) 132 | result = "" 133 | if year: 134 | result += f"{verbalize_digit(year)}年" 135 | if month: 136 | result += f"{verbalize_cardinal(month)}月" 137 | if day: 138 | result += f"{verbalize_cardinal(day)}日" 139 | return result 140 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import re 15 | import string 16 | 17 | from pypinyin.constants import SUPPORT_UCS4 18 | 19 | # 全角半角转换 20 | # 英文字符全角 -> 半角映射表 (num: 52) 21 | F2H_ASCII_LETTERS = {ord(char) + 65248: ord(char) for char in string.ascii_letters} 22 | 23 | # 英文字符半角 -> 全角映射表 24 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} 25 | 26 | # 数字字符全角 -> 半角映射表 (num: 10) 27 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits} 28 | # 数字字符半角 -> 全角映射表 29 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} 30 | 31 | # 标点符号全角 -> 半角映射表 (num: 32) 32 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} 33 | # 标点符号半角 -> 全角映射表 34 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} 35 | 36 | # 空格 (num: 1) 37 | F2H_SPACE = {"\u3000": " "} 38 | H2F_SPACE = {" ": "\u3000"} 39 | 40 | # 非"有拼音的汉字"的字符串,可用于NSW提取 41 | if SUPPORT_UCS4: 42 | RE_NSW = re.compile( 43 | r"(?:[^" 44 | r"\u3007" # 〇 45 | r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF] 46 | r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF] 47 | r"\uf900-\ufaff" # CJK兼容:[F900-FAFF] 48 | r"\U00020000-\U0002A6DF" # CJK扩展B:[20000-2A6DF] 49 | r"\U0002A703-\U0002B73F" # CJK扩展C:[2A700-2B73F] 50 | r"\U0002B740-\U0002B81D" # CJK扩展D:[2B740-2B81D] 51 | r"\U0002F80A-\U0002FA1F" # CJK兼容扩展:[2F800-2FA1F] 52 | r"])+" 53 | ) 54 | else: 55 | RE_NSW = re.compile( # pragma: no cover 56 | r"(?:[^" 57 | r"\u3007" # 〇 58 | r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF] 59 | r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF] 60 | r"\uf900-\ufaff" # CJK兼容:[F900-FAFF] 61 | r"])+" 62 | ) 63 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/phonecode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import verbalize_digit 17 | 18 | # 规范化固话/手机号码 19 | # 手机 20 | # http://www.jihaoba.com/news/show/13680 21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 22 | # 联通:130、131、132、156、155、186、185、176 23 | # 电信:133、153、189、180、181、177 24 | RE_MOBILE_PHONE = re.compile(r"(? 
str: 32 | if mobile: 33 | sp_parts = phone_string.strip("+").split() 34 | result = ",".join([verbalize_digit(part, alt_one=True) for part in sp_parts]) 35 | return result 36 | else: 37 | sil_parts = phone_string.split("-") 38 | result = ",".join([verbalize_digit(part, alt_one=True) for part in sil_parts]) 39 | return result 40 | 41 | 42 | def replace_phone(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | return phone2str(match.group(0), mobile=False) 50 | 51 | 52 | def replace_mobile(match) -> str: 53 | """ 54 | Args: 55 | match (re.Match) 56 | Returns: 57 | str 58 | """ 59 | return phone2str(match.group(0)) 60 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/quantifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import num2str 17 | 18 | # 温度表达式,温度会影响负号的读法 19 | # -3°C 零下三度 20 | RE_TEMPERATURE = re.compile(r"(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)") 21 | measure_dict = { 22 | "cm2": "平方厘米", 23 | "cm²": "平方厘米", 24 | "cm3": "立方厘米", 25 | "cm³": "立方厘米", 26 | "cm": "厘米", 27 | "db": "分贝", 28 | "ds": "毫秒", 29 | "kg": "千克", 30 | "km": "千米", 31 | "m2": "平方米", 32 | "m²": "平方米", 33 | "m³": "立方米", 34 | "m3": "立方米", 35 | "ml": "毫升", 36 | "m": "米", 37 | "mm": "毫米", 38 | "s": "秒", 39 | } 40 | 41 | 42 | def replace_temperature(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | sign = match.group(1) 50 | temperature = match.group(2) 51 | unit = match.group(3) 52 | sign: str = "零下" if sign else "" 53 | temperature: str = num2str(temperature) 54 | unit: str = "摄氏度" if unit == "摄氏度" else "度" 55 | result = f"{sign}{temperature}{unit}" 56 | return result 57 | 58 | 59 | def replace_measure(sentence) -> str: 60 | for q_notation in measure_dict: 61 | if q_notation in sentence: 62 | sentence = sentence.replace(q_notation, measure_dict[q_notation]) 63 | return sentence 64 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 AlfreScarlet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 基于GPT-SoVITS的语音交互系统 2 | ## 简介 3 | 一个非常基础的语音交互系统,使用GPT-SoVITS作为TTS模块。集成ASR接口,使用funasr作为语音识别模块基础。支持openai规范的大模型接口。 4 | Linux环境下首Token延迟基本能做到1.5s以内。Windows环境下延迟在2.1s左右 5 | ### 测试平台 6 | 服务端 7 | - OS:Manjaro 8 | - CPU:R9 5950X 9 | - GPU:RTX 3080ti 10 | 11 | 客户端 12 | - 树莓派5 13 | 14 | ### 测试结果 15 | ![](screen/img.png) 16 | ## 整合包使用说明 17 | 整合包下载链接:http://ss.alfresama.moe:5244/MoeChat 18 | ### 注意!重要的事情说三遍 19 | ### 服务端只会对英文""符号包裹的文本进行语音合成,使用前请修改大模型的提示词! 20 | ### 服务端只会对英文""符号包裹的文本进行语音合成,使用前请修改大模型的提示词! 21 | ### 服务端只会对英文""符号包裹的文本进行语音合成,使用前请修改大模型的提示词! 22 | 整合包不包含用于推理的GPT跟SoVITS模型,需要自行添加底模或者训练好的模型。 23 | ### Windows 24 | ```bash 25 | runtime\python.exe chat_server.py 26 | ``` 27 | ### Linux 28 | ```bash 29 | # 创建虚拟环境 30 | python -m venv pp 31 | 32 | # 进入虚拟环境 33 | source pp/bin/activate 34 | 35 | # 安装依赖 36 | pip install -r extra-req.txt 37 | pip install -r requirements.txt 38 | pip install -r extra-req2.txt 39 | 40 | # 运行 41 | python chat_server.py 42 | ``` 43 | ### 配置说明 44 | 整合包配置文件为config.yaml 45 | ```yaml 46 | Core: 47 | sv: # 声纹配置 48 | is_up: true # 是否启用声纹识别 49 | master_audio: # 音频路径 50 | thr: # 不知道有什么用暂时留空 51 | LLM: 52 | api: # 大模型API 53 | key: # 大模型API_Key 54 | model: # 模型名称 55 | extra_config: # 大模型API额外参数,如:temperature: 0.7,温度参数 56 | temperature: 0.7 57 | GSV: 58 | text_lang: zh # 合成文本的语种 59 | GPT_weight: # GPT_weight模型路径 60 | SoVITS_weight: # SoVITS_weight模型路径 61 | ref_audio_path: # 主要参考音频路径 62 | prompt_text: # 参考音频文本 63 | prompt_lang: zh # 参考音频语种 64 | aux_ref_audio_paths: # 多参考音频 65 | - # 多参考音频文件路径 66 | seed: -1 # 种子 67 | top_k: 15 # 情感表现程度,越高情感越丰富,也可能越奇怪 68 | batch_size: 1 69 | extra_ref_audio: # 使用情绪标签选择参考音频,例如 [普通]"你好呀。" 70 | # 实例 71 | 普通: 72 | - 参考音频路径 73 | - 参考音频文本 74 | ``` 75 | 76 | ### 简易客户端使用方法 77 | 78 | #### Windows 79 | 测试使用python 3.10 80 | 首先修改client.py文件asr_api、chat_api的ip地址。 81 | ##### 带简单GUI的客户端 82 | ```bash 83 | # 运行 84 | runtime\python.exe client-gui\src\client_gui.py 85 | ``` 86 | 87 | #### Linux 88 | ```bash 89 | # 创建虚拟环境 90 | python -m venv pp 91 | 92 | # 进入虚拟环境 93 | source pp/bin/activate 94 | 95 | # 安装依赖 96 | pip install -r client-requirements.txt 97 | 98 | # 启动 99 | python client-gui\src\client_gui.py 100 | ``` 101 | 102 | ### 在客户端上修改提示词的方法 103 | 此方法不适用于ollama,非必要情况下可以使用LMstudio 104 | ```bash 105 | # 打开client_cli.py文件,GUI简易客户端打开client-gui\src\utils.py文件,修改下面内容 106 | # 修改前 107 | # 用于存储上下文内容 108 | data = { 109 | "msg": [] 110 | } 111 | 112 | #修改后 113 | # 用于存储上下文内容 114 | data = { 115 | "msg": [ 116 | {"role":"system", "content": ```填入你的提示词```} 117 | ] 118 | } 119 | ``` 120 | 121 | ## 接口说明 122 | 接口全部使用POST请求。 123 | 124 | #### ASR语音识别接口 125 | ```python 126 | # url为/api/asr 127 | # 请求数据格式为json 128 | # 将音频数据编码成urlsafe的base64字符串,放进请求体data字段中 129 | { 130 | "data": str # base64音频数据 131 | } 132 | # 服务端直接返回识别结果文本 133 | ``` 134 | 135 | #### 对话接口 136 | ```python 137 | # 对话接口为sse流式接口,服务端会将大模型的回答切片并生成对应的语音数据,一段一段返回客户端 
138 | # 请求数据格式为json 139 | # 将大模型上下文数据放进msg字段,类型为字符串数组 140 | # 请求例子 141 | { 142 | "msg": [ 143 | {"role": "user", "content": "你好呀!"}, 144 | {"role": "assistant", "content": "你好呀!有什么能帮到你的吗?"}, 145 | {"role": "user", "content": "1+1等于多少呢?"}, 146 | ] 147 | } 148 | 149 | # 服务端响应例子 150 | { 151 | "file": str # urlsafe的base64字符串音频文件 152 | "message": str # 音频数据对应的文本 153 | "done": False # bool类型,用于判断是否为最后一个数据包 154 | } 155 | # 最后一个数据包服务端会将大模型完整的回答文本放进message字段返回客户端 156 | { 157 | "file": str 158 | "message": str # 字符串类型,大模型完整回答文本,用于拼接上下文 159 | "done": True # bool类型,用于判断是否为最后一个数据包 160 | } 161 | ``` 162 | -------------------------------------------------------------------------------- /client-gui/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | *.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # Flet 163 | storage/ -------------------------------------------------------------------------------- /client-gui/README.md: -------------------------------------------------------------------------------- 1 | # ClientGui app 2 | 3 | ## Run the app 4 | 5 | ### uv 6 | 7 | Run as a desktop app: 8 | 9 | ``` 10 | uv run flet run 11 | ``` 12 | 13 | Run as a web app: 14 | 15 | ``` 16 | uv run flet run --web 17 | ``` 18 | 19 | ### Poetry 20 | 21 | Install dependencies from `pyproject.toml`: 22 | 23 | ``` 24 | poetry install 25 | ``` 26 | 27 | Run as a desktop app: 28 | 29 | ``` 30 | poetry run flet run 31 | ``` 32 | 33 | Run as a web app: 34 | 35 | ``` 36 | poetry run flet run --web 37 | ``` 38 | 39 | For more details on running the app, refer to the [Getting Started Guide](https://flet.dev/docs/getting-started/). 40 | 41 | ## Build the app 42 | 43 | ### Android 44 | 45 | ``` 46 | flet build apk -v 47 | ``` 48 | 49 | For more details on building and signing `.apk` or `.aab`, refer to the [Android Packaging Guide](https://flet.dev/docs/publish/android/). 50 | 51 | ### iOS 52 | 53 | ``` 54 | flet build ipa -v 55 | ``` 56 | 57 | For more details on building and signing `.ipa`, refer to the [iOS Packaging Guide](https://flet.dev/docs/publish/ios/). 58 | 59 | ### macOS 60 | 61 | ``` 62 | flet build macos -v 63 | ``` 64 | 65 | For more details on building macOS package, refer to the [macOS Packaging Guide](https://flet.dev/docs/publish/macos/). 66 | 67 | ### Linux 68 | 69 | ``` 70 | flet build linux -v 71 | ``` 72 | 73 | For more details on building Linux package, refer to the [Linux Packaging Guide](https://flet.dev/docs/publish/linux/). 74 | 75 | ### Windows 76 | 77 | ``` 78 | flet build windows -v 79 | ``` 80 | 81 | For more details on building Windows package, refer to the [Windows Packaging Guide](https://flet.dev/docs/publish/windows/). 
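Note that the top-level README of this repository starts the same app directly with the project's Python environment instead of going through `flet run`; assuming the paths given in that README, an equivalent invocation looks like:

```
# Windows, using the bundled runtime described in the top-level README
runtime\python.exe client-gui\src\client_gui.py

# Linux, inside the virtual environment created per the top-level README
python client-gui/src/client_gui.py
```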
-------------------------------------------------------------------------------- /client-gui/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "client-gui" 3 | version = "0.1.0" 4 | description = "" 5 | readme = "README.md" 6 | requires-python = ">=3.9" 7 | authors = [ 8 | { name = "Flet developer", email = "you@example.com" } 9 | ] 10 | dependencies = [ 11 | "flet==0.27.6" 12 | ] 13 | 14 | [tool.flet] 15 | # org name in reverse domain name notation, e.g. "com.mycompany". 16 | # Combined with project.name to build bundle ID for iOS and Android apps 17 | org = "com.mycompany" 18 | 19 | # project display name that is used as an app title on Android and iOS home screens, 20 | # shown in window titles and about app dialogs on desktop. 21 | product = "client-gui" 22 | 23 | # company name to display in about app dialogs 24 | company = "Flet" 25 | 26 | # copyright text to display in about app dialogs 27 | copyright = "Copyright (C) 2025 by Flet" 28 | 29 | [tool.flet.app] 30 | path = "src" 31 | 32 | [tool.uv] 33 | dev-dependencies = [ 34 | "flet[all]==0.27.6", 35 | ] 36 | 37 | [tool.poetry] 38 | package-mode = false 39 | 40 | [tool.poetry.group.dev.dependencies] 41 | flet = {extras = ["all"], version = "0.27.6"} -------------------------------------------------------------------------------- /client-gui/src/assets/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/client-gui/src/assets/icon.png -------------------------------------------------------------------------------- /client-gui/src/assets/splash_android.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/client-gui/src/assets/splash_android.png -------------------------------------------------------------------------------- /client-gui/src/cli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import click 16 | import matplotlib.pyplot as plt 17 | import numpy as np 18 | from audiolab import Reader, Writer, info, load_audio 19 | 20 | from pysilero import SileroVAD, VADIterator 21 | 22 | 23 | @click.command() 24 | @click.argument("wav_path", type=click.Path(exists=True, file_okay=True)) 25 | @click.option("--version", default="v5", help="Silero VAD version") 26 | @click.option("--denoise/--no-denoise", default=False, help="Denoise before vad") 27 | @click.option("--streaming/--no-streaming", default=False, help="Streming mode") 28 | @click.option("--save-path", help="Save path for output audio") 29 | @click.option("--plot/--no-plot", default=False, help="Plot the vad probabilities") 30 | def main(wav_path, version, denoise, streaming, save_path, plot): 31 | if not streaming: 32 | model = SileroVAD(version, info(wav_path).rate, denoise=denoise) 33 | speech_timestamps = model.get_speech_timestamps(wav_path, return_seconds=True, save_path=save_path) 34 | print("None streaming result:", list(speech_timestamps)) 35 | 36 | if plot: 37 | audio, rate = load_audio(wav_path, dtype=np.float32) 38 | x1 = np.arange(0, audio.shape[1]) / rate 39 | outputs = list(model.get_speech_probs(wav_path)) 40 | x2 = [i * 32 / 1000 for i in range(0, len(outputs))] 41 | plt.plot(x1, audio[0]) 42 | plt.plot(x2, outputs) 43 | plt.show() 44 | else: 45 | print("Streaming result:", end=" ") 46 | reader = Reader(wav_path, dtype=np.float32, frame_size_ms=10) 47 | if save_path is not None: 48 | writer = Writer(save_path, reader.rate, layout=reader.layout) 49 | vad_iterator = VADIterator(version, reader.rate) 50 | for idx, (frame, _) in enumerate(reader): 51 | partial = idx == reader.num_frames - 1 52 | for speech_dict, speech_samples in vad_iterator(frame.squeeze(), partial, return_seconds=True): 53 | if "start" in speech_dict or "end" in speech_dict: 54 | print(speech_dict, end=" ") 55 | if save_path is not None and speech_samples is not None: 56 | writer.write(speech_samples) 57 | 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /client-gui/src/client_gui.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | now_dir = os.getcwd() 5 | sys.path.append(now_dir) 6 | sys.path.append("%s/client-gui" % (now_dir)) 7 | sys.path.append("%s/client-gui/src" % (now_dir)) 8 | 9 | import flet as ft 10 | import ui 11 | from threading import Thread 12 | import client_utils 13 | 14 | client_utils_thread = Thread(target=client_utils.main, args=()) 15 | client_utils_thread.daemon = True 16 | client_utils_thread.start() 17 | 18 | def get_msg_box(msg: str): 19 | return ft.Container( 20 | content=ft.Text(msg, size=20, text_align=ft.TextAlign.CENTER), 21 | padding=5, 22 | border_radius=10, 23 | bgcolor=ft.colors.BLUE_900, 24 | ) 25 | 26 | def main(page: ft.Page): 27 | page.horizontal_alignment = ft.CrossAxisAlignment.STRETCH 28 | page.title = "Moe Chat GUI" 29 | 30 | def send_message_click(e): 31 | if new_message.value != "" and client_utils.status: 32 | mmsg = f"\"{new_message.value}\"" 33 | client_utils.add_msg_me(mmsg.replace("\"", "")) 34 | client_utils.to_llm_and_tts(mmsg, "0.000") 35 | new_message.value = "" 36 | new_message.focus() 37 | page.update() 38 | 39 | # A new message entry form 40 | new_message = ft.TextField( 41 | hint_text="输入信息", 42 | autofocus=True, 43 | shift_enter=True, 44 | min_lines=1, 45 | max_lines=5, 46 | filled=True, 47 | expand=True, 48 | 
on_submit=send_message_click, 49 | ) 50 | 51 | # Add everything to the page 52 | page.add( 53 | ft.Container( 54 | content=ui.chat_list, 55 | border=ft.border.all(1, ft.Colors.OUTLINE), 56 | border_radius=5, 57 | padding=10, 58 | expand=True, 59 | ), 60 | ft.Row( 61 | [ 62 | new_message, 63 | ft.IconButton( 64 | icon=ft.Icons.SEND_ROUNDED, 65 | tooltip="Send message", 66 | on_click=send_message_click, 67 | ), 68 | ] 69 | ), 70 | ) 71 | 72 | ft.app(target=main) -------------------------------------------------------------------------------- /client-gui/src/frame_queue.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import numpy as np 16 | import soxr 17 | 18 | 19 | class FrameQueue: 20 | def __init__(self, frame_size, in_rate, speech_pad_samples=0, out_rate=None, padding=True): 21 | self.frame_size = frame_size 22 | # padding zeros for the last frame 23 | self.padding = padding 24 | self.speech_pad_samples = speech_pad_samples 25 | # cache the original samples for padding and soxr's delay 26 | # TODO: use the largest delay of soxr instead of 500ms cache 27 | num_cached_samples = speech_pad_samples + 500 * in_rate // 1000 28 | self.cached_samples = np.zeros(num_cached_samples, dtype=np.float32) 29 | self.cache_start = -len(self.cached_samples) 30 | 31 | self.current_sample = 0 32 | self.remained_samples = np.empty(0, dtype=np.float32) 33 | 34 | if out_rate is None or in_rate == out_rate: 35 | self.step = 1.0 36 | self.resampler = None 37 | else: 38 | self.step = in_rate / out_rate 39 | self.resampler = soxr.ResampleStream(in_rate, out_rate, num_channels=1) 40 | 41 | def add_chunk(self, chunk, is_last=False): 42 | # cache the original frame without resampling for `lookforward` of vad start 43 | # cache start is the absolute sample index of the first sample in the cached_samples 44 | if len(chunk) > 0: 45 | self.cache_start += len(chunk) 46 | self.cached_samples = np.roll(self.cached_samples, -len(chunk)) 47 | self.cached_samples[-len(chunk) :] = chunk[-len(self.cached_samples) :] 48 | # resample 49 | if self.resampler is not None: 50 | chunk = self.resampler.resample_chunk(chunk, is_last) 51 | # enqueue chunk 52 | self.remained_samples = np.concatenate((self.remained_samples, chunk)) 53 | 54 | while len(self.remained_samples) >= self.frame_size: 55 | frame = self.remained_samples[: self.frame_size] 56 | self.remained_samples = self.remained_samples[self.frame_size :] 57 | # frame_start and frame_end is the sample index before resampling 58 | frame_start = self.current_sample 59 | self.current_sample += int(len(frame) * self.step) 60 | frame_end = self.current_sample 61 | yield frame_start, frame_end, frame 62 | 63 | if is_last and len(self.remained_samples) > 0 and self.padding: 64 | frame = self.remained_samples 65 | frame_start = self.current_sample 66 | self.current_sample += int(len(frame) * 
self.step) 67 | frame = np.pad(frame, (0, self.frame_size - len(frame))) 68 | frame_end = self.current_sample 69 | yield frame_start, frame_end, frame 70 | 71 | def get_frame(self, speech_padding=False): 72 | # dequeue one original frame without resampling 73 | frame_start = self.current_sample - int(self.frame_size * self.step) 74 | frame_end = self.current_sample 75 | if speech_padding: 76 | frame_start -= self.speech_pad_samples 77 | # get the relative sample index of the speech 78 | speech_start = frame_start - self.cache_start 79 | speech_end = frame_end - self.cache_start 80 | return self.cached_samples[speech_start:speech_end] 81 | 82 | 83 | if __name__ == "__main__": 84 | queue = FrameQueue(3, 1000) 85 | frames = [[1, 2, 3], [4, 5], [6, 7, 8]] 86 | for index, frame in enumerate(frames): 87 | for frame_start, frame_end, frame in queue.add_chunk(frame, index == len(frames) - 1): 88 | print(frame_start, frame_end, frame) 89 | -------------------------------------------------------------------------------- /client-gui/src/pickable_session.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from functools import partial 16 | 17 | import onnxruntime as ort 18 | from modelscope import snapshot_download 19 | 20 | 21 | class PickableSession: 22 | """ 23 | This is a wrapper to make the current InferenceSession class pickable. 
24 | """ 25 | 26 | def __init__(self, version="v5"): 27 | opts = ort.SessionOptions() 28 | opts.inter_op_num_threads = 1 29 | opts.intra_op_num_threads = 1 30 | opts.log_severity_level = 3 31 | 32 | assert version in ["v4", "v5"] 33 | model_id = "pengzhendong/silero-vad" 34 | try: 35 | repo_dir = snapshot_download(model_id) 36 | except Exception: 37 | from modelscope.utils.file_utils import get_default_modelscope_cache_dir 38 | 39 | repo_dir = f"{get_default_modelscope_cache_dir()}/models/{model_id}" 40 | self.model_path = f"{repo_dir}/{version}/silero_vad.onnx" 41 | self.init_session = partial(ort.InferenceSession, sess_options=opts, providers=["CPUExecutionProvider"]) 42 | self.sess = self.init_session(self.model_path) 43 | 44 | def run(self, *args): 45 | return self.sess.run(None, *args) 46 | 47 | def __getstate__(self): 48 | return {"model_path": self.model_path} 49 | 50 | def __setstate__(self, values): 51 | self.model_path = values["model_path"] 52 | self.sess = self.init_session(self.model_path) 53 | 54 | 55 | VERSIONS = ["v4", "v5"] 56 | silero_vad = {version: PickableSession(version) for version in VERSIONS} 57 | -------------------------------------------------------------------------------- /client-gui/src/ui.py: -------------------------------------------------------------------------------- 1 | import flet as ft 2 | 3 | # Chat messages 4 | chat_list = ft.ListView( 5 | controls=[], 6 | expand=True, 7 | spacing=10, 8 | auto_scroll=True, 9 | ) 10 | 11 | class ChatMessage: 12 | def __init__(self, user_name: str, text: str, positon: str): 13 | if positon == "left": 14 | self.position1 = ft.MainAxisAlignment.START 15 | self.position2 = ft.CrossAxisAlignment.START 16 | else: 17 | self.position1 = ft.MainAxisAlignment.END 18 | self.position2 = ft.CrossAxisAlignment.END 19 | self.user_name = user_name 20 | self.text = text 21 | self.tou = ft.CircleAvatar( 22 | content=ft.Text(self.get_initials(user_name), size=25), 23 | color=ft.Colors.WHITE, 24 | bgcolor=self.get_avatar_color(user_name), 25 | min_radius=30, 26 | ) 27 | self.msg_list = ft.Column( 28 | controls=[ 29 | ft.Container( 30 | height=5, 31 | width=1, 32 | ) 33 | ], 34 | alignment=ft.MainAxisAlignment.START, 35 | horizontal_alignment=self.position2, 36 | auto_scroll=True, 37 | ) 38 | self.cont = ft.Row( 39 | data=self.user_name, 40 | # controls=[self.msg_list, self.tou], 41 | alignment=self.position1, 42 | vertical_alignment=ft.CrossAxisAlignment.START, 43 | expand=True, 44 | ) 45 | if positon == "left": 46 | self.cont.controls = [self.tou, self.msg_list] 47 | print("left") 48 | else: 49 | self.cont.controls = [self.msg_list, self.tou] 50 | print("right") 51 | def get_initials(self, user_name: str): 52 | if user_name: 53 | return user_name[:1].capitalize() 54 | else: 55 | return "Unknown" # or any default value you prefer 56 | def get_avatar_color(self, user_name: str): 57 | colors_lookup = [ 58 | ft.Colors.AMBER, 59 | ft.Colors.BLUE, 60 | ft.Colors.BROWN, 61 | ft.Colors.CYAN, 62 | ft.Colors.GREEN, 63 | ft.Colors.INDIGO, 64 | ft.Colors.LIME, 65 | ft.Colors.ORANGE, 66 | ft.Colors.PINK, 67 | ft.Colors.PURPLE, 68 | ft.Colors.RED, 69 | ft.Colors.TEAL, 70 | ft.Colors.YELLOW, 71 | ] 72 | return colors_lookup[hash(user_name) % len(colors_lookup)] 73 | 74 | 75 | def get_msg_box(msg: str): 76 | return ft.Container( 77 | content=ft.Text(msg, size=20, text_align=ft.TextAlign.CENTER), 78 | padding=5, 79 | border_radius=10, 80 | bgcolor=ft.colors.BLUE_900, 81 | ) 
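A minimal usage sketch (hypothetical, not code from the repository; the real wiring presumably happens in `client_utils.py`, which is not shown in this excerpt): a chat bubble built from these helpers can be attached to the shared `chat_list` roughly as follows.

```python
# Hypothetical sketch, assuming it runs next to ui.py (client_gui.py adds
# client-gui/src to sys.path, so `import ui` resolves there as well).
import ui


def add_message(user_name, text, side="left"):
    msg = ui.ChatMessage(user_name, text, side)          # side is "left" or "right"
    msg.msg_list.controls.append(ui.get_msg_box(text))   # text bubble inside the message column
    ui.chat_list.controls.append(msg.cont)               # avatar + column row into the list view
    # the Flet page that owns chat_list still needs page.update() to repaint
```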
-------------------------------------------------------------------------------- /client-gui/src/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import warnings 16 | 17 | import librosa 18 | import numpy as np 19 | import parselmouth 20 | 21 | warnings.filterwarnings("ignore") 22 | 23 | 24 | def get_energy(chunk, sr, from_harmonic=1, to_harmonic=5): 25 | sound = parselmouth.Sound(chunk, sampling_frequency=sr) 26 | # pitch 27 | pitch = sound.to_pitch(pitch_floor=100, pitch_ceiling=350) 28 | # pitch energy 29 | # energy = np.mean(pitch.selected_array["strength"]) 30 | pitch = np.mean(pitch.selected_array["frequency"]) 31 | # frame log energy 32 | # energy = np.mean(sound.to_mfcc().to_array(), axis=1)[0] 33 | 34 | # energy form x-th harmonic to y-th harmonic 35 | freqs = librosa.fft_frequencies(sr=sr) 36 | freq_band_idx = np.where((freqs >= from_harmonic * pitch) & (freqs <= to_harmonic * pitch))[0] 37 | energy = np.sum(np.abs(librosa.stft(chunk)[freq_band_idx, :])) 38 | 39 | return energy 40 | -------------------------------------------------------------------------------- /client-requirements.txt: -------------------------------------------------------------------------------- 1 | sounddevice 2 | soundfile 3 | pysilero 4 | numpy 5 | scipy 6 | pygame 7 | requests 8 | flet[all]==0.27.6 -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | Core: 2 | sv: 3 | is_up: true 4 | 5 | master_audio: test.wav # 包含你声音的wav音频文件,建议3s-5s左右。 6 | thr: # 阈值,越小越敏感,建议0.5-0.8之间,实测好像不是很有用? 
7 | # wakeword: # 唤醒词相关,暂时不支持 8 | # is_up: false 9 | # word: 爱丽丝 #唤醒词 10 | # sleep_time: 1 #休眠时间,休眠后需要用带有唤醒词的语句唤醒,单位分钟,0表示永远需要唤醒词 11 | LLM: 12 | api: 13 | key: 14 | model: 15 | extra_config: # 大模型API额外参数,如:temperature: 0.7,温度参数 16 | temperature: 0.7 17 | GSV: 18 | text_lang: zh 19 | GPT_weight: 20 | SoVITS_weight: 21 | ref_audio_path: 22 | prompt_text: 23 | prompt_lang: zh 24 | aux_ref_audio_paths: # 多参考音频 v2模型有效 25 | - 26 | seed: -1 27 | top_k: 15 28 | batch_size: 1 29 | extra_ref_audio: # 使用情绪标签选择参考音频,例如 [普通]"你好呀。" 30 | # 实例 31 | 普通: 32 | - 参考音频路径 33 | - 参考音频文本 -------------------------------------------------------------------------------- /extra-req.txt: -------------------------------------------------------------------------------- 1 | faster-whisper 2 | -------------------------------------------------------------------------------- /extra-req2.txt: -------------------------------------------------------------------------------- 1 | addict 2 | datasets==2.18.0 3 | simplejson 4 | sortedcontainers 5 | modelscope==1.24.1 6 | funasr==1.2.6 7 | numpy==1.25 8 | pypinyin -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy<2.0 2 | scipy 3 | tensorboard 4 | librosa==0.10.2 5 | numba 6 | pytorch-lightning>=2.4 7 | gradio<5 8 | ffmpeg-python 9 | onnxruntime; sys_platform == 'darwin' 10 | onnxruntime-gpu; sys_platform != 'darwin' 11 | tqdm 12 | cn2an 13 | pypinyin 14 | pyopenjtalk>=0.4.1 15 | g2p_en 16 | torchaudio 17 | sentencepiece 18 | transformers>=4.43 19 | peft 20 | chardet 21 | PyYAML 22 | psutil 23 | jieba_fast 24 | jieba 25 | split-lang 26 | fast_langdetect>=0.3.1 27 | wordsegment 28 | rotary_embedding_torch 29 | ToJyutping 30 | g2pk2 31 | ko_pron 32 | opencc; sys_platform != 'linux' 33 | opencc==1.1.1; sys_platform == 'linux' 34 | python_mecab_ko; sys_platform != 'win32' 35 | fastapi[standard]>=0.115.2 36 | x_transformers 37 | torchmetrics<=1.5 38 | pydantic<=2.10.6 39 | ctranslate2>=4.0,<5 40 | huggingface_hub>=0.13 41 | tokenizers>=0.13,<1 42 | av>=11 43 | tqdm 44 | -------------------------------------------------------------------------------- /screen/img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/screen/img.png -------------------------------------------------------------------------------- /tools/AP_BWE_main/24kto48k/readme.txt: -------------------------------------------------------------------------------- 1 | For the inference of the v3 model, if you find that the generated audio sounds somewhat muffled, you can try using this audio super-resolution model. 
2 | 对于v3模型的推理,如果你发现生成的音频比较闷,可以尝试这个音频超分模型。 3 | 4 | put g_24kto48k.zip and config.json in this folder 5 | 把g_24kto48k.zip and config.json下到这个文件夹 6 | 7 | download link 下载链接: 8 | https://drive.google.com/drive/folders/1IIYTf2zbJWzelu4IftKD6ooHloJ8mnZF?usp=share_link 9 | 10 | audio sr project page 音频超分项目主页: 11 | https://github.com/yxlu-0102/AP-BWE 12 | -------------------------------------------------------------------------------- /tools/AP_BWE_main/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Ye-Xin Lu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tools/AP_BWE_main/datasets1/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tools/AP_BWE_main/datasets1/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import torch 4 | import torchaudio 5 | import torch.utils.data 6 | import torchaudio.functional as aF 7 | 8 | 9 | def amp_pha_stft(audio, n_fft, hop_size, win_size, center=True): 10 | hann_window = torch.hann_window(win_size).to(audio.device) 11 | stft_spec = torch.stft( 12 | audio, 13 | n_fft, 14 | hop_length=hop_size, 15 | win_length=win_size, 16 | window=hann_window, 17 | center=center, 18 | pad_mode="reflect", 19 | normalized=False, 20 | return_complex=True, 21 | ) 22 | log_amp = torch.log(torch.abs(stft_spec) + 1e-4) 23 | pha = torch.angle(stft_spec) 24 | 25 | com = torch.stack((torch.exp(log_amp) * torch.cos(pha), torch.exp(log_amp) * torch.sin(pha)), dim=-1) 26 | 27 | return log_amp, pha, com 28 | 29 | 30 | def amp_pha_istft(log_amp, pha, n_fft, hop_size, win_size, center=True): 31 | amp = torch.exp(log_amp) 32 | com = torch.complex(amp * torch.cos(pha), amp * torch.sin(pha)) 33 | hann_window = torch.hann_window(win_size).to(com.device) 34 | audio = torch.istft(com, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window, center=center) 35 | 36 | return audio 37 | 38 | 39 | def get_dataset_filelist(a): 40 | with open(a.input_training_file, "r", encoding="utf-8") as fi: 41 | training_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0] 42 | 43 | with open(a.input_validation_file, "r", encoding="utf-8") as fi: 44 | validation_indexes = 
[x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0] 45 | 46 | return training_indexes, validation_indexes 47 | 48 | 49 | class Dataset(torch.utils.data.Dataset): 50 | def __init__( 51 | self, 52 | training_indexes, 53 | wavs_dir, 54 | segment_size, 55 | hr_sampling_rate, 56 | lr_sampling_rate, 57 | split=True, 58 | shuffle=True, 59 | n_cache_reuse=1, 60 | device=None, 61 | ): 62 | self.audio_indexes = training_indexes 63 | random.seed(1234) 64 | if shuffle: 65 | random.shuffle(self.audio_indexes) 66 | self.wavs_dir = wavs_dir 67 | self.segment_size = segment_size 68 | self.hr_sampling_rate = hr_sampling_rate 69 | self.lr_sampling_rate = lr_sampling_rate 70 | self.split = split 71 | self.cached_wav = None 72 | self.n_cache_reuse = n_cache_reuse 73 | self._cache_ref_count = 0 74 | self.device = device 75 | 76 | def __getitem__(self, index): 77 | filename = self.audio_indexes[index] 78 | if self._cache_ref_count == 0: 79 | audio, orig_sampling_rate = torchaudio.load(os.path.join(self.wavs_dir, filename + ".wav")) 80 | self.cached_wav = audio 81 | self._cache_ref_count = self.n_cache_reuse 82 | else: 83 | audio = self.cached_wav 84 | self._cache_ref_count -= 1 85 | 86 | if orig_sampling_rate == self.hr_sampling_rate: 87 | audio_hr = audio 88 | else: 89 | audio_hr = aF.resample(audio, orig_freq=orig_sampling_rate, new_freq=self.hr_sampling_rate) 90 | 91 | audio_lr = aF.resample(audio, orig_freq=orig_sampling_rate, new_freq=self.lr_sampling_rate) 92 | audio_lr = aF.resample(audio_lr, orig_freq=self.lr_sampling_rate, new_freq=self.hr_sampling_rate) 93 | audio_lr = audio_lr[:, : audio_hr.size(1)] 94 | 95 | if self.split: 96 | if audio_hr.size(1) >= self.segment_size: 97 | max_audio_start = audio_hr.size(1) - self.segment_size 98 | audio_start = random.randint(0, max_audio_start) 99 | audio_hr = audio_hr[:, audio_start : audio_start + self.segment_size] 100 | audio_lr = audio_lr[:, audio_start : audio_start + self.segment_size] 101 | else: 102 | audio_hr = torch.nn.functional.pad(audio_hr, (0, self.segment_size - audio_hr.size(1)), "constant") 103 | audio_lr = torch.nn.functional.pad(audio_lr, (0, self.segment_size - audio_lr.size(1)), "constant") 104 | 105 | return (audio_hr.squeeze(), audio_lr.squeeze()) 106 | 107 | def __len__(self): 108 | return len(self.audio_indexes) 109 | -------------------------------------------------------------------------------- /tools/AP_BWE_main/models/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/tools/__init__.py -------------------------------------------------------------------------------- /tools/asr/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def check_fw_local_models(): 5 | """ 6 | 启动时检查本地是否有 Faster Whisper 模型. 
7 | """ 8 | model_size_list = [ 9 | "tiny", 10 | "tiny.en", 11 | "base", 12 | "base.en", 13 | "small", 14 | "small.en", 15 | "medium", 16 | "medium.en", 17 | "large", 18 | "large-v1", 19 | "large-v2", 20 | "large-v3", 21 | ] 22 | for i, size in enumerate(model_size_list): 23 | if os.path.exists(f"tools/asr/models/faster-whisper-{size}"): 24 | model_size_list[i] = size + "-local" 25 | return model_size_list 26 | 27 | 28 | asr_dict = { 29 | "达摩 ASR (中文)": {"lang": ["zh", "yue"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]}, 30 | "Faster Whisper (多语种)": { 31 | "lang": ["auto", "zh", "en", "ja", "ko", "yue"], 32 | "size": check_fw_local_models(), 33 | "path": "fasterwhisper_asr.py", 34 | "precision": ["float32", "float16", "int8"], 35 | }, 36 | } 37 | -------------------------------------------------------------------------------- /tools/asr/models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /tools/audio_sr.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | import sys 3 | import os 4 | 5 | AP_BWE_main_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "AP_BWE_main") 6 | sys.path.append(AP_BWE_main_dir_path) 7 | import json 8 | import torch 9 | import torchaudio.functional as aF 10 | # from attrdict import AttrDict####will be bug in py3.10 11 | 12 | from datasets1.dataset import amp_pha_stft, amp_pha_istft 13 | from models.model import APNet_BWE_Model 14 | 15 | 16 | class AP_BWE: 17 | def __init__(self, device, DictToAttrRecursive, checkpoint_file=None): 18 | if checkpoint_file == None: 19 | checkpoint_file = "%s/24kto48k/g_24kto48k.zip" % (AP_BWE_main_dir_path) 20 | if os.path.exists(checkpoint_file) == False: 21 | raise FileNotFoundError 22 | config_file = os.path.join(os.path.split(checkpoint_file)[0], "config.json") 23 | with open(config_file) as f: 24 | data = f.read() 25 | json_config = json.loads(data) 26 | # h = AttrDict(json_config) 27 | h = DictToAttrRecursive(json_config) 28 | model = APNet_BWE_Model(h).to(device) 29 | state_dict = torch.load(checkpoint_file, map_location="cpu", weights_only=False) 30 | model.load_state_dict(state_dict["generator"]) 31 | model.eval() 32 | self.device = device 33 | self.model = model 34 | self.h = h 35 | 36 | def to(self, *arg, **kwargs): 37 | self.model.to(*arg, **kwargs) 38 | self.device = self.model.conv_pre_mag.weight.device 39 | return self 40 | 41 | def __call__(self, audio, orig_sampling_rate): 42 | with torch.no_grad(): 43 | # audio, orig_sampling_rate = torchaudio.load(inp_path) 44 | # audio = audio.to(self.device) 45 | audio = aF.resample(audio, orig_freq=orig_sampling_rate, new_freq=self.h.hr_sampling_rate) 46 | amp_nb, pha_nb, com_nb = amp_pha_stft(audio, self.h.n_fft, self.h.hop_size, self.h.win_size) 47 | amp_wb_g, pha_wb_g, com_wb_g = self.model(amp_nb, pha_nb) 48 | audio_hr_g = amp_pha_istft(amp_wb_g, pha_wb_g, self.h.n_fft, self.h.hop_size, self.h.win_size) 49 | # sf.write(opt_path, audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate, 'PCM_16') 50 | return audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate 51 | -------------------------------------------------------------------------------- /tools/cmd-denoise.py: -------------------------------------------------------------------------------- 1 | import 
os 2 | import argparse 3 | import traceback 4 | 5 | from modelscope.pipelines import pipeline 6 | from modelscope.utils.constant import Tasks 7 | from tqdm import tqdm 8 | 9 | path_denoise = "tools/denoise-model/speech_frcrn_ans_cirm_16k" 10 | path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" 11 | ans = pipeline(Tasks.acoustic_noise_suppression, model=path_denoise) 12 | 13 | 14 | def execute_denoise(input_folder, output_folder): 15 | os.makedirs(output_folder, exist_ok=True) 16 | # print(input_folder) 17 | # print(list(os.listdir(input_folder).sort())) 18 | for name in tqdm(os.listdir(input_folder)): 19 | try: 20 | ans("%s/%s" % (input_folder, name), output_path="%s/%s" % (output_folder, name)) 21 | except: 22 | traceback.print_exc() 23 | 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument( 28 | "-i", "--input_folder", type=str, required=True, help="Path to the folder containing WAV files." 29 | ) 30 | parser.add_argument("-o", "--output_folder", type=str, required=True, help="Output folder to store transcriptions.") 31 | parser.add_argument( 32 | "-p", "--precision", type=str, default="float16", choices=["float16", "float32"], help="fp16 or fp32" 33 | ) # 还没接入 34 | cmd = parser.parse_args() 35 | execute_denoise( 36 | input_folder=cmd.input_folder, 37 | output_folder=cmd.output_folder, 38 | ) 39 | -------------------------------------------------------------------------------- /tools/denoise-model/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /tools/i18n/i18n.py: -------------------------------------------------------------------------------- 1 | import json 2 | import locale 3 | import os 4 | 5 | I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale") 6 | 7 | 8 | def load_language_list(language): 9 | with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), "r", encoding="utf-8") as f: 10 | language_list = json.load(f) 11 | return language_list 12 | 13 | 14 | def scan_language_list(): 15 | language_list = [] 16 | for name in os.listdir(I18N_JSON_DIR): 17 | if name.endswith(".json"): 18 | language_list.append(name.split(".")[0]) 19 | return language_list 20 | 21 | 22 | class I18nAuto: 23 | def __init__(self, language=None): 24 | if language in ["Auto", None]: 25 | language = locale.getdefaultlocale()[0] 26 | # getlocale can't identify the system's language ((None, None)) 27 | if not os.path.exists(os.path.join(I18N_JSON_DIR, f"{language}.json")): 28 | language = "en_US" 29 | self.language = language 30 | self.language_map = load_language_list(language) 31 | 32 | def __call__(self, key): 33 | return self.language_map.get(key, key) 34 | 35 | def __repr__(self): 36 | return "Use Language: " + self.language 37 | 38 | 39 | if __name__ == "__main__": 40 | i18n = I18nAuto(language="en_US") 41 | print(i18n) 42 | -------------------------------------------------------------------------------- /tools/slice_audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import traceback 5 | from scipy.io import wavfile 6 | 7 | # parent_directory = os.path.dirname(os.path.abspath(__file__)) 8 | # sys.path.append(parent_directory) 9 | from tools.my_utils import load_audio 10 | from slicer2 import Slicer 11 | 12 | 13 | def slice(inp, 
opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, all_part): 14 | os.makedirs(opt_root, exist_ok=True) 15 | if os.path.isfile(inp): 16 | input = [inp] 17 | elif os.path.isdir(inp): 18 | input = [os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] 19 | else: 20 | return "输入路径存在但既不是文件也不是文件夹" 21 | slicer = Slicer( 22 | sr=32000, # 长音频采样率 23 | threshold=int(threshold), # 音量小于这个值视作静音的备选切割点 24 | min_length=int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 25 | min_interval=int(min_interval), # 最短切割间隔 26 | hop_size=int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) 27 | max_sil_kept=int(max_sil_kept), # 切完后静音最多留多长 28 | ) 29 | _max = float(_max) 30 | alpha = float(alpha) 31 | for inp_path in input[int(i_part) :: int(all_part)]: 32 | # print(inp_path) 33 | try: 34 | name = os.path.basename(inp_path) 35 | audio = load_audio(inp_path, 32000) 36 | # print(audio.shape) 37 | for chunk, start, end in slicer.slice(audio): # start和end是帧数 38 | tmp_max = np.abs(chunk).max() 39 | if tmp_max > 1: 40 | chunk /= tmp_max 41 | chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk 42 | wavfile.write( 43 | "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end), 44 | 32000, 45 | # chunk.astype(np.float32), 46 | (chunk * 32767).astype(np.int16), 47 | ) 48 | except: 49 | print(inp_path, "->fail->", traceback.format_exc()) 50 | return "执行完毕,请检查输出文件" 51 | 52 | 53 | print(slice(*sys.argv[1:])) 54 | -------------------------------------------------------------------------------- /tools/uvr5/bs_roformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/tools/uvr5/bs_roformer/__init__.py -------------------------------------------------------------------------------- /tools/uvr5/bs_roformer/attend.py: -------------------------------------------------------------------------------- 1 | from packaging import version 2 | import torch 3 | from torch import nn, einsum 4 | import torch.nn.functional as F 5 | 6 | 7 | def exists(val): 8 | return val is not None 9 | 10 | 11 | def default(v, d): 12 | return v if exists(v) else d 13 | 14 | 15 | class Attend(nn.Module): 16 | def __init__(self, dropout=0.0, flash=False, scale=None): 17 | super().__init__() 18 | self.scale = scale 19 | self.dropout = dropout 20 | self.attn_dropout = nn.Dropout(dropout) 21 | 22 | self.flash = flash 23 | assert not (flash and version.parse(torch.__version__) < version.parse("2.0.0")), ( 24 | "in order to use flash attention, you must be using pytorch 2.0 or above" 25 | ) 26 | 27 | def flash_attn(self, q, k, v): 28 | # _, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device 29 | 30 | if exists(self.scale): 31 | default_scale = q.shape[-1] ** -0.5 32 | q = q * (self.scale / default_scale) 33 | 34 | # pytorch 2.0 flash attn: q, k, v, mask, dropout, softmax_scale 35 | # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): 36 | return F.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout if self.training else 0.0) 37 | 38 | def forward(self, q, k, v): 39 | """ 40 | einstein notation 41 | b - batch 42 | h - heads 43 | n, i, j - sequence length (base sequence length, source, target) 44 | d - feature dimension 45 | """ 46 | 47 | # q_len, k_len, device = q.shape[-2], k.shape[-2], q.device 48 | 49 | scale = default(self.scale, q.shape[-1] ** -0.5) 50 | 51 | if self.flash: 52 | 
return self.flash_attn(q, k, v) 53 | 54 | # similarity 55 | 56 | sim = einsum("b h i d, b h j d -> b h i j", q, k) * scale 57 | 58 | # attention 59 | 60 | attn = sim.softmax(dim=-1) 61 | attn = self.attn_dropout(attn) 62 | 63 | # aggregate values 64 | 65 | out = einsum("b h i j, b h j d -> b h i d", attn, v) 66 | 67 | return out 68 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def 
forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_123812KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, 
activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_123821KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = 
nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | 5 | default_param = {} 6 | default_param["bins"] = 768 7 | default_param["unstable_bins"] = 9 # training only 8 | default_param["reduction_bins"] = 762 # training only 9 | default_param["sr"] = 44100 10 | default_param["pre_filter_start"] = 757 11 | default_param["pre_filter_stop"] = 768 12 | default_param["band"] = {} 13 | 14 | 15 | default_param["band"][1] = { 16 | "sr": 11025, 17 | "hl": 128, 18 | "n_fft": 960, 19 | "crop_start": 0, 20 | "crop_stop": 245, 21 | "lpf_start": 61, # inference only 22 | "res_type": "polyphase", 23 | } 24 | 25 | default_param["band"][2] = { 26 | "sr": 44100, 27 | "hl": 512, 28 | "n_fft": 1536, 29 | "crop_start": 24, 30 | "crop_stop": 547, 31 | "hpf_start": 81, # inference only 32 | "res_type": "sinc_best", 33 | } 34 | 35 | 36 | def int_keys(d): 37 | r = {} 38 | for k, v in d: 39 | if k.isdigit(): 40 | k = int(k) 41 | r[k] = v 42 | return r 43 | 44 | 45 | class ModelParameters(object): 46 | def __init__(self, config_path=""): 47 | if ".pth" == pathlib.Path(config_path).suffix: 48 | import zipfile 49 | 50 | with zipfile.ZipFile(config_path, "r") as zip: 51 | self.param = json.loads( 52 | zip.read("param.json"), object_pairs_hook=int_keys 53 | ) 54 | elif ".json" == pathlib.Path(config_path).suffix: 55 | with open(config_path, "r") as f: 56 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 57 | else: 58 | self.param = default_param 59 | 60 | for k in [ 61 | "mid_side", 62 | "mid_side_b", 63 | "mid_side_b2", 64 | "stereo_w", 65 | "stereo_n", 66 | "reverse", 67 | ]: 68 | if not k in self.param: 69 | self.param[k] = False 70 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 16000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 16000, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 32000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "kaiser_fast" 14 | } 15 | }, 16 | "sr": 32000, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } 
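Note (added for illustration, not part of the repository dump): the modelparams/*.json files above and below hold the band-split STFT settings (per-band sample rate, hop length, FFT size, crop and filter bins) that ModelParameters in tools/uvr5/lib/lib_v5/model_param_init.py loads; its int_keys() hook turns the JSON string band indices into integers, and flags such as "mid_side" or "reverse" default to False when absent. A minimal sketch, assuming tools/uvr5 is the working directory or on sys.path:

from lib.lib_v5.model_param_init import ModelParameters  # import path is an assumption

# Load one of the single-band parameter files listed in this directory.
mp = ModelParameters("lib/lib_v5/modelparams/1band_sr32000_hl512.json")

# Band keys come back as integers thanks to int_keys(): "band" -> {1: {...}}.
band = mp.param["band"][1]
print(mp.param["sr"], band["sr"], band["n_fft"], band["hl"])  # 32000 32000 2048 512

# Channel/flag options not present in the JSON are filled in as False.
print(mp.param["mid_side"], mp.param["reverse"])  # False False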
-------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 33075, 8 | "hl": 384, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 33075, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 
28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 20 | "crop_start": 14, 21 | "crop_stop": 322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 
141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 
138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 
34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 
23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/ensemble.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 424, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import torch 5 | from tqdm import tqdm 6 | 7 | 8 | def load_data(file_name: str = "./lib/name_params.json") -> dict: 9 | with open(file_name, "r") as f: 10 | data = json.load(f) 11 | 12 | return data 13 | 14 | 15 | def make_padding(width, cropsize, offset): 16 | left = offset 17 | roi_size = cropsize - left * 2 18 | if roi_size == 0: 19 | roi_size = cropsize 20 | right = roi_size - 
(width % roi_size) + left 21 | 22 | return left, right, roi_size 23 | 24 | 25 | def inference(X_spec, device, model, aggressiveness, data): 26 | """ 27 | data : dic configs 28 | """ 29 | 30 | def _execute( 31 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True 32 | ): 33 | model.eval() 34 | with torch.no_grad(): 35 | preds = [] 36 | 37 | iterations = [n_window] 38 | 39 | total_iterations = sum(iterations) 40 | for i in tqdm(range(n_window)): 41 | start = i * roi_size 42 | X_mag_window = X_mag_pad[ 43 | None, :, :, start : start + data["window_size"] 44 | ] 45 | X_mag_window = torch.from_numpy(X_mag_window) 46 | if is_half: 47 | X_mag_window = X_mag_window.half() 48 | X_mag_window = X_mag_window.to(device) 49 | 50 | pred = model.predict(X_mag_window, aggressiveness) 51 | 52 | pred = pred.detach().cpu().numpy() 53 | preds.append(pred[0]) 54 | 55 | pred = np.concatenate(preds, axis=2) 56 | return pred 57 | 58 | def preprocess(X_spec): 59 | X_mag = np.abs(X_spec) 60 | X_phase = np.angle(X_spec) 61 | 62 | return X_mag, X_phase 63 | 64 | X_mag, X_phase = preprocess(X_spec) 65 | 66 | coef = X_mag.max() 67 | X_mag_pre = X_mag / coef 68 | 69 | n_frame = X_mag_pre.shape[2] 70 | pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) 71 | n_window = int(np.ceil(n_frame / roi_size)) 72 | 73 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 74 | 75 | if list(model.state_dict().values())[0].dtype == torch.float16: 76 | is_half = True 77 | else: 78 | is_half = False 79 | pred = _execute( 80 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 81 | ) 82 | pred = pred[:, :, :n_frame] 83 | 84 | if data["tta"]: 85 | pad_l += roi_size // 2 86 | pad_r += roi_size // 2 87 | n_window += 1 88 | 89 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 90 | 91 | pred_tta = _execute( 92 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 93 | ) 94 | pred_tta = pred_tta[:, :, roi_size // 2 :] 95 | pred_tta = pred_tta[:, :, :n_frame] 96 | 97 | return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) 98 | else: 99 | return pred * coef, X_mag, np.exp(1.0j * X_phase) 100 | 101 | 102 | def _get_name_params(model_path, model_hash): 103 | data = load_data() 104 | flag = False 105 | ModelName = model_path 106 | for type in list(data): 107 | for model in list(data[type][0]): 108 | for i in range(len(data[type][0][model])): 109 | if str(data[type][0][model][i]["hash_name"]) == model_hash: 110 | flag = True 111 | elif str(data[type][0][model][i]["hash_name"]) in ModelName: 112 | flag = True 113 | 114 | if flag: 115 | model_params_auto = data[type][0][model][i]["model_params"] 116 | param_name_auto = data[type][0][model][i]["param_name"] 117 | if type == "equivalent": 118 | return param_name_auto, model_params_auto 119 | else: 120 | flag = False 121 | return param_name_auto, model_params_auto 122 | -------------------------------------------------------------------------------- /tools/uvr5/uvr5_weights/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /utilss/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import sv -------------------------------------------------------------------------------- /utilss/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/utilss/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /utilss/__pycache__/sv.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/utilss/__pycache__/sv.cpython-39.pyc -------------------------------------------------------------------------------- /utilss/sv.py: -------------------------------------------------------------------------------- 1 | from modelscope.pipelines import pipeline 2 | import soundfile as sf 3 | import numpy as np 4 | from scipy.signal import resample 5 | import io 6 | 7 | class SV: 8 | def __init__(self, config: dict): 9 | self.thr = "" 10 | with open(config["master_audio"], "rb") as f: 11 | audio_bytes = f.read() 12 | self.master_audio = self.resample_wav_bytes(audio_bytes) 13 | if "thr" in config: 14 | if config["thr"]: 15 | self.thr = str(config["thr"]) 16 | self.sv_pipeline = pipeline( 17 | task='speaker-verification', 18 | model='iic/speech_res2net_sv_zh-cn_3dspeaker_16k', 19 | model_revision='master' 20 | ) 21 | def resample_wav_bytes(self, wav_bytes, target_sr=16000): 22 | # 使用BytesIO将字节转为文件类对象 23 | with io.BytesIO(wav_bytes) as wav_file: 24 | # 读取音频数据 25 | data, original_sr = sf.read(wav_file, dtype='float32') 26 | 27 | # 立体声转单声道(取均值) 28 | if len(data.shape) > 1: 29 | data = np.mean(data, axis=1) 30 | 31 | # 计算重采样比例 32 | resample_ratio = target_sr / original_sr 33 | 34 | # 使用scipy的signal.resample进行重采样 35 | target_samples = int(len(data) * resample_ratio) 36 | resampled_data = resample(data, target_samples) 37 | 38 | # 归一化并转为16bit PCM格式 39 | resampled_data = np.clip(resampled_data, -1.0, 1.0) 40 | resampled_data = (resampled_data * 32767).astype(np.int16) 41 | 42 | return resampled_data 43 | def check_speaker(self, speaker_audio: bytes) -> bool: 44 | # with open("ttmp.wav", "wb") as f: 45 | # f.write(speaker_audio) 46 | with io.BytesIO(speaker_audio) as f: 47 | speaker_audio_1, _ = sf.read(f) 48 | res = {} 49 | if self.thr: 50 | res = self.sv_pipeline([speaker_audio_1, self.master_audio], self.thr) 51 | else: 52 | res = self.sv_pipeline([speaker_audio_1, self.master_audio]) 53 | print(f"[声纹识别结果]结果相似度{res['score']}, 目标相似度{self.thr}") 54 | if res["text"] == "yes": 55 | return True 56 | else: 57 | return False --------------------------------------------------------------------------------
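Note (added for illustration, not part of the repository dump): a minimal usage sketch for the SV speaker-verification helper defined in utilss/sv.py above. File names and the threshold are placeholders; the iic/speech_res2net_sv_zh-cn_3dspeaker_16k model is fetched by modelscope on first use, and check_speaker() expects raw WAV bytes.

from utilss.sv import SV

config = {
    "master_audio": "master.wav",  # reference recording of the target speaker (placeholder path)
    "thr": 0.35,                   # optional similarity threshold forwarded to the pipeline (assumed value)
}
sv = SV(config)  # reads the reference audio and builds the speaker-verification pipeline

with open("incoming.wav", "rb") as f:  # placeholder path to the audio being checked
    accepted = sv.check_speaker(f.read())  # True when the pipeline answers "yes"
print("speaker verified:", accepted)

The reference audio is converted to 16 kHz mono internally by resample_wav_bytes(), so any sample rate or channel layout can be supplied for master_audio.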