├── GPT_SoVITS ├── AR │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── bucket_sampler.py │ │ ├── data_module.py │ │ └── dataset.py │ ├── models │ │ ├── __init__.py │ │ ├── t2s_lightning_module.py │ │ ├── t2s_lightning_module_onnx.py │ │ ├── t2s_model.py │ │ ├── t2s_model_onnx.py │ │ └── utils.py │ ├── modules │ │ ├── __init__.py │ │ ├── activation.py │ │ ├── activation_onnx.py │ │ ├── embedding.py │ │ ├── embedding_onnx.py │ │ ├── lr_schedulers.py │ │ ├── optim.py │ │ ├── patched_mha_with_cache.py │ │ ├── patched_mha_with_cache_onnx.py │ │ ├── scaling.py │ │ ├── transformer.py │ │ └── transformer_onnx.py │ ├── text_processing │ │ ├── __init__.py │ │ ├── phonemizer.py │ │ └── symbols.py │ └── utils │ │ ├── __init__.py │ │ ├── initialize.py │ │ └── io.py ├── BigVGAN │ ├── LICENSE │ ├── README.md │ ├── activations.py │ ├── alias_free_activation │ │ ├── cuda │ │ │ ├── __init__.py │ │ │ ├── activation1d.py │ │ │ ├── anti_alias_activation.cpp │ │ │ ├── anti_alias_activation_cuda.cu │ │ │ ├── build │ │ │ │ └── _ │ │ │ ├── compat.h │ │ │ ├── load.py │ │ │ └── type_shim.h │ │ └── torch │ │ │ ├── __init__.py │ │ │ ├── act.py │ │ │ ├── filter.py │ │ │ └── resample.py │ ├── bigvgan.py │ ├── configs │ │ ├── bigvgan_22khz_80band.json │ │ ├── bigvgan_24khz_100band.json │ │ ├── bigvgan_base_22khz_80band.json │ │ ├── bigvgan_base_24khz_100band.json │ │ ├── bigvgan_v2_22khz_80band_256x.json │ │ ├── bigvgan_v2_22khz_80band_fmax8k_256x.json │ │ ├── bigvgan_v2_24khz_100band_256x.json │ │ ├── bigvgan_v2_44khz_128band_256x.json │ │ └── bigvgan_v2_44khz_128band_512x.json │ ├── discriminators.py │ ├── env.py │ ├── incl_licenses │ │ ├── LICENSE_1 │ │ ├── LICENSE_2 │ │ ├── LICENSE_3 │ │ ├── LICENSE_4 │ │ ├── LICENSE_5 │ │ ├── LICENSE_6 │ │ ├── LICENSE_7 │ │ └── LICENSE_8 │ ├── inference.py │ ├── inference_e2e.py │ ├── loss.py │ ├── meldataset.py │ ├── nv-modelcard++ │ │ ├── .gitkeep │ │ ├── bias.md │ │ ├── explainability.md │ │ ├── overview.md │ │ ├── privacy.md │ │ └── safety.md │ ├── requirements.txt │ ├── tests │ │ ├── test_activation.py │ │ ├── test_activation_snake_beta.py │ │ └── test_cuda_vs_torch_model.py │ ├── train.py │ └── utils0.py ├── TTS_infer_pack │ ├── TTS.py │ ├── TextPreprocessor.py │ ├── __init__.py │ └── text_segmentation_method.py ├── configs │ ├── .gitignore │ └── s2.json ├── download.py ├── export_torch_script.py ├── export_torch_script_v3.py ├── f5_tts │ └── model │ │ ├── __init__.py │ │ ├── backbones │ │ ├── README.md │ │ ├── dit.py │ │ ├── mmdit.py │ │ └── unett.py │ │ └── modules.py ├── feature_extractor │ ├── __init__.py │ ├── cnhubert.py │ └── whisper_enc.py ├── inference_cli.py ├── inference_gui.py ├── inference_webui.py ├── inference_webui_fast.py ├── module │ ├── __init__.py │ ├── attentions.py │ ├── attentions_onnx.py │ ├── commons.py │ ├── core_vq.py │ ├── data_utils.py │ ├── losses.py │ ├── mel_processing.py │ ├── models.py │ ├── models_onnx.py │ ├── modules.py │ ├── mrte_model.py │ ├── quantize.py │ └── transforms.py ├── onnx_export.py ├── prepare_datasets │ ├── 1-get-text.py │ ├── 2-get-hubert-wav32k.py │ └── 3-get-semantic.py ├── pretrained_models │ └── .gitignore ├── process_ckpt.py ├── s1_train.py ├── s2_train.py ├── s2_train_v3.py ├── s2_train_v3_lora.py ├── text │ ├── .gitignore │ ├── LangSegmenter │ │ ├── __init__.py │ │ └── langsegmenter.py │ ├── __init__.py │ ├── cantonese.py │ ├── chinese.py │ ├── chinese2.py │ ├── cleaner.py │ ├── cmudict-fast.rep │ ├── cmudict.rep │ ├── en_normalization │ │ └── expend.py │ ├── engdict-hot.rep │ ├── engdict_cache.pickle │ ├── 
english.py │ ├── g2pw │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── g2pw.py │ │ ├── onnx_api.py │ │ ├── polyphonic-fix.rep │ │ ├── polyphonic.pickle │ │ ├── polyphonic.rep │ │ └── utils.py │ ├── ja_userdic │ │ └── userdict.csv │ ├── japanese.py │ ├── korean.py │ ├── namedict_cache.pickle │ ├── opencpop-strict.txt │ ├── symbols.py │ ├── symbols2.py │ ├── tone_sandhi.py │ └── zh_normalization │ │ ├── README.md │ │ ├── __init__.py │ │ ├── char_convert.py │ │ ├── chronology.py │ │ ├── constants.py │ │ ├── num.py │ │ ├── phonecode.py │ │ ├── quantifier.py │ │ └── text_normlization.py └── utils.py ├── LICENSE ├── README.md ├── chat_server.py ├── client-gui ├── .gitignore ├── README.md ├── pyproject.toml └── src │ ├── assets │ ├── icon.png │ └── splash_android.png │ ├── cli.py │ ├── client_gui.py │ ├── client_utils.py │ ├── frame_queue.py │ ├── pickable_session.py │ ├── pysilero.py │ ├── ui.py │ └── utils.py ├── client-requirements.txt ├── client_cli.py ├── config.yaml ├── extra-req.txt ├── extra-req2.txt ├── requirements.txt ├── screen └── img.png ├── tools ├── AP_BWE_main │ ├── 24kto48k │ │ └── readme.txt │ ├── LICENSE │ ├── README.md │ ├── datasets1 │ │ ├── __init__.py │ │ └── dataset.py │ └── models │ │ ├── __init__.py │ │ └── model.py ├── __init__.py ├── asr │ ├── config.py │ ├── fasterwhisper_asr.py │ ├── funasr_asr.py │ └── models │ │ └── .gitignore ├── audio_sr.py ├── cmd-denoise.py ├── denoise-model │ └── .gitignore ├── i18n │ ├── i18n.py │ ├── locale │ │ ├── en_US.json │ │ ├── es_ES.json │ │ ├── fr_FR.json │ │ ├── it_IT.json │ │ ├── ja_JP.json │ │ ├── ko_KR.json │ │ ├── pt_BR.json │ │ ├── ru_RU.json │ │ ├── tr_TR.json │ │ ├── zh_CN.json │ │ ├── zh_HK.json │ │ ├── zh_SG.json │ │ └── zh_TW.json │ └── scan_i18n.py ├── my_utils.py ├── slice_audio.py ├── slicer2.py ├── subfix_webui.py └── uvr5 │ ├── bs_roformer │ ├── __init__.py │ ├── attend.py │ ├── bs_roformer.py │ └── mel_band_roformer.py │ ├── bsroformer.py │ ├── lib │ ├── lib_v5 │ │ ├── dataset.py │ │ ├── layers.py │ │ ├── layers_123812KB.py │ │ ├── layers_123821KB.py │ │ ├── layers_33966KB.py │ │ ├── layers_537227KB.py │ │ ├── layers_537238KB.py │ │ ├── layers_new.py │ │ ├── model_param_init.py │ │ ├── modelparams │ │ │ ├── 1band_sr16000_hl512.json │ │ │ ├── 1band_sr32000_hl512.json │ │ │ ├── 1band_sr33075_hl384.json │ │ │ ├── 1band_sr44100_hl1024.json │ │ │ ├── 1band_sr44100_hl256.json │ │ │ ├── 1band_sr44100_hl512.json │ │ │ ├── 1band_sr44100_hl512_cut.json │ │ │ ├── 2band_32000.json │ │ │ ├── 2band_44100_lofi.json │ │ │ ├── 2band_48000.json │ │ │ ├── 3band_44100.json │ │ │ ├── 3band_44100_mid.json │ │ │ ├── 3band_44100_msb2.json │ │ │ ├── 4band_44100.json │ │ │ ├── 4band_44100_mid.json │ │ │ ├── 4band_44100_msb.json │ │ │ ├── 4band_44100_msb2.json │ │ │ ├── 4band_44100_reverse.json │ │ │ ├── 4band_44100_sw.json │ │ │ ├── 4band_v2.json │ │ │ ├── 4band_v2_sn.json │ │ │ ├── 4band_v3.json │ │ │ └── ensemble.json │ │ ├── nets.py │ │ ├── nets_123812KB.py │ │ ├── nets_123821KB.py │ │ ├── nets_33966KB.py │ │ ├── nets_537227KB.py │ │ ├── nets_537238KB.py │ │ ├── nets_61968KB.py │ │ ├── nets_new.py │ │ └── spec_utils.py │ ├── name_params.json │ └── utils.py │ ├── mdxnet.py │ ├── uvr5_weights │ └── .gitignore │ ├── vr.py │ └── webui.py └── utilss ├── __init__.py ├── __pycache__ ├── __init__.cpython-39.pyc └── sv.cpython-39.pyc └── sv.py /GPT_SoVITS/AR/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/AR/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/AR/data/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/data_module.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | from pytorch_lightning import LightningDataModule 4 | from torch.utils.data import DataLoader 5 | 6 | from AR.data.bucket_sampler import DistributedBucketSampler 7 | from AR.data.dataset import Text2SemanticDataset 8 | 9 | 10 | class Text2SemanticDataModule(LightningDataModule): 11 | def __init__( 12 | self, 13 | config, 14 | train_semantic_path, 15 | train_phoneme_path, 16 | dev_semantic_path=None, 17 | dev_phoneme_path=None, 18 | ): 19 | super().__init__() 20 | self.config = config 21 | self.train_semantic_path = train_semantic_path 22 | self.train_phoneme_path = train_phoneme_path 23 | self.dev_semantic_path = dev_semantic_path 24 | self.dev_phoneme_path = dev_phoneme_path 25 | self.num_workers = self.config["data"]["num_workers"] 26 | 27 | def prepare_data(self): 28 | pass 29 | 30 | def setup(self, stage=None, output_logs=False): 31 | self._train_dataset = Text2SemanticDataset( 32 | phoneme_path=self.train_phoneme_path, 33 | semantic_path=self.train_semantic_path, 34 | max_sec=self.config["data"]["max_sec"], 35 | pad_val=self.config["data"]["pad_val"], 36 | ) 37 | self._dev_dataset = self._train_dataset 38 | # self._dev_dataset = Text2SemanticDataset( 39 | # phoneme_path=self.dev_phoneme_path, 40 | # semantic_path=self.dev_semantic_path, 41 | # max_sample=self.config['data']['max_eval_sample'], 42 | # max_sec=self.config['data']['max_sec'], 43 | # pad_val=self.config['data']['pad_val']) 44 | 45 | def train_dataloader(self): 46 | batch_size = ( 47 | self.config["train"]["batch_size"] // 2 48 | if self.config["train"].get("if_dpo", False) is True 49 | else self.config["train"]["batch_size"] 50 | ) 51 | batch_size = max(min(batch_size, len(self._train_dataset) // 4), 1) # prevent checkpoints from never being saved 52 | sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) 53 | return DataLoader( 54 | self._train_dataset, 55 | batch_size=batch_size, 56 | sampler=sampler, 57 | collate_fn=self._train_dataset.collate, 58 | num_workers=self.num_workers, 59 | persistent_workers=True, 60 | prefetch_factor=16, 61 | ) 62 | 63 | def val_dataloader(self): 64 | return DataLoader( 65 | self._dev_dataset, 66 | batch_size=1, 67 | shuffle=False, 68 | collate_fn=self._train_dataset.collate, 69 | num_workers=max(self.num_workers, 12), 70 | persistent_workers=True, 71 | prefetch_factor=16, 72 | ) 73 | 74 | # Is this ever actually used?
75 | def test_dataloader(self): 76 | return DataLoader( 77 | self._dev_dataset, 78 | batch_size=1, 79 | shuffle=False, 80 | collate_fn=self._train_dataset.collate, 81 | ) 82 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/AR/models/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import os 4 | import sys 5 | 6 | now_dir = os.getcwd() 7 | sys.path.append(now_dir) 8 | from typing import Dict 9 | 10 | import torch 11 | from pytorch_lightning import LightningModule 12 | 13 | from AR.models.t2s_model_onnx import Text2SemanticDecoder 14 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule 15 | from AR.modules.optim import ScaledAdam 16 | 17 | 18 | class Text2SemanticLightningModule(LightningModule): 19 | def __init__(self, config, output_dir, is_train=True): 20 | super().__init__() 21 | self.config = config 22 | self.top_k = 3 23 | self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) 24 | pretrained_s1 = config.get("pretrained_s1") 25 | if pretrained_s1 and is_train: 26 | # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) 27 | print( 28 | self.load_state_dict( 29 | torch.load( 30 | pretrained_s1, 31 | map_location="cpu", 32 | )["weight"], 33 | ), 34 | ) 35 | if is_train: 36 | self.automatic_optimization = False 37 | self.save_hyperparameters() 38 | self.eval_dir = output_dir / "eval" 39 | self.eval_dir.mkdir(parents=True, exist_ok=True) 40 | 41 | def training_step(self, batch: Dict, batch_idx: int): 42 | opt = self.optimizers() 43 | scheduler = self.lr_schedulers() 44 | loss, acc = self.model.forward( 45 | batch["phoneme_ids"], 46 | batch["phoneme_ids_len"], 47 | batch["semantic_ids"], 48 | batch["semantic_ids_len"], 49 | batch["bert_feature"], 50 | ) 51 | self.manual_backward(loss) 52 | if batch_idx > 0 and batch_idx % 4 == 0: 53 | opt.step() 54 | opt.zero_grad() 55 | scheduler.step() 56 | 57 | self.log( 58 | "total_loss", 59 | loss, 60 | on_step=True, 61 | on_epoch=True, 62 | prog_bar=True, 63 | sync_dist=True, 64 | ) 65 | self.log( 66 | "lr", 67 | scheduler.get_last_lr()[0], 68 | on_epoch=True, 69 | prog_bar=True, 70 | sync_dist=True, 71 | ) 72 | self.log( 73 | f"top_{self.top_k}_acc", 74 | acc, 75 | on_step=True, 76 | on_epoch=True, 77 | prog_bar=True, 78 | sync_dist=True, 79 | ) 80 | 81 | def validation_step(self, batch: Dict, batch_idx: int): 82 | return 83 | 84 | def configure_optimizers(self): 85 | model_parameters = self.model.parameters() 86 | parameters_names = [] 87 | parameters_names.append([name_param_pair[0] for name_param_pair in self.model.named_parameters()]) 88 | lm_opt = ScaledAdam( 89 | model_parameters, 90 | lr=0.01, 91 | betas=(0.9, 0.95), 92 | clipping_scale=2.0, 93 | parameters_names=parameters_names, 94 | show_dominant_parameters=False, 95 | clipping_update_period=1000, 96 | ) 97 | 98 | return { 99 | "optimizer": lm_opt, 100 | "lr_scheduler": { 101 | "scheduler": WarmupCosineLRSchedule( 102 | lm_opt, 103 | 
init_lr=self.config["optimizer"]["lr_init"], 104 | peak_lr=self.config["optimizer"]["lr"], 105 | end_lr=self.config["optimizer"]["lr_end"], 106 | warmup_steps=self.config["optimizer"]["warmup_steps"], 107 | total_steps=self.config["optimizer"]["decay_steps"], 108 | ) 109 | }, 110 | } 111 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/AR/modules/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/embedding.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | 50 | self.reverse = False 51 | self.pe = None 52 | self.extend_pe(torch.tensor(0.0).expand(1, 4000)) 53 | 54 | def extend_pe(self, x): 55 | """Reset the positional encodings.""" 56 | if self.pe is not None: 57 | if self.pe.size(1) >= x.size(1): 58 | if self.pe.dtype != x.dtype or self.pe.device != x.device: 59 | self.pe = self.pe.to(dtype=x.dtype, device=x.device) 60 | return 61 | pe = torch.zeros(x.size(1), self.embedding_dim) 62 | if self.reverse: 63 | position = torch.arange(x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1) 64 | else: 65 | position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) 66 | div_term = torch.exp( 67 | torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) * -(math.log(10000.0) / self.embedding_dim) 68 | ) 69 | pe[:, 0::2] = torch.sin(position * div_term) 70 | pe[:, 1::2] = torch.cos(position * div_term) 71 | pe = pe.unsqueeze(0) 72 | self.pe = pe.to(device=x.device, dtype=x.dtype).detach() 73 | 74 | def forward(self, x: torch.Tensor) -> torch.Tensor: 75 | self.extend_pe(x) 76 | output = x.unsqueeze(-1) if x.ndim == 2 else x 77 | output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)] 78 | return self.dropout(output) 79 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/embedding_onnx.py: 
-------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | self.reverse = False 50 | self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim)) 51 | 52 | def extend_pe(self, x): 53 | position = torch.cumsum(torch.ones_like(x[:, :, 0]), dim=1).transpose(0, 1) 54 | scpe = (position * self.div_term).unsqueeze(0) 55 | pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0) 56 | pe = pe.contiguous().view(1, -1, self.embedding_dim) 57 | return pe 58 | 59 | def forward(self, x: torch.Tensor) -> torch.Tensor: 60 | pe = self.extend_pe(x) 61 | output = x.unsqueeze(-1) if x.ndim == 2 else x 62 | output = output * self.x_scale + self.alpha * pe 63 | return self.dropout(output) 64 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/lr_schedulers.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import math 4 | 5 | import torch 6 | from matplotlib import pyplot as plt 7 | from torch import nn 8 | from torch.optim import Adam 9 | 10 | 11 | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): 12 | """ 13 | Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers. 
14 | """ 15 | 16 | def __init__( 17 | self, 18 | optimizer, 19 | init_lr, 20 | peak_lr, 21 | end_lr, 22 | warmup_steps=10000, 23 | total_steps=400000, 24 | current_step=0, 25 | ): 26 | self.init_lr = init_lr 27 | self.peak_lr = peak_lr 28 | self.end_lr = end_lr 29 | self.optimizer = optimizer 30 | self._warmup_rate = (peak_lr - init_lr) / warmup_steps 31 | self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps) 32 | self._current_step = current_step 33 | self.lr = init_lr 34 | self.warmup_steps = warmup_steps 35 | self.total_steps = total_steps 36 | self._last_lr = [self.lr] 37 | 38 | def set_lr(self, lr): 39 | self._last_lr = [g["lr"] for g in self.optimizer.param_groups] 40 | for g in self.optimizer.param_groups: 41 | # g['lr'] = lr 42 | g["lr"] = self.end_lr ###锁定用线性 43 | 44 | def step(self): 45 | if self._current_step < self.warmup_steps: 46 | lr = self.init_lr + self._warmup_rate * self._current_step 47 | 48 | elif self._current_step > self.total_steps: 49 | lr = self.end_lr 50 | 51 | else: 52 | decay_ratio = (self._current_step - self.warmup_steps) / (self.total_steps - self.warmup_steps) 53 | if decay_ratio < 0.0 or decay_ratio > 1.0: 54 | raise RuntimeError("Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings.") 55 | coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) 56 | lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) 57 | 58 | self.lr = lr = self.end_lr = 0.002 ###锁定用线性###不听话,直接锁定! 59 | self.set_lr(lr) 60 | self.lr = lr 61 | self._current_step += 1 62 | return self.lr 63 | 64 | 65 | if __name__ == "__main__": 66 | m = nn.Linear(10, 10) 67 | opt = Adam(m.parameters(), lr=1e-4) 68 | s = WarmupCosineLRSchedule( 69 | opt, 70 | 1e-6, 71 | 2e-4, 72 | 1e-6, 73 | warmup_steps=2000, 74 | total_steps=20000, 75 | current_step=0, 76 | ) 77 | lrs = [] 78 | for i in range(25000): 79 | s.step() 80 | lrs.append(s.lr) 81 | print(s.lr) 82 | 83 | plt.plot(lrs) 84 | plt.plot(range(0, 25000), lrs) 85 | plt.show() 86 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py: -------------------------------------------------------------------------------- 1 | from torch.nn.functional import * 2 | from torch.nn.functional import ( 3 | _canonical_mask, 4 | ) 5 | 6 | 7 | def multi_head_attention_forward_patched( 8 | query, 9 | key, 10 | value, 11 | embed_dim_to_check: int, 12 | num_heads: int, 13 | in_proj_weight, 14 | in_proj_bias: Optional[Tensor], 15 | bias_k: Optional[Tensor], 16 | bias_v: Optional[Tensor], 17 | add_zero_attn: bool, 18 | dropout_p: float, 19 | out_proj_weight: Tensor, 20 | out_proj_bias: Optional[Tensor], 21 | training: bool = True, 22 | key_padding_mask: Optional[Tensor] = None, 23 | need_weights: bool = True, 24 | attn_mask: Optional[Tensor] = None, 25 | use_separate_proj_weight: bool = False, 26 | q_proj_weight: Optional[Tensor] = None, 27 | k_proj_weight: Optional[Tensor] = None, 28 | v_proj_weight: Optional[Tensor] = None, 29 | static_k: Optional[Tensor] = None, 30 | static_v: Optional[Tensor] = None, 31 | average_attn_weights: bool = True, 32 | is_causal: bool = False, 33 | cache=None, 34 | ) -> Tuple[Tensor, Optional[Tensor]]: 35 | # set up shape vars 36 | _, _, embed_dim = query.shape 37 | attn_mask = _canonical_mask( 38 | mask=attn_mask, 39 | mask_name="attn_mask", 40 | other_type=None, 41 | other_name="", 42 | target_type=query.dtype, 43 | check_other=False, 44 | ) 45 | head_dim = embed_dim // num_heads 46 | 47 | proj_qkv = linear(query, in_proj_weight, 
in_proj_bias) 48 | proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() 49 | q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2] 50 | 51 | if cache["first_infer"] == 1: 52 | cache["k"][cache["stage"]] = k 53 | cache["v"][cache["stage"]] = v 54 | else: 55 | cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0) 56 | cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0) 57 | k = cache["k"][cache["stage"]] 58 | v = cache["v"][cache["stage"]] 59 | cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] 60 | 61 | attn_mask = _canonical_mask( 62 | mask=attn_mask, 63 | mask_name="attn_mask", 64 | other_type=None, 65 | other_name="", 66 | target_type=q.dtype, 67 | check_other=False, 68 | ) 69 | attn_mask = attn_mask.unsqueeze(0) 70 | 71 | q = q.view(-1, num_heads, head_dim).transpose(0, 1) 72 | k = k.view(-1, num_heads, head_dim).transpose(0, 1) 73 | v = v.view(-1, num_heads, head_dim).transpose(0, 1) 74 | 75 | dropout_p = 0.0 76 | attn_mask = attn_mask.unsqueeze(0) 77 | q = q.view(num_heads, -1, head_dim).unsqueeze(0) 78 | k = k.view(num_heads, -1, head_dim).unsqueeze(0) 79 | v = v.view(num_heads, -1, head_dim).unsqueeze(0) 80 | attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal) 81 | attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) 82 | attn_output = linear(attn_output, out_proj_weight, out_proj_bias) 83 | attn_output = attn_output.view(-1, 1, attn_output.size(1)) 84 | 85 | return attn_output 86 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/AR/text_processing/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/phonemizer.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import itertools 4 | import re 5 | from typing import Dict 6 | from typing import List 7 | 8 | import regex 9 | from gruut import sentences 10 | from gruut.const import Sentence 11 | from gruut.const import Word 12 | from AR.text_processing.symbols import SYMBOL_TO_ID 13 | 14 | 15 | class GruutPhonemizer: 16 | def __init__(self, language: str): 17 | self._phonemizer = sentences 18 | self.lang = language 19 | self.symbol_to_id = SYMBOL_TO_ID 20 | self._special_cases_dict: Dict[str] = { 21 | r"\.\.\.": "... ", 22 | ";": "; ", 23 | ":": ": ", 24 | ",": ", ", 25 | r"\.": ". ", 26 | "!": "! ", 27 | r"\?": "? 
", 28 | "—": "—", 29 | "…": "… ", 30 | "«": "«", 31 | "»": "»", 32 | } 33 | self._punctuation_regexp: str = rf"([{''.join(self._special_cases_dict.keys())}])" 34 | 35 | def _normalize_punctuation(self, text: str) -> str: 36 | text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) 37 | text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text) 38 | text = regex.sub(r"\pZ+", r" ", text) 39 | return text.strip() 40 | 41 | def _convert_punctuation(self, word: Word) -> str: 42 | if not word.phonemes: 43 | return "" 44 | if word.phonemes[0] in ["‖", "|"]: 45 | return word.text.strip() 46 | 47 | phonemes = "".join(word.phonemes) 48 | # remove modifier characters ˈˌː with regex 49 | phonemes = re.sub(r"[ˈˌː͡]", "", phonemes) 50 | return phonemes.strip() 51 | 52 | def phonemize(self, text: str, espeak: bool = False) -> str: 53 | text_to_phonemize: str = self._normalize_punctuation(text) 54 | sents: List[Sentence] = [sent for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak)] 55 | words: List[str] = [self._convert_punctuation(word) for word in itertools.chain(*sents)] 56 | return " ".join(words) 57 | 58 | def transform(self, phonemes): 59 | # convert phonemes to ids 60 | # dictionary is in symbols.py 61 | return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()] 62 | 63 | 64 | if __name__ == "__main__": 65 | phonemizer = GruutPhonemizer("en-us") 66 | # text -> IPA 67 | phonemes = phonemizer.phonemize("Hello, wor-ld ?") 68 | print("phonemes:", phonemes) 69 | print("len(phonemes):", len(phonemes)) 70 | phoneme_ids = phonemizer.transform(phonemes) 71 | print("phoneme_ids:", phoneme_ids) 72 | print("len(phoneme_ids):", len(phoneme_ids)) 73 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/symbols.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | PAD = "_" 4 | PUNCTUATION = ';:,.!?¡¿—…"«»“” ' 5 | LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 6 | IPA_LETTERS = ( 7 | "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 8 | ) 9 | SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) 10 | SPACE_ID = SYMBOLS.index(" ") 11 | SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)} 12 | ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)} 13 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def str2bool(str): 5 | return True if str.lower() == "true" else False 6 | 7 | 8 | def get_newest_ckpt(string_list): 9 | # 定义一个正则表达式模式,用于匹配字符串中的数字 10 | pattern = r"epoch=(\d+)-step=(\d+)\.ckpt" 11 | 12 | # 使用正则表达式提取每个字符串中的数字信息,并创建一个包含元组的列表 13 | extracted_info = [] 14 | for string in string_list: 15 | match = re.match(pattern, string) 16 | if match: 17 | epoch = int(match.group(1)) 18 | step = int(match.group(2)) 19 | extracted_info.append((epoch, step, string)) 20 | # 按照 epoch 后面的数字和 step 后面的数字进行排序 21 | sorted_info = sorted(extracted_info, key=lambda x: (x[0], x[1]), reverse=True) 22 | # 获取最新的 ckpt 文件名 23 | newest_ckpt = sorted_info[0][2] 24 | return newest_ckpt 25 | 26 | 27 | # 文本存在且不为空时 return True 28 | def 
check_txt_file(file_path): 29 | try: 30 | with open(file_path, "r") as file: 31 | text = file.readline().strip() 32 | assert text.strip() != "" 33 | return text 34 | except Exception: 35 | return False 36 | return False 37 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/initialize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Initialize modules for espnet2 neural networks.""" 3 | 4 | import torch 5 | from typeguard import check_argument_types 6 | 7 | 8 | def initialize(model: torch.nn.Module, init: str): 9 | """Initialize weights of a neural network module. 10 | 11 | Parameters are initialized using the given method or distribution. 12 | 13 | Custom initialization routines can be implemented into submodules 14 | as function `espnet_initialization_fn` within the custom module. 15 | 16 | Args: 17 | model: Target. 18 | init: Method of initialization. 19 | """ 20 | assert check_argument_types() 21 | print("init with", init) 22 | 23 | # weight init 24 | for p in model.parameters(): 25 | if p.dim() > 1: 26 | if init == "xavier_uniform": 27 | torch.nn.init.xavier_uniform_(p.data) 28 | elif init == "xavier_normal": 29 | torch.nn.init.xavier_normal_(p.data) 30 | elif init == "kaiming_uniform": 31 | torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu") 32 | elif init == "kaiming_normal": 33 | torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu") 34 | else: 35 | raise ValueError("Unknown initialization: " + init) 36 | # bias init 37 | for name, p in model.named_parameters(): 38 | if ".bias" in name and p.dim() == 1: 39 | p.data.zero_() 40 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/io.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | import yaml 5 | 6 | 7 | def load_yaml_config(path): 8 | with open(path) as f: 9 | config = yaml.full_load(f) 10 | return config 11 | 12 | 13 | def save_config_to_yaml(config, path): 14 | assert path.endswith(".yaml") 15 | with open(path, "w") as f: 16 | f.write(yaml.dump(config)) 17 | f.close() 18 | 19 | 20 | def write_args(args, path): 21 | args_dict = dict((name, getattr(args, name)) for name in dir(args) if not name.startswith("_")) 22 | with open(path, "a") as args_file: 23 | args_file.write("==> torch version: {}\n".format(torch.__version__)) 24 | args_file.write("==> cudnn version: {}\n".format(torch.backends.cudnn.version())) 25 | args_file.write("==> Cmd:\n") 26 | args_file.write(str(sys.argv)) 27 | args_file.write("\n==> args:\n") 28 | for k, v in sorted(args_dict.items()): 29 | args_file.write(" %s: %s\n" % (str(k), str(v))) 30 | args_file.close() 31 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 NVIDIA CORPORATION. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import torch 5 | import torch.nn as nn 6 | from alias_free_activation.torch.resample import UpSample1d, DownSample1d 7 | 8 | # load fused CUDA kernel: this enables importing anti_alias_activation_cuda 9 | from alias_free_activation.cuda import load 10 | 11 | anti_alias_activation_cuda = load.load() 12 | 13 | 14 | class FusedAntiAliasActivation(torch.autograd.Function): 15 | """ 16 | Assumes filter size 12, replication padding on upsampling/downsampling, and logscale alpha/beta parameters as inputs. 17 | The hyperparameters are hard-coded in the kernel to maximize speed. 18 | NOTE: The fused kernel is incorrect for Activation1d with different hyperparameters.
19 | """ 20 | 21 | @staticmethod 22 | def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta): 23 | activation_results = anti_alias_activation_cuda.forward(inputs, up_ftr, down_ftr, alpha, beta) 24 | 25 | return activation_results 26 | 27 | @staticmethod 28 | def backward(ctx, output_grads): 29 | raise NotImplementedError 30 | return output_grads, None, None 31 | 32 | 33 | class Activation1d(nn.Module): 34 | def __init__( 35 | self, 36 | activation, 37 | up_ratio: int = 2, 38 | down_ratio: int = 2, 39 | up_kernel_size: int = 12, 40 | down_kernel_size: int = 12, 41 | fused: bool = True, 42 | ): 43 | super().__init__() 44 | self.up_ratio = up_ratio 45 | self.down_ratio = down_ratio 46 | self.act = activation 47 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 48 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 49 | 50 | self.fused = fused # Whether to use fused CUDA kernel or not 51 | 52 | def forward(self, x): 53 | if not self.fused: 54 | x = self.upsample(x) 55 | x = self.act(x) 56 | x = self.downsample(x) 57 | return x 58 | else: 59 | if self.act.__class__.__name__ == "Snake": 60 | beta = self.act.alpha.data # Snake uses same params for alpha and beta 61 | else: 62 | beta = self.act.beta.data # Snakebeta uses different params for alpha and beta 63 | alpha = self.act.alpha.data 64 | if not self.act.alpha_logscale: # Exp baked into cuda kernel, cancel it out with a log 65 | alpha = torch.log(alpha) 66 | beta = torch.log(beta) 67 | 68 | x = FusedAntiAliasActivation.apply(x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta) 69 | return x 70 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | 19 | extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta); 20 | 21 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 22 | m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)"); 23 | } -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/build/_: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied from NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. */ 20 | 21 | #ifndef TORCH_CHECK 22 | #define TORCH_CHECK AT_CHECK 23 | #endif 24 | 25 | #ifdef VERSION_GE_1_3 26 | #define DATA_PTR data_ptr 27 | #else 28 | #define DATA_PTR data 29 | #endif 30 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import os 5 | import pathlib 6 | import subprocess 7 | 8 | from torch.utils import cpp_extension 9 | 10 | """ 11 | Setting this param to a list has a problem of generating different compilation commands (with a different order of architectures) and leading to recompilation of fused kernels. 12 | Set it to an empty string to avoid recompilation and assign arch flags explicitly in extra_cuda_cflags below 13 | """ 14 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 15 | 16 | 17 | def load(): 18 | # Check if cuda 11 is installed for compute capability 8.0 19 | cc_flag = [] 20 | _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 21 | if int(bare_metal_major) >= 11: 22 | cc_flag.append("-gencode") 23 | cc_flag.append("arch=compute_80,code=sm_80") 24 | 25 | # Build path 26 | srcpath = pathlib.Path(__file__).parent.absolute() 27 | buildpath = srcpath / "build" 28 | _create_build_dir(buildpath) 29 | 30 | # Helper function to build the kernels.
31 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags): 32 | return cpp_extension.load( 33 | name=name, 34 | sources=sources, 35 | build_directory=buildpath, 36 | extra_cflags=[ 37 | "-O3", 38 | ], 39 | extra_cuda_cflags=[ 40 | "-O3", 41 | "-gencode", 42 | "arch=compute_70,code=sm_70", 43 | "--use_fast_math", 44 | ] 45 | + extra_cuda_flags 46 | + cc_flag, 47 | verbose=True, 48 | ) 49 | 50 | extra_cuda_flags = [ 51 | "-U__CUDA_NO_HALF_OPERATORS__", 52 | "-U__CUDA_NO_HALF_CONVERSIONS__", 53 | "--expt-relaxed-constexpr", 54 | "--expt-extended-lambda", 55 | ] 56 | 57 | sources = [ 58 | srcpath / "anti_alias_activation.cpp", 59 | srcpath / "anti_alias_activation_cuda.cu", 60 | ] 61 | anti_alias_activation_cuda = _cpp_extention_load_helper("anti_alias_activation_cuda", sources, extra_cuda_flags) 62 | 63 | return anti_alias_activation_cuda 64 | 65 | 66 | def _get_cuda_bare_metal_version(cuda_dir): 67 | raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) 68 | output = raw_output.split() 69 | release_idx = output.index("release") + 1 70 | release = output[release_idx].split(".") 71 | bare_metal_major = release[0] 72 | bare_metal_minor = release[1][0] 73 | 74 | return raw_output, bare_metal_major, bare_metal_minor 75 | 76 | 77 | def _create_build_dir(buildpath): 78 | try: 79 | os.mkdir(buildpath) 80 | except OSError: 81 | if not os.path.isdir(buildpath): 82 | print(f"Creation of the build directory {buildpath} failed") 83 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/torch/__init__.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from .filter import * 5 | from .resample import * 6 | from .act import * 7 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/torch/act.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from .resample import UpSample1d, DownSample1d 6 | 7 | 8 | class Activation1d(nn.Module): 9 | def __init__( 10 | self, 11 | activation, 12 | up_ratio: int = 2, 13 | down_ratio: int = 2, 14 | up_kernel_size: int = 12, 15 | down_kernel_size: int = 12, 16 | ): 17 | super().__init__() 18 | self.up_ratio = up_ratio 19 | self.down_ratio = down_ratio 20 | self.act = activation 21 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 22 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 23 | 24 | # x: [B,C,T] 25 | def forward(self, x): 26 | x = self.upsample(x) 27 | x = self.act(x) 28 | x = self.downsample(x) 29 | 30 | return x 31 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import math 8 | 9 | if "sinc" in dir(torch): 10 | sinc = torch.sinc 11 | else: 12 | # This code is adopted from adefossez's julius.core.sinc under the MIT License 13 | # https://adefossez.github.io/julius/julius/core.html 14 | # LICENSE is in incl_licenses directory. 15 | def sinc(x: torch.Tensor): 16 | """ 17 | Implementation of sinc, i.e. sin(pi * x) / (pi * x) 18 | __Warning__: Different to julius.sinc, the input is multiplied by `pi`! 19 | """ 20 | return torch.where( 21 | x == 0, 22 | torch.tensor(1.0, device=x.device, dtype=x.dtype), 23 | torch.sin(math.pi * x) / math.pi / x, 24 | ) 25 | 26 | 27 | # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License 28 | # https://adefossez.github.io/julius/julius/lowpass.html 29 | # LICENSE is in incl_licenses directory. 30 | def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] 31 | even = kernel_size % 2 == 0 32 | half_size = kernel_size // 2 33 | 34 | # For kaiser window 35 | delta_f = 4 * half_width 36 | A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 37 | if A > 50.0: 38 | beta = 0.1102 * (A - 8.7) 39 | elif A >= 21.0: 40 | beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0) 41 | else: 42 | beta = 0.0 43 | window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) 44 | 45 | # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio 46 | if even: 47 | time = torch.arange(-half_size, half_size) + 0.5 48 | else: 49 | time = torch.arange(kernel_size) - half_size 50 | if cutoff == 0: 51 | filter_ = torch.zeros_like(time) 52 | else: 53 | filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) 54 | """ 55 | Normalize filter to have sum = 1, otherwise we will have a small leakage of the constant component in the input signal. 56 | """ 57 | filter_ /= filter_.sum() 58 | filter = filter_.view(1, 1, kernel_size) 59 | 60 | return filter 61 | 62 | 63 | class LowPassFilter1d(nn.Module): 64 | def __init__( 65 | self, 66 | cutoff=0.5, 67 | half_width=0.6, 68 | stride: int = 1, 69 | padding: bool = True, 70 | padding_mode: str = "replicate", 71 | kernel_size: int = 12, 72 | ): 73 | """ 74 | kernel_size should be even number for stylegan3 setup, in this implementation, odd number is also possible. 
75 | """ 76 | super().__init__() 77 | if cutoff < -0.0: 78 | raise ValueError("Minimum cutoff must be larger than zero.") 79 | if cutoff > 0.5: 80 | raise ValueError("A cutoff above 0.5 does not make sense.") 81 | self.kernel_size = kernel_size 82 | self.even = kernel_size % 2 == 0 83 | self.pad_left = kernel_size // 2 - int(self.even) 84 | self.pad_right = kernel_size // 2 85 | self.stride = stride 86 | self.padding = padding 87 | self.padding_mode = padding_mode 88 | filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) 89 | self.register_buffer("filter", filter) 90 | 91 | # Input [B, C, T] 92 | def forward(self, x): 93 | _, C, _ = x.shape 94 | 95 | if self.padding: 96 | x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode) 97 | out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) 98 | 99 | return out 100 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | from .filter import LowPassFilter1d 7 | from .filter import kaiser_sinc_filter1d 8 | 9 | 10 | class UpSample1d(nn.Module): 11 | def __init__(self, ratio=2, kernel_size=None): 12 | super().__init__() 13 | self.ratio = ratio 14 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 15 | self.stride = ratio 16 | self.pad = self.kernel_size // ratio - 1 17 | self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 18 | self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 19 | filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size) 20 | self.register_buffer("filter", filter) 21 | 22 | # x: [B, C, T] 23 | def forward(self, x): 24 | _, C, _ = x.shape 25 | 26 | x = F.pad(x, (self.pad, self.pad), mode="replicate") 27 | x = self.ratio * F.conv_transpose1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) 28 | x = x[..., self.pad_left : -self.pad_right] 29 | 30 | return x 31 | 32 | 33 | class DownSample1d(nn.Module): 34 | def __init__(self, ratio=2, kernel_size=None): 35 | super().__init__() 36 | self.ratio = ratio 37 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 38 | self.lowpass = LowPassFilter1d( 39 | cutoff=0.5 / ratio, 40 | half_width=0.6 / ratio, 41 | stride=ratio, 42 | kernel_size=self.kernel_size, 43 | ) 44 | 45 | def forward(self, x): 46 | xx = self.lowpass(x) 47 | 48 | return xx 49 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_22khz_80band.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "activation": "snakebeta", 18 | "snake_logscale": true, 19 | 20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], 
[512, 50, 240]], 21 | "mpd_reshapes": [2, 3, 5, 7, 11], 22 | "use_spectral_norm": false, 23 | "discriminator_channel_mult": 1, 24 | 25 | "segment_size": 8192, 26 | "num_mels": 80, 27 | "num_freq": 1025, 28 | "n_fft": 1024, 29 | "hop_size": 256, 30 | "win_size": 1024, 31 | 32 | "sampling_rate": 22050, 33 | 34 | "fmin": 0, 35 | "fmax": 8000, 36 | "fmax_for_loss": null, 37 | 38 | "num_workers": 4, 39 | 40 | "dist_config": { 41 | "dist_backend": "nccl", 42 | "dist_url": "tcp://localhost:54321", 43 | "world_size": 1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_24khz_100band.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "activation": "snakebeta", 18 | "snake_logscale": true, 19 | 20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]], 21 | "mpd_reshapes": [2, 3, 5, 7, 11], 22 | "use_spectral_norm": false, 23 | "discriminator_channel_mult": 1, 24 | 25 | "segment_size": 8192, 26 | "num_mels": 100, 27 | "num_freq": 1025, 28 | "n_fft": 1024, 29 | "hop_size": 256, 30 | "win_size": 1024, 31 | 32 | "sampling_rate": 24000, 33 | 34 | "fmin": 0, 35 | "fmax": 12000, 36 | "fmax_for_loss": null, 37 | 38 | "num_workers": 4, 39 | 40 | "dist_config": { 41 | "dist_backend": "nccl", 42 | "dist_url": "tcp://localhost:54321", 43 | "world_size": 1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_base_22khz_80band.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,8,2,2], 12 | "upsample_kernel_sizes": [16,16,4,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "activation": "snakebeta", 18 | "snake_logscale": true, 19 | 20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]], 21 | "mpd_reshapes": [2, 3, 5, 7, 11], 22 | "use_spectral_norm": false, 23 | "discriminator_channel_mult": 1, 24 | 25 | "segment_size": 8192, 26 | "num_mels": 80, 27 | "num_freq": 1025, 28 | "n_fft": 1024, 29 | "hop_size": 256, 30 | "win_size": 1024, 31 | 32 | "sampling_rate": 22050, 33 | 34 | "fmin": 0, 35 | "fmax": 8000, 36 | "fmax_for_loss": null, 37 | 38 | "num_workers": 4, 39 | 40 | "dist_config": { 41 | "dist_backend": "nccl", 42 | "dist_url": "tcp://localhost:54321", 43 | "world_size": 1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_base_24khz_100band.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | 
"upsample_rates": [8,8,2,2], 12 | "upsample_kernel_sizes": [16,16,4,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "activation": "snakebeta", 18 | "snake_logscale": true, 19 | 20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]], 21 | "mpd_reshapes": [2, 3, 5, 7, 11], 22 | "use_spectral_norm": false, 23 | "discriminator_channel_mult": 1, 24 | 25 | "segment_size": 8192, 26 | "num_mels": 100, 27 | "num_freq": 1025, 28 | "n_fft": 1024, 29 | "hop_size": 256, 30 | "win_size": 1024, 31 | 32 | "sampling_rate": 24000, 33 | 34 | "fmin": 0, 35 | "fmax": 12000, 36 | "fmax_for_loss": null, 37 | 38 | "num_workers": 4, 39 | 40 | "dist_config": { 41 | "dist_backend": "nccl", 42 | "dist_url": "tcp://localhost:54321", 43 | "world_size": 1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_22khz_80band_256x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 80, 43 | "num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 22050, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_22khz_80band_fmax8k_256x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | 
"cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 80, 43 | "num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 22050, 49 | 50 | "fmin": 0, 51 | "fmax": 8000, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_24khz_100band_256x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 100, 43 | "num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 24000, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_44khz_128band_256x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 
1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 128, 43 | "num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 44100, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_44khz_128band_512x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,4,2,2,2,2], 12 | "upsample_kernel_sizes": [16,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 128, 43 | "num_freq": 2049, 44 | "n_fft": 2048, 45 | "hop_size": 512, 46 | "win_size": 2048, 47 | 48 | "sampling_rate": 44100, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/env.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 
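
The BigVGAN configs above all share the same schema, and two invariants tie the generator to its mel front end: the upsample_rates multiply out to hop_size (256 in every variant shown, 512 for the 44 kHz "512x" config), and num_freq equals n_fft/2 + 1. The snippet below is a minimal sketch, not part of the repository, of how such a config is typically consumed; it inlines a copy of the AttrDict helper that env.py defines just below, and the config path assumes GPT_SoVITS/BigVGAN as the working directory.

import json
import math


class AttrDict(dict):
    # Same small helper as in env.py below: attribute-style access to config keys.
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__dict__ = self


with open("configs/bigvgan_v2_24khz_100band_256x.json") as f:
    h = AttrDict(json.load(f))

# One mel frame must expand to exactly hop_size samples, so the generator's
# cumulative upsampling factor has to match the STFT hop: 4*4*2*2*2*2 = 256.
assert math.prod(h.upsample_rates) == h.hop_size
# num_freq is simply the number of rFFT bins for the configured n_fft.
assert h.num_freq == h.n_fft // 2 + 1
print(h.sampling_rate, "Hz audio,", h.num_mels, "mel bands, fmax =", h.fmax)
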
3 | 4 | import os 5 | import shutil 6 | 7 | 8 | class AttrDict(dict): 9 | def __init__(self, *args, **kwargs): 10 | super(AttrDict, self).__init__(*args, **kwargs) 11 | self.__dict__ = self 12 | 13 | 14 | def build_env(config, config_name, path): 15 | t_path = os.path.join(path, config_name) 16 | if config != t_path: 17 | os.makedirs(path, exist_ok=True) 18 | shutil.copyfile(config, os.path.join(path, config_name)) 19 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_1: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_2: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Edward Dixon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_4: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Seungwon Park 박승원 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_5: -------------------------------------------------------------------------------- 1 | Copyright 2020 Alexandre Défossez 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 4 | associated documentation files (the "Software"), to deal in the Software without restriction, 5 | including without limitation the rights to use, copy, modify, merge, publish, distribute, 6 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is 7 | furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or 10 | substantial portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT 13 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 14 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 15 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_6: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023-present, Descript 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_7: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Charactr Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
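
One more bundled license follows, and then the two BigVGAN inference entry points: inference.py (resynthesizes wav files through a mel intermediate) and inference_e2e.py (vocodes pre-computed mel .npy files). For orientation, here is a hedged sketch of the core call path both scripts wrap. It assumes it is run from GPT_SoVITS/BigVGAN with a checkpoint and its config.json already downloaded; the exp/ paths and the .npy file name are illustrative placeholders, not files shipped with the repo.

import json

import numpy as np
import torch
from scipy.io.wavfile import write

from env import AttrDict
from bigvgan import BigVGAN as Generator
from meldataset import MAX_WAV_VALUE

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with open("exp/config.json") as f:          # config shipped next to the checkpoint
    h = AttrDict(json.load(f))

generator = Generator(h, use_cuda_kernel=False).to(device)
state_dict_g = torch.load("exp/g_05000000", map_location=device)   # illustrative checkpoint name
generator.load_state_dict(state_dict_g["generator"])
generator.eval()
generator.remove_weight_norm()

mel = torch.FloatTensor(np.load("test_mel_files/sample.npy"))       # [num_mels, frames]
if mel.dim() == 2:
    mel = mel.unsqueeze(0)                                          # add batch dimension

with torch.no_grad():
    audio = generator(mel.to(device)).squeeze() * MAX_WAV_VALUE

write("sample_generated.wav", h.sampling_rate, audio.cpu().numpy().astype("int16"))
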
-------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_8: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Amphion 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/inference.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from __future__ import absolute_import, division, print_function, unicode_literals 5 | 6 | import os 7 | import argparse 8 | import json 9 | import torch 10 | import librosa 11 | from utils import load_checkpoint 12 | from meldataset import get_mel_spectrogram 13 | from scipy.io.wavfile import write 14 | from env import AttrDict 15 | from meldataset import MAX_WAV_VALUE 16 | from bigvgan import BigVGAN as Generator 17 | 18 | h = None 19 | device = None 20 | torch.backends.cudnn.benchmark = False 21 | 22 | 23 | def inference(a, h): 24 | generator = Generator(h, use_cuda_kernel=a.use_cuda_kernel).to(device) 25 | 26 | state_dict_g = load_checkpoint(a.checkpoint_file, device) 27 | generator.load_state_dict(state_dict_g["generator"]) 28 | 29 | filelist = os.listdir(a.input_wavs_dir) 30 | 31 | os.makedirs(a.output_dir, exist_ok=True) 32 | 33 | generator.eval() 34 | generator.remove_weight_norm() 35 | with torch.no_grad(): 36 | for i, filname in enumerate(filelist): 37 | # Load the ground truth audio and resample if necessary 38 | wav, sr = librosa.load(os.path.join(a.input_wavs_dir, filname), sr=h.sampling_rate, mono=True) 39 | wav = torch.FloatTensor(wav).to(device) 40 | # Compute mel spectrogram from the ground truth audio 41 | x = get_mel_spectrogram(wav.unsqueeze(0), generator.h) 42 | 43 | y_g_hat = generator(x) 44 | 45 | audio = y_g_hat.squeeze() 46 | audio = audio * MAX_WAV_VALUE 47 | audio = audio.cpu().numpy().astype("int16") 48 | 49 | output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + "_generated.wav") 50 | write(output_file, h.sampling_rate, audio) 51 | print(output_file) 52 | 53 | 54 | def main(): 55 | print("Initializing Inference Process..") 56 | 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument("--input_wavs_dir", default="test_files") 59 | parser.add_argument("--output_dir", 
default="generated_files") 60 | parser.add_argument("--checkpoint_file", required=True) 61 | parser.add_argument("--use_cuda_kernel", action="store_true", default=False) 62 | 63 | a = parser.parse_args() 64 | 65 | config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json") 66 | with open(config_file) as f: 67 | data = f.read() 68 | 69 | global h 70 | json_config = json.loads(data) 71 | h = AttrDict(json_config) 72 | 73 | torch.manual_seed(h.seed) 74 | global device 75 | if torch.cuda.is_available(): 76 | torch.cuda.manual_seed(h.seed) 77 | device = torch.device("cuda") 78 | else: 79 | device = torch.device("cpu") 80 | 81 | inference(a, h) 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/inference_e2e.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from __future__ import absolute_import, division, print_function, unicode_literals 5 | 6 | import glob 7 | import os 8 | import numpy as np 9 | import argparse 10 | import json 11 | import torch 12 | from scipy.io.wavfile import write 13 | from env import AttrDict 14 | from meldataset import MAX_WAV_VALUE 15 | from bigvgan import BigVGAN as Generator 16 | 17 | h = None 18 | device = None 19 | torch.backends.cudnn.benchmark = False 20 | 21 | 22 | def load_checkpoint(filepath, device): 23 | assert os.path.isfile(filepath) 24 | print(f"Loading '{filepath}'") 25 | checkpoint_dict = torch.load(filepath, map_location=device) 26 | print("Complete.") 27 | return checkpoint_dict 28 | 29 | 30 | def scan_checkpoint(cp_dir, prefix): 31 | pattern = os.path.join(cp_dir, prefix + "*") 32 | cp_list = glob.glob(pattern) 33 | if len(cp_list) == 0: 34 | return "" 35 | return sorted(cp_list)[-1] 36 | 37 | 38 | def inference(a, h): 39 | generator = Generator(h, use_cuda_kernel=a.use_cuda_kernel).to(device) 40 | 41 | state_dict_g = load_checkpoint(a.checkpoint_file, device) 42 | generator.load_state_dict(state_dict_g["generator"]) 43 | 44 | filelist = os.listdir(a.input_mels_dir) 45 | 46 | os.makedirs(a.output_dir, exist_ok=True) 47 | 48 | generator.eval() 49 | generator.remove_weight_norm() 50 | with torch.no_grad(): 51 | for i, filname in enumerate(filelist): 52 | # Load the mel spectrogram in .npy format 53 | x = np.load(os.path.join(a.input_mels_dir, filname)) 54 | x = torch.FloatTensor(x).to(device) 55 | if len(x.shape) == 2: 56 | x = x.unsqueeze(0) 57 | 58 | y_g_hat = generator(x) 59 | 60 | audio = y_g_hat.squeeze() 61 | audio = audio * MAX_WAV_VALUE 62 | audio = audio.cpu().numpy().astype("int16") 63 | 64 | output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + "_generated_e2e.wav") 65 | write(output_file, h.sampling_rate, audio) 66 | print(output_file) 67 | 68 | 69 | def main(): 70 | print("Initializing Inference Process..") 71 | 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument("--input_mels_dir", default="test_mel_files") 74 | parser.add_argument("--output_dir", default="generated_files_from_mel") 75 | parser.add_argument("--checkpoint_file", required=True) 76 | parser.add_argument("--use_cuda_kernel", action="store_true", default=False) 77 | 78 | a = parser.parse_args() 79 | 80 | config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json") 81 | with open(config_file) as f: 82 | data = f.read() 83 | 84 | global h 85 | 
json_config = json.loads(data) 86 | h = AttrDict(json_config) 87 | 88 | torch.manual_seed(h.seed) 89 | global device 90 | if torch.cuda.is_available(): 91 | torch.cuda.manual_seed(h.seed) 92 | device = torch.device("cuda") 93 | else: 94 | device = torch.device("cpu") 95 | 96 | inference(a, h) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/bias.md: -------------------------------------------------------------------------------- 1 | | Field | Response | 2 | | :--------------------------------------------------------------------------------------------------------- | :--------------------------------------------------- | 3 | | Participation considerations from adversely impacted groups protected classes in model design and testing: | None | 4 | | Measures taken to mitigate against unwanted bias: | No measures taken to mitigate against unwanted bias. | 5 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/explainability.md: -------------------------------------------------------------------------------- 1 | | Field | Response | 2 | | :---------------------------------------------------------------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 3 | | Intended Application & Domain: | Generating waveform from mel spectrogram. | 4 | | Model Type: | Convolutional Neural Network (CNN) | 5 | | Intended Users: | This model is intended for developers to synthesize and generate waveforms from the AI-generated mel spectrograms. | 6 | | Output: | Audio Waveform | 7 | | Describe how the model works: | Model generates audio waveform corresponding to the input mel spectrogram. | 8 | | Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | Not Applicable | 9 | | Technical Limitations: | This may not perform well on synthetically-generated mel spectrograms that deviate significantly from the profile of mel spectrograms on which this was trained. | 10 | | Verified to have met prescribed NVIDIA quality standards: | Yes | 11 | | Performance Metrics: | Perceptual Evaluation of Speech Quality (PESQ), Virtual Speech Quality Objective Listener (VISQOL), Multi-resolution STFT (MRSTFT), Mel cepstral distortion (MCD), Periodicity RMSE, Voice/Unvoiced F1 Score (V/UV F1) | 12 | | Potential Known Risks: | This model may generate low-quality or distorted soundwaves. | 13 | | Licensing: | https://github.com/NVIDIA/BigVGAN/blob/main/LICENSE | 14 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/privacy.md: -------------------------------------------------------------------------------- 1 | | Field | Response | 2 | | :------------------------------------------------------------------------------------------------------------------------------------- | :--------------------------------------------- | 3 | | Generatable or reverse engineerable personal information? 
| None | 4 | | Protected class data used to create this model? | None | 5 | | Was consent obtained for any personal data used? | Not Applicable (No Personal Data) | 6 | | How often is dataset reviewed? | Before Release | 7 | | Is a mechanism in place to honor data subject right of access or deletion of personal data? | Not Applicable | 8 | | If personal data was collected for the development of the model, was it collected directly by NVIDIA? | Not Applicable | 9 | | If personal data was collected for the development of the model by NVIDIA, do you maintain or have access to disclosures made to data subjects? | Not Applicable | 10 | | If personal data was collected for the development of this AI model, was it minimized to only what was required? | Not Applicable | 11 | | Is data in dataset traceable? | Yes | 12 | | Is there provenance for all datasets used in training? | Yes | 13 | | Does data labeling (annotation, metadata) comply with privacy laws? | Yes | 14 | | Is data compliant with data subject requests for data correction or removal, if such a request was made? | No, not possible with externally-sourced data. | 15 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/safety.md: -------------------------------------------------------------------------------- 1 | | Field | Response | 2 | | :---------------------------------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 3 | | Model Application(s): | Synthetic Audio Generation | 4 | | Describe the life critical impact (if present). | Not Applicable | 5 | | Use Case Restrictions: | None | 6 | | Model and dataset restrictions: | The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints are adhered to. | 7 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | numpy 3 | librosa>=0.8.1 4 | scipy 5 | tensorboard 6 | soundfile 7 | matplotlib 8 | pesq 9 | auraloss 10 | tqdm 11 | nnAudio 12 | ninja 13 | huggingface_hub>=0.23.4 -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/tests/test_activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import os 5 | import sys 6 | 7 | # to import modules from parent_dir 8 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(parent_dir) 10 | 11 | import torch 12 | from alias_free_activation.cuda import activation1d 13 | from activations import Snake 14 | 15 | 16 | def test_load_fused_kernels(): 17 | try: 18 | print("[Success] load_fused_kernels") 19 | except ImportError as e: 20 | print("[Fail] load_fused_kernels") 21 | raise e 22 | 23 | 24 | def test_anti_alias_activation(): 25 | data = torch.rand((10, 10, 200), device="cuda") 26 | 27 | # Check activations.Snake cuda vs.
torch 28 | fused_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=True).cuda() 29 | fused_activation_output = fused_anti_alias_activation(data) 30 | 31 | torch_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=False).cuda() 32 | torch_activation_output = torch_anti_alias_activation(data) 33 | 34 | test_result = (fused_activation_output - torch_activation_output).abs() 35 | 36 | while test_result.dim() != 1: 37 | test_result = test_result.mean(dim=-1) 38 | 39 | diff = test_result.mean(dim=-1) 40 | 41 | if diff <= 1e-3: 42 | print( 43 | f"\n[Success] test_fused_anti_alias_activation" 44 | f"\n > mean_difference={diff}" 45 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}" 46 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}" 47 | ) 48 | else: 49 | print( 50 | f"\n[Fail] test_fused_anti_alias_activation" 51 | f"\n > mean_difference={diff}, " 52 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}, " 53 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}" 54 | ) 55 | 56 | 57 | if __name__ == "__main__": 58 | from alias_free_activation.cuda import load 59 | 60 | load.load() 61 | test_load_fused_kernels() 62 | test_anti_alias_activation() 63 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import os 5 | import sys 6 | 7 | # to import modules from parent_dir 8 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(parent_dir) 10 | 11 | import torch 12 | from alias_free_activation.cuda import activation1d 13 | from activations import SnakeBeta 14 | 15 | 16 | def test_load_fused_kernels(): 17 | try: 18 | print("[Success] load_fused_kernels") 19 | except ImportError as e: 20 | print("[Fail] load_fused_kernels") 21 | raise e 22 | 23 | 24 | def test_anti_alias_activation(): 25 | data = torch.rand((10, 10, 200), device="cuda") 26 | 27 | # Check activations, Snake CUDA vs. 
Torch 28 | fused_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=True).cuda() 29 | fused_activation_output = fused_anti_alias_activation(data) 30 | 31 | torch_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=False).cuda() 32 | torch_activation_output = torch_anti_alias_activation(data) 33 | 34 | test_result = (fused_activation_output - torch_activation_output).abs() 35 | 36 | while test_result.dim() != 1: 37 | test_result = test_result.mean(dim=-1) 38 | 39 | diff = test_result.mean(dim=-1) 40 | 41 | if diff <= 1e-3: 42 | print( 43 | f"\n[Success] test_fused_anti_alias_activation" 44 | f"\n > mean_difference={diff}" 45 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}" 46 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}" 47 | ) 48 | else: 49 | print( 50 | f"\n[Fail] test_fused_anti_alias_activation" 51 | f"\n > mean_difference={diff}, " 52 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}, " 53 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}" 54 | ) 55 | 56 | 57 | if __name__ == "__main__": 58 | from alias_free_activation.cuda import load 59 | 60 | load.load() 61 | test_load_fused_kernels() 62 | test_anti_alias_activation() 63 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/utils0.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import glob 5 | import os 6 | import matplotlib 7 | import torch 8 | from torch.nn.utils import weight_norm 9 | 10 | matplotlib.use("Agg") 11 | import matplotlib.pylab as plt 12 | from .meldataset import MAX_WAV_VALUE 13 | from scipy.io.wavfile import write 14 | 15 | 16 | def plot_spectrogram(spectrogram): 17 | fig, ax = plt.subplots(figsize=(10, 2)) 18 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") 19 | plt.colorbar(im, ax=ax) 20 | 21 | fig.canvas.draw() 22 | plt.close() 23 | 24 | return fig 25 | 26 | 27 | def plot_spectrogram_clipped(spectrogram, clip_max=2.0): 28 | fig, ax = plt.subplots(figsize=(10, 2)) 29 | im = ax.imshow( 30 | spectrogram, 31 | aspect="auto", 32 | origin="lower", 33 | interpolation="none", 34 | vmin=1e-6, 35 | vmax=clip_max, 36 | ) 37 | plt.colorbar(im, ax=ax) 38 | 39 | fig.canvas.draw() 40 | plt.close() 41 | 42 | return fig 43 | 44 | 45 | def init_weights(m, mean=0.0, std=0.01): 46 | classname = m.__class__.__name__ 47 | if classname.find("Conv") != -1: 48 | m.weight.data.normal_(mean, std) 49 | 50 | 51 | def apply_weight_norm(m): 52 | classname = m.__class__.__name__ 53 | if classname.find("Conv") != -1: 54 | weight_norm(m) 55 | 56 | 57 | def get_padding(kernel_size, dilation=1): 58 | return int((kernel_size * dilation - dilation) / 2) 59 | 60 | 61 | def load_checkpoint(filepath, device): 62 | assert os.path.isfile(filepath) 63 | print(f"Loading '{filepath}'") 64 | checkpoint_dict = torch.load(filepath, map_location=device) 65 | print("Complete.") 66 | return checkpoint_dict 67 | 68 | 69 | def save_checkpoint(filepath, obj): 70 | print(f"Saving checkpoint to {filepath}") 71 | torch.save(obj, filepath) 72 | print("Complete.") 73 | 74 | 75 | def scan_checkpoint(cp_dir, prefix, renamed_file=None): 76 | # Fallback to original scanning logic first 77 | pattern = os.path.join(cp_dir, prefix + "????????") 78 | cp_list = glob.glob(pattern) 79 | 80 | if 
len(cp_list) > 0: 81 | last_checkpoint_path = sorted(cp_list)[-1] 82 | print(f"[INFO] Resuming from checkpoint: '{last_checkpoint_path}'") 83 | return last_checkpoint_path 84 | 85 | # If no pattern-based checkpoints are found, check for renamed file 86 | if renamed_file: 87 | renamed_path = os.path.join(cp_dir, renamed_file) 88 | if os.path.isfile(renamed_path): 89 | print(f"[INFO] Resuming from renamed checkpoint: '{renamed_file}'") 90 | return renamed_path 91 | 92 | return None 93 | 94 | 95 | def save_audio(audio, path, sr): 96 | # wav: torch with 1d shape 97 | audio = audio * MAX_WAV_VALUE 98 | audio = audio.cpu().numpy().astype("int16") 99 | write(path, sr, audio) 100 | -------------------------------------------------------------------------------- /GPT_SoVITS/TTS_infer_pack/__init__.py: -------------------------------------------------------------------------------- 1 | from . import TTS, text_segmentation_method 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/.gitignore: -------------------------------------------------------------------------------- 1 | *.yaml -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 100, 4 | "eval_interval": 500, 5 | "seed": 1234, 6 | "epochs": 100, 7 | "learning_rate": 0.0001, 8 | "betas": [ 9 | 0.8, 10 | 0.99 11 | ], 12 | "eps": 1e-09, 13 | "batch_size": 32, 14 | "fp16_run": true, 15 | "lr_decay": 0.999875, 16 | "segment_size": 20480, 17 | "init_lr_ratio": 1, 18 | "warmup_epochs": 0, 19 | "c_mel": 45, 20 | "c_kl": 1.0, 21 | "text_low_lr_rate": 0.4, 22 | "grad_ckpt": false 23 | }, 24 | "data": { 25 | "max_wav_value": 32768.0, 26 | "sampling_rate": 32000, 27 | "filter_length": 2048, 28 | "hop_length": 640, 29 | "win_length": 2048, 30 | "n_mel_channels": 128, 31 | "mel_fmin": 0.0, 32 | "mel_fmax": null, 33 | "add_blank": true, 34 | "n_speakers": 300, 35 | "cleaned_text": true 36 | }, 37 | "model": { 38 | "inter_channels": 192, 39 | "hidden_channels": 192, 40 | "filter_channels": 768, 41 | "n_heads": 2, 42 | "n_layers": 6, 43 | "kernel_size": 3, 44 | "p_dropout": 0.1, 45 | "resblock": "1", 46 | "resblock_kernel_sizes": [ 47 | 3, 48 | 7, 49 | 11 50 | ], 51 | "resblock_dilation_sizes": [ 52 | [ 53 | 1, 54 | 3, 55 | 5 56 | ], 57 | [ 58 | 1, 59 | 3, 60 | 5 61 | ], 62 | [ 63 | 1, 64 | 3, 65 | 5 66 | ] 67 | ], 68 | "upsample_rates": [ 69 | 10, 70 | 8, 71 | 2, 72 | 2, 73 | 2 74 | ], 75 | "upsample_initial_channel": 512, 76 | "upsample_kernel_sizes": [ 77 | 16, 78 | 16, 79 | 8, 80 | 2, 81 | 2 82 | ], 83 | "n_layers_q": 3, 84 | "use_spectral_norm": false, 85 | "gin_channels": 512, 86 | "semantic_frame_rate": "25hz", 87 | "freeze_quantizer": true 88 | }, 89 | "s2_ckpt_dir": "logs/s2/big2k1", 90 | "content_module": "cnhubert" 91 | } -------------------------------------------------------------------------------- /GPT_SoVITS/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | now_dir = os.getcwd() 5 | sys.path.insert(0, now_dir) 6 | from text.g2pw import G2PWPinyin 7 | 8 | g2pw = G2PWPinyin( 9 | model_dir="GPT_SoVITS/text/G2PWModel", 10 | model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", 11 | v_to_u=False, 12 | neutral_tone_with_five=True, 13 | ) 14 | -------------------------------------------------------------------------------- 
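
configs/s2.json above is the stage-2 (SoVITS) training config, and it encodes the same kind of coupling seen in the BigVGAN configs: the decoder's upsample_rates multiply out to the STFT hop_length (10*8*2*2*2 = 640), so one latent frame becomes 640 samples of 32 kHz audio, i.e. 50 frames per second against the configured 25 Hz semantic_frame_rate. Below is a small standalone sketch, not part of the repo, that checks these relationships; the path assumes the repository root as the working directory.

import json
import math

with open("GPT_SoVITS/configs/s2.json") as f:
    hps = json.load(f)

data, model = hps["data"], hps["model"]

# Total decoder upsampling must equal the hop length: 10*8*2*2*2 = 640 samples per frame.
assert math.prod(model["upsample_rates"]) == data["hop_length"] == 640

# 32000 Hz / 640-sample hop = 50 spectrogram frames per second; the semantic
# tokens are configured at 25 Hz, half that rate.
frames_per_second = data["sampling_rate"] / data["hop_length"]
print(frames_per_second, "frames/s, semantic_frame_rate =", model["semantic_frame_rate"])
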
/GPT_SoVITS/f5_tts/model/__init__.py: -------------------------------------------------------------------------------- 1 | # from f5_tts.model.cfm import CFM 2 | # 3 | # from f5_tts.model.backbones.unett import UNetT 4 | from GPT_SoVITS.f5_tts.model.backbones.dit import DiT 5 | # from f5_tts.model.backbones.dit import DiTNoCond 6 | # from f5_tts.model.backbones.dit import DiTNoCondNoT 7 | # from f5_tts.model.backbones.mmdit import MMDiT 8 | 9 | # from f5_tts.model.trainer import Trainer 10 | 11 | 12 | # __all__ = ["CFM", "UNetT", "DiT", "MMDiT", "Trainer"] 13 | # __all__ = ["CFM", "UNetT", "DiTNoCond","DiT", "MMDiT"] 14 | -------------------------------------------------------------------------------- /GPT_SoVITS/f5_tts/model/backbones/README.md: -------------------------------------------------------------------------------- 1 | ## Backbones quick introduction 2 | 3 | 4 | ### unett.py 5 | - flat unet transformer 6 | - structure same as in e2-tts & voicebox paper except using rotary pos emb 7 | - update: allow possible abs pos emb & convnextv2 blocks for embedded text before concat 8 | 9 | ### dit.py 10 | - adaln-zero dit 11 | - embedded timestep as condition 12 | - concatted noised_input + masked_cond + embedded_text, linear proj in 13 | - possible abs pos emb & convnextv2 blocks for embedded text before concat 14 | - possible long skip connection (first layer to last layer) 15 | 16 | ### mmdit.py 17 | - sd3 structure 18 | - timestep as condition 19 | - left stream: text embedded and applied a abs pos emb 20 | - right stream: masked_cond & noised_input concatted and with same conv pos emb as unett 21 | -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from . import cnhubert, whisper_enc 2 | 3 | content_module_map = {"cnhubert": cnhubert, "whisper": whisper_enc} 4 | -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/cnhubert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from transformers import logging as tf_logging 4 | 5 | tf_logging.set_verbosity_error() 6 | 7 | import logging 8 | 9 | logging.getLogger("numba").setLevel(logging.WARNING) 10 | 11 | from transformers import ( 12 | Wav2Vec2FeatureExtractor, 13 | HubertModel, 14 | ) 15 | 16 | import utils 17 | import torch.nn as nn 18 | 19 | cnhubert_base_path = None 20 | 21 | 22 | class CNHubert(nn.Module): 23 | def __init__(self, base_path: str = None): 24 | super().__init__() 25 | if base_path is None: 26 | base_path = cnhubert_base_path 27 | if os.path.exists(base_path): 28 | ... 
29 | else: 30 | raise FileNotFoundError(base_path) 31 | self.model = HubertModel.from_pretrained(base_path, local_files_only=True) 32 | self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(base_path, local_files_only=True) 33 | 34 | def forward(self, x): 35 | input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 36 | feats = self.model(input_values)["last_hidden_state"] 37 | return feats 38 | 39 | 40 | # class CNHubertLarge(nn.Module): 41 | # def __init__(self): 42 | # super().__init__() 43 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 44 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 45 | # def forward(self, x): 46 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 47 | # feats = self.model(input_values)["last_hidden_state"] 48 | # return feats 49 | # 50 | # class CVec(nn.Module): 51 | # def __init__(self): 52 | # super().__init__() 53 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 54 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 55 | # def forward(self, x): 56 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 57 | # feats = self.model(input_values)["last_hidden_state"] 58 | # return feats 59 | # 60 | # class cnw2v2base(nn.Module): 61 | # def __init__(self): 62 | # super().__init__() 63 | # self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 64 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 65 | # def forward(self, x): 66 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 67 | # feats = self.model(input_values)["last_hidden_state"] 68 | # return feats 69 | 70 | 71 | def get_model(): 72 | model = CNHubert() 73 | model.eval() 74 | return model 75 | 76 | 77 | # def get_large_model(): 78 | # model = CNHubertLarge() 79 | # model.eval() 80 | # return model 81 | # 82 | # def get_model_cvec(): 83 | # model = CVec() 84 | # model.eval() 85 | # return model 86 | # 87 | # def get_model_cnw2v2base(): 88 | # model = cnw2v2base() 89 | # model.eval() 90 | # return model 91 | 92 | 93 | def get_content(hmodel, wav_16k_tensor): 94 | with torch.no_grad(): 95 | feats = hmodel(wav_16k_tensor) 96 | return feats.transpose(1, 2) 97 | 98 | 99 | if __name__ == "__main__": 100 | model = get_model() 101 | src_path = "/Users/Shared/原音频2.wav" 102 | wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000) 103 | model = model 104 | wav_16k_tensor = wav_16k_tensor 105 | feats = get_content(model, wav_16k_tensor) 106 | print(feats.shape) 107 | -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/whisper_enc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_model(): 5 | import whisper 6 | 7 | model = whisper.load_model("small", device="cpu") 8 | 9 | return model.encoder 10 | 11 | 12 | def get_content(model=None, wav_16k_tensor=None): 13 | from whisper import log_mel_spectrogram, pad_or_trim 14 | 15 | dev = next(model.parameters()).device 16 | mel = 
log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000] 17 | # if torch.cuda.is_available(): 18 | # mel = mel.to(torch.float16) 19 | feature_len = mel.shape[-1] // 2 20 | assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频" 21 | with torch.no_grad(): 22 | feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[:1, :feature_len, :].transpose(1, 2) 23 | return feature 24 | -------------------------------------------------------------------------------- /GPT_SoVITS/inference_cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import soundfile as sf 4 | 5 | from tools.i18n.i18n import I18nAuto 6 | from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav 7 | 8 | i18n = I18nAuto() 9 | 10 | 11 | def synthesize( 12 | GPT_model_path, 13 | SoVITS_model_path, 14 | ref_audio_path, 15 | ref_text_path, 16 | ref_language, 17 | target_text_path, 18 | target_language, 19 | output_path, 20 | ): 21 | # Read reference text 22 | with open(ref_text_path, "r", encoding="utf-8") as file: 23 | ref_text = file.read() 24 | 25 | # Read target text 26 | with open(target_text_path, "r", encoding="utf-8") as file: 27 | target_text = file.read() 28 | 29 | # Change model weights 30 | change_gpt_weights(gpt_path=GPT_model_path) 31 | change_sovits_weights(sovits_path=SoVITS_model_path) 32 | 33 | # Synthesize audio 34 | synthesis_result = get_tts_wav( 35 | ref_wav_path=ref_audio_path, 36 | prompt_text=ref_text, 37 | prompt_language=i18n(ref_language), 38 | text=target_text, 39 | text_language=i18n(target_language), 40 | top_p=1, 41 | temperature=1, 42 | ) 43 | 44 | result_list = list(synthesis_result) 45 | 46 | if result_list: 47 | last_sampling_rate, last_audio_data = result_list[-1] 48 | output_wav_path = os.path.join(output_path, "output.wav") 49 | sf.write(output_wav_path, last_audio_data, last_sampling_rate) 50 | print(f"Audio saved to {output_wav_path}") 51 | 52 | 53 | def main(): 54 | parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") 55 | parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file") 56 | parser.add_argument("--sovits_model", required=True, help="Path to the SoVITS model file") 57 | parser.add_argument("--ref_audio", required=True, help="Path to the reference audio file") 58 | parser.add_argument("--ref_text", required=True, help="Path to the reference text file") 59 | parser.add_argument( 60 | "--ref_language", required=True, choices=["中文", "英文", "日文"], help="Language of the reference audio" 61 | ) 62 | parser.add_argument("--target_text", required=True, help="Path to the target text file") 63 | parser.add_argument( 64 | "--target_language", 65 | required=True, 66 | choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"], 67 | help="Language of the target text", 68 | ) 69 | parser.add_argument("--output_path", required=True, help="Path to the output directory") 70 | 71 | args = parser.parse_args() 72 | 73 | synthesize( 74 | args.gpt_model, 75 | args.sovits_model, 76 | args.ref_audio, 77 | args.ref_text, 78 | args.ref_language, 79 | args.target_text, 80 | args.target_language, 81 | args.output_path, 82 | ) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /GPT_SoVITS/module/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/module/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/module/losses.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | 5 | 6 | def feature_loss(fmap_r, fmap_g): 7 | loss = 0 8 | for dr, dg in zip(fmap_r, fmap_g): 9 | for rl, gl in zip(dr, dg): 10 | rl = rl.float().detach() 11 | gl = gl.float() 12 | loss += torch.mean(torch.abs(rl - gl)) 13 | 14 | return loss * 2 15 | 16 | 17 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 18 | loss = 0 19 | r_losses = [] 20 | g_losses = [] 21 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 22 | dr = dr.float() 23 | dg = dg.float() 24 | r_loss = torch.mean((1 - dr) ** 2) 25 | g_loss = torch.mean(dg**2) 26 | loss += r_loss + g_loss 27 | r_losses.append(r_loss.item()) 28 | g_losses.append(g_loss.item()) 29 | 30 | return loss, r_losses, g_losses 31 | 32 | 33 | def generator_loss(disc_outputs): 34 | loss = 0 35 | gen_losses = [] 36 | for dg in disc_outputs: 37 | dg = dg.float() 38 | l = torch.mean((1 - dg) ** 2) 39 | gen_losses.append(l) 40 | loss += l 41 | 42 | return loss, gen_losses 43 | 44 | 45 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 46 | """ 47 | z_p, logs_q: [b, h, t_t] 48 | m_p, logs_p: [b, h, t_t] 49 | """ 50 | z_p = z_p.float() 51 | logs_q = logs_q.float() 52 | m_p = m_p.float() 53 | logs_p = logs_p.float() 54 | z_mask = z_mask.float() 55 | 56 | kl = logs_p - logs_q - 0.5 57 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 58 | kl = torch.sum(kl * z_mask) 59 | l = kl / torch.sum(z_mask) 60 | return l 61 | 62 | 63 | def mle_loss(z, m, logs, logdet, mask): 64 | l = torch.sum(logs) + 0.5 * torch.sum( 65 | torch.exp(-2 * logs) * ((z - m) ** 2) 66 | ) # neg normal likelihood w/o the constant term 67 | l = l - torch.sum(logdet) # log jacobian determinant 68 | l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes 69 | l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term 70 | return l 71 | -------------------------------------------------------------------------------- /GPT_SoVITS/pretrained_models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /GPT_SoVITS/process_ckpt.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | from collections import OrderedDict 3 | from time import time as ttime 4 | import shutil 5 | import os 6 | import torch 7 | from tools.i18n.i18n import I18nAuto 8 | 9 | i18n = I18nAuto() 10 | 11 | 12 | def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path 13 | dir = os.path.dirname(path) 14 | name = os.path.basename(path) 15 | tmp_path = "%s.pth" % (ttime()) 16 | torch.save(fea, tmp_path) 17 | shutil.move(tmp_path, "%s/%s" % (dir, name)) 18 | 19 | 20 | """ 21 | 00:v1 22 | 01:v2 23 | 02:v3 24 | 03:v3lora 25 | 04:v4lora 26 | 27 | """ 28 | from io import BytesIO 29 | 30 | 31 | def my_save2(fea, path,cfm_version): 32 | bio = BytesIO() 33 | torch.save(fea, bio) 34 | bio.seek(0) 35 | data = bio.getvalue() 36 | byte=b"03" if cfm_version=="v3"else b"04" 37 | data = byte + data[2:] 38 | with open(path, "wb") as f: 39 | f.write(data) 40 | 41 | 42 | def savee(ckpt, name, epoch, 
steps, hps, cfm_version=None,lora_rank=None): 43 | try: 44 | opt = OrderedDict() 45 | opt["weight"] = {} 46 | for key in ckpt.keys(): 47 | if "enc_q" in key: 48 | continue 49 | opt["weight"][key] = ckpt[key].half() 50 | opt["config"] = hps 51 | opt["info"] = "%sepoch_%siteration" % (epoch, steps) 52 | if lora_rank: 53 | opt["lora_rank"] = lora_rank 54 | my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name),cfm_version) 55 | else: 56 | my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) 57 | return "Success." 58 | except: 59 | return traceback.format_exc() 60 | 61 | 62 | head2version = { 63 | b"00": ["v1", "v1", False], 64 | b"01": ["v2", "v2", False], 65 | b"02": ["v2", "v3", False], 66 | b"03": ["v2", "v3", True], 67 | b"04": ["v2", "v4", True], 68 | } 69 | hash_pretrained_dict = { 70 | "dc3c97e17592963677a4a1681f30c653": ["v2", "v2", False], # s2G488k.pth#sovits_v1_pretrained 71 | "43797be674a37c1c83ee81081941ed0f": ["v2", "v3", False], # s2Gv3.pth#sovits_v3_pretrained 72 | "6642b37f3dbb1f76882b69937c95a5f3": ["v2", "v2", False], # s2G2333K.pth#sovits_v2_pretrained 73 | "4f26b9476d0c5033e04162c486074374": ["v2", "v4", False], # s2Gv4.pth#sovits_v4_pretrained 74 | } 75 | import hashlib 76 | 77 | 78 | def get_hash_from_file(sovits_path): 79 | with open(sovits_path, "rb") as f: 80 | data = f.read(8192) 81 | hash_md5 = hashlib.md5() 82 | hash_md5.update(data) 83 | return hash_md5.hexdigest() 84 | 85 | 86 | def get_sovits_version_from_path_fast(sovits_path): 87 | ###1-if it is pretrained sovits models, by hash 88 | hash = get_hash_from_file(sovits_path) 89 | if hash in hash_pretrained_dict: 90 | return hash_pretrained_dict[hash] 91 | ###2-new weights, by head 92 | with open(sovits_path, "rb") as f: 93 | version = f.read(2) 94 | if version != b"PK": 95 | return head2version[version] 96 | ###3-old weights, by file size 97 | if_lora_v3 = False 98 | size = os.path.getsize(sovits_path) 99 | """ 100 | v1weights:about 82942KB 101 | half thr:82978KB 102 | v2weights:about 83014KB 103 | v3weights:about 750MB 104 | """ 105 | if size < 82978 * 1024: 106 | model_version = version = "v1" 107 | elif size < 700 * 1024 * 1024: 108 | model_version = version = "v2" 109 | else: 110 | version = "v2" 111 | model_version = "v3" 112 | return version, model_version, if_lora_v3 113 | 114 | 115 | def load_sovits_new(sovits_path): 116 | f = open(sovits_path, "rb") 117 | meta = f.read(2) 118 | if meta != "PK": 119 | data = b"PK" + f.read() 120 | bio = BytesIO() 121 | bio.write(data) 122 | bio.seek(0) 123 | return torch.load(bio, map_location="cpu", weights_only=False) 124 | return torch.load(sovits_path, map_location="cpu", weights_only=False) 125 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/.gitignore: -------------------------------------------------------------------------------- 1 | G2PWModel 2 | __pycache__ 3 | *.zip -------------------------------------------------------------------------------- /GPT_SoVITS/text/LangSegmenter/__init__.py: -------------------------------------------------------------------------------- 1 | from .langsegmenter import LangSegmenter 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | # if os.environ.get("version","v1")=="v1": 3 | # from text.symbols import symbols 4 | # else: 5 | # from text.symbols2 import symbols 6 | 7 | from text import symbols as symbols_v1 8 | 
from text import symbols2 as symbols_v2 9 | 10 | _symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)} 11 | _symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)} 12 | 13 | 14 | def cleaned_text_to_sequence(cleaned_text, version=None): 15 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 16 | Args: 17 | text: string to convert to a sequence 18 | Returns: 19 | List of integers corresponding to the symbols in the text 20 | """ 21 | if version is None: 22 | version = os.environ.get("version", "v2") 23 | if version == "v1": 24 | phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text] 25 | else: 26 | phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text] 27 | 28 | return phones 29 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from text import cleaned_text_to_sequence 2 | import os 3 | # if os.environ.get("version","v1")=="v1": 4 | # from text import chinese 5 | # from text.symbols import symbols 6 | # else: 7 | # from text import chinese2 as chinese 8 | # from text.symbols2 import symbols 9 | 10 | from text import symbols as symbols_v1 11 | from text import symbols2 as symbols_v2 12 | 13 | special = [ 14 | # ("%", "zh", "SP"), 15 | ("¥", "zh", "SP2"), 16 | ("^", "zh", "SP3"), 17 | # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧 18 | ] 19 | 20 | 21 | def clean_text(text, language, version=None): 22 | if version is None: 23 | version = os.environ.get("version", "v2") 24 | if version == "v1": 25 | symbols = symbols_v1.symbols 26 | language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"} 27 | else: 28 | symbols = symbols_v2.symbols 29 | language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"} 30 | 31 | if language not in language_module_map: 32 | language = "en" 33 | text = " " 34 | for special_s, special_l, target_symbol in special: 35 | if special_s in text and language == special_l: 36 | return clean_special(text, language, special_s, target_symbol, version) 37 | language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]]) 38 | if hasattr(language_module, "text_normalize"): 39 | norm_text = language_module.text_normalize(text) 40 | else: 41 | norm_text = text 42 | if language == "zh" or language == "yue": ########## 43 | phones, word2ph = language_module.g2p(norm_text) 44 | assert len(phones) == sum(word2ph) 45 | assert len(norm_text) == len(word2ph) 46 | elif language == "en": 47 | phones = language_module.g2p(norm_text) 48 | if len(phones) < 4: 49 | phones = [","] + phones 50 | word2ph = None 51 | else: 52 | phones = language_module.g2p(norm_text) 53 | word2ph = None 54 | phones = ["UNK" if ph not in symbols else ph for ph in phones] 55 | return phones, word2ph, norm_text 56 | 57 | 58 | def clean_special(text, language, special_s, target_symbol, version=None): 59 | if version is None: 60 | version = os.environ.get("version", "v2") 61 | if version == "v1": 62 | symbols = symbols_v1.symbols 63 | language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"} 64 | else: 65 | symbols = symbols_v2.symbols 66 | language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"} 67 | 68 | """ 69 | 特殊静音段sp符号处理 70 | """ 71 | text = text.replace(special_s, ",") 72 | language_module = __import__("text." 
+ language_module_map[language], fromlist=[language_module_map[language]]) 73 | norm_text = language_module.text_normalize(text) 74 | phones = language_module.g2p(norm_text) 75 | new_ph = [] 76 | for ph in phones[0]: 77 | assert ph in symbols 78 | if ph == ",": 79 | new_ph.append(target_symbol) 80 | else: 81 | new_ph.append(ph) 82 | return new_ph, phones[1], norm_text 83 | 84 | 85 | def text_to_sequence(text, language, version=None): 86 | version = os.environ.get("version", version) 87 | if version is None: 88 | version = "v2" 89 | phones = clean_text(text) 90 | return cleaned_text_to_sequence(phones, version) 91 | 92 | 93 | if __name__ == "__main__": 94 | print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh")) 95 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/engdict-hot.rep: -------------------------------------------------------------------------------- 1 | CHATGPT CH AE1 T JH IY1 P IY1 T IY1 2 | JSON JH EY1 S AH0 N 3 | CONDA K AA1 N D AH0 -------------------------------------------------------------------------------- /GPT_SoVITS/text/engdict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/text/engdict_cache.pickle -------------------------------------------------------------------------------- /GPT_SoVITS/text/g2pw/__init__.py: -------------------------------------------------------------------------------- 1 | from text.g2pw.g2pw import * 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/g2pw/polyphonic.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/text/g2pw/polyphonic.pickle -------------------------------------------------------------------------------- /GPT_SoVITS/text/g2pw/polyphonic.rep: -------------------------------------------------------------------------------- 1 | 湖泊: ['hu2','po1'] 2 | 地壳: ['di4','qiao4'] 3 | 柏树: ['bai3','shu4'] 4 | 曝光: ['bao4','guang1'] 5 | 弹力: ['tan2','li4'] 6 | 字帖: ['zi4','tie4'] 7 | 口吃: ['kou3','chi1'] 8 | 包扎: ['bao1','za1'] 9 | 哪吒: ['ne2','zha1'] 10 | 说服: ['shuo1','fu2'] 11 | 识字: ['shi2','zi4'] 12 | 骨头: ['gu3','tou5'] 13 | 对称: ['dui4','chen4'] 14 | 口供: ['kou3','gong4'] 15 | 抹布: ['ma1','bu4'] 16 | 露背: ['lu4','bei4'] 17 | 圈养: ['juan4', 'yang3'] 18 | 眼眶: ['yan3', 'kuang4'] 19 | 品行: ['pin3','xing2'] 20 | 颤抖: ['chan4','dou3'] 21 | 差不多: ['cha4','bu5','duo1'] 22 | 鸭绿江: ['ya1','lu4','jiang1'] 23 | 撒切尔: ['sa4','qie4','er3'] 24 | 比比皆是: ['bi3','bi3','jie1','shi4'] 25 | 身无长物: ['shen1','wu2','chang2','wu4'] 26 | 手里: ['shou2','li3'] 27 | 关卡: ['guan1','qia3'] 28 | 怀揣: ['huai2','chuai1'] 29 | 挑剔: ['tiao1','ti4'] 30 | 供称: ['gong4','cheng1'] 31 | 作坊: ['zuo1', 'fang5'] 32 | 中医: ['zhong1','yi1'] 33 | 嚷嚷: ['rang1','rang5'] 34 | 商厦: ['shang1','sha4'] 35 | 大厦: ['da4','sha4'] 36 | 刹车: ['sha1','che1'] 37 | 嘚瑟: ['de4','se5'] 38 | 朝鲜: ['chao2','xian3'] 39 | 阿房宫: ['e1','pang2','gong1'] 40 | 阿胶: ['e1','jiao1'] 41 | 咖喱: ['ga1','li5'] 42 | 时分: ['shi2','fen1'] 43 | 蚌埠: ['beng4','bu4'] 44 | 驯服: ['xun4','fu2'] 45 | 幸免于难: ['xing4','mian3','yu2','nan4'] 46 | 恶行: ['e4','xing2'] 47 | 唉: ['ai4'] 48 | 扎实: ['zha1','shi2'] 49 | 干将: ['gan4','jiang4'] 50 | 陈威行: ['chen2', 'wei1', 'hang2'] 51 | 郭晟: ['guo1', 'sheng4'] 52 | 中标: ['zhong4', 'biao1'] 53 | 抗住: ['kang2', 'zhu4'] 
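Each line above maps one polyphonic word to its intended pinyin sequence, in the form `词: ['pinyin1','pinyin2']`. As a minimal, hypothetical sketch only (the repository ships the same overrides pre-serialized as `polyphonic.pickle` in this directory, so this is not the project's actual loader), such a file could be parsed into a lookup table like this:

```python
# Hypothetical helper, not part of the repository: parse polyphonic.rep into a
# {word: [pinyin, ...]} override table. The repo ships the pre-built
# polyphonic.pickle next to this file, so this parser is illustration only.
import ast


def load_polyphonic_rep(path):
    table = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or ":" not in line:
                continue
            word, pinyins = line.split(":", 1)
            # the value part is a Python-style list literal, e.g. "['hu2','po1']"
            table[word.strip()] = ast.literal_eval(pinyins.strip())
    return table


# e.g. load_polyphonic_rep("GPT_SoVITS/text/g2pw/polyphonic.rep")["湖泊"] == ['hu2', 'po1']
```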
-------------------------------------------------------------------------------- /GPT_SoVITS/text/namedict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/GPT_SoVITS/text/namedict_cache.pickle -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/README.md: -------------------------------------------------------------------------------- 1 | ## Supported NSW (Non-Standard-Word) Normalization 2 | 3 | |NSW type|raw|normalized| 4 | |:--|:-|:-| 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| 6 | |cardinal|这块黄金重达324.75克
我们班的最高总分为583分|这块黄金重达三百二十四点七五克
我们班的最高总分为五百八十三分| 7 | |numeric range |12\~23
-1.5\~2|十二到二十三
负一点五到二| 8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日| 9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我 10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度 11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| 12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| 13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万| 14 | |telephone|这是固话0421-33441122
这是手机+86 18544139121|这是固话零四二一三三四四一一二二
这是手机八六一八五四四一三九一二一| 15 | ## References 16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) 17 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from text.zh_normalization.text_normlization import * 15 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/chronology.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import DIGITS 17 | from .num import num2str 18 | from .num import verbalize_cardinal 19 | from .num import verbalize_digit 20 | 21 | 22 | def _time_num2str(num_string: str) -> str: 23 | """A special case for verbalizing number in time.""" 24 | result = num2str(num_string.lstrip("0")) 25 | if num_string.startswith("0"): 26 | result = DIGITS["0"] + result 27 | return result 28 | 29 | 30 | # 时刻表达式 31 | RE_TIME = re.compile( 32 | r"([0-1]?[0-9]|2[0-3])" 33 | r":([0-5][0-9])" 34 | r"(:([0-5][0-9]))?" 35 | ) 36 | 37 | # 时间范围,如8:30-12:30 38 | RE_TIME_RANGE = re.compile( 39 | r"([0-1]?[0-9]|2[0-3])" 40 | r":([0-5][0-9])" 41 | r"(:([0-5][0-9]))?" 42 | r"(~|-)" 43 | r"([0-1]?[0-9]|2[0-3])" 44 | r":([0-5][0-9])" 45 | r"(:([0-5][0-9]))?" 
46 | ) 47 | 48 | 49 | def replace_time(match) -> str: 50 | """ 51 | Args: 52 | match (re.Match) 53 | Returns: 54 | str 55 | """ 56 | 57 | is_range = len(match.groups()) > 5 58 | 59 | hour = match.group(1) 60 | minute = match.group(2) 61 | second = match.group(4) 62 | 63 | if is_range: 64 | hour_2 = match.group(6) 65 | minute_2 = match.group(7) 66 | second_2 = match.group(9) 67 | 68 | result = f"{num2str(hour)}点" 69 | if minute.lstrip("0"): 70 | if int(minute) == 30: 71 | result += "半" 72 | else: 73 | result += f"{_time_num2str(minute)}分" 74 | if second and second.lstrip("0"): 75 | result += f"{_time_num2str(second)}秒" 76 | 77 | if is_range: 78 | result += "至" 79 | result += f"{num2str(hour_2)}点" 80 | if minute_2.lstrip("0"): 81 | if int(minute) == 30: 82 | result += "半" 83 | else: 84 | result += f"{_time_num2str(minute_2)}分" 85 | if second_2 and second_2.lstrip("0"): 86 | result += f"{_time_num2str(second_2)}秒" 87 | 88 | return result 89 | 90 | 91 | RE_DATE = re.compile( 92 | r"(\d{4}|\d{2})年" 93 | r"((0?[1-9]|1[0-2])月)?" 94 | r"(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?" 95 | ) 96 | 97 | 98 | def replace_date(match) -> str: 99 | """ 100 | Args: 101 | match (re.Match) 102 | Returns: 103 | str 104 | """ 105 | year = match.group(1) 106 | month = match.group(3) 107 | day = match.group(5) 108 | result = "" 109 | if year: 110 | result += f"{verbalize_digit(year)}年" 111 | if month: 112 | result += f"{verbalize_cardinal(month)}月" 113 | if day: 114 | result += f"{verbalize_cardinal(day)}{match.group(9)}" 115 | return result 116 | 117 | 118 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 119 | RE_DATE2 = re.compile(r"(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])") 120 | 121 | 122 | def replace_date2(match) -> str: 123 | """ 124 | Args: 125 | match (re.Match) 126 | Returns: 127 | str 128 | """ 129 | year = match.group(1) 130 | month = match.group(3) 131 | day = match.group(4) 132 | result = "" 133 | if year: 134 | result += f"{verbalize_digit(year)}年" 135 | if month: 136 | result += f"{verbalize_cardinal(month)}月" 137 | if day: 138 | result += f"{verbalize_cardinal(day)}日" 139 | return result 140 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import re 15 | import string 16 | 17 | from pypinyin.constants import SUPPORT_UCS4 18 | 19 | # 全角半角转换 20 | # 英文字符全角 -> 半角映射表 (num: 52) 21 | F2H_ASCII_LETTERS = {ord(char) + 65248: ord(char) for char in string.ascii_letters} 22 | 23 | # 英文字符半角 -> 全角映射表 24 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} 25 | 26 | # 数字字符全角 -> 半角映射表 (num: 10) 27 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits} 28 | # 数字字符半角 -> 全角映射表 29 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} 30 | 31 | # 标点符号全角 -> 半角映射表 (num: 32) 32 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} 33 | # 标点符号半角 -> 全角映射表 34 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} 35 | 36 | # 空格 (num: 1) 37 | F2H_SPACE = {"\u3000": " "} 38 | H2F_SPACE = {" ": "\u3000"} 39 | 40 | # 非"有拼音的汉字"的字符串,可用于NSW提取 41 | if SUPPORT_UCS4: 42 | RE_NSW = re.compile( 43 | r"(?:[^" 44 | r"\u3007" # 〇 45 | r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF] 46 | r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF] 47 | r"\uf900-\ufaff" # CJK兼容:[F900-FAFF] 48 | r"\U00020000-\U0002A6DF" # CJK扩展B:[20000-2A6DF] 49 | r"\U0002A703-\U0002B73F" # CJK扩展C:[2A700-2B73F] 50 | r"\U0002B740-\U0002B81D" # CJK扩展D:[2B740-2B81D] 51 | r"\U0002F80A-\U0002FA1F" # CJK兼容扩展:[2F800-2FA1F] 52 | r"])+" 53 | ) 54 | else: 55 | RE_NSW = re.compile( # pragma: no cover 56 | r"(?:[^" 57 | r"\u3007" # 〇 58 | r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF] 59 | r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF] 60 | r"\uf900-\ufaff" # CJK兼容:[F900-FAFF] 61 | r"])+" 62 | ) 63 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/phonecode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import verbalize_digit 17 | 18 | # 规范化固话/手机号码 19 | # 手机 20 | # http://www.jihaoba.com/news/show/13680 21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 22 | # 联通:130、131、132、156、155、186、185、176 23 | # 电信:133、153、189、180、181、177 24 | RE_MOBILE_PHONE = re.compile(r"(? 
str: 32 | if mobile: 33 | sp_parts = phone_string.strip("+").split() 34 | result = ",".join([verbalize_digit(part, alt_one=True) for part in sp_parts]) 35 | return result 36 | else: 37 | sil_parts = phone_string.split("-") 38 | result = ",".join([verbalize_digit(part, alt_one=True) for part in sil_parts]) 39 | return result 40 | 41 | 42 | def replace_phone(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | return phone2str(match.group(0), mobile=False) 50 | 51 | 52 | def replace_mobile(match) -> str: 53 | """ 54 | Args: 55 | match (re.Match) 56 | Returns: 57 | str 58 | """ 59 | return phone2str(match.group(0)) 60 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/quantifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import num2str 17 | 18 | # 温度表达式,温度会影响负号的读法 19 | # -3°C 零下三度 20 | RE_TEMPERATURE = re.compile(r"(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)") 21 | measure_dict = { 22 | "cm2": "平方厘米", 23 | "cm²": "平方厘米", 24 | "cm3": "立方厘米", 25 | "cm³": "立方厘米", 26 | "cm": "厘米", 27 | "db": "分贝", 28 | "ds": "毫秒", 29 | "kg": "千克", 30 | "km": "千米", 31 | "m2": "平方米", 32 | "m²": "平方米", 33 | "m³": "立方米", 34 | "m3": "立方米", 35 | "ml": "毫升", 36 | "m": "米", 37 | "mm": "毫米", 38 | "s": "秒", 39 | } 40 | 41 | 42 | def replace_temperature(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | sign = match.group(1) 50 | temperature = match.group(2) 51 | unit = match.group(3) 52 | sign: str = "零下" if sign else "" 53 | temperature: str = num2str(temperature) 54 | unit: str = "摄氏度" if unit == "摄氏度" else "度" 55 | result = f"{sign}{temperature}{unit}" 56 | return result 57 | 58 | 59 | def replace_measure(sentence) -> str: 60 | for q_notation in measure_dict: 61 | if q_notation in sentence: 62 | sentence = sentence.replace(q_notation, measure_dict[q_notation]) 63 | return sentence 64 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 AlfreScarlet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 基于GPT-SoVITS的语音交互系统 2 | ## 简介 3 | 一个非常基础的语音交互系统,使用GPT-SoVITS作为TTS模块。集成ASR接口,使用funasr作为语音识别模块基础。支持openai规范的大模型接口。 4 | Linux环境下首Token延迟基本能做到1.5s以内。Windows环境下延迟在2.1s左右 5 | ### 测试平台 6 | 服务端 7 | - OS:Manjaro 8 | - CPU:R9 5950X 9 | - GPU:RTX 3080ti 10 | 11 | 客户端 12 | - 树莓派5 13 | 14 | ### 测试结果 15 | ![](screen/img.png) 16 | ## 整合包使用说明 17 | 整合包下载链接:http://ss.alfresama.moe:5244/MoeChat 18 | ### 注意!重要的事情说三遍 19 | ### 服务端只会对英文""符号包裹的文本进行语音合成,使用前请修改大模型的提示词! 20 | ### 服务端只会对英文""符号包裹的文本进行语音合成,使用前请修改大模型的提示词! 21 | ### 服务端只会对英文""符号包裹的文本进行语音合成,使用前请修改大模型的提示词! 22 | 整合包不包含用于推理的GPT跟SoVITS模型,需要自行添加底模或者训练好的模型。 23 | ### Windows 24 | ```bash 25 | runtime\python.exe chat_server.py 26 | ``` 27 | ### Linux 28 | ```bash 29 | # 创建虚拟环境 30 | python -m venv pp 31 | 32 | # 进入虚拟环境 33 | source pp/bin/activate 34 | 35 | # 安装依赖 36 | pip install -r extra-req.txt 37 | pip install -r requirements.txt 38 | pip install -r extra-req2.txt 39 | 40 | # 运行 41 | python chat_server.py 42 | ``` 43 | ### 配置说明 44 | 整合包配置文件为config.yaml 45 | ```yaml 46 | Core: 47 | sv: # 声纹配置 48 | is_up: true # 是否启用声纹识别 49 | master_audio: # 音频路径 50 | thr: # 不知道有什么用暂时留空 51 | LLM: 52 | api: # 大模型API 53 | key: # 大模型API_Key 54 | model: # 模型名称 55 | extra_config: # 大模型API额外参数,如:temperature: 0.7,温度参数 56 | temperature: 0.7 57 | GSV: 58 | text_lang: zh # 合成文本的语种 59 | GPT_weight: # GPT_weight模型路径 60 | SoVITS_weight: # SoVITS_weight模型路径 61 | ref_audio_path: # 主要参考音频路径 62 | prompt_text: # 参考音频文本 63 | prompt_lang: zh # 参考音频语种 64 | aux_ref_audio_paths: # 多参考音频 65 | - # 多参考音频文件路径 66 | seed: -1 # 种子 67 | top_k: 15 # 情感表现程度,越高情感越丰富,也可能越奇怪 68 | batch_size: 1 69 | extra_ref_audio: # 使用情绪标签选择参考音频,例如 [普通]"你好呀。" 70 | # 实例 71 | 普通: 72 | - 参考音频路径 73 | - 参考音频文本 74 | ``` 75 | 76 | ### 简易客户端使用方法 77 | 78 | #### Windows 79 | 测试使用python 3.10 80 | 首先修改client.py文件asr_api、chat_api的ip地址。 81 | ##### 带简单GUI的客户端 82 | ```bash 83 | # 运行 84 | runtime\python.exe client-gui\src\client_gui.py 85 | ``` 86 | 87 | #### Linux 88 | ```bash 89 | # 创建虚拟环境 90 | python -m venv pp 91 | 92 | # 进入虚拟环境 93 | source pp/bin/activate 94 | 95 | # 安装依赖 96 | pip install -r client-requirements.txt 97 | 98 | # 启动 99 | python client-gui\src\client_gui.py 100 | ``` 101 | 102 | ### 在客户端上修改提示词的方法 103 | 此方法不适用于ollama,非必要情况下可以使用LMstudio 104 | ```bash 105 | # 打开client_cli.py文件,GUI简易客户端打开client-gui\src\utils.py文件,修改下面内容 106 | # 修改前 107 | # 用于存储上下文内容 108 | data = { 109 | "msg": [] 110 | } 111 | 112 | #修改后 113 | # 用于存储上下文内容 114 | data = { 115 | "msg": [ 116 | {"role":"system", "content": ```填入你的提示词```} 117 | ] 118 | } 119 | ``` 120 | 121 | ## 接口说明 122 | 接口全部使用POST请求。 123 | 124 | #### ASR语音识别接口 125 | ```python 126 | # url为/api/asr 127 | # 请求数据格式为json 128 | # 将音频数据编码成urlsafe的base64字符串,放进请求体data字段中 129 | { 130 | "data": str # base64音频数据 131 | } 132 | # 服务端直接返回识别结果文本 133 | ``` 134 | 135 | #### 对话接口 136 | ```python 137 | # 对话接口为sse流式接口,服务端会将大模型的回答切片并生成对应的语音数据,一段一段返回客户端 
138 | # 请求数据格式为json 139 | # 将大模型上下文数据放进msg字段,类型为字符串数组 140 | # 请求例子 141 | { 142 | "msg": [ 143 | {"role": "user", "content": "你好呀!"}, 144 | {"role": "assistant", "content": "你好呀!有什么能帮到你的吗?"}, 145 | {"role": "user", "content": "1+1等于多少呢?"}, 146 | ] 147 | } 148 | 149 | # 服务端响应例子 150 | { 151 | "file": str # urlsafe的base64字符串音频文件 152 | "message": str # 音频数据对应的文本 153 | "done": False # bool类型,用于判断是否为最后一个数据包 154 | } 155 | # 最后一个数据包服务端会将大模型完整的回答文本放进message字段返回客户端 156 | { 157 | "file": str 158 | "message": str # 字符串类型,大模型完整回答文本,用于拼接上下文 159 | "done": True # bool类型,用于判断是否为最后一个数据包 160 | } 161 | ``` 162 | -------------------------------------------------------------------------------- /client-gui/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | *.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # Flet 163 | storage/ -------------------------------------------------------------------------------- /client-gui/README.md: -------------------------------------------------------------------------------- 1 | # ClientGui app 2 | 3 | ## Run the app 4 | 5 | ### uv 6 | 7 | Run as a desktop app: 8 | 9 | ``` 10 | uv run flet run 11 | ``` 12 | 13 | Run as a web app: 14 | 15 | ``` 16 | uv run flet run --web 17 | ``` 18 | 19 | ### Poetry 20 | 21 | Install dependencies from `pyproject.toml`: 22 | 23 | ``` 24 | poetry install 25 | ``` 26 | 27 | Run as a desktop app: 28 | 29 | ``` 30 | poetry run flet run 31 | ``` 32 | 33 | Run as a web app: 34 | 35 | ``` 36 | poetry run flet run --web 37 | ``` 38 | 39 | For more details on running the app, refer to the [Getting Started Guide](https://flet.dev/docs/getting-started/). 40 | 41 | ## Build the app 42 | 43 | ### Android 44 | 45 | ``` 46 | flet build apk -v 47 | ``` 48 | 49 | For more details on building and signing `.apk` or `.aab`, refer to the [Android Packaging Guide](https://flet.dev/docs/publish/android/). 50 | 51 | ### iOS 52 | 53 | ``` 54 | flet build ipa -v 55 | ``` 56 | 57 | For more details on building and signing `.ipa`, refer to the [iOS Packaging Guide](https://flet.dev/docs/publish/ios/). 58 | 59 | ### macOS 60 | 61 | ``` 62 | flet build macos -v 63 | ``` 64 | 65 | For more details on building macOS package, refer to the [macOS Packaging Guide](https://flet.dev/docs/publish/macos/). 66 | 67 | ### Linux 68 | 69 | ``` 70 | flet build linux -v 71 | ``` 72 | 73 | For more details on building Linux package, refer to the [Linux Packaging Guide](https://flet.dev/docs/publish/linux/). 74 | 75 | ### Windows 76 | 77 | ``` 78 | flet build windows -v 79 | ``` 80 | 81 | For more details on building Windows package, refer to the [Windows Packaging Guide](https://flet.dev/docs/publish/windows/). 
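Note that the top-level README of this repository starts the same app directly with the project's Python environment instead of going through `flet run`; assuming the paths given in that README, an equivalent invocation looks like:

```
# Windows, using the bundled runtime described in the top-level README
runtime\python.exe client-gui\src\client_gui.py

# Linux, inside the virtual environment created per the top-level README
python client-gui/src/client_gui.py
```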
-------------------------------------------------------------------------------- /client-gui/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "client-gui" 3 | version = "0.1.0" 4 | description = "" 5 | readme = "README.md" 6 | requires-python = ">=3.9" 7 | authors = [ 8 | { name = "Flet developer", email = "you@example.com" } 9 | ] 10 | dependencies = [ 11 | "flet==0.27.6" 12 | ] 13 | 14 | [tool.flet] 15 | # org name in reverse domain name notation, e.g. "com.mycompany". 16 | # Combined with project.name to build bundle ID for iOS and Android apps 17 | org = "com.mycompany" 18 | 19 | # project display name that is used as an app title on Android and iOS home screens, 20 | # shown in window titles and about app dialogs on desktop. 21 | product = "client-gui" 22 | 23 | # company name to display in about app dialogs 24 | company = "Flet" 25 | 26 | # copyright text to display in about app dialogs 27 | copyright = "Copyright (C) 2025 by Flet" 28 | 29 | [tool.flet.app] 30 | path = "src" 31 | 32 | [tool.uv] 33 | dev-dependencies = [ 34 | "flet[all]==0.27.6", 35 | ] 36 | 37 | [tool.poetry] 38 | package-mode = false 39 | 40 | [tool.poetry.group.dev.dependencies] 41 | flet = {extras = ["all"], version = "0.27.6"} -------------------------------------------------------------------------------- /client-gui/src/assets/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/client-gui/src/assets/icon.png -------------------------------------------------------------------------------- /client-gui/src/assets/splash_android.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/client-gui/src/assets/splash_android.png -------------------------------------------------------------------------------- /client-gui/src/cli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import click 16 | import matplotlib.pyplot as plt 17 | import numpy as np 18 | from audiolab import Reader, Writer, info, load_audio 19 | 20 | from pysilero import SileroVAD, VADIterator 21 | 22 | 23 | @click.command() 24 | @click.argument("wav_path", type=click.Path(exists=True, file_okay=True)) 25 | @click.option("--version", default="v5", help="Silero VAD version") 26 | @click.option("--denoise/--no-denoise", default=False, help="Denoise before vad") 27 | @click.option("--streaming/--no-streaming", default=False, help="Streming mode") 28 | @click.option("--save-path", help="Save path for output audio") 29 | @click.option("--plot/--no-plot", default=False, help="Plot the vad probabilities") 30 | def main(wav_path, version, denoise, streaming, save_path, plot): 31 | if not streaming: 32 | model = SileroVAD(version, info(wav_path).rate, denoise=denoise) 33 | speech_timestamps = model.get_speech_timestamps(wav_path, return_seconds=True, save_path=save_path) 34 | print("None streaming result:", list(speech_timestamps)) 35 | 36 | if plot: 37 | audio, rate = load_audio(wav_path, dtype=np.float32) 38 | x1 = np.arange(0, audio.shape[1]) / rate 39 | outputs = list(model.get_speech_probs(wav_path)) 40 | x2 = [i * 32 / 1000 for i in range(0, len(outputs))] 41 | plt.plot(x1, audio[0]) 42 | plt.plot(x2, outputs) 43 | plt.show() 44 | else: 45 | print("Streaming result:", end=" ") 46 | reader = Reader(wav_path, dtype=np.float32, frame_size_ms=10) 47 | if save_path is not None: 48 | writer = Writer(save_path, reader.rate, layout=reader.layout) 49 | vad_iterator = VADIterator(version, reader.rate) 50 | for idx, (frame, _) in enumerate(reader): 51 | partial = idx == reader.num_frames - 1 52 | for speech_dict, speech_samples in vad_iterator(frame.squeeze(), partial, return_seconds=True): 53 | if "start" in speech_dict or "end" in speech_dict: 54 | print(speech_dict, end=" ") 55 | if save_path is not None and speech_samples is not None: 56 | writer.write(speech_samples) 57 | 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /client-gui/src/client_gui.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | now_dir = os.getcwd() 5 | sys.path.append(now_dir) 6 | sys.path.append("%s/client-gui" % (now_dir)) 7 | sys.path.append("%s/client-gui/src" % (now_dir)) 8 | 9 | import flet as ft 10 | import ui 11 | from threading import Thread 12 | import client_utils 13 | 14 | client_utils_thread = Thread(target=client_utils.main, args=()) 15 | client_utils_thread.daemon = True 16 | client_utils_thread.start() 17 | 18 | def get_msg_box(msg: str): 19 | return ft.Container( 20 | content=ft.Text(msg, size=20, text_align=ft.TextAlign.CENTER), 21 | padding=5, 22 | border_radius=10, 23 | bgcolor=ft.colors.BLUE_900, 24 | ) 25 | 26 | def main(page: ft.Page): 27 | page.horizontal_alignment = ft.CrossAxisAlignment.STRETCH 28 | page.title = "Moe Chat GUI" 29 | 30 | def send_message_click(e): 31 | if new_message.value != "" and client_utils.status: 32 | mmsg = f"\"{new_message.value}\"" 33 | client_utils.add_msg_me(mmsg.replace("\"", "")) 34 | client_utils.to_llm_and_tts(mmsg, "0.000") 35 | new_message.value = "" 36 | new_message.focus() 37 | page.update() 38 | 39 | # A new message entry form 40 | new_message = ft.TextField( 41 | hint_text="输入信息", 42 | autofocus=True, 43 | shift_enter=True, 44 | min_lines=1, 45 | max_lines=5, 46 | filled=True, 47 | expand=True, 48 | 
on_submit=send_message_click, 49 | ) 50 | 51 | # Add everything to the page 52 | page.add( 53 | ft.Container( 54 | content=ui.chat_list, 55 | border=ft.border.all(1, ft.Colors.OUTLINE), 56 | border_radius=5, 57 | padding=10, 58 | expand=True, 59 | ), 60 | ft.Row( 61 | [ 62 | new_message, 63 | ft.IconButton( 64 | icon=ft.Icons.SEND_ROUNDED, 65 | tooltip="Send message", 66 | on_click=send_message_click, 67 | ), 68 | ] 69 | ), 70 | ) 71 | 72 | ft.app(target=main) -------------------------------------------------------------------------------- /client-gui/src/frame_queue.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import numpy as np 16 | import soxr 17 | 18 | 19 | class FrameQueue: 20 | def __init__(self, frame_size, in_rate, speech_pad_samples=0, out_rate=None, padding=True): 21 | self.frame_size = frame_size 22 | # padding zeros for the last frame 23 | self.padding = padding 24 | self.speech_pad_samples = speech_pad_samples 25 | # cache the original samples for padding and soxr's delay 26 | # TODO: use the largest delay of soxr instead of 500ms cache 27 | num_cached_samples = speech_pad_samples + 500 * in_rate // 1000 28 | self.cached_samples = np.zeros(num_cached_samples, dtype=np.float32) 29 | self.cache_start = -len(self.cached_samples) 30 | 31 | self.current_sample = 0 32 | self.remained_samples = np.empty(0, dtype=np.float32) 33 | 34 | if out_rate is None or in_rate == out_rate: 35 | self.step = 1.0 36 | self.resampler = None 37 | else: 38 | self.step = in_rate / out_rate 39 | self.resampler = soxr.ResampleStream(in_rate, out_rate, num_channels=1) 40 | 41 | def add_chunk(self, chunk, is_last=False): 42 | # cache the original frame without resampling for `lookforward` of vad start 43 | # cache start is the absolute sample index of the first sample in the cached_samples 44 | if len(chunk) > 0: 45 | self.cache_start += len(chunk) 46 | self.cached_samples = np.roll(self.cached_samples, -len(chunk)) 47 | self.cached_samples[-len(chunk) :] = chunk[-len(self.cached_samples) :] 48 | # resample 49 | if self.resampler is not None: 50 | chunk = self.resampler.resample_chunk(chunk, is_last) 51 | # enqueue chunk 52 | self.remained_samples = np.concatenate((self.remained_samples, chunk)) 53 | 54 | while len(self.remained_samples) >= self.frame_size: 55 | frame = self.remained_samples[: self.frame_size] 56 | self.remained_samples = self.remained_samples[self.frame_size :] 57 | # frame_start and frame_end is the sample index before resampling 58 | frame_start = self.current_sample 59 | self.current_sample += int(len(frame) * self.step) 60 | frame_end = self.current_sample 61 | yield frame_start, frame_end, frame 62 | 63 | if is_last and len(self.remained_samples) > 0 and self.padding: 64 | frame = self.remained_samples 65 | frame_start = self.current_sample 66 | self.current_sample += int(len(frame) * 
self.step) 67 | frame = np.pad(frame, (0, self.frame_size - len(frame))) 68 | frame_end = self.current_sample 69 | yield frame_start, frame_end, frame 70 | 71 | def get_frame(self, speech_padding=False): 72 | # dequeue one original frame without resampling 73 | frame_start = self.current_sample - int(self.frame_size * self.step) 74 | frame_end = self.current_sample 75 | if speech_padding: 76 | frame_start -= self.speech_pad_samples 77 | # get the relative sample index of the speech 78 | speech_start = frame_start - self.cache_start 79 | speech_end = frame_end - self.cache_start 80 | return self.cached_samples[speech_start:speech_end] 81 | 82 | 83 | if __name__ == "__main__": 84 | queue = FrameQueue(3, 1000) 85 | frames = [[1, 2, 3], [4, 5], [6, 7, 8]] 86 | for index, frame in enumerate(frames): 87 | for frame_start, frame_end, frame in queue.add_chunk(frame, index == len(frames) - 1): 88 | print(frame_start, frame_end, frame) 89 | -------------------------------------------------------------------------------- /client-gui/src/pickable_session.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from functools import partial 16 | 17 | import onnxruntime as ort 18 | from modelscope import snapshot_download 19 | 20 | 21 | class PickableSession: 22 | """ 23 | This is a wrapper to make the current InferenceSession class pickable. 
24 | """ 25 | 26 | def __init__(self, version="v5"): 27 | opts = ort.SessionOptions() 28 | opts.inter_op_num_threads = 1 29 | opts.intra_op_num_threads = 1 30 | opts.log_severity_level = 3 31 | 32 | assert version in ["v4", "v5"] 33 | model_id = "pengzhendong/silero-vad" 34 | try: 35 | repo_dir = snapshot_download(model_id) 36 | except Exception: 37 | from modelscope.utils.file_utils import get_default_modelscope_cache_dir 38 | 39 | repo_dir = f"{get_default_modelscope_cache_dir()}/models/{model_id}" 40 | self.model_path = f"{repo_dir}/{version}/silero_vad.onnx" 41 | self.init_session = partial(ort.InferenceSession, sess_options=opts, providers=["CPUExecutionProvider"]) 42 | self.sess = self.init_session(self.model_path) 43 | 44 | def run(self, *args): 45 | return self.sess.run(None, *args) 46 | 47 | def __getstate__(self): 48 | return {"model_path": self.model_path} 49 | 50 | def __setstate__(self, values): 51 | self.model_path = values["model_path"] 52 | self.sess = self.init_session(self.model_path) 53 | 54 | 55 | VERSIONS = ["v4", "v5"] 56 | silero_vad = {version: PickableSession(version) for version in VERSIONS} 57 | -------------------------------------------------------------------------------- /client-gui/src/ui.py: -------------------------------------------------------------------------------- 1 | import flet as ft 2 | 3 | # Chat messages 4 | chat_list = ft.ListView( 5 | controls=[], 6 | expand=True, 7 | spacing=10, 8 | auto_scroll=True, 9 | ) 10 | 11 | class ChatMessage: 12 | def __init__(self, user_name: str, text: str, positon: str): 13 | if positon == "left": 14 | self.position1 = ft.MainAxisAlignment.START 15 | self.position2 = ft.CrossAxisAlignment.START 16 | else: 17 | self.position1 = ft.MainAxisAlignment.END 18 | self.position2 = ft.CrossAxisAlignment.END 19 | self.user_name = user_name 20 | self.text = text 21 | self.tou = ft.CircleAvatar( 22 | content=ft.Text(self.get_initials(user_name), size=25), 23 | color=ft.Colors.WHITE, 24 | bgcolor=self.get_avatar_color(user_name), 25 | min_radius=30, 26 | ) 27 | self.msg_list = ft.Column( 28 | controls=[ 29 | ft.Container( 30 | height=5, 31 | width=1, 32 | ) 33 | ], 34 | alignment=ft.MainAxisAlignment.START, 35 | horizontal_alignment=self.position2, 36 | auto_scroll=True, 37 | ) 38 | self.cont = ft.Row( 39 | data=self.user_name, 40 | # controls=[self.msg_list, self.tou], 41 | alignment=self.position1, 42 | vertical_alignment=ft.CrossAxisAlignment.START, 43 | expand=True, 44 | ) 45 | if positon == "left": 46 | self.cont.controls = [self.tou, self.msg_list] 47 | print("left") 48 | else: 49 | self.cont.controls = [self.msg_list, self.tou] 50 | print("right") 51 | def get_initials(self, user_name: str): 52 | if user_name: 53 | return user_name[:1].capitalize() 54 | else: 55 | return "Unknown" # or any default value you prefer 56 | def get_avatar_color(self, user_name: str): 57 | colors_lookup = [ 58 | ft.Colors.AMBER, 59 | ft.Colors.BLUE, 60 | ft.Colors.BROWN, 61 | ft.Colors.CYAN, 62 | ft.Colors.GREEN, 63 | ft.Colors.INDIGO, 64 | ft.Colors.LIME, 65 | ft.Colors.ORANGE, 66 | ft.Colors.PINK, 67 | ft.Colors.PURPLE, 68 | ft.Colors.RED, 69 | ft.Colors.TEAL, 70 | ft.Colors.YELLOW, 71 | ] 72 | return colors_lookup[hash(user_name) % len(colors_lookup)] 73 | 74 | 75 | def get_msg_box(msg: str): 76 | return ft.Container( 77 | content=ft.Text(msg, size=20, text_align=ft.TextAlign.CENTER), 78 | padding=5, 79 | border_radius=10, 80 | bgcolor=ft.colors.BLUE_900, 81 | ) 
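A minimal usage sketch (hypothetical, not code from the repository; the real wiring presumably happens in `client_utils.py`, which is not shown in this excerpt): a chat bubble built from these helpers can be attached to the shared `chat_list` roughly as follows.

```python
# Hypothetical sketch, assuming it runs next to ui.py (client_gui.py adds
# client-gui/src to sys.path, so `import ui` resolves there as well).
import ui


def add_message(user_name, text, side="left"):
    msg = ui.ChatMessage(user_name, text, side)          # side is "left" or "right"
    msg.msg_list.controls.append(ui.get_msg_box(text))   # text bubble inside the message column
    ui.chat_list.controls.append(msg.cont)               # avatar + column row into the list view
    # the Flet page that owns chat_list still needs page.update() to repaint
```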
-------------------------------------------------------------------------------- /client-gui/src/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import warnings 16 | 17 | import librosa 18 | import numpy as np 19 | import parselmouth 20 | 21 | warnings.filterwarnings("ignore") 22 | 23 | 24 | def get_energy(chunk, sr, from_harmonic=1, to_harmonic=5): 25 | sound = parselmouth.Sound(chunk, sampling_frequency=sr) 26 | # pitch 27 | pitch = sound.to_pitch(pitch_floor=100, pitch_ceiling=350) 28 | # pitch energy 29 | # energy = np.mean(pitch.selected_array["strength"]) 30 | pitch = np.mean(pitch.selected_array["frequency"]) 31 | # frame log energy 32 | # energy = np.mean(sound.to_mfcc().to_array(), axis=1)[0] 33 | 34 | # energy form x-th harmonic to y-th harmonic 35 | freqs = librosa.fft_frequencies(sr=sr) 36 | freq_band_idx = np.where((freqs >= from_harmonic * pitch) & (freqs <= to_harmonic * pitch))[0] 37 | energy = np.sum(np.abs(librosa.stft(chunk)[freq_band_idx, :])) 38 | 39 | return energy 40 | -------------------------------------------------------------------------------- /client-requirements.txt: -------------------------------------------------------------------------------- 1 | sounddevice 2 | soundfile 3 | pysilero 4 | numpy 5 | scipy 6 | pygame 7 | requests 8 | flet[all]==0.27.6 -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | Core: 2 | sv: 3 | is_up: true 4 | 5 | master_audio: test.wav # 包含你声音的wav音频文件,建议3s-5s左右。 6 | thr: # 阈值,越小越敏感,建议0.5-0.8之间,实测好像不是很有用? 
7 | # wakeword: # 唤醒词相关,暂时不支持 8 | # is_up: false 9 | # word: 爱丽丝 #唤醒词 10 | # sleep_time: 1 #休眠时间,休眠后需要用带有唤醒词的语句唤醒,单位分钟,0表示永远需要唤醒词 11 | LLM: 12 | api: 13 | key: 14 | model: 15 | extra_config: # 大模型API额外参数,如:temperature: 0.7,温度参数 16 | temperature: 0.7 17 | GSV: 18 | text_lang: zh 19 | GPT_weight: 20 | SoVITS_weight: 21 | ref_audio_path: 22 | prompt_text: 23 | prompt_lang: zh 24 | aux_ref_audio_paths: # 多参考音频 v2模型有效 25 | - 26 | seed: -1 27 | top_k: 15 28 | batch_size: 1 29 | extra_ref_audio: # 使用情绪标签选择参考音频,例如 [普通]"你好呀。" 30 | # 实例 31 | 普通: 32 | - 参考音频路径 33 | - 参考音频文本 -------------------------------------------------------------------------------- /extra-req.txt: -------------------------------------------------------------------------------- 1 | faster-whisper 2 | -------------------------------------------------------------------------------- /extra-req2.txt: -------------------------------------------------------------------------------- 1 | addict 2 | datasets==2.18.0 3 | simplejson 4 | sortedcontainers 5 | modelscope==1.24.1 6 | funasr==1.2.6 7 | numpy==1.25 8 | pypinyin -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy<2.0 2 | scipy 3 | tensorboard 4 | librosa==0.10.2 5 | numba 6 | pytorch-lightning>=2.4 7 | gradio<5 8 | ffmpeg-python 9 | onnxruntime; sys_platform == 'darwin' 10 | onnxruntime-gpu; sys_platform != 'darwin' 11 | tqdm 12 | cn2an 13 | pypinyin 14 | pyopenjtalk>=0.4.1 15 | g2p_en 16 | torchaudio 17 | sentencepiece 18 | transformers>=4.43 19 | peft 20 | chardet 21 | PyYAML 22 | psutil 23 | jieba_fast 24 | jieba 25 | split-lang 26 | fast_langdetect>=0.3.1 27 | wordsegment 28 | rotary_embedding_torch 29 | ToJyutping 30 | g2pk2 31 | ko_pron 32 | opencc; sys_platform != 'linux' 33 | opencc==1.1.1; sys_platform == 'linux' 34 | python_mecab_ko; sys_platform != 'win32' 35 | fastapi[standard]>=0.115.2 36 | x_transformers 37 | torchmetrics<=1.5 38 | pydantic<=2.10.6 39 | ctranslate2>=4.0,<5 40 | huggingface_hub>=0.13 41 | tokenizers>=0.13,<1 42 | av>=11 43 | tqdm 44 | -------------------------------------------------------------------------------- /screen/img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/screen/img.png -------------------------------------------------------------------------------- /tools/AP_BWE_main/24kto48k/readme.txt: -------------------------------------------------------------------------------- 1 | For the inference of the v3 model, if you find that the generated audio sounds somewhat muffled, you can try using this audio super-resolution model. 
2 | 对于v3模型的推理,如果你发现生成的音频比较闷,可以尝试这个音频超分模型。 3 | 4 | put g_24kto48k.zip and config.json in this folder 5 | 把g_24kto48k.zip and config.json下到这个文件夹 6 | 7 | download link 下载链接: 8 | https://drive.google.com/drive/folders/1IIYTf2zbJWzelu4IftKD6ooHloJ8mnZF?usp=share_link 9 | 10 | audio sr project page 音频超分项目主页: 11 | https://github.com/yxlu-0102/AP-BWE 12 | -------------------------------------------------------------------------------- /tools/AP_BWE_main/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Ye-Xin Lu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tools/AP_BWE_main/datasets1/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tools/AP_BWE_main/datasets1/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import torch 4 | import torchaudio 5 | import torch.utils.data 6 | import torchaudio.functional as aF 7 | 8 | 9 | def amp_pha_stft(audio, n_fft, hop_size, win_size, center=True): 10 | hann_window = torch.hann_window(win_size).to(audio.device) 11 | stft_spec = torch.stft( 12 | audio, 13 | n_fft, 14 | hop_length=hop_size, 15 | win_length=win_size, 16 | window=hann_window, 17 | center=center, 18 | pad_mode="reflect", 19 | normalized=False, 20 | return_complex=True, 21 | ) 22 | log_amp = torch.log(torch.abs(stft_spec) + 1e-4) 23 | pha = torch.angle(stft_spec) 24 | 25 | com = torch.stack((torch.exp(log_amp) * torch.cos(pha), torch.exp(log_amp) * torch.sin(pha)), dim=-1) 26 | 27 | return log_amp, pha, com 28 | 29 | 30 | def amp_pha_istft(log_amp, pha, n_fft, hop_size, win_size, center=True): 31 | amp = torch.exp(log_amp) 32 | com = torch.complex(amp * torch.cos(pha), amp * torch.sin(pha)) 33 | hann_window = torch.hann_window(win_size).to(com.device) 34 | audio = torch.istft(com, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window, center=center) 35 | 36 | return audio 37 | 38 | 39 | def get_dataset_filelist(a): 40 | with open(a.input_training_file, "r", encoding="utf-8") as fi: 41 | training_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0] 42 | 43 | with open(a.input_validation_file, "r", encoding="utf-8") as fi: 44 | validation_indexes = 
[x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0] 45 | 46 | return training_indexes, validation_indexes 47 | 48 | 49 | class Dataset(torch.utils.data.Dataset): 50 | def __init__( 51 | self, 52 | training_indexes, 53 | wavs_dir, 54 | segment_size, 55 | hr_sampling_rate, 56 | lr_sampling_rate, 57 | split=True, 58 | shuffle=True, 59 | n_cache_reuse=1, 60 | device=None, 61 | ): 62 | self.audio_indexes = training_indexes 63 | random.seed(1234) 64 | if shuffle: 65 | random.shuffle(self.audio_indexes) 66 | self.wavs_dir = wavs_dir 67 | self.segment_size = segment_size 68 | self.hr_sampling_rate = hr_sampling_rate 69 | self.lr_sampling_rate = lr_sampling_rate 70 | self.split = split 71 | self.cached_wav = None 72 | self.n_cache_reuse = n_cache_reuse 73 | self._cache_ref_count = 0 74 | self.device = device 75 | 76 | def __getitem__(self, index): 77 | filename = self.audio_indexes[index] 78 | if self._cache_ref_count == 0: 79 | audio, orig_sampling_rate = torchaudio.load(os.path.join(self.wavs_dir, filename + ".wav")) 80 | self.cached_wav = audio 81 | self._cache_ref_count = self.n_cache_reuse 82 | else: 83 | audio = self.cached_wav 84 | self._cache_ref_count -= 1 85 | 86 | if orig_sampling_rate == self.hr_sampling_rate: 87 | audio_hr = audio 88 | else: 89 | audio_hr = aF.resample(audio, orig_freq=orig_sampling_rate, new_freq=self.hr_sampling_rate) 90 | 91 | audio_lr = aF.resample(audio, orig_freq=orig_sampling_rate, new_freq=self.lr_sampling_rate) 92 | audio_lr = aF.resample(audio_lr, orig_freq=self.lr_sampling_rate, new_freq=self.hr_sampling_rate) 93 | audio_lr = audio_lr[:, : audio_hr.size(1)] 94 | 95 | if self.split: 96 | if audio_hr.size(1) >= self.segment_size: 97 | max_audio_start = audio_hr.size(1) - self.segment_size 98 | audio_start = random.randint(0, max_audio_start) 99 | audio_hr = audio_hr[:, audio_start : audio_start + self.segment_size] 100 | audio_lr = audio_lr[:, audio_start : audio_start + self.segment_size] 101 | else: 102 | audio_hr = torch.nn.functional.pad(audio_hr, (0, self.segment_size - audio_hr.size(1)), "constant") 103 | audio_lr = torch.nn.functional.pad(audio_lr, (0, self.segment_size - audio_lr.size(1)), "constant") 104 | 105 | return (audio_hr.squeeze(), audio_lr.squeeze()) 106 | 107 | def __len__(self): 108 | return len(self.audio_indexes) 109 | -------------------------------------------------------------------------------- /tools/AP_BWE_main/models/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/tools/__init__.py -------------------------------------------------------------------------------- /tools/asr/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def check_fw_local_models(): 5 | """ 6 | 启动时检查本地是否有 Faster Whisper 模型. 
7 | """ 8 | model_size_list = [ 9 | "tiny", 10 | "tiny.en", 11 | "base", 12 | "base.en", 13 | "small", 14 | "small.en", 15 | "medium", 16 | "medium.en", 17 | "large", 18 | "large-v1", 19 | "large-v2", 20 | "large-v3", 21 | ] 22 | for i, size in enumerate(model_size_list): 23 | if os.path.exists(f"tools/asr/models/faster-whisper-{size}"): 24 | model_size_list[i] = size + "-local" 25 | return model_size_list 26 | 27 | 28 | asr_dict = { 29 | "达摩 ASR (中文)": {"lang": ["zh", "yue"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]}, 30 | "Faster Whisper (多语种)": { 31 | "lang": ["auto", "zh", "en", "ja", "ko", "yue"], 32 | "size": check_fw_local_models(), 33 | "path": "fasterwhisper_asr.py", 34 | "precision": ["float32", "float16", "int8"], 35 | }, 36 | } 37 | -------------------------------------------------------------------------------- /tools/asr/models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /tools/audio_sr.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | import sys 3 | import os 4 | 5 | AP_BWE_main_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "AP_BWE_main") 6 | sys.path.append(AP_BWE_main_dir_path) 7 | import json 8 | import torch 9 | import torchaudio.functional as aF 10 | # from attrdict import AttrDict####will be bug in py3.10 11 | 12 | from datasets1.dataset import amp_pha_stft, amp_pha_istft 13 | from models.model import APNet_BWE_Model 14 | 15 | 16 | class AP_BWE: 17 | def __init__(self, device, DictToAttrRecursive, checkpoint_file=None): 18 | if checkpoint_file == None: 19 | checkpoint_file = "%s/24kto48k/g_24kto48k.zip" % (AP_BWE_main_dir_path) 20 | if os.path.exists(checkpoint_file) == False: 21 | raise FileNotFoundError 22 | config_file = os.path.join(os.path.split(checkpoint_file)[0], "config.json") 23 | with open(config_file) as f: 24 | data = f.read() 25 | json_config = json.loads(data) 26 | # h = AttrDict(json_config) 27 | h = DictToAttrRecursive(json_config) 28 | model = APNet_BWE_Model(h).to(device) 29 | state_dict = torch.load(checkpoint_file, map_location="cpu", weights_only=False) 30 | model.load_state_dict(state_dict["generator"]) 31 | model.eval() 32 | self.device = device 33 | self.model = model 34 | self.h = h 35 | 36 | def to(self, *arg, **kwargs): 37 | self.model.to(*arg, **kwargs) 38 | self.device = self.model.conv_pre_mag.weight.device 39 | return self 40 | 41 | def __call__(self, audio, orig_sampling_rate): 42 | with torch.no_grad(): 43 | # audio, orig_sampling_rate = torchaudio.load(inp_path) 44 | # audio = audio.to(self.device) 45 | audio = aF.resample(audio, orig_freq=orig_sampling_rate, new_freq=self.h.hr_sampling_rate) 46 | amp_nb, pha_nb, com_nb = amp_pha_stft(audio, self.h.n_fft, self.h.hop_size, self.h.win_size) 47 | amp_wb_g, pha_wb_g, com_wb_g = self.model(amp_nb, pha_nb) 48 | audio_hr_g = amp_pha_istft(amp_wb_g, pha_wb_g, self.h.n_fft, self.h.hop_size, self.h.win_size) 49 | # sf.write(opt_path, audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate, 'PCM_16') 50 | return audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate 51 | -------------------------------------------------------------------------------- /tools/cmd-denoise.py: -------------------------------------------------------------------------------- 1 | import 
os 2 | import argparse 3 | import traceback 4 | 5 | from modelscope.pipelines import pipeline 6 | from modelscope.utils.constant import Tasks 7 | from tqdm import tqdm 8 | 9 | path_denoise = "tools/denoise-model/speech_frcrn_ans_cirm_16k" 10 | path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" 11 | ans = pipeline(Tasks.acoustic_noise_suppression, model=path_denoise) 12 | 13 | 14 | def execute_denoise(input_folder, output_folder): 15 | os.makedirs(output_folder, exist_ok=True) 16 | # print(input_folder) 17 | # print(list(os.listdir(input_folder).sort())) 18 | for name in tqdm(os.listdir(input_folder)): 19 | try: 20 | ans("%s/%s" % (input_folder, name), output_path="%s/%s" % (output_folder, name)) 21 | except: 22 | traceback.print_exc() 23 | 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument( 28 | "-i", "--input_folder", type=str, required=True, help="Path to the folder containing WAV files." 29 | ) 30 | parser.add_argument("-o", "--output_folder", type=str, required=True, help="Output folder to store transcriptions.") 31 | parser.add_argument( 32 | "-p", "--precision", type=str, default="float16", choices=["float16", "float32"], help="fp16 or fp32" 33 | ) # 还没接入 34 | cmd = parser.parse_args() 35 | execute_denoise( 36 | input_folder=cmd.input_folder, 37 | output_folder=cmd.output_folder, 38 | ) 39 | -------------------------------------------------------------------------------- /tools/denoise-model/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /tools/i18n/i18n.py: -------------------------------------------------------------------------------- 1 | import json 2 | import locale 3 | import os 4 | 5 | I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale") 6 | 7 | 8 | def load_language_list(language): 9 | with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), "r", encoding="utf-8") as f: 10 | language_list = json.load(f) 11 | return language_list 12 | 13 | 14 | def scan_language_list(): 15 | language_list = [] 16 | for name in os.listdir(I18N_JSON_DIR): 17 | if name.endswith(".json"): 18 | language_list.append(name.split(".")[0]) 19 | return language_list 20 | 21 | 22 | class I18nAuto: 23 | def __init__(self, language=None): 24 | if language in ["Auto", None]: 25 | language = locale.getdefaultlocale()[0] 26 | # getlocale can't identify the system's language ((None, None)) 27 | if not os.path.exists(os.path.join(I18N_JSON_DIR, f"{language}.json")): 28 | language = "en_US" 29 | self.language = language 30 | self.language_map = load_language_list(language) 31 | 32 | def __call__(self, key): 33 | return self.language_map.get(key, key) 34 | 35 | def __repr__(self): 36 | return "Use Language: " + self.language 37 | 38 | 39 | if __name__ == "__main__": 40 | i18n = I18nAuto(language="en_US") 41 | print(i18n) 42 | -------------------------------------------------------------------------------- /tools/slice_audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import traceback 5 | from scipy.io import wavfile 6 | 7 | # parent_directory = os.path.dirname(os.path.abspath(__file__)) 8 | # sys.path.append(parent_directory) 9 | from tools.my_utils import load_audio 10 | from slicer2 import Slicer 11 | 12 | 13 | def slice(inp, 
opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, all_part): 14 | os.makedirs(opt_root, exist_ok=True) 15 | if os.path.isfile(inp): 16 | input = [inp] 17 | elif os.path.isdir(inp): 18 | input = [os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] 19 | else: 20 | return "输入路径存在但既不是文件也不是文件夹" 21 | slicer = Slicer( 22 | sr=32000, # 长音频采样率 23 | threshold=int(threshold), # 音量小于这个值视作静音的备选切割点 24 | min_length=int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 25 | min_interval=int(min_interval), # 最短切割间隔 26 | hop_size=int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) 27 | max_sil_kept=int(max_sil_kept), # 切完后静音最多留多长 28 | ) 29 | _max = float(_max) 30 | alpha = float(alpha) 31 | for inp_path in input[int(i_part) :: int(all_part)]: 32 | # print(inp_path) 33 | try: 34 | name = os.path.basename(inp_path) 35 | audio = load_audio(inp_path, 32000) 36 | # print(audio.shape) 37 | for chunk, start, end in slicer.slice(audio): # start和end是帧数 38 | tmp_max = np.abs(chunk).max() 39 | if tmp_max > 1: 40 | chunk /= tmp_max 41 | chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk 42 | wavfile.write( 43 | "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end), 44 | 32000, 45 | # chunk.astype(np.float32), 46 | (chunk * 32767).astype(np.int16), 47 | ) 48 | except: 49 | print(inp_path, "->fail->", traceback.format_exc()) 50 | return "执行完毕,请检查输出文件" 51 | 52 | 53 | print(slice(*sys.argv[1:])) 54 | -------------------------------------------------------------------------------- /tools/uvr5/bs_roformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/tools/uvr5/bs_roformer/__init__.py -------------------------------------------------------------------------------- /tools/uvr5/bs_roformer/attend.py: -------------------------------------------------------------------------------- 1 | from packaging import version 2 | import torch 3 | from torch import nn, einsum 4 | import torch.nn.functional as F 5 | 6 | 7 | def exists(val): 8 | return val is not None 9 | 10 | 11 | def default(v, d): 12 | return v if exists(v) else d 13 | 14 | 15 | class Attend(nn.Module): 16 | def __init__(self, dropout=0.0, flash=False, scale=None): 17 | super().__init__() 18 | self.scale = scale 19 | self.dropout = dropout 20 | self.attn_dropout = nn.Dropout(dropout) 21 | 22 | self.flash = flash 23 | assert not (flash and version.parse(torch.__version__) < version.parse("2.0.0")), ( 24 | "in order to use flash attention, you must be using pytorch 2.0 or above" 25 | ) 26 | 27 | def flash_attn(self, q, k, v): 28 | # _, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device 29 | 30 | if exists(self.scale): 31 | default_scale = q.shape[-1] ** -0.5 32 | q = q * (self.scale / default_scale) 33 | 34 | # pytorch 2.0 flash attn: q, k, v, mask, dropout, softmax_scale 35 | # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): 36 | return F.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout if self.training else 0.0) 37 | 38 | def forward(self, q, k, v): 39 | """ 40 | einstein notation 41 | b - batch 42 | h - heads 43 | n, i, j - sequence length (base sequence length, source, target) 44 | d - feature dimension 45 | """ 46 | 47 | # q_len, k_len, device = q.shape[-2], k.shape[-2], q.device 48 | 49 | scale = default(self.scale, q.shape[-1] ** -0.5) 50 | 51 | if self.flash: 52 | 
return self.flash_attn(q, k, v) 53 | 54 | # similarity 55 | 56 | sim = einsum("b h i d, b h j d -> b h i j", q, k) * scale 57 | 58 | # attention 59 | 60 | attn = sim.softmax(dim=-1) 61 | attn = self.attn_dropout(attn) 62 | 63 | # aggregate values 64 | 65 | out = einsum("b h i j, b h j d -> b h i d", attn, v) 66 | 67 | return out 68 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def 
forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_123812KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, 
activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_123821KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = 
nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | 5 | default_param = {} 6 | default_param["bins"] = 768 7 | default_param["unstable_bins"] = 9 # training only 8 | default_param["reduction_bins"] = 762 # training only 9 | default_param["sr"] = 44100 10 | default_param["pre_filter_start"] = 757 11 | default_param["pre_filter_stop"] = 768 12 | default_param["band"] = {} 13 | 14 | 15 | default_param["band"][1] = { 16 | "sr": 11025, 17 | "hl": 128, 18 | "n_fft": 960, 19 | "crop_start": 0, 20 | "crop_stop": 245, 21 | "lpf_start": 61, # inference only 22 | "res_type": "polyphase", 23 | } 24 | 25 | default_param["band"][2] = { 26 | "sr": 44100, 27 | "hl": 512, 28 | "n_fft": 1536, 29 | "crop_start": 24, 30 | "crop_stop": 547, 31 | "hpf_start": 81, # inference only 32 | "res_type": "sinc_best", 33 | } 34 | 35 | 36 | def int_keys(d): 37 | r = {} 38 | for k, v in d: 39 | if k.isdigit(): 40 | k = int(k) 41 | r[k] = v 42 | return r 43 | 44 | 45 | class ModelParameters(object): 46 | def __init__(self, config_path=""): 47 | if ".pth" == pathlib.Path(config_path).suffix: 48 | import zipfile 49 | 50 | with zipfile.ZipFile(config_path, "r") as zip: 51 | self.param = json.loads( 52 | zip.read("param.json"), object_pairs_hook=int_keys 53 | ) 54 | elif ".json" == pathlib.Path(config_path).suffix: 55 | with open(config_path, "r") as f: 56 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 57 | else: 58 | self.param = default_param 59 | 60 | for k in [ 61 | "mid_side", 62 | "mid_side_b", 63 | "mid_side_b2", 64 | "stereo_w", 65 | "stereo_n", 66 | "reverse", 67 | ]: 68 | if not k in self.param: 69 | self.param[k] = False 70 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 16000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 16000, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 32000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "kaiser_fast" 14 | } 15 | }, 16 | "sr": 32000, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } 
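Note (added for illustration, not part of the repository dump): the modelparams/*.json files above and below hold the band-split STFT settings (per-band sample rate, hop length, FFT size, crop and filter bins) that ModelParameters in tools/uvr5/lib/lib_v5/model_param_init.py loads; its int_keys() hook turns the JSON string band indices into integers, and flags such as "mid_side" or "reverse" default to False when absent. A minimal sketch, assuming tools/uvr5 is the working directory or on sys.path:

from lib.lib_v5.model_param_init import ModelParameters  # import path is an assumption

# Load one of the single-band parameter files listed in this directory.
mp = ModelParameters("lib/lib_v5/modelparams/1band_sr32000_hl512.json")

# Band keys come back as integers thanks to int_keys(): "band" -> {1: {...}}.
band = mp.param["band"][1]
print(mp.param["sr"], band["sr"], band["n_fft"], band["hl"])  # 32000 32000 2048 512

# Channel/flag options not present in the JSON are filled in as False.
print(mp.param["mid_side"], mp.param["reverse"])  # False False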
-------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 33075, 8 | "hl": 384, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 33075, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 
28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 20 | "crop_start": 14, 21 | "crop_stop": 322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 
141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 
138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 
34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 
23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/ensemble.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 424, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import torch 5 | from tqdm import tqdm 6 | 7 | 8 | def load_data(file_name: str = "./lib/name_params.json") -> dict: 9 | with open(file_name, "r") as f: 10 | data = json.load(f) 11 | 12 | return data 13 | 14 | 15 | def make_padding(width, cropsize, offset): 16 | left = offset 17 | roi_size = cropsize - left * 2 18 | if roi_size == 0: 19 | roi_size = cropsize 20 | right = roi_size - 
(width % roi_size) + left 21 | 22 | return left, right, roi_size 23 | 24 | 25 | def inference(X_spec, device, model, aggressiveness, data): 26 | """ 27 | data : dic configs 28 | """ 29 | 30 | def _execute( 31 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True 32 | ): 33 | model.eval() 34 | with torch.no_grad(): 35 | preds = [] 36 | 37 | iterations = [n_window] 38 | 39 | total_iterations = sum(iterations) 40 | for i in tqdm(range(n_window)): 41 | start = i * roi_size 42 | X_mag_window = X_mag_pad[ 43 | None, :, :, start : start + data["window_size"] 44 | ] 45 | X_mag_window = torch.from_numpy(X_mag_window) 46 | if is_half: 47 | X_mag_window = X_mag_window.half() 48 | X_mag_window = X_mag_window.to(device) 49 | 50 | pred = model.predict(X_mag_window, aggressiveness) 51 | 52 | pred = pred.detach().cpu().numpy() 53 | preds.append(pred[0]) 54 | 55 | pred = np.concatenate(preds, axis=2) 56 | return pred 57 | 58 | def preprocess(X_spec): 59 | X_mag = np.abs(X_spec) 60 | X_phase = np.angle(X_spec) 61 | 62 | return X_mag, X_phase 63 | 64 | X_mag, X_phase = preprocess(X_spec) 65 | 66 | coef = X_mag.max() 67 | X_mag_pre = X_mag / coef 68 | 69 | n_frame = X_mag_pre.shape[2] 70 | pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) 71 | n_window = int(np.ceil(n_frame / roi_size)) 72 | 73 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 74 | 75 | if list(model.state_dict().values())[0].dtype == torch.float16: 76 | is_half = True 77 | else: 78 | is_half = False 79 | pred = _execute( 80 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 81 | ) 82 | pred = pred[:, :, :n_frame] 83 | 84 | if data["tta"]: 85 | pad_l += roi_size // 2 86 | pad_r += roi_size // 2 87 | n_window += 1 88 | 89 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 90 | 91 | pred_tta = _execute( 92 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 93 | ) 94 | pred_tta = pred_tta[:, :, roi_size // 2 :] 95 | pred_tta = pred_tta[:, :, :n_frame] 96 | 97 | return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) 98 | else: 99 | return pred * coef, X_mag, np.exp(1.0j * X_phase) 100 | 101 | 102 | def _get_name_params(model_path, model_hash): 103 | data = load_data() 104 | flag = False 105 | ModelName = model_path 106 | for type in list(data): 107 | for model in list(data[type][0]): 108 | for i in range(len(data[type][0][model])): 109 | if str(data[type][0][model][i]["hash_name"]) == model_hash: 110 | flag = True 111 | elif str(data[type][0][model][i]["hash_name"]) in ModelName: 112 | flag = True 113 | 114 | if flag: 115 | model_params_auto = data[type][0][model][i]["model_params"] 116 | param_name_auto = data[type][0][model][i]["param_name"] 117 | if type == "equivalent": 118 | return param_name_auto, model_params_auto 119 | else: 120 | flag = False 121 | return param_name_auto, model_params_auto 122 | -------------------------------------------------------------------------------- /tools/uvr5/uvr5_weights/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /utilss/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import sv -------------------------------------------------------------------------------- /utilss/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/utilss/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /utilss/__pycache__/sv.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlfreScarlet/MoeChat/3deed6ffa08d6268cac7d2b5aadf43d675d2fee2/utilss/__pycache__/sv.cpython-39.pyc -------------------------------------------------------------------------------- /utilss/sv.py: -------------------------------------------------------------------------------- 1 | from modelscope.pipelines import pipeline 2 | import soundfile as sf 3 | import numpy as np 4 | from scipy.signal import resample 5 | import io 6 | 7 | class SV: 8 | def __init__(self, config: dict): 9 | self.thr = "" 10 | with open(config["master_audio"], "rb") as f: 11 | audio_bytes = f.read() 12 | self.master_audio = self.resample_wav_bytes(audio_bytes) 13 | if "thr" in config: 14 | if config["thr"]: 15 | self.thr = str(config["thr"]) 16 | self.sv_pipeline = pipeline( 17 | task='speaker-verification', 18 | model='iic/speech_res2net_sv_zh-cn_3dspeaker_16k', 19 | model_revision='master' 20 | ) 21 | def resample_wav_bytes(self, wav_bytes, target_sr=16000): 22 | # 使用BytesIO将字节转为文件类对象 23 | with io.BytesIO(wav_bytes) as wav_file: 24 | # 读取音频数据 25 | data, original_sr = sf.read(wav_file, dtype='float32') 26 | 27 | # 立体声转单声道(取均值) 28 | if len(data.shape) > 1: 29 | data = np.mean(data, axis=1) 30 | 31 | # 计算重采样比例 32 | resample_ratio = target_sr / original_sr 33 | 34 | # 使用scipy的signal.resample进行重采样 35 | target_samples = int(len(data) * resample_ratio) 36 | resampled_data = resample(data, target_samples) 37 | 38 | # 归一化并转为16bit PCM格式 39 | resampled_data = np.clip(resampled_data, -1.0, 1.0) 40 | resampled_data = (resampled_data * 32767).astype(np.int16) 41 | 42 | return resampled_data 43 | def check_speaker(self, speaker_audio: bytes) -> bool: 44 | # with open("ttmp.wav", "wb") as f: 45 | # f.write(speaker_audio) 46 | with io.BytesIO(speaker_audio) as f: 47 | speaker_audio_1, _ = sf.read(f) 48 | res = {} 49 | if self.thr: 50 | res = self.sv_pipeline([speaker_audio_1, self.master_audio], self.thr) 51 | else: 52 | res = self.sv_pipeline([speaker_audio_1, self.master_audio]) 53 | print(f"[声纹识别结果]结果相似度{res['score']}, 目标相似度{self.thr}") 54 | if res["text"] == "yes": 55 | return True 56 | else: 57 | return False --------------------------------------------------------------------------------
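Note (added for illustration, not part of the repository dump): a minimal usage sketch for the SV speaker-verification helper defined in utilss/sv.py above. File names and the threshold are placeholders; the iic/speech_res2net_sv_zh-cn_3dspeaker_16k model is fetched by modelscope on first use, and check_speaker() expects raw WAV bytes.

from utilss.sv import SV

config = {
    "master_audio": "master.wav",  # reference recording of the target speaker (placeholder path)
    "thr": 0.35,                   # optional similarity threshold forwarded to the pipeline (assumed value)
}
sv = SV(config)  # reads the reference audio and builds the speaker-verification pipeline

with open("incoming.wav", "rb") as f:  # placeholder path to the audio being checked
    accepted = sv.check_speaker(f.read())  # True when the pipeline answers "yes"
print("speaker verified:", accepted)

The reference audio is converted to 16 kHz mono internally by resample_wav_bytes(), so any sample rate or channel layout can be supplied for master_audio.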