├── oldVersion
│   ├── V111
│   │   ├── text
│   │   │   ├── fix
│   │   │   │   ├── __init__.py
│   │   │   │   └── japanese_bert.py
│   │   │   ├── english_bert_mock.py
│   │   │   ├── japanese_bert.py
│   │   │   ├── cleaner.py
│   │   │   ├── __init__.py
│   │   │   ├── chinese_bert.py
│   │   │   ├── symbols.py
│   │   │   ├── english.py
│   │   │   ├── opencpop-strict.txt
│   │   │   └── chinese.py
│   │   └── __init__.py
│   ├── __init__.py
│   ├── V101
│   │   ├── text
│   │   │   ├── english_bert_mock.py
│   │   │   ├── cleaner.py
│   │   │   ├── __init__.py
│   │   │   ├── chinese_bert.py
│   │   │   ├── symbols.py
│   │   │   ├── japanese.py
│   │   │   ├── english.py
│   │   │   └── opencpop-strict.txt
│   │   └── __init__.py
│   └── V110
│       ├── text
│       │   ├── english_bert_mock.py
│       │   ├── cleaner.py
│       │   ├── __init__.py
│       │   ├── japanese_bert.py
│       │   ├── chinese_bert.py
│       │   ├── symbols.py
│       │   ├── english.py
│       │   ├── opencpop-strict.txt
│       │   └── chinese.py
│       └── __init__.py
├── tools
│   ├── __init__.py
│   ├── log.py
│   ├── translate.py
│   ├── classify_language.py
│   └── sentence.py
├── emotional
│   └── wav2vec2-large-robust-12-ft-emotion-msp-dim
│       ├── vocab.json
│       ├── preprocessor_config.json
│       ├── config.json
│       └── README.md
├── Web
│   ├── img
│   │   ├── Hiyori.ico
│   │   ├── helps1.png
│   │   └── helps2.png
│   ├── index.html
│   └── assets
│       └── index-49e71a58.css
├── text
│   ├── cmudict_cache.pickle
│   ├── bert_utils.py
│   ├── cleaner.py
│   ├── english_bert_mock.py
│   ├── __init__.py
│   ├── japanese_bert.py
│   ├── chinese_bert.py
│   ├── symbols.py
│   ├── opencpop-strict.txt
│   └── chinese.py
├── run_Mgpus.sh
├── Data
│   └── keqing
│       ├── models
│       │   └── eval
│       │       └── events.out.tfevents.1700630428.ly.20380.1
│       ├── config.yml
│       └── config.json
├── bert
│   ├── bert-base-japanese-v3
│   │   ├── tokenizer_config.json
│   │   ├── config.json
│   │   └── README.md
│   ├── bert-large-japanese-v2
│   │   ├── tokenizer_config.json
│   │   ├── config.json
│   │   └── README.md
│   └── bert_models.json
├── .gitignore
├── requirements.txt
├── monotonic_align
│   ├── __init__.py
│   └── core.py
├── audio_slicer.py
├── losses.py
├── config.yml
├── resample.py
├── re_matching.py
├── bert_gen.py
├── update_status.py
├── transcribe_genshin.py
├── README.md
├── configs
│   └── default_config.yml
├── mel_processing.py
├── emo_gen.py
├── preprocess_text.py
├── short_audio_transcribe.py
├── commons.py
└── server.py

/oldVersion/V111/text/fix/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility toolkit.
3 | """
4 | 
--------------------------------------------------------------------------------
/oldVersion/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Inference compatibility for older model versions.
3 | """
4 | 
--------------------------------------------------------------------------------
/emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/vocab.json:
--------------------------------------------------------------------------------
1 | {}
2 | 
--------------------------------------------------------------------------------
/Web/img/Hiyori.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/v3ucn/Bert-VITS2_V202_Train/HEAD/Web/img/Hiyori.ico
--------------------------------------------------------------------------------
/Web/img/helps1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/v3ucn/Bert-VITS2_V202_Train/HEAD/Web/img/helps1.png
--------------------------------------------------------------------------------
/Web/img/helps2.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/v3ucn/Bert-VITS2_V202_Train/HEAD/Web/img/helps2.png -------------------------------------------------------------------------------- /text/cmudict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/v3ucn/Bert-VITS2_V202_Train/HEAD/text/cmudict_cache.pickle -------------------------------------------------------------------------------- /oldVersion/V101/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_bert_feature(norm_text, word2ph): 5 | return torch.zeros(1024, sum(word2ph)) 6 | -------------------------------------------------------------------------------- /oldVersion/V110/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_bert_feature(norm_text, word2ph): 5 | return torch.zeros(1024, sum(word2ph)) 6 | -------------------------------------------------------------------------------- /oldVersion/V111/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_bert_feature(norm_text, word2ph): 5 | return torch.zeros(1024, sum(word2ph)) 6 | -------------------------------------------------------------------------------- /run_Mgpus.sh: -------------------------------------------------------------------------------- 1 | torchrun \ 2 | --nnodes=1:3\ 3 | --nproc_per_node=2\ 4 | --rdzv_id=1\ 5 | --rdzv_backend=c10d\ 6 | --rdzv_endpoint="ib1:8880"\ 7 | train_ms.py 8 | -------------------------------------------------------------------------------- /Data/keqing/models/eval/events.out.tfevents.1700630428.ly.20380.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/v3ucn/Bert-VITS2_V202_Train/HEAD/Data/keqing/models/eval/events.out.tfevents.1700630428.ly.20380.1 -------------------------------------------------------------------------------- /emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "do_normalize": true, 3 | "feature_extractor_type": "Wav2Vec2FeatureExtractor", 4 | "feature_size": 1, 5 | "padding_side": "right", 6 | "padding_value": 0.0, 7 | "return_attention_mask": true, 8 | "sampling_rate": 16000 9 | } 10 | -------------------------------------------------------------------------------- /bert/bert-base-japanese-v3/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "tokenizer_class": "BertJapaneseTokenizer", 3 | "model_max_length": 512, 4 | "do_lower_case": false, 5 | "word_tokenizer_type": "mecab", 6 | "subword_tokenizer_type": "wordpiece", 7 | "mecab_kwargs": { 8 | "mecab_dic": "unidic_lite" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /bert/bert-large-japanese-v2/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "tokenizer_class": "BertJapaneseTokenizer", 3 | "model_max_length": 512, 4 | "do_lower_case": false, 5 | "word_tokenizer_type": "mecab", 6 | "subword_tokenizer_type": "wordpiece", 7 | "mecab_kwargs": { 8 | "mecab_dic": "unidic_lite" 9 | } 10 | } 11 | 
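Both Japanese tokenizer configs above select BertJapaneseTokenizer with MeCab word segmentation (backed by unidic-lite) followed by WordPiece subwords. A minimal sketch of how such a config is consumed elsewhere in this repo via the transformers Auto classes, assuming the remaining model files have already been downloaded into the same ./bert/ directory:

from transformers import AutoTokenizer

# AutoTokenizer reads tokenizer_config.json and instantiates BertJapaneseTokenizer;
# fugashi and unidic-lite (both listed in requirements.txt) supply the MeCab backend.
tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")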
--------------------------------------------------------------------------------
/tools/log.py:
--------------------------------------------------------------------------------
1 | """
2 | Logger wrapper.
3 | """
4 | from loguru import logger
5 | import sys
6 | 
7 | 
8 | # Remove all of the default handlers
9 | logger.remove()
10 | 
11 | # Custom format, attached to standard output
12 | log_format = (
13 |     "{time:MM-DD HH:mm:ss} {level:<9}| {file}:{line} | {message}"
14 | )
15 | 
16 | logger.add(sys.stdout, format=log_format, backtrace=True, diagnose=True)
17 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | output/
6 | ckpt/
7 | pretrained_models/
8 | 
9 | # C extensions
10 | *.so
11 | 
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 | 
32 | 
--------------------------------------------------------------------------------
/bert/bert_models.json:
--------------------------------------------------------------------------------
1 | {
2 |   "deberta-v2-large-japanese": {
3 |     "repo_id": "ku-nlp/deberta-v2-large-japanese",
4 |     "files": ["pytorch_model.bin"]
5 |   },
6 |   "chinese-roberta-wwm-ext-large": {
7 |     "repo_id": "hfl/chinese-roberta-wwm-ext-large",
8 |     "files": ["pytorch_model.bin"]
9 |   },
10 |   "deberta-v3-large": {
11 |     "repo_id": "microsoft/deberta-v3-large",
12 |     "files": ["spm.model", "pytorch_model.bin"]
13 |   }
14 | }
15 | 
--------------------------------------------------------------------------------
/Web/index.html:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | Hiyori UI
8 | 
9 | 
10 | 
11 | 
12 | 
13 | 
14 | 
15 | 16 | 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | librosa==0.9.1 2 | matplotlib 3 | numpy 4 | numba 5 | phonemizer 6 | scipy 7 | tensorboard 8 | Unidecode 9 | amfm_decompy 10 | jieba 11 | transformers 12 | pypinyin 13 | cn2an 14 | gradio 15 | av 16 | mecab-python3 17 | loguru 18 | unidic-lite 19 | cmudict 20 | fugashi 21 | num2words 22 | PyYAML 23 | requests 24 | pyopenjtalk; sys_platform == 'linux' 25 | openjtalk; sys_platform != 'linux' 26 | jaconv 27 | psutil 28 | GPUtil 29 | vector_quantize_pytorch 30 | g2p_en 31 | sentencepiece 32 | pykakasi 33 | langid 34 | -------------------------------------------------------------------------------- /bert/bert-base-japanese-v3/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForPreTraining" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 32768 19 | } 20 | -------------------------------------------------------------------------------- /bert/bert-large-japanese-v2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForPreTraining" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 1024, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 4096, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 16, 15 | "num_hidden_layers": 24, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 32768 19 | } 20 | -------------------------------------------------------------------------------- /monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | from numpy import zeros, int32, float32 2 | from torch import from_numpy 3 | 4 | from .core import maximum_path_jit 5 | 6 | 7 | def maximum_path(neg_cent, mask): 8 | device = neg_cent.device 9 | dtype = neg_cent.dtype 10 | neg_cent = neg_cent.data.cpu().numpy().astype(float32) 11 | path = zeros(neg_cent.shape, dtype=int32) 12 | 13 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(int32) 14 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(int32) 15 | maximum_path_jit(path, neg_cent, t_t_max, t_s_max) 16 | return from_numpy(path).to(device=device, dtype=dtype) 17 | -------------------------------------------------------------------------------- /text/bert_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from huggingface_hub import hf_hub_download 4 | 5 | from config import config 6 | 7 | 8 | MIRROR: str = config.mirror 9 | 10 | 11 | def _check_bert(repo_id, files, local_path): 12 | for file in files: 13 | if not Path(local_path).joinpath(file).exists(): 14 | if MIRROR.lower() == "openi": 15 | import openi 16 | 17 | openi.model.download_model( 18 | "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert" 19 | ) 20 | else: 21 | 
hf_hub_download( 22 | repo_id, file, local_dir=local_path, local_dir_use_symlinks=False 23 | ) 24 | -------------------------------------------------------------------------------- /oldVersion/V101/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . import chinese, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /oldVersion/V110/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . import chinese, japanese, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese, "JP": japanese} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /text/cleaner.py: -------------------------------------------------------------------------------- 1 | from text import chinese, japanese, english, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | 
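Taken together, the three cleaner.py variants above show how language coverage grew across versions (ZH only in V101, ZH/JP in V110, ZH/JP/EN in the current text package) behind one stable interface. A minimal usage sketch of the current pipeline, assuming the required BERT models and G2P dictionaries are in place:

from text.cleaner import clean_text
from text import cleaned_text_to_sequence

# Normalize, then run grapheme-to-phoneme: phone/tone lists plus a word-to-phone count map.
norm_text, phones, tones, word2ph = clean_text("你好", "ZH")
# Convert symbols to integer IDs; per-language tone offsets and language IDs are applied internally.
phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "ZH")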
-------------------------------------------------------------------------------- /oldVersion/V101/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | 4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 5 | 6 | 7 | def cleaned_text_to_sequence(cleaned_text, tones, language): 8 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 9 | Args: 10 | text: string to convert to a sequence 11 | Returns: 12 | List of integers corresponding to the symbols in the text 13 | """ 14 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 15 | tone_start = language_tone_start_map[language] 16 | tones = [i + tone_start for i in tones] 17 | lang_id = language_id_map[language] 18 | lang_ids = [lang_id for i in phones] 19 | return phones, tones, lang_ids 20 | 21 | 22 | def get_bert(norm_text, word2ph, language): 23 | from .chinese_bert import get_bert_feature as zh_bert 24 | from .english_bert_mock import get_bert_feature as en_bert 25 | 26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert} 27 | bert = lang_bert_func_map[language](norm_text, word2ph) 28 | return bert 29 | -------------------------------------------------------------------------------- /oldVersion/V110/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | 4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 5 | 6 | 7 | def cleaned_text_to_sequence(cleaned_text, tones, language): 8 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 9 | Args: 10 | text: string to convert to a sequence 11 | Returns: 12 | List of integers corresponding to the symbols in the text 13 | """ 14 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 15 | tone_start = language_tone_start_map[language] 16 | tones = [i + tone_start for i in tones] 17 | lang_id = language_id_map[language] 18 | lang_ids = [lang_id for i in phones] 19 | return phones, tones, lang_ids 20 | 21 | 22 | def get_bert(norm_text, word2ph, language, device): 23 | from .chinese_bert import get_bert_feature as zh_bert 24 | from .english_bert_mock import get_bert_feature as en_bert 25 | from .japanese_bert import get_bert_feature as jp_bert 26 | 27 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 28 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 29 | return bert 30 | -------------------------------------------------------------------------------- /oldVersion/V110/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | 5 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3") 6 | 7 | 8 | def get_bert_feature(text, word2ph, device=None): 9 | if ( 10 | sys.platform == "darwin" 11 | and torch.backends.mps.is_available() 12 | and device == "cpu" 13 | ): 14 | device = "mps" 15 | if not device: 16 | device = "cuda" 17 | model = AutoModelForMaskedLM.from_pretrained("./bert/bert-base-japanese-v3").to( 18 | device 19 | ) 20 | with torch.no_grad(): 21 | inputs = tokenizer(text, return_tensors="pt") 22 | for i in inputs: 23 | inputs[i] = inputs[i].to(device) 24 | res = model(**inputs, output_hidden_states=True) 25 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 26 | assert inputs["input_ids"].shape[-1] == len(word2ph) 27 | 
word2phone = word2ph 28 | phone_level_feature = [] 29 | for i in range(len(word2phone)): 30 | repeat_feature = res[i].repeat(word2phone[i], 1) 31 | phone_level_feature.append(repeat_feature) 32 | 33 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 34 | 35 | return phone_level_feature.T 36 | -------------------------------------------------------------------------------- /oldVersion/V111/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | 5 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3") 6 | 7 | models = dict() 8 | 9 | 10 | def get_bert_feature(text, word2ph, device=None): 11 | if ( 12 | sys.platform == "darwin" 13 | and torch.backends.mps.is_available() 14 | and device == "cpu" 15 | ): 16 | device = "mps" 17 | if not device: 18 | device = "cuda" 19 | if device not in models.keys(): 20 | models[device] = AutoModelForMaskedLM.from_pretrained( 21 | "./bert/bert-base-japanese-v3" 22 | ).to(device) 23 | with torch.no_grad(): 24 | inputs = tokenizer(text, return_tensors="pt") 25 | for i in inputs: 26 | inputs[i] = inputs[i].to(device) 27 | res = models[device](**inputs, output_hidden_states=True) 28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 29 | assert inputs["input_ids"].shape[-1] == len(word2ph) 30 | word2phone = word2ph 31 | phone_level_feature = [] 32 | for i in range(len(word2phone)): 33 | repeat_feature = res[i].repeat(word2phone[i], 1) 34 | phone_level_feature.append(repeat_feature) 35 | 36 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 37 | 38 | return phone_level_feature.T 39 | -------------------------------------------------------------------------------- /oldVersion/V111/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . 
import chinese, japanese, cleaned_text_to_sequence 2 | from .fix import japanese as japanese_fix 3 | 4 | 5 | language_module_map = {"ZH": chinese, "JP": japanese} 6 | language_module_map_fix = {"ZH": chinese, "JP": japanese_fix} 7 | 8 | 9 | def clean_text(text, language): 10 | language_module = language_module_map[language] 11 | norm_text = language_module.text_normalize(text) 12 | phones, tones, word2ph = language_module.g2p(norm_text) 13 | return norm_text, phones, tones, word2ph 14 | 15 | 16 | def clean_text_fix(text, language): 17 | """使用dev分支修复""" 18 | language_module = language_module_map_fix[language] 19 | norm_text = language_module.text_normalize(text) 20 | phones, tones, word2ph = language_module.g2p(norm_text) 21 | return norm_text, phones, tones, word2ph 22 | 23 | 24 | def clean_text_bert(text, language): 25 | language_module = language_module_map[language] 26 | norm_text = language_module.text_normalize(text) 27 | phones, tones, word2ph = language_module.g2p(norm_text) 28 | bert = language_module.get_bert_feature(norm_text, word2ph) 29 | return phones, tones, bert 30 | 31 | 32 | def text_to_sequence(text, language): 33 | norm_text, phones, tones, word2ph = clean_text(text, language) 34 | return cleaned_text_to_sequence(phones, tones, language) 35 | 36 | 37 | if __name__ == "__main__": 38 | pass 39 | -------------------------------------------------------------------------------- /text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import DebertaV2Model, DebertaV2Tokenizer 5 | 6 | from config import config 7 | 8 | 9 | LOCAL_PATH = "./bert/deberta-v3-large" 10 | 11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 17 | if ( 18 | sys.platform == "darwin" 19 | and torch.backends.mps.is_available() 20 | and device == "cpu" 21 | ): 22 | device = "mps" 23 | if not device: 24 | device = "cuda" 25 | if device not in models.keys(): 26 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device) 27 | with torch.no_grad(): 28 | inputs = tokenizer(text, return_tensors="pt") 29 | for i in inputs: 30 | inputs[i] = inputs[i].to(device) 31 | res = models[device](**inputs, output_hidden_states=True) 32 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 33 | # assert len(word2ph) == len(text)+2 34 | word2phone = word2ph 35 | phone_level_feature = [] 36 | for i in range(len(word2phone)): 37 | repeat_feature = res[i].repeat(word2phone[i], 1) 38 | phone_level_feature.append(repeat_feature) 39 | 40 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 41 | 42 | return phone_level_feature.T 43 | -------------------------------------------------------------------------------- /monotonic_align/core.py: -------------------------------------------------------------------------------- 1 | import numba 2 | 3 | 4 | @numba.jit( 5 | numba.void( 6 | numba.int32[:, :, ::1], 7 | numba.float32[:, :, ::1], 8 | numba.int32[::1], 9 | numba.int32[::1], 10 | ), 11 | nopython=True, 12 | nogil=True, 13 | ) 14 | def maximum_path_jit(paths, values, t_ys, t_xs): 15 | b = paths.shape[0] 16 | max_neg_val = -1e9 17 | for i in range(int(b)): 18 | path = paths[i] 19 | value = values[i] 20 | t_y = t_ys[i] 21 | t_x = t_xs[i] 22 | 23 | v_prev = v_cur = 0.0 24 | index = t_x - 1 25 | 26 | for y in range(t_y): 27 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 
1)):
28 |                 if x == y:
29 |                     v_cur = max_neg_val
30 |                 else:
31 |                     v_cur = value[y - 1, x]
32 |                 if x == 0:
33 |                     if y == 0:
34 |                         v_prev = 0.0
35 |                     else:
36 |                         v_prev = max_neg_val
37 |                 else:
38 |                     v_prev = value[y - 1, x - 1]
39 |                 value[y, x] += max(v_prev, v_cur)
40 | 
41 |         for y in range(t_y - 1, -1, -1):
42 |             path[y, index] = 1
43 |             if index != 0 and (
44 |                 index == y or value[y - 1, index] < value[y - 1, index - 1]
45 |             ):
46 |                 index = index - 1
--------------------------------------------------------------------------------
/audio_slicer.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | import librosa  # Optional. Use any library you like to read audio files.
4 | import soundfile  # Optional. Use any library you like to write audio files.
5 | import yaml
6 | 
7 | from slicer2 import Slicer
8 | 
9 | with open("config.yml", mode="r", encoding="utf-8") as f:
10 |     configyml = yaml.load(f, Loader=yaml.FullLoader)
11 | 
12 | model_name = configyml["dataset_path"].replace("Data\\", "")
13 | 
14 | audio, sr = librosa.load(
15 |     f"./Data/{model_name}/raw/{model_name}/{model_name}.wav", sr=None, mono=False
16 | )  # Load an audio file with librosa.
17 | slicer = Slicer(
18 |     sr=sr,
19 |     threshold=-40,
20 |     min_length=2000,
21 |     min_interval=300,
22 |     hop_size=10,
23 |     max_sil_kept=500,
24 | )
25 | chunks = slicer.slice(audio)
26 | for i, chunk in enumerate(chunks):
27 |     if len(chunk.shape) > 1:
28 |         chunk = chunk.T  # Swap axes if the audio is stereo.
29 |     soundfile.write(
30 |         f"./Data/{model_name}/raw/{model_name}/{model_name}_{i}.wav", chunk, sr
31 |     )  # Save sliced audio files with soundfile.
32 | 
33 | # Remove the source wav once it has been sliced, if it still exists.
34 | if os.path.exists(f"./Data/{model_name}/raw/{model_name}/{model_name}.wav"):
35 |     os.remove(f"./Data/{model_name}/raw/{model_name}/{model_name}.wav")
--------------------------------------------------------------------------------
/oldVersion/V111/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 | 
3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4 | 
5 | 
6 | def cleaned_text_to_sequence(cleaned_text, tones, language):
7 |     """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8 | Args: 9 | text: string to convert to a sequence 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | """ 13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 14 | tone_start = language_tone_start_map[language] 15 | tones = [i + tone_start for i in tones] 16 | lang_id = language_id_map[language] 17 | lang_ids = [lang_id for i in phones] 18 | return phones, tones, lang_ids 19 | 20 | 21 | def get_bert(norm_text, word2ph, language, device): 22 | from .chinese_bert import get_bert_feature as zh_bert 23 | from .english_bert_mock import get_bert_feature as en_bert 24 | from .japanese_bert import get_bert_feature as jp_bert 25 | 26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 27 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 28 | return bert 29 | 30 | 31 | def get_bert_fix(norm_text, word2ph, language, device): 32 | from .chinese_bert import get_bert_feature as zh_bert 33 | from .english_bert_mock import get_bert_feature as en_bert 34 | from .fix.japanese_bert import get_bert_feature as jp_bert 35 | 36 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 37 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 38 | return bert 39 | -------------------------------------------------------------------------------- /losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def feature_loss(fmap_r, fmap_g): 5 | loss = 0 6 | for dr, dg in zip(fmap_r, fmap_g): 7 | for rl, gl in zip(dr, dg): 8 | rl = rl.float().detach() 9 | gl = gl.float() 10 | loss += torch.mean(torch.abs(rl - gl)) 11 | 12 | return loss * 2 13 | 14 | 15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 16 | loss = 0 17 | r_losses = [] 18 | g_losses = [] 19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 20 | dr = dr.float() 21 | dg = dg.float() 22 | r_loss = torch.mean((1 - dr) ** 2) 23 | g_loss = torch.mean(dg**2) 24 | loss += r_loss + g_loss 25 | r_losses.append(r_loss.item()) 26 | g_losses.append(g_loss.item()) 27 | 28 | return loss, r_losses, g_losses 29 | 30 | 31 | def generator_loss(disc_outputs): 32 | loss = 0 33 | gen_losses = [] 34 | for dg in disc_outputs: 35 | dg = dg.float() 36 | l = torch.mean((1 - dg) ** 2) 37 | gen_losses.append(l) 38 | loss += l 39 | 40 | return loss, gen_losses 41 | 42 | 43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 44 | """ 45 | z_p, logs_q: [b, h, t_t] 46 | m_p, logs_p: [b, h, t_t] 47 | """ 48 | z_p = z_p.float() 49 | logs_q = logs_q.float() 50 | m_p = m_p.float() 51 | logs_p = logs_p.float() 52 | z_mask = z_mask.float() 53 | 54 | kl = logs_p - logs_q - 0.5 55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 56 | kl = torch.sum(kl * z_mask) 57 | l = kl / torch.sum(z_mask) 58 | return l 59 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | from text.symbols import * 2 | 3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 4 | 5 | 6 | def cleaned_text_to_sequence(cleaned_text, tones, language): 7 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
8 | Args: 9 | text: string to convert to a sequence 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | """ 13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 14 | tone_start = language_tone_start_map[language] 15 | tones = [i + tone_start for i in tones] 16 | lang_id = language_id_map[language] 17 | lang_ids = [lang_id for i in phones] 18 | return phones, tones, lang_ids 19 | 20 | 21 | def get_bert(norm_text, word2ph, language, device): 22 | from .chinese_bert import get_bert_feature as zh_bert 23 | from .english_bert_mock import get_bert_feature as en_bert 24 | from .japanese_bert import get_bert_feature as jp_bert 25 | 26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 27 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 28 | return bert 29 | 30 | 31 | def check_bert_models(): 32 | import json 33 | from pathlib import Path 34 | 35 | from config import config 36 | from .bert_utils import _check_bert 37 | 38 | if config.mirror.lower() == "openi": 39 | import openi 40 | 41 | kwargs = {"token": config.openi_token} if config.openi_token else {} 42 | openi.login(**kwargs) 43 | 44 | with open("./bert/bert_models.json", "r") as fp: 45 | models = json.load(fp) 46 | for k, v in models.items(): 47 | local_path = Path("./bert").joinpath(k) 48 | _check_bert(v["repo_id"], v["files"], local_path) 49 | -------------------------------------------------------------------------------- /config.yml: -------------------------------------------------------------------------------- 1 | bert_gen: 2 | config_path: config.json 3 | device: cuda 4 | num_processes: 2 5 | use_multi_device: false 6 | dataset_path: Data\keqing 7 | mirror: '' 8 | openi_token: '' 9 | preprocess_text: 10 | clean: true 11 | cleaned_path: filelists/cleaned.list 12 | config_path: config.json 13 | max_val_total: 8 14 | train_path: filelists/train.list 15 | transcription_path: filelists/short_character_anno.list 16 | val_path: filelists/val.list 17 | val_per_spk: 5 18 | resample: 19 | in_dir: raw 20 | out_dir: raw 21 | sampling_rate: 44100 22 | server: 23 | device: cuda 24 | models: 25 | - config: ./Data/keqing/config.json 26 | device: cuda 27 | language: ZH 28 | model: ./Data/keqing/models/G_0.pth 29 | speakers: 30 | - length_scale: 1 31 | noise_scale: 0.6 32 | noise_scale_w: 0.8 33 | sdp_ratio: 0.2 34 | speaker: "\u79D1\u6BD4" 35 | - length_scale: 0.5 36 | noise_scale: 0.7 37 | noise_scale_w: 0.8 38 | sdp_ratio: 0.3 39 | speaker: "\u4E94\u6761\u609F" 40 | - length_scale: 1.2 41 | noise_scale: 0.6 42 | noise_scale_w: 0.8 43 | sdp_ratio: 0.2 44 | speaker: "\u5B89\u500D\u664B\u4E09" 45 | - config: ./Data/keqing/config.json 46 | device: cuda 47 | language: JP 48 | model: ./Data/keqing/models/G_0.pth 49 | speakers: [] 50 | port: 7860 51 | train_ms: 52 | base: 53 | model_image: "Bert-VITS2中日英底模-fix" 54 | repo_id: Stardust_minus/Bert-VITS2 55 | use_base_model: false 56 | config_path: config.json 57 | env: 58 | MASTER_ADDR: localhost 59 | MASTER_PORT: 10086 60 | RANK: 0 61 | THE_ENV_VAR_YOU_NEED_TO_USE: '1234567' 62 | WORLD_SIZE: 1 63 | model: models 64 | translate: 65 | app_key: '' 66 | secret_key: '' 67 | webui: 68 | config_path: Data/keqing/config.json 69 | debug: false 70 | device: cuda 71 | language_identification_library: langid 72 | model: models/G_0.pth 73 | port: 7860 74 | share: false 75 | -------------------------------------------------------------------------------- /Data/keqing/config.yml: 
-------------------------------------------------------------------------------- 1 | bert_gen: 2 | config_path: config.json 3 | device: cuda 4 | num_processes: 2 5 | use_multi_device: false 6 | dataset_path: Data\keqing 7 | mirror: '' 8 | openi_token: '' 9 | preprocess_text: 10 | clean: true 11 | cleaned_path: filelists/cleaned.list 12 | config_path: config.json 13 | max_val_total: 8 14 | train_path: filelists/train.list 15 | transcription_path: filelists/short_character_anno.list 16 | val_path: filelists/val.list 17 | val_per_spk: 5 18 | resample: 19 | in_dir: raw 20 | out_dir: raw 21 | sampling_rate: 44100 22 | server: 23 | device: cuda 24 | models: 25 | - config: ./Data/TEST/config.json 26 | device: cuda 27 | language: ZH 28 | model: ./Data/TEST/models/G_100.pth 29 | speakers: 30 | - length_scale: 1 31 | noise_scale: 0.6 32 | noise_scale_w: 0.8 33 | sdp_ratio: 0.2 34 | speaker: "\u79D1\u6BD4" 35 | - length_scale: 0.5 36 | noise_scale: 0.7 37 | noise_scale_w: 0.8 38 | sdp_ratio: 0.3 39 | speaker: "\u4E94\u6761\u609F" 40 | - length_scale: 1.2 41 | noise_scale: 0.6 42 | noise_scale_w: 0.8 43 | sdp_ratio: 0.2 44 | speaker: "\u5B89\u500D\u664B\u4E09" 45 | - config: ./Data/test/config.json 46 | device: cuda 47 | language: JP 48 | model: ./Data/test/models/G_100.pth 49 | speakers: [] 50 | port: 7860 51 | train_ms: 52 | base: 53 | model_image: "Bert-VITS2中日英底模-fix" 54 | repo_id: Stardust_minus/Bert-VITS2 55 | use_base_model: false 56 | config_path: config.json 57 | env: 58 | MASTER_ADDR: localhost 59 | MASTER_PORT: 10086 60 | RANK: 0 61 | THE_ENV_VAR_YOU_NEED_TO_USE: '1234567' 62 | WORLD_SIZE: 1 63 | model: models 64 | translate: 65 | app_key: '' 66 | secret_key: '' 67 | webui: 68 | config_path: Data/TEST/config.json 69 | debug: false 70 | device: cuda 71 | language_identification_library: langid 72 | model: models/G_100.pth 73 | port: 7860 74 | share: false 75 | -------------------------------------------------------------------------------- /Data/keqing/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 50, 4 | "eval_interval": 50, 5 | "seed": 42, 6 | "epochs": 200, 7 | "learning_rate": 0.0001, 8 | "betas": [ 9 | 0.8, 10 | 0.99 11 | ], 12 | "eps": 1e-09, 13 | "batch_size": 8, 14 | "fp16_run": false, 15 | "lr_decay": 0.99995, 16 | "segment_size": 16384, 17 | "init_lr_ratio": 1, 18 | "warmup_epochs": 0, 19 | "c_mel": 45, 20 | "c_kl": 1.0, 21 | "skip_optimizer": false 22 | }, 23 | "data": { 24 | "training_files": "Data/keqing/filelists/train.list", 25 | "validation_files": "Data/keqing/filelists/val.list", 26 | "max_wav_value": 32768.0, 27 | "sampling_rate": 44100, 28 | "filter_length": 2048, 29 | "hop_length": 512, 30 | "win_length": 2048, 31 | "n_mel_channels": 128, 32 | "mel_fmin": 0.0, 33 | "mel_fmax": null, 34 | "add_blank": true, 35 | "n_speakers": 1, 36 | "cleaned_text": true, 37 | "spk2id": { 38 | "keqing": 0 39 | } 40 | }, 41 | "model": { 42 | "use_spk_conditioned_encoder": true, 43 | "use_noise_scaled_mas": true, 44 | "use_mel_posterior_encoder": false, 45 | "use_duration_discriminator": true, 46 | "inter_channels": 192, 47 | "hidden_channels": 192, 48 | "filter_channels": 768, 49 | "n_heads": 2, 50 | "n_layers": 6, 51 | "kernel_size": 3, 52 | "p_dropout": 0.1, 53 | "resblock": "1", 54 | "resblock_kernel_sizes": [ 55 | 3, 56 | 7, 57 | 11 58 | ], 59 | "resblock_dilation_sizes": [ 60 | [ 61 | 1, 62 | 3, 63 | 5 64 | ], 65 | [ 66 | 1, 67 | 3, 68 | 5 69 | ], 70 | [ 71 | 1, 72 | 3, 73 | 5 74 | ] 75 | ], 76 | 
"upsample_rates": [ 77 | 8, 78 | 8, 79 | 2, 80 | 2, 81 | 2 82 | ], 83 | "upsample_initial_channel": 512, 84 | "upsample_kernel_sizes": [ 85 | 16, 86 | 16, 87 | 8, 88 | 2, 89 | 2 90 | ], 91 | "n_layers_q": 3, 92 | "use_spectral_norm": false, 93 | "gin_channels": 256 94 | }, 95 | "version": "2.0" 96 | } -------------------------------------------------------------------------------- /tools/translate.py: -------------------------------------------------------------------------------- 1 | """ 2 | 翻译api 3 | """ 4 | from config import config 5 | 6 | import random 7 | import hashlib 8 | import requests 9 | 10 | 11 | def translate(Sentence: str, to_Language: str = "jp", from_Language: str = ""): 12 | """ 13 | :param Sentence: 待翻译语句 14 | :param from_Language: 待翻译语句语言 15 | :param to_Language: 目标语言 16 | :return: 翻译后语句 出错时返回None 17 | 18 | 常见语言代码:中文 zh 英语 en 日语 jp 19 | """ 20 | appid = config.translate_config.app_key 21 | key = config.translate_config.secret_key 22 | if appid == "" or key == "": 23 | return "请开发者在config.yml中配置app_key与secret_key" 24 | url = "https://fanyi-api.baidu.com/api/trans/vip/translate" 25 | texts = Sentence.splitlines() 26 | outTexts = [] 27 | for t in texts: 28 | if t != "": 29 | # 签名计算 参考文档 https://api.fanyi.baidu.com/product/113 30 | salt = str(random.randint(1, 100000)) 31 | signString = appid + t + salt + key 32 | hs = hashlib.md5() 33 | hs.update(signString.encode("utf-8")) 34 | signString = hs.hexdigest() 35 | if from_Language == "": 36 | from_Language = "auto" 37 | headers = {"Content-Type": "application/x-www-form-urlencoded"} 38 | payload = { 39 | "q": t, 40 | "from": from_Language, 41 | "to": to_Language, 42 | "appid": appid, 43 | "salt": salt, 44 | "sign": signString, 45 | } 46 | # 发送请求 47 | try: 48 | response = requests.post( 49 | url=url, data=payload, headers=headers, timeout=3 50 | ) 51 | response = response.json() 52 | if "trans_result" in response.keys(): 53 | result = response["trans_result"][0] 54 | if "dst" in result.keys(): 55 | dst = result["dst"] 56 | outTexts.append(dst) 57 | except Exception: 58 | return Sentence 59 | else: 60 | outTexts.append(t) 61 | return "\n".join(outTexts) 62 | -------------------------------------------------------------------------------- /text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | from text.japanese import text2sep_kata 8 | 9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese" 10 | 11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 17 | sep_text, _, _ = text2sep_kata(text) 18 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text] 19 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens] 20 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3] 21 | return get_bert_feature_with_token(sep_ids, word2ph, device) 22 | 23 | 24 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device): 25 | if ( 26 | sys.platform == "darwin" 27 | and torch.backends.mps.is_available() 28 | and device == "cpu" 29 | ): 30 | device = "mps" 31 | if not device: 32 | device = "cuda" 33 | if device not in models.keys(): 34 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 35 | with torch.no_grad(): 36 | inputs = 
torch.tensor(tokens).to(device).unsqueeze(0) 37 | token_type_ids = torch.zeros_like(inputs).to(device) 38 | attention_mask = torch.ones_like(inputs).to(device) 39 | inputs = { 40 | "input_ids": inputs, 41 | "token_type_ids": token_type_ids, 42 | "attention_mask": attention_mask, 43 | } 44 | 45 | # for i in inputs: 46 | # inputs[i] = inputs[i].to(device) 47 | res = models[device](**inputs, output_hidden_states=True) 48 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 49 | assert inputs["input_ids"].shape[-1] == len(word2ph) 50 | word2phone = word2ph 51 | phone_level_feature = [] 52 | for i in range(len(word2phone)): 53 | repeat_feature = res[i].repeat(word2phone[i], 1) 54 | phone_level_feature.append(repeat_feature) 55 | 56 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 57 | 58 | return phone_level_feature.T 59 | -------------------------------------------------------------------------------- /oldVersion/V111/text/fix/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | from .japanese import text2sep_kata 5 | from config import config 6 | 7 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3") 8 | 9 | models = dict() 10 | 11 | 12 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 13 | sep_text, _ = text2sep_kata(text) 14 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text] 15 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens] 16 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3] 17 | return get_bert_feature_with_token(sep_ids, word2ph, device) 18 | 19 | 20 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device): 21 | if ( 22 | sys.platform == "darwin" 23 | and torch.backends.mps.is_available() 24 | and device == "cpu" 25 | ): 26 | device = "mps" 27 | if not device: 28 | device = "cuda" 29 | if device not in models.keys(): 30 | models[device] = AutoModelForMaskedLM.from_pretrained( 31 | "./bert/bert-base-japanese-v3" 32 | ).to(device) 33 | with torch.no_grad(): 34 | inputs = torch.tensor(tokens).to(device).unsqueeze(0) 35 | token_type_ids = torch.zeros_like(inputs).to(device) 36 | attention_mask = torch.ones_like(inputs).to(device) 37 | inputs = { 38 | "input_ids": inputs, 39 | "token_type_ids": token_type_ids, 40 | "attention_mask": attention_mask, 41 | } 42 | 43 | # for i in inputs: 44 | # inputs[i] = inputs[i].to(device) 45 | res = models[device](**inputs, output_hidden_states=True) 46 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 47 | assert inputs["input_ids"].shape[-1] == len(word2ph) 48 | word2phone = word2ph 49 | phone_level_feature = [] 50 | for i in range(len(word2phone)): 51 | repeat_feature = res[i].repeat(word2phone[i], 1) 52 | phone_level_feature.append(repeat_feature) 53 | 54 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 55 | 56 | return phone_level_feature.T 57 | -------------------------------------------------------------------------------- /resample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import librosa 4 | from multiprocessing import Pool, cpu_count 5 | 6 | import soundfile 7 | from tqdm import tqdm 8 | 9 | from config import config 10 | 11 | 12 | def process(item): 13 | spkdir, wav_name, args = item 14 | wav_path = os.path.join(args.in_dir, spkdir, wav_name) 15 | if 
os.path.exists(wav_path) and ".wav" in wav_path: 16 | wav, sr = librosa.load(wav_path, sr=args.sr) 17 | soundfile.write(os.path.join(args.out_dir, spkdir, wav_name), wav, sr) 18 | 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument( 23 | "--sr", 24 | type=int, 25 | default=config.resample_config.sampling_rate, 26 | help="sampling rate", 27 | ) 28 | parser.add_argument( 29 | "--in_dir", 30 | type=str, 31 | default=config.resample_config.in_dir, 32 | help="path to source dir", 33 | ) 34 | parser.add_argument( 35 | "--out_dir", 36 | type=str, 37 | default=config.resample_config.out_dir, 38 | help="path to target dir", 39 | ) 40 | parser.add_argument( 41 | "--processes", 42 | type=int, 43 | default=0, 44 | help="cpu_processes", 45 | ) 46 | args, _ = parser.parse_known_args() 47 | # autodl 无卡模式会识别出46个cpu 48 | if args.processes == 0: 49 | processes = cpu_count() - 2 if cpu_count() > 4 else 1 50 | else: 51 | processes = args.processes 52 | pool = Pool(processes=processes) 53 | 54 | tasks = [] 55 | 56 | for dirpath, _, filenames in os.walk(args.in_dir): 57 | # 子级目录 58 | spk_dir = os.path.relpath(dirpath, args.in_dir) 59 | spk_dir_out = os.path.join(args.out_dir, spk_dir) 60 | if not os.path.isdir(spk_dir_out): 61 | os.makedirs(spk_dir_out, exist_ok=True) 62 | for filename in filenames: 63 | if filename.endswith(".wav"): 64 | twople = (spk_dir, filename, args) 65 | tasks.append(twople) 66 | 67 | for _ in tqdm( 68 | pool.imap_unordered(process, tasks), 69 | ): 70 | pass 71 | 72 | pool.close() 73 | pool.join() 74 | 75 | print("音频重采样完毕!") 76 | -------------------------------------------------------------------------------- /oldVersion/V101/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 1.0.1 版本兼容 3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.0.1 4 | """ 5 | import torch 6 | import commons 7 | from .text.cleaner import clean_text 8 | from .text import cleaned_text_to_sequence 9 | from oldVersion.V111.text import get_bert 10 | 11 | 12 | def get_text(text, language_str, hps, device): 13 | norm_text, phone, tone, word2ph = clean_text(text, language_str) 14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 15 | 16 | if hps.data.add_blank: 17 | phone = commons.intersperse(phone, 0) 18 | tone = commons.intersperse(tone, 0) 19 | language = commons.intersperse(language, 0) 20 | for i in range(len(word2ph)): 21 | word2ph[i] = word2ph[i] * 2 22 | word2ph[0] += 1 23 | bert = get_bert(norm_text, word2ph, language_str, device) 24 | del word2ph 25 | 26 | assert bert.shape[-1] == len(phone) 27 | 28 | phone = torch.LongTensor(phone) 29 | tone = torch.LongTensor(tone) 30 | language = torch.LongTensor(language) 31 | 32 | return bert, phone, tone, language 33 | 34 | 35 | def infer( 36 | text, 37 | sdp_ratio, 38 | noise_scale, 39 | noise_scale_w, 40 | length_scale, 41 | sid, 42 | hps, 43 | net_g, 44 | device, 45 | ): 46 | bert, phones, tones, lang_ids = get_text(text, "ZH", hps, device) 47 | with torch.no_grad(): 48 | x_tst = phones.to(device).unsqueeze(0) 49 | tones = tones.to(device).unsqueeze(0) 50 | lang_ids = lang_ids.to(device).unsqueeze(0) 51 | bert = bert.to(device).unsqueeze(0) 52 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 53 | del phones 54 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) 55 | audio = ( 56 | net_g.infer( 57 | x_tst, 58 | x_tst_lengths, 59 | speakers, 60 | tones, 61 | lang_ids, 62 | bert, 63 | 
sdp_ratio=sdp_ratio, 64 | noise_scale=noise_scale, 65 | noise_scale_w=noise_scale_w, 66 | length_scale=length_scale, 67 | )[0][0, 0] 68 | .data.cpu() 69 | .float() 70 | .numpy() 71 | ) 72 | del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers 73 | if torch.cuda.is_available(): 74 | torch.cuda.empty_cache() 75 | return audio 76 | -------------------------------------------------------------------------------- /oldVersion/V110/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | from transformers import AutoTokenizer, AutoModelForMaskedLM 4 | 5 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large") 6 | 7 | 8 | def get_bert_feature(text, word2ph, device=None): 9 | if ( 10 | sys.platform == "darwin" 11 | and torch.backends.mps.is_available() 12 | and device == "cpu" 13 | ): 14 | device = "mps" 15 | if not device: 16 | device = "cuda" 17 | model = AutoModelForMaskedLM.from_pretrained( 18 | "./bert/chinese-roberta-wwm-ext-large" 19 | ).to(device) 20 | with torch.no_grad(): 21 | inputs = tokenizer(text, return_tensors="pt") 22 | for i in inputs: 23 | inputs[i] = inputs[i].to(device) 24 | res = model(**inputs, output_hidden_states=True) 25 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 26 | 27 | assert len(word2ph) == len(text) + 2 28 | word2phone = word2ph 29 | phone_level_feature = [] 30 | for i in range(len(word2phone)): 31 | repeat_feature = res[i].repeat(word2phone[i], 1) 32 | phone_level_feature.append(repeat_feature) 33 | 34 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 35 | 36 | return phone_level_feature.T 37 | 38 | 39 | if __name__ == "__main__": 40 | import torch 41 | 42 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 43 | word2phone = [ 44 | 1, 45 | 2, 46 | 1, 47 | 2, 48 | 2, 49 | 1, 50 | 2, 51 | 2, 52 | 1, 53 | 2, 54 | 2, 55 | 1, 56 | 2, 57 | 2, 58 | 2, 59 | 2, 60 | 2, 61 | 1, 62 | 1, 63 | 2, 64 | 2, 65 | 1, 66 | 2, 67 | 2, 68 | 2, 69 | 2, 70 | 1, 71 | 2, 72 | 2, 73 | 2, 74 | 2, 75 | 2, 76 | 1, 77 | 2, 78 | 2, 79 | 2, 80 | 2, 81 | 1, 82 | ] 83 | 84 | # 计算总帧数 85 | total_frames = sum(word2phone) 86 | print(word_level_feature.shape) 87 | print(word2phone) 88 | phone_level_feature = [] 89 | for i in range(len(word2phone)): 90 | print(word_level_feature[i].shape) 91 | 92 | # 对每个词重复word2phone[i]次 93 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 94 | phone_level_feature.append(repeat_feature) 95 | 96 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 97 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 98 | -------------------------------------------------------------------------------- /re_matching.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def extract_language_and_text_updated(speaker, dialogue): 5 | # 使用正则表达式匹配<语言>标签和其后的文本 6 | pattern_language_text = r"<(\S+?)>([^<]+)" 7 | matches = re.findall(pattern_language_text, dialogue, re.DOTALL) 8 | speaker = speaker[1:-1] 9 | # 清理文本:去除两边的空白字符 10 | matches_cleaned = [(lang.upper(), text.strip()) for lang, text in matches] 11 | matches_cleaned.append(speaker) 12 | return matches_cleaned 13 | 14 | 15 | def validate_text(input_text): 16 | # 验证说话人的正则表达式 17 | pattern_speaker = r"(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)" 18 | 19 | # 使用re.DOTALL标志使.匹配包括换行符在内的所有字符 20 | matches = re.findall(pattern_speaker, input_text, re.DOTALL) 21 | 22 | # 对每个匹配到的说话人内容进行进一步验证 23 | for _, dialogue in matches: 
24 | language_text_matches = extract_language_and_text_updated(_, dialogue) 25 | if not language_text_matches: 26 | return ( 27 | False, 28 | "Error: Invalid format detected in dialogue content. Please check your input.", 29 | ) 30 | 31 | # 如果输入的文本中没有找到任何匹配项 32 | if not matches: 33 | return ( 34 | False, 35 | "Error: No valid speaker format detected. Please check your input.", 36 | ) 37 | 38 | return True, "Input is valid." 39 | 40 | 41 | def text_matching(text: str) -> list: 42 | speaker_pattern = r"(\[\S+?\])(.+?)(?=\[\S+?\]|$)" 43 | matches = re.findall(speaker_pattern, text, re.DOTALL) 44 | result = [] 45 | for speaker, dialogue in matches: 46 | result.append(extract_language_and_text_updated(speaker, dialogue)) 47 | print(result) 48 | return result 49 | 50 | 51 | def cut_para(text): 52 | splitted_para = re.split("[\n]", text) # 按段分 53 | splitted_para = [ 54 | sentence.strip() for sentence in splitted_para if sentence.strip() 55 | ] # 删除空字符串 56 | return splitted_para 57 | 58 | 59 | def cut_sent(para): 60 | para = re.sub("([。!;?\?])([^”’])", r"\1\n\2", para) # 单字符断句符 61 | para = re.sub("(\.{6})([^”’])", r"\1\n\2", para) # 英文省略号 62 | para = re.sub("(\…{2})([^”’])", r"\1\n\2", para) # 中文省略号 63 | para = re.sub("([。!?\?][”’])([^,。!?\?])", r"\1\n\2", para) 64 | para = para.rstrip() # 段尾如果有多余的\n就去掉它 65 | return para.split("\n") 66 | 67 | 68 | if __name__ == "__main__": 69 | text = """ 70 | [说话人1] 71 | [说话人2]你好吗?元気ですか?こんにちは,世界。你好吗? 72 | [说话人3]谢谢。どういたしまして。 73 | """ 74 | text_matching(text) 75 | # 测试函数 76 | test_text = """ 77 | [说话人1]你好,こんにちは!こんにちは,世界。 78 | [说话人2]你好吗? 79 | """ 80 | text_matching(test_text) 81 | res = validate_text(test_text) 82 | print(res) 83 | -------------------------------------------------------------------------------- /oldVersion/V101/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | from transformers import AutoTokenizer, AutoModelForMaskedLM 4 | 5 | device = torch.device( 6 | "cuda" 7 | if torch.cuda.is_available() 8 | else ( 9 | "mps" 10 | if sys.platform == "darwin" and torch.backends.mps.is_available() 11 | else "cpu" 12 | ) 13 | ) 14 | 15 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large") 16 | model = AutoModelForMaskedLM.from_pretrained("./bert/chinese-roberta-wwm-ext-large").to( 17 | device 18 | ) 19 | 20 | 21 | def get_bert_feature(text, word2ph): 22 | with torch.no_grad(): 23 | inputs = tokenizer(text, return_tensors="pt") 24 | for i in inputs: 25 | inputs[i] = inputs[i].to(device) 26 | res = model(**inputs, output_hidden_states=True) 27 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 28 | 29 | assert len(word2ph) == len(text) + 2 30 | word2phone = word2ph 31 | phone_level_feature = [] 32 | for i in range(len(word2phone)): 33 | repeat_feature = res[i].repeat(word2phone[i], 1) 34 | phone_level_feature.append(repeat_feature) 35 | 36 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 37 | 38 | return phone_level_feature.T 39 | 40 | 41 | if __name__ == "__main__": 42 | # feature = get_bert_feature('你好,我是说的道理。') 43 | import torch 44 | 45 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 46 | word2phone = [ 47 | 1, 48 | 2, 49 | 1, 50 | 2, 51 | 2, 52 | 1, 53 | 2, 54 | 2, 55 | 1, 56 | 2, 57 | 2, 58 | 1, 59 | 2, 60 | 2, 61 | 2, 62 | 2, 63 | 2, 64 | 1, 65 | 1, 66 | 2, 67 | 2, 68 | 1, 69 | 2, 70 | 2, 71 | 2, 72 | 2, 73 | 1, 74 | 2, 75 | 2, 76 | 2, 77 | 2, 78 | 2, 79 | 1, 80 | 2, 81 | 2, 82 | 2, 83 | 2, 84 | 1, 85 | ] 
86 | 87 | # 计算总帧数 88 | total_frames = sum(word2phone) 89 | print(word_level_feature.shape) 90 | print(word2phone) 91 | phone_level_feature = [] 92 | for i in range(len(word2phone)): 93 | print(word_level_feature[i].shape) 94 | 95 | # 对每个词重复word2phone[i]次 96 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 97 | phone_level_feature.append(repeat_feature) 98 | 99 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 100 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 101 | -------------------------------------------------------------------------------- /bert_gen.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from multiprocessing import Pool 3 | import commons 4 | import utils 5 | from tqdm import tqdm 6 | from text import check_bert_models, cleaned_text_to_sequence, get_bert 7 | import argparse 8 | import torch.multiprocessing as mp 9 | from config import config 10 | 11 | 12 | def process_line(line): 13 | device = config.bert_gen_config.device 14 | if config.bert_gen_config.use_multi_device: 15 | rank = mp.current_process()._identity 16 | rank = rank[0] if len(rank) > 0 else 0 17 | if torch.cuda.is_available(): 18 | gpu_id = rank % torch.cuda.device_count() 19 | device = torch.device(f"cuda:{gpu_id}") 20 | else: 21 | device = torch.device("cpu") 22 | wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|") 23 | phone = phones.split(" ") 24 | tone = [int(i) for i in tone.split(" ")] 25 | word2ph = [int(i) for i in word2ph.split(" ")] 26 | word2ph = [i for i in word2ph] 27 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 28 | 29 | phone = commons.intersperse(phone, 0) 30 | tone = commons.intersperse(tone, 0) 31 | language = commons.intersperse(language, 0) 32 | for i in range(len(word2ph)): 33 | word2ph[i] = word2ph[i] * 2 34 | word2ph[0] += 1 35 | 36 | bert_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".bert.pt") 37 | 38 | try: 39 | bert = torch.load(bert_path) 40 | assert bert.shape[-1] == len(phone) 41 | except Exception: 42 | bert = get_bert(text, word2ph, language_str, device) 43 | assert bert.shape[-1] == len(phone) 44 | torch.save(bert, bert_path) 45 | 46 | 47 | preprocess_text_config = config.preprocess_text_config 48 | 49 | if __name__ == "__main__": 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument( 52 | "-c", "--config", type=str, default=config.bert_gen_config.config_path 53 | ) 54 | parser.add_argument( 55 | "--num_processes", type=int, default=config.bert_gen_config.num_processes 56 | ) 57 | args, _ = parser.parse_known_args() 58 | config_path = args.config 59 | hps = utils.get_hparams_from_file(config_path) 60 | check_bert_models() 61 | lines = [] 62 | with open(hps.data.training_files, encoding="utf-8") as f: 63 | lines.extend(f.readlines()) 64 | 65 | with open(hps.data.validation_files, encoding="utf-8") as f: 66 | lines.extend(f.readlines()) 67 | if len(lines) != 0: 68 | num_processes = args.num_processes 69 | with Pool(processes=num_processes) as pool: 70 | for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)): 71 | pass 72 | 73 | print(f"bert生成完毕!, 共有{len(lines)}个bert.pt生成!") 74 | -------------------------------------------------------------------------------- /oldVersion/V111/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | from transformers import AutoTokenizer, AutoModelForMaskedLM 4 | 5 | 
tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large") 6 | 7 | models = dict() 8 | 9 | 10 | def get_bert_feature(text, word2ph, device=None): 11 | if ( 12 | sys.platform == "darwin" 13 | and torch.backends.mps.is_available() 14 | and device == "cpu" 15 | ): 16 | device = "mps" 17 | if not device: 18 | device = "cuda" 19 | if device not in models.keys(): 20 | models[device] = AutoModelForMaskedLM.from_pretrained( 21 | "./bert/chinese-roberta-wwm-ext-large" 22 | ).to(device) 23 | with torch.no_grad(): 24 | inputs = tokenizer(text, return_tensors="pt") 25 | for i in inputs: 26 | inputs[i] = inputs[i].to(device) 27 | res = models[device](**inputs, output_hidden_states=True) 28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 29 | 30 | assert len(word2ph) == len(text) + 2 31 | word2phone = word2ph 32 | phone_level_feature = [] 33 | for i in range(len(word2phone)): 34 | repeat_feature = res[i].repeat(word2phone[i], 1) 35 | phone_level_feature.append(repeat_feature) 36 | 37 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 38 | 39 | return phone_level_feature.T 40 | 41 | 42 | if __name__ == "__main__": 43 | import torch 44 | 45 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 46 | word2phone = [ 47 | 1, 48 | 2, 49 | 1, 50 | 2, 51 | 2, 52 | 1, 53 | 2, 54 | 2, 55 | 1, 56 | 2, 57 | 2, 58 | 1, 59 | 2, 60 | 2, 61 | 2, 62 | 2, 63 | 2, 64 | 1, 65 | 1, 66 | 2, 67 | 2, 68 | 1, 69 | 2, 70 | 2, 71 | 2, 72 | 2, 73 | 1, 74 | 2, 75 | 2, 76 | 2, 77 | 2, 78 | 2, 79 | 1, 80 | 2, 81 | 2, 82 | 2, 83 | 2, 84 | 1, 85 | ] 86 | 87 | # 计算总帧数 88 | total_frames = sum(word2phone) 89 | print(word_level_feature.shape) 90 | print(word2phone) 91 | phone_level_feature = [] 92 | for i in range(len(word2phone)): 93 | print(word_level_feature[i].shape) 94 | 95 | # 对每个词重复word2phone[i]次 96 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 97 | phone_level_feature.append(repeat_feature) 98 | 99 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 100 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 101 | -------------------------------------------------------------------------------- /text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | 8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large" 9 | 10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 11 | 12 | models = dict() 13 | 14 | 15 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 16 | if ( 17 | sys.platform == "darwin" 18 | and torch.backends.mps.is_available() 19 | and device == "cpu" 20 | ): 21 | device = "mps" 22 | if not device: 23 | device = "cuda" 24 | if device not in models.keys(): 25 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 26 | with torch.no_grad(): 27 | inputs = tokenizer(text, return_tensors="pt") 28 | for i in inputs: 29 | inputs[i] = inputs[i].to(device) 30 | res = models[device](**inputs, output_hidden_states=True) 31 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 32 | 33 | assert len(word2ph) == len(text) + 2 34 | word2phone = word2ph 35 | phone_level_feature = [] 36 | for i in range(len(word2phone)): 37 | repeat_feature = res[i].repeat(word2phone[i], 1) 38 | phone_level_feature.append(repeat_feature) 39 | 40 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 41 | 42 | return 
phone_level_feature.T 43 | 44 | 45 | if __name__ == "__main__": 46 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 47 | word2phone = [ 48 | 1, 49 | 2, 50 | 1, 51 | 2, 52 | 2, 53 | 1, 54 | 2, 55 | 2, 56 | 1, 57 | 2, 58 | 2, 59 | 1, 60 | 2, 61 | 2, 62 | 2, 63 | 2, 64 | 2, 65 | 1, 66 | 1, 67 | 2, 68 | 2, 69 | 1, 70 | 2, 71 | 2, 72 | 2, 73 | 2, 74 | 1, 75 | 2, 76 | 2, 77 | 2, 78 | 2, 79 | 2, 80 | 1, 81 | 2, 82 | 2, 83 | 2, 84 | 2, 85 | 1, 86 | ] 87 | 88 | # 计算总帧数 89 | total_frames = sum(word2phone) 90 | print(word_level_feature.shape) 91 | print(word2phone) 92 | phone_level_feature = [] 93 | for i in range(len(word2phone)): 94 | print(word_level_feature[i].shape) 95 | 96 | # 对每个词重复word2phone[i]次 97 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 98 | phone_level_feature.append(repeat_feature) 99 | 100 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 101 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 102 | -------------------------------------------------------------------------------- /bert/bert-base-japanese-v3/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | license: apache-2.0 3 | datasets: 4 | - cc100 5 | - wikipedia 6 | language: 7 | - ja 8 | widget: 9 | - text: 東北大学で[MASK]の研究をしています。 10 | --- 11 | 12 | # BERT base Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102) 13 | 14 | This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language. 15 | 16 | This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization. 17 | Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective. 18 | 19 | The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/). 20 | 21 | ## Model architecture 22 | 23 | The model architecture is the same as the original BERT base model; 12 layers, 768 dimensions of hidden states, and 12 attention heads. 24 | 25 | ## Training Data 26 | 27 | The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia. 28 | For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023. 29 | The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively. 30 | 31 | For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7). 32 | 33 | ## Tokenization 34 | 35 | The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm. 36 | The vocabulary size is 32768. 37 | 38 | We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization. 39 | 40 | ## Training 41 | 42 | We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps. 
43 | For training of the MLM (masked language modeling) objective, we introduced whole word masking, in which all of the subword tokens corresponding to a single word (as tokenized by MeCab) are masked at once.
44 | 
45 | For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/).
46 | 
47 | ## Licenses
48 | 
49 | The pretrained models are distributed under the Apache License 2.0.
50 | 
51 | ## Acknowledgments
52 | 
53 | This model was trained with Cloud TPUs provided by the [TPU Research Cloud](https://sites.research.google/trc/about/) program.
54 | 
--------------------------------------------------------------------------------
/bert/bert-large-japanese-v2/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | license: apache-2.0
3 | datasets:
4 | - cc100
5 | - wikipedia
6 | language:
7 | - ja
8 | widget:
9 | - text: 東北大学で[MASK]の研究をしています。
10 | ---
11 | 
12 | # BERT large Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102)
13 | 
14 | This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language.
15 | 
16 | This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in the [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by WordPiece subword tokenization.
17 | Additionally, the model is trained with whole word masking enabled for the masked language modeling (MLM) objective.
18 | 
19 | The code for the pretraining is available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/).
20 | 
21 | ## Model architecture
22 | 
23 | The model architecture is the same as the original BERT large model: 24 layers, 1024-dimensional hidden states, and 16 attention heads.
24 | 
25 | ## Training Data
26 | 
27 | The model is trained on the Japanese portion of the [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia.
28 | For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023.
29 | The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively.
30 | 
31 | To split the texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with the [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7).
32 | 
33 | ## Tokenization
34 | 
35 | The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm.
36 | The vocabulary size is 32768.
37 | 
38 | We used the [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization.
39 | 
40 | ## Training
41 | 
42 | We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps.
43 | For training of the MLM (masked language modeling) objective, we introduced whole word masking, in which all of the subword tokens corresponding to a single word (as tokenized by MeCab) are masked at once.
44 | 
45 | For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/).
46 | 
47 | ## Licenses
48 | 
49 | The pretrained models are distributed under the Apache License 2.0.
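## Usage

Below is a minimal fill-mask sketch for this checkpoint. It assumes the weights have been downloaded into `./bert/bert-large-japanese-v2` (the layout this repo expects) and that `transformers`, `fugashi`, and `unidic-lite` are installed for the MeCab-based tokenizer; the upstream Hugging Face ID is assumed to be `cl-tohoku/bert-large-japanese-v2`.

```python
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

MODEL_DIR = "./bert/bert-large-japanese-v2"  # local path used by this repo

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForMaskedLM.from_pretrained(MODEL_DIR)

text = "東北大学で[MASK]の研究をしています。"
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

# Last-layer hidden states: (1, seq_len, 1024) for this large model.
print(outputs.hidden_states[-1].shape)

# Highest-scoring token for the [MASK] position.
mask_idx = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
print(tokenizer.decode(outputs.logits[0, mask_idx].argmax(dim=-1)))
```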
50 | 
51 | ## Acknowledgments
52 | 
53 | This model was trained with Cloud TPUs provided by the [TPU Research Cloud](https://sites.research.google/trc/about/) program.
54 | 
--------------------------------------------------------------------------------
/oldVersion/V110/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Compatibility layer for v1.1 model inference
3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.1
4 | """
5 | import torch
6 | import commons
7 | from .text.cleaner import clean_text
8 | from .text import cleaned_text_to_sequence
9 | from oldVersion.V111.text import get_bert
10 | 
11 | 
12 | def get_text(text, language_str, hps,
device): 13 | norm_text, phone, tone, word2ph = clean_text(text, language_str) 14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 15 | 16 | if hps.data.add_blank: 17 | phone = commons.intersperse(phone, 0) 18 | tone = commons.intersperse(tone, 0) 19 | language = commons.intersperse(language, 0) 20 | for i in range(len(word2ph)): 21 | word2ph[i] = word2ph[i] * 2 22 | word2ph[0] += 1 23 | bert = get_bert(norm_text, word2ph, language_str, device) 24 | del word2ph 25 | assert bert.shape[-1] == len(phone), phone 26 | 27 | if language_str == "ZH": 28 | bert = bert 29 | ja_bert = torch.zeros(768, len(phone)) 30 | elif language_str == "JP": 31 | ja_bert = bert 32 | bert = torch.zeros(1024, len(phone)) 33 | else: 34 | bert = torch.zeros(1024, len(phone)) 35 | ja_bert = torch.zeros(768, len(phone)) 36 | 37 | assert bert.shape[-1] == len( 38 | phone 39 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}" 40 | 41 | phone = torch.LongTensor(phone) 42 | tone = torch.LongTensor(tone) 43 | language = torch.LongTensor(language) 44 | return bert, ja_bert, phone, tone, language 45 | 46 | 47 | def infer( 48 | text, 49 | sdp_ratio, 50 | noise_scale, 51 | noise_scale_w, 52 | length_scale, 53 | sid, 54 | language, 55 | hps, 56 | net_g, 57 | device, 58 | ): 59 | bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps, device) 60 | with torch.no_grad(): 61 | x_tst = phones.to(device).unsqueeze(0) 62 | tones = tones.to(device).unsqueeze(0) 63 | lang_ids = lang_ids.to(device).unsqueeze(0) 64 | bert = bert.to(device).unsqueeze(0) 65 | ja_bert = ja_bert.to(device).unsqueeze(0) 66 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 67 | del phones 68 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) 69 | audio = ( 70 | net_g.infer( 71 | x_tst, 72 | x_tst_lengths, 73 | speakers, 74 | tones, 75 | lang_ids, 76 | bert, 77 | ja_bert, 78 | sdp_ratio=sdp_ratio, 79 | noise_scale=noise_scale, 80 | noise_scale_w=noise_scale_w, 81 | length_scale=length_scale, 82 | )[0][0, 0] 83 | .data.cpu() 84 | .float() 85 | .numpy() 86 | ) 87 | del x_tst, x_tst_lengths, speakers, tones, lang_ids, bert, ja_bert 88 | if torch.cuda.is_available(): 89 | torch.cuda.empty_cache() 90 | return audio 91 | -------------------------------------------------------------------------------- /update_status.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gradio as gr 3 | 4 | lang_dict = {"EN(英文)": "_en", "ZH(中文)": "_zh", "JP(日语)": "_jp"} 5 | 6 | 7 | def raw_dir_convert_to_path(target_dir: str, lang): 8 | res = target_dir.rstrip("/").rstrip("\\") 9 | if (not target_dir.startswith("raw")) and (not target_dir.startswith("./raw")): 10 | res = os.path.join("./raw", res) 11 | if ( 12 | (not res.endswith("_zh")) 13 | and (not res.endswith("_jp")) 14 | and (not res.endswith("_en")) 15 | ): 16 | res += lang_dict[lang] 17 | return res 18 | 19 | 20 | def update_g_files(): 21 | g_files = [] 22 | cnt = 0 23 | for root, dirs, files in os.walk(os.path.abspath("./logs")): 24 | for file in files: 25 | if file.startswith("G_") and file.endswith(".pth"): 26 | g_files.append(os.path.join(root, file)) 27 | cnt += 1 28 | print(g_files) 29 | return f"更新模型列表完成, 共找到{cnt}个模型", gr.Dropdown.update(choices=g_files) 30 | 31 | 32 | def update_c_files(): 33 | c_files = [] 34 | cnt = 0 35 | for root, dirs, files in os.walk(os.path.abspath("./logs")): 36 | for file in files: 37 | if file.startswith("config.json"): 38 | c_files.append(os.path.join(root, file)) 39 
| cnt += 1 40 | print(c_files) 41 | return f"更新模型列表完成, 共找到{cnt}个配置文件", gr.Dropdown.update(choices=c_files) 42 | 43 | 44 | def update_model_folders(): 45 | subdirs = [] 46 | cnt = 0 47 | for root, dirs, files in os.walk(os.path.abspath("./logs")): 48 | for dir_name in dirs: 49 | if os.path.basename(dir_name) != "eval": 50 | subdirs.append(os.path.join(root, dir_name)) 51 | cnt += 1 52 | print(subdirs) 53 | return f"更新模型文件夹列表完成, 共找到{cnt}个文件夹", gr.Dropdown.update(choices=subdirs) 54 | 55 | 56 | def update_wav_lab_pairs(): 57 | wav_count = tot_count = 0 58 | for root, _, files in os.walk("./raw"): 59 | for file in files: 60 | # print(file) 61 | file_path = os.path.join(root, file) 62 | if file.lower().endswith(".wav"): 63 | lab_file = os.path.splitext(file_path)[0] + ".lab" 64 | if os.path.exists(lab_file): 65 | wav_count += 1 66 | tot_count += 1 67 | return f"{wav_count} / {tot_count}" 68 | 69 | 70 | def update_raw_folders(): 71 | subdirs = [] 72 | cnt = 0 73 | script_path = os.path.dirname(os.path.abspath(__file__)) # 获取当前脚本的绝对路径 74 | raw_path = os.path.join(script_path, "raw") 75 | print(raw_path) 76 | os.makedirs(raw_path, exist_ok=True) 77 | for root, dirs, files in os.walk(raw_path): 78 | for dir_name in dirs: 79 | relative_path = os.path.relpath( 80 | os.path.join(root, dir_name), script_path 81 | ) # 获取相对路径 82 | subdirs.append(relative_path) 83 | cnt += 1 84 | print(subdirs) 85 | return ( 86 | f"更新raw音频文件夹列表完成, 共找到{cnt}个文件夹", 87 | gr.Dropdown.update(choices=subdirs), 88 | gr.Textbox.update(value=update_wav_lab_pairs()), 89 | ) 90 | -------------------------------------------------------------------------------- /oldVersion/V101/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "I", 78 | "N", 79 | "U", 80 | "a", 81 | "b", 82 | "by", 83 | "ch", 84 | "cl", 85 | "d", 86 | "dy", 87 | "e", 88 | "f", 89 | "g", 90 | "gy", 91 | "h", 92 | "hy", 93 | "i", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "p", 103 | "py", 104 | "r", 105 | "ry", 106 | "s", 107 | "sh", 108 | "t", 109 | "ts", 110 | "u", 111 | "V", 112 | "w", 113 | "y", 114 | "z", 115 | ] 116 | num_ja_tones = 1 117 | 118 | # English 119 | en_symbols = [ 120 | "aa", 121 | "ae", 122 | "ah", 123 | "ao", 124 | "aw", 125 | "ay", 126 | "b", 127 | "ch", 128 | "d", 129 | "dh", 130 | "eh", 131 | "er", 132 | "ey", 133 | "f", 134 | "g", 135 | "hh", 136 | "ih", 137 | "iy", 138 | "jh", 139 | "k", 140 | "l", 141 | "m", 142 | "n", 143 | "ng", 144 | "ow", 145 | "oy", 146 | "p", 147 | "r", 148 | "s", 149 | "sh", 150 | "t", 151 | "th", 152 | "uh", 153 | 
"uw", 154 | "V", 155 | "w", 156 | "y", 157 | "z", 158 | "zh", 159 | ] 160 | num_en_tones = 4 161 | 162 | # combine all symbols 163 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 164 | symbols = [pad] + normal_symbols + pu_symbols 165 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 166 | 167 | # combine all tones 168 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 169 | 170 | # language maps 171 | language_id_map = {"ZH": 0, "JA": 1, "EN": 2} 172 | num_languages = len(language_id_map.keys()) 173 | 174 | language_tone_start_map = { 175 | "ZH": 0, 176 | "JA": num_zh_tones, 177 | "EN": num_zh_tones + num_ja_tones, 178 | } 179 | 180 | if __name__ == "__main__": 181 | a = set(zh_symbols) 182 | b = set(en_symbols) 183 | print(sorted(a & b)) 184 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | 
b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V110/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 1 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V111/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | 
"ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 1 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V101/text/japanese.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py 2 | import re 3 | import sys 4 | 5 | import pyopenjtalk 6 | 7 | from . 
import symbols 8 | 9 | # Regular expression matching Japanese without punctuation marks: 10 | _japanese_characters = re.compile( 11 | r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" 12 | ) 13 | 14 | # Regular expression matching non-Japanese characters or punctuation marks: 15 | _japanese_marks = re.compile( 16 | r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" 17 | ) 18 | 19 | # List of (symbol, Japanese) pairs for marks: 20 | _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]] 21 | 22 | 23 | # List of (consonant, sokuon) pairs: 24 | _real_sokuon = [ 25 | (re.compile("%s" % x[0]), x[1]) 26 | for x in [ 27 | (r"Q([↑↓]*[kg])", r"k#\1"), 28 | (r"Q([↑↓]*[tdjʧ])", r"t#\1"), 29 | (r"Q([↑↓]*[sʃ])", r"s\1"), 30 | (r"Q([↑↓]*[pb])", r"p#\1"), 31 | ] 32 | ] 33 | 34 | # List of (consonant, hatsuon) pairs: 35 | _real_hatsuon = [ 36 | (re.compile("%s" % x[0]), x[1]) 37 | for x in [ 38 | (r"N([↑↓]*[pbm])", r"m\1"), 39 | (r"N([↑↓]*[ʧʥj])", r"n^\1"), 40 | (r"N([↑↓]*[tdn])", r"n\1"), 41 | (r"N([↑↓]*[kg])", r"ŋ\1"), 42 | ] 43 | ] 44 | 45 | 46 | def post_replace_ph(ph): 47 | rep_map = { 48 | ":": ",", 49 | ";": ",", 50 | ",": ",", 51 | "。": ".", 52 | "!": "!", 53 | "?": "?", 54 | "\n": ".", 55 | "·": ",", 56 | "、": ",", 57 | "...": "…", 58 | "v": "V", 59 | } 60 | if ph in rep_map.keys(): 61 | ph = rep_map[ph] 62 | if ph in symbols: 63 | return ph 64 | if ph not in symbols: 65 | ph = "UNK" 66 | return ph 67 | 68 | 69 | def symbols_to_japanese(text): 70 | for regex, replacement in _symbols_to_japanese: 71 | text = re.sub(regex, replacement, text) 72 | return text 73 | 74 | 75 | def preprocess_jap(text): 76 | """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html""" 77 | text = symbols_to_japanese(text) 78 | sentences = re.split(_japanese_marks, text) 79 | marks = re.findall(_japanese_marks, text) 80 | text = [] 81 | for i, sentence in enumerate(sentences): 82 | if re.match(_japanese_characters, sentence): 83 | p = pyopenjtalk.g2p(sentence) 84 | text += p.split(" ") 85 | 86 | if i < len(marks): 87 | text += [marks[i].replace(" ", "")] 88 | return text 89 | 90 | 91 | def text_normalize(text): 92 | # todo: jap text normalize 93 | return text 94 | 95 | 96 | def g2p(norm_text): 97 | phones = preprocess_jap(norm_text) 98 | phones = [post_replace_ph(i) for i in phones] 99 | # todo: implement tones and word2ph 100 | tones = [0 for i in phones] 101 | word2ph = [1 for i in phones] 102 | return phones, tones, word2ph 103 | 104 | 105 | if __name__ == "__main__": 106 | for line in open("../../../Downloads/transcript_utf8.txt").readlines(): 107 | text = line.split(":")[1] 108 | phones, tones, word2ph = g2p(text) 109 | for p in phones: 110 | if p == "z": 111 | print(text, phones) 112 | sys.exit(0) 113 | -------------------------------------------------------------------------------- /tools/classify_language.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from config import config 4 | 5 | LANGUAGE_IDENTIFICATION_LIBRARY = config.webui_config.language_identification_library 6 | 7 | module = LANGUAGE_IDENTIFICATION_LIBRARY.lower() 8 | 9 | langid_languages = [ 10 | "af", 11 | "am", 12 | "an", 13 | "ar", 14 | "as", 15 | "az", 16 | "be", 17 | "bg", 18 | "bn", 19 | "br", 20 | "bs", 21 | "ca", 22 | "cs", 23 | "cy", 24 | "da", 25 | "de", 26 | "dz", 27 | "el", 28 | "en", 29 | "eo", 30 | "es", 31 | "et", 32 | "eu", 
33 | "fa", 34 | "fi", 35 | "fo", 36 | "fr", 37 | "ga", 38 | "gl", 39 | "gu", 40 | "he", 41 | "hi", 42 | "hr", 43 | "ht", 44 | "hu", 45 | "hy", 46 | "id", 47 | "is", 48 | "it", 49 | "ja", 50 | "jv", 51 | "ka", 52 | "kk", 53 | "km", 54 | "kn", 55 | "ko", 56 | "ku", 57 | "ky", 58 | "la", 59 | "lb", 60 | "lo", 61 | "lt", 62 | "lv", 63 | "mg", 64 | "mk", 65 | "ml", 66 | "mn", 67 | "mr", 68 | "ms", 69 | "mt", 70 | "nb", 71 | "ne", 72 | "nl", 73 | "nn", 74 | "no", 75 | "oc", 76 | "or", 77 | "pa", 78 | "pl", 79 | "ps", 80 | "pt", 81 | "qu", 82 | "ro", 83 | "ru", 84 | "rw", 85 | "se", 86 | "si", 87 | "sk", 88 | "sl", 89 | "sq", 90 | "sr", 91 | "sv", 92 | "sw", 93 | "ta", 94 | "te", 95 | "th", 96 | "tl", 97 | "tr", 98 | "ug", 99 | "uk", 100 | "ur", 101 | "vi", 102 | "vo", 103 | "wa", 104 | "xh", 105 | "zh", 106 | "zu", 107 | ] 108 | 109 | 110 | def classify_language(text: str, target_languages: list = None) -> str: 111 | if module == "fastlid" or module == "fasttext": 112 | from fastlid import fastlid, supported_langs 113 | 114 | classifier = fastlid 115 | if target_languages != None: 116 | target_languages = [ 117 | lang for lang in target_languages if lang in supported_langs 118 | ] 119 | fastlid.set_languages = target_languages 120 | elif module == "langid": 121 | import langid 122 | 123 | classifier = langid.classify 124 | if target_languages != None: 125 | target_languages = [ 126 | lang for lang in target_languages if lang in langid_languages 127 | ] 128 | langid.set_languages(target_languages) 129 | else: 130 | raise ValueError(f"Wrong module {module}") 131 | 132 | lang = classifier(text)[0] 133 | 134 | return lang 135 | 136 | 137 | def classify_zh_ja(text: str) -> str: 138 | for idx, char in enumerate(text): 139 | unicode_val = ord(char) 140 | 141 | # 检测日语字符 142 | if 0x3040 <= unicode_val <= 0x309F or 0x30A0 <= unicode_val <= 0x30FF: 143 | return "ja" 144 | 145 | # 检测汉字字符 146 | if 0x4E00 <= unicode_val <= 0x9FFF: 147 | # 检查周围的字符 148 | next_char = text[idx + 1] if idx + 1 < len(text) else None 149 | 150 | if next_char and ( 151 | 0x3040 <= ord(next_char) <= 0x309F or 0x30A0 <= ord(next_char) <= 0x30FF 152 | ): 153 | return "ja" 154 | 155 | return "zh" 156 | 157 | 158 | def split_alpha_nonalpha(text): 159 | return re.split( 160 | r"(?:(?<=[\u4e00-\u9fff])|(?<=[\u3040-\u30FF]))(?=[a-zA-Z])|(?<=[a-zA-Z])(?:(?=[\u4e00-\u9fff])|(?=[\u3040-\u30FF]))", 161 | text, 162 | ) 163 | 164 | 165 | if __name__ == "__main__": 166 | text = "这是一个测试文本" 167 | print(classify_language(text)) 168 | print(classify_zh_ja(text)) # "zh" 169 | 170 | text = "これはテストテキストです" 171 | print(classify_language(text)) 172 | print(classify_zh_ja(text)) # "ja" 173 | -------------------------------------------------------------------------------- /transcribe_genshin.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import librosa 4 | import numpy as np 5 | from multiprocessing import Pool, cpu_count 6 | 7 | import soundfile 8 | from scipy.io import wavfile 9 | from tqdm import tqdm 10 | from config import config 11 | 12 | global speaker_annos 13 | speaker_annos = [] 14 | 15 | def process(item): 16 | spkdir, wav_name, args = item 17 | speaker = spkdir.replace("\\", "/").split("/")[-1] 18 | wav_path = os.path.join(args.in_dir, speaker, wav_name) 19 | if os.path.exists(wav_path) and '.wav' in wav_path: 20 | os.makedirs(os.path.join(args.out_dir, speaker), exist_ok=True) 21 | wav, sr = librosa.load(wav_path, sr=args.sr) 22 | soundfile.write( 23 | 
23 |             os.path.join(args.out_dir, speaker, wav_name),
24 |             wav,
25 |             sr
26 |         )
27 | 
28 | def process_text(item):
29 |     spkdir, wav_name, args, lang = item
30 |     speaker = spkdir.replace("\\", "/").split("/")[-1]
31 |     wav_path = os.path.join(args.in_dir, speaker, wav_name)
32 |     global speaker_annos
33 |     tr_name = wav_name.replace('.wav', '')
34 |     with open(args.out_dir + '/' + speaker + '/' + tr_name + '.lab', "r", encoding="utf-8") as file:
35 |         text = file.read()
36 |     text = text.replace("{NICKNAME}", '旅行者')
37 |     text = text.replace("{M#他}{F#她}", '他')  # collapse gender-branched pronouns to one fixed reading
38 |     text = text.replace("{M#她}{F#他}", '他')
39 |     substring = "{M#妹妹}{F#哥哥}"  # gender-branched kinship term, resolved by the a/b file suffix
40 |     if substring in text:
41 |         if tr_name.endswith("a"):
42 |             text = text.replace("{M#妹妹}{F#哥哥}", '妹妹')
43 |         if tr_name.endswith("b"):
44 |             text = text.replace("{M#妹妹}{F#哥哥}", '哥哥')
45 |     text = text.replace("#", '')
46 |     text = f'{lang}|{text}\n'
47 |     speaker_annos.append(args.out_dir + '/' + speaker + '/' + wav_name + "|" + speaker + "|" + text)
48 | 
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     parser = argparse.ArgumentParser()
53 |     parser.add_argument("--sr", type=int, default=44100, help="sampling rate")
54 |     parser.add_argument("--in_dir", type=str, default=config.resample_config.in_dir, help="path to source dir")
55 |     parser.add_argument("--out_dir", type=str, default=config.resample_config.out_dir, help="path to target dir")
56 |     parent_dir = config.resample_config.in_dir
57 |     print(config.resample_config.out_dir)
58 |     speaker_names = list(os.walk(parent_dir))[0][1]
59 |     args = parser.parse_args()
60 | 
61 |     entered = False
62 |     while not entered:
63 |         print("Enter a letter to choose language.\n")
64 |         print("C = Chinese; J = Japanese; E = English;\n e.g.: C\n")
65 |         languages = input("Enter language: ")
66 |         if languages in ("C", "c"):
67 |             lang = 'ZH'
68 |             entered = True
69 |         elif languages in ("J", "j"):
70 |             lang = 'JP'
71 |             entered = True
72 |         elif languages in ("E", "e"):
73 |             lang = 'EN'
74 |             entered = True
75 |         else:
76 |             print("Illegal argument! Please try again.\n")
77 |     # num_processes = 8
78 |     num_processes = cpu_count() - 2 if cpu_count() > 4 else 1
79 |     pool = Pool(processes=num_processes)
80 | 
81 |     for speaker in os.listdir(args.in_dir):
82 |         spk_dir = os.path.join(args.in_dir, speaker)
83 |         if os.path.isdir(spk_dir):
84 |             print(spk_dir)
85 |             for _ in tqdm(pool.imap_unordered(process, [(spk_dir, i, args) for i in os.listdir(spk_dir) if i.endswith("wav")])):
86 |                 pass
87 |             for i in os.listdir(spk_dir):
88 |                 if i.endswith("wav"):
89 |                     pro = (spk_dir, i, args, lang)
90 |                     process_text(pro)
91 |     if len(speaker_annos) == 0:
92 |         print("transcribe error: no annotations were generated (speaker_annos is empty)")
93 |     else:
94 |         with open(config.preprocess_text_config.transcription_path, 'w', encoding='utf-8') as f:
95 |             for line in speaker_annos:
96 |                 f.write(line)
97 |         print("finished.")
98 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Bert-VITS2_train
2 | 
3 | ## This project is forked from https://github.com/YYuX-1145/Bert-VITS2-Integration-package/tree/2.0.2
4 | 
5 | ## Install dependencies
6 | 
7 | ```
8 | pip install -r requirements.txt
9 | ```
10 | 
11 | ## Download the BERT models and place them in the bert directory
12 | 
13 | ```
14 | Link: https://pan.baidu.com/s/11vLNEVDeP_8YhYIJUjcUeg?pwd=v3uc
15 | ```
16 | 
17 | ```
18 | E:\work\Bert-VITS2-v202\bert>tree /f
19 | Folder PATH listing for volume myssd
20 | Volume serial number is 7CE3-15AE
21 | E:.
22 | │   bert_models.json
23 | │
24 | ├───bert-base-japanese-v3
25 | │       config.json
26 | │       README.md
27 | │       tokenizer_config.json
28 | │       vocab.txt
29 | │
30 | ├───bert-large-japanese-v2
31 | │       config.json
32 | │       README.md
33 | │       tokenizer_config.json
34 | │       vocab.txt
35 | │
36 | ├───chinese-roberta-wwm-ext-large
37 | │       added_tokens.json
38 | │       config.json
39 | │       pytorch_model.bin
40 | │       README.md
41 | │       special_tokens_map.json
42 | │       tokenizer.json
43 | │       tokenizer_config.json
44 | │       vocab.txt
45 | │
46 | ├───deberta-v2-large-japanese
47 | │       config.json
48 | │       pytorch_model.bin
49 | │       README.md
50 | │       special_tokens_map.json
51 | │       tokenizer.json
52 | │       tokenizer_config.json
53 | │
54 | └───deberta-v3-large
55 |         config.json
56 |         generator_config.json
57 |         pytorch_model.bin
58 |         README.md
59 |         spm.model
60 |         tokenizer_config.json
61 | ```
62 | 
63 | ## Download the pretrained base models and place them in the pretrained_models directory
64 | 
65 | ```
66 | https://openi.pcl.ac.cn/Stardust_minus/Bert-VITS2/modelmanage/model_readme_tmpl?name=Bert-VITS2%E4%B8%AD%E6%97%A5%E8%8B%B1%E5%BA%95%E6%A8%A1-fix
67 | ```
68 | 
69 | ```
70 | E:\work\Bert-VITS2-v202\pretrained_models>tree /f
71 | Folder PATH listing for volume myssd
72 | Volume serial number is 7CE3-15AE
73 | E:.
74 |     DUR_0.pth
75 |     D_0.pth
76 |     G_0.pth
77 | 
78 | No subfolders exist
79 | ```
80 | 
81 | ## Download the dataset
82 | 
83 | ```
84 | https://pan.ai-hobbyist.org/Genshin%20Datasets/%E4%B8%AD%E6%96%87%20-%20Chinese/%E5%88%86%E8%A7%92%E8%89%B2%20-%20Single/%E8%A7%92%E8%89%B2%E8%AF%AD%E9%9F%B3%20-%20Character
85 | ```
86 | 
87 | ## Using Keqing as an example: after extracting, place the files in the project's Data/keqing/raw/keqing directory
88 | 
89 | ```
90 | E:\work\Bert-VITS2-v202\Data\keqing\raw\keqing>tree /f
91 | Folder PATH listing for volume myssd
92 | Volume serial number is 7CE3-15AE
93 | E:.
94 |     vo_card_keqing_endOfGame_fail_01.lab
95 |     vo_card_keqing_endOfGame_fail_01.wav
96 | ```
97 | 
98 | ## Transcribe the annotation files
99 | 
100 | ```
101 | 
102 | python3 transcribe_genshin.py
103 | 
104 | ```
105 | 
106 | 
107 | ## If you are building your own dataset, name the source audio after the current model as a *.wav file (e.g. meimei.wav), place it in the raw directory, then run the slicing script
108 | 
109 | ```
110 | python3 audio_slicer.py
111 | ```
112 | 
113 | ```
114 | E:\work\Bert-VITS2-v202_demo\Data\meimei\raw\meimei>tree /f
115 | Folder PATH listing for volume myssd
116 | Volume serial number is 7CE3-15AE
117 | E:.
118 |     meimei_0.wav
119 |     meimei_1.wav
120 |     meimei_2.wav
121 |     meimei_3.wav
122 |     meimei_4.wav
123 |     meimei_5.wav
124 |     meimei_6.wav
125 |     meimei_7.wav
126 |     meimei_8.wav
127 | ```
128 | 
129 | ## Preprocess the text and generate the BERT feature files:
130 | 
131 | ```
132 | python3 preprocess_text.py
133 | 
134 | python3 bert_gen.py
135 | 
136 | ```
137 | 
138 | ## Start training
139 | 
140 | ```
141 | python3 train_ms.py
142 | ```
143 | 
144 | ## Directory of the trained models
145 | 
146 | ```
147 | 
148 | E:\work\Bert-VITS2-v202\Data\keqing\models>tree /f
149 | Folder PATH listing for volume myssd
150 | Volume serial number is 7CE3-15AE
151 | E:.
152 | │   DUR_0.pth
153 | │   DUR_550.pth
154 | │   DUR_600.pth
155 | │   DUR_650.pth
156 | │   D_0.pth
157 | │   D_600.pth
158 | │   D_650.pth
159 | │   events.out.tfevents.1700625154.ly.24008.0
160 | │   events.out.tfevents.1700630428.ly.20380.0
161 | │   G_0.pth
162 | │   G_450.pth
163 | │   G_500.pth
164 | │   G_550.pth
165 | │   G_600.pth
166 | │   G_650.pth
167 | │   train.log
168 | │
169 | └───eval
170 |         events.out.tfevents.1700625154.ly.24008.1
171 |         events.out.tfevents.1700630428.ly.20380.1
172 | 
173 | ```
174 | 
175 | ## Verify the trained model by inference
176 | 
177 | ```
178 | python3 server_fastapi.py
179 | ```
180 | 
--------------------------------------------------------------------------------
/Web/assets/index-49e71a58.css:
--------------------------------------------------------------------------------
1 | html,body{width:100%;height:100%}input::-ms-clear,input::-ms-reveal{display:none}*,*:before,*:after{box-sizing:border-box}html{font-family:sans-serif;line-height:1.15;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-overflow-style:scrollbar;-webkit-tap-highlight-color:rgba(0,0,0,0)}@-ms-viewport{width:device-width}body{margin:0}[tabindex="-1"]:focus{outline:none}hr{box-sizing:content-box;height:0;overflow:visible}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5em;font-weight:500}p{margin-top:0;margin-bottom:1em}abbr[title],abbr[data-original-title]{-webkit-text-decoration:underline dotted;text-decoration:underline;text-decoration:underline dotted;border-bottom:0;cursor:help}address{margin-bottom:1em;font-style:normal;line-height:inherit}input[type=text],input[type=password],input[type=number],textarea{-webkit-appearance:none}ol,ul,dl{margin-top:0;margin-bottom:1em}ol ol,ul ul,ol ul,ul ol{margin-bottom:0}dt{font-weight:500}dd{margin-bottom:.5em;margin-left:0}blockquote{margin:0 0 1em}dfn{font-style:italic}b,strong{font-weight:bolder}small{font-size:80%}sub,sup{position:relative;font-size:75%;line-height:0;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}pre,code,kbd,samp{font-size:1em;font-family:SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace}pre{margin-top:0;margin-bottom:1em;overflow:auto}figure{margin:0 0 1em}img{vertical-align:middle;border-style:none}a,area,button,[role=button],input:not([type=range]),label,select,summary,textarea{touch-action:manipulation}table{border-collapse:collapse}caption{padding-top:.75em;padding-bottom:.3em;text-align:left;caption-side:bottom}input,button,select,optgroup,textarea{margin:0;color:inherit;font-size:inherit;font-family:inherit;line-height:inherit}button,input{overflow:visible}button,select{text-transform:none}button,html
[type=button],[type=reset],[type=submit]{-webkit-appearance:button}button::-moz-focus-inner,[type=button]::-moz-focus-inner,[type=reset]::-moz-focus-inner,[type=submit]::-moz-focus-inner{padding:0;border-style:none}input[type=radio],input[type=checkbox]{box-sizing:border-box;padding:0}input[type=date],input[type=time],input[type=datetime-local],input[type=month]{-webkit-appearance:listbox}textarea{overflow:auto;resize:vertical}fieldset{min-width:0;margin:0;padding:0;border:0}legend{display:block;width:100%;max-width:100%;margin-bottom:.5em;padding:0;color:inherit;font-size:1.5em;line-height:inherit;white-space:normal}progress{vertical-align:baseline}[type=number]::-webkit-inner-spin-button,[type=number]::-webkit-outer-spin-button{height:auto}[type=search]{outline-offset:-2px;-webkit-appearance:none}[type=search]::-webkit-search-cancel-button,[type=search]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{font:inherit;-webkit-appearance:button}output{display:inline-block}summary{display:list-item}template{display:none}[hidden]{display:none!important}mark{padding:.2em;background-color:#feffe6}pre code.hljs{display:block;overflow-x:auto;padding:1em}code.hljs{padding:3px 5px}/*! 2 | Theme: StackOverflow Light 3 | Description: Light theme as used on stackoverflow.com 4 | Author: stackoverflow.com 5 | Maintainer: @Hirse 6 | Website: https://github.com/StackExchange/Stacks 7 | License: MIT 8 | Updated: 2021-05-15 9 | 10 | Updated for @stackoverflow/stacks v0.64.0 11 | Code Blocks: /blob/v0.64.0/lib/css/components/_stacks-code-blocks.less 12 | Colors: /blob/v0.64.0/lib/css/exports/_stacks-constants-colors.less 13 | */.hljs{color:#2f3337;background:#f6f6f6}.hljs-subst{color:#2f3337}.hljs-comment{color:#656e77}.hljs-keyword,.hljs-selector-tag,.hljs-meta .hljs-keyword,.hljs-doctag,.hljs-section,.hljs-attr{color:#015692}.hljs-attribute{color:#803378}.hljs-name,.hljs-type,.hljs-number,.hljs-selector-id,.hljs-quote,.hljs-template-tag{color:#b75501}.hljs-selector-class{color:#015692}.hljs-string,.hljs-regexp,.hljs-symbol,.hljs-variable,.hljs-template-variable,.hljs-link,.hljs-selector-attr{color:#54790d}.hljs-meta,.hljs-selector-pseudo{color:#015692}.hljs-built_in,.hljs-title,.hljs-literal{color:#b75501}.hljs-bullet,.hljs-code{color:#535a60}.hljs-meta .hljs-string{color:#54790d}.hljs-deletion{color:#c02d2e}.hljs-addition{color:#2f6f44}.hljs-emphasis{font-style:italic}.hljs-strong{font-weight:700} 14 | -------------------------------------------------------------------------------- /configs/default_config.yml: -------------------------------------------------------------------------------- 1 | # 全局配置 2 | # 对于希望在同一时间使用多个配置文件的情况,例如两个GPU同时跑两个训练集:通过环境变量指定配置文件,不指定则默认为./config.yml 3 | 4 | # 拟提供通用路径配置,统一存放数据,避免数据放得很乱 5 | # 每个数据集与其对应的模型存放至统一路径下,后续所有的路径配置均为相对于datasetPath的路径 6 | # 不填或者填空则路径为相对于项目根目录的路径 7 | dataset_path: "Data/TEST" 8 | 9 | # 模型镜像源,默认huggingface,使用openi镜像源需指定openi_token 10 | mirror: "" 11 | openi_token: "" # openi token 12 | 13 | # resample 音频重采样配置 14 | # 注意, “:” 后需要加空格 15 | resample: 16 | # 目标重采样率 17 | sampling_rate: 44100 18 | # 音频文件输入路径,重采样会将该路径下所有.wav音频文件重采样 19 | # 请填入相对于datasetPath的相对路径 20 | in_dir: "audios/raw" # 相对于根目录的路径为 /datasetPath/in_dir 21 | # 音频文件重采样后输出路径 22 | out_dir: "audios/wavs" 23 | 24 | 25 | # preprocess_text 数据集预处理相关配置 26 | # 注意, “:” 后需要加空格 27 | preprocess_text: 28 | # 原始文本文件路径,文本格式应为{wav_path}|{speaker_name}|{language}|{text}。 29 | transcription_path: "filelists/short_character_anno.list" 30 | # 数据清洗后文本路径,可以不填。不填则将在原始文本目录生成 31 | cleaned_path: 
"filelists/cleaned.list" 32 | # 训练集路径 33 | train_path: "filelists/train.list" 34 | # 验证集路径 35 | val_path: "filelists/val.list" 36 | # 配置文件路径 37 | config_path: "config.json" 38 | # 每个speaker的验证集条数 39 | val_per_spk: 5 40 | # 验证集最大条数,多于的会被截断并放到训练集中 41 | max_val_total: 8 42 | # 是否进行数据清洗 43 | clean: true 44 | 45 | 46 | # bert_gen 相关配置 47 | # 注意, “:” 后需要加空格 48 | bert_gen: 49 | # 训练数据集配置文件路径 50 | config_path: "config.json" 51 | # 并行数 52 | num_processes: 2 53 | # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理 54 | # 该选项同时决定了get_bert_feature的默认设备 55 | device: "cuda" 56 | # 使用多卡推理 57 | use_multi_device: false 58 | 59 | 60 | # train 训练配置 61 | # 注意, “:” 后需要加空格 62 | train_ms: 63 | # 需要加载的环境变量,多显卡训练时RANK请手动在环境变量填写 64 | # 环境变量对应名称环境变量不存在时加载,也就是说手动添加的环境变量优先级更高,会覆盖本配置文件 65 | env: 66 | MASTER_ADDR: "localhost" 67 | MASTER_PORT: 10086 68 | WORLD_SIZE: 1 69 | RANK: 0 70 | # 可以填写任意名的环境变量 71 | THE_ENV_VAR_YOU_NEED_TO_USE: "1234567" 72 | # 底模设置 73 | base: 74 | use_base_model: false 75 | repo_id: "Stardust_minus/Bert-VITS2" 76 | model_image: "Bert-VITS2中日底模" # openi网页的模型名 77 | # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下 78 | model: "models" 79 | # 配置文件路径 80 | config_path: "config.json" 81 | 82 | 83 | # webui webui配置 84 | # 注意, “:” 后需要加空格 85 | webui: 86 | # 推理设备 87 | device: "cuda" 88 | # 模型路径 89 | model: "models/G_100.pth" 90 | # 配置文件路径 91 | config_path: "Data/TEST/config.json" 92 | # 端口号 93 | port: 7860 94 | # 是否公开部署,对外网开放 95 | share: false 96 | # 是否开启debug模式 97 | debug: false 98 | # 语种识别库,可选langid, fastlid 99 | language_identification_library: "langid" 100 | 101 | 102 | # server api配置 103 | # 注意, “:” 后需要加空格 104 | # 注意,本配置下的所有配置均为相对于根目录的路径 105 | server: 106 | # 端口号 107 | port: 7860 108 | # 模型默认使用设备:但是当前并没有实现这个配置。 109 | device: "cuda" 110 | # 需要加载的所有模型的配置 111 | # 注意,所有模型都必须正确配置model与config的路径,空路径会导致加载错误。 112 | models: 113 | - # 模型的路径 114 | model: "./Data/TEST/models/G_100.pth" 115 | # 模型config.json的路径 116 | config: "./Data/TEST/config.json" 117 | # 模型使用设备,若填写则会覆盖默认配置 118 | device: "cuda" 119 | # 模型默认使用的语言 120 | language: "ZH" 121 | # 模型人物默认参数 122 | # 不必填写所有人物,不填的使用默认值 123 | # 暂时不用填写,当前尚未实现按人区分配置 124 | speakers: 125 | - speaker: "科比" 126 | sdp_ratio: 0.2 127 | noise_scale: 0.6 128 | noise_scale_w: 0.8 129 | length_scale: 1 130 | - speaker: "五条悟" 131 | sdp_ratio: 0.3 132 | noise_scale: 0.7 133 | noise_scale_w: 0.8 134 | length_scale: 0.5 135 | - speaker: "安倍晋三" 136 | sdp_ratio: 0.2 137 | noise_scale: 0.6 138 | noise_scale_w: 0.8 139 | length_scale: 1.2 140 | - # 模型的路径 141 | model: "./Data/test/models/G_100.pth" 142 | # 模型config.json的路径 143 | config: "./Data/test/config.json" 144 | # 模型使用设备,若填写则会覆盖默认配置 145 | device: "cuda" 146 | # 模型默认使用的语言 147 | language: "JP" 148 | # 模型人物默认参数 149 | # 不必填写所有人物,不填的使用默认值 150 | speakers: [ ] # 也可以不填 151 | 152 | 153 | # 百度翻译开放平台 api配置 154 | # api接入文档 https://api.fanyi.baidu.com/doc/21 155 | # 请不要在github等网站公开分享你的app id 与 key 156 | translate: 157 | # 你的APPID 158 | "app_key": "" 159 | # 你的密钥 160 | "secret_key": "" 161 | -------------------------------------------------------------------------------- /emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: en 3 | datasets: 4 | - msp-podcast 5 | inference: true 6 | tags: 7 | - speech 8 | - audio 9 | - wav2vec2 10 | - audio-classification 11 | - emotion-recognition 12 | license: cc-by-nc-sa-4.0 13 | pipeline_tag: audio-classification 14 | --- 15 | 16 | # Model for Dimensional Speech Emotion Recognition based on 
Wav2vec 2.0 17 | 18 | The model expects a raw audio signal as input and outputs predictions for arousal, dominance and valence in a range of approximately 0...1. In addition, it also provides the pooled states of the last transformer layer. The model was created by fine-tuning [ 19 | Wav2Vec2-Large-Robust](https://huggingface.co/facebook/wav2vec2-large-robust) on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) (v1.7). The model was pruned from 24 to 12 transformer layers before fine-tuning. An [ONNX](https://onnx.ai/") export of the model is available from [doi:10.5281/zenodo.6221127](https://zenodo.org/record/6221127). Further details are given in the associated [paper](https://arxiv.org/abs/2203.07378) and [tutorial](https://github.com/audeering/w2v2-how-to). 20 | 21 | # Usage 22 | 23 | ```python 24 | import numpy as np 25 | import torch 26 | import torch.nn as nn 27 | from transformers import Wav2Vec2Processor 28 | from transformers.models.wav2vec2.modeling_wav2vec2 import ( 29 | Wav2Vec2Model, 30 | Wav2Vec2PreTrainedModel, 31 | ) 32 | 33 | 34 | class RegressionHead(nn.Module): 35 | r"""Classification head.""" 36 | 37 | def __init__(self, config): 38 | 39 | super().__init__() 40 | 41 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 42 | self.dropout = nn.Dropout(config.final_dropout) 43 | self.out_proj = nn.Linear(config.hidden_size, config.num_labels) 44 | 45 | def forward(self, features, **kwargs): 46 | 47 | x = features 48 | x = self.dropout(x) 49 | x = self.dense(x) 50 | x = torch.tanh(x) 51 | x = self.dropout(x) 52 | x = self.out_proj(x) 53 | 54 | return x 55 | 56 | 57 | class EmotionModel(Wav2Vec2PreTrainedModel): 58 | r"""Speech emotion classifier.""" 59 | 60 | def __init__(self, config): 61 | 62 | super().__init__(config) 63 | 64 | self.config = config 65 | self.wav2vec2 = Wav2Vec2Model(config) 66 | self.classifier = RegressionHead(config) 67 | self.init_weights() 68 | 69 | def forward( 70 | self, 71 | input_values, 72 | ): 73 | 74 | outputs = self.wav2vec2(input_values) 75 | hidden_states = outputs[0] 76 | hidden_states = torch.mean(hidden_states, dim=1) 77 | logits = self.classifier(hidden_states) 78 | 79 | return hidden_states, logits 80 | 81 | 82 | 83 | # load model from hub 84 | device = 'cpu' 85 | model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim' 86 | processor = Wav2Vec2Processor.from_pretrained(model_name) 87 | model = EmotionModel.from_pretrained(model_name) 88 | 89 | # dummy signal 90 | sampling_rate = 16000 91 | signal = np.zeros((1, sampling_rate), dtype=np.float32) 92 | 93 | 94 | def process_func( 95 | x: np.ndarray, 96 | sampling_rate: int, 97 | embeddings: bool = False, 98 | ) -> np.ndarray: 99 | r"""Predict emotions or extract embeddings from raw audio signal.""" 100 | 101 | # run through processor to normalize signal 102 | # always returns a batch, so we just get the first entry 103 | # then we put it on the device 104 | y = processor(x, sampling_rate=sampling_rate) 105 | y = y['input_values'][0] 106 | y = y.reshape(1, -1) 107 | y = torch.from_numpy(y).to(device) 108 | 109 | # run through model 110 | with torch.no_grad(): 111 | y = model(y)[0 if embeddings else 1] 112 | 113 | # convert to numpy 114 | y = y.detach().cpu().numpy() 115 | 116 | return y 117 | 118 | 119 | print(process_func(signal, sampling_rate)) 120 | # Arousal dominance valence 121 | # [[0.5460754 0.6062266 0.40431657]] 122 | 123 | print(process_func(signal, sampling_rate, embeddings=True)) 124 | # Pooled hidden states of 
last transformer layer 125 | # [[-0.00752167 0.0065819 -0.00746342 ... 0.00663632 0.00848748 126 | # 0.00599211]] 127 | ``` 128 | -------------------------------------------------------------------------------- /mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | from librosa.filters import mel as librosa_mel_fn 4 | import warnings 5 | 6 | # warnings.simplefilter(action='ignore', category=FutureWarning) 7 | warnings.filterwarnings(action="ignore") 8 | MAX_WAV_VALUE = 32768.0 9 | 10 | 11 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 12 | """ 13 | PARAMS 14 | ------ 15 | C: compression factor 16 | """ 17 | return torch.log(torch.clamp(x, min=clip_val) * C) 18 | 19 | 20 | def dynamic_range_decompression_torch(x, C=1): 21 | """ 22 | PARAMS 23 | ------ 24 | C: compression factor used to compress 25 | """ 26 | return torch.exp(x) / C 27 | 28 | 29 | def spectral_normalize_torch(magnitudes): 30 | output = dynamic_range_compression_torch(magnitudes) 31 | return output 32 | 33 | 34 | def spectral_de_normalize_torch(magnitudes): 35 | output = dynamic_range_decompression_torch(magnitudes) 36 | return output 37 | 38 | 39 | mel_basis = {} 40 | hann_window = {} 41 | 42 | 43 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 44 | if torch.min(y) < -1.0: 45 | print("min value is ", torch.min(y)) 46 | if torch.max(y) > 1.0: 47 | print("max value is ", torch.max(y)) 48 | 49 | global hann_window 50 | dtype_device = str(y.dtype) + "_" + str(y.device) 51 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 52 | if wnsize_dtype_device not in hann_window: 53 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 54 | dtype=y.dtype, device=y.device 55 | ) 56 | 57 | y = torch.nn.functional.pad( 58 | y.unsqueeze(1), 59 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 60 | mode="reflect", 61 | ) 62 | y = y.squeeze(1) 63 | 64 | spec = torch.stft( 65 | y, 66 | n_fft, 67 | hop_length=hop_size, 68 | win_length=win_size, 69 | window=hann_window[wnsize_dtype_device], 70 | center=center, 71 | pad_mode="reflect", 72 | normalized=False, 73 | onesided=True, 74 | return_complex=False, 75 | ) 76 | 77 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 78 | return spec 79 | 80 | 81 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 82 | global mel_basis 83 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 84 | fmax_dtype_device = str(fmax) + "_" + dtype_device 85 | if fmax_dtype_device not in mel_basis: 86 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)  # librosa >= 0.10 requires keyword arguments 87 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 88 | dtype=spec.dtype, device=spec.device 89 | ) 90 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 91 | spec = spectral_normalize_torch(spec) 92 | return spec 93 | 94 | 95 | def mel_spectrogram_torch( 96 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 97 | ): 98 | if torch.min(y) < -1.0: 99 | print("min value is ", torch.min(y)) 100 | if torch.max(y) > 1.0: 101 | print("max value is ", torch.max(y)) 102 | 103 | global mel_basis, hann_window 104 | dtype_device = str(y.dtype) + "_" + str(y.device) 105 | fmax_dtype_device = str(fmax) + "_" + dtype_device 106 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 107 | if fmax_dtype_device not in mel_basis: 108 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 109 | 
mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 110 | dtype=y.dtype, device=y.device 111 | ) 112 | if wnsize_dtype_device not in hann_window: 113 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 114 | dtype=y.dtype, device=y.device 115 | ) 116 | 117 | y = torch.nn.functional.pad( 118 | y.unsqueeze(1), 119 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 120 | mode="reflect", 121 | ) 122 | y = y.squeeze(1) 123 | 124 | spec = torch.stft( 125 | y, 126 | n_fft, 127 | hop_length=hop_size, 128 | win_length=win_size, 129 | window=hann_window[wnsize_dtype_device], 130 | center=center, 131 | pad_mode="reflect", 132 | normalized=False, 133 | onesided=True, 134 | return_complex=False, 135 | ) 136 | 137 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 138 | 139 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 140 | spec = spectral_normalize_torch(spec) 141 | 142 | return spec 143 | -------------------------------------------------------------------------------- /oldVersion/V101/text/english.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import re 4 | from g2p_en import G2p 5 | 6 | from text import symbols 7 | 8 | current_file_path = os.path.dirname(__file__) 9 | CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep") 10 | CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle") 11 | _g2p = G2p() 12 | 13 | arpa = { 14 | "AH0", 15 | "S", 16 | "AH1", 17 | "EY2", 18 | "AE2", 19 | "EH0", 20 | "OW2", 21 | "UH0", 22 | "NG", 23 | "B", 24 | "G", 25 | "AY0", 26 | "M", 27 | "AA0", 28 | "F", 29 | "AO0", 30 | "ER2", 31 | "UH1", 32 | "IY1", 33 | "AH2", 34 | "DH", 35 | "IY0", 36 | "EY1", 37 | "IH0", 38 | "K", 39 | "N", 40 | "W", 41 | "IY2", 42 | "T", 43 | "AA1", 44 | "ER1", 45 | "EH2", 46 | "OY0", 47 | "UH2", 48 | "UW1", 49 | "Z", 50 | "AW2", 51 | "AW1", 52 | "V", 53 | "UW2", 54 | "AA2", 55 | "ER", 56 | "AW0", 57 | "UW0", 58 | "R", 59 | "OW1", 60 | "EH1", 61 | "ZH", 62 | "AE0", 63 | "IH2", 64 | "IH", 65 | "Y", 66 | "JH", 67 | "P", 68 | "AY1", 69 | "EY0", 70 | "OY2", 71 | "TH", 72 | "HH", 73 | "D", 74 | "ER0", 75 | "CH", 76 | "AO1", 77 | "AE1", 78 | "AO2", 79 | "OY1", 80 | "AY2", 81 | "IH1", 82 | "OW0", 83 | "L", 84 | "SH", 85 | } 86 | 87 | 88 | def post_replace_ph(ph): 89 | rep_map = { 90 | ":": ",", 91 | ";": ",", 92 | ",": ",", 93 | "。": ".", 94 | "!": "!", 95 | "?": "?", 96 | "\n": ".", 97 | "·": ",", 98 | "、": ",", 99 | "...": "…", 100 | "v": "V", 101 | } 102 | if ph in rep_map.keys(): 103 | ph = rep_map[ph] 104 | if ph in symbols: 105 | return ph 106 | if ph not in symbols: 107 | ph = "UNK" 108 | return ph 109 | 110 | 111 | def read_dict(): 112 | g2p_dict = {} 113 | start_line = 49 114 | with open(CMU_DICT_PATH) as f: 115 | line = f.readline() 116 | line_index = 1 117 | while line: 118 | if line_index >= start_line: 119 | line = line.strip() 120 | word_split = line.split(" ") 121 | word = word_split[0] 122 | 123 | syllable_split = word_split[1].split(" - ") 124 | g2p_dict[word] = [] 125 | for syllable in syllable_split: 126 | phone_split = syllable.split(" ") 127 | g2p_dict[word].append(phone_split) 128 | 129 | line_index = line_index + 1 130 | line = f.readline() 131 | 132 | return g2p_dict 133 | 134 | 135 | def cache_dict(g2p_dict, file_path): 136 | with open(file_path, "wb") as pickle_file: 137 | pickle.dump(g2p_dict, pickle_file) 138 | 139 | 140 | def get_dict(): 141 | if os.path.exists(CACHE_PATH): 142 | with open(CACHE_PATH, "rb") as pickle_file: 143 | g2p_dict = 
pickle.load(pickle_file) 144 | else: 145 | g2p_dict = read_dict() 146 | cache_dict(g2p_dict, CACHE_PATH) 147 | 148 | return g2p_dict 149 | 150 | 151 | eng_dict = get_dict() 152 | 153 | 154 | def refine_ph(phn): 155 | tone = 0 156 | if re.search(r"\d$", phn): 157 | tone = int(phn[-1]) + 1 158 | phn = phn[:-1] 159 | return phn.lower(), tone 160 | 161 | 162 | def refine_syllables(syllables): 163 | tones = [] 164 | phonemes = [] 165 | for phn_list in syllables: 166 | for i in range(len(phn_list)): 167 | phn = phn_list[i] 168 | phn, tone = refine_ph(phn) 169 | phonemes.append(phn) 170 | tones.append(tone) 171 | return phonemes, tones 172 | 173 | 174 | def text_normalize(text): 175 | # todo: eng text normalize 176 | return text 177 | 178 | 179 | def g2p(text): 180 | phones = [] 181 | tones = [] 182 | words = re.split(r"([,;.\-\?\!\s+])", text) 183 | for w in words: 184 | if w.upper() in eng_dict: 185 | phns, tns = refine_syllables(eng_dict[w.upper()]) 186 | phones += phns 187 | tones += tns 188 | else: 189 | phone_list = list(filter(lambda p: p != " ", _g2p(w))) 190 | for ph in phone_list: 191 | if ph in arpa: 192 | ph, tn = refine_ph(ph) 193 | phones.append(ph) 194 | tones.append(tn) 195 | else: 196 | phones.append(ph) 197 | tones.append(0) 198 | # todo: implement word2ph 199 | word2ph = [1 for i in phones] 200 | 201 | phones = [post_replace_ph(i) for i in phones] 202 | return phones, tones, word2ph 203 | 204 | 205 | if __name__ == "__main__": 206 | # print(get_dict()) 207 | # print(eng_word_to_phoneme("hello")) 208 | print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder.")) 209 | # all_phones = set() 210 | # for k, syllables in eng_dict.items(): 211 | # for group in syllables: 212 | # for ph in group: 213 | # all_phones.add(ph) 214 | # print(all_phones) 215 | -------------------------------------------------------------------------------- /oldVersion/V110/text/english.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import re 4 | from g2p_en import G2p 5 | 6 | from . 
import symbols 7 | 8 | current_file_path = os.path.dirname(__file__) 9 | CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep") 10 | CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle") 11 | _g2p = G2p() 12 | 13 | arpa = { 14 | "AH0", 15 | "S", 16 | "AH1", 17 | "EY2", 18 | "AE2", 19 | "EH0", 20 | "OW2", 21 | "UH0", 22 | "NG", 23 | "B", 24 | "G", 25 | "AY0", 26 | "M", 27 | "AA0", 28 | "F", 29 | "AO0", 30 | "ER2", 31 | "UH1", 32 | "IY1", 33 | "AH2", 34 | "DH", 35 | "IY0", 36 | "EY1", 37 | "IH0", 38 | "K", 39 | "N", 40 | "W", 41 | "IY2", 42 | "T", 43 | "AA1", 44 | "ER1", 45 | "EH2", 46 | "OY0", 47 | "UH2", 48 | "UW1", 49 | "Z", 50 | "AW2", 51 | "AW1", 52 | "V", 53 | "UW2", 54 | "AA2", 55 | "ER", 56 | "AW0", 57 | "UW0", 58 | "R", 59 | "OW1", 60 | "EH1", 61 | "ZH", 62 | "AE0", 63 | "IH2", 64 | "IH", 65 | "Y", 66 | "JH", 67 | "P", 68 | "AY1", 69 | "EY0", 70 | "OY2", 71 | "TH", 72 | "HH", 73 | "D", 74 | "ER0", 75 | "CH", 76 | "AO1", 77 | "AE1", 78 | "AO2", 79 | "OY1", 80 | "AY2", 81 | "IH1", 82 | "OW0", 83 | "L", 84 | "SH", 85 | } 86 | 87 | 88 | def post_replace_ph(ph): 89 | rep_map = { 90 | ":": ",", 91 | ";": ",", 92 | ",": ",", 93 | "。": ".", 94 | "!": "!", 95 | "?": "?", 96 | "\n": ".", 97 | "·": ",", 98 | "、": ",", 99 | "...": "…", 100 | "v": "V", 101 | } 102 | if ph in rep_map.keys(): 103 | ph = rep_map[ph] 104 | if ph in symbols: 105 | return ph 106 | if ph not in symbols: 107 | ph = "UNK" 108 | return ph 109 | 110 | 111 | def read_dict(): 112 | g2p_dict = {} 113 | start_line = 49 114 | with open(CMU_DICT_PATH) as f: 115 | line = f.readline() 116 | line_index = 1 117 | while line: 118 | if line_index >= start_line: 119 | line = line.strip() 120 | word_split = line.split(" ") 121 | word = word_split[0] 122 | 123 | syllable_split = word_split[1].split(" - ") 124 | g2p_dict[word] = [] 125 | for syllable in syllable_split: 126 | phone_split = syllable.split(" ") 127 | g2p_dict[word].append(phone_split) 128 | 129 | line_index = line_index + 1 130 | line = f.readline() 131 | 132 | return g2p_dict 133 | 134 | 135 | def cache_dict(g2p_dict, file_path): 136 | with open(file_path, "wb") as pickle_file: 137 | pickle.dump(g2p_dict, pickle_file) 138 | 139 | 140 | def get_dict(): 141 | if os.path.exists(CACHE_PATH): 142 | with open(CACHE_PATH, "rb") as pickle_file: 143 | g2p_dict = pickle.load(pickle_file) 144 | else: 145 | g2p_dict = read_dict() 146 | cache_dict(g2p_dict, CACHE_PATH) 147 | 148 | return g2p_dict 149 | 150 | 151 | eng_dict = get_dict() 152 | 153 | 154 | def refine_ph(phn): 155 | tone = 0 156 | if re.search(r"\d$", phn): 157 | tone = int(phn[-1]) + 1 158 | phn = phn[:-1] 159 | return phn.lower(), tone 160 | 161 | 162 | def refine_syllables(syllables): 163 | tones = [] 164 | phonemes = [] 165 | for phn_list in syllables: 166 | for i in range(len(phn_list)): 167 | phn = phn_list[i] 168 | phn, tone = refine_ph(phn) 169 | phonemes.append(phn) 170 | tones.append(tone) 171 | return phonemes, tones 172 | 173 | 174 | def text_normalize(text): 175 | # todo: eng text normalize 176 | return text 177 | 178 | 179 | def g2p(text): 180 | phones = [] 181 | tones = [] 182 | words = re.split(r"([,;.\-\?\!\s+])", text) 183 | for w in words: 184 | if w.upper() in eng_dict: 185 | phns, tns = refine_syllables(eng_dict[w.upper()]) 186 | phones += phns 187 | tones += tns 188 | else: 189 | phone_list = list(filter(lambda p: p != " ", _g2p(w))) 190 | for ph in phone_list: 191 | if ph in arpa: 192 | ph, tn = refine_ph(ph) 193 | phones.append(ph) 194 | tones.append(tn) 195 | else: 196 | 
phones.append(ph) 197 | tones.append(0) 198 | # todo: implement word2ph 199 | word2ph = [1 for i in phones] 200 | 201 | phones = [post_replace_ph(i) for i in phones] 202 | return phones, tones, word2ph 203 | 204 | 205 | if __name__ == "__main__": 206 | # print(get_dict()) 207 | # print(eng_word_to_phoneme("hello")) 208 | print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder.")) 209 | # all_phones = set() 210 | # for k, syllables in eng_dict.items(): 211 | # for group in syllables: 212 | # for ph in group: 213 | # all_phones.add(ph) 214 | # print(all_phones) 215 | -------------------------------------------------------------------------------- /oldVersion/V111/text/english.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import re 4 | from g2p_en import G2p 5 | 6 | from . import symbols 7 | 8 | current_file_path = os.path.dirname(__file__) 9 | CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep") 10 | CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle") 11 | _g2p = G2p() 12 | 13 | arpa = { 14 | "AH0", 15 | "S", 16 | "AH1", 17 | "EY2", 18 | "AE2", 19 | "EH0", 20 | "OW2", 21 | "UH0", 22 | "NG", 23 | "B", 24 | "G", 25 | "AY0", 26 | "M", 27 | "AA0", 28 | "F", 29 | "AO0", 30 | "ER2", 31 | "UH1", 32 | "IY1", 33 | "AH2", 34 | "DH", 35 | "IY0", 36 | "EY1", 37 | "IH0", 38 | "K", 39 | "N", 40 | "W", 41 | "IY2", 42 | "T", 43 | "AA1", 44 | "ER1", 45 | "EH2", 46 | "OY0", 47 | "UH2", 48 | "UW1", 49 | "Z", 50 | "AW2", 51 | "AW1", 52 | "V", 53 | "UW2", 54 | "AA2", 55 | "ER", 56 | "AW0", 57 | "UW0", 58 | "R", 59 | "OW1", 60 | "EH1", 61 | "ZH", 62 | "AE0", 63 | "IH2", 64 | "IH", 65 | "Y", 66 | "JH", 67 | "P", 68 | "AY1", 69 | "EY0", 70 | "OY2", 71 | "TH", 72 | "HH", 73 | "D", 74 | "ER0", 75 | "CH", 76 | "AO1", 77 | "AE1", 78 | "AO2", 79 | "OY1", 80 | "AY2", 81 | "IH1", 82 | "OW0", 83 | "L", 84 | "SH", 85 | } 86 | 87 | 88 | def post_replace_ph(ph): 89 | rep_map = { 90 | ":": ",", 91 | ";": ",", 92 | ",": ",", 93 | "。": ".", 94 | "!": "!", 95 | "?": "?", 96 | "\n": ".", 97 | "·": ",", 98 | "、": ",", 99 | "...": "…", 100 | "v": "V", 101 | } 102 | if ph in rep_map.keys(): 103 | ph = rep_map[ph] 104 | if ph in symbols: 105 | return ph 106 | if ph not in symbols: 107 | ph = "UNK" 108 | return ph 109 | 110 | 111 | def read_dict(): 112 | g2p_dict = {} 113 | start_line = 49 114 | with open(CMU_DICT_PATH) as f: 115 | line = f.readline() 116 | line_index = 1 117 | while line: 118 | if line_index >= start_line: 119 | line = line.strip() 120 | word_split = line.split(" ") 121 | word = word_split[0] 122 | 123 | syllable_split = word_split[1].split(" - ") 124 | g2p_dict[word] = [] 125 | for syllable in syllable_split: 126 | phone_split = syllable.split(" ") 127 | g2p_dict[word].append(phone_split) 128 | 129 | line_index = line_index + 1 130 | line = f.readline() 131 | 132 | return g2p_dict 133 | 134 | 135 | def cache_dict(g2p_dict, file_path): 136 | with open(file_path, "wb") as pickle_file: 137 | pickle.dump(g2p_dict, pickle_file) 138 | 139 | 140 | def get_dict(): 141 | if os.path.exists(CACHE_PATH): 142 | with open(CACHE_PATH, "rb") as pickle_file: 143 | g2p_dict = pickle.load(pickle_file) 144 | else: 145 | g2p_dict = read_dict() 146 | cache_dict(g2p_dict, CACHE_PATH) 147 | 148 | return g2p_dict 149 | 150 | 151 | eng_dict = get_dict() 152 | 153 | 154 | def refine_ph(phn): 155 | tone = 0 156 | if re.search(r"\d$", phn): 157 | tone = int(phn[-1]) + 1 158 | phn = phn[:-1] 159 | return phn.lower(), tone 160 
| 161 | 162 | def refine_syllables(syllables): 163 | tones = [] 164 | phonemes = [] 165 | for phn_list in syllables: 166 | for i in range(len(phn_list)): 167 | phn = phn_list[i] 168 | phn, tone = refine_ph(phn) 169 | phonemes.append(phn) 170 | tones.append(tone) 171 | return phonemes, tones 172 | 173 | 174 | def text_normalize(text): 175 | # todo: eng text normalize 176 | return text 177 | 178 | 179 | def g2p(text): 180 | phones = [] 181 | tones = [] 182 | words = re.split(r"([,;.\-\?\!\s+])", text) 183 | for w in words: 184 | if w.upper() in eng_dict: 185 | phns, tns = refine_syllables(eng_dict[w.upper()]) 186 | phones += phns 187 | tones += tns 188 | else: 189 | phone_list = list(filter(lambda p: p != " ", _g2p(w))) 190 | for ph in phone_list: 191 | if ph in arpa: 192 | ph, tn = refine_ph(ph) 193 | phones.append(ph) 194 | tones.append(tn) 195 | else: 196 | phones.append(ph) 197 | tones.append(0) 198 | # todo: implement word2ph 199 | word2ph = [1 for i in phones] 200 | 201 | phones = [post_replace_ph(i) for i in phones] 202 | return phones, tones, word2ph 203 | 204 | 205 | if __name__ == "__main__": 206 | # print(get_dict()) 207 | # print(eng_word_to_phoneme("hello")) 208 | print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder.")) 209 | # all_phones = set() 210 | # for k, syllables in eng_dict.items(): 211 | # for group in syllables: 212 | # for ph in group: 213 | # all_phones.add(ph) 214 | # print(all_phones) 215 | -------------------------------------------------------------------------------- /emo_gen.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.utils.data import Dataset 4 | from torch.utils.data import DataLoader 5 | from transformers import Wav2Vec2Processor 6 | from transformers.models.wav2vec2.modeling_wav2vec2 import ( 7 | Wav2Vec2Model, 8 | Wav2Vec2PreTrainedModel, 9 | ) 10 | import librosa 11 | import numpy as np 12 | import argparse 13 | from config import config 14 | import utils 15 | import os 16 | from tqdm import tqdm 17 | 18 | 19 | class RegressionHead(nn.Module): 20 | r"""Classification head.""" 21 | 22 | def __init__(self, config): 23 | super().__init__() 24 | 25 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 26 | self.dropout = nn.Dropout(config.final_dropout) 27 | self.out_proj = nn.Linear(config.hidden_size, config.num_labels) 28 | 29 | def forward(self, features, **kwargs): 30 | x = features 31 | x = self.dropout(x) 32 | x = self.dense(x) 33 | x = torch.tanh(x) 34 | x = self.dropout(x) 35 | x = self.out_proj(x) 36 | 37 | return x 38 | 39 | 40 | class EmotionModel(Wav2Vec2PreTrainedModel): 41 | r"""Speech emotion classifier.""" 42 | 43 | def __init__(self, config): 44 | super().__init__(config) 45 | 46 | self.config = config 47 | self.wav2vec2 = Wav2Vec2Model(config) 48 | self.classifier = RegressionHead(config) 49 | self.init_weights() 50 | 51 | def forward( 52 | self, 53 | input_values, 54 | ): 55 | outputs = self.wav2vec2(input_values) 56 | hidden_states = outputs[0] 57 | hidden_states = torch.mean(hidden_states, dim=1) 58 | logits = self.classifier(hidden_states) 59 | 60 | return hidden_states, logits 61 | 62 | 63 | class AudioDataset(Dataset): 64 | def __init__(self, list_of_wav_files, sr, processor): 65 | self.list_of_wav_files = list_of_wav_files 66 | self.processor = processor 67 | self.sr = sr 68 | 69 | def __len__(self): 70 | return len(self.list_of_wav_files) 71 | 72 | def __getitem__(self, idx): 73 | wav_file = 
self.list_of_wav_files[idx] 74 | audio_data, _ = librosa.load(wav_file, sr=self.sr) 75 | processed_data = self.processor(audio_data, sampling_rate=self.sr)[ 76 | "input_values" 77 | ][0] 78 | return torch.from_numpy(processed_data) 79 | 80 | 81 | model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim" 82 | processor = Wav2Vec2Processor.from_pretrained(model_name) 83 | model = EmotionModel.from_pretrained(model_name) 84 | 85 | 86 | def process_func( 87 | x: np.ndarray, 88 | sampling_rate: int, 89 | model: EmotionModel, 90 | processor: Wav2Vec2Processor, 91 | device: str, 92 | embeddings: bool = False, 93 | ) -> np.ndarray: 94 | r"""Predict emotions or extract embeddings from raw audio signal.""" 95 | model = model.to(device) 96 | y = processor(x, sampling_rate=sampling_rate) 97 | y = y["input_values"][0] 98 | y = torch.from_numpy(y).unsqueeze(0).to(device) 99 | 100 | # run through model 101 | with torch.no_grad(): 102 | y = model(y)[0 if embeddings else 1] 103 | 104 | # convert to numpy 105 | y = y.detach().cpu().numpy() 106 | 107 | return y 108 | 109 | 110 | def get_emo(path): 111 | wav, sr = librosa.load(path, sr=16000)  # sr must be passed as a keyword on librosa >= 0.10 112 | device = config.bert_gen_config.device 113 | return process_func( 114 | np.expand_dims(wav, 0).astype(np.float32),  # np.float was removed in NumPy 1.24 115 | sr, 116 | model, 117 | processor, 118 | device, 119 | embeddings=True, 120 | ).squeeze(0) 121 | 122 | 123 | if __name__ == "__main__": 124 | parser = argparse.ArgumentParser() 125 | parser.add_argument( 126 | "-c", "--config", type=str, default=config.bert_gen_config.config_path 127 | ) 128 | parser.add_argument( 129 | "--num_processes", type=int, default=config.bert_gen_config.num_processes 130 | ) 131 | args, _ = parser.parse_known_args() 132 | config_path = args.config 133 | hps = utils.get_hparams_from_file(config_path) 134 | 135 | device = config.bert_gen_config.device 136 | 137 | model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim" 138 | processor = ( 139 | Wav2Vec2Processor.from_pretrained(model_name) 140 | if processor is None 141 | else processor 142 | ) 143 | model = ( 144 | EmotionModel.from_pretrained(model_name).to(device) 145 | if model is None 146 | else model.to(device) 147 | ) 148 | 149 | lines = [] 150 | with open(hps.data.training_files, encoding="utf-8") as f: 151 | lines.extend(f.readlines()) 152 | 153 | with open(hps.data.validation_files, encoding="utf-8") as f: 154 | lines.extend(f.readlines()) 155 | 156 | wavnames = [line.split("|")[0] for line in lines] 157 | dataset = AudioDataset(wavnames, 16000, processor) 158 | data_loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=16) 159 | 160 | with torch.no_grad(): 161 | for i, data in tqdm(enumerate(data_loader), total=len(data_loader)): 162 | wavname = wavnames[i] 163 | emo_path = wavname.replace(".wav", ".emo.npy") 164 | if os.path.exists(emo_path): 165 | continue 166 | emb = model(data.to(device))[0].detach().cpu().numpy() 167 | np.save(emo_path, emb) 168 | 169 | print("Emo vec 生成完毕!") 170 | -------------------------------------------------------------------------------- /preprocess_text.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import defaultdict 3 | from random import shuffle 4 | from typing import Optional 5 | import os 6 | 7 | from tqdm import tqdm 8 | import click 9 | from text.cleaner import clean_text 10 | from config import config 11 | from infer import latest_version 12 | 13 | preprocess_text_config = config.preprocess_text_config 14 | 15 | 16 
| @click.command() 17 | @click.option( 18 | "--transcription-path", 19 | default=preprocess_text_config.transcription_path, 20 | type=click.Path(exists=True, file_okay=True, dir_okay=False), 21 | ) 22 | @click.option("--cleaned-path", default=preprocess_text_config.cleaned_path) 23 | @click.option("--train-path", default=preprocess_text_config.train_path) 24 | @click.option("--val-path", default=preprocess_text_config.val_path) 25 | @click.option( 26 | "--config-path", 27 | default=preprocess_text_config.config_path, 28 | type=click.Path(exists=True, file_okay=True, dir_okay=False), 29 | ) 30 | @click.option("--val-per-spk", default=preprocess_text_config.val_per_spk) 31 | @click.option("--max-val-total", default=preprocess_text_config.max_val_total) 32 | @click.option("--clean/--no-clean", default=preprocess_text_config.clean) 33 | @click.option("-y", "--yml_config") 34 | def preprocess( 35 | transcription_path: str, 36 | cleaned_path: Optional[str], 37 | train_path: str, 38 | val_path: str, 39 | config_path: str, 40 | val_per_spk: int, 41 | max_val_total: int, 42 | clean: bool, 43 | yml_config: str, # 这个不要删 44 | ): 45 | if cleaned_path == "" or cleaned_path is None: 46 | cleaned_path = transcription_path + ".cleaned" 47 | 48 | if clean: 49 | with open(cleaned_path, "w", encoding="utf-8") as out_file: 50 | with open(transcription_path, "r", encoding="utf-8") as trans_file: 51 | lines = trans_file.readlines() 52 | # print(lines, ' ', len(lines)) 53 | if len(lines) != 0: 54 | for line in tqdm(lines): 55 | try: 56 | utt, spk, language, text = line.strip().split("|") 57 | norm_text, phones, tones, word2ph = clean_text( 58 | text, language 59 | ) 60 | out_file.write( 61 | "{}|{}|{}|{}|{}|{}|{}\n".format( 62 | utt, 63 | spk, 64 | language, 65 | norm_text, 66 | " ".join(phones), 67 | " ".join([str(i) for i in tones]), 68 | " ".join([str(i) for i in word2ph]), 69 | ) 70 | ) 71 | except Exception as e: 72 | print(line) 73 | print(f"生成训练集和验证集时发生错误!, 详细信息:\n{e}") 74 | 75 | transcription_path = cleaned_path 76 | spk_utt_map = defaultdict(list) 77 | spk_id_map = {} 78 | current_sid = 0 79 | 80 | with open(transcription_path, "r", encoding="utf-8") as f: 81 | audioPaths = set() 82 | countSame = 0 83 | countNotFound = 0 84 | for line in f.readlines(): 85 | utt, spk, language, text, phones, tones, word2ph = line.strip().split("|") 86 | if utt in audioPaths: 87 | # 过滤数据集错误:相同的音频匹配多个文本,导致后续bert出问题 88 | print(f"重复音频文本:{line}") 89 | countSame += 1 90 | continue 91 | if not os.path.isfile(utt): 92 | # 过滤数据集错误:不存在对应音频 93 | print(f"没有找到对应的音频:{utt}") 94 | countNotFound += 1 95 | continue 96 | audioPaths.add(utt) 97 | spk_utt_map[spk].append(line) 98 | 99 | if spk not in spk_id_map.keys(): 100 | spk_id_map[spk] = current_sid 101 | current_sid += 1 102 | print(f"总重复音频数:{countSame},总未找到的音频数:{countNotFound}") 103 | 104 | train_list = [] 105 | val_list = [] 106 | 107 | for spk, utts in spk_utt_map.items(): 108 | shuffle(utts) 109 | val_list += utts[:val_per_spk] 110 | train_list += utts[val_per_spk:] 111 | 112 | if len(val_list) > max_val_total: 113 | train_list += val_list[max_val_total:] 114 | val_list = val_list[:max_val_total] 115 | 116 | with open(train_path, "w", encoding="utf-8") as f: 117 | for line in train_list: 118 | f.write(line) 119 | 120 | with open(val_path, "w", encoding="utf-8") as f: 121 | for line in val_list: 122 | f.write(line) 123 | 124 | json_config = json.load(open(config_path, encoding="utf-8")) 125 | json_config["data"]["spk2id"] = spk_id_map 126 | json_config['data']["n_speakers"] = 
current_sid# 127 | # 新增写入:写入训练版本、数据集路径 128 | json_config["version"] = latest_version 129 | json_config["data"]["training_files"] = os.path.normpath(train_path).replace( 130 | "\\", "/" 131 | ) 132 | json_config["data"]["validation_files"] = os.path.normpath(val_path).replace( 133 | "\\", "/" 134 | ) 135 | with open(config_path, "w", encoding="utf-8") as f: 136 | json.dump(json_config, f, indent=2, ensure_ascii=False) 137 | print("训练集和验证集生成完成!") 138 | 139 | 140 | if __name__ == "__main__": 141 | preprocess() 142 | -------------------------------------------------------------------------------- /short_audio_transcribe.py: -------------------------------------------------------------------------------- 1 | import whisper 2 | import os 3 | import json 4 | import torchaudio 5 | import argparse 6 | import torch 7 | from config import config 8 | lang2token = { 9 | 'zh': "ZH|", 10 | 'ja': "JP|", 11 | "en": "EN|", 12 | } 13 | def transcribe_one(audio_path): 14 | # load audio and pad/trim it to fit 30 seconds 15 | audio = whisper.load_audio(audio_path) 16 | audio = whisper.pad_or_trim(audio) 17 | 18 | # make log-Mel spectrogram and move to the same device as the model 19 | mel = whisper.log_mel_spectrogram(audio).to(model.device) 20 | 21 | # detect the spoken language 22 | _, probs = model.detect_language(mel) 23 | print(f"Detected language: {max(probs, key=probs.get)}") 24 | lang = max(probs, key=probs.get) 25 | # decode the audio 26 | options = whisper.DecodingOptions(beam_size=5) 27 | result = whisper.decode(model, mel, options) 28 | 29 | # print the recognized text 30 | print(result.text) 31 | return lang, result.text 32 | if __name__ == "__main__": 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument("--languages", default="CJ") 35 | parser.add_argument("--whisper_size", default="medium") 36 | args = parser.parse_args() 37 | if args.languages == "CJE": 38 | lang2token = { 39 | 'zh': "ZH|", 40 | 'ja': "JP|", 41 | "en": "EN|", 42 | } 43 | elif args.languages == "CJ": 44 | lang2token = { 45 | 'zh': "ZH|", 46 | 'ja': "JP|", 47 | } 48 | elif args.languages == "C": 49 | lang2token = { 50 | 'zh': "ZH|", 51 | } 52 | assert (torch.cuda.is_available()), "Please enable GPU in order to run Whisper!" 
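# Sketch (not in the original script): whisper.load_model() also accepts a `device` argument, so the hard CUDA requirement above could be relaxed with a fallback such as device = "cuda" if torch.cuda.is_available() else "cpu" followed by model = whisper.load_model(args.whisper_size, device=device); CPU transcription works, it is just far slower, which is presumably why this script insists on a GPU.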
53 | model = whisper.load_model(args.whisper_size) 54 | #parent_dir = "./custom_character_voice/" 55 | parent_dir=config.resample_config.in_dir 56 | print(parent_dir) 57 | speaker_names = list(os.walk(parent_dir))[0][1] 58 | speaker_annos = [] 59 | total_files = sum([len(files) for r, d, files in os.walk(parent_dir)]) 60 | # resample audios 61 | # 2023/4/21: Get the target sampling rate 62 | with open(config.train_ms_config.config_path,'r', encoding='utf-8') as f: 63 | hps = json.load(f) 64 | target_sr = hps['data']['sampling_rate'] 65 | processed_files = 0 66 | for speaker in speaker_names: 67 | for i, wavfile in enumerate(list(os.walk(os.path.join(parent_dir,speaker)))[0][2]): 68 | # try to load file as audio 69 | if wavfile.startswith("processed_"): 70 | continue 71 | try: 72 | wav, sr = torchaudio.load(parent_dir + "/" + speaker + "/" + wavfile, frame_offset=0, num_frames=-1, normalize=True, 73 | channels_first=True) 74 | wav = wav.mean(dim=0).unsqueeze(0) 75 | if sr != target_sr: 76 | wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav) 77 | if wav.shape[1] / sr > 20: 78 | print(f"{wavfile} too long, ignoring\n") 79 | save_path = parent_dir+"/"+ speaker + "/" + f"processed_{i}.wav" 80 | torchaudio.save(save_path, wav, target_sr, channels_first=True) 81 | # transcribe text 82 | lang, text = transcribe_one(save_path) 83 | if lang not in list(lang2token.keys()): 84 | print(f"{lang} not supported, ignoring\n") 85 | continue 86 | #text = "ZH|" + text + "\n" 87 | text = lang2token[lang] + text + "\n" 88 | speaker_annos.append(save_path + "|" + speaker + "|" + text) 89 | 90 | processed_files += 1 91 | print(f"Processed: {processed_files}/{total_files}") 92 | except Exception as e: 93 | print(e) 94 | continue 95 | 96 | # # clean annotation 97 | # import argparse 98 | # import text 99 | # from utils import load_filepaths_and_text 100 | # for i, line in enumerate(speaker_annos): 101 | # path, sid, txt = line.split("|") 102 | # cleaned_text = text._clean_text(txt, ["cjke_cleaners2"]) 103 | # cleaned_text += "\n" if not cleaned_text.endswith("\n") else "" 104 | # speaker_annos[i] = path + "|" + sid + "|" + cleaned_text 105 | # write into annotation 106 | if len(speaker_annos) == 0: 107 | print("Warning: no short audios found, this IS expected if you have only uploaded long audios, videos or video links.") 108 | print("this IS NOT expected if you have uploaded a zip file of short audios. 
Please check your file structure or make sure your audio language is supported.") 109 | with open(config.preprocess_text_config.transcription_path, 'w', encoding='utf-8') as f: 110 | for line in speaker_annos: 111 | f.write(line) 112 | 113 | # import json 114 | # # generate new config 115 | # with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f: 116 | # hps = json.load(f) 117 | # # modify n_speakers 118 | # hps['data']["n_speakers"] = 1000 + len(speaker2id) 119 | # # add speaker names 120 | # for speaker in speaker_names: 121 | # hps['speakers'][speaker] = speaker2id[speaker] 122 | # # save modified config 123 | # with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f: 124 | # json.dump(hps, f, indent=2) 125 | # print("finished") 126 | -------------------------------------------------------------------------------- /commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.nn import functional as F 4 | 5 | 6 | def init_weights(m, mean=0.0, std=0.01): 7 | classname = m.__class__.__name__ 8 | if classname.find("Conv") != -1: 9 | m.weight.data.normal_(mean, std) 10 | 11 | 12 | def get_padding(kernel_size, dilation=1): 13 | return int((kernel_size * dilation - dilation) / 2) 14 | 15 | 16 | def convert_pad_shape(pad_shape): 17 | layer = pad_shape[::-1] 18 | pad_shape = [item for sublist in layer for item in sublist] 19 | return pad_shape 20 | 21 | 22 | def intersperse(lst, item): 23 | result = [item] * (len(lst) * 2 + 1) 24 | result[1::2] = lst 25 | return result 26 | 27 | 28 | def kl_divergence(m_p, logs_p, m_q, logs_q): 29 | """KL(P||Q)""" 30 | kl = (logs_q - logs_p) - 0.5 31 | kl += ( 32 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 33 | ) 34 | return kl 35 | 36 | 37 | def rand_gumbel(shape): 38 | """Sample from the Gumbel distribution, protect from overflows.""" 39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 40 | return -torch.log(-torch.log(uniform_samples)) 41 | 42 | 43 | def rand_gumbel_like(x): 44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 45 | return g 46 | 47 | 48 | def slice_segments(x, ids_str, segment_size=4): 49 | ret = torch.zeros_like(x[:, :, :segment_size]) 50 | for i in range(x.size(0)): 51 | idx_str = ids_str[i] 52 | idx_end = idx_str + segment_size 53 | if idx_str < 0: 54 | i1 = x.size(2) + idx_str 55 | r1 = x[i, :, i1:] 56 | r2 = x[i, :, :idx_end] 57 | ret[i] = torch.cat([r1, r2], dim=1) 58 | else: 59 | ret[i] = x[i, :, idx_str:idx_end] 60 | return ret 61 | 62 | 63 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 64 | b, d, t = x.size() 65 | if x_lengths is None: 66 | x_lengths = t 67 | ids_str_max = x_lengths - segment_size + 1 68 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 69 | ret = slice_segments(x, ids_str, segment_size) 70 | return ret, ids_str 71 | 72 | 73 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 74 | position = torch.arange(length, dtype=torch.float) 75 | num_timescales = channels // 2 76 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 77 | num_timescales - 1 78 | ) 79 | inv_timescales = min_timescale * torch.exp( 80 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 81 | ) 82 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 83 | signal = torch.cat([torch.sin(scaled_time), 
torch.cos(scaled_time)], 0) 84 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 85 | signal = signal.view(1, channels, length) 86 | return signal 87 | 88 | 89 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 90 | b, channels, length = x.size() 91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 92 | return x + signal.to(dtype=x.dtype, device=x.device) 93 | 94 | 95 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 96 | b, channels, length = x.size() 97 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 98 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 99 | 100 | 101 | def subsequent_mask(length): 102 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 103 | return mask 104 | 105 | 106 | @torch.jit.script 107 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 108 | n_channels_int = n_channels[0] 109 | in_act = input_a + input_b 110 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 111 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 112 | acts = t_act * s_act 113 | return acts 114 | 115 | 116 | def convert_pad_shape(pad_shape): 117 | layer = pad_shape[::-1] 118 | pad_shape = [item for sublist in layer for item in sublist] 119 | return pad_shape 120 | 121 | 122 | def shift_1d(x): 123 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 124 | return x 125 | 126 | 127 | def sequence_mask(length, max_length=None): 128 | if max_length is None: 129 | max_length = length.max() 130 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 131 | return x.unsqueeze(0) < length.unsqueeze(1) 132 | 133 | 134 | def generate_path(duration, mask): 135 | """ 136 | duration: [b, 1, t_x] 137 | mask: [b, 1, t_y, t_x] 138 | """ 139 | 140 | b, _, t_y, t_x = mask.shape 141 | cum_duration = torch.cumsum(duration, -1) 142 | 143 | cum_duration_flat = cum_duration.view(b * t_x) 144 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 145 | path = path.view(b, t_x, t_y) 146 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 147 | path = path.unsqueeze(1).transpose(2, 3) * mask 148 | return path 149 | 150 | 151 | def clip_grad_value_(parameters, clip_value, norm_type=2): 152 | if isinstance(parameters, torch.Tensor): 153 | parameters = [parameters] 154 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 155 | norm_type = float(norm_type) 156 | if clip_value is not None: 157 | clip_value = float(clip_value) 158 | 159 | total_norm = 0 160 | for p in parameters: 161 | param_norm = p.grad.data.norm(norm_type) 162 | total_norm += param_norm.item() ** norm_type 163 | if clip_value is not None: 164 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 165 | total_norm = total_norm ** (1.0 / norm_type) 166 | return total_norm 167 | -------------------------------------------------------------------------------- /tools/sentence.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import regex as re 4 | 5 | from tools.classify_language import classify_language, split_alpha_nonalpha 6 | 7 | 8 | def check_is_none(item) -> bool: 9 | """none -> True, not none -> False""" 10 | return ( 11 | item is None 12 | or (isinstance(item, str) and str(item).isspace()) 13 | or str(item) == "" 14 | ) 15 | 16 | 17 | def markup_language(text: str, target_languages: list = None) -> str: 18 | pattern = ( 19 | 
r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`" 20 | r"\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」" 21 | r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+" 22 | ) 23 | sentences = re.split(pattern, text) 24 | 25 | pre_lang = "" 26 | p = 0 27 | 28 | sorted_target_languages = sorted(target_languages) 29 | if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]: 30 | new_sentences = [] 31 | for sentence in sentences: 32 | new_sentences.extend(split_alpha_nonalpha(sentence)) 33 | sentences = new_sentences 34 | 35 | for sentence in sentences: 36 | if check_is_none(sentence): 37 | continue 38 | 39 | lang = classify_language(sentence, target_languages) 40 | 41 | if pre_lang == "": 42 | text = text[:p] + text[p:].replace( 43 | sentence, f"[{lang.upper()}]{sentence}", 1 44 | ) 45 | p += len(f"[{lang.upper()}]") 46 | elif pre_lang != lang: 47 | text = text[:p] + text[p:].replace( 48 | sentence, f"[{pre_lang.upper()}][{lang.upper()}]{sentence}", 1 49 | ) 50 | p += len(f"[{pre_lang.upper()}][{lang.upper()}]") 51 | pre_lang = lang 52 | p += text[p:].index(sentence) + len(sentence) 53 | text += f"[{pre_lang.upper()}]" 54 | 55 | return text 56 | 57 | 58 | def split_by_language(text: str, target_languages: list = None) -> list: 59 | pattern = ( 60 | r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`" 61 | r"\!?\。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」" 62 | r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+" 63 | ) 64 | sentences = re.split(pattern, text) 65 | 66 | pre_lang = "" 67 | start = 0 68 | end = 0 69 | sentences_list = [] 70 | 71 | sorted_target_languages = sorted(target_languages) 72 | if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]: 73 | new_sentences = [] 74 | for sentence in sentences: 75 | new_sentences.extend(split_alpha_nonalpha(sentence)) 76 | sentences = new_sentences 77 | 78 | for sentence in sentences: 79 | if check_is_none(sentence): 80 | continue 81 | 82 | lang = classify_language(sentence, target_languages) 83 | 84 | end += text[end:].index(sentence) 85 | if pre_lang != "" and pre_lang != lang: 86 | sentences_list.append((text[start:end], pre_lang)) 87 | start = end 88 | end += len(sentence) 89 | pre_lang = lang 90 | sentences_list.append((text[start:], pre_lang)) 91 | 92 | return sentences_list 93 | 94 | 95 | def sentence_split(text: str, max: int) -> list: 96 | pattern = r"[!(),—+\-.:;??。,、;:]+" 97 | sentences = re.split(pattern, text) 98 | discarded_chars = re.findall(pattern, text) 99 | 100 | sentences_list, count, p = [], 0, 0 101 | 102 | # 按被分割的符号遍历 103 | for i, discarded_chars in enumerate(discarded_chars): 104 | count += len(sentences[i]) + len(discarded_chars) 105 | if count >= max: 106 | sentences_list.append(text[p : p + count].strip()) 107 | p += count 108 | count = 0 109 | 110 | # 加入最后剩余的文本 111 | if p < len(text): 112 | sentences_list.append(text[p:]) 113 | 114 | return sentences_list 115 | 116 | 117 | def sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None): 118 | # 如果该speaker只支持一种语言 119 | if speaker_lang is not None and len(speaker_lang) == 1: 120 | if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]: 121 | logging.debug( 122 | f'lang "{lang}" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}' 123 | ) 124 | lang = speaker_lang[0] 125 | 126 | sentences_list = [] 127 | if lang.upper() != "MIX": 128 | if max <= 0: 129 | sentences_list.append( 130 | markup_language(text, speaker_lang) 131 | if lang.upper() == "AUTO" 132 | else 
f"[{lang.upper()}]{text}[{lang.upper()}]" 133 | ) 134 | else: 135 | for i in sentence_split(text, max): 136 | if check_is_none(i): 137 | continue 138 | sentences_list.append( 139 | markup_language(i, speaker_lang) 140 | if lang.upper() == "AUTO" 141 | else f"[{lang.upper()}]{i}[{lang.upper()}]" 142 | ) 143 | else: 144 | sentences_list.append(text) 145 | 146 | for i in sentences_list: 147 | logging.debug(i) 148 | 149 | return sentences_list 150 | 151 | 152 | if __name__ == "__main__": 153 | text = "这几天心里颇不宁静。今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。我悄悄地披了大衫,带上门出去。" 154 | print(markup_language(text, target_languages=None)) 155 | print(sentence_split(text, max=50)) 156 | print(sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None)) 157 | text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは自動ラベリングのテスト用テキストです.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了GAN Duration predictor和transformer flow,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。" 158 | print(split_by_language(text, ["zh", "ja", "en"])) 159 | -------------------------------------------------------------------------------- /oldVersion/V111/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 1.1.1版本兼容 3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.1.1 4 | """ 5 | import torch 6 | import commons 7 | from .text.cleaner import clean_text, clean_text_fix 8 | from .text import cleaned_text_to_sequence 9 | from .text import get_bert, get_bert_fix 10 | 11 | 12 | def get_text(text, language_str, hps, device): 13 | norm_text, phone, tone, word2ph = clean_text(text, language_str) 14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 15 | 16 | if hps.data.add_blank: 17 | phone = commons.intersperse(phone, 0) 18 | tone = commons.intersperse(tone, 0) 19 | language = commons.intersperse(language, 0) 20 | for i in range(len(word2ph)): 21 | word2ph[i] = word2ph[i] * 2 22 | word2ph[0] += 1 23 | bert = get_bert(norm_text, word2ph, language_str, device) 24 | del word2ph 25 | assert bert.shape[-1] == len(phone), phone 26 | 27 | if language_str == "ZH": 28 | bert = bert 29 | ja_bert = torch.zeros(768, len(phone)) 30 | elif language_str == "JP": 31 | ja_bert = bert 32 | bert = torch.zeros(1024, len(phone)) 33 | else: 34 | bert = torch.zeros(1024, len(phone)) 35 | ja_bert = torch.zeros(768, len(phone)) 36 | 37 | assert bert.shape[-1] == len( 38 | phone 39 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}" 40 | 41 | phone = torch.LongTensor(phone) 42 | tone = torch.LongTensor(tone) 43 | language = torch.LongTensor(language) 44 | return bert, ja_bert, phone, tone, language 45 | 46 | 47 | def get_text_fix(text, language_str, hps, device): 48 | norm_text, phone, tone, word2ph = clean_text_fix(text, language_str) 49 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 50 | 51 | if hps.data.add_blank: 52 | phone = commons.intersperse(phone, 0) 53 | tone = commons.intersperse(tone, 0) 54 | language = commons.intersperse(language, 0) 55 | for i in range(len(word2ph)): 56 | word2ph[i] = word2ph[i] * 2 57 | word2ph[0] += 1 58 | bert = get_bert_fix(norm_text, word2ph, language_str, device) 59 | del word2ph 60 | assert bert.shape[-1] == len(phone), phone 61 | 62 | if language_str == "ZH": 63 | bert = bert 64 | ja_bert = torch.zeros(768, len(phone)) 65 | elif language_str == "JP": 66 | ja_bert = bert 67 | bert = torch.zeros(1024, len(phone)) 68 | else: 69 | bert = torch.zeros(1024, 
len(phone)) 70 | ja_bert = torch.zeros(768, len(phone)) 71 | 72 | assert bert.shape[-1] == len( 73 | phone 74 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}" 75 | 76 | phone = torch.LongTensor(phone) 77 | tone = torch.LongTensor(tone) 78 | language = torch.LongTensor(language) 79 | return bert, ja_bert, phone, tone, language 80 | 81 | 82 | def infer( 83 | text, 84 | sdp_ratio, 85 | noise_scale, 86 | noise_scale_w, 87 | length_scale, 88 | sid, 89 | language, 90 | hps, 91 | net_g, 92 | device, 93 | ): 94 | bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps, device) 95 | with torch.no_grad(): 96 | x_tst = phones.to(device).unsqueeze(0) 97 | tones = tones.to(device).unsqueeze(0) 98 | lang_ids = lang_ids.to(device).unsqueeze(0) 99 | bert = bert.to(device).unsqueeze(0) 100 | ja_bert = ja_bert.to(device).unsqueeze(0) 101 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 102 | del phones 103 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) 104 | audio = ( 105 | net_g.infer( 106 | x_tst, 107 | x_tst_lengths, 108 | speakers, 109 | tones, 110 | lang_ids, 111 | bert, 112 | ja_bert, 113 | sdp_ratio=sdp_ratio, 114 | noise_scale=noise_scale, 115 | noise_scale_w=noise_scale_w, 116 | length_scale=length_scale, 117 | )[0][0, 0] 118 | .data.cpu() 119 | .float() 120 | .numpy() 121 | ) 122 | del x_tst, x_tst_lengths, speakers, tones, lang_ids, bert, ja_bert 123 | if torch.cuda.is_available(): 124 | torch.cuda.empty_cache() 125 | return audio 126 | 127 | 128 | def infer_fix( 129 | text, 130 | sdp_ratio, 131 | noise_scale, 132 | noise_scale_w, 133 | length_scale, 134 | sid, 135 | language, 136 | hps, 137 | net_g, 138 | device, 139 | ): 140 | bert, ja_bert, phones, tones, lang_ids = get_text_fix(text, language, hps, device) 141 | with torch.no_grad(): 142 | x_tst = phones.to(device).unsqueeze(0) 143 | tones = tones.to(device).unsqueeze(0) 144 | lang_ids = lang_ids.to(device).unsqueeze(0) 145 | bert = bert.to(device).unsqueeze(0) 146 | ja_bert = ja_bert.to(device).unsqueeze(0) 147 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 148 | del phones 149 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) 150 | audio = ( 151 | net_g.infer( 152 | x_tst, 153 | x_tst_lengths, 154 | speakers, 155 | tones, 156 | lang_ids, 157 | bert, 158 | ja_bert, 159 | sdp_ratio=sdp_ratio, 160 | noise_scale=noise_scale, 161 | noise_scale_w=noise_scale_w, 162 | length_scale=length_scale, 163 | )[0][0, 0] 164 | .data.cpu() 165 | .float() 166 | .numpy() 167 | ) 168 | del x_tst, x_tst_lengths, speakers, tones, lang_ids, bert, ja_bert 169 | if torch.cuda.is_available(): 170 | torch.cuda.empty_cache() 171 | return audio 172 | -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, Response 2 | from io import BytesIO 3 | import torch 4 | from av import open as avopen 5 | from typing import Dict, List 6 | import re_matching 7 | import utils 8 | from infer import infer, get_net_g, latest_version 9 | from scipy.io import wavfile 10 | import gradio as gr 11 | from config import config 12 | import numpy as np  # used below for the silence buffer and concatenation 13 | # Flask Init 14 | app = Flask(__name__) 15 | app.config["JSON_AS_ASCII"] = False 16 | 17 | 18 | def replace_punctuation(text, i=2): 19 | punctuation = ",。?!" 
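# Each mark in `punctuation` is repeated `i` times, presumably so the synthesizer holds the pause at that position longer; with the default i=2, replace_punctuation("你好,世界。") returns "你好,,世界。。".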
20 | for char in punctuation: 21 | text = text.replace(char, char * i) 22 | return text 23 | 24 | 25 | def wav2(i, o, format): 26 | inp = avopen(i, "rb") 27 | out = avopen(o, "wb", format=format) 28 | if format == "ogg": 29 | format = "libvorbis" 30 | 31 | ostream = out.add_stream(format) 32 | 33 | for frame in inp.decode(audio=0): 34 | for p in ostream.encode(frame): 35 | out.mux(p) 36 | 37 | for p in ostream.encode(None): 38 | out.mux(p) 39 | 40 | out.close() 41 | inp.close() 42 | 43 | 44 | net_g_List = [] 45 | hps_List = [] 46 | # Model-to-speaker dictionaries 47 | # Usage: chr_name = chrsMap[model_id][chr_id] 48 | chrsMap: List[Dict[int, str]] = list() 49 | 50 | # Load the models 51 | models = config.server_config.models 52 | for model in models: 53 | hps_List.append(utils.get_hparams_from_file(model["config"])) 54 | # Register this model's speaker dictionary 55 | chrsMap.append(dict()) 56 | for name, cid in hps_List[-1].data.spk2id.items(): 57 | chrsMap[-1][cid] = name 58 | version = ( 59 | hps_List[-1].version if hasattr(hps_List[-1], "version") else latest_version 60 | ) 61 | net_g_List.append( 62 | get_net_g( 63 | model_path=model["model"], 64 | version=version, 65 | device=model["device"], 66 | hps=hps_List[-1], 67 | ) 68 | ) 69 | 70 | 71 | def generate_audio( 72 | slices, 73 | sdp_ratio, 74 | noise_scale, 75 | noise_scale_w, 76 | length_scale, 77 | speaker, 78 | language, model_id,  # model_id: index of the loaded model in net_g_List / hps_List 79 | ): 80 | audio_list = [] 81 | silence = np.zeros(hps_List[model_id].data.sampling_rate // 2, dtype=np.int16) 82 | with torch.no_grad(): 83 | for piece in slices: 84 | audio = infer( 85 | piece, 86 | sdp_ratio=sdp_ratio, 87 | noise_scale=noise_scale, 88 | noise_scale_w=noise_scale_w, 89 | length_scale=length_scale, 90 | sid=speaker, 91 | language=language, 92 | hps=hps_List[model_id], 93 | net_g=net_g_List[model_id], 94 | device=models[model_id]["device"], 95 | ) 96 | audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio) 97 | audio_list.append(audio16bit) 98 | audio_list.append(silence)  # append the silence gap to the list 99 | return audio_list 100 | 101 | 102 | @app.route("/") 103 | def main(): 104 | try: 105 | model = int(request.args.get("model")) 106 | speaker = request.args.get("speaker", "")  # speaker name 107 | speaker_id = request.args.get("speaker_id", None)  # or give the numeric id directly 108 | text = request.args.get("text").replace("/n", "") 109 | sdp_ratio = float(request.args.get("sdp_ratio", 0.2)) 110 | noise = float(request.args.get("noise", 0.5)) 111 | noisew = float(request.args.get("noisew", 0.6)) 112 | length = float(request.args.get("length", 1.2)) 113 | language = request.args.get("language") 114 | if length >= 2: 115 | return "Too big length" 116 | if len(text) >= 250: 117 | return "Too long text" 118 | fmt = request.args.get("format", "wav") 119 | if None in (speaker, text): 120 | return "Missing Parameter" 121 | if fmt not in ("mp3", "wav", "ogg"): 122 | return "Invalid Format" 123 | if language not in ("JP", "ZH", "EN", "mix"): 124 | return "Invalid language" 125 | except Exception: 126 | return "Invalid Parameter" 127 | 128 | if speaker_id is not None: 129 | if speaker_id.isdigit(): 130 | speaker = chrsMap[model][int(speaker_id)] 131 | audio_list = [] 132 | if language == "mix": 133 | bool_valid, str_valid = re_matching.validate_text(text) 134 | if not bool_valid: 135 | return str_valid, ( 136 | hps_List[model].data.sampling_rate, 137 | np.concatenate([np.zeros(hps_List[model].data.sampling_rate // 2)]), 138 | ) 139 | result = re_matching.text_matching(text) 140 | for one in result: 141 | _speaker = one.pop() 142 | for lang, content in one: 143 | audio_list.extend( 144 | generate_audio( 145 | content.split("|"), 146 | sdp_ratio, 147 | noise, 148 | noisew, 149 | length, 150 | 
_speaker, 151 | lang, model, 152 | ) 153 | ) 154 | else: 155 | audio_list.extend( 156 | generate_audio( 157 | text.split("|"), 158 | sdp_ratio, 159 | noise, 160 | noisew, 161 | length, 162 | speaker, 163 | language, model, 164 | ) 165 | ) 166 | 167 | audio_concat = np.concatenate(audio_list) 168 | with BytesIO() as wav: 169 | wavfile.write(wav, hps_List[model].data.sampling_rate, audio_concat) 170 | torch.cuda.empty_cache() 171 | if fmt == "wav": 172 | return Response(wav.getvalue(), mimetype="audio/wav") 173 | wav.seek(0, 0) 174 | with BytesIO() as ofp: 175 | wav2(wav, ofp, fmt) 176 | return Response( 177 | ofp.getvalue(), mimetype="audio/mpeg" if fmt == "mp3" else "audio/ogg" 178 | ) 179 | 180 | 181 | if __name__ == "__main__": 182 | app.run(host="0.0.0.0", port=config.server_config.port) 183 | -------------------------------------------------------------------------------- /text/opencpop-strict.txt: -------------------------------------------------------------------------------- 1 | a AA a 2 | ai AA ai 3 | an AA an 4 | ang AA ang 5 | ao AA ao 6 | ba b a 7 | bai b ai 8 | ban b an 9 | bang b ang 10 | bao b ao 11 | bei b ei 12 | ben b en 13 | beng b eng 14 | bi b i 15 | bian b ian 16 | biao b iao 17 | bie b ie 18 | bin b in 19 | bing b ing 20 | bo b o 21 | bu b u 22 | ca c a 23 | cai c ai 24 | can c an 25 | cang c ang 26 | cao c ao 27 | ce c e 28 | cei c ei 29 | cen c en 30 | ceng c eng 31 | cha ch a 32 | chai ch ai 33 | chan ch an 34 | chang ch ang 35 | chao ch ao 36 | che ch e 37 | chen ch en 38 | cheng ch eng 39 | chi ch ir 40 | chong ch ong 41 | chou ch ou 42 | chu ch u 43 | chua ch ua 44 | chuai ch uai 45 | chuan ch uan 46 | chuang ch uang 47 | chui ch ui 48 | chun ch un 49 | chuo ch uo 50 | ci c i0 51 | cong c ong 52 | cou c ou 53 | cu c u 54 | cuan c uan 55 | cui c ui 56 | cun c un 57 | cuo c uo 58 | da d a 59 | dai d ai 60 | dan d an 61 | dang d ang 62 | dao d ao 63 | de d e 64 | dei d ei 65 | den d en 66 | deng d eng 67 | di d i 68 | dia d ia 69 | dian d ian 70 | diao d iao 71 | die d ie 72 | ding d ing 73 | diu d iu 74 | dong d ong 75 | dou d ou 76 | du d u 77 | duan d uan 78 | dui d ui 79 | dun d un 80 | duo d uo 81 | e EE e 82 | ei EE ei 83 | en EE en 84 | eng EE eng 85 | er EE er 86 | fa f a 87 | fan f an 88 | fang f ang 89 | fei f ei 90 | fen f en 91 | feng f eng 92 | fo f o 93 | fou f ou 94 | fu f u 95 | ga g a 96 | gai g ai 97 | gan g an 98 | gang g ang 99 | gao g ao 100 | ge g e 101 | gei g ei 102 | gen g en 103 | geng g eng 104 | gong g ong 105 | gou g ou 106 | gu g u 107 | gua g ua 108 | guai g uai 109 | guan g uan 110 | guang g uang 111 | gui g ui 112 | gun g un 113 | guo g uo 114 | ha h a 115 | hai h ai 116 | han h an 117 | hang h ang 118 | hao h ao 119 | he h e 120 | hei h ei 121 | hen h en 122 | heng h eng 123 | hong h ong 124 | hou h ou 125 | hu h u 126 | hua h ua 127 | huai h uai 128 | huan h uan 129 | huang h uang 130 | hui h ui 131 | hun h un 132 | huo h uo 133 | ji j i 134 | jia j ia 135 | jian j ian 136 | jiang j iang 137 | jiao j iao 138 | jie j ie 139 | jin j in 140 | jing j ing 141 | jiong j iong 142 | jiu j iu 143 | ju j v 144 | jv j v 145 | juan j van 146 | jvan j van 147 | jue j ve 148 | jve j ve 149 | jun j vn 150 | jvn j vn 151 | ka k a 152 | kai k ai 153 | kan k an 154 | kang k ang 155 | kao k ao 156 | ke k e 157 | kei k ei 158 | ken k en 159 | keng k eng 160 | kong k ong 161 | kou k ou 162 | ku k u 163 | kua k ua 164 | kuai k uai 165 | kuan k uan 166 | kuang k uang 167 | kui k ui 168 | kun k un 169 | kuo k uo 170 | la l a 171 | lai l ai 
172 | lan l an 173 | lang l ang 174 | lao l ao 175 | le l e 176 | lei l ei 177 | leng l eng 178 | li l i 179 | lia l ia 180 | lian l ian 181 | liang l iang 182 | liao l iao 183 | lie l ie 184 | lin l in 185 | ling l ing 186 | liu l iu 187 | lo l o 188 | long l ong 189 | lou l ou 190 | lu l u 191 | luan l uan 192 | lun l un 193 | luo l uo 194 | lv l v 195 | lve l ve 196 | ma m a 197 | mai m ai 198 | man m an 199 | mang m ang 200 | mao m ao 201 | me m e 202 | mei m ei 203 | men m en 204 | meng m eng 205 | mi m i 206 | mian m ian 207 | miao m iao 208 | mie m ie 209 | min m in 210 | ming m ing 211 | miu m iu 212 | mo m o 213 | mou m ou 214 | mu m u 215 | na n a 216 | nai n ai 217 | nan n an 218 | nang n ang 219 | nao n ao 220 | ne n e 221 | nei n ei 222 | nen n en 223 | neng n eng 224 | ni n i 225 | nian n ian 226 | niang n iang 227 | niao n iao 228 | nie n ie 229 | nin n in 230 | ning n ing 231 | niu n iu 232 | nong n ong 233 | nou n ou 234 | nu n u 235 | nuan n uan 236 | nun n un 237 | nuo n uo 238 | nv n v 239 | nve n ve 240 | o OO o 241 | ou OO ou 242 | pa p a 243 | pai p ai 244 | pan p an 245 | pang p ang 246 | pao p ao 247 | pei p ei 248 | pen p en 249 | peng p eng 250 | pi p i 251 | pian p ian 252 | piao p iao 253 | pie p ie 254 | pin p in 255 | ping p ing 256 | po p o 257 | pou p ou 258 | pu p u 259 | qi q i 260 | qia q ia 261 | qian q ian 262 | qiang q iang 263 | qiao q iao 264 | qie q ie 265 | qin q in 266 | qing q ing 267 | qiong q iong 268 | qiu q iu 269 | qu q v 270 | qv q v 271 | quan q van 272 | qvan q van 273 | que q ve 274 | qve q ve 275 | qun q vn 276 | qvn q vn 277 | ran r an 278 | rang r ang 279 | rao r ao 280 | re r e 281 | ren r en 282 | reng r eng 283 | ri r ir 284 | rong r ong 285 | rou r ou 286 | ru r u 287 | rua r ua 288 | ruan r uan 289 | rui r ui 290 | run r un 291 | ruo r uo 292 | sa s a 293 | sai s ai 294 | san s an 295 | sang s ang 296 | sao s ao 297 | se s e 298 | sen s en 299 | seng s eng 300 | sha sh a 301 | shai sh ai 302 | shan sh an 303 | shang sh ang 304 | shao sh ao 305 | she sh e 306 | shei sh ei 307 | shen sh en 308 | sheng sh eng 309 | shi sh ir 310 | shou sh ou 311 | shu sh u 312 | shua sh ua 313 | shuai sh uai 314 | shuan sh uan 315 | shuang sh uang 316 | shui sh ui 317 | shun sh un 318 | shuo sh uo 319 | si s i0 320 | song s ong 321 | sou s ou 322 | su s u 323 | suan s uan 324 | sui s ui 325 | sun s un 326 | suo s uo 327 | ta t a 328 | tai t ai 329 | tan t an 330 | tang t ang 331 | tao t ao 332 | te t e 333 | tei t ei 334 | teng t eng 335 | ti t i 336 | tian t ian 337 | tiao t iao 338 | tie t ie 339 | ting t ing 340 | tong t ong 341 | tou t ou 342 | tu t u 343 | tuan t uan 344 | tui t ui 345 | tun t un 346 | tuo t uo 347 | wa w a 348 | wai w ai 349 | wan w an 350 | wang w ang 351 | wei w ei 352 | wen w en 353 | weng w eng 354 | wo w o 355 | wu w u 356 | xi x i 357 | xia x ia 358 | xian x ian 359 | xiang x iang 360 | xiao x iao 361 | xie x ie 362 | xin x in 363 | xing x ing 364 | xiong x iong 365 | xiu x iu 366 | xu x v 367 | xv x v 368 | xuan x van 369 | xvan x van 370 | xue x ve 371 | xve x ve 372 | xun x vn 373 | xvn x vn 374 | ya y a 375 | yan y En 376 | yang y ang 377 | yao y ao 378 | ye y E 379 | yi y i 380 | yin y in 381 | ying y ing 382 | yo y o 383 | yong y ong 384 | you y ou 385 | yu y v 386 | yv y v 387 | yuan y van 388 | yvan y van 389 | yue y ve 390 | yve y ve 391 | yun y vn 392 | yvn y vn 393 | za z a 394 | zai z ai 395 | zan z an 396 | zang z ang 397 | zao z ao 398 | ze z e 399 | zei z ei 400 | zen z en 401 | zeng z eng 402 | zha zh a 
403 | zhai zh ai 404 | zhan zh an 405 | zhang zh ang 406 | zhao zh ao 407 | zhe zh e 408 | zhei zh ei 409 | zhen zh en 410 | zheng zh eng 411 | zhi zh ir 412 | zhong zh ong 413 | zhou zh ou 414 | zhu zh u 415 | zhua zh ua 416 | zhuai zh uai 417 | zhuan zh uan 418 | zhuang zh uang 419 | zhui zh ui 420 | zhun zh un 421 | zhuo zh uo 422 | zi z i0 423 | zong z ong 424 | zou z ou 425 | zu z u 426 | zuan z uan 427 | zui z ui 428 | zun z un 429 | zuo z uo 430 | -------------------------------------------------------------------------------- /oldVersion/V101/text/opencpop-strict.txt: -------------------------------------------------------------------------------- 1 | a AA a 2 | ai AA ai 3 | an AA an 4 | ang AA ang 5 | ao AA ao 6 | ba b a 7 | bai b ai 8 | ban b an 9 | bang b ang 10 | bao b ao 11 | bei b ei 12 | ben b en 13 | beng b eng 14 | bi b i 15 | bian b ian 16 | biao b iao 17 | bie b ie 18 | bin b in 19 | bing b ing 20 | bo b o 21 | bu b u 22 | ca c a 23 | cai c ai 24 | can c an 25 | cang c ang 26 | cao c ao 27 | ce c e 28 | cei c ei 29 | cen c en 30 | ceng c eng 31 | cha ch a 32 | chai ch ai 33 | chan ch an 34 | chang ch ang 35 | chao ch ao 36 | che ch e 37 | chen ch en 38 | cheng ch eng 39 | chi ch ir 40 | chong ch ong 41 | chou ch ou 42 | chu ch u 43 | chua ch ua 44 | chuai ch uai 45 | chuan ch uan 46 | chuang ch uang 47 | chui ch ui 48 | chun ch un 49 | chuo ch uo 50 | ci c i0 51 | cong c ong 52 | cou c ou 53 | cu c u 54 | cuan c uan 55 | cui c ui 56 | cun c un 57 | cuo c uo 58 | da d a 59 | dai d ai 60 | dan d an 61 | dang d ang 62 | dao d ao 63 | de d e 64 | dei d ei 65 | den d en 66 | deng d eng 67 | di d i 68 | dia d ia 69 | dian d ian 70 | diao d iao 71 | die d ie 72 | ding d ing 73 | diu d iu 74 | dong d ong 75 | dou d ou 76 | du d u 77 | duan d uan 78 | dui d ui 79 | dun d un 80 | duo d uo 81 | e EE e 82 | ei EE ei 83 | en EE en 84 | eng EE eng 85 | er EE er 86 | fa f a 87 | fan f an 88 | fang f ang 89 | fei f ei 90 | fen f en 91 | feng f eng 92 | fo f o 93 | fou f ou 94 | fu f u 95 | ga g a 96 | gai g ai 97 | gan g an 98 | gang g ang 99 | gao g ao 100 | ge g e 101 | gei g ei 102 | gen g en 103 | geng g eng 104 | gong g ong 105 | gou g ou 106 | gu g u 107 | gua g ua 108 | guai g uai 109 | guan g uan 110 | guang g uang 111 | gui g ui 112 | gun g un 113 | guo g uo 114 | ha h a 115 | hai h ai 116 | han h an 117 | hang h ang 118 | hao h ao 119 | he h e 120 | hei h ei 121 | hen h en 122 | heng h eng 123 | hong h ong 124 | hou h ou 125 | hu h u 126 | hua h ua 127 | huai h uai 128 | huan h uan 129 | huang h uang 130 | hui h ui 131 | hun h un 132 | huo h uo 133 | ji j i 134 | jia j ia 135 | jian j ian 136 | jiang j iang 137 | jiao j iao 138 | jie j ie 139 | jin j in 140 | jing j ing 141 | jiong j iong 142 | jiu j iu 143 | ju j v 144 | jv j v 145 | juan j van 146 | jvan j van 147 | jue j ve 148 | jve j ve 149 | jun j vn 150 | jvn j vn 151 | ka k a 152 | kai k ai 153 | kan k an 154 | kang k ang 155 | kao k ao 156 | ke k e 157 | kei k ei 158 | ken k en 159 | keng k eng 160 | kong k ong 161 | kou k ou 162 | ku k u 163 | kua k ua 164 | kuai k uai 165 | kuan k uan 166 | kuang k uang 167 | kui k ui 168 | kun k un 169 | kuo k uo 170 | la l a 171 | lai l ai 172 | lan l an 173 | lang l ang 174 | lao l ao 175 | le l e 176 | lei l ei 177 | leng l eng 178 | li l i 179 | lia l ia 180 | lian l ian 181 | liang l iang 182 | liao l iao 183 | lie l ie 184 | lin l in 185 | ling l ing 186 | liu l iu 187 | lo l o 188 | long l ong 189 | lou l ou 190 | lu l u 191 | luan l uan 192 | lun l un 193 | luo l uo 194 | 
lv l v 195 | lve l ve 196 | ma m a 197 | mai m ai 198 | man m an 199 | mang m ang 200 | mao m ao 201 | me m e 202 | mei m ei 203 | men m en 204 | meng m eng 205 | mi m i 206 | mian m ian 207 | miao m iao 208 | mie m ie 209 | min m in 210 | ming m ing 211 | miu m iu 212 | mo m o 213 | mou m ou 214 | mu m u 215 | na n a 216 | nai n ai 217 | nan n an 218 | nang n ang 219 | nao n ao 220 | ne n e 221 | nei n ei 222 | nen n en 223 | neng n eng 224 | ni n i 225 | nian n ian 226 | niang n iang 227 | niao n iao 228 | nie n ie 229 | nin n in 230 | ning n ing 231 | niu n iu 232 | nong n ong 233 | nou n ou 234 | nu n u 235 | nuan n uan 236 | nun n un 237 | nuo n uo 238 | nv n v 239 | nve n ve 240 | o OO o 241 | ou OO ou 242 | pa p a 243 | pai p ai 244 | pan p an 245 | pang p ang 246 | pao p ao 247 | pei p ei 248 | pen p en 249 | peng p eng 250 | pi p i 251 | pian p ian 252 | piao p iao 253 | pie p ie 254 | pin p in 255 | ping p ing 256 | po p o 257 | pou p ou 258 | pu p u 259 | qi q i 260 | qia q ia 261 | qian q ian 262 | qiang q iang 263 | qiao q iao 264 | qie q ie 265 | qin q in 266 | qing q ing 267 | qiong q iong 268 | qiu q iu 269 | qu q v 270 | qv q v 271 | quan q van 272 | qvan q van 273 | que q ve 274 | qve q ve 275 | qun q vn 276 | qvn q vn 277 | ran r an 278 | rang r ang 279 | rao r ao 280 | re r e 281 | ren r en 282 | reng r eng 283 | ri r ir 284 | rong r ong 285 | rou r ou 286 | ru r u 287 | rua r ua 288 | ruan r uan 289 | rui r ui 290 | run r un 291 | ruo r uo 292 | sa s a 293 | sai s ai 294 | san s an 295 | sang s ang 296 | sao s ao 297 | se s e 298 | sen s en 299 | seng s eng 300 | sha sh a 301 | shai sh ai 302 | shan sh an 303 | shang sh ang 304 | shao sh ao 305 | she sh e 306 | shei sh ei 307 | shen sh en 308 | sheng sh eng 309 | shi sh ir 310 | shou sh ou 311 | shu sh u 312 | shua sh ua 313 | shuai sh uai 314 | shuan sh uan 315 | shuang sh uang 316 | shui sh ui 317 | shun sh un 318 | shuo sh uo 319 | si s i0 320 | song s ong 321 | sou s ou 322 | su s u 323 | suan s uan 324 | sui s ui 325 | sun s un 326 | suo s uo 327 | ta t a 328 | tai t ai 329 | tan t an 330 | tang t ang 331 | tao t ao 332 | te t e 333 | tei t ei 334 | teng t eng 335 | ti t i 336 | tian t ian 337 | tiao t iao 338 | tie t ie 339 | ting t ing 340 | tong t ong 341 | tou t ou 342 | tu t u 343 | tuan t uan 344 | tui t ui 345 | tun t un 346 | tuo t uo 347 | wa w a 348 | wai w ai 349 | wan w an 350 | wang w ang 351 | wei w ei 352 | wen w en 353 | weng w eng 354 | wo w o 355 | wu w u 356 | xi x i 357 | xia x ia 358 | xian x ian 359 | xiang x iang 360 | xiao x iao 361 | xie x ie 362 | xin x in 363 | xing x ing 364 | xiong x iong 365 | xiu x iu 366 | xu x v 367 | xv x v 368 | xuan x van 369 | xvan x van 370 | xue x ve 371 | xve x ve 372 | xun x vn 373 | xvn x vn 374 | ya y a 375 | yan y En 376 | yang y ang 377 | yao y ao 378 | ye y E 379 | yi y i 380 | yin y in 381 | ying y ing 382 | yo y o 383 | yong y ong 384 | you y ou 385 | yu y v 386 | yv y v 387 | yuan y van 388 | yvan y van 389 | yue y ve 390 | yve y ve 391 | yun y vn 392 | yvn y vn 393 | za z a 394 | zai z ai 395 | zan z an 396 | zang z ang 397 | zao z ao 398 | ze z e 399 | zei z ei 400 | zen z en 401 | zeng z eng 402 | zha zh a 403 | zhai zh ai 404 | zhan zh an 405 | zhang zh ang 406 | zhao zh ao 407 | zhe zh e 408 | zhei zh ei 409 | zhen zh en 410 | zheng zh eng 411 | zhi zh ir 412 | zhong zh ong 413 | zhou zh ou 414 | zhu zh u 415 | zhua zh ua 416 | zhuai zh uai 417 | zhuan zh uan 418 | zhuang zh uang 419 | zhui zh ui 420 | zhun zh un 421 | zhuo zh uo 422 | zi z i0 
423 | zong z ong 424 | zou z ou 425 | zu z u 426 | zuan z uan 427 | zui z ui 428 | zun z un 429 | zuo z uo 430 | -------------------------------------------------------------------------------- /oldVersion/V110/text/opencpop-strict.txt: -------------------------------------------------------------------------------- 1 | a AA a 2 | ai AA ai 3 | an AA an 4 | ang AA ang 5 | ao AA ao 6 | ba b a 7 | bai b ai 8 | ban b an 9 | bang b ang 10 | bao b ao 11 | bei b ei 12 | ben b en 13 | beng b eng 14 | bi b i 15 | bian b ian 16 | biao b iao 17 | bie b ie 18 | bin b in 19 | bing b ing 20 | bo b o 21 | bu b u 22 | ca c a 23 | cai c ai 24 | can c an 25 | cang c ang 26 | cao c ao 27 | ce c e 28 | cei c ei 29 | cen c en 30 | ceng c eng 31 | cha ch a 32 | chai ch ai 33 | chan ch an 34 | chang ch ang 35 | chao ch ao 36 | che ch e 37 | chen ch en 38 | cheng ch eng 39 | chi ch ir 40 | chong ch ong 41 | chou ch ou 42 | chu ch u 43 | chua ch ua 44 | chuai ch uai 45 | chuan ch uan 46 | chuang ch uang 47 | chui ch ui 48 | chun ch un 49 | chuo ch uo 50 | ci c i0 51 | cong c ong 52 | cou c ou 53 | cu c u 54 | cuan c uan 55 | cui c ui 56 | cun c un 57 | cuo c uo 58 | da d a 59 | dai d ai 60 | dan d an 61 | dang d ang 62 | dao d ao 63 | de d e 64 | dei d ei 65 | den d en 66 | deng d eng 67 | di d i 68 | dia d ia 69 | dian d ian 70 | diao d iao 71 | die d ie 72 | ding d ing 73 | diu d iu 74 | dong d ong 75 | dou d ou 76 | du d u 77 | duan d uan 78 | dui d ui 79 | dun d un 80 | duo d uo 81 | e EE e 82 | ei EE ei 83 | en EE en 84 | eng EE eng 85 | er EE er 86 | fa f a 87 | fan f an 88 | fang f ang 89 | fei f ei 90 | fen f en 91 | feng f eng 92 | fo f o 93 | fou f ou 94 | fu f u 95 | ga g a 96 | gai g ai 97 | gan g an 98 | gang g ang 99 | gao g ao 100 | ge g e 101 | gei g ei 102 | gen g en 103 | geng g eng 104 | gong g ong 105 | gou g ou 106 | gu g u 107 | gua g ua 108 | guai g uai 109 | guan g uan 110 | guang g uang 111 | gui g ui 112 | gun g un 113 | guo g uo 114 | ha h a 115 | hai h ai 116 | han h an 117 | hang h ang 118 | hao h ao 119 | he h e 120 | hei h ei 121 | hen h en 122 | heng h eng 123 | hong h ong 124 | hou h ou 125 | hu h u 126 | hua h ua 127 | huai h uai 128 | huan h uan 129 | huang h uang 130 | hui h ui 131 | hun h un 132 | huo h uo 133 | ji j i 134 | jia j ia 135 | jian j ian 136 | jiang j iang 137 | jiao j iao 138 | jie j ie 139 | jin j in 140 | jing j ing 141 | jiong j iong 142 | jiu j iu 143 | ju j v 144 | jv j v 145 | juan j van 146 | jvan j van 147 | jue j ve 148 | jve j ve 149 | jun j vn 150 | jvn j vn 151 | ka k a 152 | kai k ai 153 | kan k an 154 | kang k ang 155 | kao k ao 156 | ke k e 157 | kei k ei 158 | ken k en 159 | keng k eng 160 | kong k ong 161 | kou k ou 162 | ku k u 163 | kua k ua 164 | kuai k uai 165 | kuan k uan 166 | kuang k uang 167 | kui k ui 168 | kun k un 169 | kuo k uo 170 | la l a 171 | lai l ai 172 | lan l an 173 | lang l ang 174 | lao l ao 175 | le l e 176 | lei l ei 177 | leng l eng 178 | li l i 179 | lia l ia 180 | lian l ian 181 | liang l iang 182 | liao l iao 183 | lie l ie 184 | lin l in 185 | ling l ing 186 | liu l iu 187 | lo l o 188 | long l ong 189 | lou l ou 190 | lu l u 191 | luan l uan 192 | lun l un 193 | luo l uo 194 | lv l v 195 | lve l ve 196 | ma m a 197 | mai m ai 198 | man m an 199 | mang m ang 200 | mao m ao 201 | me m e 202 | mei m ei 203 | men m en 204 | meng m eng 205 | mi m i 206 | mian m ian 207 | miao m iao 208 | mie m ie 209 | min m in 210 | ming m ing 211 | miu m iu 212 | mo m o 213 | mou m ou 214 | mu m u 215 | na n a 216 | nai n ai 217 | nan n 
an 218 | nang n ang 219 | nao n ao 220 | ne n e 221 | nei n ei 222 | nen n en 223 | neng n eng 224 | ni n i 225 | nian n ian 226 | niang n iang 227 | niao n iao 228 | nie n ie 229 | nin n in 230 | ning n ing 231 | niu n iu 232 | nong n ong 233 | nou n ou 234 | nu n u 235 | nuan n uan 236 | nun n un 237 | nuo n uo 238 | nv n v 239 | nve n ve 240 | o OO o 241 | ou OO ou 242 | pa p a 243 | pai p ai 244 | pan p an 245 | pang p ang 246 | pao p ao 247 | pei p ei 248 | pen p en 249 | peng p eng 250 | pi p i 251 | pian p ian 252 | piao p iao 253 | pie p ie 254 | pin p in 255 | ping p ing 256 | po p o 257 | pou p ou 258 | pu p u 259 | qi q i 260 | qia q ia 261 | qian q ian 262 | qiang q iang 263 | qiao q iao 264 | qie q ie 265 | qin q in 266 | qing q ing 267 | qiong q iong 268 | qiu q iu 269 | qu q v 270 | qv q v 271 | quan q van 272 | qvan q van 273 | que q ve 274 | qve q ve 275 | qun q vn 276 | qvn q vn 277 | ran r an 278 | rang r ang 279 | rao r ao 280 | re r e 281 | ren r en 282 | reng r eng 283 | ri r ir 284 | rong r ong 285 | rou r ou 286 | ru r u 287 | rua r ua 288 | ruan r uan 289 | rui r ui 290 | run r un 291 | ruo r uo 292 | sa s a 293 | sai s ai 294 | san s an 295 | sang s ang 296 | sao s ao 297 | se s e 298 | sen s en 299 | seng s eng 300 | sha sh a 301 | shai sh ai 302 | shan sh an 303 | shang sh ang 304 | shao sh ao 305 | she sh e 306 | shei sh ei 307 | shen sh en 308 | sheng sh eng 309 | shi sh ir 310 | shou sh ou 311 | shu sh u 312 | shua sh ua 313 | shuai sh uai 314 | shuan sh uan 315 | shuang sh uang 316 | shui sh ui 317 | shun sh un 318 | shuo sh uo 319 | si s i0 320 | song s ong 321 | sou s ou 322 | su s u 323 | suan s uan 324 | sui s ui 325 | sun s un 326 | suo s uo 327 | ta t a 328 | tai t ai 329 | tan t an 330 | tang t ang 331 | tao t ao 332 | te t e 333 | tei t ei 334 | teng t eng 335 | ti t i 336 | tian t ian 337 | tiao t iao 338 | tie t ie 339 | ting t ing 340 | tong t ong 341 | tou t ou 342 | tu t u 343 | tuan t uan 344 | tui t ui 345 | tun t un 346 | tuo t uo 347 | wa w a 348 | wai w ai 349 | wan w an 350 | wang w ang 351 | wei w ei 352 | wen w en 353 | weng w eng 354 | wo w o 355 | wu w u 356 | xi x i 357 | xia x ia 358 | xian x ian 359 | xiang x iang 360 | xiao x iao 361 | xie x ie 362 | xin x in 363 | xing x ing 364 | xiong x iong 365 | xiu x iu 366 | xu x v 367 | xv x v 368 | xuan x van 369 | xvan x van 370 | xue x ve 371 | xve x ve 372 | xun x vn 373 | xvn x vn 374 | ya y a 375 | yan y En 376 | yang y ang 377 | yao y ao 378 | ye y E 379 | yi y i 380 | yin y in 381 | ying y ing 382 | yo y o 383 | yong y ong 384 | you y ou 385 | yu y v 386 | yv y v 387 | yuan y van 388 | yvan y van 389 | yue y ve 390 | yve y ve 391 | yun y vn 392 | yvn y vn 393 | za z a 394 | zai z ai 395 | zan z an 396 | zang z ang 397 | zao z ao 398 | ze z e 399 | zei z ei 400 | zen z en 401 | zeng z eng 402 | zha zh a 403 | zhai zh ai 404 | zhan zh an 405 | zhang zh ang 406 | zhao zh ao 407 | zhe zh e 408 | zhei zh ei 409 | zhen zh en 410 | zheng zh eng 411 | zhi zh ir 412 | zhong zh ong 413 | zhou zh ou 414 | zhu zh u 415 | zhua zh ua 416 | zhuai zh uai 417 | zhuan zh uan 418 | zhuang zh uang 419 | zhui zh ui 420 | zhun zh un 421 | zhuo zh uo 422 | zi z i0 423 | zong z ong 424 | zou z ou 425 | zu z u 426 | zuan z uan 427 | zui z ui 428 | zun z un 429 | zuo z uo 430 | -------------------------------------------------------------------------------- /oldVersion/V111/text/opencpop-strict.txt: -------------------------------------------------------------------------------- 1 | a AA a 2 | ai AA ai 3 | 
an AA an 4 | ang AA ang 5 | ao AA ao 6 | ba b a 7 | bai b ai 8 | ban b an 9 | bang b ang 10 | bao b ao 11 | bei b ei 12 | ben b en 13 | beng b eng 14 | bi b i 15 | bian b ian 16 | biao b iao 17 | bie b ie 18 | bin b in 19 | bing b ing 20 | bo b o 21 | bu b u 22 | ca c a 23 | cai c ai 24 | can c an 25 | cang c ang 26 | cao c ao 27 | ce c e 28 | cei c ei 29 | cen c en 30 | ceng c eng 31 | cha ch a 32 | chai ch ai 33 | chan ch an 34 | chang ch ang 35 | chao ch ao 36 | che ch e 37 | chen ch en 38 | cheng ch eng 39 | chi ch ir 40 | chong ch ong 41 | chou ch ou 42 | chu ch u 43 | chua ch ua 44 | chuai ch uai 45 | chuan ch uan 46 | chuang ch uang 47 | chui ch ui 48 | chun ch un 49 | chuo ch uo 50 | ci c i0 51 | cong c ong 52 | cou c ou 53 | cu c u 54 | cuan c uan 55 | cui c ui 56 | cun c un 57 | cuo c uo 58 | da d a 59 | dai d ai 60 | dan d an 61 | dang d ang 62 | dao d ao 63 | de d e 64 | dei d ei 65 | den d en 66 | deng d eng 67 | di d i 68 | dia d ia 69 | dian d ian 70 | diao d iao 71 | die d ie 72 | ding d ing 73 | diu d iu 74 | dong d ong 75 | dou d ou 76 | du d u 77 | duan d uan 78 | dui d ui 79 | dun d un 80 | duo d uo 81 | e EE e 82 | ei EE ei 83 | en EE en 84 | eng EE eng 85 | er EE er 86 | fa f a 87 | fan f an 88 | fang f ang 89 | fei f ei 90 | fen f en 91 | feng f eng 92 | fo f o 93 | fou f ou 94 | fu f u 95 | ga g a 96 | gai g ai 97 | gan g an 98 | gang g ang 99 | gao g ao 100 | ge g e 101 | gei g ei 102 | gen g en 103 | geng g eng 104 | gong g ong 105 | gou g ou 106 | gu g u 107 | gua g ua 108 | guai g uai 109 | guan g uan 110 | guang g uang 111 | gui g ui 112 | gun g un 113 | guo g uo 114 | ha h a 115 | hai h ai 116 | han h an 117 | hang h ang 118 | hao h ao 119 | he h e 120 | hei h ei 121 | hen h en 122 | heng h eng 123 | hong h ong 124 | hou h ou 125 | hu h u 126 | hua h ua 127 | huai h uai 128 | huan h uan 129 | huang h uang 130 | hui h ui 131 | hun h un 132 | huo h uo 133 | ji j i 134 | jia j ia 135 | jian j ian 136 | jiang j iang 137 | jiao j iao 138 | jie j ie 139 | jin j in 140 | jing j ing 141 | jiong j iong 142 | jiu j iu 143 | ju j v 144 | jv j v 145 | juan j van 146 | jvan j van 147 | jue j ve 148 | jve j ve 149 | jun j vn 150 | jvn j vn 151 | ka k a 152 | kai k ai 153 | kan k an 154 | kang k ang 155 | kao k ao 156 | ke k e 157 | kei k ei 158 | ken k en 159 | keng k eng 160 | kong k ong 161 | kou k ou 162 | ku k u 163 | kua k ua 164 | kuai k uai 165 | kuan k uan 166 | kuang k uang 167 | kui k ui 168 | kun k un 169 | kuo k uo 170 | la l a 171 | lai l ai 172 | lan l an 173 | lang l ang 174 | lao l ao 175 | le l e 176 | lei l ei 177 | leng l eng 178 | li l i 179 | lia l ia 180 | lian l ian 181 | liang l iang 182 | liao l iao 183 | lie l ie 184 | lin l in 185 | ling l ing 186 | liu l iu 187 | lo l o 188 | long l ong 189 | lou l ou 190 | lu l u 191 | luan l uan 192 | lun l un 193 | luo l uo 194 | lv l v 195 | lve l ve 196 | ma m a 197 | mai m ai 198 | man m an 199 | mang m ang 200 | mao m ao 201 | me m e 202 | mei m ei 203 | men m en 204 | meng m eng 205 | mi m i 206 | mian m ian 207 | miao m iao 208 | mie m ie 209 | min m in 210 | ming m ing 211 | miu m iu 212 | mo m o 213 | mou m ou 214 | mu m u 215 | na n a 216 | nai n ai 217 | nan n an 218 | nang n ang 219 | nao n ao 220 | ne n e 221 | nei n ei 222 | nen n en 223 | neng n eng 224 | ni n i 225 | nian n ian 226 | niang n iang 227 | niao n iao 228 | nie n ie 229 | nin n in 230 | ning n ing 231 | niu n iu 232 | nong n ong 233 | nou n ou 234 | nu n u 235 | nuan n uan 236 | nun n un 237 | nuo n uo 238 | nv n v 239 | nve n ve 240 
| o OO o 241 | ou OO ou 242 | pa p a 243 | pai p ai 244 | pan p an 245 | pang p ang 246 | pao p ao 247 | pei p ei 248 | pen p en 249 | peng p eng 250 | pi p i 251 | pian p ian 252 | piao p iao 253 | pie p ie 254 | pin p in 255 | ping p ing 256 | po p o 257 | pou p ou 258 | pu p u 259 | qi q i 260 | qia q ia 261 | qian q ian 262 | qiang q iang 263 | qiao q iao 264 | qie q ie 265 | qin q in 266 | qing q ing 267 | qiong q iong 268 | qiu q iu 269 | qu q v 270 | qv q v 271 | quan q van 272 | qvan q van 273 | que q ve 274 | qve q ve 275 | qun q vn 276 | qvn q vn 277 | ran r an 278 | rang r ang 279 | rao r ao 280 | re r e 281 | ren r en 282 | reng r eng 283 | ri r ir 284 | rong r ong 285 | rou r ou 286 | ru r u 287 | rua r ua 288 | ruan r uan 289 | rui r ui 290 | run r un 291 | ruo r uo 292 | sa s a 293 | sai s ai 294 | san s an 295 | sang s ang 296 | sao s ao 297 | se s e 298 | sen s en 299 | seng s eng 300 | sha sh a 301 | shai sh ai 302 | shan sh an 303 | shang sh ang 304 | shao sh ao 305 | she sh e 306 | shei sh ei 307 | shen sh en 308 | sheng sh eng 309 | shi sh ir 310 | shou sh ou 311 | shu sh u 312 | shua sh ua 313 | shuai sh uai 314 | shuan sh uan 315 | shuang sh uang 316 | shui sh ui 317 | shun sh un 318 | shuo sh uo 319 | si s i0 320 | song s ong 321 | sou s ou 322 | su s u 323 | suan s uan 324 | sui s ui 325 | sun s un 326 | suo s uo 327 | ta t a 328 | tai t ai 329 | tan t an 330 | tang t ang 331 | tao t ao 332 | te t e 333 | tei t ei 334 | teng t eng 335 | ti t i 336 | tian t ian 337 | tiao t iao 338 | tie t ie 339 | ting t ing 340 | tong t ong 341 | tou t ou 342 | tu t u 343 | tuan t uan 344 | tui t ui 345 | tun t un 346 | tuo t uo 347 | wa w a 348 | wai w ai 349 | wan w an 350 | wang w ang 351 | wei w ei 352 | wen w en 353 | weng w eng 354 | wo w o 355 | wu w u 356 | xi x i 357 | xia x ia 358 | xian x ian 359 | xiang x iang 360 | xiao x iao 361 | xie x ie 362 | xin x in 363 | xing x ing 364 | xiong x iong 365 | xiu x iu 366 | xu x v 367 | xv x v 368 | xuan x van 369 | xvan x van 370 | xue x ve 371 | xve x ve 372 | xun x vn 373 | xvn x vn 374 | ya y a 375 | yan y En 376 | yang y ang 377 | yao y ao 378 | ye y E 379 | yi y i 380 | yin y in 381 | ying y ing 382 | yo y o 383 | yong y ong 384 | you y ou 385 | yu y v 386 | yv y v 387 | yuan y van 388 | yvan y van 389 | yue y ve 390 | yve y ve 391 | yun y vn 392 | yvn y vn 393 | za z a 394 | zai z ai 395 | zan z an 396 | zang z ang 397 | zao z ao 398 | ze z e 399 | zei z ei 400 | zen z en 401 | zeng z eng 402 | zha zh a 403 | zhai zh ai 404 | zhan zh an 405 | zhang zh ang 406 | zhao zh ao 407 | zhe zh e 408 | zhei zh ei 409 | zhen zh en 410 | zheng zh eng 411 | zhi zh ir 412 | zhong zh ong 413 | zhou zh ou 414 | zhu zh u 415 | zhua zh ua 416 | zhuai zh uai 417 | zhuan zh uan 418 | zhuang zh uang 419 | zhui zh ui 420 | zhun zh un 421 | zhuo zh uo 422 | zi z i0 423 | zong z ong 424 | zou z ou 425 | zu z u 426 | zuan z uan 427 | zui z ui 428 | zun z un 429 | zuo z uo 430 | -------------------------------------------------------------------------------- /text/chinese.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import cn2an 5 | from pypinyin import lazy_pinyin, Style 6 | 7 | from text.symbols import punctuation 8 | from text.tone_sandhi import ToneSandhi 9 | 10 | current_file_path = os.path.dirname(__file__) 11 | pinyin_to_symbol_map = { 12 | line.split("\t")[0]: line.strip().split("\t")[1] 13 | for line in open(os.path.join(current_file_path, 
"opencpop-strict.txt")).readlines() 14 | } 15 | 16 | import jieba.posseg as psg 17 | 18 | 19 | rep_map = { 20 | ":": ",", 21 | ";": ",", 22 | ",": ",", 23 | "。": ".", 24 | "!": "!", 25 | "?": "?", 26 | "\n": ".", 27 | "·": ",", 28 | "、": ",", 29 | "...": "…", 30 | "$": ".", 31 | "“": "'", 32 | "”": "'", 33 | "‘": "'", 34 | "’": "'", 35 | "(": "'", 36 | ")": "'", 37 | "(": "'", 38 | ")": "'", 39 | "《": "'", 40 | "》": "'", 41 | "【": "'", 42 | "】": "'", 43 | "[": "'", 44 | "]": "'", 45 | "—": "-", 46 | "~": "-", 47 | "~": "-", 48 | "「": "'", 49 | "」": "'", 50 | } 51 | 52 | tone_modifier = ToneSandhi() 53 | 54 | 55 | def replace_punctuation(text): 56 | text = text.replace("嗯", "恩").replace("呣", "母") 57 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 58 | 59 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 60 | 61 | replaced_text = re.sub( 62 | r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text 63 | ) 64 | 65 | return replaced_text 66 | 67 | 68 | def g2p(text): 69 | pattern = r"(?<=[{0}])\s*".format("".join(punctuation)) 70 | sentences = [i for i in re.split(pattern, text) if i.strip() != ""] 71 | phones, tones, word2ph = _g2p(sentences) 72 | assert sum(word2ph) == len(phones) 73 | assert len(word2ph) == len(text) # Sometimes it will crash,you can add a try-catch. 74 | phones = ["_"] + phones + ["_"] 75 | tones = [0] + tones + [0] 76 | word2ph = [1] + word2ph + [1] 77 | return phones, tones, word2ph 78 | 79 | 80 | def _get_initials_finals(word): 81 | initials = [] 82 | finals = [] 83 | orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS) 84 | orig_finals = lazy_pinyin( 85 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3 86 | ) 87 | for c, v in zip(orig_initials, orig_finals): 88 | initials.append(c) 89 | finals.append(v) 90 | return initials, finals 91 | 92 | 93 | def _g2p(segments): 94 | phones_list = [] 95 | tones_list = [] 96 | word2ph = [] 97 | for seg in segments: 98 | # Replace all English words in the sentence 99 | seg = re.sub("[a-zA-Z]+", "", seg) 100 | seg_cut = psg.lcut(seg) 101 | initials = [] 102 | finals = [] 103 | seg_cut = tone_modifier.pre_merge_for_modify(seg_cut) 104 | for word, pos in seg_cut: 105 | if pos == "eng": 106 | continue 107 | sub_initials, sub_finals = _get_initials_finals(word) 108 | sub_finals = tone_modifier.modified_tone(word, pos, sub_finals) 109 | initials.append(sub_initials) 110 | finals.append(sub_finals) 111 | 112 | # assert len(sub_initials) == len(sub_finals) == len(word) 113 | initials = sum(initials, []) 114 | finals = sum(finals, []) 115 | # 116 | for c, v in zip(initials, finals): 117 | raw_pinyin = c + v 118 | # NOTE: post process for pypinyin outputs 119 | # we discriminate i, ii and iii 120 | if c == v: 121 | assert c in punctuation 122 | phone = [c] 123 | tone = "0" 124 | word2ph.append(1) 125 | else: 126 | v_without_tone = v[:-1] 127 | tone = v[-1] 128 | 129 | pinyin = c + v_without_tone 130 | assert tone in "12345" 131 | 132 | if c: 133 | # 多音节 134 | v_rep_map = { 135 | "uei": "ui", 136 | "iou": "iu", 137 | "uen": "un", 138 | } 139 | if v_without_tone in v_rep_map.keys(): 140 | pinyin = c + v_rep_map[v_without_tone] 141 | else: 142 | # 单音节 143 | pinyin_rep_map = { 144 | "ing": "ying", 145 | "i": "yi", 146 | "in": "yin", 147 | "u": "wu", 148 | } 149 | if pinyin in pinyin_rep_map.keys(): 150 | pinyin = pinyin_rep_map[pinyin] 151 | else: 152 | single_rep_map = { 153 | "v": "yu", 154 | "e": "e", 155 | "i": "y", 156 | "u": "w", 157 | } 158 | if 
pinyin[0] in single_rep_map.keys(): 159 | pinyin = single_rep_map[pinyin[0]] + pinyin[1:] 160 | 161 | assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin) 162 | phone = pinyin_to_symbol_map[pinyin].split(" ") 163 | word2ph.append(len(phone)) 164 | 165 | phones_list += phone 166 | tones_list += [int(tone)] * len(phone) 167 | return phones_list, tones_list, word2ph 168 | 169 | 170 | def text_normalize(text): 171 | numbers = re.findall(r"\d+(?:\.?\d+)?", text) 172 | for number in numbers: 173 | text = text.replace(number, cn2an.an2cn(number), 1) 174 | text = replace_punctuation(text) 175 | return text 176 | 177 | 178 | def get_bert_feature(text, word2ph): 179 | from text import chinese_bert 180 | 181 | return chinese_bert.get_bert_feature(text, word2ph) 182 | 183 | 184 | if __name__ == "__main__": 185 | from text.chinese_bert import get_bert_feature 186 | 187 | text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏" 188 | text = text_normalize(text) 189 | print(text) 190 | phones, tones, word2ph = g2p(text) 191 | bert = get_bert_feature(text, word2ph) 192 | 193 | print(phones, tones, word2ph, bert.shape) 194 | 195 | 196 | # # 示例用法 197 | # text = "这是一个示例文本:,你好!这是一个测试...." 198 | # print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试 199 | -------------------------------------------------------------------------------- /oldVersion/V110/text/chinese.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import cn2an 5 | from pypinyin import lazy_pinyin, Style 6 | 7 | from .symbols import punctuation 8 | from .tone_sandhi import ToneSandhi 9 | 10 | current_file_path = os.path.dirname(__file__) 11 | pinyin_to_symbol_map = { 12 | line.split("\t")[0]: line.strip().split("\t")[1] 13 | for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines() 14 | } 15 | 16 | import jieba.posseg as psg 17 | 18 | 19 | rep_map = { 20 | ":": ",", 21 | ";": ",", 22 | ",": ",", 23 | "。": ".", 24 | "!": "!", 25 | "?": "?", 26 | "\n": ".", 27 | "·": ",", 28 | "、": ",", 29 | "...": "…", 30 | "$": ".", 31 | "“": "'", 32 | "”": "'", 33 | "‘": "'", 34 | "’": "'", 35 | "(": "'", 36 | ")": "'", 37 | "(": "'", 38 | ")": "'", 39 | "《": "'", 40 | "》": "'", 41 | "【": "'", 42 | "】": "'", 43 | "[": "'", 44 | "]": "'", 45 | "—": "-", 46 | "~": "-", 47 | "~": "-", 48 | "「": "'", 49 | "」": "'", 50 | } 51 | 52 | tone_modifier = ToneSandhi() 53 | 54 | 55 | def replace_punctuation(text): 56 | text = text.replace("嗯", "恩").replace("呣", "母") 57 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 58 | 59 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 60 | 61 | replaced_text = re.sub( 62 | r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text 63 | ) 64 | 65 | return replaced_text 66 | 67 | 68 | def g2p(text): 69 | pattern = r"(?<=[{0}])\s*".format("".join(punctuation)) 70 | sentences = [i for i in re.split(pattern, text) if i.strip() != ""] 71 | phones, tones, word2ph = _g2p(sentences) 72 | assert sum(word2ph) == len(phones) 73 | assert len(word2ph) == len(text) # Sometimes it will crash,you can add a try-catch. 
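The file above is the current-version Chinese front end: text_normalize expands digits through cn2an and remaps punctuation, and g2p splits on punctuation before mapping every pinyin syllable onto the opencpop-strict phone set. A minimal usage sketch of the intended call sequence (an assumption on my part: it presumes the repository root is on sys.path and that cn2an, pypinyin, and jieba are installed; the normalized string in the comment is illustrative, not a captured output):

from text.chinese import text_normalize, g2p

raw = "我有2个苹果!"
norm = text_normalize(raw)  # digits -> Chinese numerals, full-width "!" -> "!"

phones, tones, word2ph = g2p(norm)

# Invariants the asserts in g2p enforce: every phone is owned by exactly one
# input character, and every character (plus the two "_" padding slots that
# g2p appends) gets one word2ph entry.
assert sum(word2ph) == len(phones)
assert len(word2ph) == len(norm) + 2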
/oldVersion/V110/text/chinese.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | 
4 | import cn2an
5 | from pypinyin import lazy_pinyin, Style
6 | 
7 | from .symbols import punctuation
8 | from .tone_sandhi import ToneSandhi
9 | 
10 | current_file_path = os.path.dirname(__file__)
11 | pinyin_to_symbol_map = {
12 |     line.split("\t")[0]: line.strip().split("\t")[1]
13 |     for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
14 | }
15 | 
16 | import jieba.posseg as psg
17 | 
18 | 
19 | rep_map = {
20 |     ":": ",",
21 |     ";": ",",
22 |     ",": ",",
23 |     "。": ".",
24 |     "!": "!",
25 |     "?": "?",
26 |     "\n": ".",
27 |     "·": ",",
28 |     "、": ",",
29 |     "...": "…",
30 |     "$": ".",
31 |     "“": "'",
32 |     "”": "'",
33 |     "‘": "'",
34 |     "’": "'",
35 |     "(": "'",
36 |     ")": "'",
37 |     "(": "'",
38 |     ")": "'",
39 |     "《": "'",
40 |     "》": "'",
41 |     "【": "'",
42 |     "】": "'",
43 |     "[": "'",
44 |     "]": "'",
45 |     "—": "-",
46 |     "~": "-",
47 |     "~": "-",
48 |     "「": "'",
49 |     "」": "'",
50 | }
51 | 
52 | tone_modifier = ToneSandhi()
53 | 
54 | 
55 | def replace_punctuation(text):
56 |     text = text.replace("嗯", "恩").replace("呣", "母")
57 |     pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
58 | 
59 |     replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
60 | 
61 |     replaced_text = re.sub(
62 |         r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
63 |     )
64 | 
65 |     return replaced_text
66 | 
67 | 
68 | def g2p(text):
69 |     pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
70 |     sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
71 |     phones, tones, word2ph = _g2p(sentences)
72 |     assert sum(word2ph) == len(phones)
73 |     assert len(word2ph) == len(text)  # Sometimes it will crash; you can add a try-catch.
74 |     phones = ["_"] + phones + ["_"]
75 |     tones = [0] + tones + [0]
76 |     word2ph = [1] + word2ph + [1]
77 |     return phones, tones, word2ph
78 | 
79 | 
80 | def _get_initials_finals(word):
81 |     initials = []
82 |     finals = []
83 |     orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
84 |     orig_finals = lazy_pinyin(
85 |         word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
86 |     )
87 |     for c, v in zip(orig_initials, orig_finals):
88 |         initials.append(c)
89 |         finals.append(v)
90 |     return initials, finals
91 | 
92 | 
93 | def _g2p(segments):
94 |     phones_list = []
95 |     tones_list = []
96 |     word2ph = []
97 |     for seg in segments:
98 |         # Remove all English words in the sentence
99 |         seg = re.sub("[a-zA-Z]+", "", seg)
100 |         seg_cut = psg.lcut(seg)
101 |         initials = []
102 |         finals = []
103 |         seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
104 |         for word, pos in seg_cut:
105 |             if pos == "eng":
106 |                 continue
107 |             sub_initials, sub_finals = _get_initials_finals(word)
108 |             sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
109 |             initials.append(sub_initials)
110 |             finals.append(sub_finals)
111 | 
112 |             # assert len(sub_initials) == len(sub_finals) == len(word)
113 |         initials = sum(initials, [])
114 |         finals = sum(finals, [])
115 |         #
116 |         for c, v in zip(initials, finals):
117 |             raw_pinyin = c + v
118 |             # NOTE: post-process for pypinyin outputs
119 |             # we discriminate i, ii and iii
120 |             if c == v:
121 |                 assert c in punctuation
122 |                 phone = [c]
123 |                 tone = "0"
124 |                 word2ph.append(1)
125 |             else:
126 |                 v_without_tone = v[:-1]
127 |                 tone = v[-1]
128 | 
129 |                 pinyin = c + v_without_tone
130 |                 assert tone in "12345"
131 | 
132 |                 if c:
133 |                     # multi-syllable (syllable has an initial): only the final may be rewritten
134 |                     v_rep_map = {
135 |                         "uei": "ui",
136 |                         "iou": "iu",
137 |                         "uen": "un",
138 |                     }
139 |                     if v_without_tone in v_rep_map.keys():
140 |                         pinyin = c + v_rep_map[v_without_tone]
141 |                 else:
142 |                     # single-syllable (bare final): rewrite to its standalone spelling
143 |                     pinyin_rep_map = {
144 |                         "ing": "ying",
145 |                         "i": "yi",
146 |                         "in": "yin",
147 |                         "u": "wu",
148 |                     }
149 |                     if pinyin in pinyin_rep_map.keys():
150 |                         pinyin = pinyin_rep_map[pinyin]
151 |                     else:
152 |                         single_rep_map = {
153 |                             "v": "yu",
154 |                             "e": "e",
155 |                             "i": "y",
156 |                             "u": "w",
157 |                         }
158 |                         if pinyin[0] in single_rep_map.keys():
159 |                             pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
160 | 
161 |                 assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
162 |                 phone = pinyin_to_symbol_map[pinyin].split(" ")
163 |                 word2ph.append(len(phone))
164 | 
165 |             phones_list += phone
166 |             tones_list += [int(tone)] * len(phone)
167 |     return phones_list, tones_list, word2ph
168 | 
169 | 
170 | def text_normalize(text):
171 |     numbers = re.findall(r"\d+(?:\.?\d+)?", text)
172 |     for number in numbers:
173 |         text = text.replace(number, cn2an.an2cn(number), 1)
174 |     text = replace_punctuation(text)
175 |     return text
176 | 
177 | 
178 | def get_bert_feature(text, word2ph):
179 |     from text import chinese_bert
180 | 
181 |     return chinese_bert.get_bert_feature(text, word2ph)
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     from text.chinese_bert import get_bert_feature
186 | 
187 |     text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
188 |     text = text_normalize(text)
189 |     print(text)
190 |     phones, tones, word2ph = g2p(text)
191 |     bert = get_bert_feature(text, word2ph)
192 | 
193 |     print(phones, tones, word2ph, bert.shape)
194 | 
195 | 
196 | # # Example usage
197 | # text = "这是一个示例文本:,你好!这是一个测试...."
198 | # print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试
199 | 
--------------------------------------------------------------------------------
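This V110 copy (and the V111 copy that follows) differs from the current text/chinese.py only in its relative imports; the key-normalization logic in _g2p is the same. That logic is the subtle part: pypinyin emits finals such as "uei"/"iou"/"uen" and bare finals such as "in", while the opencpop-strict table is keyed by the written forms "ui"/"iu"/"un" and "yin". A self-contained sketch that mirrors those branches so they can be tested in isolation (the function name is mine, not from the repo):

def normalize_pinyin(c, v_without_tone):
    """Mirror of the v_rep_map / pinyin_rep_map / single_rep_map branches in _g2p."""
    pinyin = c + v_without_tone
    if c:
        # syllable has an initial: only the final may need rewriting
        v_rep_map = {"uei": "ui", "iou": "iu", "uen": "un"}
        if v_without_tone in v_rep_map:
            pinyin = c + v_rep_map[v_without_tone]
    else:
        # bare final: rewrite to the standalone written spelling
        pinyin_rep_map = {"ing": "ying", "i": "yi", "in": "yin", "u": "wu"}
        if pinyin in pinyin_rep_map:
            pinyin = pinyin_rep_map[pinyin]
        else:
            single_rep_map = {"v": "yu", "e": "e", "i": "y", "u": "w"}
            if pinyin[0] in single_rep_map:
                pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
    return pinyin


print(normalize_pinyin("h", "uei"))  # hui: pypinyin's "huei" becomes the table key "hui"
print(normalize_pinyin("", "in"))    # yin: a bare final maps to its standalone spelling
print(normalize_pinyin("", "ve"))    # yue: leading "v" is rewritten to "yu"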
/oldVersion/V111/text/chinese.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | 
4 | import cn2an
5 | from pypinyin import lazy_pinyin, Style
6 | 
7 | from .symbols import punctuation
8 | from .tone_sandhi import ToneSandhi
9 | 
10 | current_file_path = os.path.dirname(__file__)
11 | pinyin_to_symbol_map = {
12 |     line.split("\t")[0]: line.strip().split("\t")[1]
13 |     for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
14 | }
15 | 
16 | import jieba.posseg as psg
17 | 
18 | 
19 | rep_map = {
20 |     ":": ",",
21 |     ";": ",",
22 |     ",": ",",
23 |     "。": ".",
24 |     "!": "!",
25 |     "?": "?",
26 |     "\n": ".",
27 |     "·": ",",
28 |     "、": ",",
29 |     "...": "…",
30 |     "$": ".",
31 |     "“": "'",
32 |     "”": "'",
33 |     "‘": "'",
34 |     "’": "'",
35 |     "(": "'",
36 |     ")": "'",
37 |     "(": "'",
38 |     ")": "'",
39 |     "《": "'",
40 |     "》": "'",
41 |     "【": "'",
42 |     "】": "'",
43 |     "[": "'",
44 |     "]": "'",
45 |     "—": "-",
46 |     "~": "-",
47 |     "~": "-",
48 |     "「": "'",
49 |     "」": "'",
50 | }
51 | 
52 | tone_modifier = ToneSandhi()
53 | 
54 | 
55 | def replace_punctuation(text):
56 |     text = text.replace("嗯", "恩").replace("呣", "母")
57 |     pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
58 | 
59 |     replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
60 | 
61 |     replaced_text = re.sub(
62 |         r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
63 |     )
64 | 
65 |     return replaced_text
66 | 
67 | 
68 | def g2p(text):
69 |     pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
70 |     sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
71 |     phones, tones, word2ph = _g2p(sentences)
72 |     assert sum(word2ph) == len(phones)
73 |     assert len(word2ph) == len(text)  # Sometimes it will crash; you can add a try-catch.
74 |     phones = ["_"] + phones + ["_"]
75 |     tones = [0] + tones + [0]
76 |     word2ph = [1] + word2ph + [1]
77 |     return phones, tones, word2ph
78 | 
79 | 
80 | def _get_initials_finals(word):
81 |     initials = []
82 |     finals = []
83 |     orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
84 |     orig_finals = lazy_pinyin(
85 |         word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
86 |     )
87 |     for c, v in zip(orig_initials, orig_finals):
88 |         initials.append(c)
89 |         finals.append(v)
90 |     return initials, finals
91 | 
92 | 
93 | def _g2p(segments):
94 |     phones_list = []
95 |     tones_list = []
96 |     word2ph = []
97 |     for seg in segments:
98 |         # Remove all English words in the sentence
99 |         seg = re.sub("[a-zA-Z]+", "", seg)
100 |         seg_cut = psg.lcut(seg)
101 |         initials = []
102 |         finals = []
103 |         seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
104 |         for word, pos in seg_cut:
105 |             if pos == "eng":
106 |                 continue
107 |             sub_initials, sub_finals = _get_initials_finals(word)
108 |             sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
109 |             initials.append(sub_initials)
110 |             finals.append(sub_finals)
111 | 
112 |             # assert len(sub_initials) == len(sub_finals) == len(word)
113 |         initials = sum(initials, [])
114 |         finals = sum(finals, [])
115 |         #
116 |         for c, v in zip(initials, finals):
117 |             raw_pinyin = c + v
118 |             # NOTE: post-process for pypinyin outputs
119 |             # we discriminate i, ii and iii
120 |             if c == v:
121 |                 assert c in punctuation
122 |                 phone = [c]
123 |                 tone = "0"
124 |                 word2ph.append(1)
125 |             else:
126 |                 v_without_tone = v[:-1]
127 |                 tone = v[-1]
128 | 
129 |                 pinyin = c + v_without_tone
130 |                 assert tone in "12345"
131 | 
132 |                 if c:
133 |                     # multi-syllable (syllable has an initial): only the final may be rewritten
134 |                     v_rep_map = {
135 |                         "uei": "ui",
136 |                         "iou": "iu",
137 |                         "uen": "un",
138 |                     }
139 |                     if v_without_tone in v_rep_map.keys():
140 |                         pinyin = c + v_rep_map[v_without_tone]
141 |                 else:
142 |                     # single-syllable (bare final): rewrite to its standalone spelling
143 |                     pinyin_rep_map = {
144 |                         "ing": "ying",
145 |                         "i": "yi",
146 |                         "in": "yin",
147 |                         "u": "wu",
148 |                     }
149 |                     if pinyin in pinyin_rep_map.keys():
150 |                         pinyin = pinyin_rep_map[pinyin]
151 |                     else:
152 |                         single_rep_map = {
153 |                             "v": "yu",
154 |                             "e": "e",
155 |                             "i": "y",
156 |                             "u": "w",
157 |                         }
158 |                         if pinyin[0] in single_rep_map.keys():
159 |                             pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
160 | 
161 |                 assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
162 |                 phone = pinyin_to_symbol_map[pinyin].split(" ")
163 |                 word2ph.append(len(phone))
164 | 
165 |             phones_list += phone
166 |             tones_list += [int(tone)] * len(phone)
167 |     return phones_list, tones_list, word2ph
168 | 
169 | 
170 | def text_normalize(text):
171 |     numbers = re.findall(r"\d+(?:\.?\d+)?", text)
172 |     for number in numbers:
173 |         text = text.replace(number, cn2an.an2cn(number), 1)
174 |     text = replace_punctuation(text)
175 |     return text
176 | 
177 | 
178 | def get_bert_feature(text, word2ph):
179 |     from text import chinese_bert
180 | 
181 |     return chinese_bert.get_bert_feature(text, word2ph)
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     from text.chinese_bert import get_bert_feature
186 | 
187 |     text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
188 |     text = text_normalize(text)
189 |     print(text)
190 |     phones, tones, word2ph = g2p(text)
191 |     bert = get_bert_feature(text, word2ph)
192 | 
193 |     print(phones, tones, word2ph, bert.shape)
194 | 
195 | 
196 | # # Example usage
197 | # text = "这是一个示例文本:,你好!这是一个测试...."
198 | # print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试
199 | 
--------------------------------------------------------------------------------
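Each version of chinese.py ends by delegating to chinese_bert.get_bert_feature(text, word2ph), which aligns character-level BERT features with the phone sequence. A hedged torch sketch of the expansion such a wrapper has to perform, repeating each character's feature vector once per phone; this illustrates the alignment under my assumptions (the function name and the use of repeat_interleave are mine, not code from chinese_bert.py):

import torch


def expand_to_phone_level(char_features, word2ph):
    # char_features: (num_chars, feature_dim), one row per input character,
    # including the two "_" padding slots that g2p adds.
    assert char_features.shape[0] == len(word2ph)
    repeats = torch.tensor(word2ph)
    phone_level = char_features.repeat_interleave(repeats, dim=0)  # (sum(word2ph), feature_dim)
    return phone_level.T  # (feature_dim, sum(word2ph)), the layout these wrappers return


word2ph = [1, 2, 2, 1]  # "_" + two characters with two phones each + "_"
features = torch.randn(4, 1024)
print(expand_to_phone_level(features, word2ph).shape)  # torch.Size([1024, 6])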