├── oldVersion
│   ├── V111
│   │   ├── text
│   │   │   ├── fix
│   │   │   │   ├── __init__.py
│   │   │   │   └── japanese_bert.py
│   │   │   ├── english_bert_mock.py
│   │   │   ├── japanese_bert.py
│   │   │   ├── cleaner.py
│   │   │   ├── __init__.py
│   │   │   ├── chinese_bert.py
│   │   │   ├── symbols.py
│   │   │   ├── english.py
│   │   │   ├── opencpop-strict.txt
│   │   │   └── chinese.py
│   │   └── __init__.py
│   ├── __init__.py
│   ├── V101
│   │   ├── text
│   │   │   ├── english_bert_mock.py
│   │   │   ├── cleaner.py
│   │   │   ├── __init__.py
│   │   │   ├── chinese_bert.py
│   │   │   ├── symbols.py
│   │   │   ├── japanese.py
│   │   │   ├── english.py
│   │   │   └── opencpop-strict.txt
│   │   └── __init__.py
│   └── V110
│       ├── text
│       │   ├── english_bert_mock.py
│       │   ├── cleaner.py
│       │   ├── __init__.py
│       │   ├── japanese_bert.py
│       │   ├── chinese_bert.py
│       │   ├── symbols.py
│       │   ├── english.py
│       │   ├── opencpop-strict.txt
│       │   └── chinese.py
│       └── __init__.py
├── tools
│   ├── __init__.py
│   ├── log.py
│   ├── translate.py
│   ├── classify_language.py
│   └── sentence.py
├── emotional
│   └── wav2vec2-large-robust-12-ft-emotion-msp-dim
│       ├── vocab.json
│       ├── preprocessor_config.json
│       ├── config.json
│       └── README.md
├── Web
│   ├── img
│   │   ├── Hiyori.ico
│   │   ├── helps1.png
│   │   └── helps2.png
│   ├── index.html
│   └── assets
│       └── index-49e71a58.css
├── text
│   ├── cmudict_cache.pickle
│   ├── bert_utils.py
│   ├── cleaner.py
│   ├── english_bert_mock.py
│   ├── __init__.py
│   ├── japanese_bert.py
│   ├── chinese_bert.py
│   ├── symbols.py
│   ├── opencpop-strict.txt
│   └── chinese.py
├── run_Mgpus.sh
├── Data
│   └── keqing
│       ├── models
│       │   └── eval
│       │       └── events.out.tfevents.1700630428.ly.20380.1
│       ├── config.yml
│       └── config.json
├── bert
│   ├── bert-base-japanese-v3
│   │   ├── tokenizer_config.json
│   │   ├── config.json
│   │   └── README.md
│   ├── bert-large-japanese-v2
│   │   ├── tokenizer_config.json
│   │   ├── config.json
│   │   └── README.md
│   └── bert_models.json
├── .gitignore
├── requirements.txt
├── monotonic_align
│   ├── __init__.py
│   └── core.py
├── audio_slicer.py
├── losses.py
├── config.yml
├── resample.py
├── re_matching.py
├── bert_gen.py
├── update_status.py
├── transcribe_genshin.py
├── README.md
├── configs
│   └── default_config.yml
├── mel_processing.py
├── emo_gen.py
├── preprocess_text.py
├── short_audio_transcribe.py
├── commons.py
└── server.py
/oldVersion/V111/text/fix/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility package
3 | """
4 |
--------------------------------------------------------------------------------
/oldVersion/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Inference compatibility for older model versions
3 | """
4 |
--------------------------------------------------------------------------------
/emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/vocab.json:
--------------------------------------------------------------------------------
1 | {}
2 |
--------------------------------------------------------------------------------
/Web/img/Hiyori.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/v3ucn/Bert-VITS2_V202_Train/HEAD/Web/img/Hiyori.ico
--------------------------------------------------------------------------------
/Web/img/helps1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/v3ucn/Bert-VITS2_V202_Train/HEAD/Web/img/helps1.png
--------------------------------------------------------------------------------
/Web/img/helps2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/v3ucn/Bert-VITS2_V202_Train/HEAD/Web/img/helps2.png
--------------------------------------------------------------------------------
/text/cmudict_cache.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/v3ucn/Bert-VITS2_V202_Train/HEAD/text/cmudict_cache.pickle
--------------------------------------------------------------------------------
/oldVersion/V101/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def get_bert_feature(norm_text, word2ph):
5 | return torch.zeros(1024, sum(word2ph))
6 |
--------------------------------------------------------------------------------
/oldVersion/V110/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def get_bert_feature(norm_text, word2ph):
5 | return torch.zeros(1024, sum(word2ph))
6 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def get_bert_feature(norm_text, word2ph):
5 | return torch.zeros(1024, sum(word2ph))
6 |
--------------------------------------------------------------------------------
/run_Mgpus.sh:
--------------------------------------------------------------------------------
1 | torchrun \
2 |     --nnodes=1:3 \
3 |     --nproc_per_node=2 \
4 |     --rdzv_id=1 \
5 |     --rdzv_backend=c10d \
6 |     --rdzv_endpoint="ib1:8880" \
7 | train_ms.py
8 |
--------------------------------------------------------------------------------
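
The script above uses torchrun's elastic launcher: --nnodes=1:3 lets the job run with anywhere from one to three nodes, --nproc_per_node=2 starts two training processes per node, and the c10d rendezvous backend coordinates the nodes through the endpoint ib1:8880 under the shared job id 1. Each participating machine runs the same command and discovers its peers through that rendezvous endpoint.
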
/Data/keqing/models/eval/events.out.tfevents.1700630428.ly.20380.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/v3ucn/Bert-VITS2_V202_Train/HEAD/Data/keqing/models/eval/events.out.tfevents.1700630428.ly.20380.1
--------------------------------------------------------------------------------
/emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/preprocessor_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "do_normalize": true,
3 | "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4 | "feature_size": 1,
5 | "padding_side": "right",
6 | "padding_value": 0.0,
7 | "return_attention_mask": true,
8 | "sampling_rate": 16000
9 | }
10 |
--------------------------------------------------------------------------------
/bert/bert-base-japanese-v3/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "tokenizer_class": "BertJapaneseTokenizer",
3 | "model_max_length": 512,
4 | "do_lower_case": false,
5 | "word_tokenizer_type": "mecab",
6 | "subword_tokenizer_type": "wordpiece",
7 | "mecab_kwargs": {
8 | "mecab_dic": "unidic_lite"
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/bert/bert-large-japanese-v2/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "tokenizer_class": "BertJapaneseTokenizer",
3 | "model_max_length": 512,
4 | "do_lower_case": false,
5 | "word_tokenizer_type": "mecab",
6 | "subword_tokenizer_type": "wordpiece",
7 | "mecab_kwargs": {
8 | "mecab_dic": "unidic_lite"
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/tools/log.py:
--------------------------------------------------------------------------------
1 | """
2 | Logger wrapper
3 | """
4 | from loguru import logger
5 | import sys
6 |
7 |
8 | # Remove all of the default handlers
9 | logger.remove()
10 |
11 | # Custom format, attached to stdout
12 | log_format = (
13 | "{time:MM-DD HH:mm:ss} {level:<9}| {file}:{line} | {message}"
14 | )
15 |
16 | logger.add(sys.stdout, format=log_format, backtrace=True, diagnose=True)
17 |
--------------------------------------------------------------------------------
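
A minimal usage sketch; the prefix shown in the comment is illustrative and simply follows the log_format defined above:

    from tools.log import logger

    logger.info("preprocessing started")
    # -> 11-22 08:00:00 INFO     | your_script.py:3 | preprocessing started
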
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | output/
6 | ckpt/
7 | pretrained_models/
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 |
--------------------------------------------------------------------------------
/bert/bert_models.json:
--------------------------------------------------------------------------------
1 | {
2 | "deberta-v2-large-japanese": {
3 | "repo_id": "ku-nlp/deberta-v2-large-japanese",
4 | "files": ["pytorch_model.bin"]
5 | },
6 | "chinese-roberta-wwm-ext-large": {
7 | "repo_id": "hfl/chinese-roberta-wwm-ext-large",
8 | "files": ["pytorch_model.bin"]
9 | },
10 | "deberta-v3-large": {
11 | "repo_id": "microsoft/deberta-v3-large",
12 | "files": ["spm.model", "pytorch_model.bin"]
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/Web/index.html:
--------------------------------------------------------------------------------
[markup stripped in the export; only the page title "Hiyori UI" survives]
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | librosa==0.9.1
2 | matplotlib
3 | numpy
4 | numba
5 | phonemizer
6 | scipy
7 | tensorboard
8 | Unidecode
9 | amfm_decompy
10 | jieba
11 | transformers
12 | pypinyin
13 | cn2an
14 | gradio
15 | av
16 | mecab-python3
17 | loguru
18 | unidic-lite
19 | cmudict
20 | fugashi
21 | num2words
22 | PyYAML
23 | requests
24 | pyopenjtalk; sys_platform == 'linux'
25 | openjtalk; sys_platform != 'linux'
26 | jaconv
27 | psutil
28 | GPUtil
29 | vector_quantize_pytorch
30 | g2p_en
31 | sentencepiece
32 | pykakasi
33 | langid
34 |
--------------------------------------------------------------------------------
/bert/bert-base-japanese-v3/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertForPreTraining"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "hidden_act": "gelu",
7 | "hidden_dropout_prob": 0.1,
8 | "hidden_size": 768,
9 | "initializer_range": 0.02,
10 | "intermediate_size": 3072,
11 | "layer_norm_eps": 1e-12,
12 | "max_position_embeddings": 512,
13 | "model_type": "bert",
14 | "num_attention_heads": 12,
15 | "num_hidden_layers": 12,
16 | "pad_token_id": 0,
17 | "type_vocab_size": 2,
18 | "vocab_size": 32768
19 | }
20 |
--------------------------------------------------------------------------------
/bert/bert-large-japanese-v2/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertForPreTraining"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "hidden_act": "gelu",
7 | "hidden_dropout_prob": 0.1,
8 | "hidden_size": 1024,
9 | "initializer_range": 0.02,
10 | "intermediate_size": 4096,
11 | "layer_norm_eps": 1e-12,
12 | "max_position_embeddings": 512,
13 | "model_type": "bert",
14 | "num_attention_heads": 16,
15 | "num_hidden_layers": 24,
16 | "pad_token_id": 0,
17 | "type_vocab_size": 2,
18 | "vocab_size": 32768
19 | }
20 |
--------------------------------------------------------------------------------
/monotonic_align/__init__.py:
--------------------------------------------------------------------------------
1 | from numpy import zeros, int32, float32
2 | from torch import from_numpy
3 |
4 | from .core import maximum_path_jit
5 |
6 |
7 | def maximum_path(neg_cent, mask):
8 | device = neg_cent.device
9 | dtype = neg_cent.dtype
10 | neg_cent = neg_cent.data.cpu().numpy().astype(float32)
11 | path = zeros(neg_cent.shape, dtype=int32)
12 |
13 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(int32)
14 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(int32)
15 | maximum_path_jit(path, neg_cent, t_t_max, t_s_max)
16 | return from_numpy(path).to(device=device, dtype=dtype)
17 |
--------------------------------------------------------------------------------
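
A toy call of maximum_path, with shapes inferred from the mask handling above (dim 1 indexes the frames being aligned, dim 2 the tokens); the scores are random and purely illustrative:

    import torch

    from monotonic_align import maximum_path

    B, T_frames, T_tokens = 1, 6, 3
    neg_cent = torch.randn(B, T_frames, T_tokens)  # alignment scores, higher is better
    mask = torch.ones(B, T_frames, T_tokens)       # every position is valid
    path = maximum_path(neg_cent, mask)            # hard 0/1 monotonic alignment
    assert path.sum() == T_frames                  # exactly one token chosen per frame
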
/text/bert_utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from huggingface_hub import hf_hub_download
4 |
5 | from config import config
6 |
7 |
8 | MIRROR: str = config.mirror
9 |
10 |
11 | def _check_bert(repo_id, files, local_path):
12 | for file in files:
13 | if not Path(local_path).joinpath(file).exists():
14 | if MIRROR.lower() == "openi":
15 | import openi
16 |
17 | openi.model.download_model(
18 | "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert"
19 | )
20 | else:
21 | hf_hub_download(
22 | repo_id, file, local_dir=local_path, local_dir_use_symlinks=False
23 | )
24 |
--------------------------------------------------------------------------------
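
_check_bert is the download guard behind check_bert_models in /text/__init__.py below: for each entry of /bert/bert_models.json it tests whether the listed files already exist under the local model directory and, if any is missing, fetches them either from the OpenI mirror or straight from the Hugging Face Hub.
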
/oldVersion/V101/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from . import chinese, cleaned_text_to_sequence
2 |
3 |
4 | language_module_map = {"ZH": chinese}
5 |
6 |
7 | def clean_text(text, language):
8 | language_module = language_module_map[language]
9 | norm_text = language_module.text_normalize(text)
10 | phones, tones, word2ph = language_module.g2p(norm_text)
11 | return norm_text, phones, tones, word2ph
12 |
13 |
14 | def clean_text_bert(text, language):
15 | language_module = language_module_map[language]
16 | norm_text = language_module.text_normalize(text)
17 | phones, tones, word2ph = language_module.g2p(norm_text)
18 | bert = language_module.get_bert_feature(norm_text, word2ph)
19 | return phones, tones, bert
20 |
21 |
22 | def text_to_sequence(text, language):
23 | norm_text, phones, tones, word2ph = clean_text(text, language)
24 | return cleaned_text_to_sequence(phones, tones, language)
25 |
26 |
27 | if __name__ == "__main__":
28 | pass
29 |
--------------------------------------------------------------------------------
/oldVersion/V110/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from . import chinese, japanese, cleaned_text_to_sequence
2 |
3 |
4 | language_module_map = {"ZH": chinese, "JP": japanese}
5 |
6 |
7 | def clean_text(text, language):
8 | language_module = language_module_map[language]
9 | norm_text = language_module.text_normalize(text)
10 | phones, tones, word2ph = language_module.g2p(norm_text)
11 | return norm_text, phones, tones, word2ph
12 |
13 |
14 | def clean_text_bert(text, language):
15 | language_module = language_module_map[language]
16 | norm_text = language_module.text_normalize(text)
17 | phones, tones, word2ph = language_module.g2p(norm_text)
18 | bert = language_module.get_bert_feature(norm_text, word2ph)
19 | return phones, tones, bert
20 |
21 |
22 | def text_to_sequence(text, language):
23 | norm_text, phones, tones, word2ph = clean_text(text, language)
24 | return cleaned_text_to_sequence(phones, tones, language)
25 |
26 |
27 | if __name__ == "__main__":
28 | pass
29 |
--------------------------------------------------------------------------------
/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from text import chinese, japanese, english, cleaned_text_to_sequence
2 |
3 |
4 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english}
5 |
6 |
7 | def clean_text(text, language):
8 | language_module = language_module_map[language]
9 | norm_text = language_module.text_normalize(text)
10 | phones, tones, word2ph = language_module.g2p(norm_text)
11 | return norm_text, phones, tones, word2ph
12 |
13 |
14 | def clean_text_bert(text, language):
15 | language_module = language_module_map[language]
16 | norm_text = language_module.text_normalize(text)
17 | phones, tones, word2ph = language_module.g2p(norm_text)
18 | bert = language_module.get_bert_feature(norm_text, word2ph)
19 | return phones, tones, bert
20 |
21 |
22 | def text_to_sequence(text, language):
23 | norm_text, phones, tones, word2ph = clean_text(text, language)
24 | return cleaned_text_to_sequence(phones, tones, language)
25 |
26 |
27 | if __name__ == "__main__":
28 | pass
29 |
--------------------------------------------------------------------------------
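
A sketch of the frontend contract, assuming the ZH pipeline and its local BERT weights are in place (the sample string is arbitrary):

    from text.cleaner import clean_text

    norm_text, phones, tones, word2ph = clean_text("你好", "ZH")
    # phones and tones are aligned per-phoneme lists; word2ph[i] records how many
    # phonemes the i-th position of the normalized text expands to, so
    # sum(word2ph) == len(phones) should hold.
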
/oldVersion/V101/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 |
3 |
4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
5 |
6 |
7 | def cleaned_text_to_sequence(cleaned_text, tones, language):
8 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
9 | Args:
10 | text: string to convert to a sequence
11 | Returns:
12 | List of integers corresponding to the symbols in the text
13 | """
14 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
15 | tone_start = language_tone_start_map[language]
16 | tones = [i + tone_start for i in tones]
17 | lang_id = language_id_map[language]
18 | lang_ids = [lang_id for i in phones]
19 | return phones, tones, lang_ids
20 |
21 |
22 | def get_bert(norm_text, word2ph, language):
23 | from .chinese_bert import get_bert_feature as zh_bert
24 | from .english_bert_mock import get_bert_feature as en_bert
25 |
26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert}
27 | bert = lang_bert_func_map[language](norm_text, word2ph)
28 | return bert
29 |
--------------------------------------------------------------------------------
/oldVersion/V110/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 |
3 |
4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
5 |
6 |
7 | def cleaned_text_to_sequence(cleaned_text, tones, language):
8 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
9 | Args:
10 | text: string to convert to a sequence
11 | Returns:
12 | List of integers corresponding to the symbols in the text
13 | """
14 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
15 | tone_start = language_tone_start_map[language]
16 | tones = [i + tone_start for i in tones]
17 | lang_id = language_id_map[language]
18 | lang_ids = [lang_id for i in phones]
19 | return phones, tones, lang_ids
20 |
21 |
22 | def get_bert(norm_text, word2ph, language, device):
23 | from .chinese_bert import get_bert_feature as zh_bert
24 | from .english_bert_mock import get_bert_feature as en_bert
25 | from .japanese_bert import get_bert_feature as jp_bert
26 |
27 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
28 | bert = lang_bert_func_map[language](norm_text, word2ph, device)
29 | return bert
30 |
--------------------------------------------------------------------------------
/oldVersion/V110/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoTokenizer, AutoModelForMaskedLM
3 | import sys
4 |
5 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
6 |
7 |
8 | def get_bert_feature(text, word2ph, device=None):
9 | if (
10 | sys.platform == "darwin"
11 | and torch.backends.mps.is_available()
12 | and device == "cpu"
13 | ):
14 | device = "mps"
15 | if not device:
16 | device = "cuda"
17 | model = AutoModelForMaskedLM.from_pretrained("./bert/bert-base-japanese-v3").to(
18 | device
19 | )
20 | with torch.no_grad():
21 | inputs = tokenizer(text, return_tensors="pt")
22 | for i in inputs:
23 | inputs[i] = inputs[i].to(device)
24 | res = model(**inputs, output_hidden_states=True)
25 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
26 | assert inputs["input_ids"].shape[-1] == len(word2ph)
27 | word2phone = word2ph
28 | phone_level_feature = []
29 | for i in range(len(word2phone)):
30 | repeat_feature = res[i].repeat(word2phone[i], 1)
31 | phone_level_feature.append(repeat_feature)
32 |
33 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
34 |
35 | return phone_level_feature.T
36 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoTokenizer, AutoModelForMaskedLM
3 | import sys
4 |
5 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
6 |
7 | models = dict()
8 |
9 |
10 | def get_bert_feature(text, word2ph, device=None):
11 | if (
12 | sys.platform == "darwin"
13 | and torch.backends.mps.is_available()
14 | and device == "cpu"
15 | ):
16 | device = "mps"
17 | if not device:
18 | device = "cuda"
19 | if device not in models.keys():
20 | models[device] = AutoModelForMaskedLM.from_pretrained(
21 | "./bert/bert-base-japanese-v3"
22 | ).to(device)
23 | with torch.no_grad():
24 | inputs = tokenizer(text, return_tensors="pt")
25 | for i in inputs:
26 | inputs[i] = inputs[i].to(device)
27 | res = models[device](**inputs, output_hidden_states=True)
28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
29 | assert inputs["input_ids"].shape[-1] == len(word2ph)
30 | word2phone = word2ph
31 | phone_level_feature = []
32 | for i in range(len(word2phone)):
33 | repeat_feature = res[i].repeat(word2phone[i], 1)
34 | phone_level_feature.append(repeat_feature)
35 |
36 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
37 |
38 | return phone_level_feature.T
39 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from . import chinese, japanese, cleaned_text_to_sequence
2 | from .fix import japanese as japanese_fix
3 |
4 |
5 | language_module_map = {"ZH": chinese, "JP": japanese}
6 | language_module_map_fix = {"ZH": chinese, "JP": japanese_fix}
7 |
8 |
9 | def clean_text(text, language):
10 | language_module = language_module_map[language]
11 | norm_text = language_module.text_normalize(text)
12 | phones, tones, word2ph = language_module.g2p(norm_text)
13 | return norm_text, phones, tones, word2ph
14 |
15 |
16 | def clean_text_fix(text, language):
17 |     """Use the fixes from the dev branch"""
18 | language_module = language_module_map_fix[language]
19 | norm_text = language_module.text_normalize(text)
20 | phones, tones, word2ph = language_module.g2p(norm_text)
21 | return norm_text, phones, tones, word2ph
22 |
23 |
24 | def clean_text_bert(text, language):
25 | language_module = language_module_map[language]
26 | norm_text = language_module.text_normalize(text)
27 | phones, tones, word2ph = language_module.g2p(norm_text)
28 | bert = language_module.get_bert_feature(norm_text, word2ph)
29 | return phones, tones, bert
30 |
31 |
32 | def text_to_sequence(text, language):
33 | norm_text, phones, tones, word2ph = clean_text(text, language)
34 | return cleaned_text_to_sequence(phones, tones, language)
35 |
36 |
37 | if __name__ == "__main__":
38 | pass
39 |
--------------------------------------------------------------------------------
/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import DebertaV2Model, DebertaV2Tokenizer
5 |
6 | from config import config
7 |
8 |
9 | LOCAL_PATH = "./bert/deberta-v3-large"
10 |
11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
17 | if (
18 | sys.platform == "darwin"
19 | and torch.backends.mps.is_available()
20 | and device == "cpu"
21 | ):
22 | device = "mps"
23 | if not device:
24 | device = "cuda"
25 | if device not in models.keys():
26 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
27 | with torch.no_grad():
28 | inputs = tokenizer(text, return_tensors="pt")
29 | for i in inputs:
30 | inputs[i] = inputs[i].to(device)
31 | res = models[device](**inputs, output_hidden_states=True)
32 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
33 | # assert len(word2ph) == len(text)+2
34 | word2phone = word2ph
35 | phone_level_feature = []
36 | for i in range(len(word2phone)):
37 | repeat_feature = res[i].repeat(word2phone[i], 1)
38 | phone_level_feature.append(repeat_feature)
39 |
40 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
41 |
42 | return phone_level_feature.T
43 |
--------------------------------------------------------------------------------
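
Despite the historical _mock name, this current version runs a real DeBERTa-v3 model; only the oldVersion copies above still return an all-zero (1024, sum(word2ph)) placeholder tensor.
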
/monotonic_align/core.py:
--------------------------------------------------------------------------------
1 | import numba
2 |
3 |
4 | @numba.jit(
5 | numba.void(
6 | numba.int32[:, :, ::1],
7 | numba.float32[:, :, ::1],
8 | numba.int32[::1],
9 | numba.int32[::1],
10 | ),
11 | nopython=True,
12 | nogil=True,
13 | )
14 | def maximum_path_jit(paths, values, t_ys, t_xs):
15 | b = paths.shape[0]
16 | max_neg_val = -1e9
17 | for i in range(int(b)):
18 | path = paths[i]
19 | value = values[i]
20 | t_y = t_ys[i]
21 | t_x = t_xs[i]
22 |
23 | v_prev = v_cur = 0.0
24 | index = t_x - 1
25 |
26 | for y in range(t_y):
27 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
28 | if x == y:
29 | v_cur = max_neg_val
30 | else:
31 | v_cur = value[y - 1, x]
32 | if x == 0:
33 | if y == 0:
34 | v_prev = 0.0
35 | else:
36 | v_prev = max_neg_val
37 | else:
38 | v_prev = value[y - 1, x - 1]
39 | value[y, x] += max(v_prev, v_cur)
40 |
41 | for y in range(t_y - 1, -1, -1):
42 | path[y, index] = 1
43 | if index != 0 and (
44 | index == y or value[y - 1, index] < value[y - 1, index - 1]
45 | ):
46 | index = index - 1
47 |
--------------------------------------------------------------------------------
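
The forward sweep fills value[y, x] += max(v_prev, v_cur), where v_cur carries the best score for staying on token x at the previous frame and v_prev the best score for advancing from token x - 1; when x == y the path must have advanced at every frame, so the stay branch is masked with max_neg_val. The backward sweep then starts at the last token, writes a 1 into the path for every frame, and steps the token index down whenever the diagonal move is forced (index == y) or scores strictly better, producing the hard monotonic alignment returned by maximum_path above.
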
/audio_slicer.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import librosa  # Optional. Use any library you like to read audio files.
4 | import soundfile  # Optional. Use any library you like to write audio files.
5 | import yaml
6 |
7 | from slicer2 import Slicer
8 |
9 | with open("config.yml", mode="r", encoding="utf-8") as f:
10 |     configyml = yaml.load(f, Loader=yaml.FullLoader)
11 |
12 | model_name = configyml["dataset_path"].replace("Data\\", "")
13 |
14 | # Load the audio file with librosa.
15 | audio, sr = librosa.load(
16 |     f"./Data/{model_name}/raw/{model_name}/{model_name}.wav", sr=None, mono=False
17 | )
18 | slicer = Slicer(
19 |     sr=sr,
20 |     threshold=-40,
21 |     min_length=2000,
22 |     min_interval=300,
23 |     hop_size=10,
24 |     max_sil_kept=500,
25 | )
26 | chunks = slicer.slice(audio)
27 | for i, chunk in enumerate(chunks):
28 |     if len(chunk.shape) > 1:
29 |         chunk = chunk.T  # Swap axes if the audio is stereo.
30 |     # Save sliced audio files with soundfile.
31 |     soundfile.write(f"./Data/{model_name}/raw/{model_name}/{model_name}_{i}.wav", chunk, sr)
32 |
33 | # Remove the original file once it has been sliced.
34 | if os.path.exists(f"./Data/{model_name}/raw/{model_name}/{model_name}.wav"):
35 |     os.remove(f"./Data/{model_name}/raw/{model_name}/{model_name}.wav")
--------------------------------------------------------------------------------
/oldVersion/V111/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 |
3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4 |
5 |
6 | def cleaned_text_to_sequence(cleaned_text, tones, language):
7 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8 | Args:
9 | text: string to convert to a sequence
10 | Returns:
11 | List of integers corresponding to the symbols in the text
12 | """
13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14 | tone_start = language_tone_start_map[language]
15 | tones = [i + tone_start for i in tones]
16 | lang_id = language_id_map[language]
17 | lang_ids = [lang_id for i in phones]
18 | return phones, tones, lang_ids
19 |
20 |
21 | def get_bert(norm_text, word2ph, language, device):
22 | from .chinese_bert import get_bert_feature as zh_bert
23 | from .english_bert_mock import get_bert_feature as en_bert
24 | from .japanese_bert import get_bert_feature as jp_bert
25 |
26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
27 | bert = lang_bert_func_map[language](norm_text, word2ph, device)
28 | return bert
29 |
30 |
31 | def get_bert_fix(norm_text, word2ph, language, device):
32 | from .chinese_bert import get_bert_feature as zh_bert
33 | from .english_bert_mock import get_bert_feature as en_bert
34 | from .fix.japanese_bert import get_bert_feature as jp_bert
35 |
36 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
37 | bert = lang_bert_func_map[language](norm_text, word2ph, device)
38 | return bert
39 |
--------------------------------------------------------------------------------
/losses.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def feature_loss(fmap_r, fmap_g):
5 | loss = 0
6 | for dr, dg in zip(fmap_r, fmap_g):
7 | for rl, gl in zip(dr, dg):
8 | rl = rl.float().detach()
9 | gl = gl.float()
10 | loss += torch.mean(torch.abs(rl - gl))
11 |
12 | return loss * 2
13 |
14 |
15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
16 | loss = 0
17 | r_losses = []
18 | g_losses = []
19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
20 | dr = dr.float()
21 | dg = dg.float()
22 | r_loss = torch.mean((1 - dr) ** 2)
23 | g_loss = torch.mean(dg**2)
24 | loss += r_loss + g_loss
25 | r_losses.append(r_loss.item())
26 | g_losses.append(g_loss.item())
27 |
28 | return loss, r_losses, g_losses
29 |
30 |
31 | def generator_loss(disc_outputs):
32 | loss = 0
33 | gen_losses = []
34 | for dg in disc_outputs:
35 | dg = dg.float()
36 | l = torch.mean((1 - dg) ** 2)
37 | gen_losses.append(l)
38 | loss += l
39 |
40 | return loss, gen_losses
41 |
42 |
43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
44 | """
45 | z_p, logs_q: [b, h, t_t]
46 | m_p, logs_p: [b, h, t_t]
47 | """
48 | z_p = z_p.float()
49 | logs_q = logs_q.float()
50 | m_p = m_p.float()
51 | logs_p = logs_p.float()
52 | z_mask = z_mask.float()
53 |
54 | kl = logs_p - logs_q - 0.5
55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
56 | kl = torch.sum(kl * z_mask)
57 | l = kl / torch.sum(z_mask)
58 | return l
59 |
--------------------------------------------------------------------------------
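
These are the least-squares GAN objectives: the discriminator drives real outputs toward 1 and generated outputs toward 0, while the generator drives its outputs toward 1. The kl term is a per-element Monte Carlo estimate of KL(q || p) between the posterior q = N(m_q, exp(logs_q)) and the prior p = N(m_p, exp(logs_p)), evaluated at the sample z_p:

    kl = logs_p - logs_q - 0.5 + 0.5 * (z_p - m_p)^2 * exp(-2 * logs_p)

where the -0.5 comes from substituting the analytic expectation E[(z - m_q)^2 / (2 * sigma_q^2)] = 1/2; the masked sum then averages this over the valid timesteps.
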
/text/__init__.py:
--------------------------------------------------------------------------------
1 | from text.symbols import *
2 |
3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4 |
5 |
6 | def cleaned_text_to_sequence(cleaned_text, tones, language):
7 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8 | Args:
9 | text: string to convert to a sequence
10 | Returns:
11 | List of integers corresponding to the symbols in the text
12 | """
13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14 | tone_start = language_tone_start_map[language]
15 | tones = [i + tone_start for i in tones]
16 | lang_id = language_id_map[language]
17 | lang_ids = [lang_id for i in phones]
18 | return phones, tones, lang_ids
19 |
20 |
21 | def get_bert(norm_text, word2ph, language, device):
22 | from .chinese_bert import get_bert_feature as zh_bert
23 | from .english_bert_mock import get_bert_feature as en_bert
24 | from .japanese_bert import get_bert_feature as jp_bert
25 |
26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
27 | bert = lang_bert_func_map[language](norm_text, word2ph, device)
28 | return bert
29 |
30 |
31 | def check_bert_models():
32 | import json
33 | from pathlib import Path
34 |
35 | from config import config
36 | from .bert_utils import _check_bert
37 |
38 | if config.mirror.lower() == "openi":
39 | import openi
40 |
41 | kwargs = {"token": config.openi_token} if config.openi_token else {}
42 | openi.login(**kwargs)
43 |
44 | with open("./bert/bert_models.json", "r") as fp:
45 | models = json.load(fp)
46 | for k, v in models.items():
47 | local_path = Path("./bert").joinpath(k)
48 | _check_bert(v["repo_id"], v["files"], local_path)
49 |
--------------------------------------------------------------------------------
/config.yml:
--------------------------------------------------------------------------------
1 | bert_gen:
2 | config_path: config.json
3 | device: cuda
4 | num_processes: 2
5 | use_multi_device: false
6 | dataset_path: Data\keqing
7 | mirror: ''
8 | openi_token: ''
9 | preprocess_text:
10 | clean: true
11 | cleaned_path: filelists/cleaned.list
12 | config_path: config.json
13 | max_val_total: 8
14 | train_path: filelists/train.list
15 | transcription_path: filelists/short_character_anno.list
16 | val_path: filelists/val.list
17 | val_per_spk: 5
18 | resample:
19 | in_dir: raw
20 | out_dir: raw
21 | sampling_rate: 44100
22 | server:
23 | device: cuda
24 | models:
25 | - config: ./Data/keqing/config.json
26 | device: cuda
27 | language: ZH
28 | model: ./Data/keqing/models/G_0.pth
29 | speakers:
30 | - length_scale: 1
31 | noise_scale: 0.6
32 | noise_scale_w: 0.8
33 | sdp_ratio: 0.2
34 | speaker: "\u79D1\u6BD4"
35 | - length_scale: 0.5
36 | noise_scale: 0.7
37 | noise_scale_w: 0.8
38 | sdp_ratio: 0.3
39 | speaker: "\u4E94\u6761\u609F"
40 | - length_scale: 1.2
41 | noise_scale: 0.6
42 | noise_scale_w: 0.8
43 | sdp_ratio: 0.2
44 | speaker: "\u5B89\u500D\u664B\u4E09"
45 | - config: ./Data/keqing/config.json
46 | device: cuda
47 | language: JP
48 | model: ./Data/keqing/models/G_0.pth
49 | speakers: []
50 | port: 7860
51 | train_ms:
52 | base:
53 | model_image: "Bert-VITS2中日英底模-fix"
54 | repo_id: Stardust_minus/Bert-VITS2
55 | use_base_model: false
56 | config_path: config.json
57 | env:
58 | MASTER_ADDR: localhost
59 | MASTER_PORT: 10086
60 | RANK: 0
61 | THE_ENV_VAR_YOU_NEED_TO_USE: '1234567'
62 | WORLD_SIZE: 1
63 | model: models
64 | translate:
65 | app_key: ''
66 | secret_key: ''
67 | webui:
68 | config_path: Data/keqing/config.json
69 | debug: false
70 | device: cuda
71 | language_identification_library: langid
72 | model: models/G_0.pth
73 | port: 7860
74 | share: false
75 |
--------------------------------------------------------------------------------
/Data/keqing/config.yml:
--------------------------------------------------------------------------------
1 | bert_gen:
2 | config_path: config.json
3 | device: cuda
4 | num_processes: 2
5 | use_multi_device: false
6 | dataset_path: Data\keqing
7 | mirror: ''
8 | openi_token: ''
9 | preprocess_text:
10 | clean: true
11 | cleaned_path: filelists/cleaned.list
12 | config_path: config.json
13 | max_val_total: 8
14 | train_path: filelists/train.list
15 | transcription_path: filelists/short_character_anno.list
16 | val_path: filelists/val.list
17 | val_per_spk: 5
18 | resample:
19 | in_dir: raw
20 | out_dir: raw
21 | sampling_rate: 44100
22 | server:
23 | device: cuda
24 | models:
25 | - config: ./Data/TEST/config.json
26 | device: cuda
27 | language: ZH
28 | model: ./Data/TEST/models/G_100.pth
29 | speakers:
30 | - length_scale: 1
31 | noise_scale: 0.6
32 | noise_scale_w: 0.8
33 | sdp_ratio: 0.2
34 | speaker: "\u79D1\u6BD4"
35 | - length_scale: 0.5
36 | noise_scale: 0.7
37 | noise_scale_w: 0.8
38 | sdp_ratio: 0.3
39 | speaker: "\u4E94\u6761\u609F"
40 | - length_scale: 1.2
41 | noise_scale: 0.6
42 | noise_scale_w: 0.8
43 | sdp_ratio: 0.2
44 | speaker: "\u5B89\u500D\u664B\u4E09"
45 | - config: ./Data/test/config.json
46 | device: cuda
47 | language: JP
48 | model: ./Data/test/models/G_100.pth
49 | speakers: []
50 | port: 7860
51 | train_ms:
52 | base:
53 | model_image: "Bert-VITS2中日英底模-fix"
54 | repo_id: Stardust_minus/Bert-VITS2
55 | use_base_model: false
56 | config_path: config.json
57 | env:
58 | MASTER_ADDR: localhost
59 | MASTER_PORT: 10086
60 | RANK: 0
61 | THE_ENV_VAR_YOU_NEED_TO_USE: '1234567'
62 | WORLD_SIZE: 1
63 | model: models
64 | translate:
65 | app_key: ''
66 | secret_key: ''
67 | webui:
68 | config_path: Data/TEST/config.json
69 | debug: false
70 | device: cuda
71 | language_identification_library: langid
72 | model: models/G_100.pth
73 | port: 7860
74 | share: false
75 |
--------------------------------------------------------------------------------
/Data/keqing/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 50,
4 | "eval_interval": 50,
5 | "seed": 42,
6 | "epochs": 200,
7 | "learning_rate": 0.0001,
8 | "betas": [
9 | 0.8,
10 | 0.99
11 | ],
12 | "eps": 1e-09,
13 | "batch_size": 8,
14 | "fp16_run": false,
15 | "lr_decay": 0.99995,
16 | "segment_size": 16384,
17 | "init_lr_ratio": 1,
18 | "warmup_epochs": 0,
19 | "c_mel": 45,
20 | "c_kl": 1.0,
21 | "skip_optimizer": false
22 | },
23 | "data": {
24 | "training_files": "Data/keqing/filelists/train.list",
25 | "validation_files": "Data/keqing/filelists/val.list",
26 | "max_wav_value": 32768.0,
27 | "sampling_rate": 44100,
28 | "filter_length": 2048,
29 | "hop_length": 512,
30 | "win_length": 2048,
31 | "n_mel_channels": 128,
32 | "mel_fmin": 0.0,
33 | "mel_fmax": null,
34 | "add_blank": true,
35 | "n_speakers": 1,
36 | "cleaned_text": true,
37 | "spk2id": {
38 | "keqing": 0
39 | }
40 | },
41 | "model": {
42 | "use_spk_conditioned_encoder": true,
43 | "use_noise_scaled_mas": true,
44 | "use_mel_posterior_encoder": false,
45 | "use_duration_discriminator": true,
46 | "inter_channels": 192,
47 | "hidden_channels": 192,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 6,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [
55 | 3,
56 | 7,
57 | 11
58 | ],
59 | "resblock_dilation_sizes": [
60 | [
61 | 1,
62 | 3,
63 | 5
64 | ],
65 | [
66 | 1,
67 | 3,
68 | 5
69 | ],
70 | [
71 | 1,
72 | 3,
73 | 5
74 | ]
75 | ],
76 | "upsample_rates": [
77 | 8,
78 | 8,
79 | 2,
80 | 2,
81 | 2
82 | ],
83 | "upsample_initial_channel": 512,
84 | "upsample_kernel_sizes": [
85 | 16,
86 | 16,
87 | 8,
88 | 2,
89 | 2
90 | ],
91 | "n_layers_q": 3,
92 | "use_spectral_norm": false,
93 | "gin_channels": 256
94 | },
95 | "version": "2.0"
96 | }
--------------------------------------------------------------------------------
/tools/translate.py:
--------------------------------------------------------------------------------
1 | """
2 | Translation API
3 | """
4 | from config import config
5 |
6 | import random
7 | import hashlib
8 | import requests
9 |
10 |
11 | def translate(Sentence: str, to_Language: str = "jp", from_Language: str = ""):
12 | """
13 |     :param Sentence: the sentence to translate
14 |     :param from_Language: language of the input sentence
15 |     :param to_Language: target language
16 |     :return: the translated sentence; None on error
17 |
18 |     Common language codes: Chinese zh, English en, Japanese jp
19 | """
20 | appid = config.translate_config.app_key
21 | key = config.translate_config.secret_key
22 | if appid == "" or key == "":
23 |         return "Please configure app_key and secret_key in config.yml"
24 | url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
25 | texts = Sentence.splitlines()
26 | outTexts = []
27 | for t in texts:
28 | if t != "":
29 |             # Signature calculation; see the docs at https://api.fanyi.baidu.com/product/113
30 | salt = str(random.randint(1, 100000))
31 | signString = appid + t + salt + key
32 | hs = hashlib.md5()
33 | hs.update(signString.encode("utf-8"))
34 | signString = hs.hexdigest()
35 | if from_Language == "":
36 | from_Language = "auto"
37 | headers = {"Content-Type": "application/x-www-form-urlencoded"}
38 | payload = {
39 | "q": t,
40 | "from": from_Language,
41 | "to": to_Language,
42 | "appid": appid,
43 | "salt": salt,
44 | "sign": signString,
45 | }
46 |             # Send the request
47 | try:
48 | response = requests.post(
49 | url=url, data=payload, headers=headers, timeout=3
50 | )
51 | response = response.json()
52 | if "trans_result" in response.keys():
53 | result = response["trans_result"][0]
54 | if "dst" in result.keys():
55 | dst = result["dst"]
56 | outTexts.append(dst)
57 | except Exception:
58 | return Sentence
59 | else:
60 | outTexts.append(t)
61 | return "\n".join(outTexts)
62 |
--------------------------------------------------------------------------------
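
The signature above is just the MD5 hex digest of appid + query + salt + secret key. A standalone sketch of that step (every credential value below is made up for illustration):

    import hashlib

    appid = "20200101000123456"  # hypothetical app key
    key = "mySecretKey"          # hypothetical secret key
    salt = "35892"
    q = "你好"

    sign = hashlib.md5((appid + q + salt + key).encode("utf-8")).hexdigest()
    # `sign` goes into the POST payload alongside q, from, to, appid and salt.
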
/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 | from text.japanese import text2sep_kata
8 |
9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese"
10 |
11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
17 | sep_text, _, _ = text2sep_kata(text)
18 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
19 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
20 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3]
21 | return get_bert_feature_with_token(sep_ids, word2ph, device)
22 |
23 |
24 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device):
25 | if (
26 | sys.platform == "darwin"
27 | and torch.backends.mps.is_available()
28 | and device == "cpu"
29 | ):
30 | device = "mps"
31 | if not device:
32 | device = "cuda"
33 | if device not in models.keys():
34 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
35 | with torch.no_grad():
36 | inputs = torch.tensor(tokens).to(device).unsqueeze(0)
37 | token_type_ids = torch.zeros_like(inputs).to(device)
38 | attention_mask = torch.ones_like(inputs).to(device)
39 | inputs = {
40 | "input_ids": inputs,
41 | "token_type_ids": token_type_ids,
42 | "attention_mask": attention_mask,
43 | }
44 |
45 | # for i in inputs:
46 | # inputs[i] = inputs[i].to(device)
47 | res = models[device](**inputs, output_hidden_states=True)
48 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
49 | assert inputs["input_ids"].shape[-1] == len(word2ph)
50 | word2phone = word2ph
51 | phone_level_feature = []
52 | for i in range(len(word2phone)):
53 | repeat_feature = res[i].repeat(word2phone[i], 1)
54 | phone_level_feature.append(repeat_feature)
55 |
56 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
57 |
58 | return phone_level_feature.T
59 |
--------------------------------------------------------------------------------
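
Unlike the other frontends, this module does not feed raw text to the tokenizer: text2sep_kata first splits the input into per-word kana chunks, each chunk is tokenized separately, and the id lists are flattened so the token count lines up with word2ph; the hard-coded 2 and 3 are what the code uses as the sequence-start and sequence-end ids for this tokenizer.
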
/oldVersion/V111/text/fix/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoTokenizer, AutoModelForMaskedLM
3 | import sys
4 | from .japanese import text2sep_kata
5 | from config import config
6 |
7 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
8 |
9 | models = dict()
10 |
11 |
12 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
13 | sep_text, _ = text2sep_kata(text)
14 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
15 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
16 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3]
17 | return get_bert_feature_with_token(sep_ids, word2ph, device)
18 |
19 |
20 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device):
21 | if (
22 | sys.platform == "darwin"
23 | and torch.backends.mps.is_available()
24 | and device == "cpu"
25 | ):
26 | device = "mps"
27 | if not device:
28 | device = "cuda"
29 | if device not in models.keys():
30 | models[device] = AutoModelForMaskedLM.from_pretrained(
31 | "./bert/bert-base-japanese-v3"
32 | ).to(device)
33 | with torch.no_grad():
34 | inputs = torch.tensor(tokens).to(device).unsqueeze(0)
35 | token_type_ids = torch.zeros_like(inputs).to(device)
36 | attention_mask = torch.ones_like(inputs).to(device)
37 | inputs = {
38 | "input_ids": inputs,
39 | "token_type_ids": token_type_ids,
40 | "attention_mask": attention_mask,
41 | }
42 |
43 | # for i in inputs:
44 | # inputs[i] = inputs[i].to(device)
45 | res = models[device](**inputs, output_hidden_states=True)
46 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
47 | assert inputs["input_ids"].shape[-1] == len(word2ph)
48 | word2phone = word2ph
49 | phone_level_feature = []
50 | for i in range(len(word2phone)):
51 | repeat_feature = res[i].repeat(word2phone[i], 1)
52 | phone_level_feature.append(repeat_feature)
53 |
54 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
55 |
56 | return phone_level_feature.T
57 |
--------------------------------------------------------------------------------
/resample.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import librosa
4 | from multiprocessing import Pool, cpu_count
5 |
6 | import soundfile
7 | from tqdm import tqdm
8 |
9 | from config import config
10 |
11 |
12 | def process(item):
13 | spkdir, wav_name, args = item
14 | wav_path = os.path.join(args.in_dir, spkdir, wav_name)
15 | if os.path.exists(wav_path) and ".wav" in wav_path:
16 | wav, sr = librosa.load(wav_path, sr=args.sr)
17 | soundfile.write(os.path.join(args.out_dir, spkdir, wav_name), wav, sr)
18 |
19 |
20 | if __name__ == "__main__":
21 | parser = argparse.ArgumentParser()
22 | parser.add_argument(
23 | "--sr",
24 | type=int,
25 | default=config.resample_config.sampling_rate,
26 | help="sampling rate",
27 | )
28 | parser.add_argument(
29 | "--in_dir",
30 | type=str,
31 | default=config.resample_config.in_dir,
32 | help="path to source dir",
33 | )
34 | parser.add_argument(
35 | "--out_dir",
36 | type=str,
37 | default=config.resample_config.out_dir,
38 | help="path to target dir",
39 | )
40 | parser.add_argument(
41 | "--processes",
42 | type=int,
43 | default=0,
44 | help="cpu_processes",
45 | )
46 | args, _ = parser.parse_known_args()
47 |     # AutoDL's CPU-only mode reports 46 CPUs
48 | if args.processes == 0:
49 | processes = cpu_count() - 2 if cpu_count() > 4 else 1
50 | else:
51 | processes = args.processes
52 | pool = Pool(processes=processes)
53 |
54 | tasks = []
55 |
56 | for dirpath, _, filenames in os.walk(args.in_dir):
57 |         # per-speaker subdirectory
58 | spk_dir = os.path.relpath(dirpath, args.in_dir)
59 | spk_dir_out = os.path.join(args.out_dir, spk_dir)
60 | if not os.path.isdir(spk_dir_out):
61 | os.makedirs(spk_dir_out, exist_ok=True)
62 | for filename in filenames:
63 | if filename.endswith(".wav"):
64 | twople = (spk_dir, filename, args)
65 | tasks.append(twople)
66 |
67 | for _ in tqdm(
68 | pool.imap_unordered(process, tasks),
69 | ):
70 | pass
71 |
72 | pool.close()
73 | pool.join()
74 |
75 |     print("Audio resampling finished!")
76 |
--------------------------------------------------------------------------------
/oldVersion/V101/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Compatibility with version 1.0.1
3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.0.1
4 | """
5 | import torch
6 | import commons
7 | from .text.cleaner import clean_text
8 | from .text import cleaned_text_to_sequence
9 | from oldVersion.V111.text import get_bert
10 |
11 |
12 | def get_text(text, language_str, hps, device):
13 | norm_text, phone, tone, word2ph = clean_text(text, language_str)
14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
15 |
16 | if hps.data.add_blank:
17 | phone = commons.intersperse(phone, 0)
18 | tone = commons.intersperse(tone, 0)
19 | language = commons.intersperse(language, 0)
20 | for i in range(len(word2ph)):
21 | word2ph[i] = word2ph[i] * 2
22 | word2ph[0] += 1
23 | bert = get_bert(norm_text, word2ph, language_str, device)
24 | del word2ph
25 |
26 | assert bert.shape[-1] == len(phone)
27 |
28 | phone = torch.LongTensor(phone)
29 | tone = torch.LongTensor(tone)
30 | language = torch.LongTensor(language)
31 |
32 | return bert, phone, tone, language
33 |
34 |
35 | def infer(
36 | text,
37 | sdp_ratio,
38 | noise_scale,
39 | noise_scale_w,
40 | length_scale,
41 | sid,
42 | hps,
43 | net_g,
44 | device,
45 | ):
46 | bert, phones, tones, lang_ids = get_text(text, "ZH", hps, device)
47 | with torch.no_grad():
48 | x_tst = phones.to(device).unsqueeze(0)
49 | tones = tones.to(device).unsqueeze(0)
50 | lang_ids = lang_ids.to(device).unsqueeze(0)
51 | bert = bert.to(device).unsqueeze(0)
52 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
53 | del phones
54 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
55 | audio = (
56 | net_g.infer(
57 | x_tst,
58 | x_tst_lengths,
59 | speakers,
60 | tones,
61 | lang_ids,
62 | bert,
63 | sdp_ratio=sdp_ratio,
64 | noise_scale=noise_scale,
65 | noise_scale_w=noise_scale_w,
66 | length_scale=length_scale,
67 | )[0][0, 0]
68 | .data.cpu()
69 | .float()
70 | .numpy()
71 | )
72 | del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers
73 | if torch.cuda.is_available():
74 | torch.cuda.empty_cache()
75 | return audio
76 |
--------------------------------------------------------------------------------
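
The add_blank bookkeeping works out as follows: commons.intersperse turns a length-N phone sequence into length 2N + 1 by inserting a 0 between and around all symbols; doubling every word2ph entry accounts for 2N of those positions and the extra word2ph[0] += 1 absorbs the leading blank, so sum(word2ph) again equals the interspersed length and the asserted bert.shape[-1] == len(phone) can hold.
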
/oldVersion/V110/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | from transformers import AutoTokenizer, AutoModelForMaskedLM
4 |
5 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large")
6 |
7 |
8 | def get_bert_feature(text, word2ph, device=None):
9 | if (
10 | sys.platform == "darwin"
11 | and torch.backends.mps.is_available()
12 | and device == "cpu"
13 | ):
14 | device = "mps"
15 | if not device:
16 | device = "cuda"
17 | model = AutoModelForMaskedLM.from_pretrained(
18 | "./bert/chinese-roberta-wwm-ext-large"
19 | ).to(device)
20 | with torch.no_grad():
21 | inputs = tokenizer(text, return_tensors="pt")
22 | for i in inputs:
23 | inputs[i] = inputs[i].to(device)
24 | res = model(**inputs, output_hidden_states=True)
25 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
26 |
27 | assert len(word2ph) == len(text) + 2
28 | word2phone = word2ph
29 | phone_level_feature = []
30 | for i in range(len(word2phone)):
31 | repeat_feature = res[i].repeat(word2phone[i], 1)
32 | phone_level_feature.append(repeat_feature)
33 |
34 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
35 |
36 | return phone_level_feature.T
37 |
38 |
39 | if __name__ == "__main__":
40 | import torch
41 |
42 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each a 1024-dim feature
43 | word2phone = [
44 | 1,
45 | 2,
46 | 1,
47 | 2,
48 | 2,
49 | 1,
50 | 2,
51 | 2,
52 | 1,
53 | 2,
54 | 2,
55 | 1,
56 | 2,
57 | 2,
58 | 2,
59 | 2,
60 | 2,
61 | 1,
62 | 1,
63 | 2,
64 | 2,
65 | 1,
66 | 2,
67 | 2,
68 | 2,
69 | 2,
70 | 1,
71 | 2,
72 | 2,
73 | 2,
74 | 2,
75 | 2,
76 | 1,
77 | 2,
78 | 2,
79 | 2,
80 | 2,
81 | 1,
82 | ]
83 |
84 |     # total number of frames
85 | total_frames = sum(word2phone)
86 | print(word_level_feature.shape)
87 | print(word2phone)
88 | phone_level_feature = []
89 | for i in range(len(word2phone)):
90 | print(word_level_feature[i].shape)
91 |
92 |         # repeat each word's feature word2phone[i] times
93 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
94 | phone_level_feature.append(repeat_feature)
95 |
96 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
97 |     print(phone_level_feature.shape)  # torch.Size([65, 1024])
98 |
--------------------------------------------------------------------------------
/re_matching.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | def extract_language_and_text_updated(speaker, dialogue):
5 |     # Use a regex to match <language> tags and the text that follows them
6 | pattern_language_text = r"<(\S+?)>([^<]+)"
7 | matches = re.findall(pattern_language_text, dialogue, re.DOTALL)
8 | speaker = speaker[1:-1]
9 |     # Clean the text: strip whitespace on both sides
10 | matches_cleaned = [(lang.upper(), text.strip()) for lang, text in matches]
11 | matches_cleaned.append(speaker)
12 | return matches_cleaned
13 |
14 |
15 | def validate_text(input_text):
16 |     # Regex that validates the speaker label
17 | pattern_speaker = r"(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)"
18 |
19 |     # The re.DOTALL flag makes . match every character, including newlines
20 | matches = re.findall(pattern_speaker, input_text, re.DOTALL)
21 |
22 |     # Further validate the dialogue content of each matched speaker
23 | for _, dialogue in matches:
24 | language_text_matches = extract_language_and_text_updated(_, dialogue)
25 | if not language_text_matches:
26 | return (
27 | False,
28 | "Error: Invalid format detected in dialogue content. Please check your input.",
29 | )
30 |
31 |     # No matches were found anywhere in the input text
32 | if not matches:
33 | return (
34 | False,
35 | "Error: No valid speaker format detected. Please check your input.",
36 | )
37 |
38 | return True, "Input is valid."
39 |
40 |
41 | def text_matching(text: str) -> list:
42 | speaker_pattern = r"(\[\S+?\])(.+?)(?=\[\S+?\]|$)"
43 | matches = re.findall(speaker_pattern, text, re.DOTALL)
44 | result = []
45 | for speaker, dialogue in matches:
46 | result.append(extract_language_and_text_updated(speaker, dialogue))
47 | print(result)
48 | return result
49 |
50 |
51 | def cut_para(text):
52 |     splitted_para = re.split("[\n]", text)  # split into paragraphs
53 | splitted_para = [
54 | sentence.strip() for sentence in splitted_para if sentence.strip()
55 |     ]  # drop empty strings
56 | return splitted_para
57 |
58 |
59 | def cut_sent(para):
60 |     para = re.sub("([。!;?\?])([^”’])", r"\1\n\2", para)  # single-character sentence terminators
61 |     para = re.sub("(\.{6})([^”’])", r"\1\n\2", para)  # English ellipsis
62 |     para = re.sub("(\…{2})([^”’])", r"\1\n\2", para)  # Chinese ellipsis
63 |     para = re.sub("([。!?\?][”’])([^,。!?\?])", r"\1\n\2", para)
64 |     para = para.rstrip()  # strip any extra trailing \n at the end of the paragraph
65 | return para.split("\n")
66 |
67 |
68 | if __name__ == "__main__":
69 | text = """
70 | [说话人1]
71 | [说话人2]你好吗?元気ですか?こんにちは,世界。你好吗?
72 | [说话人3]谢谢。どういたしまして。
73 | """
74 | text_matching(text)
75 |     # test the function
76 | test_text = """
77 | [说话人1]你好,こんにちは!こんにちは,世界。
78 | [说话人2]你好吗?
79 | """
80 | text_matching(test_text)
81 | res = validate_text(test_text)
82 | print(res)
83 |
--------------------------------------------------------------------------------
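
The markup these regexes accept is one speaker block per segment: `[speaker]` followed by one or more `<language>text` runs, e.g. `[说话人2]<zh>你好吗?<jp>元気ですか?`; validate_text reports an error when no speaker block contains such a run at all.
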
/oldVersion/V101/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | from transformers import AutoTokenizer, AutoModelForMaskedLM
4 |
5 | device = torch.device(
6 | "cuda"
7 | if torch.cuda.is_available()
8 | else (
9 | "mps"
10 | if sys.platform == "darwin" and torch.backends.mps.is_available()
11 | else "cpu"
12 | )
13 | )
14 |
15 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large")
16 | model = AutoModelForMaskedLM.from_pretrained("./bert/chinese-roberta-wwm-ext-large").to(
17 | device
18 | )
19 |
20 |
21 | def get_bert_feature(text, word2ph):
22 | with torch.no_grad():
23 | inputs = tokenizer(text, return_tensors="pt")
24 | for i in inputs:
25 | inputs[i] = inputs[i].to(device)
26 | res = model(**inputs, output_hidden_states=True)
27 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
28 |
29 | assert len(word2ph) == len(text) + 2
30 | word2phone = word2ph
31 | phone_level_feature = []
32 | for i in range(len(word2phone)):
33 | repeat_feature = res[i].repeat(word2phone[i], 1)
34 | phone_level_feature.append(repeat_feature)
35 |
36 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
37 |
38 | return phone_level_feature.T
39 |
40 |
41 | if __name__ == "__main__":
42 | # feature = get_bert_feature('你好,我是说的道理。')
43 | import torch
44 |
45 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each a 1024-dim feature
46 | word2phone = [
47 | 1,
48 | 2,
49 | 1,
50 | 2,
51 | 2,
52 | 1,
53 | 2,
54 | 2,
55 | 1,
56 | 2,
57 | 2,
58 | 1,
59 | 2,
60 | 2,
61 | 2,
62 | 2,
63 | 2,
64 | 1,
65 | 1,
66 | 2,
67 | 2,
68 | 1,
69 | 2,
70 | 2,
71 | 2,
72 | 2,
73 | 1,
74 | 2,
75 | 2,
76 | 2,
77 | 2,
78 | 2,
79 | 1,
80 | 2,
81 | 2,
82 | 2,
83 | 2,
84 | 1,
85 | ]
86 |
87 |     # total number of frames
88 | total_frames = sum(word2phone)
89 | print(word_level_feature.shape)
90 | print(word2phone)
91 | phone_level_feature = []
92 | for i in range(len(word2phone)):
93 | print(word_level_feature[i].shape)
94 |
95 |         # repeat each word's feature word2phone[i] times
96 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
97 | phone_level_feature.append(repeat_feature)
98 |
99 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
100 |     print(phone_level_feature.shape)  # torch.Size([65, 1024])
101 |
--------------------------------------------------------------------------------
/bert_gen.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from multiprocessing import Pool
3 | import commons
4 | import utils
5 | from tqdm import tqdm
6 | from text import check_bert_models, cleaned_text_to_sequence, get_bert
7 | import argparse
8 | import torch.multiprocessing as mp
9 | from config import config
10 |
11 |
12 | def process_line(line):
13 | device = config.bert_gen_config.device
14 | if config.bert_gen_config.use_multi_device:
15 | rank = mp.current_process()._identity
16 | rank = rank[0] if len(rank) > 0 else 0
17 | if torch.cuda.is_available():
18 | gpu_id = rank % torch.cuda.device_count()
19 | device = torch.device(f"cuda:{gpu_id}")
20 | else:
21 | device = torch.device("cpu")
22 | wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")
23 | phone = phones.split(" ")
24 | tone = [int(i) for i in tone.split(" ")]
25 |     word2ph = [int(i) for i in word2ph.split(" ")]
27 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
28 |
29 | phone = commons.intersperse(phone, 0)
30 | tone = commons.intersperse(tone, 0)
31 | language = commons.intersperse(language, 0)
32 | for i in range(len(word2ph)):
33 | word2ph[i] = word2ph[i] * 2
34 | word2ph[0] += 1
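    # After interspersing blanks every phone slot is doubled, and the one
    # extra leading blank is attributed to the first word, so sum(word2ph)
    # still equals len(phone).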
35 |
36 | bert_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".bert.pt")
37 |
38 | try:
39 | bert = torch.load(bert_path)
40 | assert bert.shape[-1] == len(phone)
41 | except Exception:
42 | bert = get_bert(text, word2ph, language_str, device)
43 | assert bert.shape[-1] == len(phone)
44 | torch.save(bert, bert_path)
45 |
46 |
47 | preprocess_text_config = config.preprocess_text_config
48 |
49 | if __name__ == "__main__":
50 | parser = argparse.ArgumentParser()
51 | parser.add_argument(
52 | "-c", "--config", type=str, default=config.bert_gen_config.config_path
53 | )
54 | parser.add_argument(
55 | "--num_processes", type=int, default=config.bert_gen_config.num_processes
56 | )
57 | args, _ = parser.parse_known_args()
58 | config_path = args.config
59 | hps = utils.get_hparams_from_file(config_path)
60 | check_bert_models()
61 | lines = []
62 | with open(hps.data.training_files, encoding="utf-8") as f:
63 | lines.extend(f.readlines())
64 |
65 | with open(hps.data.validation_files, encoding="utf-8") as f:
66 | lines.extend(f.readlines())
67 | if len(lines) != 0:
68 | num_processes = args.num_processes
69 | with Pool(processes=num_processes) as pool:
70 | for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
71 | pass
72 |
73 |     print(f"BERT generation finished! {len(lines)} bert.pt files were generated!")
74 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | from transformers import AutoTokenizer, AutoModelForMaskedLM
4 |
5 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large")
6 |
7 | models = dict()
8 |
9 |
10 | def get_bert_feature(text, word2ph, device=None):
11 | if (
12 | sys.platform == "darwin"
13 | and torch.backends.mps.is_available()
14 | and device == "cpu"
15 | ):
16 | device = "mps"
17 | if not device:
18 | device = "cuda"
19 | if device not in models.keys():
20 | models[device] = AutoModelForMaskedLM.from_pretrained(
21 | "./bert/chinese-roberta-wwm-ext-large"
22 | ).to(device)
23 | with torch.no_grad():
24 | inputs = tokenizer(text, return_tensors="pt")
25 | for i in inputs:
26 | inputs[i] = inputs[i].to(device)
27 | res = models[device](**inputs, output_hidden_states=True)
28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
29 |
30 | assert len(word2ph) == len(text) + 2
31 | word2phone = word2ph
32 | phone_level_feature = []
33 | for i in range(len(word2phone)):
34 | repeat_feature = res[i].repeat(word2phone[i], 1)
35 | phone_level_feature.append(repeat_feature)
36 |
37 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
38 |
39 | return phone_level_feature.T
40 |
41 |
42 | if __name__ == "__main__":
43 | import torch
44 |
45 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each with a 1024-dim feature
46 | word2phone = [
47 | 1,
48 | 2,
49 | 1,
50 | 2,
51 | 2,
52 | 1,
53 | 2,
54 | 2,
55 | 1,
56 | 2,
57 | 2,
58 | 1,
59 | 2,
60 | 2,
61 | 2,
62 | 2,
63 | 2,
64 | 1,
65 | 1,
66 | 2,
67 | 2,
68 | 1,
69 | 2,
70 | 2,
71 | 2,
72 | 2,
73 | 1,
74 | 2,
75 | 2,
76 | 2,
77 | 2,
78 | 2,
79 | 1,
80 | 2,
81 | 2,
82 | 2,
83 | 2,
84 | 1,
85 | ]
86 |
87 |     # compute the total number of phone frames
88 | total_frames = sum(word2phone)
89 | print(word_level_feature.shape)
90 | print(word2phone)
91 | phone_level_feature = []
92 | for i in range(len(word2phone)):
93 | print(word_level_feature[i].shape)
94 |
95 |         # repeat each word's feature word2phone[i] times
96 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
97 | phone_level_feature.append(repeat_feature)
98 |
99 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
100 |     print(phone_level_feature.shape)  # torch.Size([65, 1024])
101 |
--------------------------------------------------------------------------------
/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 |
8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9 |
10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11 |
12 | models = dict()
13 |
14 |
15 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
16 | if (
17 | sys.platform == "darwin"
18 | and torch.backends.mps.is_available()
19 | and device == "cpu"
20 | ):
21 | device = "mps"
22 | if not device:
23 | device = "cuda"
24 | if device not in models.keys():
25 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
26 | with torch.no_grad():
27 | inputs = tokenizer(text, return_tensors="pt")
28 | for i in inputs:
29 | inputs[i] = inputs[i].to(device)
30 | res = models[device](**inputs, output_hidden_states=True)
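        # hidden_states[-3:-2] keeps only the third-to-last layer; torch.cat
        # over this single tensor is a no-op kept for easy multi-layer mixing.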
31 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
32 |
33 | assert len(word2ph) == len(text) + 2
34 | word2phone = word2ph
35 | phone_level_feature = []
36 | for i in range(len(word2phone)):
37 | repeat_feature = res[i].repeat(word2phone[i], 1)
38 | phone_level_feature.append(repeat_feature)
39 |
40 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
41 |
42 | return phone_level_feature.T
43 |
44 |
45 | if __name__ == "__main__":
46 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each with a 1024-dim feature
47 | word2phone = [
48 | 1,
49 | 2,
50 | 1,
51 | 2,
52 | 2,
53 | 1,
54 | 2,
55 | 2,
56 | 1,
57 | 2,
58 | 2,
59 | 1,
60 | 2,
61 | 2,
62 | 2,
63 | 2,
64 | 2,
65 | 1,
66 | 1,
67 | 2,
68 | 2,
69 | 1,
70 | 2,
71 | 2,
72 | 2,
73 | 2,
74 | 1,
75 | 2,
76 | 2,
77 | 2,
78 | 2,
79 | 2,
80 | 1,
81 | 2,
82 | 2,
83 | 2,
84 | 2,
85 | 1,
86 | ]
87 |
88 |     # compute the total number of phone frames
89 | total_frames = sum(word2phone)
90 | print(word_level_feature.shape)
91 | print(word2phone)
92 | phone_level_feature = []
93 | for i in range(len(word2phone)):
94 | print(word_level_feature[i].shape)
95 |
96 |         # repeat each word's feature word2phone[i] times
97 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
98 | phone_level_feature.append(repeat_feature)
99 |
100 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
101 |     print(phone_level_feature.shape)  # torch.Size([65, 1024])
102 |
--------------------------------------------------------------------------------
/bert/bert-base-japanese-v3/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | license: apache-2.0
3 | datasets:
4 | - cc100
5 | - wikipedia
6 | language:
7 | - ja
8 | widget:
9 | - text: 東北大学で[MASK]の研究をしています。
10 | ---
11 |
12 | # BERT base Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102)
13 |
14 | This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language.
15 |
16 | This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization.
17 | Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective.
18 |
19 | The code for the pretraining is available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/).
20 |
21 | ## Model architecture
22 |
23 | The model architecture is the same as the original BERT base model; 12 layers, 768 dimensions of hidden states, and 12 attention heads.
24 |
25 | ## Training Data
26 |
27 | The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia.
28 | For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023.
29 | The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively.
30 |
31 | For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7).
32 |
33 | ## Tokenization
34 |
35 | The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm.
36 | The vocabulary size is 32768.
37 |
38 | We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization.
39 |
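A minimal tokenization sketch (assuming the `transformers`, `fugashi`, and `unidic-lite` packages are installed):

```python
from transformers import AutoTokenizer

# MeCab word segmentation and WordPiece subword splitting both run inside the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-v3")
print(tokenizer.tokenize("東北大学で自然言語処理の研究をしています。"))
```
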
40 | ## Training
41 |
42 | We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps.
43 | For training of the MLM (masked language modeling) objective, we introduced whole word masking in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once.
44 |
45 | For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/).
46 |
47 | ## Licenses
48 |
49 | The pretrained models are distributed under the Apache License 2.0.
50 |
51 | ## Acknowledgments
52 |
53 | This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.
54 |
--------------------------------------------------------------------------------
/bert/bert-large-japanese-v2/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | license: apache-2.0
3 | datasets:
4 | - cc100
5 | - wikipedia
6 | language:
7 | - ja
8 | widget:
9 | - text: 東北大学で[MASK]の研究をしています。
10 | ---
11 |
12 | # BERT large Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102)
13 |
14 | This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language.
15 |
16 | This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization.
17 | Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective.
18 |
19 | The code for the pretraining is available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/).
20 |
21 | ## Model architecture
22 |
23 | The model architecture is the same as the original BERT large model; 24 layers, 1024 dimensions of hidden states, and 16 attention heads.
24 |
25 | ## Training Data
26 |
27 | The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia.
28 | For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023.
29 | The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively.
30 |
31 | For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7).
32 |
33 | ## Tokenization
34 |
35 | The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm.
36 | The vocabulary size is 32768.
37 |
38 | We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization.
39 |
40 | ## Training
41 |
42 | We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps.
43 | For training of the MLM (masked language modeling) objective, we introduced whole word masking in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once.
44 |
45 | For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/).
46 |
47 | ## Licenses
48 |
49 | The pretrained models are distributed under the Apache License 2.0.
50 |
51 | ## Acknowledgments
52 |
53 | This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.
54 |
--------------------------------------------------------------------------------
/emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": "torch",
3 | "activation_dropout": 0.1,
4 | "adapter_kernel_size": 3,
5 | "adapter_stride": 2,
6 | "add_adapter": false,
7 | "apply_spec_augment": true,
8 | "architectures": [
9 | "Wav2Vec2ForSpeechClassification"
10 | ],
11 | "attention_dropout": 0.1,
12 | "bos_token_id": 1,
13 | "classifier_proj_size": 256,
14 | "codevector_dim": 768,
15 | "contrastive_logits_temperature": 0.1,
16 | "conv_bias": true,
17 | "conv_dim": [
18 | 512,
19 | 512,
20 | 512,
21 | 512,
22 | 512,
23 | 512,
24 | 512
25 | ],
26 | "conv_kernel": [
27 | 10,
28 | 3,
29 | 3,
30 | 3,
31 | 3,
32 | 2,
33 | 2
34 | ],
35 | "conv_stride": [
36 | 5,
37 | 2,
38 | 2,
39 | 2,
40 | 2,
41 | 2,
42 | 2
43 | ],
44 | "ctc_loss_reduction": "sum",
45 | "ctc_zero_infinity": false,
46 | "diversity_loss_weight": 0.1,
47 | "do_stable_layer_norm": true,
48 | "eos_token_id": 2,
49 | "feat_extract_activation": "gelu",
50 | "feat_extract_dropout": 0.0,
51 | "feat_extract_norm": "layer",
52 | "feat_proj_dropout": 0.1,
53 | "feat_quantizer_dropout": 0.0,
54 | "final_dropout": 0.1,
55 | "finetuning_task": "wav2vec2_reg",
56 | "gradient_checkpointing": false,
57 | "hidden_act": "gelu",
58 | "hidden_dropout": 0.1,
59 | "hidden_dropout_prob": 0.1,
60 | "hidden_size": 1024,
61 | "id2label": {
62 | "0": "arousal",
63 | "1": "dominance",
64 | "2": "valence"
65 | },
66 | "initializer_range": 0.02,
67 | "intermediate_size": 4096,
68 | "label2id": {
69 | "arousal": 0,
70 | "dominance": 1,
71 | "valence": 2
72 | },
73 | "layer_norm_eps": 1e-05,
74 | "layerdrop": 0.1,
75 | "mask_feature_length": 10,
76 | "mask_feature_min_masks": 0,
77 | "mask_feature_prob": 0.0,
78 | "mask_time_length": 10,
79 | "mask_time_min_masks": 2,
80 | "mask_time_prob": 0.05,
81 | "model_type": "wav2vec2",
82 | "num_adapter_layers": 3,
83 | "num_attention_heads": 16,
84 | "num_codevector_groups": 2,
85 | "num_codevectors_per_group": 320,
86 | "num_conv_pos_embedding_groups": 16,
87 | "num_conv_pos_embeddings": 128,
88 | "num_feat_extract_layers": 7,
89 | "num_hidden_layers": 12,
90 | "num_negatives": 100,
91 | "output_hidden_size": 1024,
92 | "pad_token_id": 0,
93 | "pooling_mode": "mean",
94 | "problem_type": "regression",
95 | "proj_codevector_dim": 768,
96 | "tdnn_dilation": [
97 | 1,
98 | 2,
99 | 3,
100 | 1,
101 | 1
102 | ],
103 | "tdnn_dim": [
104 | 512,
105 | 512,
106 | 512,
107 | 512,
108 | 1500
109 | ],
110 | "tdnn_kernel": [
111 | 5,
112 | 3,
113 | 3,
114 | 1,
115 | 1
116 | ],
117 | "torch_dtype": "float32",
118 | "transformers_version": "4.17.0.dev0",
119 | "use_weighted_layer_sum": false,
120 | "vocab_size": null,
121 | "xvector_output_dim": 512
122 | }
123 |
--------------------------------------------------------------------------------
/oldVersion/V110/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Compatibility for inference with version 1.1 models
3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.1
4 | """
5 | import torch
6 | import commons
7 | from .text.cleaner import clean_text
8 | from .text import cleaned_text_to_sequence
9 | from oldVersion.V111.text import get_bert
10 |
11 |
12 | def get_text(text, language_str, hps, device):
13 | norm_text, phone, tone, word2ph = clean_text(text, language_str)
14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
15 |
16 | if hps.data.add_blank:
17 | phone = commons.intersperse(phone, 0)
18 | tone = commons.intersperse(tone, 0)
19 | language = commons.intersperse(language, 0)
20 | for i in range(len(word2ph)):
21 | word2ph[i] = word2ph[i] * 2
22 | word2ph[0] += 1
23 | bert = get_bert(norm_text, word2ph, language_str, device)
24 | del word2ph
25 | assert bert.shape[-1] == len(phone), phone
26 |
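    # The Chinese BERT features are 1024-dim (chinese-roberta-wwm-ext-large)
    # while the Japanese ones are 768-dim (bert-base-japanese); the branch
    # that does not match the language is zero-filled so both always exist.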
27 |     if language_str == "ZH":
28 |         ja_bert = torch.zeros(768, len(phone))
30 | elif language_str == "JP":
31 | ja_bert = bert
32 | bert = torch.zeros(1024, len(phone))
33 | else:
34 | bert = torch.zeros(1024, len(phone))
35 | ja_bert = torch.zeros(768, len(phone))
36 |
37 | assert bert.shape[-1] == len(
38 | phone
39 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"
40 |
41 | phone = torch.LongTensor(phone)
42 | tone = torch.LongTensor(tone)
43 | language = torch.LongTensor(language)
44 | return bert, ja_bert, phone, tone, language
45 |
46 |
47 | def infer(
48 | text,
49 | sdp_ratio,
50 | noise_scale,
51 | noise_scale_w,
52 | length_scale,
53 | sid,
54 | language,
55 | hps,
56 | net_g,
57 | device,
58 | ):
59 | bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps, device)
60 | with torch.no_grad():
61 | x_tst = phones.to(device).unsqueeze(0)
62 | tones = tones.to(device).unsqueeze(0)
63 | lang_ids = lang_ids.to(device).unsqueeze(0)
64 | bert = bert.to(device).unsqueeze(0)
65 | ja_bert = ja_bert.to(device).unsqueeze(0)
66 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
67 | del phones
68 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
69 | audio = (
70 | net_g.infer(
71 | x_tst,
72 | x_tst_lengths,
73 | speakers,
74 | tones,
75 | lang_ids,
76 | bert,
77 | ja_bert,
78 | sdp_ratio=sdp_ratio,
79 | noise_scale=noise_scale,
80 | noise_scale_w=noise_scale_w,
81 | length_scale=length_scale,
82 | )[0][0, 0]
83 | .data.cpu()
84 | .float()
85 | .numpy()
86 | )
87 | del x_tst, x_tst_lengths, speakers, tones, lang_ids, bert, ja_bert
88 | if torch.cuda.is_available():
89 | torch.cuda.empty_cache()
90 | return audio
91 |
--------------------------------------------------------------------------------
/update_status.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gradio as gr
3 |
4 | lang_dict = {"EN(英文)": "_en", "ZH(中文)": "_zh", "JP(日语)": "_jp"}
5 |
6 |
7 | def raw_dir_convert_to_path(target_dir: str, lang):
8 | res = target_dir.rstrip("/").rstrip("\\")
9 | if (not target_dir.startswith("raw")) and (not target_dir.startswith("./raw")):
10 | res = os.path.join("./raw", res)
11 | if (
12 | (not res.endswith("_zh"))
13 | and (not res.endswith("_jp"))
14 | and (not res.endswith("_en"))
15 | ):
16 | res += lang_dict[lang]
17 | return res
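# Example with a hypothetical input:
#   raw_dir_convert_to_path("meimei", "ZH(中文)") -> "./raw/meimei_zh"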
18 |
19 |
20 | def update_g_files():
21 | g_files = []
22 | cnt = 0
23 | for root, dirs, files in os.walk(os.path.abspath("./logs")):
24 | for file in files:
25 | if file.startswith("G_") and file.endswith(".pth"):
26 | g_files.append(os.path.join(root, file))
27 | cnt += 1
28 | print(g_files)
29 |     return f"Model list updated; found {cnt} models", gr.Dropdown.update(choices=g_files)
30 |
31 |
32 | def update_c_files():
33 | c_files = []
34 | cnt = 0
35 | for root, dirs, files in os.walk(os.path.abspath("./logs")):
36 | for file in files:
37 | if file.startswith("config.json"):
38 | c_files.append(os.path.join(root, file))
39 | cnt += 1
40 | print(c_files)
41 |     return f"Config file list updated; found {cnt} config files", gr.Dropdown.update(choices=c_files)
42 |
43 |
44 | def update_model_folders():
45 | subdirs = []
46 | cnt = 0
47 | for root, dirs, files in os.walk(os.path.abspath("./logs")):
48 | for dir_name in dirs:
49 | if os.path.basename(dir_name) != "eval":
50 | subdirs.append(os.path.join(root, dir_name))
51 | cnt += 1
52 | print(subdirs)
53 |     return f"Model folder list updated; found {cnt} folders", gr.Dropdown.update(choices=subdirs)
54 |
55 |
56 | def update_wav_lab_pairs():
57 | wav_count = tot_count = 0
58 | for root, _, files in os.walk("./raw"):
59 | for file in files:
60 | # print(file)
61 | file_path = os.path.join(root, file)
62 | if file.lower().endswith(".wav"):
63 | lab_file = os.path.splitext(file_path)[0] + ".lab"
64 | if os.path.exists(lab_file):
65 | wav_count += 1
66 | tot_count += 1
67 | return f"{wav_count} / {tot_count}"
68 |
69 |
70 | def update_raw_folders():
71 | subdirs = []
72 | cnt = 0
73 |     script_path = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
74 | raw_path = os.path.join(script_path, "raw")
75 | print(raw_path)
76 | os.makedirs(raw_path, exist_ok=True)
77 | for root, dirs, files in os.walk(raw_path):
78 | for dir_name in dirs:
79 | relative_path = os.path.relpath(
80 | os.path.join(root, dir_name), script_path
81 |             )  # path relative to the script directory
82 | subdirs.append(relative_path)
83 | cnt += 1
84 | print(subdirs)
85 | return (
86 |         f"Raw audio folder list updated; found {cnt} folders",
87 | gr.Dropdown.update(choices=subdirs),
88 | gr.Textbox.update(value=update_wav_lab_pairs()),
89 | )
90 |
--------------------------------------------------------------------------------
/oldVersion/V101/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "I",
78 | "N",
79 | "U",
80 | "a",
81 | "b",
82 | "by",
83 | "ch",
84 | "cl",
85 | "d",
86 | "dy",
87 | "e",
88 | "f",
89 | "g",
90 | "gy",
91 | "h",
92 | "hy",
93 | "i",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "p",
103 | "py",
104 | "r",
105 | "ry",
106 | "s",
107 | "sh",
108 | "t",
109 | "ts",
110 | "u",
111 | "V",
112 | "w",
113 | "y",
114 | "z",
115 | ]
116 | num_ja_tones = 1
117 |
118 | # English
119 | en_symbols = [
120 | "aa",
121 | "ae",
122 | "ah",
123 | "ao",
124 | "aw",
125 | "ay",
126 | "b",
127 | "ch",
128 | "d",
129 | "dh",
130 | "eh",
131 | "er",
132 | "ey",
133 | "f",
134 | "g",
135 | "hh",
136 | "ih",
137 | "iy",
138 | "jh",
139 | "k",
140 | "l",
141 | "m",
142 | "n",
143 | "ng",
144 | "ow",
145 | "oy",
146 | "p",
147 | "r",
148 | "s",
149 | "sh",
150 | "t",
151 | "th",
152 | "uh",
153 | "uw",
154 | "V",
155 | "w",
156 | "y",
157 | "z",
158 | "zh",
159 | ]
160 | num_en_tones = 4
161 |
162 | # combine all symbols
163 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
164 | symbols = [pad] + normal_symbols + pu_symbols
165 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
166 |
167 | # combine all tones
168 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
169 |
170 | # language maps
171 | language_id_map = {"ZH": 0, "JA": 1, "EN": 2}
172 | num_languages = len(language_id_map.keys())
173 |
174 | language_tone_start_map = {
175 | "ZH": 0,
176 | "JA": num_zh_tones,
177 | "EN": num_zh_tones + num_ja_tones,
178 | }
179 |
180 | if __name__ == "__main__":
181 | a = set(zh_symbols)
182 | b = set(en_symbols)
183 | print(sorted(a & b))
184 |
--------------------------------------------------------------------------------
/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
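
# A language's local tone t is mapped to the global tone ID
# language_tone_start_map[lang] + t, so tone IDs never collide across languages.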
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V110/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 1
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 1
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V101/text/japanese.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
2 | import re
3 | import sys
4 |
5 | import pyopenjtalk
6 |
7 | from .symbols import symbols
8 |
9 | # Regular expression matching Japanese without punctuation marks:
10 | _japanese_characters = re.compile(
11 | r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
12 | )
13 |
14 | # Regular expression matching non-Japanese characters or punctuation marks:
15 | _japanese_marks = re.compile(
16 | r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
17 | )
18 |
19 | # List of (symbol, Japanese) pairs for marks:
20 | _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]]
21 |
22 |
23 | # List of (consonant, sokuon) pairs:
24 | _real_sokuon = [
25 | (re.compile("%s" % x[0]), x[1])
26 | for x in [
27 | (r"Q([↑↓]*[kg])", r"k#\1"),
28 | (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
29 | (r"Q([↑↓]*[sʃ])", r"s\1"),
30 | (r"Q([↑↓]*[pb])", r"p#\1"),
31 | ]
32 | ]
33 |
34 | # List of (consonant, hatsuon) pairs:
35 | _real_hatsuon = [
36 | (re.compile("%s" % x[0]), x[1])
37 | for x in [
38 | (r"N([↑↓]*[pbm])", r"m\1"),
39 | (r"N([↑↓]*[ʧʥj])", r"n^\1"),
40 | (r"N([↑↓]*[tdn])", r"n\1"),
41 | (r"N([↑↓]*[kg])", r"ŋ\1"),
42 | ]
43 | ]
44 |
45 |
46 | def post_replace_ph(ph):
47 | rep_map = {
48 | ":": ",",
49 | ";": ",",
50 | ",": ",",
51 | "。": ".",
52 | "!": "!",
53 | "?": "?",
54 | "\n": ".",
55 | "·": ",",
56 | "、": ",",
57 | "...": "…",
58 | "v": "V",
59 | }
60 |     if ph in rep_map:
61 |         ph = rep_map[ph]
62 |     if ph not in symbols:
63 |         ph = "UNK"
64 |     return ph
67 |
68 |
69 | def symbols_to_japanese(text):
70 | for regex, replacement in _symbols_to_japanese:
71 | text = re.sub(regex, replacement, text)
72 | return text
73 |
74 |
75 | def preprocess_jap(text):
76 | """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
77 | text = symbols_to_japanese(text)
78 | sentences = re.split(_japanese_marks, text)
79 | marks = re.findall(_japanese_marks, text)
80 | text = []
81 | for i, sentence in enumerate(sentences):
82 | if re.match(_japanese_characters, sentence):
83 | p = pyopenjtalk.g2p(sentence)
84 | text += p.split(" ")
85 |
86 | if i < len(marks):
87 | text += [marks[i].replace(" ", "")]
88 | return text
89 |
90 |
91 | def text_normalize(text):
92 | # todo: jap text normalize
93 | return text
94 |
95 |
96 | def g2p(norm_text):
97 | phones = preprocess_jap(norm_text)
98 | phones = [post_replace_ph(i) for i in phones]
99 | # todo: implement tones and word2ph
100 | tones = [0 for i in phones]
101 | word2ph = [1 for i in phones]
102 | return phones, tones, word2ph
103 |
104 |
105 | if __name__ == "__main__":
106 | for line in open("../../../Downloads/transcript_utf8.txt").readlines():
107 | text = line.split(":")[1]
108 | phones, tones, word2ph = g2p(text)
109 | for p in phones:
110 | if p == "z":
111 | print(text, phones)
112 | sys.exit(0)
113 |
--------------------------------------------------------------------------------
/tools/classify_language.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from config import config
4 |
5 | LANGUAGE_IDENTIFICATION_LIBRARY = config.webui_config.language_identification_library
6 |
7 | module = LANGUAGE_IDENTIFICATION_LIBRARY.lower()
8 |
9 | langid_languages = [
10 | "af",
11 | "am",
12 | "an",
13 | "ar",
14 | "as",
15 | "az",
16 | "be",
17 | "bg",
18 | "bn",
19 | "br",
20 | "bs",
21 | "ca",
22 | "cs",
23 | "cy",
24 | "da",
25 | "de",
26 | "dz",
27 | "el",
28 | "en",
29 | "eo",
30 | "es",
31 | "et",
32 | "eu",
33 | "fa",
34 | "fi",
35 | "fo",
36 | "fr",
37 | "ga",
38 | "gl",
39 | "gu",
40 | "he",
41 | "hi",
42 | "hr",
43 | "ht",
44 | "hu",
45 | "hy",
46 | "id",
47 | "is",
48 | "it",
49 | "ja",
50 | "jv",
51 | "ka",
52 | "kk",
53 | "km",
54 | "kn",
55 | "ko",
56 | "ku",
57 | "ky",
58 | "la",
59 | "lb",
60 | "lo",
61 | "lt",
62 | "lv",
63 | "mg",
64 | "mk",
65 | "ml",
66 | "mn",
67 | "mr",
68 | "ms",
69 | "mt",
70 | "nb",
71 | "ne",
72 | "nl",
73 | "nn",
74 | "no",
75 | "oc",
76 | "or",
77 | "pa",
78 | "pl",
79 | "ps",
80 | "pt",
81 | "qu",
82 | "ro",
83 | "ru",
84 | "rw",
85 | "se",
86 | "si",
87 | "sk",
88 | "sl",
89 | "sq",
90 | "sr",
91 | "sv",
92 | "sw",
93 | "ta",
94 | "te",
95 | "th",
96 | "tl",
97 | "tr",
98 | "ug",
99 | "uk",
100 | "ur",
101 | "vi",
102 | "vo",
103 | "wa",
104 | "xh",
105 | "zh",
106 | "zu",
107 | ]
108 |
109 |
110 | def classify_language(text: str, target_languages: list = None) -> str:
111 | if module == "fastlid" or module == "fasttext":
112 | from fastlid import fastlid, supported_langs
113 |
114 | classifier = fastlid
115 |         if target_languages is not None:
116 | target_languages = [
117 | lang for lang in target_languages if lang in supported_langs
118 | ]
119 | fastlid.set_languages = target_languages
120 | elif module == "langid":
121 | import langid
122 |
123 | classifier = langid.classify
124 |         if target_languages is not None:
125 | target_languages = [
126 | lang for lang in target_languages if lang in langid_languages
127 | ]
128 | langid.set_languages(target_languages)
129 | else:
130 | raise ValueError(f"Wrong module {module}")
131 |
132 | lang = classifier(text)[0]
133 |
134 | return lang
135 |
136 |
137 | def classify_zh_ja(text: str) -> str:
138 | for idx, char in enumerate(text):
139 | unicode_val = ord(char)
140 |
141 |         # detect Japanese kana characters
142 | if 0x3040 <= unicode_val <= 0x309F or 0x30A0 <= unicode_val <= 0x30FF:
143 | return "ja"
144 |
145 |         # detect CJK ideographs
146 | if 0x4E00 <= unicode_val <= 0x9FFF:
147 |             # check the following character: a kanji followed by kana implies Japanese
148 | next_char = text[idx + 1] if idx + 1 < len(text) else None
149 |
150 | if next_char and (
151 | 0x3040 <= ord(next_char) <= 0x309F or 0x30A0 <= ord(next_char) <= 0x30FF
152 | ):
153 | return "ja"
154 |
155 | return "zh"
156 |
157 |
158 | def split_alpha_nonalpha(text):
159 | return re.split(
160 | r"(?:(?<=[\u4e00-\u9fff])|(?<=[\u3040-\u30FF]))(?=[a-zA-Z])|(?<=[a-zA-Z])(?:(?=[\u4e00-\u9fff])|(?=[\u3040-\u30FF]))",
161 | text,
162 | )
163 |
164 |
165 | if __name__ == "__main__":
166 | text = "这是一个测试文本"
167 | print(classify_language(text))
168 | print(classify_zh_ja(text)) # "zh"
169 |
170 | text = "これはテストテキストです"
171 | print(classify_language(text))
172 | print(classify_zh_ja(text)) # "ja"
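
    # split_alpha_nonalpha inserts boundaries between CJK and Latin runs:
    print(split_alpha_nonalpha("你好hello世界"))  # ['你好', 'hello', '世界']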
173 |
--------------------------------------------------------------------------------
/transcribe_genshin.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import librosa
4 | import numpy as np
5 | from multiprocessing import Pool, cpu_count
6 |
7 | import soundfile
8 | from scipy.io import wavfile
9 | from tqdm import tqdm
10 | from config import config
11 |
13 | speaker_annos = []
14 |
15 | def process(item):
16 | spkdir, wav_name, args = item
17 | speaker = spkdir.replace("\\", "/").split("/")[-1]
18 | wav_path = os.path.join(args.in_dir, speaker, wav_name)
19 | if os.path.exists(wav_path) and '.wav' in wav_path:
20 | os.makedirs(os.path.join(args.out_dir, speaker), exist_ok=True)
21 | wav, sr = librosa.load(wav_path, sr=args.sr)
22 | soundfile.write(
23 | os.path.join(args.out_dir, speaker, wav_name),
24 | wav,
25 | sr
26 | )
27 |
28 | def process_text(item):
29 |     spkdir, wav_name, args, lang = item
30 |     speaker = spkdir.replace("\\", "/").split("/")[-1]
31 |     wav_path = os.path.join(args.in_dir, speaker, wav_name)
32 |     global speaker_annos
33 |     tr_name = wav_name.replace('.wav', '')
34 |     with open(args.out_dir + '/' + speaker + '/' + tr_name + '.lab', "r", encoding="utf-8") as file:
35 |         text = file.read()
36 |     text = text.replace("{NICKNAME}", '旅行者')
37 |     text = text.replace("{M#他}{F#她}", '他')
38 |     text = text.replace("{M#她}{F#他}", '他')
39 |     substring = "{M#妹妹}{F#哥哥}"
40 |     if substring in text:
41 |         if tr_name.endswith("a"):
42 |             text = text.replace(substring, '妹妹')
43 |         if tr_name.endswith("b"):
44 |             text = text.replace(substring, '哥哥')
45 |     text = text.replace("#", '')
46 |     text = f'{lang}|{text}\n'
47 |     speaker_annos.append(args.out_dir + '/' + speaker + '/' + wav_name + "|" + speaker + "|" + text)
48 |
49 |
50 |
51 | if __name__ == "__main__":
52 |     parser = argparse.ArgumentParser()
53 |     parser.add_argument("--sr", type=int, default=44100, help="sampling rate")
54 |     parser.add_argument("--in_dir", type=str, default=config.resample_config.in_dir, help="path to source dir")
55 |     parser.add_argument("--out_dir", type=str, default=config.resample_config.out_dir, help="path to target dir")
56 |     parent_dir = config.resample_config.in_dir
57 |     print(config.resample_config.out_dir)
58 |     speaker_names = list(os.walk(parent_dir))[0][1]
59 |     args = parser.parse_args()
60 |
61 |     entered = False
62 |     while not entered:
63 |         print("Enter a letter to choose the language.\n")
64 |         print("C = Chinese; J = Japanese; E = English;\n e.g.: C\n")
65 |         languages = input("Enter language: ")
66 |         if languages in ("C", "c"):
67 |             lang = 'ZH'
68 |             entered = True
69 |         elif languages in ("J", "j"):
70 |             lang = 'JP'
71 |             entered = True
72 |         elif languages in ("E", "e"):
73 |             lang = 'EN'
74 |             entered = True
75 |         else:
76 |             print("Illegal argument! Please try again.\n")
77 |     # leave two cores free on larger machines
78 |     processes = cpu_count() - 2 if cpu_count() > 4 else 1
79 |     pool = Pool(processes=processes)
80 |
81 |     for speaker in os.listdir(args.in_dir):
82 |         spk_dir = os.path.join(args.in_dir, speaker)
83 |         if os.path.isdir(spk_dir):
84 |             print(spk_dir)
85 |             for _ in tqdm(pool.imap_unordered(process, [(spk_dir, i, args) for i in os.listdir(spk_dir) if i.endswith("wav")])):
86 |                 pass
87 |             for i in os.listdir(spk_dir):
88 |                 if i.endswith("wav"):
89 |                     pro = (spk_dir, i, args, lang)
90 |                     process_text(pro)
91 |     pool.close()
92 |     pool.join()
93 |     if len(speaker_annos) == 0:
94 |         print("transcribe error. len(speaker_annos) == 0")
95 |     else:
96 |         with open(config.preprocess_text_config.transcription_path, 'w', encoding='utf-8') as f:
97 |             for line in speaker_annos:
98 |                 f.write(line)
99 |     print("finished.")
98 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Bert-VITS2_train
2 |
3 | ## This project is forked from https://github.com/YYuX-1145/Bert-VITS2-Integration-package/tree/2.0.2
4 |
5 | ## Install dependencies
6 |
7 | ```
8 | pip install -r requirements.txt
9 | ```
10 |
11 | ## Download the BERT models and place them in the bert directory
12 |
13 | ```
14 | Link: https://pan.baidu.com/s/11vLNEVDeP_8YhYIJUjcUeg?pwd=v3uc
15 | ```
16 |
17 | ```
18 | E:\work\Bert-VITS2-v202\bert>tree /f
19 | Folder PATH listing for volume myssd
20 | Volume serial number is 7CE3-15AE
21 | E:.
22 | │ bert_models.json
23 | │
24 | ├───bert-base-japanese-v3
25 | │ config.json
26 | │ README.md
27 | │ tokenizer_config.json
28 | │ vocab.txt
29 | │
30 | ├───bert-large-japanese-v2
31 | │ config.json
32 | │ README.md
33 | │ tokenizer_config.json
34 | │ vocab.txt
35 | │
36 | ├───chinese-roberta-wwm-ext-large
37 | │ added_tokens.json
38 | │ config.json
39 | │ pytorch_model.bin
40 | │ README.md
41 | │ special_tokens_map.json
42 | │ tokenizer.json
43 | │ tokenizer_config.json
44 | │ vocab.txt
45 | │
46 | ├───deberta-v2-large-japanese
47 | │ config.json
48 | │ pytorch_model.bin
49 | │ README.md
50 | │ special_tokens_map.json
51 | │ tokenizer.json
52 | │ tokenizer_config.json
53 | │
54 | └───deberta-v3-large
55 | config.json
56 | generator_config.json
57 | pytorch_model.bin
58 | README.md
59 | spm.model
60 | tokenizer_config.json
61 | ```
62 |
63 | ## Download the pretrained base models and place them in the pretrained_models directory
64 |
65 | ```
66 | https://openi.pcl.ac.cn/Stardust_minus/Bert-VITS2/modelmanage/model_readme_tmpl?name=Bert-VITS2%E4%B8%AD%E6%97%A5%E8%8B%B1%E5%BA%95%E6%A8%A1-fix
67 | ```
68 |
69 | ```
70 | E:\work\Bert-VITS2-v202\pretrained_models>tree /f
71 | Folder PATH listing for volume myssd
72 | Volume serial number is 7CE3-15AE
73 | E:.
74 | DUR_0.pth
75 | D_0.pth
76 | G_0.pth
77 |
78 | No subfolders exist
79 | ```
80 |
81 | ## Download the dataset
82 |
83 | ```
84 | https://pan.ai-hobbyist.org/Genshin%20Datasets/%E4%B8%AD%E6%96%87%20-%20Chinese/%E5%88%86%E8%A7%92%E8%89%B2%20-%20Single/%E8%A7%92%E8%89%B2%E8%AF%AD%E9%9F%B3%20-%20Character
85 | ```
86 |
87 | ## Using Keqing as an example: after extracting, place the files in the project's Data/keqing/raw/keqing directory
88 |
89 | ```
90 | E:\work\Bert-VITS2-v202\Data\keqing\raw\keqing>tree /f
91 | Folder PATH listing for volume myssd
92 | Volume serial number is 7CE3-15AE
93 | E:.
94 | vo_card_keqing_endOfGame_fail_01.lab
95 | vo_card_keqing_endOfGame_fail_01.wav
96 | ```
97 |
98 | ## Transcribe the annotation files
99 |
100 | ```
101 |
102 | python3 transcribe_genshin.py
103 |
104 | ```
105 |
106 |
107 | ## If you build your own dataset, name the audio material after the current model as a *.wav file (e.g. meimei.wav), place it in the raw directory, and then run the slicing script
108 |
109 | ```
110 | python3 audio_slicer.py
111 | ```
112 |
113 | ```
114 | E:\work\Bert-VITS2-v202_demo\Data\meimei\raw\meimei>tree /f
115 | Folder PATH listing for volume myssd
116 | Volume serial number is 7CE3-15AE
117 | E:.
118 | meimei_0.wav
119 | meimei_1.wav
120 | meimei_2.wav
121 | meimei_3.wav
122 | meimei_4.wav
123 | meimei_5.wav
124 | meimei_6.wav
125 | meimei_7.wav
126 | meimei_8.wav
127 | ```
128 |
129 | ## Preprocess the text and generate the BERT feature files:
130 |
131 | ```
132 | python3 preprocess_text.py
133 |
134 | python3 bert_gen.py
135 |
136 | ```
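
Each line of the annotation file consumed by `preprocess_text.py` has the form `{wav_path}|{speaker_name}|{language}|{text}` (see `configs/default_config.yml`); a hypothetical example:

```
Data/keqing/audios/wavs/vo_card_keqing_endOfGame_fail_01.wav|keqing|ZH|...
```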
137 |
138 | ## Start training
139 |
140 | ```
141 | python3 train_ms.py
142 | ```
143 |
144 | ## Trained model directory
145 |
146 | ```
147 |
148 | E:\work\Bert-VITS2-v202\Data\keqing\models>tree /f
149 | Folder PATH listing for volume myssd
150 | Volume serial number is 7CE3-15AE
151 | E:.
152 | │ DUR_0.pth
153 | │ DUR_550.pth
154 | │ DUR_600.pth
155 | │ DUR_650.pth
156 | │ D_0.pth
157 | │ D_600.pth
158 | │ D_650.pth
159 | │ events.out.tfevents.1700625154.ly.24008.0
160 | │ events.out.tfevents.1700630428.ly.20380.0
161 | │ G_0.pth
162 | │ G_450.pth
163 | │ G_500.pth
164 | │ G_550.pth
165 | │ G_600.pth
166 | │ G_650.pth
167 | │ train.log
168 | │
169 | └───eval
170 | events.out.tfevents.1700625154.ly.24008.1
171 | events.out.tfevents.1700630428.ly.20380.1
172 |
173 | ```
174 |
175 | ## Verify the model with inference
176 |
177 | ```
178 | python3 server_fastapi.py
179 | ```
180 |
--------------------------------------------------------------------------------
/Web/assets/index-49e71a58.css:
--------------------------------------------------------------------------------
1 | html,body{width:100%;height:100%}input::-ms-clear,input::-ms-reveal{display:none}*,*:before,*:after{box-sizing:border-box}html{font-family:sans-serif;line-height:1.15;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-overflow-style:scrollbar;-webkit-tap-highlight-color:rgba(0,0,0,0)}@-ms-viewport{width:device-width}body{margin:0}[tabindex="-1"]:focus{outline:none}hr{box-sizing:content-box;height:0;overflow:visible}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5em;font-weight:500}p{margin-top:0;margin-bottom:1em}abbr[title],abbr[data-original-title]{-webkit-text-decoration:underline dotted;text-decoration:underline;text-decoration:underline dotted;border-bottom:0;cursor:help}address{margin-bottom:1em;font-style:normal;line-height:inherit}input[type=text],input[type=password],input[type=number],textarea{-webkit-appearance:none}ol,ul,dl{margin-top:0;margin-bottom:1em}ol ol,ul ul,ol ul,ul ol{margin-bottom:0}dt{font-weight:500}dd{margin-bottom:.5em;margin-left:0}blockquote{margin:0 0 1em}dfn{font-style:italic}b,strong{font-weight:bolder}small{font-size:80%}sub,sup{position:relative;font-size:75%;line-height:0;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}pre,code,kbd,samp{font-size:1em;font-family:SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace}pre{margin-top:0;margin-bottom:1em;overflow:auto}figure{margin:0 0 1em}img{vertical-align:middle;border-style:none}a,area,button,[role=button],input:not([type=range]),label,select,summary,textarea{touch-action:manipulation}table{border-collapse:collapse}caption{padding-top:.75em;padding-bottom:.3em;text-align:left;caption-side:bottom}input,button,select,optgroup,textarea{margin:0;color:inherit;font-size:inherit;font-family:inherit;line-height:inherit}button,input{overflow:visible}button,select{text-transform:none}button,html [type=button],[type=reset],[type=submit]{-webkit-appearance:button}button::-moz-focus-inner,[type=button]::-moz-focus-inner,[type=reset]::-moz-focus-inner,[type=submit]::-moz-focus-inner{padding:0;border-style:none}input[type=radio],input[type=checkbox]{box-sizing:border-box;padding:0}input[type=date],input[type=time],input[type=datetime-local],input[type=month]{-webkit-appearance:listbox}textarea{overflow:auto;resize:vertical}fieldset{min-width:0;margin:0;padding:0;border:0}legend{display:block;width:100%;max-width:100%;margin-bottom:.5em;padding:0;color:inherit;font-size:1.5em;line-height:inherit;white-space:normal}progress{vertical-align:baseline}[type=number]::-webkit-inner-spin-button,[type=number]::-webkit-outer-spin-button{height:auto}[type=search]{outline-offset:-2px;-webkit-appearance:none}[type=search]::-webkit-search-cancel-button,[type=search]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{font:inherit;-webkit-appearance:button}output{display:inline-block}summary{display:list-item}template{display:none}[hidden]{display:none!important}mark{padding:.2em;background-color:#feffe6}pre code.hljs{display:block;overflow-x:auto;padding:1em}code.hljs{padding:3px 5px}/*!
2 | Theme: StackOverflow Light
3 | Description: Light theme as used on stackoverflow.com
4 | Author: stackoverflow.com
5 | Maintainer: @Hirse
6 | Website: https://github.com/StackExchange/Stacks
7 | License: MIT
8 | Updated: 2021-05-15
9 |
10 | Updated for @stackoverflow/stacks v0.64.0
11 | Code Blocks: /blob/v0.64.0/lib/css/components/_stacks-code-blocks.less
12 | Colors: /blob/v0.64.0/lib/css/exports/_stacks-constants-colors.less
13 | */.hljs{color:#2f3337;background:#f6f6f6}.hljs-subst{color:#2f3337}.hljs-comment{color:#656e77}.hljs-keyword,.hljs-selector-tag,.hljs-meta .hljs-keyword,.hljs-doctag,.hljs-section,.hljs-attr{color:#015692}.hljs-attribute{color:#803378}.hljs-name,.hljs-type,.hljs-number,.hljs-selector-id,.hljs-quote,.hljs-template-tag{color:#b75501}.hljs-selector-class{color:#015692}.hljs-string,.hljs-regexp,.hljs-symbol,.hljs-variable,.hljs-template-variable,.hljs-link,.hljs-selector-attr{color:#54790d}.hljs-meta,.hljs-selector-pseudo{color:#015692}.hljs-built_in,.hljs-title,.hljs-literal{color:#b75501}.hljs-bullet,.hljs-code{color:#535a60}.hljs-meta .hljs-string{color:#54790d}.hljs-deletion{color:#c02d2e}.hljs-addition{color:#2f6f44}.hljs-emphasis{font-style:italic}.hljs-strong{font-weight:700}
14 |
--------------------------------------------------------------------------------
/configs/default_config.yml:
--------------------------------------------------------------------------------
1 | # Global configuration
2 | # To use several config files at once (e.g. two GPUs training two datasets), specify the config file via an environment variable; if unspecified, ./config.yml is used
3 |
4 | # A common path prefix is provided so that data is stored in one place instead of being scattered
5 | # Each dataset and its corresponding model live under this common path; all later path settings are relative to datasetPath
6 | # If left empty, paths are relative to the project root
7 | dataset_path: "Data/TEST"
8 |
9 | # Model mirror; huggingface by default. To use the openi mirror, set openi_token
10 | mirror: ""
11 | openi_token: ""  # openi token
12 |
13 | # resample: audio resampling configuration
14 | # Note: a space is required after ":"
15 | resample:
16 |   # target sampling rate
17 |   sampling_rate: 44100
18 |   # input directory; every .wav file under this path will be resampled
19 |   # fill in a path relative to datasetPath
20 |   in_dir: "audios/raw"  # relative to the root this is /datasetPath/in_dir
21 |   # output directory for the resampled audio files
22 |   out_dir: "audios/wavs"
23 |
24 |
25 | # preprocess_text: dataset preprocessing configuration
26 | # Note: a space is required after ":"
27 | preprocess_text:
28 |   # path to the raw transcript; each line should be {wav_path}|{speaker_name}|{language}|{text}
29 |   transcription_path: "filelists/short_character_anno.list"
30 |   # path for the cleaned transcript; may be left empty, in which case it is generated next to the raw transcript
31 |   cleaned_path: "filelists/cleaned.list"
32 |   # training set path
33 |   train_path: "filelists/train.list"
34 |   # validation set path
35 |   val_path: "filelists/val.list"
36 |   # config file path
37 |   config_path: "config.json"
38 |   # number of validation entries per speaker
39 |   val_per_spk: 5
40 |   # maximum size of the validation set; the excess is truncated and moved to the training set
41 |   max_val_total: 8
42 |   # whether to clean the data
43 |   clean: true
44 |
45 |
46 | # bert_gen configuration
47 | # Note: a space is required after ":"
48 | bert_gen:
49 |   # path to the training dataset config file
50 |   config_path: "config.json"
51 |   # number of parallel processes
52 |   num_processes: 2
53 |   # device: "cuda" for GPU inference, "cpu" for CPU inference
54 |   # this option also sets the default device of get_bert_feature
55 |   device: "cuda"
56 |   # use multiple GPUs for inference
57 |   use_multi_device: false
58 |
59 |
60 | # train_ms training configuration
61 | # Note: a space is required after ":"
62 | train_ms:
63 |   # environment variables to load; for multi-GPU training, set RANK manually in the environment
64 |   # a value here is loaded only when the matching environment variable is absent, i.e. manually set environment variables take precedence and override this file
65 |   env:
66 |     MASTER_ADDR: "localhost"
67 |     MASTER_PORT: 10086
68 |     WORLD_SIZE: 1
69 |     RANK: 0
70 |     # environment variables with arbitrary names may be added
71 |     THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
72 |   # base (pretrained) model settings
73 |   base:
74 |     use_base_model: false
75 |     repo_id: "Stardust_minus/Bert-VITS2"
76 |     model_image: "Bert-VITS2中日底模"  # model name on the openi website
77 |   # model output directory: unlike older versions, which stored runs under logs/model_name, everything now goes under Data/<your dataset>/models
78 |   model: "models"
79 |   # config file path
80 |   config_path: "config.json"
81 |
82 |
83 | # webui configuration
84 | # Note: a space is required after ":"
85 | webui:
86 |   # inference device
87 |   device: "cuda"
88 |   # model path
89 |   model: "models/G_100.pth"
90 |   # config file path
91 |   config_path: "Data/TEST/config.json"
92 |   # port number
93 |   port: 7860
94 |   # whether to deploy publicly (expose to the internet)
95 |   share: false
96 |   # whether to enable debug mode
97 |   debug: false
98 |   # language identification library; langid or fastlid
99 |   language_identification_library: "langid"
100 |
101 |
102 | # server API configuration
103 | # Note: a space is required after ":"
104 | # Note: every path in this section is relative to the project root
105 | server:
106 |   # port number
107 |   port: 7860
108 |   # default device for models (currently not implemented)
109 |   device: "cuda"
110 |   # configuration for every model to load
111 |   # Note: every model must have valid model and config paths; empty paths cause load errors
112 |   models:
113 |     - # model path
114 |       model: "./Data/TEST/models/G_100.pth"
115 |       # path to the model's config.json
116 |       config: "./Data/TEST/config.json"
117 |       # device for this model; overrides the default when set
118 |       device: "cuda"
119 |       # default language of this model
120 |       language: "ZH"
121 |       # default per-speaker parameters
122 |       # not every speaker needs an entry; missing ones use the defaults
123 |       # can be left empty for now; per-speaker configuration is not implemented yet
124 |       speakers:
125 |         - speaker: "科比"
126 |           sdp_ratio: 0.2
127 |           noise_scale: 0.6
128 |           noise_scale_w: 0.8
129 |           length_scale: 1
130 |         - speaker: "五条悟"
131 |           sdp_ratio: 0.3
132 |           noise_scale: 0.7
133 |           noise_scale_w: 0.8
134 |           length_scale: 0.5
135 |         - speaker: "安倍晋三"
136 |           sdp_ratio: 0.2
137 |           noise_scale: 0.6
138 |           noise_scale_w: 0.8
139 |           length_scale: 1.2
140 |     - # model path
141 |       model: "./Data/test/models/G_100.pth"
142 |       # path to the model's config.json
143 |       config: "./Data/test/config.json"
144 |       # device for this model; overrides the default when set
145 |       device: "cuda"
146 |       # default language of this model
147 |       language: "JP"
148 |       # default per-speaker parameters
149 |       # not every speaker needs an entry; missing ones use the defaults
150 |       speakers: [ ]  # may also be left empty
151 |
152 |
153 | # Baidu Translate open platform API configuration
154 | # API documentation: https://api.fanyi.baidu.com/doc/21
155 | # Do not share your app id and key publicly on GitHub or similar sites
156 | translate:
157 |   # your APPID
158 |   "app_key": ""
159 |   # your secret key
160 |   "secret_key": ""
--------------------------------------------------------------------------------
/emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | language: en
3 | datasets:
4 | - msp-podcast
5 | inference: true
6 | tags:
7 | - speech
8 | - audio
9 | - wav2vec2
10 | - audio-classification
11 | - emotion-recognition
12 | license: cc-by-nc-sa-4.0
13 | pipeline_tag: audio-classification
14 | ---
15 |
16 | # Model for Dimensional Speech Emotion Recognition based on Wav2vec 2.0
17 |
18 | The model expects a raw audio signal as input and outputs predictions for arousal, dominance and valence in a range of approximately 0...1. In addition, it also provides the pooled states of the last transformer layer. The model was created by fine-tuning [
19 | Wav2Vec2-Large-Robust](https://huggingface.co/facebook/wav2vec2-large-robust) on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) (v1.7). The model was pruned from 24 to 12 transformer layers before fine-tuning. An [ONNX](https://onnx.ai/) export of the model is available from [doi:10.5281/zenodo.6221127](https://zenodo.org/record/6221127). Further details are given in the associated [paper](https://arxiv.org/abs/2203.07378) and [tutorial](https://github.com/audeering/w2v2-how-to).
20 |
21 | # Usage
22 |
23 | ```python
24 | import numpy as np
25 | import torch
26 | import torch.nn as nn
27 | from transformers import Wav2Vec2Processor
28 | from transformers.models.wav2vec2.modeling_wav2vec2 import (
29 | Wav2Vec2Model,
30 | Wav2Vec2PreTrainedModel,
31 | )
32 |
33 |
34 | class RegressionHead(nn.Module):
35 | r"""Classification head."""
36 |
37 | def __init__(self, config):
38 |
39 | super().__init__()
40 |
41 | self.dense = nn.Linear(config.hidden_size, config.hidden_size)
42 | self.dropout = nn.Dropout(config.final_dropout)
43 | self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
44 |
45 | def forward(self, features, **kwargs):
46 |
47 | x = features
48 | x = self.dropout(x)
49 | x = self.dense(x)
50 | x = torch.tanh(x)
51 | x = self.dropout(x)
52 | x = self.out_proj(x)
53 |
54 | return x
55 |
56 |
57 | class EmotionModel(Wav2Vec2PreTrainedModel):
58 | r"""Speech emotion classifier."""
59 |
60 | def __init__(self, config):
61 |
62 | super().__init__(config)
63 |
64 | self.config = config
65 | self.wav2vec2 = Wav2Vec2Model(config)
66 | self.classifier = RegressionHead(config)
67 | self.init_weights()
68 |
69 | def forward(
70 | self,
71 | input_values,
72 | ):
73 |
74 | outputs = self.wav2vec2(input_values)
75 | hidden_states = outputs[0]
76 | hidden_states = torch.mean(hidden_states, dim=1)
77 | logits = self.classifier(hidden_states)
78 |
79 | return hidden_states, logits
80 |
81 |
82 |
83 | # load model from hub
84 | device = 'cpu'
85 | model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
86 | processor = Wav2Vec2Processor.from_pretrained(model_name)
87 | model = EmotionModel.from_pretrained(model_name)
88 |
89 | # dummy signal
90 | sampling_rate = 16000
91 | signal = np.zeros((1, sampling_rate), dtype=np.float32)
92 |
93 |
94 | def process_func(
95 | x: np.ndarray,
96 | sampling_rate: int,
97 | embeddings: bool = False,
98 | ) -> np.ndarray:
99 | r"""Predict emotions or extract embeddings from raw audio signal."""
100 |
101 | # run through processor to normalize signal
102 | # always returns a batch, so we just get the first entry
103 | # then we put it on the device
104 | y = processor(x, sampling_rate=sampling_rate)
105 | y = y['input_values'][0]
106 | y = y.reshape(1, -1)
107 | y = torch.from_numpy(y).to(device)
108 |
109 | # run through model
110 | with torch.no_grad():
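  | # the model returns (pooled hidden states, logits); index 0 selects embeddings, index 1 the arousal/dominance/valence predictions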
111 | y = model(y)[0 if embeddings else 1]
112 |
113 | # convert to numpy
114 | y = y.detach().cpu().numpy()
115 |
116 | return y
117 |
118 |
119 | print(process_func(signal, sampling_rate))
120 | # Arousal dominance valence
121 | # [[0.5460754 0.6062266 0.40431657]]
122 |
123 | print(process_func(signal, sampling_rate, embeddings=True))
124 | # Pooled hidden states of last transformer layer
125 | # [[-0.00752167 0.0065819 -0.00746342 ... 0.00663632 0.00848748
126 | # 0.00599211]]
127 | ```
128 |
--------------------------------------------------------------------------------
/mel_processing.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.utils.data
3 | from librosa.filters import mel as librosa_mel_fn
4 | import warnings
5 |
6 | # warnings.simplefilter(action='ignore', category=FutureWarning)
7 | warnings.filterwarnings(action="ignore")
8 | MAX_WAV_VALUE = 32768.0
9 |
10 |
11 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
12 | """
13 | PARAMS
14 | ------
15 | C: compression factor
16 | """
17 | return torch.log(torch.clamp(x, min=clip_val) * C)
18 |
19 |
20 | def dynamic_range_decompression_torch(x, C=1):
21 | """
22 | PARAMS
23 | ------
24 | C: compression factor used to compress
25 | """
26 | return torch.exp(x) / C
27 |
28 |
29 | def spectral_normalize_torch(magnitudes):
30 | output = dynamic_range_compression_torch(magnitudes)
31 | return output
32 |
33 |
34 | def spectral_de_normalize_torch(magnitudes):
35 | output = dynamic_range_decompression_torch(magnitudes)
36 | return output
37 |
38 |
39 | mel_basis = {}
40 | hann_window = {}
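  | # caches keyed by dtype/device (plus fmax or window size) so each filterbank/window tensor is built only once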
41 |
42 |
43 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
44 | if torch.min(y) < -1.0:
45 | print("min value is ", torch.min(y))
46 | if torch.max(y) > 1.0:
47 | print("max value is ", torch.max(y))
48 |
49 | global hann_window
50 | dtype_device = str(y.dtype) + "_" + str(y.device)
51 | wnsize_dtype_device = str(win_size) + "_" + dtype_device
52 | if wnsize_dtype_device not in hann_window:
53 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
54 | dtype=y.dtype, device=y.device
55 | )
56 |
57 | y = torch.nn.functional.pad(
58 | y.unsqueeze(1),
59 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
60 | mode="reflect",
61 | )
62 | y = y.squeeze(1)
63 |
64 | spec = torch.stft(
65 | y,
66 | n_fft,
67 | hop_length=hop_size,
68 | win_length=win_size,
69 | window=hann_window[wnsize_dtype_device],
70 | center=center,
71 | pad_mode="reflect",
72 | normalized=False,
73 | onesided=True,
74 | return_complex=False,
75 | )
76 |
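  | # magnitude spectrogram from the stacked real/imag parts; the 1e-6 keeps the sqrt gradient finite at zero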
77 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
78 | return spec
79 |
80 |
81 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
82 | global mel_basis
83 | dtype_device = str(spec.dtype) + "_" + str(spec.device)
84 | fmax_dtype_device = str(fmax) + "_" + dtype_device
85 | if fmax_dtype_device not in mel_basis:
86 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
87 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
88 | dtype=spec.dtype, device=spec.device
89 | )
90 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
91 | spec = spectral_normalize_torch(spec)
92 | return spec
93 |
94 |
95 | def mel_spectrogram_torch(
96 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
97 | ):
98 | if torch.min(y) < -1.0:
99 | print("min value is ", torch.min(y))
100 | if torch.max(y) > 1.0:
101 | print("max value is ", torch.max(y))
102 |
103 | global mel_basis, hann_window
104 | dtype_device = str(y.dtype) + "_" + str(y.device)
105 | fmax_dtype_device = str(fmax) + "_" + dtype_device
106 | wnsize_dtype_device = str(win_size) + "_" + dtype_device
107 | if fmax_dtype_device not in mel_basis:
108 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
109 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
110 | dtype=y.dtype, device=y.device
111 | )
112 | if wnsize_dtype_device not in hann_window:
113 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
114 | dtype=y.dtype, device=y.device
115 | )
116 |
117 | y = torch.nn.functional.pad(
118 | y.unsqueeze(1),
119 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
120 | mode="reflect",
121 | )
122 | y = y.squeeze(1)
123 |
124 | spec = torch.stft(
125 | y,
126 | n_fft,
127 | hop_length=hop_size,
128 | win_length=win_size,
129 | window=hann_window[wnsize_dtype_device],
130 | center=center,
131 | pad_mode="reflect",
132 | normalized=False,
133 | onesided=True,
134 | return_complex=False,
135 | )
136 |
137 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
138 |
139 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
140 | spec = spectral_normalize_torch(spec)
141 |
142 | return spec
143 |
--------------------------------------------------------------------------------
/oldVersion/V101/text/english.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import os
3 | import re
4 | from g2p_en import G2p
5 |
6 | from text import symbols
7 |
8 | current_file_path = os.path.dirname(__file__)
9 | CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
10 | CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
11 | _g2p = G2p()
12 |
13 | arpa = {
14 | "AH0",
15 | "S",
16 | "AH1",
17 | "EY2",
18 | "AE2",
19 | "EH0",
20 | "OW2",
21 | "UH0",
22 | "NG",
23 | "B",
24 | "G",
25 | "AY0",
26 | "M",
27 | "AA0",
28 | "F",
29 | "AO0",
30 | "ER2",
31 | "UH1",
32 | "IY1",
33 | "AH2",
34 | "DH",
35 | "IY0",
36 | "EY1",
37 | "IH0",
38 | "K",
39 | "N",
40 | "W",
41 | "IY2",
42 | "T",
43 | "AA1",
44 | "ER1",
45 | "EH2",
46 | "OY0",
47 | "UH2",
48 | "UW1",
49 | "Z",
50 | "AW2",
51 | "AW1",
52 | "V",
53 | "UW2",
54 | "AA2",
55 | "ER",
56 | "AW0",
57 | "UW0",
58 | "R",
59 | "OW1",
60 | "EH1",
61 | "ZH",
62 | "AE0",
63 | "IH2",
64 | "IH",
65 | "Y",
66 | "JH",
67 | "P",
68 | "AY1",
69 | "EY0",
70 | "OY2",
71 | "TH",
72 | "HH",
73 | "D",
74 | "ER0",
75 | "CH",
76 | "AO1",
77 | "AE1",
78 | "AO2",
79 | "OY1",
80 | "AY2",
81 | "IH1",
82 | "OW0",
83 | "L",
84 | "SH",
85 | }
86 |
87 |
88 | def post_replace_ph(ph):
89 | rep_map = {
90 | ":": ",",
91 | ";": ",",
92 | ",": ",",
93 | "。": ".",
94 | "!": "!",
95 | "?": "?",
96 | "\n": ".",
97 | "·": ",",
98 | "、": ",",
99 | "...": "…",
100 | "v": "V",
101 | }
102 | if ph in rep_map.keys():
103 | ph = rep_map[ph]
104 | if ph in symbols:
105 | return ph
106 | # anything not in the symbol vocabulary becomes "UNK"
107 | ph = "UNK"
108 | return ph
109 |
110 |
111 | def read_dict():
112 | g2p_dict = {}
113 | start_line = 49
114 | with open(CMU_DICT_PATH) as f:
115 | line = f.readline()
116 | line_index = 1
117 | while line:
118 | if line_index >= start_line:
119 | line = line.strip()
120 | word_split = line.split(" ")
121 | word = word_split[0]
122 |
123 | syllable_split = word_split[1].split(" - ")
124 | g2p_dict[word] = []
125 | for syllable in syllable_split:
126 | phone_split = syllable.split(" ")
127 | g2p_dict[word].append(phone_split)
128 |
129 | line_index = line_index + 1
130 | line = f.readline()
131 |
132 | return g2p_dict
133 |
134 |
135 | def cache_dict(g2p_dict, file_path):
136 | with open(file_path, "wb") as pickle_file:
137 | pickle.dump(g2p_dict, pickle_file)
138 |
139 |
140 | def get_dict():
141 | if os.path.exists(CACHE_PATH):
142 | with open(CACHE_PATH, "rb") as pickle_file:
143 | g2p_dict = pickle.load(pickle_file)
144 | else:
145 | g2p_dict = read_dict()
146 | cache_dict(g2p_dict, CACHE_PATH)
147 |
148 | return g2p_dict
149 |
150 |
151 | eng_dict = get_dict()
152 |
153 |
154 | def refine_ph(phn):
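  | # split off a trailing ARPAbet stress digit and map it to a tone: no digit -> 0, digit d -> d + 1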
155 | tone = 0
156 | if re.search(r"\d$", phn):
157 | tone = int(phn[-1]) + 1
158 | phn = phn[:-1]
159 | return phn.lower(), tone
160 |
161 |
162 | def refine_syllables(syllables):
163 | tones = []
164 | phonemes = []
165 | for phn_list in syllables:
166 | for i in range(len(phn_list)):
167 | phn = phn_list[i]
168 | phn, tone = refine_ph(phn)
169 | phonemes.append(phn)
170 | tones.append(tone)
171 | return phonemes, tones
172 |
173 |
174 | def text_normalize(text):
175 | # todo: eng text normalize
176 | return text
177 |
178 |
179 | def g2p(text):
180 | phones = []
181 | tones = []
182 | words = re.split(r"([,;.\-\?\!\s+])", text)
183 | for w in words:
184 | if w.upper() in eng_dict:
185 | phns, tns = refine_syllables(eng_dict[w.upper()])
186 | phones += phns
187 | tones += tns
188 | else:
189 | phone_list = list(filter(lambda p: p != " ", _g2p(w)))
190 | for ph in phone_list:
191 | if ph in arpa:
192 | ph, tn = refine_ph(ph)
193 | phones.append(ph)
194 | tones.append(tn)
195 | else:
196 | phones.append(ph)
197 | tones.append(0)
198 | # todo: implement word2ph
199 | word2ph = [1 for i in phones]
200 |
201 | phones = [post_replace_ph(i) for i in phones]
202 | return phones, tones, word2ph
203 |
204 |
205 | if __name__ == "__main__":
206 | # print(get_dict())
207 | # print(eng_word_to_phoneme("hello"))
208 | print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
209 | # all_phones = set()
210 | # for k, syllables in eng_dict.items():
211 | # for group in syllables:
212 | # for ph in group:
213 | # all_phones.add(ph)
214 | # print(all_phones)
215 |
--------------------------------------------------------------------------------
/oldVersion/V110/text/english.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import os
3 | import re
4 | from g2p_en import G2p
5 |
6 | from . import symbols
7 |
8 | current_file_path = os.path.dirname(__file__)
9 | CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
10 | CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
11 | _g2p = G2p()
12 |
13 | arpa = {
14 | "AH0",
15 | "S",
16 | "AH1",
17 | "EY2",
18 | "AE2",
19 | "EH0",
20 | "OW2",
21 | "UH0",
22 | "NG",
23 | "B",
24 | "G",
25 | "AY0",
26 | "M",
27 | "AA0",
28 | "F",
29 | "AO0",
30 | "ER2",
31 | "UH1",
32 | "IY1",
33 | "AH2",
34 | "DH",
35 | "IY0",
36 | "EY1",
37 | "IH0",
38 | "K",
39 | "N",
40 | "W",
41 | "IY2",
42 | "T",
43 | "AA1",
44 | "ER1",
45 | "EH2",
46 | "OY0",
47 | "UH2",
48 | "UW1",
49 | "Z",
50 | "AW2",
51 | "AW1",
52 | "V",
53 | "UW2",
54 | "AA2",
55 | "ER",
56 | "AW0",
57 | "UW0",
58 | "R",
59 | "OW1",
60 | "EH1",
61 | "ZH",
62 | "AE0",
63 | "IH2",
64 | "IH",
65 | "Y",
66 | "JH",
67 | "P",
68 | "AY1",
69 | "EY0",
70 | "OY2",
71 | "TH",
72 | "HH",
73 | "D",
74 | "ER0",
75 | "CH",
76 | "AO1",
77 | "AE1",
78 | "AO2",
79 | "OY1",
80 | "AY2",
81 | "IH1",
82 | "OW0",
83 | "L",
84 | "SH",
85 | }
86 |
87 |
88 | def post_replace_ph(ph):
89 | rep_map = {
90 | ":": ",",
91 | ";": ",",
92 | ",": ",",
93 | "。": ".",
94 | "!": "!",
95 | "?": "?",
96 | "\n": ".",
97 | "·": ",",
98 | "、": ",",
99 | "...": "…",
100 | "v": "V",
101 | }
102 | if ph in rep_map.keys():
103 | ph = rep_map[ph]
104 | if ph in symbols:
105 | return ph
106 | # anything not in the symbol vocabulary becomes "UNK"
107 | ph = "UNK"
108 | return ph
109 |
110 |
111 | def read_dict():
112 | g2p_dict = {}
113 | start_line = 49
114 | with open(CMU_DICT_PATH) as f:
115 | line = f.readline()
116 | line_index = 1
117 | while line:
118 | if line_index >= start_line:
119 | line = line.strip()
120 | word_split = line.split(" ")
121 | word = word_split[0]
122 |
123 | syllable_split = word_split[1].split(" - ")
124 | g2p_dict[word] = []
125 | for syllable in syllable_split:
126 | phone_split = syllable.split(" ")
127 | g2p_dict[word].append(phone_split)
128 |
129 | line_index = line_index + 1
130 | line = f.readline()
131 |
132 | return g2p_dict
133 |
134 |
135 | def cache_dict(g2p_dict, file_path):
136 | with open(file_path, "wb") as pickle_file:
137 | pickle.dump(g2p_dict, pickle_file)
138 |
139 |
140 | def get_dict():
141 | if os.path.exists(CACHE_PATH):
142 | with open(CACHE_PATH, "rb") as pickle_file:
143 | g2p_dict = pickle.load(pickle_file)
144 | else:
145 | g2p_dict = read_dict()
146 | cache_dict(g2p_dict, CACHE_PATH)
147 |
148 | return g2p_dict
149 |
150 |
151 | eng_dict = get_dict()
152 |
153 |
154 | def refine_ph(phn):
155 | tone = 0
156 | if re.search(r"\d$", phn):
157 | tone = int(phn[-1]) + 1
158 | phn = phn[:-1]
159 | return phn.lower(), tone
160 |
161 |
162 | def refine_syllables(syllables):
163 | tones = []
164 | phonemes = []
165 | for phn_list in syllables:
166 | for i in range(len(phn_list)):
167 | phn = phn_list[i]
168 | phn, tone = refine_ph(phn)
169 | phonemes.append(phn)
170 | tones.append(tone)
171 | return phonemes, tones
172 |
173 |
174 | def text_normalize(text):
175 | # todo: eng text normalize
176 | return text
177 |
178 |
179 | def g2p(text):
180 | phones = []
181 | tones = []
182 | words = re.split(r"([,;.\-\?\!\s+])", text)
183 | for w in words:
184 | if w.upper() in eng_dict:
185 | phns, tns = refine_syllables(eng_dict[w.upper()])
186 | phones += phns
187 | tones += tns
188 | else:
189 | phone_list = list(filter(lambda p: p != " ", _g2p(w)))
190 | for ph in phone_list:
191 | if ph in arpa:
192 | ph, tn = refine_ph(ph)
193 | phones.append(ph)
194 | tones.append(tn)
195 | else:
196 | phones.append(ph)
197 | tones.append(0)
198 | # todo: implement word2ph
199 | word2ph = [1 for i in phones]
200 |
201 | phones = [post_replace_ph(i) for i in phones]
202 | return phones, tones, word2ph
203 |
204 |
205 | if __name__ == "__main__":
206 | # print(get_dict())
207 | # print(eng_word_to_phoneme("hello"))
208 | print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
209 | # all_phones = set()
210 | # for k, syllables in eng_dict.items():
211 | # for group in syllables:
212 | # for ph in group:
213 | # all_phones.add(ph)
214 | # print(all_phones)
215 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/english.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import os
3 | import re
4 | from g2p_en import G2p
5 |
6 | from . import symbols
7 |
8 | current_file_path = os.path.dirname(__file__)
9 | CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
10 | CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
11 | _g2p = G2p()
12 |
13 | arpa = {
14 | "AH0",
15 | "S",
16 | "AH1",
17 | "EY2",
18 | "AE2",
19 | "EH0",
20 | "OW2",
21 | "UH0",
22 | "NG",
23 | "B",
24 | "G",
25 | "AY0",
26 | "M",
27 | "AA0",
28 | "F",
29 | "AO0",
30 | "ER2",
31 | "UH1",
32 | "IY1",
33 | "AH2",
34 | "DH",
35 | "IY0",
36 | "EY1",
37 | "IH0",
38 | "K",
39 | "N",
40 | "W",
41 | "IY2",
42 | "T",
43 | "AA1",
44 | "ER1",
45 | "EH2",
46 | "OY0",
47 | "UH2",
48 | "UW1",
49 | "Z",
50 | "AW2",
51 | "AW1",
52 | "V",
53 | "UW2",
54 | "AA2",
55 | "ER",
56 | "AW0",
57 | "UW0",
58 | "R",
59 | "OW1",
60 | "EH1",
61 | "ZH",
62 | "AE0",
63 | "IH2",
64 | "IH",
65 | "Y",
66 | "JH",
67 | "P",
68 | "AY1",
69 | "EY0",
70 | "OY2",
71 | "TH",
72 | "HH",
73 | "D",
74 | "ER0",
75 | "CH",
76 | "AO1",
77 | "AE1",
78 | "AO2",
79 | "OY1",
80 | "AY2",
81 | "IH1",
82 | "OW0",
83 | "L",
84 | "SH",
85 | }
86 |
87 |
88 | def post_replace_ph(ph):
89 | rep_map = {
90 | ":": ",",
91 | ";": ",",
92 | ",": ",",
93 | "。": ".",
94 | "!": "!",
95 | "?": "?",
96 | "\n": ".",
97 | "·": ",",
98 | "、": ",",
99 | "...": "…",
100 | "v": "V",
101 | }
102 | if ph in rep_map.keys():
103 | ph = rep_map[ph]
104 | if ph in symbols:
105 | return ph
106 | # anything not in the symbol vocabulary becomes "UNK"
107 | ph = "UNK"
108 | return ph
109 |
110 |
111 | def read_dict():
112 | g2p_dict = {}
113 | start_line = 49
114 | with open(CMU_DICT_PATH) as f:
115 | line = f.readline()
116 | line_index = 1
117 | while line:
118 | if line_index >= start_line:
119 | line = line.strip()
120 | word_split = line.split(" ")
121 | word = word_split[0]
122 |
123 | syllable_split = word_split[1].split(" - ")
124 | g2p_dict[word] = []
125 | for syllable in syllable_split:
126 | phone_split = syllable.split(" ")
127 | g2p_dict[word].append(phone_split)
128 |
129 | line_index = line_index + 1
130 | line = f.readline()
131 |
132 | return g2p_dict
133 |
134 |
135 | def cache_dict(g2p_dict, file_path):
136 | with open(file_path, "wb") as pickle_file:
137 | pickle.dump(g2p_dict, pickle_file)
138 |
139 |
140 | def get_dict():
141 | if os.path.exists(CACHE_PATH):
142 | with open(CACHE_PATH, "rb") as pickle_file:
143 | g2p_dict = pickle.load(pickle_file)
144 | else:
145 | g2p_dict = read_dict()
146 | cache_dict(g2p_dict, CACHE_PATH)
147 |
148 | return g2p_dict
149 |
150 |
151 | eng_dict = get_dict()
152 |
153 |
154 | def refine_ph(phn):
155 | tone = 0
156 | if re.search(r"\d$", phn):
157 | tone = int(phn[-1]) + 1
158 | phn = phn[:-1]
159 | return phn.lower(), tone
160 |
161 |
162 | def refine_syllables(syllables):
163 | tones = []
164 | phonemes = []
165 | for phn_list in syllables:
166 | for i in range(len(phn_list)):
167 | phn = phn_list[i]
168 | phn, tone = refine_ph(phn)
169 | phonemes.append(phn)
170 | tones.append(tone)
171 | return phonemes, tones
172 |
173 |
174 | def text_normalize(text):
175 | # todo: eng text normalize
176 | return text
177 |
178 |
179 | def g2p(text):
180 | phones = []
181 | tones = []
182 | words = re.split(r"([,;.\-\?\!\s+])", text)
183 | for w in words:
184 | if w.upper() in eng_dict:
185 | phns, tns = refine_syllables(eng_dict[w.upper()])
186 | phones += phns
187 | tones += tns
188 | else:
189 | phone_list = list(filter(lambda p: p != " ", _g2p(w)))
190 | for ph in phone_list:
191 | if ph in arpa:
192 | ph, tn = refine_ph(ph)
193 | phones.append(ph)
194 | tones.append(tn)
195 | else:
196 | phones.append(ph)
197 | tones.append(0)
198 | # todo: implement word2ph
199 | word2ph = [1 for i in phones]
200 |
201 | phones = [post_replace_ph(i) for i in phones]
202 | return phones, tones, word2ph
203 |
204 |
205 | if __name__ == "__main__":
206 | # print(get_dict())
207 | # print(eng_word_to_phoneme("hello"))
208 | print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
209 | # all_phones = set()
210 | # for k, syllables in eng_dict.items():
211 | # for group in syllables:
212 | # for ph in group:
213 | # all_phones.add(ph)
214 | # print(all_phones)
215 |
--------------------------------------------------------------------------------
/emo_gen.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.utils.data import Dataset
4 | from torch.utils.data import DataLoader
5 | from transformers import Wav2Vec2Processor
6 | from transformers.models.wav2vec2.modeling_wav2vec2 import (
7 | Wav2Vec2Model,
8 | Wav2Vec2PreTrainedModel,
9 | )
10 | import librosa
11 | import numpy as np
12 | import argparse
13 | from config import config
14 | import utils
15 | import os
16 | from tqdm import tqdm
17 |
18 |
19 | class RegressionHead(nn.Module):
20 | r"""Classification head."""
21 |
22 | def __init__(self, config):
23 | super().__init__()
24 |
25 | self.dense = nn.Linear(config.hidden_size, config.hidden_size)
26 | self.dropout = nn.Dropout(config.final_dropout)
27 | self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
28 |
29 | def forward(self, features, **kwargs):
30 | x = features
31 | x = self.dropout(x)
32 | x = self.dense(x)
33 | x = torch.tanh(x)
34 | x = self.dropout(x)
35 | x = self.out_proj(x)
36 |
37 | return x
38 |
39 |
40 | class EmotionModel(Wav2Vec2PreTrainedModel):
41 | r"""Speech emotion classifier."""
42 |
43 | def __init__(self, config):
44 | super().__init__(config)
45 |
46 | self.config = config
47 | self.wav2vec2 = Wav2Vec2Model(config)
48 | self.classifier = RegressionHead(config)
49 | self.init_weights()
50 |
51 | def forward(
52 | self,
53 | input_values,
54 | ):
55 | outputs = self.wav2vec2(input_values)
56 | hidden_states = outputs[0]
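  | # mean-pool the frame-level features over the time axis into one vector per clip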
57 | hidden_states = torch.mean(hidden_states, dim=1)
58 | logits = self.classifier(hidden_states)
59 |
60 | return hidden_states, logits
61 |
62 |
63 | class AudioDataset(Dataset):
64 | def __init__(self, list_of_wav_files, sr, processor):
65 | self.list_of_wav_files = list_of_wav_files
66 | self.processor = processor
67 | self.sr = sr
68 |
69 | def __len__(self):
70 | return len(self.list_of_wav_files)
71 |
72 | def __getitem__(self, idx):
73 | wav_file = self.list_of_wav_files[idx]
74 | audio_data, _ = librosa.load(wav_file, sr=self.sr)
75 | processed_data = self.processor(audio_data, sampling_rate=self.sr)[
76 | "input_values"
77 | ][0]
78 | return torch.from_numpy(processed_data)
79 |
80 |
81 | model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim"
82 | processor = Wav2Vec2Processor.from_pretrained(model_name)
83 | model = EmotionModel.from_pretrained(model_name)
84 |
85 |
86 | def process_func(
87 | x: np.ndarray,
88 | sampling_rate: int,
89 | model: EmotionModel,
90 | processor: Wav2Vec2Processor,
91 | device: str,
92 | embeddings: bool = False,
93 | ) -> np.ndarray:
94 | r"""Predict emotions or extract embeddings from raw audio signal."""
95 | model = model.to(device)
96 | y = processor(x, sampling_rate=sampling_rate)
97 | y = y["input_values"][0]
98 | y = torch.from_numpy(y).unsqueeze(0).to(device)
99 |
100 | # run through model
101 | with torch.no_grad():
102 | y = model(y)[0 if embeddings else 1]
103 |
104 | # convert to numpy
105 | y = y.detach().cpu().numpy()
106 |
107 | return y
108 |
109 |
110 | def get_emo(path):
111 | wav, sr = librosa.load(path, sr=16000)
112 | device = config.bert_gen_config.device
113 | return process_func(
114 | np.expand_dims(wav, 0).astype(float),
115 | sr,
116 | model,
117 | processor,
118 | device,
119 | embeddings=True,
120 | ).squeeze(0)
121 |
122 |
123 | if __name__ == "__main__":
124 | parser = argparse.ArgumentParser()
125 | parser.add_argument(
126 | "-c", "--config", type=str, default=config.bert_gen_config.config_path
127 | )
128 | parser.add_argument(
129 | "--num_processes", type=int, default=config.bert_gen_config.num_processes
130 | )
131 | args, _ = parser.parse_known_args()
132 | config_path = args.config
133 | hps = utils.get_hparams_from_file(config_path)
134 |
135 | device = config.bert_gen_config.device
136 |
137 | model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim"
138 | processor = (
139 | Wav2Vec2Processor.from_pretrained(model_name)
140 | if processor is None
141 | else processor
142 | )
143 | model = (
144 | EmotionModel.from_pretrained(model_name).to(device)
145 | if model is None
146 | else model.to(device)
147 | )
148 |
149 | lines = []
150 | with open(hps.data.training_files, encoding="utf-8") as f:
151 | lines.extend(f.readlines())
152 |
153 | with open(hps.data.validation_files, encoding="utf-8") as f:
154 | lines.extend(f.readlines())
155 |
156 | wavnames = [line.split("|")[0] for line in lines]
157 | dataset = AudioDataset(wavnames, 16000, processor)
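  | # batch_size=1 because clips have different lengths and no padding/collation is applied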
158 | data_loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=16)
159 |
160 | with torch.no_grad():
161 | for i, data in tqdm(enumerate(data_loader), total=len(data_loader)):
162 | wavname = wavnames[i]
163 | emo_path = wavname.replace(".wav", ".emo.npy")
164 | if os.path.exists(emo_path):
165 | continue
166 | emb = model(data.to(device))[0].detach().cpu().numpy()
167 | np.save(emo_path, emb)
168 |
169 | print("Emo vec 生成完毕!")
170 |
--------------------------------------------------------------------------------
/preprocess_text.py:
--------------------------------------------------------------------------------
1 | import json
2 | from collections import defaultdict
3 | from random import shuffle
4 | from typing import Optional
5 | import os
6 |
7 | from tqdm import tqdm
8 | import click
9 | from text.cleaner import clean_text
10 | from config import config
11 | from infer import latest_version
12 |
13 | preprocess_text_config = config.preprocess_text_config
14 |
15 |
16 | @click.command()
17 | @click.option(
18 | "--transcription-path",
19 | default=preprocess_text_config.transcription_path,
20 | type=click.Path(exists=True, file_okay=True, dir_okay=False),
21 | )
22 | @click.option("--cleaned-path", default=preprocess_text_config.cleaned_path)
23 | @click.option("--train-path", default=preprocess_text_config.train_path)
24 | @click.option("--val-path", default=preprocess_text_config.val_path)
25 | @click.option(
26 | "--config-path",
27 | default=preprocess_text_config.config_path,
28 | type=click.Path(exists=True, file_okay=True, dir_okay=False),
29 | )
30 | @click.option("--val-per-spk", default=preprocess_text_config.val_per_spk)
31 | @click.option("--max-val-total", default=preprocess_text_config.max_val_total)
32 | @click.option("--clean/--no-clean", default=preprocess_text_config.clean)
33 | @click.option("-y", "--yml_config")
34 | def preprocess(
35 | transcription_path: str,
36 | cleaned_path: Optional[str],
37 | train_path: str,
38 | val_path: str,
39 | config_path: str,
40 | val_per_spk: int,
41 | max_val_total: int,
42 | clean: bool,
43 | yml_config: str, # do not remove this
44 | ):
45 | if cleaned_path == "" or cleaned_path is None:
46 | cleaned_path = transcription_path + ".cleaned"
47 |
48 | if clean:
49 | with open(cleaned_path, "w", encoding="utf-8") as out_file:
50 | with open(transcription_path, "r", encoding="utf-8") as trans_file:
51 | lines = trans_file.readlines()
52 | # print(lines, ' ', len(lines))
53 | if len(lines) != 0:
54 | for line in tqdm(lines):
55 | try:
56 | utt, spk, language, text = line.strip().split("|")
57 | norm_text, phones, tones, word2ph = clean_text(
58 | text, language
59 | )
60 | out_file.write(
61 | "{}|{}|{}|{}|{}|{}|{}\n".format(
62 | utt,
63 | spk,
64 | language,
65 | norm_text,
66 | " ".join(phones),
67 | " ".join([str(i) for i in tones]),
68 | " ".join([str(i) for i in word2ph]),
69 | )
70 | )
71 | except Exception as e:
72 | print(line)
73 | print(f"生成训练集和验证集时发生错误!, 详细信息:\n{e}")
74 |
75 | transcription_path = cleaned_path
76 | spk_utt_map = defaultdict(list)
77 | spk_id_map = {}
78 | current_sid = 0
79 |
80 | with open(transcription_path, "r", encoding="utf-8") as f:
81 | audioPaths = set()
82 | countSame = 0
83 | countNotFound = 0
84 | for line in f.readlines():
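  | # each cleaned line has the form: utt|spk|language|norm_text|phones|tones|word2ph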
85 | utt, spk, language, text, phones, tones, word2ph = line.strip().split("|")
86 | if utt in audioPaths:
87 | # filter dataset errors: the same audio matched to multiple texts breaks the bert step later
88 | print(f"Duplicate audio/text pair: {line}")
89 | countSame += 1
90 | continue
91 | if not os.path.isfile(utt):
92 | # filter dataset errors: the corresponding audio file does not exist
93 | print(f"Audio file not found: {utt}")
94 | countNotFound += 1
95 | continue
96 | audioPaths.add(utt)
97 | spk_utt_map[spk].append(line)
98 |
99 | if spk not in spk_id_map.keys():
100 | spk_id_map[spk] = current_sid
101 | current_sid += 1
102 | print(f"总重复音频数:{countSame},总未找到的音频数:{countNotFound}")
103 |
104 | train_list = []
105 | val_list = []
106 |
107 | for spk, utts in spk_utt_map.items():
108 | shuffle(utts)
109 | val_list += utts[:val_per_spk]
110 | train_list += utts[val_per_spk:]
111 |
112 | if len(val_list) > max_val_total:
113 | train_list += val_list[max_val_total:]
114 | val_list = val_list[:max_val_total]
115 |
116 | with open(train_path, "w", encoding="utf-8") as f:
117 | for line in train_list:
118 | f.write(line)
119 |
120 | with open(val_path, "w", encoding="utf-8") as f:
121 | for line in val_list:
122 | f.write(line)
123 |
124 | json_config = json.load(open(config_path, encoding="utf-8"))
125 | json_config["data"]["spk2id"] = spk_id_map
126 | json_config["data"]["n_speakers"] = current_sid
127 | # also record the training version and dataset paths
128 | json_config["version"] = latest_version
129 | json_config["data"]["training_files"] = os.path.normpath(train_path).replace(
130 | "\\", "/"
131 | )
132 | json_config["data"]["validation_files"] = os.path.normpath(val_path).replace(
133 | "\\", "/"
134 | )
135 | with open(config_path, "w", encoding="utf-8") as f:
136 | json.dump(json_config, f, indent=2, ensure_ascii=False)
137 | print("训练集和验证集生成完成!")
138 |
139 |
140 | if __name__ == "__main__":
141 | preprocess()
142 |
--------------------------------------------------------------------------------
/short_audio_transcribe.py:
--------------------------------------------------------------------------------
1 | import whisper
2 | import os
3 | import json
4 | import torchaudio
5 | import argparse
6 | import torch
7 | from config import config
8 | lang2token = {
9 | 'zh': "ZH|",
10 | 'ja': "JP|",
11 | "en": "EN|",
12 | }
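  | # language tokens are prefixed to transcripts, so annotation lines become "path|speaker|ZH|text"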
13 | def transcribe_one(audio_path):
14 | # load audio and pad/trim it to fit 30 seconds
15 | audio = whisper.load_audio(audio_path)
16 | audio = whisper.pad_or_trim(audio)
17 |
18 | # make log-Mel spectrogram and move to the same device as the model
19 | mel = whisper.log_mel_spectrogram(audio).to(model.device)
20 |
21 | # detect the spoken language
22 | _, probs = model.detect_language(mel)
23 | print(f"Detected language: {max(probs, key=probs.get)}")
24 | lang = max(probs, key=probs.get)
25 | # decode the audio
26 | options = whisper.DecodingOptions(beam_size=5)
27 | result = whisper.decode(model, mel, options)
28 |
29 | # print the recognized text
30 | print(result.text)
31 | return lang, result.text
32 | if __name__ == "__main__":
33 | parser = argparse.ArgumentParser()
34 | parser.add_argument("--languages", default="CJ")
35 | parser.add_argument("--whisper_size", default="medium")
36 | args = parser.parse_args()
37 | if args.languages == "CJE":
38 | lang2token = {
39 | 'zh': "ZH|",
40 | 'ja': "JP|",
41 | "en": "EN|",
42 | }
43 | elif args.languages == "CJ":
44 | lang2token = {
45 | 'zh': "ZH|",
46 | 'ja': "JP|",
47 | }
48 | elif args.languages == "C":
49 | lang2token = {
50 | 'zh': "ZH|",
51 | }
52 | assert (torch.cuda.is_available()), "Please enable GPU in order to run Whisper!"
53 | model = whisper.load_model(args.whisper_size)
54 | # parent_dir = "./custom_character_voice/"
55 | parent_dir = config.resample_config.in_dir
56 | print(parent_dir)
57 | speaker_names = list(os.walk(parent_dir))[0][1]
58 | speaker_annos = []
59 | total_files = sum([len(files) for r, d, files in os.walk(parent_dir)])
60 | # resample audios
61 | # 2023/4/21: Get the target sampling rate
62 | with open(config.train_ms_config.config_path,'r', encoding='utf-8') as f:
63 | hps = json.load(f)
64 | target_sr = hps['data']['sampling_rate']
65 | processed_files = 0
66 | for speaker in speaker_names:
67 | for i, wavfile in enumerate(list(os.walk(os.path.join(parent_dir,speaker)))[0][2]):
68 | # try to load file as audio
69 | if wavfile.startswith("processed_"):
70 | continue
71 | try:
72 | wav, sr = torchaudio.load(parent_dir + "/" + speaker + "/" + wavfile, frame_offset=0, num_frames=-1, normalize=True,
73 | channels_first=True)
74 | wav = wav.mean(dim=0).unsqueeze(0)
75 | if sr != target_sr:
76 | wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav)
77 | if wav.shape[1] / target_sr > 20:
78 | print(f"{wavfile} too long, ignoring\n")
  | continue
79 | save_path = parent_dir + "/" + speaker + "/" + f"processed_{i}.wav"
80 | torchaudio.save(save_path, wav, target_sr, channels_first=True)
81 | # transcribe text
82 | lang, text = transcribe_one(save_path)
83 | if lang not in list(lang2token.keys()):
84 | print(f"{lang} not supported, ignoring\n")
85 | continue
86 | #text = "ZH|" + text + "\n"
87 | text = lang2token[lang] + text + "\n"
88 | speaker_annos.append(save_path + "|" + speaker + "|" + text)
89 |
90 | processed_files += 1
91 | print(f"Processed: {processed_files}/{total_files}")
92 | except Exception as e:
93 | print(e)
94 | continue
95 |
96 | # # clean annotation
97 | # import argparse
98 | # import text
99 | # from utils import load_filepaths_and_text
100 | # for i, line in enumerate(speaker_annos):
101 | # path, sid, txt = line.split("|")
102 | # cleaned_text = text._clean_text(txt, ["cjke_cleaners2"])
103 | # cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
104 | # speaker_annos[i] = path + "|" + sid + "|" + cleaned_text
105 | # write into annotation
106 | if len(speaker_annos) == 0:
107 | print("Warning: no short audios found, this IS expected if you have only uploaded long audios, videos or video links.")
108 | print("this IS NOT expected if you have uploaded a zip file of short audios. Please check your file structure or make sure your audio language is supported.")
109 | with open(config.preprocess_text_config.transcription_path, 'w', encoding='utf-8') as f:
110 | for line in speaker_annos:
111 | f.write(line)
112 |
113 | # import json
114 | # # generate new config
115 | # with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
116 | # hps = json.load(f)
117 | # # modify n_speakers
118 | # hps['data']["n_speakers"] = 1000 + len(speaker2id)
119 | # # add speaker names
120 | # for speaker in speaker_names:
121 | # hps['speakers'][speaker] = speaker2id[speaker]
122 | # # save modified config
123 | # with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
124 | # json.dump(hps, f, indent=2)
125 | # print("finished")
126 |
--------------------------------------------------------------------------------
/commons.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch.nn import functional as F
4 |
5 |
6 | def init_weights(m, mean=0.0, std=0.01):
7 | classname = m.__class__.__name__
8 | if classname.find("Conv") != -1:
9 | m.weight.data.normal_(mean, std)
10 |
11 |
12 | def get_padding(kernel_size, dilation=1):
13 | return int((kernel_size * dilation - dilation) / 2)
14 |
15 |
16 | def convert_pad_shape(pad_shape):
17 | layer = pad_shape[::-1]
18 | pad_shape = [item for sublist in layer for item in sublist]
19 | return pad_shape
20 |
21 |
22 | def intersperse(lst, item):
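  | # insert `item` between elements and at both ends, e.g. intersperse([1, 2, 3], 0) -> [0, 1, 0, 2, 0, 3, 0]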
23 | result = [item] * (len(lst) * 2 + 1)
24 | result[1::2] = lst
25 | return result
26 |
27 |
28 | def kl_divergence(m_p, logs_p, m_q, logs_q):
29 | """KL(P||Q)"""
30 | kl = (logs_q - logs_p) - 0.5
31 | kl += (
32 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
33 | )
34 | return kl
35 |
36 |
37 | def rand_gumbel(shape):
38 | """Sample from the Gumbel distribution, protect from overflows."""
39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
40 | return -torch.log(-torch.log(uniform_samples))
41 |
42 |
43 | def rand_gumbel_like(x):
44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
45 | return g
46 |
47 |
48 | def slice_segments(x, ids_str, segment_size=4):
49 | ret = torch.zeros_like(x[:, :, :segment_size])
50 | for i in range(x.size(0)):
51 | idx_str = ids_str[i]
52 | idx_end = idx_str + segment_size
53 | if idx_str < 0:
54 | i1 = x.size(2) + idx_str
55 | r1 = x[i, :, i1:]
56 | r2 = x[i, :, :idx_end]
57 | ret[i] = torch.cat([r1, r2], dim=1)
58 | else:
59 | ret[i] = x[i, :, idx_str:idx_end]
60 | return ret
61 |
62 |
63 | def rand_slice_segments(x, x_lengths=None, segment_size=4):
64 | b, d, t = x.size()
65 | if x_lengths is None:
66 | x_lengths = t
67 | ids_str_max = x_lengths - segment_size + 1
68 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
69 | ret = slice_segments(x, ids_str, segment_size)
70 | return ret, ids_str
71 |
72 |
73 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
74 | position = torch.arange(length, dtype=torch.float)
75 | num_timescales = channels // 2
76 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
77 | num_timescales - 1
78 | )
79 | inv_timescales = min_timescale * torch.exp(
80 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
81 | )
82 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
83 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
84 | signal = F.pad(signal, [0, 0, 0, channels % 2])
85 | signal = signal.view(1, channels, length)
86 | return signal
87 |
88 |
89 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
90 | b, channels, length = x.size()
91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
92 | return x + signal.to(dtype=x.dtype, device=x.device)
93 |
94 |
95 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
96 | b, channels, length = x.size()
97 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
98 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
99 |
100 |
101 | def subsequent_mask(length):
102 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
103 | return mask
104 |
105 |
106 | @torch.jit.script
107 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
108 | n_channels_int = n_channels[0]
109 | in_act = input_a + input_b
110 | t_act = torch.tanh(in_act[:, :n_channels_int, :])
111 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
112 | acts = t_act * s_act
113 | return acts
114 |
115 |
122 | def shift_1d(x):
123 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
124 | return x
125 |
126 |
127 | def sequence_mask(length, max_length=None):
128 | if max_length is None:
129 | max_length = length.max()
130 | x = torch.arange(max_length, dtype=length.dtype, device=length.device)
131 | return x.unsqueeze(0) < length.unsqueeze(1)
132 |
133 |
134 | def generate_path(duration, mask):
135 | """
136 | duration: [b, 1, t_x]
137 | mask: [b, 1, t_y, t_x]
138 | """
139 |
140 | b, _, t_y, t_x = mask.shape
141 | cum_duration = torch.cumsum(duration, -1)
142 |
143 | cum_duration_flat = cum_duration.view(b * t_x)
144 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
145 | path = path.view(b, t_x, t_y)
146 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
147 | path = path.unsqueeze(1).transpose(2, 3) * mask
148 | return path
149 |
150 |
151 | def clip_grad_value_(parameters, clip_value, norm_type=2):
152 | if isinstance(parameters, torch.Tensor):
153 | parameters = [parameters]
154 | parameters = list(filter(lambda p: p.grad is not None, parameters))
155 | norm_type = float(norm_type)
156 | if clip_value is not None:
157 | clip_value = float(clip_value)
158 |
159 | total_norm = 0
160 | for p in parameters:
161 | param_norm = p.grad.data.norm(norm_type)
162 | total_norm += param_norm.item() ** norm_type
163 | if clip_value is not None:
164 | p.grad.data.clamp_(min=-clip_value, max=clip_value)
165 | total_norm = total_norm ** (1.0 / norm_type)
166 | return total_norm
167 |
--------------------------------------------------------------------------------
/tools/sentence.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import regex as re
4 |
5 | from tools.classify_language import classify_language, split_alpha_nonalpha
6 |
7 |
8 | def check_is_none(item) -> bool:
9 | """none -> True, not none -> False"""
10 | return (
11 | item is None
12 | or (isinstance(item, str) and str(item).isspace())
13 | or str(item) == ""
14 | )
15 |
16 |
17 | def markup_language(text: str, target_languages: list = None) -> str:
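  | # tag each run of a single language with [LANG] markers, e.g. (with target_languages=["zh", "en"]) "你好hello" -> "[ZH]你好[ZH][EN]hello[EN]"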
18 | pattern = (
19 | r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
20 | r"\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
21 | r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
22 | )
23 | sentences = re.split(pattern, text)
24 |
25 | pre_lang = ""
26 | p = 0
27 |
28 | sorted_target_languages = sorted(target_languages) if target_languages is not None else None
29 | if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
30 | new_sentences = []
31 | for sentence in sentences:
32 | new_sentences.extend(split_alpha_nonalpha(sentence))
33 | sentences = new_sentences
34 |
35 | for sentence in sentences:
36 | if check_is_none(sentence):
37 | continue
38 |
39 | lang = classify_language(sentence, target_languages)
40 |
41 | if pre_lang == "":
42 | text = text[:p] + text[p:].replace(
43 | sentence, f"[{lang.upper()}]{sentence}", 1
44 | )
45 | p += len(f"[{lang.upper()}]")
46 | elif pre_lang != lang:
47 | text = text[:p] + text[p:].replace(
48 | sentence, f"[{pre_lang.upper()}][{lang.upper()}]{sentence}", 1
49 | )
50 | p += len(f"[{pre_lang.upper()}][{lang.upper()}]")
51 | pre_lang = lang
52 | p += text[p:].index(sentence) + len(sentence)
53 | text += f"[{pre_lang.upper()}]"
54 |
55 | return text
56 |
57 |
58 | def split_by_language(text: str, target_languages: list = None) -> list:
59 | pattern = (
60 | r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
61 | r"\!?\。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
62 | r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
63 | )
64 | sentences = re.split(pattern, text)
65 |
66 | pre_lang = ""
67 | start = 0
68 | end = 0
69 | sentences_list = []
70 |
71 | sorted_target_languages = sorted(target_languages) if target_languages is not None else None
72 | if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
73 | new_sentences = []
74 | for sentence in sentences:
75 | new_sentences.extend(split_alpha_nonalpha(sentence))
76 | sentences = new_sentences
77 |
78 | for sentence in sentences:
79 | if check_is_none(sentence):
80 | continue
81 |
82 | lang = classify_language(sentence, target_languages)
83 |
84 | end += text[end:].index(sentence)
85 | if pre_lang != "" and pre_lang != lang:
86 | sentences_list.append((text[start:end], pre_lang))
87 | start = end
88 | end += len(sentence)
89 | pre_lang = lang
90 | sentences_list.append((text[start:], pre_lang))
91 |
92 | return sentences_list
93 |
94 |
95 | def sentence_split(text: str, max: int) -> list:
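  | # greedily merge sentences until a chunk reaches `max` characters, cutting at punctuation boundaries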
96 | pattern = r"[!(),—+\-.:;??。,、;:]+"
97 | sentences = re.split(pattern, text)
98 | discarded_chars = re.findall(pattern, text)
99 |
100 | sentences_list, count, p = [], 0, 0
101 |
102 | # iterate over the delimiter runs that split the text
103 | for i, discarded_char in enumerate(discarded_chars):
104 | count += len(sentences[i]) + len(discarded_char)
105 | if count >= max:
106 | sentences_list.append(text[p : p + count].strip())
107 | p += count
108 | count = 0
109 |
110 | # append whatever text remains
111 | if p < len(text):
112 | sentences_list.append(text[p:])
113 |
114 | return sentences_list
115 |
116 |
117 | def sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None):
118 | # if this speaker only supports a single language
119 | if speaker_lang is not None and len(speaker_lang) == 1:
120 | if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]:
121 | logging.debug(
122 | f'lang "{lang}" is not in speaker_lang {speaker_lang}, automatically set lang={speaker_lang[0]}'
123 | )
124 | lang = speaker_lang[0]
125 |
126 | sentences_list = []
127 | if lang.upper() != "MIX":
128 | if max <= 0:
129 | sentences_list.append(
130 | markup_language(text, speaker_lang)
131 | if lang.upper() == "AUTO"
132 | else f"[{lang.upper()}]{text}[{lang.upper()}]"
133 | )
134 | else:
135 | for i in sentence_split(text, max):
136 | if check_is_none(i):
137 | continue
138 | sentences_list.append(
139 | markup_language(i, speaker_lang)
140 | if lang.upper() == "AUTO"
141 | else f"[{lang.upper()}]{i}[{lang.upper()}]"
142 | )
143 | else:
144 | sentences_list.append(text)
145 |
146 | for i in sentences_list:
147 | logging.debug(i)
148 |
149 | return sentences_list
150 |
151 |
152 | if __name__ == "__main__":
153 | text = "这几天心里颇不宁静。今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。我悄悄地披了大衫,带上门出去。"
154 | print(markup_language(text, target_languages=None))
155 | print(sentence_split(text, max=50))
156 | print(sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None))
157 | text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは自動ラベリングのテスト用テキストです.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了GAN Duration predictor和transformer flow,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
158 | print(split_by_language(text, ["zh", "ja", "en"]))
159 |
--------------------------------------------------------------------------------
/oldVersion/V111/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Compatibility shim for model version 1.1.1
3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.1.1
4 | """
5 | import torch
6 | import commons
7 | from .text.cleaner import clean_text, clean_text_fix
8 | from .text import cleaned_text_to_sequence
9 | from .text import get_bert, get_bert_fix
10 |
11 |
12 | def get_text(text, language_str, hps, device):
13 | norm_text, phone, tone, word2ph = clean_text(text, language_str)
14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
15 |
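  | # with blanks interleaved, each word covers twice as many phoneme slots; the extra leading blank is assigned to the first word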
16 | if hps.data.add_blank:
17 | phone = commons.intersperse(phone, 0)
18 | tone = commons.intersperse(tone, 0)
19 | language = commons.intersperse(language, 0)
20 | for i in range(len(word2ph)):
21 | word2ph[i] = word2ph[i] * 2
22 | word2ph[0] += 1
23 | bert = get_bert(norm_text, word2ph, language_str, device)
24 | del word2ph
25 | assert bert.shape[-1] == len(phone), phone
26 |
27 | if language_str == "ZH":
28 | bert = bert
29 | ja_bert = torch.zeros(768, len(phone))
30 | elif language_str == "JP":
31 | ja_bert = bert
32 | bert = torch.zeros(1024, len(phone))
33 | else:
34 | bert = torch.zeros(1024, len(phone))
35 | ja_bert = torch.zeros(768, len(phone))
36 |
37 | assert bert.shape[-1] == len(
38 | phone
39 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"
40 |
41 | phone = torch.LongTensor(phone)
42 | tone = torch.LongTensor(tone)
43 | language = torch.LongTensor(language)
44 | return bert, ja_bert, phone, tone, language
45 |
46 |
47 | def get_text_fix(text, language_str, hps, device):
48 | norm_text, phone, tone, word2ph = clean_text_fix(text, language_str)
49 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
50 |
51 | if hps.data.add_blank:
52 | phone = commons.intersperse(phone, 0)
53 | tone = commons.intersperse(tone, 0)
54 | language = commons.intersperse(language, 0)
55 | for i in range(len(word2ph)):
56 | word2ph[i] = word2ph[i] * 2
57 | word2ph[0] += 1
58 | bert = get_bert_fix(norm_text, word2ph, language_str, device)
59 | del word2ph
60 | assert bert.shape[-1] == len(phone), phone
61 |
62 | if language_str == "ZH":
63 | bert = bert
64 | ja_bert = torch.zeros(768, len(phone))
65 | elif language_str == "JP":
66 | ja_bert = bert
67 | bert = torch.zeros(1024, len(phone))
68 | else:
69 | bert = torch.zeros(1024, len(phone))
70 | ja_bert = torch.zeros(768, len(phone))
71 |
72 | assert bert.shape[-1] == len(
73 | phone
74 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"
75 |
76 | phone = torch.LongTensor(phone)
77 | tone = torch.LongTensor(tone)
78 | language = torch.LongTensor(language)
79 | return bert, ja_bert, phone, tone, language
80 |
81 |
82 | def infer(
83 | text,
84 | sdp_ratio,
85 | noise_scale,
86 | noise_scale_w,
87 | length_scale,
88 | sid,
89 | language,
90 | hps,
91 | net_g,
92 | device,
93 | ):
94 | bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps, device)
95 | with torch.no_grad():
96 | x_tst = phones.to(device).unsqueeze(0)
97 | tones = tones.to(device).unsqueeze(0)
98 | lang_ids = lang_ids.to(device).unsqueeze(0)
99 | bert = bert.to(device).unsqueeze(0)
100 | ja_bert = ja_bert.to(device).unsqueeze(0)
101 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
102 | del phones
103 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
104 | audio = (
105 | net_g.infer(
106 | x_tst,
107 | x_tst_lengths,
108 | speakers,
109 | tones,
110 | lang_ids,
111 | bert,
112 | ja_bert,
113 | sdp_ratio=sdp_ratio,
114 | noise_scale=noise_scale,
115 | noise_scale_w=noise_scale_w,
116 | length_scale=length_scale,
117 | )[0][0, 0]
118 | .data.cpu()
119 | .float()
120 | .numpy()
121 | )
122 | del x_tst, x_tst_lengths, speakers, tones, lang_ids, bert, ja_bert
123 | if torch.cuda.is_available():
124 | torch.cuda.empty_cache()
125 | return audio
126 |
127 |
128 | def infer_fix(
129 | text,
130 | sdp_ratio,
131 | noise_scale,
132 | noise_scale_w,
133 | length_scale,
134 | sid,
135 | language,
136 | hps,
137 | net_g,
138 | device,
139 | ):
140 | bert, ja_bert, phones, tones, lang_ids = get_text_fix(text, language, hps, device)
141 | with torch.no_grad():
142 | x_tst = phones.to(device).unsqueeze(0)
143 | tones = tones.to(device).unsqueeze(0)
144 | lang_ids = lang_ids.to(device).unsqueeze(0)
145 | bert = bert.to(device).unsqueeze(0)
146 | ja_bert = ja_bert.to(device).unsqueeze(0)
147 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
148 | del phones
149 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
150 | audio = (
151 | net_g.infer(
152 | x_tst,
153 | x_tst_lengths,
154 | speakers,
155 | tones,
156 | lang_ids,
157 | bert,
158 | ja_bert,
159 | sdp_ratio=sdp_ratio,
160 | noise_scale=noise_scale,
161 | noise_scale_w=noise_scale_w,
162 | length_scale=length_scale,
163 | )[0][0, 0]
164 | .data.cpu()
165 | .float()
166 | .numpy()
167 | )
168 | del x_tst, x_tst_lengths, speakers, tones, lang_ids, bert, ja_bert
169 | if torch.cuda.is_available():
170 | torch.cuda.empty_cache()
171 | return audio
172 |
--------------------------------------------------------------------------------
/server.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, request, Response
2 | from io import BytesIO
3 | import torch
  | import numpy as np
4 | from av import open as avopen
5 | from typing import Dict, List
6 | import re_matching
7 | import utils
8 | from infer import infer, get_net_g, latest_version
9 | from scipy.io import wavfile
10 | import gradio as gr
11 | from config import config
12 |
13 | # Flask Init
14 | app = Flask(__name__)
15 | app.config["JSON_AS_ASCII"] = False
16 |
17 |
18 | def replace_punctuation(text, i=2):
19 | punctuation = ",。?!"
20 | for char in punctuation:
21 | text = text.replace(char, char * i)
22 | return text
23 |
24 |
25 | def wav2(i, o, format):
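  | # re-encode the wav stream `i` into the target container `format`, writing into `o` ("ogg" uses the libvorbis codec)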
26 | inp = avopen(i, "rb")
27 | out = avopen(o, "wb", format=format)
28 | if format == "ogg":
29 | format = "libvorbis"
30 |
31 | ostream = out.add_stream(format)
32 |
33 | for frame in inp.decode(audio=0):
34 | for p in ostream.encode(frame):
35 | out.mux(p)
36 |
37 | for p in ostream.encode(None):
38 | out.mux(p)
39 |
40 | out.close()
41 | inp.close()
42 |
43 |
44 | net_g_List = []
45 | hps_List = []
46 | # per-model speaker dictionaries
47 | # usage: chr_name = chrsMap[model_id][chr_id]
48 | chrsMap: List[Dict[int, str]] = list()
49 |
50 | # load the models
51 | models = config.server_config.models
52 | for model in models:
53 | hps_List.append(utils.get_hparams_from_file(model["config"]))
54 | # build the speaker dictionary for this model
55 | chrsMap.append(dict())
56 | for name, cid in hps_List[-1].data.spk2id.items():
57 | chrsMap[-1][cid] = name
58 | version = (
59 | hps_List[-1].version if hasattr(hps_List[-1], "version") else latest_version
60 | )
61 | net_g_List.append(
62 | get_net_g(
63 | model_path=model["model"],
64 | version=version,
65 | device=model["device"],
66 | hps=hps_List[-1],
67 | )
68 | )
69 |
70 |
71 | def generate_audio(
72 | slices,
73 | sdp_ratio,
74 | noise_scale,
75 | noise_scale_w,
76 | length_scale,
77 | speaker,
78 | language,
  | hps,
  | net_g,
  | device,
79 | ):
80 | audio_list = []
81 | silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16)
82 | with torch.no_grad():
83 | for piece in slices:
84 | audio = infer(
85 | piece,
86 | sdp_ratio=sdp_ratio,
87 | noise_scale=noise_scale,
88 | noise_scale_w=noise_scale_w,
89 | length_scale=length_scale,
90 | sid=speaker,
91 | language=language,
92 | hps=hps,
93 | net_g=net_g,
94 | device=device,
95 | )
96 | audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
97 | audio_list.append(audio16bit)
98 | audio_list.append(silence)  # append a short silence after each piece
99 | return audio_list
100 |
101 |
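  | # Example request (illustrative values; parameters are parsed in main() below):
  | #   GET /?model=0&speaker_id=0&text=你好&sdp_ratio=0.2&noise=0.5&noisew=0.6&length=1.2&language=ZH&format=wav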
102 | @app.route("/")
103 | def main():
104 | try:
105 | model = int(request.args.get("model"))
106 | speaker = request.args.get("speaker", "")  # speaker name
107 | speaker_id = request.args.get("speaker_id", None)  # or specify the id directly
108 | text = request.args.get("text").replace("/n", "")
109 | sdp_ratio = float(request.args.get("sdp_ratio", 0.2))
110 | noise = float(request.args.get("noise", 0.5))
111 | noisew = float(request.args.get("noisew", 0.6))
112 | length = float(request.args.get("length", 1.2))
113 | language = request.args.get("language")
114 | if length >= 2:
115 | return "Too big length"
116 | if len(text) >= 250:
117 | return "Too long text"
118 | fmt = request.args.get("format", "wav")
119 | if None in (speaker, text):
120 | return "Missing Parameter"
121 | if fmt not in ("mp3", "wav", "ogg"):
122 | return "Invalid Format"
123 | if language not in ("JP", "ZH", "EN", "mix"):
124 | return "Invalid language"
125 | except Exception:
126 | return "Invalid Parameter"
127 |
128 | if speaker_id is not None:
129 | if speaker_id.isdigit():
130 | speaker = chrsMap[model][int(speaker_id)]
131 | audio_list = []
132 | if language == "mix":
133 | bool_valid, str_valid = re_matching.validate_text(text)
134 | if not bool_valid:
135 | return str_valid, (
136 | hps_List[model].data.sampling_rate,
137 | np.concatenate([np.zeros(hps_List[model].data.sampling_rate // 2)]),
138 | )
139 | result = re_matching.text_matching(text)
140 | for one in result:
141 | _speaker = one.pop()
142 | for lang, content in one:
143 | audio_list.extend(
144 | generate_audio(
145 | content.split("|"),
146 | sdp_ratio,
147 | noise,
148 | noisew,
149 | length,
150 | _speaker,
151 | lang,
  | hps_List[model],
  | net_g_List[model],
  | models[model]["device"],
152 | )
153 | )
154 | else:
155 | audio_list.extend(
156 | generate_audio(
157 | text.split("|"),
158 | sdp_ratio,
159 | noise,
160 | noisew,
161 | length,
162 | speaker,
163 | language,
  | hps_List[model],
  | net_g_List[model],
  | models[model]["device"],
164 | )
165 | )
166 |
167 | audio_concat = np.concatenate(audio_list)
168 | with BytesIO() as wav:
169 | wavfile.write(wav, hps_List[model].data.sampling_rate, audio_concat)
170 | torch.cuda.empty_cache()
171 | if fmt == "wav":
172 | return Response(wav.getvalue(), mimetype="audio/wav")
173 | wav.seek(0, 0)
174 | with BytesIO() as ofp:
175 | wav2(wav, ofp, fmt)
176 | return Response(
177 | ofp.getvalue(), mimetype="audio/mpeg" if fmt == "mp3" else "audio/ogg"
178 | )
179 |
180 |
181 | if __name__ == "__main__":
182 | app.run(host="0.0.0.0", port=config.server_config.port)
183 |
--------------------------------------------------------------------------------
/text/opencpop-strict.txt:
--------------------------------------------------------------------------------
1 | a AA a
2 | ai AA ai
3 | an AA an
4 | ang AA ang
5 | ao AA ao
6 | ba b a
7 | bai b ai
8 | ban b an
9 | bang b ang
10 | bao b ao
11 | bei b ei
12 | ben b en
13 | beng b eng
14 | bi b i
15 | bian b ian
16 | biao b iao
17 | bie b ie
18 | bin b in
19 | bing b ing
20 | bo b o
21 | bu b u
22 | ca c a
23 | cai c ai
24 | can c an
25 | cang c ang
26 | cao c ao
27 | ce c e
28 | cei c ei
29 | cen c en
30 | ceng c eng
31 | cha ch a
32 | chai ch ai
33 | chan ch an
34 | chang ch ang
35 | chao ch ao
36 | che ch e
37 | chen ch en
38 | cheng ch eng
39 | chi ch ir
40 | chong ch ong
41 | chou ch ou
42 | chu ch u
43 | chua ch ua
44 | chuai ch uai
45 | chuan ch uan
46 | chuang ch uang
47 | chui ch ui
48 | chun ch un
49 | chuo ch uo
50 | ci c i0
51 | cong c ong
52 | cou c ou
53 | cu c u
54 | cuan c uan
55 | cui c ui
56 | cun c un
57 | cuo c uo
58 | da d a
59 | dai d ai
60 | dan d an
61 | dang d ang
62 | dao d ao
63 | de d e
64 | dei d ei
65 | den d en
66 | deng d eng
67 | di d i
68 | dia d ia
69 | dian d ian
70 | diao d iao
71 | die d ie
72 | ding d ing
73 | diu d iu
74 | dong d ong
75 | dou d ou
76 | du d u
77 | duan d uan
78 | dui d ui
79 | dun d un
80 | duo d uo
81 | e EE e
82 | ei EE ei
83 | en EE en
84 | eng EE eng
85 | er EE er
86 | fa f a
87 | fan f an
88 | fang f ang
89 | fei f ei
90 | fen f en
91 | feng f eng
92 | fo f o
93 | fou f ou
94 | fu f u
95 | ga g a
96 | gai g ai
97 | gan g an
98 | gang g ang
99 | gao g ao
100 | ge g e
101 | gei g ei
102 | gen g en
103 | geng g eng
104 | gong g ong
105 | gou g ou
106 | gu g u
107 | gua g ua
108 | guai g uai
109 | guan g uan
110 | guang g uang
111 | gui g ui
112 | gun g un
113 | guo g uo
114 | ha h a
115 | hai h ai
116 | han h an
117 | hang h ang
118 | hao h ao
119 | he h e
120 | hei h ei
121 | hen h en
122 | heng h eng
123 | hong h ong
124 | hou h ou
125 | hu h u
126 | hua h ua
127 | huai h uai
128 | huan h uan
129 | huang h uang
130 | hui h ui
131 | hun h un
132 | huo h uo
133 | ji j i
134 | jia j ia
135 | jian j ian
136 | jiang j iang
137 | jiao j iao
138 | jie j ie
139 | jin j in
140 | jing j ing
141 | jiong j iong
142 | jiu j iu
143 | ju j v
144 | jv j v
145 | juan j van
146 | jvan j van
147 | jue j ve
148 | jve j ve
149 | jun j vn
150 | jvn j vn
151 | ka k a
152 | kai k ai
153 | kan k an
154 | kang k ang
155 | kao k ao
156 | ke k e
157 | kei k ei
158 | ken k en
159 | keng k eng
160 | kong k ong
161 | kou k ou
162 | ku k u
163 | kua k ua
164 | kuai k uai
165 | kuan k uan
166 | kuang k uang
167 | kui k ui
168 | kun k un
169 | kuo k uo
170 | la l a
171 | lai l ai
172 | lan l an
173 | lang l ang
174 | lao l ao
175 | le l e
176 | lei l ei
177 | leng l eng
178 | li l i
179 | lia l ia
180 | lian l ian
181 | liang l iang
182 | liao l iao
183 | lie l ie
184 | lin l in
185 | ling l ing
186 | liu l iu
187 | lo l o
188 | long l ong
189 | lou l ou
190 | lu l u
191 | luan l uan
192 | lun l un
193 | luo l uo
194 | lv l v
195 | lve l ve
196 | ma m a
197 | mai m ai
198 | man m an
199 | mang m ang
200 | mao m ao
201 | me m e
202 | mei m ei
203 | men m en
204 | meng m eng
205 | mi m i
206 | mian m ian
207 | miao m iao
208 | mie m ie
209 | min m in
210 | ming m ing
211 | miu m iu
212 | mo m o
213 | mou m ou
214 | mu m u
215 | na n a
216 | nai n ai
217 | nan n an
218 | nang n ang
219 | nao n ao
220 | ne n e
221 | nei n ei
222 | nen n en
223 | neng n eng
224 | ni n i
225 | nian n ian
226 | niang n iang
227 | niao n iao
228 | nie n ie
229 | nin n in
230 | ning n ing
231 | niu n iu
232 | nong n ong
233 | nou n ou
234 | nu n u
235 | nuan n uan
236 | nun n un
237 | nuo n uo
238 | nv n v
239 | nve n ve
240 | o OO o
241 | ou OO ou
242 | pa p a
243 | pai p ai
244 | pan p an
245 | pang p ang
246 | pao p ao
247 | pei p ei
248 | pen p en
249 | peng p eng
250 | pi p i
251 | pian p ian
252 | piao p iao
253 | pie p ie
254 | pin p in
255 | ping p ing
256 | po p o
257 | pou p ou
258 | pu p u
259 | qi q i
260 | qia q ia
261 | qian q ian
262 | qiang q iang
263 | qiao q iao
264 | qie q ie
265 | qin q in
266 | qing q ing
267 | qiong q iong
268 | qiu q iu
269 | qu q v
270 | qv q v
271 | quan q van
272 | qvan q van
273 | que q ve
274 | qve q ve
275 | qun q vn
276 | qvn q vn
277 | ran r an
278 | rang r ang
279 | rao r ao
280 | re r e
281 | ren r en
282 | reng r eng
283 | ri r ir
284 | rong r ong
285 | rou r ou
286 | ru r u
287 | rua r ua
288 | ruan r uan
289 | rui r ui
290 | run r un
291 | ruo r uo
292 | sa s a
293 | sai s ai
294 | san s an
295 | sang s ang
296 | sao s ao
297 | se s e
298 | sen s en
299 | seng s eng
300 | sha sh a
301 | shai sh ai
302 | shan sh an
303 | shang sh ang
304 | shao sh ao
305 | she sh e
306 | shei sh ei
307 | shen sh en
308 | sheng sh eng
309 | shi sh ir
310 | shou sh ou
311 | shu sh u
312 | shua sh ua
313 | shuai sh uai
314 | shuan sh uan
315 | shuang sh uang
316 | shui sh ui
317 | shun sh un
318 | shuo sh uo
319 | si s i0
320 | song s ong
321 | sou s ou
322 | su s u
323 | suan s uan
324 | sui s ui
325 | sun s un
326 | suo s uo
327 | ta t a
328 | tai t ai
329 | tan t an
330 | tang t ang
331 | tao t ao
332 | te t e
333 | tei t ei
334 | teng t eng
335 | ti t i
336 | tian t ian
337 | tiao t iao
338 | tie t ie
339 | ting t ing
340 | tong t ong
341 | tou t ou
342 | tu t u
343 | tuan t uan
344 | tui t ui
345 | tun t un
346 | tuo t uo
347 | wa w a
348 | wai w ai
349 | wan w an
350 | wang w ang
351 | wei w ei
352 | wen w en
353 | weng w eng
354 | wo w o
355 | wu w u
356 | xi x i
357 | xia x ia
358 | xian x ian
359 | xiang x iang
360 | xiao x iao
361 | xie x ie
362 | xin x in
363 | xing x ing
364 | xiong x iong
365 | xiu x iu
366 | xu x v
367 | xv x v
368 | xuan x van
369 | xvan x van
370 | xue x ve
371 | xve x ve
372 | xun x vn
373 | xvn x vn
374 | ya y a
375 | yan y En
376 | yang y ang
377 | yao y ao
378 | ye y E
379 | yi y i
380 | yin y in
381 | ying y ing
382 | yo y o
383 | yong y ong
384 | you y ou
385 | yu y v
386 | yv y v
387 | yuan y van
388 | yvan y van
389 | yue y ve
390 | yve y ve
391 | yun y vn
392 | yvn y vn
393 | za z a
394 | zai z ai
395 | zan z an
396 | zang z ang
397 | zao z ao
398 | ze z e
399 | zei z ei
400 | zen z en
401 | zeng z eng
402 | zha zh a
403 | zhai zh ai
404 | zhan zh an
405 | zhang zh ang
406 | zhao zh ao
407 | zhe zh e
408 | zhei zh ei
409 | zhen zh en
410 | zheng zh eng
411 | zhi zh ir
412 | zhong zh ong
413 | zhou zh ou
414 | zhu zh u
415 | zhua zh ua
416 | zhuai zh uai
417 | zhuan zh uan
418 | zhuang zh uang
419 | zhui zh ui
420 | zhun zh un
421 | zhuo zh uo
422 | zi z i0
423 | zong z ong
424 | zou z ou
425 | zu z u
426 | zuan z uan
427 | zui z ui
428 | zun z un
429 | zuo z uo
430 |
--------------------------------------------------------------------------------
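
Each row of opencpop-strict.txt maps a pinyin syllable to its initial and final in the phoneme set, e.g. "zhi" -> "zh ir" and "ci" -> "c i0"; text/chinese.py reads it as a tab-separated table into pinyin_to_symbol_map. A minimal loader sketch, assuming the "<pinyin>\t<initial> <final>" layout that file expects:

    def load_pinyin_map(path):
        # Build {"zhi": ["zh", "ir"], ...} the same way text/chinese.py does.
        mapping = {}
        with open(path, encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    pinyin, phones = line.rstrip("\n").split("\t")
                    mapping[pinyin] = phones.split(" ")
        return mapping
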
/oldVersion/V101/text/opencpop-strict.txt:
--------------------------------------------------------------------------------
1 | a AA a
2 | ai AA ai
3 | an AA an
4 | ang AA ang
5 | ao AA ao
6 | ba b a
7 | bai b ai
8 | ban b an
9 | bang b ang
10 | bao b ao
11 | bei b ei
12 | ben b en
13 | beng b eng
14 | bi b i
15 | bian b ian
16 | biao b iao
17 | bie b ie
18 | bin b in
19 | bing b ing
20 | bo b o
21 | bu b u
22 | ca c a
23 | cai c ai
24 | can c an
25 | cang c ang
26 | cao c ao
27 | ce c e
28 | cei c ei
29 | cen c en
30 | ceng c eng
31 | cha ch a
32 | chai ch ai
33 | chan ch an
34 | chang ch ang
35 | chao ch ao
36 | che ch e
37 | chen ch en
38 | cheng ch eng
39 | chi ch ir
40 | chong ch ong
41 | chou ch ou
42 | chu ch u
43 | chua ch ua
44 | chuai ch uai
45 | chuan ch uan
46 | chuang ch uang
47 | chui ch ui
48 | chun ch un
49 | chuo ch uo
50 | ci c i0
51 | cong c ong
52 | cou c ou
53 | cu c u
54 | cuan c uan
55 | cui c ui
56 | cun c un
57 | cuo c uo
58 | da d a
59 | dai d ai
60 | dan d an
61 | dang d ang
62 | dao d ao
63 | de d e
64 | dei d ei
65 | den d en
66 | deng d eng
67 | di d i
68 | dia d ia
69 | dian d ian
70 | diao d iao
71 | die d ie
72 | ding d ing
73 | diu d iu
74 | dong d ong
75 | dou d ou
76 | du d u
77 | duan d uan
78 | dui d ui
79 | dun d un
80 | duo d uo
81 | e EE e
82 | ei EE ei
83 | en EE en
84 | eng EE eng
85 | er EE er
86 | fa f a
87 | fan f an
88 | fang f ang
89 | fei f ei
90 | fen f en
91 | feng f eng
92 | fo f o
93 | fou f ou
94 | fu f u
95 | ga g a
96 | gai g ai
97 | gan g an
98 | gang g ang
99 | gao g ao
100 | ge g e
101 | gei g ei
102 | gen g en
103 | geng g eng
104 | gong g ong
105 | gou g ou
106 | gu g u
107 | gua g ua
108 | guai g uai
109 | guan g uan
110 | guang g uang
111 | gui g ui
112 | gun g un
113 | guo g uo
114 | ha h a
115 | hai h ai
116 | han h an
117 | hang h ang
118 | hao h ao
119 | he h e
120 | hei h ei
121 | hen h en
122 | heng h eng
123 | hong h ong
124 | hou h ou
125 | hu h u
126 | hua h ua
127 | huai h uai
128 | huan h uan
129 | huang h uang
130 | hui h ui
131 | hun h un
132 | huo h uo
133 | ji j i
134 | jia j ia
135 | jian j ian
136 | jiang j iang
137 | jiao j iao
138 | jie j ie
139 | jin j in
140 | jing j ing
141 | jiong j iong
142 | jiu j iu
143 | ju j v
144 | jv j v
145 | juan j van
146 | jvan j van
147 | jue j ve
148 | jve j ve
149 | jun j vn
150 | jvn j vn
151 | ka k a
152 | kai k ai
153 | kan k an
154 | kang k ang
155 | kao k ao
156 | ke k e
157 | kei k ei
158 | ken k en
159 | keng k eng
160 | kong k ong
161 | kou k ou
162 | ku k u
163 | kua k ua
164 | kuai k uai
165 | kuan k uan
166 | kuang k uang
167 | kui k ui
168 | kun k un
169 | kuo k uo
170 | la l a
171 | lai l ai
172 | lan l an
173 | lang l ang
174 | lao l ao
175 | le l e
176 | lei l ei
177 | leng l eng
178 | li l i
179 | lia l ia
180 | lian l ian
181 | liang l iang
182 | liao l iao
183 | lie l ie
184 | lin l in
185 | ling l ing
186 | liu l iu
187 | lo l o
188 | long l ong
189 | lou l ou
190 | lu l u
191 | luan l uan
192 | lun l un
193 | luo l uo
194 | lv l v
195 | lve l ve
196 | ma m a
197 | mai m ai
198 | man m an
199 | mang m ang
200 | mao m ao
201 | me m e
202 | mei m ei
203 | men m en
204 | meng m eng
205 | mi m i
206 | mian m ian
207 | miao m iao
208 | mie m ie
209 | min m in
210 | ming m ing
211 | miu m iu
212 | mo m o
213 | mou m ou
214 | mu m u
215 | na n a
216 | nai n ai
217 | nan n an
218 | nang n ang
219 | nao n ao
220 | ne n e
221 | nei n ei
222 | nen n en
223 | neng n eng
224 | ni n i
225 | nian n ian
226 | niang n iang
227 | niao n iao
228 | nie n ie
229 | nin n in
230 | ning n ing
231 | niu n iu
232 | nong n ong
233 | nou n ou
234 | nu n u
235 | nuan n uan
236 | nun n un
237 | nuo n uo
238 | nv n v
239 | nve n ve
240 | o OO o
241 | ou OO ou
242 | pa p a
243 | pai p ai
244 | pan p an
245 | pang p ang
246 | pao p ao
247 | pei p ei
248 | pen p en
249 | peng p eng
250 | pi p i
251 | pian p ian
252 | piao p iao
253 | pie p ie
254 | pin p in
255 | ping p ing
256 | po p o
257 | pou p ou
258 | pu p u
259 | qi q i
260 | qia q ia
261 | qian q ian
262 | qiang q iang
263 | qiao q iao
264 | qie q ie
265 | qin q in
266 | qing q ing
267 | qiong q iong
268 | qiu q iu
269 | qu q v
270 | qv q v
271 | quan q van
272 | qvan q van
273 | que q ve
274 | qve q ve
275 | qun q vn
276 | qvn q vn
277 | ran r an
278 | rang r ang
279 | rao r ao
280 | re r e
281 | ren r en
282 | reng r eng
283 | ri r ir
284 | rong r ong
285 | rou r ou
286 | ru r u
287 | rua r ua
288 | ruan r uan
289 | rui r ui
290 | run r un
291 | ruo r uo
292 | sa s a
293 | sai s ai
294 | san s an
295 | sang s ang
296 | sao s ao
297 | se s e
298 | sen s en
299 | seng s eng
300 | sha sh a
301 | shai sh ai
302 | shan sh an
303 | shang sh ang
304 | shao sh ao
305 | she sh e
306 | shei sh ei
307 | shen sh en
308 | sheng sh eng
309 | shi sh ir
310 | shou sh ou
311 | shu sh u
312 | shua sh ua
313 | shuai sh uai
314 | shuan sh uan
315 | shuang sh uang
316 | shui sh ui
317 | shun sh un
318 | shuo sh uo
319 | si s i0
320 | song s ong
321 | sou s ou
322 | su s u
323 | suan s uan
324 | sui s ui
325 | sun s un
326 | suo s uo
327 | ta t a
328 | tai t ai
329 | tan t an
330 | tang t ang
331 | tao t ao
332 | te t e
333 | tei t ei
334 | teng t eng
335 | ti t i
336 | tian t ian
337 | tiao t iao
338 | tie t ie
339 | ting t ing
340 | tong t ong
341 | tou t ou
342 | tu t u
343 | tuan t uan
344 | tui t ui
345 | tun t un
346 | tuo t uo
347 | wa w a
348 | wai w ai
349 | wan w an
350 | wang w ang
351 | wei w ei
352 | wen w en
353 | weng w eng
354 | wo w o
355 | wu w u
356 | xi x i
357 | xia x ia
358 | xian x ian
359 | xiang x iang
360 | xiao x iao
361 | xie x ie
362 | xin x in
363 | xing x ing
364 | xiong x iong
365 | xiu x iu
366 | xu x v
367 | xv x v
368 | xuan x van
369 | xvan x van
370 | xue x ve
371 | xve x ve
372 | xun x vn
373 | xvn x vn
374 | ya y a
375 | yan y En
376 | yang y ang
377 | yao y ao
378 | ye y E
379 | yi y i
380 | yin y in
381 | ying y ing
382 | yo y o
383 | yong y ong
384 | you y ou
385 | yu y v
386 | yv y v
387 | yuan y van
388 | yvan y van
389 | yue y ve
390 | yve y ve
391 | yun y vn
392 | yvn y vn
393 | za z a
394 | zai z ai
395 | zan z an
396 | zang z ang
397 | zao z ao
398 | ze z e
399 | zei z ei
400 | zen z en
401 | zeng z eng
402 | zha zh a
403 | zhai zh ai
404 | zhan zh an
405 | zhang zh ang
406 | zhao zh ao
407 | zhe zh e
408 | zhei zh ei
409 | zhen zh en
410 | zheng zh eng
411 | zhi zh ir
412 | zhong zh ong
413 | zhou zh ou
414 | zhu zh u
415 | zhua zh ua
416 | zhuai zh uai
417 | zhuan zh uan
418 | zhuang zh uang
419 | zhui zh ui
420 | zhun zh un
421 | zhuo zh uo
422 | zi z i0
423 | zong z ong
424 | zou z ou
425 | zu z u
426 | zuan z uan
427 | zui z ui
428 | zun z un
429 | zuo z uo
430 |
--------------------------------------------------------------------------------
/oldVersion/V110/text/opencpop-strict.txt:
--------------------------------------------------------------------------------
1 | a AA a
2 | ai AA ai
3 | an AA an
4 | ang AA ang
5 | ao AA ao
6 | ba b a
7 | bai b ai
8 | ban b an
9 | bang b ang
10 | bao b ao
11 | bei b ei
12 | ben b en
13 | beng b eng
14 | bi b i
15 | bian b ian
16 | biao b iao
17 | bie b ie
18 | bin b in
19 | bing b ing
20 | bo b o
21 | bu b u
22 | ca c a
23 | cai c ai
24 | can c an
25 | cang c ang
26 | cao c ao
27 | ce c e
28 | cei c ei
29 | cen c en
30 | ceng c eng
31 | cha ch a
32 | chai ch ai
33 | chan ch an
34 | chang ch ang
35 | chao ch ao
36 | che ch e
37 | chen ch en
38 | cheng ch eng
39 | chi ch ir
40 | chong ch ong
41 | chou ch ou
42 | chu ch u
43 | chua ch ua
44 | chuai ch uai
45 | chuan ch uan
46 | chuang ch uang
47 | chui ch ui
48 | chun ch un
49 | chuo ch uo
50 | ci c i0
51 | cong c ong
52 | cou c ou
53 | cu c u
54 | cuan c uan
55 | cui c ui
56 | cun c un
57 | cuo c uo
58 | da d a
59 | dai d ai
60 | dan d an
61 | dang d ang
62 | dao d ao
63 | de d e
64 | dei d ei
65 | den d en
66 | deng d eng
67 | di d i
68 | dia d ia
69 | dian d ian
70 | diao d iao
71 | die d ie
72 | ding d ing
73 | diu d iu
74 | dong d ong
75 | dou d ou
76 | du d u
77 | duan d uan
78 | dui d ui
79 | dun d un
80 | duo d uo
81 | e EE e
82 | ei EE ei
83 | en EE en
84 | eng EE eng
85 | er EE er
86 | fa f a
87 | fan f an
88 | fang f ang
89 | fei f ei
90 | fen f en
91 | feng f eng
92 | fo f o
93 | fou f ou
94 | fu f u
95 | ga g a
96 | gai g ai
97 | gan g an
98 | gang g ang
99 | gao g ao
100 | ge g e
101 | gei g ei
102 | gen g en
103 | geng g eng
104 | gong g ong
105 | gou g ou
106 | gu g u
107 | gua g ua
108 | guai g uai
109 | guan g uan
110 | guang g uang
111 | gui g ui
112 | gun g un
113 | guo g uo
114 | ha h a
115 | hai h ai
116 | han h an
117 | hang h ang
118 | hao h ao
119 | he h e
120 | hei h ei
121 | hen h en
122 | heng h eng
123 | hong h ong
124 | hou h ou
125 | hu h u
126 | hua h ua
127 | huai h uai
128 | huan h uan
129 | huang h uang
130 | hui h ui
131 | hun h un
132 | huo h uo
133 | ji j i
134 | jia j ia
135 | jian j ian
136 | jiang j iang
137 | jiao j iao
138 | jie j ie
139 | jin j in
140 | jing j ing
141 | jiong j iong
142 | jiu j iu
143 | ju j v
144 | jv j v
145 | juan j van
146 | jvan j van
147 | jue j ve
148 | jve j ve
149 | jun j vn
150 | jvn j vn
151 | ka k a
152 | kai k ai
153 | kan k an
154 | kang k ang
155 | kao k ao
156 | ke k e
157 | kei k ei
158 | ken k en
159 | keng k eng
160 | kong k ong
161 | kou k ou
162 | ku k u
163 | kua k ua
164 | kuai k uai
165 | kuan k uan
166 | kuang k uang
167 | kui k ui
168 | kun k un
169 | kuo k uo
170 | la l a
171 | lai l ai
172 | lan l an
173 | lang l ang
174 | lao l ao
175 | le l e
176 | lei l ei
177 | leng l eng
178 | li l i
179 | lia l ia
180 | lian l ian
181 | liang l iang
182 | liao l iao
183 | lie l ie
184 | lin l in
185 | ling l ing
186 | liu l iu
187 | lo l o
188 | long l ong
189 | lou l ou
190 | lu l u
191 | luan l uan
192 | lun l un
193 | luo l uo
194 | lv l v
195 | lve l ve
196 | ma m a
197 | mai m ai
198 | man m an
199 | mang m ang
200 | mao m ao
201 | me m e
202 | mei m ei
203 | men m en
204 | meng m eng
205 | mi m i
206 | mian m ian
207 | miao m iao
208 | mie m ie
209 | min m in
210 | ming m ing
211 | miu m iu
212 | mo m o
213 | mou m ou
214 | mu m u
215 | na n a
216 | nai n ai
217 | nan n an
218 | nang n ang
219 | nao n ao
220 | ne n e
221 | nei n ei
222 | nen n en
223 | neng n eng
224 | ni n i
225 | nian n ian
226 | niang n iang
227 | niao n iao
228 | nie n ie
229 | nin n in
230 | ning n ing
231 | niu n iu
232 | nong n ong
233 | nou n ou
234 | nu n u
235 | nuan n uan
236 | nun n un
237 | nuo n uo
238 | nv n v
239 | nve n ve
240 | o OO o
241 | ou OO ou
242 | pa p a
243 | pai p ai
244 | pan p an
245 | pang p ang
246 | pao p ao
247 | pei p ei
248 | pen p en
249 | peng p eng
250 | pi p i
251 | pian p ian
252 | piao p iao
253 | pie p ie
254 | pin p in
255 | ping p ing
256 | po p o
257 | pou p ou
258 | pu p u
259 | qi q i
260 | qia q ia
261 | qian q ian
262 | qiang q iang
263 | qiao q iao
264 | qie q ie
265 | qin q in
266 | qing q ing
267 | qiong q iong
268 | qiu q iu
269 | qu q v
270 | qv q v
271 | quan q van
272 | qvan q van
273 | que q ve
274 | qve q ve
275 | qun q vn
276 | qvn q vn
277 | ran r an
278 | rang r ang
279 | rao r ao
280 | re r e
281 | ren r en
282 | reng r eng
283 | ri r ir
284 | rong r ong
285 | rou r ou
286 | ru r u
287 | rua r ua
288 | ruan r uan
289 | rui r ui
290 | run r un
291 | ruo r uo
292 | sa s a
293 | sai s ai
294 | san s an
295 | sang s ang
296 | sao s ao
297 | se s e
298 | sen s en
299 | seng s eng
300 | sha sh a
301 | shai sh ai
302 | shan sh an
303 | shang sh ang
304 | shao sh ao
305 | she sh e
306 | shei sh ei
307 | shen sh en
308 | sheng sh eng
309 | shi sh ir
310 | shou sh ou
311 | shu sh u
312 | shua sh ua
313 | shuai sh uai
314 | shuan sh uan
315 | shuang sh uang
316 | shui sh ui
317 | shun sh un
318 | shuo sh uo
319 | si s i0
320 | song s ong
321 | sou s ou
322 | su s u
323 | suan s uan
324 | sui s ui
325 | sun s un
326 | suo s uo
327 | ta t a
328 | tai t ai
329 | tan t an
330 | tang t ang
331 | tao t ao
332 | te t e
333 | tei t ei
334 | teng t eng
335 | ti t i
336 | tian t ian
337 | tiao t iao
338 | tie t ie
339 | ting t ing
340 | tong t ong
341 | tou t ou
342 | tu t u
343 | tuan t uan
344 | tui t ui
345 | tun t un
346 | tuo t uo
347 | wa w a
348 | wai w ai
349 | wan w an
350 | wang w ang
351 | wei w ei
352 | wen w en
353 | weng w eng
354 | wo w o
355 | wu w u
356 | xi x i
357 | xia x ia
358 | xian x ian
359 | xiang x iang
360 | xiao x iao
361 | xie x ie
362 | xin x in
363 | xing x ing
364 | xiong x iong
365 | xiu x iu
366 | xu x v
367 | xv x v
368 | xuan x van
369 | xvan x van
370 | xue x ve
371 | xve x ve
372 | xun x vn
373 | xvn x vn
374 | ya y a
375 | yan y En
376 | yang y ang
377 | yao y ao
378 | ye y E
379 | yi y i
380 | yin y in
381 | ying y ing
382 | yo y o
383 | yong y ong
384 | you y ou
385 | yu y v
386 | yv y v
387 | yuan y van
388 | yvan y van
389 | yue y ve
390 | yve y ve
391 | yun y vn
392 | yvn y vn
393 | za z a
394 | zai z ai
395 | zan z an
396 | zang z ang
397 | zao z ao
398 | ze z e
399 | zei z ei
400 | zen z en
401 | zeng z eng
402 | zha zh a
403 | zhai zh ai
404 | zhan zh an
405 | zhang zh ang
406 | zhao zh ao
407 | zhe zh e
408 | zhei zh ei
409 | zhen zh en
410 | zheng zh eng
411 | zhi zh ir
412 | zhong zh ong
413 | zhou zh ou
414 | zhu zh u
415 | zhua zh ua
416 | zhuai zh uai
417 | zhuan zh uan
418 | zhuang zh uang
419 | zhui zh ui
420 | zhun zh un
421 | zhuo zh uo
422 | zi z i0
423 | zong z ong
424 | zou z ou
425 | zu z u
426 | zuan z uan
427 | zui z ui
428 | zun z un
429 | zuo z uo
430 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/opencpop-strict.txt:
--------------------------------------------------------------------------------
1 | a AA a
2 | ai AA ai
3 | an AA an
4 | ang AA ang
5 | ao AA ao
6 | ba b a
7 | bai b ai
8 | ban b an
9 | bang b ang
10 | bao b ao
11 | bei b ei
12 | ben b en
13 | beng b eng
14 | bi b i
15 | bian b ian
16 | biao b iao
17 | bie b ie
18 | bin b in
19 | bing b ing
20 | bo b o
21 | bu b u
22 | ca c a
23 | cai c ai
24 | can c an
25 | cang c ang
26 | cao c ao
27 | ce c e
28 | cei c ei
29 | cen c en
30 | ceng c eng
31 | cha ch a
32 | chai ch ai
33 | chan ch an
34 | chang ch ang
35 | chao ch ao
36 | che ch e
37 | chen ch en
38 | cheng ch eng
39 | chi ch ir
40 | chong ch ong
41 | chou ch ou
42 | chu ch u
43 | chua ch ua
44 | chuai ch uai
45 | chuan ch uan
46 | chuang ch uang
47 | chui ch ui
48 | chun ch un
49 | chuo ch uo
50 | ci c i0
51 | cong c ong
52 | cou c ou
53 | cu c u
54 | cuan c uan
55 | cui c ui
56 | cun c un
57 | cuo c uo
58 | da d a
59 | dai d ai
60 | dan d an
61 | dang d ang
62 | dao d ao
63 | de d e
64 | dei d ei
65 | den d en
66 | deng d eng
67 | di d i
68 | dia d ia
69 | dian d ian
70 | diao d iao
71 | die d ie
72 | ding d ing
73 | diu d iu
74 | dong d ong
75 | dou d ou
76 | du d u
77 | duan d uan
78 | dui d ui
79 | dun d un
80 | duo d uo
81 | e EE e
82 | ei EE ei
83 | en EE en
84 | eng EE eng
85 | er EE er
86 | fa f a
87 | fan f an
88 | fang f ang
89 | fei f ei
90 | fen f en
91 | feng f eng
92 | fo f o
93 | fou f ou
94 | fu f u
95 | ga g a
96 | gai g ai
97 | gan g an
98 | gang g ang
99 | gao g ao
100 | ge g e
101 | gei g ei
102 | gen g en
103 | geng g eng
104 | gong g ong
105 | gou g ou
106 | gu g u
107 | gua g ua
108 | guai g uai
109 | guan g uan
110 | guang g uang
111 | gui g ui
112 | gun g un
113 | guo g uo
114 | ha h a
115 | hai h ai
116 | han h an
117 | hang h ang
118 | hao h ao
119 | he h e
120 | hei h ei
121 | hen h en
122 | heng h eng
123 | hong h ong
124 | hou h ou
125 | hu h u
126 | hua h ua
127 | huai h uai
128 | huan h uan
129 | huang h uang
130 | hui h ui
131 | hun h un
132 | huo h uo
133 | ji j i
134 | jia j ia
135 | jian j ian
136 | jiang j iang
137 | jiao j iao
138 | jie j ie
139 | jin j in
140 | jing j ing
141 | jiong j iong
142 | jiu j iu
143 | ju j v
144 | jv j v
145 | juan j van
146 | jvan j van
147 | jue j ve
148 | jve j ve
149 | jun j vn
150 | jvn j vn
151 | ka k a
152 | kai k ai
153 | kan k an
154 | kang k ang
155 | kao k ao
156 | ke k e
157 | kei k ei
158 | ken k en
159 | keng k eng
160 | kong k ong
161 | kou k ou
162 | ku k u
163 | kua k ua
164 | kuai k uai
165 | kuan k uan
166 | kuang k uang
167 | kui k ui
168 | kun k un
169 | kuo k uo
170 | la l a
171 | lai l ai
172 | lan l an
173 | lang l ang
174 | lao l ao
175 | le l e
176 | lei l ei
177 | leng l eng
178 | li l i
179 | lia l ia
180 | lian l ian
181 | liang l iang
182 | liao l iao
183 | lie l ie
184 | lin l in
185 | ling l ing
186 | liu l iu
187 | lo l o
188 | long l ong
189 | lou l ou
190 | lu l u
191 | luan l uan
192 | lun l un
193 | luo l uo
194 | lv l v
195 | lve l ve
196 | ma m a
197 | mai m ai
198 | man m an
199 | mang m ang
200 | mao m ao
201 | me m e
202 | mei m ei
203 | men m en
204 | meng m eng
205 | mi m i
206 | mian m ian
207 | miao m iao
208 | mie m ie
209 | min m in
210 | ming m ing
211 | miu m iu
212 | mo m o
213 | mou m ou
214 | mu m u
215 | na n a
216 | nai n ai
217 | nan n an
218 | nang n ang
219 | nao n ao
220 | ne n e
221 | nei n ei
222 | nen n en
223 | neng n eng
224 | ni n i
225 | nian n ian
226 | niang n iang
227 | niao n iao
228 | nie n ie
229 | nin n in
230 | ning n ing
231 | niu n iu
232 | nong n ong
233 | nou n ou
234 | nu n u
235 | nuan n uan
236 | nun n un
237 | nuo n uo
238 | nv n v
239 | nve n ve
240 | o OO o
241 | ou OO ou
242 | pa p a
243 | pai p ai
244 | pan p an
245 | pang p ang
246 | pao p ao
247 | pei p ei
248 | pen p en
249 | peng p eng
250 | pi p i
251 | pian p ian
252 | piao p iao
253 | pie p ie
254 | pin p in
255 | ping p ing
256 | po p o
257 | pou p ou
258 | pu p u
259 | qi q i
260 | qia q ia
261 | qian q ian
262 | qiang q iang
263 | qiao q iao
264 | qie q ie
265 | qin q in
266 | qing q ing
267 | qiong q iong
268 | qiu q iu
269 | qu q v
270 | qv q v
271 | quan q van
272 | qvan q van
273 | que q ve
274 | qve q ve
275 | qun q vn
276 | qvn q vn
277 | ran r an
278 | rang r ang
279 | rao r ao
280 | re r e
281 | ren r en
282 | reng r eng
283 | ri r ir
284 | rong r ong
285 | rou r ou
286 | ru r u
287 | rua r ua
288 | ruan r uan
289 | rui r ui
290 | run r un
291 | ruo r uo
292 | sa s a
293 | sai s ai
294 | san s an
295 | sang s ang
296 | sao s ao
297 | se s e
298 | sen s en
299 | seng s eng
300 | sha sh a
301 | shai sh ai
302 | shan sh an
303 | shang sh ang
304 | shao sh ao
305 | she sh e
306 | shei sh ei
307 | shen sh en
308 | sheng sh eng
309 | shi sh ir
310 | shou sh ou
311 | shu sh u
312 | shua sh ua
313 | shuai sh uai
314 | shuan sh uan
315 | shuang sh uang
316 | shui sh ui
317 | shun sh un
318 | shuo sh uo
319 | si s i0
320 | song s ong
321 | sou s ou
322 | su s u
323 | suan s uan
324 | sui s ui
325 | sun s un
326 | suo s uo
327 | ta t a
328 | tai t ai
329 | tan t an
330 | tang t ang
331 | tao t ao
332 | te t e
333 | tei t ei
334 | teng t eng
335 | ti t i
336 | tian t ian
337 | tiao t iao
338 | tie t ie
339 | ting t ing
340 | tong t ong
341 | tou t ou
342 | tu t u
343 | tuan t uan
344 | tui t ui
345 | tun t un
346 | tuo t uo
347 | wa w a
348 | wai w ai
349 | wan w an
350 | wang w ang
351 | wei w ei
352 | wen w en
353 | weng w eng
354 | wo w o
355 | wu w u
356 | xi x i
357 | xia x ia
358 | xian x ian
359 | xiang x iang
360 | xiao x iao
361 | xie x ie
362 | xin x in
363 | xing x ing
364 | xiong x iong
365 | xiu x iu
366 | xu x v
367 | xv x v
368 | xuan x van
369 | xvan x van
370 | xue x ve
371 | xve x ve
372 | xun x vn
373 | xvn x vn
374 | ya y a
375 | yan y En
376 | yang y ang
377 | yao y ao
378 | ye y E
379 | yi y i
380 | yin y in
381 | ying y ing
382 | yo y o
383 | yong y ong
384 | you y ou
385 | yu y v
386 | yv y v
387 | yuan y van
388 | yvan y van
389 | yue y ve
390 | yve y ve
391 | yun y vn
392 | yvn y vn
393 | za z a
394 | zai z ai
395 | zan z an
396 | zang z ang
397 | zao z ao
398 | ze z e
399 | zei z ei
400 | zen z en
401 | zeng z eng
402 | zha zh a
403 | zhai zh ai
404 | zhan zh an
405 | zhang zh ang
406 | zhao zh ao
407 | zhe zh e
408 | zhei zh ei
409 | zhen zh en
410 | zheng zh eng
411 | zhi zh ir
412 | zhong zh ong
413 | zhou zh ou
414 | zhu zh u
415 | zhua zh ua
416 | zhuai zh uai
417 | zhuan zh uan
418 | zhuang zh uang
419 | zhui zh ui
420 | zhun zh un
421 | zhuo zh uo
422 | zi z i0
423 | zong z ong
424 | zou z ou
425 | zu z u
426 | zuan z uan
427 | zui z ui
428 | zun z un
429 | zuo z uo
430 |
--------------------------------------------------------------------------------
/text/chinese.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 |
4 | import cn2an
5 | from pypinyin import lazy_pinyin, Style
6 |
7 | from text.symbols import punctuation
8 | from text.tone_sandhi import ToneSandhi
9 |
10 | current_file_path = os.path.dirname(__file__)
11 | pinyin_to_symbol_map = {
12 | line.split("\t")[0]: line.strip().split("\t")[1]
13 | for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
14 | }
15 |
16 | import jieba.posseg as psg
17 |
18 |
19 | rep_map = {
20 | ":": ",",
21 | ";": ",",
22 | ",": ",",
23 | "。": ".",
24 | "!": "!",
25 | "?": "?",
26 | "\n": ".",
27 | "·": ",",
28 | "、": ",",
29 | "...": "…",
30 | "$": ".",
31 | "“": "'",
32 | "”": "'",
33 | "‘": "'",
34 | "’": "'",
35 | "(": "'",
36 | ")": "'",
37 | "(": "'",
38 | ")": "'",
39 | "《": "'",
40 | "》": "'",
41 | "【": "'",
42 | "】": "'",
43 | "[": "'",
44 | "]": "'",
45 | "—": "-",
46 | "~": "-",
47 | "~": "-",
48 | "「": "'",
49 | "」": "'",
50 | }
51 |
52 | tone_modifier = ToneSandhi()
53 |
54 |
55 | def replace_punctuation(text):
56 | text = text.replace("嗯", "恩").replace("呣", "母")
57 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
58 |
59 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
60 |
61 | replaced_text = re.sub(
62 | r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
63 | )
64 |
65 | return replaced_text
66 |
67 |
68 | def g2p(text):
69 | pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
70 | sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
71 | phones, tones, word2ph = _g2p(sentences)
72 | assert sum(word2ph) == len(phones)
73 |     assert len(word2ph) == len(text)  # this can fail on some inputs; wrap in try/except if needed
74 | phones = ["_"] + phones + ["_"]
75 | tones = [0] + tones + [0]
76 | word2ph = [1] + word2ph + [1]
77 | return phones, tones, word2ph
78 |
79 |
80 | def _get_initials_finals(word):
81 | initials = []
82 | finals = []
83 | orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
84 | orig_finals = lazy_pinyin(
85 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
86 | )
87 | for c, v in zip(orig_initials, orig_finals):
88 | initials.append(c)
89 | finals.append(v)
90 | return initials, finals
91 |
92 |
93 | def _g2p(segments):
94 | phones_list = []
95 | tones_list = []
96 | word2ph = []
97 | for seg in segments:
98 |         # Remove all English words from the sentence
99 | seg = re.sub("[a-zA-Z]+", "", seg)
100 | seg_cut = psg.lcut(seg)
101 | initials = []
102 | finals = []
103 | seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
104 | for word, pos in seg_cut:
105 | if pos == "eng":
106 | continue
107 | sub_initials, sub_finals = _get_initials_finals(word)
108 | sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
109 | initials.append(sub_initials)
110 | finals.append(sub_finals)
111 |
112 | # assert len(sub_initials) == len(sub_finals) == len(word)
113 | initials = sum(initials, [])
114 | finals = sum(finals, [])
115 | #
116 | for c, v in zip(initials, finals):
117 | raw_pinyin = c + v
118 | # NOTE: post process for pypinyin outputs
119 | # we discriminate i, ii and iii
120 | if c == v:
121 | assert c in punctuation
122 | phone = [c]
123 | tone = "0"
124 | word2ph.append(1)
125 | else:
126 | v_without_tone = v[:-1]
127 | tone = v[-1]
128 |
129 | pinyin = c + v_without_tone
130 | assert tone in "12345"
131 |
132 | if c:
133 |                     # syllable with an initial
134 | v_rep_map = {
135 | "uei": "ui",
136 | "iou": "iu",
137 | "uen": "un",
138 | }
139 | if v_without_tone in v_rep_map.keys():
140 | pinyin = c + v_rep_map[v_without_tone]
141 | else:
142 |                     # bare final, no initial
143 | pinyin_rep_map = {
144 | "ing": "ying",
145 | "i": "yi",
146 | "in": "yin",
147 | "u": "wu",
148 | }
149 | if pinyin in pinyin_rep_map.keys():
150 | pinyin = pinyin_rep_map[pinyin]
151 | else:
152 | single_rep_map = {
153 | "v": "yu",
154 | "e": "e",
155 | "i": "y",
156 | "u": "w",
157 | }
158 | if pinyin[0] in single_rep_map.keys():
159 | pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
160 |
161 | assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
162 | phone = pinyin_to_symbol_map[pinyin].split(" ")
163 | word2ph.append(len(phone))
164 |
165 | phones_list += phone
166 | tones_list += [int(tone)] * len(phone)
167 | return phones_list, tones_list, word2ph
168 |
169 |
170 | def text_normalize(text):
171 | numbers = re.findall(r"\d+(?:\.?\d+)?", text)
172 | for number in numbers:
173 | text = text.replace(number, cn2an.an2cn(number), 1)
174 | text = replace_punctuation(text)
175 | return text
176 |
177 |
178 | def get_bert_feature(text, word2ph):
179 | from text import chinese_bert
180 |
181 | return chinese_bert.get_bert_feature(text, word2ph)
182 |
183 |
184 | if __name__ == "__main__":
185 | from text.chinese_bert import get_bert_feature
186 |
187 |     text = "啊!但是《原神》是由,米哈\\游自主, [研发]的一款全.新开放世界.冒险游戏"
188 | text = text_normalize(text)
189 | print(text)
190 | phones, tones, word2ph = g2p(text)
191 | bert = get_bert_feature(text, word2ph)
192 |
193 | print(phones, tones, word2ph, bert.shape)
194 |
195 |
196 | # # Example usage (g2p_paddle is not defined in this file)
197 | # text = "这是一个示例文本:,你好!这是一个测试...."
198 | # print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试
199 |
--------------------------------------------------------------------------------
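
To make the post-processing branch in _g2p concrete: syllables with an initial only rewrite the final (uei/iou/uen -> ui/iu/un), while bare finals are rewritten to whole syllables so they match the opencpop keys. A self-contained sketch of just those replacement rules (not the full pipeline):

    v_rep_map = {"uei": "ui", "iou": "iu", "uen": "un"}
    pinyin_rep_map = {"ing": "ying", "i": "yi", "in": "yin", "u": "wu"}
    single_rep_map = {"v": "yu", "e": "e", "i": "y", "u": "w"}

    def normalize(c, v_without_tone):
        # c: initial from pypinyin (may be ""), v_without_tone: final without the tone digit
        pinyin = c + v_without_tone
        if c:  # syllable with an initial: only the final may need rewriting
            return c + v_rep_map.get(v_without_tone, v_without_tone)
        if pinyin in pinyin_rep_map:  # bare final that maps to a whole syllable
            return pinyin_rep_map[pinyin]
        if pinyin[0] in single_rep_map:
            return single_rep_map[pinyin[0]] + pinyin[1:]
        return pinyin

    print(normalize("g", "uei"))  # gui  -> table key "gui g ui"
    print(normalize("", "ing"))   # ying -> table key "ying y ing"
    print(normalize("", "van"))   # yuan -> table key "yuan y van"
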
/oldVersion/V110/text/chinese.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 |
4 | import cn2an
5 | from pypinyin import lazy_pinyin, Style
6 |
7 | from .symbols import punctuation
8 | from .tone_sandhi import ToneSandhi
9 |
10 | current_file_path = os.path.dirname(__file__)
11 | pinyin_to_symbol_map = {
12 | line.split("\t")[0]: line.strip().split("\t")[1]
13 | for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
14 | }
15 |
16 | import jieba.posseg as psg
17 |
18 |
19 | rep_map = {
20 | ":": ",",
21 | ";": ",",
22 | ",": ",",
23 | "。": ".",
24 | "!": "!",
25 | "?": "?",
26 | "\n": ".",
27 | "·": ",",
28 | "、": ",",
29 | "...": "…",
30 | "$": ".",
31 | "“": "'",
32 | "”": "'",
33 | "‘": "'",
34 | "’": "'",
35 | "(": "'",
36 | ")": "'",
37 | "(": "'",
38 | ")": "'",
39 | "《": "'",
40 | "》": "'",
41 | "【": "'",
42 | "】": "'",
43 | "[": "'",
44 | "]": "'",
45 | "—": "-",
46 | "~": "-",
47 | "~": "-",
48 | "「": "'",
49 | "」": "'",
50 | }
51 |
52 | tone_modifier = ToneSandhi()
53 |
54 |
55 | def replace_punctuation(text):
56 | text = text.replace("嗯", "恩").replace("呣", "母")
57 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
58 |
59 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
60 |
61 | replaced_text = re.sub(
62 | r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
63 | )
64 |
65 | return replaced_text
66 |
67 |
68 | def g2p(text):
69 | pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
70 | sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
71 | phones, tones, word2ph = _g2p(sentences)
72 | assert sum(word2ph) == len(phones)
73 |     assert len(word2ph) == len(text)  # this can fail on some inputs; wrap in try/except if needed
74 | phones = ["_"] + phones + ["_"]
75 | tones = [0] + tones + [0]
76 | word2ph = [1] + word2ph + [1]
77 | return phones, tones, word2ph
78 |
79 |
80 | def _get_initials_finals(word):
81 | initials = []
82 | finals = []
83 | orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
84 | orig_finals = lazy_pinyin(
85 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
86 | )
87 | for c, v in zip(orig_initials, orig_finals):
88 | initials.append(c)
89 | finals.append(v)
90 | return initials, finals
91 |
92 |
93 | def _g2p(segments):
94 | phones_list = []
95 | tones_list = []
96 | word2ph = []
97 | for seg in segments:
98 |         # Remove all English words from the sentence
99 | seg = re.sub("[a-zA-Z]+", "", seg)
100 | seg_cut = psg.lcut(seg)
101 | initials = []
102 | finals = []
103 | seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
104 | for word, pos in seg_cut:
105 | if pos == "eng":
106 | continue
107 | sub_initials, sub_finals = _get_initials_finals(word)
108 | sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
109 | initials.append(sub_initials)
110 | finals.append(sub_finals)
111 |
112 | # assert len(sub_initials) == len(sub_finals) == len(word)
113 | initials = sum(initials, [])
114 | finals = sum(finals, [])
115 | #
116 | for c, v in zip(initials, finals):
117 | raw_pinyin = c + v
118 | # NOTE: post process for pypinyin outputs
119 | # we discriminate i, ii and iii
120 | if c == v:
121 | assert c in punctuation
122 | phone = [c]
123 | tone = "0"
124 | word2ph.append(1)
125 | else:
126 | v_without_tone = v[:-1]
127 | tone = v[-1]
128 |
129 | pinyin = c + v_without_tone
130 | assert tone in "12345"
131 |
132 | if c:
133 |                     # syllable with an initial
134 | v_rep_map = {
135 | "uei": "ui",
136 | "iou": "iu",
137 | "uen": "un",
138 | }
139 | if v_without_tone in v_rep_map.keys():
140 | pinyin = c + v_rep_map[v_without_tone]
141 | else:
142 |                     # bare final, no initial
143 | pinyin_rep_map = {
144 | "ing": "ying",
145 | "i": "yi",
146 | "in": "yin",
147 | "u": "wu",
148 | }
149 | if pinyin in pinyin_rep_map.keys():
150 | pinyin = pinyin_rep_map[pinyin]
151 | else:
152 | single_rep_map = {
153 | "v": "yu",
154 | "e": "e",
155 | "i": "y",
156 | "u": "w",
157 | }
158 | if pinyin[0] in single_rep_map.keys():
159 | pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
160 |
161 | assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
162 | phone = pinyin_to_symbol_map[pinyin].split(" ")
163 | word2ph.append(len(phone))
164 |
165 | phones_list += phone
166 | tones_list += [int(tone)] * len(phone)
167 | return phones_list, tones_list, word2ph
168 |
169 |
170 | def text_normalize(text):
171 | numbers = re.findall(r"\d+(?:\.?\d+)?", text)
172 | for number in numbers:
173 | text = text.replace(number, cn2an.an2cn(number), 1)
174 | text = replace_punctuation(text)
175 | return text
176 |
177 |
178 | def get_bert_feature(text, word2ph):
179 | from text import chinese_bert
180 |
181 | return chinese_bert.get_bert_feature(text, word2ph)
182 |
183 |
184 | if __name__ == "__main__":
185 | from text.chinese_bert import get_bert_feature
186 |
187 |     text = "啊!但是《原神》是由,米哈\\游自主, [研发]的一款全.新开放世界.冒险游戏"
188 | text = text_normalize(text)
189 | print(text)
190 | phones, tones, word2ph = g2p(text)
191 | bert = get_bert_feature(text, word2ph)
192 |
193 | print(phones, tones, word2ph, bert.shape)
194 |
195 |
196 | # # Example usage (g2p_paddle is not defined in this file)
197 | # text = "这是一个示例文本:,你好!这是一个测试...."
198 | # print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试
199 |
--------------------------------------------------------------------------------
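
As in the current text/chinese.py, text_normalize here converts Arabic numerals to Chinese with cn2an before mapping punctuation. A quick illustration (assuming cn2an is installed):

    import cn2an

    print(cn2an.an2cn("123"))  # 一百二十三
    print(cn2an.an2cn("3.5"))  # 三点五
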
/oldVersion/V111/text/chinese.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 |
4 | import cn2an
5 | from pypinyin import lazy_pinyin, Style
6 |
7 | from .symbols import punctuation
8 | from .tone_sandhi import ToneSandhi
9 |
10 | current_file_path = os.path.dirname(__file__)
11 | pinyin_to_symbol_map = {
12 | line.split("\t")[0]: line.strip().split("\t")[1]
13 | for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
14 | }
15 |
16 | import jieba.posseg as psg
17 |
18 |
19 | rep_map = {
20 | ":": ",",
21 | ";": ",",
22 | ",": ",",
23 | "。": ".",
24 | "!": "!",
25 | "?": "?",
26 | "\n": ".",
27 | "·": ",",
28 | "、": ",",
29 | "...": "…",
30 | "$": ".",
31 | "“": "'",
32 | "”": "'",
33 | "‘": "'",
34 | "’": "'",
35 | "(": "'",
36 | ")": "'",
37 | "(": "'",
38 | ")": "'",
39 | "《": "'",
40 | "》": "'",
41 | "【": "'",
42 | "】": "'",
43 | "[": "'",
44 | "]": "'",
45 | "—": "-",
46 | "~": "-",
47 | "~": "-",
48 | "「": "'",
49 | "」": "'",
50 | }
51 |
52 | tone_modifier = ToneSandhi()
53 |
54 |
55 | def replace_punctuation(text):
56 | text = text.replace("嗯", "恩").replace("呣", "母")
57 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
58 |
59 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
60 |
61 | replaced_text = re.sub(
62 | r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
63 | )
64 |
65 | return replaced_text
66 |
67 |
68 | def g2p(text):
69 | pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
70 | sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
71 | phones, tones, word2ph = _g2p(sentences)
72 | assert sum(word2ph) == len(phones)
73 |     assert len(word2ph) == len(text)  # this can fail on some inputs; wrap in try/except if needed
74 | phones = ["_"] + phones + ["_"]
75 | tones = [0] + tones + [0]
76 | word2ph = [1] + word2ph + [1]
77 | return phones, tones, word2ph
78 |
79 |
80 | def _get_initials_finals(word):
81 | initials = []
82 | finals = []
83 | orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
84 | orig_finals = lazy_pinyin(
85 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
86 | )
87 | for c, v in zip(orig_initials, orig_finals):
88 | initials.append(c)
89 | finals.append(v)
90 | return initials, finals
91 |
92 |
93 | def _g2p(segments):
94 | phones_list = []
95 | tones_list = []
96 | word2ph = []
97 | for seg in segments:
98 |         # Remove all English words from the sentence
99 | seg = re.sub("[a-zA-Z]+", "", seg)
100 | seg_cut = psg.lcut(seg)
101 | initials = []
102 | finals = []
103 | seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
104 | for word, pos in seg_cut:
105 | if pos == "eng":
106 | continue
107 | sub_initials, sub_finals = _get_initials_finals(word)
108 | sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
109 | initials.append(sub_initials)
110 | finals.append(sub_finals)
111 |
112 | # assert len(sub_initials) == len(sub_finals) == len(word)
113 | initials = sum(initials, [])
114 | finals = sum(finals, [])
115 | #
116 | for c, v in zip(initials, finals):
117 | raw_pinyin = c + v
118 | # NOTE: post process for pypinyin outputs
119 | # we discriminate i, ii and iii
120 | if c == v:
121 | assert c in punctuation
122 | phone = [c]
123 | tone = "0"
124 | word2ph.append(1)
125 | else:
126 | v_without_tone = v[:-1]
127 | tone = v[-1]
128 |
129 | pinyin = c + v_without_tone
130 | assert tone in "12345"
131 |
132 | if c:
133 |                     # syllable with an initial
134 | v_rep_map = {
135 | "uei": "ui",
136 | "iou": "iu",
137 | "uen": "un",
138 | }
139 | if v_without_tone in v_rep_map.keys():
140 | pinyin = c + v_rep_map[v_without_tone]
141 | else:
142 |                     # bare final, no initial
143 | pinyin_rep_map = {
144 | "ing": "ying",
145 | "i": "yi",
146 | "in": "yin",
147 | "u": "wu",
148 | }
149 | if pinyin in pinyin_rep_map.keys():
150 | pinyin = pinyin_rep_map[pinyin]
151 | else:
152 | single_rep_map = {
153 | "v": "yu",
154 | "e": "e",
155 | "i": "y",
156 | "u": "w",
157 | }
158 | if pinyin[0] in single_rep_map.keys():
159 | pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
160 |
161 | assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
162 | phone = pinyin_to_symbol_map[pinyin].split(" ")
163 | word2ph.append(len(phone))
164 |
165 | phones_list += phone
166 | tones_list += [int(tone)] * len(phone)
167 | return phones_list, tones_list, word2ph
168 |
169 |
170 | def text_normalize(text):
171 | numbers = re.findall(r"\d+(?:\.?\d+)?", text)
172 | for number in numbers:
173 | text = text.replace(number, cn2an.an2cn(number), 1)
174 | text = replace_punctuation(text)
175 | return text
176 |
177 |
178 | def get_bert_feature(text, word2ph):
179 | from text import chinese_bert
180 |
181 | return chinese_bert.get_bert_feature(text, word2ph)
182 |
183 |
184 | if __name__ == "__main__":
185 | from text.chinese_bert import get_bert_feature
186 |
187 |     text = "啊!但是《原神》是由,米哈\\游自主, [研发]的一款全.新开放世界.冒险游戏"
188 | text = text_normalize(text)
189 | print(text)
190 | phones, tones, word2ph = g2p(text)
191 | bert = get_bert_feature(text, word2ph)
192 |
193 | print(phones, tones, word2ph, bert.shape)
194 |
195 |
196 | # # Example usage (g2p_paddle is not defined in this file)
197 | # text = "这是一个示例文本:,你好!这是一个测试...."
198 | # print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试
199 |
--------------------------------------------------------------------------------