├── oldVersion
│   ├── V111
│   │   ├── text
│   │   │   ├── fix
│   │   │   │   ├── __init__.py
│   │   │   │   └── japanese_bert.py
│   │   │   ├── english_bert_mock.py
│   │   │   ├── japanese_bert.py
│   │   │   ├── cleaner.py
│   │   │   ├── __init__.py
│   │   │   ├── chinese_bert.py
│   │   │   ├── symbols.py
│   │   │   ├── english.py
│   │   │   ├── opencpop-strict.txt
│   │   │   └── chinese.py
│   │   └── __init__.py
│   ├── __init__.py
│   ├── V101
│   │   ├── text
│   │   │   ├── english_bert_mock.py
│   │   │   ├── cleaner.py
│   │   │   ├── __init__.py
│   │   │   ├── chinese_bert.py
│   │   │   ├── symbols.py
│   │   │   ├── japanese.py
│   │   │   ├── english.py
│   │   │   └── opencpop-strict.txt
│   │   └── __init__.py
│   └── V110
│       ├── text
│       │   ├── english_bert_mock.py
│       │   ├── cleaner.py
│       │   ├── __init__.py
│       │   ├── japanese_bert.py
│       │   ├── chinese_bert.py
│       │   ├── symbols.py
│       │   ├── english.py
│       │   ├── opencpop-strict.txt
│       │   └── chinese.py
│       └── __init__.py
├── tools
│   ├── __init__.py
│   ├── log.py
│   ├── translate.py
│   ├── classify_language.py
│   └── sentence.py
├── emotional
│   └── wav2vec2-large-robust-12-ft-emotion-msp-dim
│       ├── vocab.json
│       ├── preprocessor_config.json
│       ├── config.json
│       └── README.md
├── Web
│   ├── img
│   │   ├── Hiyori.ico
│   │   ├── helps1.png
│   │   └── helps2.png
│   ├── index.html
│   └── assets
│       └── index-49e71a58.css
├── text
│   ├── cmudict_cache.pickle
│   ├── bert_utils.py
│   ├── cleaner.py
│   ├── english_bert_mock.py
│   ├── __init__.py
│   ├── japanese_bert.py
│   ├── chinese_bert.py
│   ├── symbols.py
│   ├── opencpop-strict.txt
│   └── chinese.py
├── run_Mgpus.sh
├── Data
│   └── keqing
│       ├── models
│       │   └── eval
│       │       └── events.out.tfevents.1700630428.ly.20380.1
│       ├── config.yml
│       └── config.json
├── bert
│   ├── bert-base-japanese-v3
│   │   ├── tokenizer_config.json
│   │   ├── config.json
│   │   └── README.md
│   ├── bert-large-japanese-v2
│   │   ├── tokenizer_config.json
│   │   ├── config.json
│   │   └── README.md
│   └── bert_models.json
├── .gitignore
├── requirements.txt
├── monotonic_align
│   ├── __init__.py
│   └── core.py
├── audio_slicer.py
├── losses.py
├── config.yml
├── resample.py
├── re_matching.py
├── bert_gen.py
├── update_status.py
├── transcribe_genshin.py
├── README.md
├── configs
│   └── default_config.yml
├── mel_processing.py
├── emo_gen.py
├── preprocess_text.py
├── short_audio_transcribe.py
├── commons.py
└── server.py

/oldVersion/V111/text/fix/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility toolkit.
3 | """
4 | 
--------------------------------------------------------------------------------
/oldVersion/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Inference compatibility for older model versions.
3 | """
4 | 
--------------------------------------------------------------------------------
/emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/vocab.json:
--------------------------------------------------------------------------------
1 | {}
2 | 
--------------------------------------------------------------------------------
/Web/img/Hiyori.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/v3ucn/Bert-VITS2_V202_Train/HEAD/Web/img/Hiyori.ico
--------------------------------------------------------------------------------
/Web/img/helps1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/v3ucn/Bert-VITS2_V202_Train/HEAD/Web/img/helps1.png
--------------------------------------------------------------------------------
/Web/img/helps2.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/v3ucn/Bert-VITS2_V202_Train/HEAD/Web/img/helps2.png -------------------------------------------------------------------------------- /text/cmudict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/v3ucn/Bert-VITS2_V202_Train/HEAD/text/cmudict_cache.pickle -------------------------------------------------------------------------------- /oldVersion/V101/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_bert_feature(norm_text, word2ph): 5 | return torch.zeros(1024, sum(word2ph)) 6 | -------------------------------------------------------------------------------- /oldVersion/V110/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_bert_feature(norm_text, word2ph): 5 | return torch.zeros(1024, sum(word2ph)) 6 | -------------------------------------------------------------------------------- /oldVersion/V111/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_bert_feature(norm_text, word2ph): 5 | return torch.zeros(1024, sum(word2ph)) 6 | -------------------------------------------------------------------------------- /run_Mgpus.sh: -------------------------------------------------------------------------------- 1 | torchrun \ 2 | --nnodes=1:3\ 3 | --nproc_per_node=2\ 4 | --rdzv_id=1\ 5 | --rdzv_backend=c10d\ 6 | --rdzv_endpoint="ib1:8880"\ 7 | train_ms.py 8 | -------------------------------------------------------------------------------- /Data/keqing/models/eval/events.out.tfevents.1700630428.ly.20380.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/v3ucn/Bert-VITS2_V202_Train/HEAD/Data/keqing/models/eval/events.out.tfevents.1700630428.ly.20380.1 -------------------------------------------------------------------------------- /emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "do_normalize": true, 3 | "feature_extractor_type": "Wav2Vec2FeatureExtractor", 4 | "feature_size": 1, 5 | "padding_side": "right", 6 | "padding_value": 0.0, 7 | "return_attention_mask": true, 8 | "sampling_rate": 16000 9 | } 10 | -------------------------------------------------------------------------------- /bert/bert-base-japanese-v3/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "tokenizer_class": "BertJapaneseTokenizer", 3 | "model_max_length": 512, 4 | "do_lower_case": false, 5 | "word_tokenizer_type": "mecab", 6 | "subword_tokenizer_type": "wordpiece", 7 | "mecab_kwargs": { 8 | "mecab_dic": "unidic_lite" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /bert/bert-large-japanese-v2/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "tokenizer_class": "BertJapaneseTokenizer", 3 | "model_max_length": 512, 4 | "do_lower_case": false, 5 | "word_tokenizer_type": "mecab", 6 | "subword_tokenizer_type": "wordpiece", 7 | "mecab_kwargs": { 8 | "mecab_dic": "unidic_lite" 9 | } 10 | } 11 | 
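Both Japanese tokenizer configs above select BertJapaneseTokenizer with MeCab word segmentation (backed by unidic-lite) followed by WordPiece subwords. A minimal sketch of how such a config is consumed elsewhere in this repo via the transformers Auto classes, assuming the remaining model files have already been downloaded into the same ./bert/ directory:

from transformers import AutoTokenizer

# AutoTokenizer reads tokenizer_config.json and instantiates BertJapaneseTokenizer;
# fugashi and unidic-lite (both listed in requirements.txt) supply the MeCab backend.
tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")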
--------------------------------------------------------------------------------
/tools/log.py:
--------------------------------------------------------------------------------
1 | """
2 | Logger wrapper.
3 | """
4 | from loguru import logger
5 | import sys
6 | 
7 | 
8 | # Remove all of the default handlers
9 | logger.remove()
10 | 
11 | # Custom format, attached to standard output
12 | log_format = (
13 |     "{time:MM-DD HH:mm:ss} {level:<9}| {file}:{line} | {message}"
14 | )
15 | 
16 | logger.add(sys.stdout, format=log_format, backtrace=True, diagnose=True)
17 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | output/
6 | ckpt/
7 | pretrained_models/
8 | 
9 | # C extensions
10 | *.so
11 | 
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 | 
32 | 
--------------------------------------------------------------------------------
/bert/bert_models.json:
--------------------------------------------------------------------------------
1 | {
2 |   "deberta-v2-large-japanese": {
3 |     "repo_id": "ku-nlp/deberta-v2-large-japanese",
4 |     "files": ["pytorch_model.bin"]
5 |   },
6 |   "chinese-roberta-wwm-ext-large": {
7 |     "repo_id": "hfl/chinese-roberta-wwm-ext-large",
8 |     "files": ["pytorch_model.bin"]
9 |   },
10 |   "deberta-v3-large": {
11 |     "repo_id": "microsoft/deberta-v3-large",
12 |     "files": ["spm.model", "pytorch_model.bin"]
13 |   }
14 | }
15 | 
--------------------------------------------------------------------------------
/Web/index.html:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | Hiyori UI
8 | 
9 | 
10 | 
11 | 
12 | 
13 | 
14 | 
15 | 16 | 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | librosa==0.9.1 2 | matplotlib 3 | numpy 4 | numba 5 | phonemizer 6 | scipy 7 | tensorboard 8 | Unidecode 9 | amfm_decompy 10 | jieba 11 | transformers 12 | pypinyin 13 | cn2an 14 | gradio 15 | av 16 | mecab-python3 17 | loguru 18 | unidic-lite 19 | cmudict 20 | fugashi 21 | num2words 22 | PyYAML 23 | requests 24 | pyopenjtalk; sys_platform == 'linux' 25 | openjtalk; sys_platform != 'linux' 26 | jaconv 27 | psutil 28 | GPUtil 29 | vector_quantize_pytorch 30 | g2p_en 31 | sentencepiece 32 | pykakasi 33 | langid 34 | -------------------------------------------------------------------------------- /bert/bert-base-japanese-v3/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForPreTraining" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 32768 19 | } 20 | -------------------------------------------------------------------------------- /bert/bert-large-japanese-v2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForPreTraining" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 1024, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 4096, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 16, 15 | "num_hidden_layers": 24, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 32768 19 | } 20 | -------------------------------------------------------------------------------- /monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | from numpy import zeros, int32, float32 2 | from torch import from_numpy 3 | 4 | from .core import maximum_path_jit 5 | 6 | 7 | def maximum_path(neg_cent, mask): 8 | device = neg_cent.device 9 | dtype = neg_cent.dtype 10 | neg_cent = neg_cent.data.cpu().numpy().astype(float32) 11 | path = zeros(neg_cent.shape, dtype=int32) 12 | 13 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(int32) 14 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(int32) 15 | maximum_path_jit(path, neg_cent, t_t_max, t_s_max) 16 | return from_numpy(path).to(device=device, dtype=dtype) 17 | -------------------------------------------------------------------------------- /text/bert_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from huggingface_hub import hf_hub_download 4 | 5 | from config import config 6 | 7 | 8 | MIRROR: str = config.mirror 9 | 10 | 11 | def _check_bert(repo_id, files, local_path): 12 | for file in files: 13 | if not Path(local_path).joinpath(file).exists(): 14 | if MIRROR.lower() == "openi": 15 | import openi 16 | 17 | openi.model.download_model( 18 | "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert" 19 | ) 20 | else: 21 | 
hf_hub_download( 22 | repo_id, file, local_dir=local_path, local_dir_use_symlinks=False 23 | ) 24 | -------------------------------------------------------------------------------- /oldVersion/V101/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . import chinese, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /oldVersion/V110/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . import chinese, japanese, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese, "JP": japanese} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /text/cleaner.py: -------------------------------------------------------------------------------- 1 | from text import chinese, japanese, english, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | 
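Taken together, the three cleaner.py variants above show how language coverage grew across versions (ZH only in V101, ZH/JP in V110, ZH/JP/EN in the current text package) behind one stable interface. A minimal usage sketch of the current pipeline, assuming the required BERT models and G2P dictionaries are in place:

from text.cleaner import clean_text
from text import cleaned_text_to_sequence

# Normalize, then run grapheme-to-phoneme: phone/tone lists plus a word-to-phone count map.
norm_text, phones, tones, word2ph = clean_text("你好", "ZH")
# Convert symbols to integer IDs; per-language tone offsets and language IDs are applied internally.
phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "ZH")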
-------------------------------------------------------------------------------- /oldVersion/V101/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | 4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 5 | 6 | 7 | def cleaned_text_to_sequence(cleaned_text, tones, language): 8 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 9 | Args: 10 | text: string to convert to a sequence 11 | Returns: 12 | List of integers corresponding to the symbols in the text 13 | """ 14 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 15 | tone_start = language_tone_start_map[language] 16 | tones = [i + tone_start for i in tones] 17 | lang_id = language_id_map[language] 18 | lang_ids = [lang_id for i in phones] 19 | return phones, tones, lang_ids 20 | 21 | 22 | def get_bert(norm_text, word2ph, language): 23 | from .chinese_bert import get_bert_feature as zh_bert 24 | from .english_bert_mock import get_bert_feature as en_bert 25 | 26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert} 27 | bert = lang_bert_func_map[language](norm_text, word2ph) 28 | return bert 29 | -------------------------------------------------------------------------------- /oldVersion/V110/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | 4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 5 | 6 | 7 | def cleaned_text_to_sequence(cleaned_text, tones, language): 8 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 9 | Args: 10 | text: string to convert to a sequence 11 | Returns: 12 | List of integers corresponding to the symbols in the text 13 | """ 14 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 15 | tone_start = language_tone_start_map[language] 16 | tones = [i + tone_start for i in tones] 17 | lang_id = language_id_map[language] 18 | lang_ids = [lang_id for i in phones] 19 | return phones, tones, lang_ids 20 | 21 | 22 | def get_bert(norm_text, word2ph, language, device): 23 | from .chinese_bert import get_bert_feature as zh_bert 24 | from .english_bert_mock import get_bert_feature as en_bert 25 | from .japanese_bert import get_bert_feature as jp_bert 26 | 27 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 28 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 29 | return bert 30 | -------------------------------------------------------------------------------- /oldVersion/V110/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | 5 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3") 6 | 7 | 8 | def get_bert_feature(text, word2ph, device=None): 9 | if ( 10 | sys.platform == "darwin" 11 | and torch.backends.mps.is_available() 12 | and device == "cpu" 13 | ): 14 | device = "mps" 15 | if not device: 16 | device = "cuda" 17 | model = AutoModelForMaskedLM.from_pretrained("./bert/bert-base-japanese-v3").to( 18 | device 19 | ) 20 | with torch.no_grad(): 21 | inputs = tokenizer(text, return_tensors="pt") 22 | for i in inputs: 23 | inputs[i] = inputs[i].to(device) 24 | res = model(**inputs, output_hidden_states=True) 25 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 26 | assert inputs["input_ids"].shape[-1] == len(word2ph) 27 | 
word2phone = word2ph 28 | phone_level_feature = [] 29 | for i in range(len(word2phone)): 30 | repeat_feature = res[i].repeat(word2phone[i], 1) 31 | phone_level_feature.append(repeat_feature) 32 | 33 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 34 | 35 | return phone_level_feature.T 36 | -------------------------------------------------------------------------------- /oldVersion/V111/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | 5 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3") 6 | 7 | models = dict() 8 | 9 | 10 | def get_bert_feature(text, word2ph, device=None): 11 | if ( 12 | sys.platform == "darwin" 13 | and torch.backends.mps.is_available() 14 | and device == "cpu" 15 | ): 16 | device = "mps" 17 | if not device: 18 | device = "cuda" 19 | if device not in models.keys(): 20 | models[device] = AutoModelForMaskedLM.from_pretrained( 21 | "./bert/bert-base-japanese-v3" 22 | ).to(device) 23 | with torch.no_grad(): 24 | inputs = tokenizer(text, return_tensors="pt") 25 | for i in inputs: 26 | inputs[i] = inputs[i].to(device) 27 | res = models[device](**inputs, output_hidden_states=True) 28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 29 | assert inputs["input_ids"].shape[-1] == len(word2ph) 30 | word2phone = word2ph 31 | phone_level_feature = [] 32 | for i in range(len(word2phone)): 33 | repeat_feature = res[i].repeat(word2phone[i], 1) 34 | phone_level_feature.append(repeat_feature) 35 | 36 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 37 | 38 | return phone_level_feature.T 39 | -------------------------------------------------------------------------------- /oldVersion/V111/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . 
import chinese, japanese, cleaned_text_to_sequence 2 | from .fix import japanese as japanese_fix 3 | 4 | 5 | language_module_map = {"ZH": chinese, "JP": japanese} 6 | language_module_map_fix = {"ZH": chinese, "JP": japanese_fix} 7 | 8 | 9 | def clean_text(text, language): 10 | language_module = language_module_map[language] 11 | norm_text = language_module.text_normalize(text) 12 | phones, tones, word2ph = language_module.g2p(norm_text) 13 | return norm_text, phones, tones, word2ph 14 | 15 | 16 | def clean_text_fix(text, language): 17 | """使用dev分支修复""" 18 | language_module = language_module_map_fix[language] 19 | norm_text = language_module.text_normalize(text) 20 | phones, tones, word2ph = language_module.g2p(norm_text) 21 | return norm_text, phones, tones, word2ph 22 | 23 | 24 | def clean_text_bert(text, language): 25 | language_module = language_module_map[language] 26 | norm_text = language_module.text_normalize(text) 27 | phones, tones, word2ph = language_module.g2p(norm_text) 28 | bert = language_module.get_bert_feature(norm_text, word2ph) 29 | return phones, tones, bert 30 | 31 | 32 | def text_to_sequence(text, language): 33 | norm_text, phones, tones, word2ph = clean_text(text, language) 34 | return cleaned_text_to_sequence(phones, tones, language) 35 | 36 | 37 | if __name__ == "__main__": 38 | pass 39 | -------------------------------------------------------------------------------- /text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import DebertaV2Model, DebertaV2Tokenizer 5 | 6 | from config import config 7 | 8 | 9 | LOCAL_PATH = "./bert/deberta-v3-large" 10 | 11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 17 | if ( 18 | sys.platform == "darwin" 19 | and torch.backends.mps.is_available() 20 | and device == "cpu" 21 | ): 22 | device = "mps" 23 | if not device: 24 | device = "cuda" 25 | if device not in models.keys(): 26 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device) 27 | with torch.no_grad(): 28 | inputs = tokenizer(text, return_tensors="pt") 29 | for i in inputs: 30 | inputs[i] = inputs[i].to(device) 31 | res = models[device](**inputs, output_hidden_states=True) 32 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 33 | # assert len(word2ph) == len(text)+2 34 | word2phone = word2ph 35 | phone_level_feature = [] 36 | for i in range(len(word2phone)): 37 | repeat_feature = res[i].repeat(word2phone[i], 1) 38 | phone_level_feature.append(repeat_feature) 39 | 40 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 41 | 42 | return phone_level_feature.T 43 | -------------------------------------------------------------------------------- /monotonic_align/core.py: -------------------------------------------------------------------------------- 1 | import numba 2 | 3 | 4 | @numba.jit( 5 | numba.void( 6 | numba.int32[:, :, ::1], 7 | numba.float32[:, :, ::1], 8 | numba.int32[::1], 9 | numba.int32[::1], 10 | ), 11 | nopython=True, 12 | nogil=True, 13 | ) 14 | def maximum_path_jit(paths, values, t_ys, t_xs): 15 | b = paths.shape[0] 16 | max_neg_val = -1e9 17 | for i in range(int(b)): 18 | path = paths[i] 19 | value = values[i] 20 | t_y = t_ys[i] 21 | t_x = t_xs[i] 22 | 23 | v_prev = v_cur = 0.0 24 | index = t_x - 1 25 | 26 | for y in range(t_y): 27 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 
1)):
28 |                 if x == y:
29 |                     v_cur = max_neg_val
30 |                 else:
31 |                     v_cur = value[y - 1, x]
32 |                 if x == 0:
33 |                     if y == 0:
34 |                         v_prev = 0.0
35 |                     else:
36 |                         v_prev = max_neg_val
37 |                 else:
38 |                     v_prev = value[y - 1, x - 1]
39 |                 value[y, x] += max(v_prev, v_cur)
40 | 
41 |         for y in range(t_y - 1, -1, -1):
42 |             path[y, index] = 1
43 |             if index != 0 and (
44 |                 index == y or value[y - 1, index] < value[y - 1, index - 1]
45 |             ):
46 |                 index = index - 1
--------------------------------------------------------------------------------
/audio_slicer.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | import librosa  # Optional. Use any library you like to read audio files.
4 | import soundfile  # Optional. Use any library you like to write audio files.
5 | import yaml
6 | 
7 | from slicer2 import Slicer
8 | 
9 | with open("config.yml", mode="r", encoding="utf-8") as f:
10 |     configyml = yaml.load(f, Loader=yaml.FullLoader)
11 | 
12 | model_name = configyml["dataset_path"].replace("Data\\", "")
13 | 
14 | audio, sr = librosa.load(
15 |     f"./Data/{model_name}/raw/{model_name}/{model_name}.wav", sr=None, mono=False
16 | )  # Load an audio file with librosa.
17 | slicer = Slicer(
18 |     sr=sr,
19 |     threshold=-40,
20 |     min_length=2000,
21 |     min_interval=300,
22 |     hop_size=10,
23 |     max_sil_kept=500,
24 | )
25 | chunks = slicer.slice(audio)
26 | for i, chunk in enumerate(chunks):
27 |     if len(chunk.shape) > 1:
28 |         chunk = chunk.T  # Swap axes if the audio is stereo.
29 |     soundfile.write(
30 |         f"./Data/{model_name}/raw/{model_name}/{model_name}_{i}.wav", chunk, sr
31 |     )  # Save sliced audio files with soundfile.
32 | 
33 | # Remove the source wav once it has been sliced, if it still exists.
34 | if os.path.exists(f"./Data/{model_name}/raw/{model_name}/{model_name}.wav"):
35 |     os.remove(f"./Data/{model_name}/raw/{model_name}/{model_name}.wav")
--------------------------------------------------------------------------------
/oldVersion/V111/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 | 
3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4 | 
5 | 
6 | def cleaned_text_to_sequence(cleaned_text, tones, language):
7 |     """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8 | Args: 9 | text: string to convert to a sequence 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | """ 13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 14 | tone_start = language_tone_start_map[language] 15 | tones = [i + tone_start for i in tones] 16 | lang_id = language_id_map[language] 17 | lang_ids = [lang_id for i in phones] 18 | return phones, tones, lang_ids 19 | 20 | 21 | def get_bert(norm_text, word2ph, language, device): 22 | from .chinese_bert import get_bert_feature as zh_bert 23 | from .english_bert_mock import get_bert_feature as en_bert 24 | from .japanese_bert import get_bert_feature as jp_bert 25 | 26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 27 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 28 | return bert 29 | 30 | 31 | def get_bert_fix(norm_text, word2ph, language, device): 32 | from .chinese_bert import get_bert_feature as zh_bert 33 | from .english_bert_mock import get_bert_feature as en_bert 34 | from .fix.japanese_bert import get_bert_feature as jp_bert 35 | 36 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 37 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 38 | return bert 39 | -------------------------------------------------------------------------------- /losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def feature_loss(fmap_r, fmap_g): 5 | loss = 0 6 | for dr, dg in zip(fmap_r, fmap_g): 7 | for rl, gl in zip(dr, dg): 8 | rl = rl.float().detach() 9 | gl = gl.float() 10 | loss += torch.mean(torch.abs(rl - gl)) 11 | 12 | return loss * 2 13 | 14 | 15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 16 | loss = 0 17 | r_losses = [] 18 | g_losses = [] 19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 20 | dr = dr.float() 21 | dg = dg.float() 22 | r_loss = torch.mean((1 - dr) ** 2) 23 | g_loss = torch.mean(dg**2) 24 | loss += r_loss + g_loss 25 | r_losses.append(r_loss.item()) 26 | g_losses.append(g_loss.item()) 27 | 28 | return loss, r_losses, g_losses 29 | 30 | 31 | def generator_loss(disc_outputs): 32 | loss = 0 33 | gen_losses = [] 34 | for dg in disc_outputs: 35 | dg = dg.float() 36 | l = torch.mean((1 - dg) ** 2) 37 | gen_losses.append(l) 38 | loss += l 39 | 40 | return loss, gen_losses 41 | 42 | 43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 44 | """ 45 | z_p, logs_q: [b, h, t_t] 46 | m_p, logs_p: [b, h, t_t] 47 | """ 48 | z_p = z_p.float() 49 | logs_q = logs_q.float() 50 | m_p = m_p.float() 51 | logs_p = logs_p.float() 52 | z_mask = z_mask.float() 53 | 54 | kl = logs_p - logs_q - 0.5 55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 56 | kl = torch.sum(kl * z_mask) 57 | l = kl / torch.sum(z_mask) 58 | return l 59 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | from text.symbols import * 2 | 3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 4 | 5 | 6 | def cleaned_text_to_sequence(cleaned_text, tones, language): 7 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
8 | Args: 9 | text: string to convert to a sequence 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | """ 13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 14 | tone_start = language_tone_start_map[language] 15 | tones = [i + tone_start for i in tones] 16 | lang_id = language_id_map[language] 17 | lang_ids = [lang_id for i in phones] 18 | return phones, tones, lang_ids 19 | 20 | 21 | def get_bert(norm_text, word2ph, language, device): 22 | from .chinese_bert import get_bert_feature as zh_bert 23 | from .english_bert_mock import get_bert_feature as en_bert 24 | from .japanese_bert import get_bert_feature as jp_bert 25 | 26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 27 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 28 | return bert 29 | 30 | 31 | def check_bert_models(): 32 | import json 33 | from pathlib import Path 34 | 35 | from config import config 36 | from .bert_utils import _check_bert 37 | 38 | if config.mirror.lower() == "openi": 39 | import openi 40 | 41 | kwargs = {"token": config.openi_token} if config.openi_token else {} 42 | openi.login(**kwargs) 43 | 44 | with open("./bert/bert_models.json", "r") as fp: 45 | models = json.load(fp) 46 | for k, v in models.items(): 47 | local_path = Path("./bert").joinpath(k) 48 | _check_bert(v["repo_id"], v["files"], local_path) 49 | -------------------------------------------------------------------------------- /config.yml: -------------------------------------------------------------------------------- 1 | bert_gen: 2 | config_path: config.json 3 | device: cuda 4 | num_processes: 2 5 | use_multi_device: false 6 | dataset_path: Data\keqing 7 | mirror: '' 8 | openi_token: '' 9 | preprocess_text: 10 | clean: true 11 | cleaned_path: filelists/cleaned.list 12 | config_path: config.json 13 | max_val_total: 8 14 | train_path: filelists/train.list 15 | transcription_path: filelists/short_character_anno.list 16 | val_path: filelists/val.list 17 | val_per_spk: 5 18 | resample: 19 | in_dir: raw 20 | out_dir: raw 21 | sampling_rate: 44100 22 | server: 23 | device: cuda 24 | models: 25 | - config: ./Data/keqing/config.json 26 | device: cuda 27 | language: ZH 28 | model: ./Data/keqing/models/G_0.pth 29 | speakers: 30 | - length_scale: 1 31 | noise_scale: 0.6 32 | noise_scale_w: 0.8 33 | sdp_ratio: 0.2 34 | speaker: "\u79D1\u6BD4" 35 | - length_scale: 0.5 36 | noise_scale: 0.7 37 | noise_scale_w: 0.8 38 | sdp_ratio: 0.3 39 | speaker: "\u4E94\u6761\u609F" 40 | - length_scale: 1.2 41 | noise_scale: 0.6 42 | noise_scale_w: 0.8 43 | sdp_ratio: 0.2 44 | speaker: "\u5B89\u500D\u664B\u4E09" 45 | - config: ./Data/keqing/config.json 46 | device: cuda 47 | language: JP 48 | model: ./Data/keqing/models/G_0.pth 49 | speakers: [] 50 | port: 7860 51 | train_ms: 52 | base: 53 | model_image: "Bert-VITS2中日英底模-fix" 54 | repo_id: Stardust_minus/Bert-VITS2 55 | use_base_model: false 56 | config_path: config.json 57 | env: 58 | MASTER_ADDR: localhost 59 | MASTER_PORT: 10086 60 | RANK: 0 61 | THE_ENV_VAR_YOU_NEED_TO_USE: '1234567' 62 | WORLD_SIZE: 1 63 | model: models 64 | translate: 65 | app_key: '' 66 | secret_key: '' 67 | webui: 68 | config_path: Data/keqing/config.json 69 | debug: false 70 | device: cuda 71 | language_identification_library: langid 72 | model: models/G_0.pth 73 | port: 7860 74 | share: false 75 | -------------------------------------------------------------------------------- /Data/keqing/config.yml: 
-------------------------------------------------------------------------------- 1 | bert_gen: 2 | config_path: config.json 3 | device: cuda 4 | num_processes: 2 5 | use_multi_device: false 6 | dataset_path: Data\keqing 7 | mirror: '' 8 | openi_token: '' 9 | preprocess_text: 10 | clean: true 11 | cleaned_path: filelists/cleaned.list 12 | config_path: config.json 13 | max_val_total: 8 14 | train_path: filelists/train.list 15 | transcription_path: filelists/short_character_anno.list 16 | val_path: filelists/val.list 17 | val_per_spk: 5 18 | resample: 19 | in_dir: raw 20 | out_dir: raw 21 | sampling_rate: 44100 22 | server: 23 | device: cuda 24 | models: 25 | - config: ./Data/TEST/config.json 26 | device: cuda 27 | language: ZH 28 | model: ./Data/TEST/models/G_100.pth 29 | speakers: 30 | - length_scale: 1 31 | noise_scale: 0.6 32 | noise_scale_w: 0.8 33 | sdp_ratio: 0.2 34 | speaker: "\u79D1\u6BD4" 35 | - length_scale: 0.5 36 | noise_scale: 0.7 37 | noise_scale_w: 0.8 38 | sdp_ratio: 0.3 39 | speaker: "\u4E94\u6761\u609F" 40 | - length_scale: 1.2 41 | noise_scale: 0.6 42 | noise_scale_w: 0.8 43 | sdp_ratio: 0.2 44 | speaker: "\u5B89\u500D\u664B\u4E09" 45 | - config: ./Data/test/config.json 46 | device: cuda 47 | language: JP 48 | model: ./Data/test/models/G_100.pth 49 | speakers: [] 50 | port: 7860 51 | train_ms: 52 | base: 53 | model_image: "Bert-VITS2中日英底模-fix" 54 | repo_id: Stardust_minus/Bert-VITS2 55 | use_base_model: false 56 | config_path: config.json 57 | env: 58 | MASTER_ADDR: localhost 59 | MASTER_PORT: 10086 60 | RANK: 0 61 | THE_ENV_VAR_YOU_NEED_TO_USE: '1234567' 62 | WORLD_SIZE: 1 63 | model: models 64 | translate: 65 | app_key: '' 66 | secret_key: '' 67 | webui: 68 | config_path: Data/TEST/config.json 69 | debug: false 70 | device: cuda 71 | language_identification_library: langid 72 | model: models/G_100.pth 73 | port: 7860 74 | share: false 75 | -------------------------------------------------------------------------------- /Data/keqing/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 50, 4 | "eval_interval": 50, 5 | "seed": 42, 6 | "epochs": 200, 7 | "learning_rate": 0.0001, 8 | "betas": [ 9 | 0.8, 10 | 0.99 11 | ], 12 | "eps": 1e-09, 13 | "batch_size": 8, 14 | "fp16_run": false, 15 | "lr_decay": 0.99995, 16 | "segment_size": 16384, 17 | "init_lr_ratio": 1, 18 | "warmup_epochs": 0, 19 | "c_mel": 45, 20 | "c_kl": 1.0, 21 | "skip_optimizer": false 22 | }, 23 | "data": { 24 | "training_files": "Data/keqing/filelists/train.list", 25 | "validation_files": "Data/keqing/filelists/val.list", 26 | "max_wav_value": 32768.0, 27 | "sampling_rate": 44100, 28 | "filter_length": 2048, 29 | "hop_length": 512, 30 | "win_length": 2048, 31 | "n_mel_channels": 128, 32 | "mel_fmin": 0.0, 33 | "mel_fmax": null, 34 | "add_blank": true, 35 | "n_speakers": 1, 36 | "cleaned_text": true, 37 | "spk2id": { 38 | "keqing": 0 39 | } 40 | }, 41 | "model": { 42 | "use_spk_conditioned_encoder": true, 43 | "use_noise_scaled_mas": true, 44 | "use_mel_posterior_encoder": false, 45 | "use_duration_discriminator": true, 46 | "inter_channels": 192, 47 | "hidden_channels": 192, 48 | "filter_channels": 768, 49 | "n_heads": 2, 50 | "n_layers": 6, 51 | "kernel_size": 3, 52 | "p_dropout": 0.1, 53 | "resblock": "1", 54 | "resblock_kernel_sizes": [ 55 | 3, 56 | 7, 57 | 11 58 | ], 59 | "resblock_dilation_sizes": [ 60 | [ 61 | 1, 62 | 3, 63 | 5 64 | ], 65 | [ 66 | 1, 67 | 3, 68 | 5 69 | ], 70 | [ 71 | 1, 72 | 3, 73 | 5 74 | ] 75 | ], 76 | 
"upsample_rates": [ 77 | 8, 78 | 8, 79 | 2, 80 | 2, 81 | 2 82 | ], 83 | "upsample_initial_channel": 512, 84 | "upsample_kernel_sizes": [ 85 | 16, 86 | 16, 87 | 8, 88 | 2, 89 | 2 90 | ], 91 | "n_layers_q": 3, 92 | "use_spectral_norm": false, 93 | "gin_channels": 256 94 | }, 95 | "version": "2.0" 96 | } -------------------------------------------------------------------------------- /tools/translate.py: -------------------------------------------------------------------------------- 1 | """ 2 | 翻译api 3 | """ 4 | from config import config 5 | 6 | import random 7 | import hashlib 8 | import requests 9 | 10 | 11 | def translate(Sentence: str, to_Language: str = "jp", from_Language: str = ""): 12 | """ 13 | :param Sentence: 待翻译语句 14 | :param from_Language: 待翻译语句语言 15 | :param to_Language: 目标语言 16 | :return: 翻译后语句 出错时返回None 17 | 18 | 常见语言代码:中文 zh 英语 en 日语 jp 19 | """ 20 | appid = config.translate_config.app_key 21 | key = config.translate_config.secret_key 22 | if appid == "" or key == "": 23 | return "请开发者在config.yml中配置app_key与secret_key" 24 | url = "https://fanyi-api.baidu.com/api/trans/vip/translate" 25 | texts = Sentence.splitlines() 26 | outTexts = [] 27 | for t in texts: 28 | if t != "": 29 | # 签名计算 参考文档 https://api.fanyi.baidu.com/product/113 30 | salt = str(random.randint(1, 100000)) 31 | signString = appid + t + salt + key 32 | hs = hashlib.md5() 33 | hs.update(signString.encode("utf-8")) 34 | signString = hs.hexdigest() 35 | if from_Language == "": 36 | from_Language = "auto" 37 | headers = {"Content-Type": "application/x-www-form-urlencoded"} 38 | payload = { 39 | "q": t, 40 | "from": from_Language, 41 | "to": to_Language, 42 | "appid": appid, 43 | "salt": salt, 44 | "sign": signString, 45 | } 46 | # 发送请求 47 | try: 48 | response = requests.post( 49 | url=url, data=payload, headers=headers, timeout=3 50 | ) 51 | response = response.json() 52 | if "trans_result" in response.keys(): 53 | result = response["trans_result"][0] 54 | if "dst" in result.keys(): 55 | dst = result["dst"] 56 | outTexts.append(dst) 57 | except Exception: 58 | return Sentence 59 | else: 60 | outTexts.append(t) 61 | return "\n".join(outTexts) 62 | -------------------------------------------------------------------------------- /text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | from text.japanese import text2sep_kata 8 | 9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese" 10 | 11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 17 | sep_text, _, _ = text2sep_kata(text) 18 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text] 19 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens] 20 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3] 21 | return get_bert_feature_with_token(sep_ids, word2ph, device) 22 | 23 | 24 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device): 25 | if ( 26 | sys.platform == "darwin" 27 | and torch.backends.mps.is_available() 28 | and device == "cpu" 29 | ): 30 | device = "mps" 31 | if not device: 32 | device = "cuda" 33 | if device not in models.keys(): 34 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 35 | with torch.no_grad(): 36 | inputs = 
torch.tensor(tokens).to(device).unsqueeze(0) 37 | token_type_ids = torch.zeros_like(inputs).to(device) 38 | attention_mask = torch.ones_like(inputs).to(device) 39 | inputs = { 40 | "input_ids": inputs, 41 | "token_type_ids": token_type_ids, 42 | "attention_mask": attention_mask, 43 | } 44 | 45 | # for i in inputs: 46 | # inputs[i] = inputs[i].to(device) 47 | res = models[device](**inputs, output_hidden_states=True) 48 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 49 | assert inputs["input_ids"].shape[-1] == len(word2ph) 50 | word2phone = word2ph 51 | phone_level_feature = [] 52 | for i in range(len(word2phone)): 53 | repeat_feature = res[i].repeat(word2phone[i], 1) 54 | phone_level_feature.append(repeat_feature) 55 | 56 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 57 | 58 | return phone_level_feature.T 59 | -------------------------------------------------------------------------------- /oldVersion/V111/text/fix/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | from .japanese import text2sep_kata 5 | from config import config 6 | 7 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3") 8 | 9 | models = dict() 10 | 11 | 12 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 13 | sep_text, _ = text2sep_kata(text) 14 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text] 15 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens] 16 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3] 17 | return get_bert_feature_with_token(sep_ids, word2ph, device) 18 | 19 | 20 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device): 21 | if ( 22 | sys.platform == "darwin" 23 | and torch.backends.mps.is_available() 24 | and device == "cpu" 25 | ): 26 | device = "mps" 27 | if not device: 28 | device = "cuda" 29 | if device not in models.keys(): 30 | models[device] = AutoModelForMaskedLM.from_pretrained( 31 | "./bert/bert-base-japanese-v3" 32 | ).to(device) 33 | with torch.no_grad(): 34 | inputs = torch.tensor(tokens).to(device).unsqueeze(0) 35 | token_type_ids = torch.zeros_like(inputs).to(device) 36 | attention_mask = torch.ones_like(inputs).to(device) 37 | inputs = { 38 | "input_ids": inputs, 39 | "token_type_ids": token_type_ids, 40 | "attention_mask": attention_mask, 41 | } 42 | 43 | # for i in inputs: 44 | # inputs[i] = inputs[i].to(device) 45 | res = models[device](**inputs, output_hidden_states=True) 46 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 47 | assert inputs["input_ids"].shape[-1] == len(word2ph) 48 | word2phone = word2ph 49 | phone_level_feature = [] 50 | for i in range(len(word2phone)): 51 | repeat_feature = res[i].repeat(word2phone[i], 1) 52 | phone_level_feature.append(repeat_feature) 53 | 54 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 55 | 56 | return phone_level_feature.T 57 | -------------------------------------------------------------------------------- /resample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import librosa 4 | from multiprocessing import Pool, cpu_count 5 | 6 | import soundfile 7 | from tqdm import tqdm 8 | 9 | from config import config 10 | 11 | 12 | def process(item): 13 | spkdir, wav_name, args = item 14 | wav_path = os.path.join(args.in_dir, spkdir, wav_name) 15 | if 
os.path.exists(wav_path) and ".wav" in wav_path: 16 | wav, sr = librosa.load(wav_path, sr=args.sr) 17 | soundfile.write(os.path.join(args.out_dir, spkdir, wav_name), wav, sr) 18 | 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument( 23 | "--sr", 24 | type=int, 25 | default=config.resample_config.sampling_rate, 26 | help="sampling rate", 27 | ) 28 | parser.add_argument( 29 | "--in_dir", 30 | type=str, 31 | default=config.resample_config.in_dir, 32 | help="path to source dir", 33 | ) 34 | parser.add_argument( 35 | "--out_dir", 36 | type=str, 37 | default=config.resample_config.out_dir, 38 | help="path to target dir", 39 | ) 40 | parser.add_argument( 41 | "--processes", 42 | type=int, 43 | default=0, 44 | help="cpu_processes", 45 | ) 46 | args, _ = parser.parse_known_args() 47 | # autodl 无卡模式会识别出46个cpu 48 | if args.processes == 0: 49 | processes = cpu_count() - 2 if cpu_count() > 4 else 1 50 | else: 51 | processes = args.processes 52 | pool = Pool(processes=processes) 53 | 54 | tasks = [] 55 | 56 | for dirpath, _, filenames in os.walk(args.in_dir): 57 | # 子级目录 58 | spk_dir = os.path.relpath(dirpath, args.in_dir) 59 | spk_dir_out = os.path.join(args.out_dir, spk_dir) 60 | if not os.path.isdir(spk_dir_out): 61 | os.makedirs(spk_dir_out, exist_ok=True) 62 | for filename in filenames: 63 | if filename.endswith(".wav"): 64 | twople = (spk_dir, filename, args) 65 | tasks.append(twople) 66 | 67 | for _ in tqdm( 68 | pool.imap_unordered(process, tasks), 69 | ): 70 | pass 71 | 72 | pool.close() 73 | pool.join() 74 | 75 | print("音频重采样完毕!") 76 | -------------------------------------------------------------------------------- /oldVersion/V101/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 1.0.1 版本兼容 3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.0.1 4 | """ 5 | import torch 6 | import commons 7 | from .text.cleaner import clean_text 8 | from .text import cleaned_text_to_sequence 9 | from oldVersion.V111.text import get_bert 10 | 11 | 12 | def get_text(text, language_str, hps, device): 13 | norm_text, phone, tone, word2ph = clean_text(text, language_str) 14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 15 | 16 | if hps.data.add_blank: 17 | phone = commons.intersperse(phone, 0) 18 | tone = commons.intersperse(tone, 0) 19 | language = commons.intersperse(language, 0) 20 | for i in range(len(word2ph)): 21 | word2ph[i] = word2ph[i] * 2 22 | word2ph[0] += 1 23 | bert = get_bert(norm_text, word2ph, language_str, device) 24 | del word2ph 25 | 26 | assert bert.shape[-1] == len(phone) 27 | 28 | phone = torch.LongTensor(phone) 29 | tone = torch.LongTensor(tone) 30 | language = torch.LongTensor(language) 31 | 32 | return bert, phone, tone, language 33 | 34 | 35 | def infer( 36 | text, 37 | sdp_ratio, 38 | noise_scale, 39 | noise_scale_w, 40 | length_scale, 41 | sid, 42 | hps, 43 | net_g, 44 | device, 45 | ): 46 | bert, phones, tones, lang_ids = get_text(text, "ZH", hps, device) 47 | with torch.no_grad(): 48 | x_tst = phones.to(device).unsqueeze(0) 49 | tones = tones.to(device).unsqueeze(0) 50 | lang_ids = lang_ids.to(device).unsqueeze(0) 51 | bert = bert.to(device).unsqueeze(0) 52 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 53 | del phones 54 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) 55 | audio = ( 56 | net_g.infer( 57 | x_tst, 58 | x_tst_lengths, 59 | speakers, 60 | tones, 61 | lang_ids, 62 | bert, 63 | 
sdp_ratio=sdp_ratio, 64 | noise_scale=noise_scale, 65 | noise_scale_w=noise_scale_w, 66 | length_scale=length_scale, 67 | )[0][0, 0] 68 | .data.cpu() 69 | .float() 70 | .numpy() 71 | ) 72 | del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers 73 | if torch.cuda.is_available(): 74 | torch.cuda.empty_cache() 75 | return audio 76 | -------------------------------------------------------------------------------- /oldVersion/V110/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | from transformers import AutoTokenizer, AutoModelForMaskedLM 4 | 5 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large") 6 | 7 | 8 | def get_bert_feature(text, word2ph, device=None): 9 | if ( 10 | sys.platform == "darwin" 11 | and torch.backends.mps.is_available() 12 | and device == "cpu" 13 | ): 14 | device = "mps" 15 | if not device: 16 | device = "cuda" 17 | model = AutoModelForMaskedLM.from_pretrained( 18 | "./bert/chinese-roberta-wwm-ext-large" 19 | ).to(device) 20 | with torch.no_grad(): 21 | inputs = tokenizer(text, return_tensors="pt") 22 | for i in inputs: 23 | inputs[i] = inputs[i].to(device) 24 | res = model(**inputs, output_hidden_states=True) 25 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 26 | 27 | assert len(word2ph) == len(text) + 2 28 | word2phone = word2ph 29 | phone_level_feature = [] 30 | for i in range(len(word2phone)): 31 | repeat_feature = res[i].repeat(word2phone[i], 1) 32 | phone_level_feature.append(repeat_feature) 33 | 34 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 35 | 36 | return phone_level_feature.T 37 | 38 | 39 | if __name__ == "__main__": 40 | import torch 41 | 42 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 43 | word2phone = [ 44 | 1, 45 | 2, 46 | 1, 47 | 2, 48 | 2, 49 | 1, 50 | 2, 51 | 2, 52 | 1, 53 | 2, 54 | 2, 55 | 1, 56 | 2, 57 | 2, 58 | 2, 59 | 2, 60 | 2, 61 | 1, 62 | 1, 63 | 2, 64 | 2, 65 | 1, 66 | 2, 67 | 2, 68 | 2, 69 | 2, 70 | 1, 71 | 2, 72 | 2, 73 | 2, 74 | 2, 75 | 2, 76 | 1, 77 | 2, 78 | 2, 79 | 2, 80 | 2, 81 | 1, 82 | ] 83 | 84 | # 计算总帧数 85 | total_frames = sum(word2phone) 86 | print(word_level_feature.shape) 87 | print(word2phone) 88 | phone_level_feature = [] 89 | for i in range(len(word2phone)): 90 | print(word_level_feature[i].shape) 91 | 92 | # 对每个词重复word2phone[i]次 93 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 94 | phone_level_feature.append(repeat_feature) 95 | 96 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 97 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 98 | -------------------------------------------------------------------------------- /re_matching.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def extract_language_and_text_updated(speaker, dialogue): 5 | # 使用正则表达式匹配<语言>标签和其后的文本 6 | pattern_language_text = r"<(\S+?)>([^<]+)" 7 | matches = re.findall(pattern_language_text, dialogue, re.DOTALL) 8 | speaker = speaker[1:-1] 9 | # 清理文本:去除两边的空白字符 10 | matches_cleaned = [(lang.upper(), text.strip()) for lang, text in matches] 11 | matches_cleaned.append(speaker) 12 | return matches_cleaned 13 | 14 | 15 | def validate_text(input_text): 16 | # 验证说话人的正则表达式 17 | pattern_speaker = r"(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)" 18 | 19 | # 使用re.DOTALL标志使.匹配包括换行符在内的所有字符 20 | matches = re.findall(pattern_speaker, input_text, re.DOTALL) 21 | 22 | # 对每个匹配到的说话人内容进行进一步验证 23 | for _, dialogue in matches: 
24 | language_text_matches = extract_language_and_text_updated(_, dialogue) 25 | if not language_text_matches: 26 | return ( 27 | False, 28 | "Error: Invalid format detected in dialogue content. Please check your input.", 29 | ) 30 | 31 | # 如果输入的文本中没有找到任何匹配项 32 | if not matches: 33 | return ( 34 | False, 35 | "Error: No valid speaker format detected. Please check your input.", 36 | ) 37 | 38 | return True, "Input is valid." 39 | 40 | 41 | def text_matching(text: str) -> list: 42 | speaker_pattern = r"(\[\S+?\])(.+?)(?=\[\S+?\]|$)" 43 | matches = re.findall(speaker_pattern, text, re.DOTALL) 44 | result = [] 45 | for speaker, dialogue in matches: 46 | result.append(extract_language_and_text_updated(speaker, dialogue)) 47 | print(result) 48 | return result 49 | 50 | 51 | def cut_para(text): 52 | splitted_para = re.split("[\n]", text) # 按段分 53 | splitted_para = [ 54 | sentence.strip() for sentence in splitted_para if sentence.strip() 55 | ] # 删除空字符串 56 | return splitted_para 57 | 58 | 59 | def cut_sent(para): 60 | para = re.sub("([。!;?\?])([^”’])", r"\1\n\2", para) # 单字符断句符 61 | para = re.sub("(\.{6})([^”’])", r"\1\n\2", para) # 英文省略号 62 | para = re.sub("(\…{2})([^”’])", r"\1\n\2", para) # 中文省略号 63 | para = re.sub("([。!?\?][”’])([^,。!?\?])", r"\1\n\2", para) 64 | para = para.rstrip() # 段尾如果有多余的\n就去掉它 65 | return para.split("\n") 66 | 67 | 68 | if __name__ == "__main__": 69 | text = """ 70 | [说话人1] 71 | [说话人2]你好吗?元気ですか?こんにちは,世界。你好吗? 72 | [说话人3]谢谢。どういたしまして。 73 | """ 74 | text_matching(text) 75 | # 测试函数 76 | test_text = """ 77 | [说话人1]你好,こんにちは!こんにちは,世界。 78 | [说话人2]你好吗? 79 | """ 80 | text_matching(test_text) 81 | res = validate_text(test_text) 82 | print(res) 83 | -------------------------------------------------------------------------------- /oldVersion/V101/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | from transformers import AutoTokenizer, AutoModelForMaskedLM 4 | 5 | device = torch.device( 6 | "cuda" 7 | if torch.cuda.is_available() 8 | else ( 9 | "mps" 10 | if sys.platform == "darwin" and torch.backends.mps.is_available() 11 | else "cpu" 12 | ) 13 | ) 14 | 15 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large") 16 | model = AutoModelForMaskedLM.from_pretrained("./bert/chinese-roberta-wwm-ext-large").to( 17 | device 18 | ) 19 | 20 | 21 | def get_bert_feature(text, word2ph): 22 | with torch.no_grad(): 23 | inputs = tokenizer(text, return_tensors="pt") 24 | for i in inputs: 25 | inputs[i] = inputs[i].to(device) 26 | res = model(**inputs, output_hidden_states=True) 27 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 28 | 29 | assert len(word2ph) == len(text) + 2 30 | word2phone = word2ph 31 | phone_level_feature = [] 32 | for i in range(len(word2phone)): 33 | repeat_feature = res[i].repeat(word2phone[i], 1) 34 | phone_level_feature.append(repeat_feature) 35 | 36 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 37 | 38 | return phone_level_feature.T 39 | 40 | 41 | if __name__ == "__main__": 42 | # feature = get_bert_feature('你好,我是说的道理。') 43 | import torch 44 | 45 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 46 | word2phone = [ 47 | 1, 48 | 2, 49 | 1, 50 | 2, 51 | 2, 52 | 1, 53 | 2, 54 | 2, 55 | 1, 56 | 2, 57 | 2, 58 | 1, 59 | 2, 60 | 2, 61 | 2, 62 | 2, 63 | 2, 64 | 1, 65 | 1, 66 | 2, 67 | 2, 68 | 1, 69 | 2, 70 | 2, 71 | 2, 72 | 2, 73 | 1, 74 | 2, 75 | 2, 76 | 2, 77 | 2, 78 | 2, 79 | 1, 80 | 2, 81 | 2, 82 | 2, 83 | 2, 84 | 1, 85 | ] 
86 | 87 | # 计算总帧数 88 | total_frames = sum(word2phone) 89 | print(word_level_feature.shape) 90 | print(word2phone) 91 | phone_level_feature = [] 92 | for i in range(len(word2phone)): 93 | print(word_level_feature[i].shape) 94 | 95 | # 对每个词重复word2phone[i]次 96 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 97 | phone_level_feature.append(repeat_feature) 98 | 99 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 100 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 101 | -------------------------------------------------------------------------------- /bert_gen.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from multiprocessing import Pool 3 | import commons 4 | import utils 5 | from tqdm import tqdm 6 | from text import check_bert_models, cleaned_text_to_sequence, get_bert 7 | import argparse 8 | import torch.multiprocessing as mp 9 | from config import config 10 | 11 | 12 | def process_line(line): 13 | device = config.bert_gen_config.device 14 | if config.bert_gen_config.use_multi_device: 15 | rank = mp.current_process()._identity 16 | rank = rank[0] if len(rank) > 0 else 0 17 | if torch.cuda.is_available(): 18 | gpu_id = rank % torch.cuda.device_count() 19 | device = torch.device(f"cuda:{gpu_id}") 20 | else: 21 | device = torch.device("cpu") 22 | wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|") 23 | phone = phones.split(" ") 24 | tone = [int(i) for i in tone.split(" ")] 25 | word2ph = [int(i) for i in word2ph.split(" ")] 26 | word2ph = [i for i in word2ph] 27 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 28 | 29 | phone = commons.intersperse(phone, 0) 30 | tone = commons.intersperse(tone, 0) 31 | language = commons.intersperse(language, 0) 32 | for i in range(len(word2ph)): 33 | word2ph[i] = word2ph[i] * 2 34 | word2ph[0] += 1 35 | 36 | bert_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".bert.pt") 37 | 38 | try: 39 | bert = torch.load(bert_path) 40 | assert bert.shape[-1] == len(phone) 41 | except Exception: 42 | bert = get_bert(text, word2ph, language_str, device) 43 | assert bert.shape[-1] == len(phone) 44 | torch.save(bert, bert_path) 45 | 46 | 47 | preprocess_text_config = config.preprocess_text_config 48 | 49 | if __name__ == "__main__": 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument( 52 | "-c", "--config", type=str, default=config.bert_gen_config.config_path 53 | ) 54 | parser.add_argument( 55 | "--num_processes", type=int, default=config.bert_gen_config.num_processes 56 | ) 57 | args, _ = parser.parse_known_args() 58 | config_path = args.config 59 | hps = utils.get_hparams_from_file(config_path) 60 | check_bert_models() 61 | lines = [] 62 | with open(hps.data.training_files, encoding="utf-8") as f: 63 | lines.extend(f.readlines()) 64 | 65 | with open(hps.data.validation_files, encoding="utf-8") as f: 66 | lines.extend(f.readlines()) 67 | if len(lines) != 0: 68 | num_processes = args.num_processes 69 | with Pool(processes=num_processes) as pool: 70 | for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)): 71 | pass 72 | 73 | print(f"bert生成完毕!, 共有{len(lines)}个bert.pt生成!") 74 | -------------------------------------------------------------------------------- /oldVersion/V111/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | from transformers import AutoTokenizer, AutoModelForMaskedLM 4 | 5 | 
tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large") 6 | 7 | models = dict() 8 | 9 | 10 | def get_bert_feature(text, word2ph, device=None): 11 | if ( 12 | sys.platform == "darwin" 13 | and torch.backends.mps.is_available() 14 | and device == "cpu" 15 | ): 16 | device = "mps" 17 | if not device: 18 | device = "cuda" 19 | if device not in models.keys(): 20 | models[device] = AutoModelForMaskedLM.from_pretrained( 21 | "./bert/chinese-roberta-wwm-ext-large" 22 | ).to(device) 23 | with torch.no_grad(): 24 | inputs = tokenizer(text, return_tensors="pt") 25 | for i in inputs: 26 | inputs[i] = inputs[i].to(device) 27 | res = models[device](**inputs, output_hidden_states=True) 28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 29 | 30 | assert len(word2ph) == len(text) + 2 31 | word2phone = word2ph 32 | phone_level_feature = [] 33 | for i in range(len(word2phone)): 34 | repeat_feature = res[i].repeat(word2phone[i], 1) 35 | phone_level_feature.append(repeat_feature) 36 | 37 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 38 | 39 | return phone_level_feature.T 40 | 41 | 42 | if __name__ == "__main__": 43 | import torch 44 | 45 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 46 | word2phone = [ 47 | 1, 48 | 2, 49 | 1, 50 | 2, 51 | 2, 52 | 1, 53 | 2, 54 | 2, 55 | 1, 56 | 2, 57 | 2, 58 | 1, 59 | 2, 60 | 2, 61 | 2, 62 | 2, 63 | 2, 64 | 1, 65 | 1, 66 | 2, 67 | 2, 68 | 1, 69 | 2, 70 | 2, 71 | 2, 72 | 2, 73 | 1, 74 | 2, 75 | 2, 76 | 2, 77 | 2, 78 | 2, 79 | 1, 80 | 2, 81 | 2, 82 | 2, 83 | 2, 84 | 1, 85 | ] 86 | 87 | # 计算总帧数 88 | total_frames = sum(word2phone) 89 | print(word_level_feature.shape) 90 | print(word2phone) 91 | phone_level_feature = [] 92 | for i in range(len(word2phone)): 93 | print(word_level_feature[i].shape) 94 | 95 | # 对每个词重复word2phone[i]次 96 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 97 | phone_level_feature.append(repeat_feature) 98 | 99 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 100 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 101 | -------------------------------------------------------------------------------- /text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | 8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large" 9 | 10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 11 | 12 | models = dict() 13 | 14 | 15 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 16 | if ( 17 | sys.platform == "darwin" 18 | and torch.backends.mps.is_available() 19 | and device == "cpu" 20 | ): 21 | device = "mps" 22 | if not device: 23 | device = "cuda" 24 | if device not in models.keys(): 25 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 26 | with torch.no_grad(): 27 | inputs = tokenizer(text, return_tensors="pt") 28 | for i in inputs: 29 | inputs[i] = inputs[i].to(device) 30 | res = models[device](**inputs, output_hidden_states=True) 31 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 32 | 33 | assert len(word2ph) == len(text) + 2 34 | word2phone = word2ph 35 | phone_level_feature = [] 36 | for i in range(len(word2phone)): 37 | repeat_feature = res[i].repeat(word2phone[i], 1) 38 | phone_level_feature.append(repeat_feature) 39 | 40 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 41 | 42 | return 
phone_level_feature.T 43 | 44 | 45 | if __name__ == "__main__": 46 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 47 | word2phone = [ 48 | 1, 49 | 2, 50 | 1, 51 | 2, 52 | 2, 53 | 1, 54 | 2, 55 | 2, 56 | 1, 57 | 2, 58 | 2, 59 | 1, 60 | 2, 61 | 2, 62 | 2, 63 | 2, 64 | 2, 65 | 1, 66 | 1, 67 | 2, 68 | 2, 69 | 1, 70 | 2, 71 | 2, 72 | 2, 73 | 2, 74 | 1, 75 | 2, 76 | 2, 77 | 2, 78 | 2, 79 | 2, 80 | 1, 81 | 2, 82 | 2, 83 | 2, 84 | 2, 85 | 1, 86 | ] 87 | 88 | # 计算总帧数 89 | total_frames = sum(word2phone) 90 | print(word_level_feature.shape) 91 | print(word2phone) 92 | phone_level_feature = [] 93 | for i in range(len(word2phone)): 94 | print(word_level_feature[i].shape) 95 | 96 | # 对每个词重复word2phone[i]次 97 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 98 | phone_level_feature.append(repeat_feature) 99 | 100 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 101 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 102 | -------------------------------------------------------------------------------- /bert/bert-base-japanese-v3/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | license: apache-2.0 3 | datasets: 4 | - cc100 5 | - wikipedia 6 | language: 7 | - ja 8 | widget: 9 | - text: 東北大学で[MASK]の研究をしています。 10 | --- 11 | 12 | # BERT base Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102) 13 | 14 | This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language. 15 | 16 | This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization. 17 | Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective. 18 | 19 | The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/). 20 | 21 | ## Model architecture 22 | 23 | The model architecture is the same as the original BERT base model; 12 layers, 768 dimensions of hidden states, and 12 attention heads. 24 | 25 | ## Training Data 26 | 27 | The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia. 28 | For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023. 29 | The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively. 30 | 31 | For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7). 32 | 33 | ## Tokenization 34 | 35 | The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm. 36 | The vocabulary size is 32768. 37 | 38 | We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization. 39 | 40 | ## Training 41 | 42 | We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps. 
43 | For training of the MLM (masked language modeling) objective, we introduced whole word masking, in which all of the subword tokens corresponding to a single word (as tokenized by MeCab) are masked at once.
44 | 
45 | For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/).
46 | 
47 | ## Licenses
48 | 
49 | The pretrained models are distributed under the Apache License 2.0.
50 | 
51 | ## Acknowledgments
52 | 
53 | This model was trained with Cloud TPUs provided by the [TPU Research Cloud](https://sites.research.google/trc/about/) program.
54 | 
--------------------------------------------------------------------------------
/bert/bert-large-japanese-v2/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | license: apache-2.0
3 | datasets:
4 | - cc100
5 | - wikipedia
6 | language:
7 | - ja
8 | widget:
9 | - text: 東北大学で[MASK]の研究をしています。
10 | ---
11 | 
12 | # BERT large Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102)
13 | 
14 | This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language.
15 | 
16 | This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in the [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by WordPiece subword tokenization.
17 | Additionally, the model is trained with whole word masking enabled for the masked language modeling (MLM) objective.
18 | 
19 | The code for the pretraining is available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/).
20 | 
21 | ## Model architecture
22 | 
23 | The model architecture is the same as the original BERT large model: 24 layers, 1024-dimensional hidden states, and 16 attention heads.
24 | 
25 | ## Training Data
26 | 
27 | The model is trained on the Japanese portion of the [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia.
28 | For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023.
29 | The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively.
30 | 
31 | To split the texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with the [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7).
32 | 
33 | ## Tokenization
34 | 
35 | The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm.
36 | The vocabulary size is 32768.
37 | 
38 | We used the [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization.
39 | 
40 | ## Training
41 | 
42 | We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps.
43 | For training of the MLM (masked language modeling) objective, we introduced whole word masking, in which all of the subword tokens corresponding to a single word (as tokenized by MeCab) are masked at once.
44 | 
45 | For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/).
46 | 
47 | ## Licenses
48 | 
49 | The pretrained models are distributed under the Apache License 2.0.
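## Usage

Below is a minimal fill-mask sketch for this checkpoint. It assumes the weights have been downloaded into `./bert/bert-large-japanese-v2` (the layout this repo expects) and that `transformers`, `fugashi`, and `unidic-lite` are installed for the MeCab-based tokenizer; the upstream Hugging Face ID is assumed to be `cl-tohoku/bert-large-japanese-v2`.

```python
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

MODEL_DIR = "./bert/bert-large-japanese-v2"  # local path used by this repo

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForMaskedLM.from_pretrained(MODEL_DIR)

text = "東北大学で[MASK]の研究をしています。"
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

# Last-layer hidden states: (1, seq_len, 1024) for this large model.
print(outputs.hidden_states[-1].shape)

# Highest-scoring token for the [MASK] position.
mask_idx = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
print(tokenizer.decode(outputs.logits[0, mask_idx].argmax(dim=-1)))
```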
50 | 
51 | ## Acknowledgments
52 | 
53 | This model was trained with Cloud TPUs provided by the [TPU Research Cloud](https://sites.research.google/trc/about/) program.
54 | 
--------------------------------------------------------------------------------
/oldVersion/V110/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Compatibility layer for v1.1 model inference
3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.1
4 | """
5 | import torch
6 | import commons
7 | from .text.cleaner import clean_text
8 | from .text import cleaned_text_to_sequence
9 | from oldVersion.V111.text import get_bert
10 | 
11 | 
12 | def get_text(text, language_str, hps,
device): 13 | norm_text, phone, tone, word2ph = clean_text(text, language_str) 14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 15 | 16 | if hps.data.add_blank: 17 | phone = commons.intersperse(phone, 0) 18 | tone = commons.intersperse(tone, 0) 19 | language = commons.intersperse(language, 0) 20 | for i in range(len(word2ph)): 21 | word2ph[i] = word2ph[i] * 2 22 | word2ph[0] += 1 23 | bert = get_bert(norm_text, word2ph, language_str, device) 24 | del word2ph 25 | assert bert.shape[-1] == len(phone), phone 26 | 27 | if language_str == "ZH": 28 | bert = bert 29 | ja_bert = torch.zeros(768, len(phone)) 30 | elif language_str == "JP": 31 | ja_bert = bert 32 | bert = torch.zeros(1024, len(phone)) 33 | else: 34 | bert = torch.zeros(1024, len(phone)) 35 | ja_bert = torch.zeros(768, len(phone)) 36 | 37 | assert bert.shape[-1] == len( 38 | phone 39 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}" 40 | 41 | phone = torch.LongTensor(phone) 42 | tone = torch.LongTensor(tone) 43 | language = torch.LongTensor(language) 44 | return bert, ja_bert, phone, tone, language 45 | 46 | 47 | def infer( 48 | text, 49 | sdp_ratio, 50 | noise_scale, 51 | noise_scale_w, 52 | length_scale, 53 | sid, 54 | language, 55 | hps, 56 | net_g, 57 | device, 58 | ): 59 | bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps, device) 60 | with torch.no_grad(): 61 | x_tst = phones.to(device).unsqueeze(0) 62 | tones = tones.to(device).unsqueeze(0) 63 | lang_ids = lang_ids.to(device).unsqueeze(0) 64 | bert = bert.to(device).unsqueeze(0) 65 | ja_bert = ja_bert.to(device).unsqueeze(0) 66 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 67 | del phones 68 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) 69 | audio = ( 70 | net_g.infer( 71 | x_tst, 72 | x_tst_lengths, 73 | speakers, 74 | tones, 75 | lang_ids, 76 | bert, 77 | ja_bert, 78 | sdp_ratio=sdp_ratio, 79 | noise_scale=noise_scale, 80 | noise_scale_w=noise_scale_w, 81 | length_scale=length_scale, 82 | )[0][0, 0] 83 | .data.cpu() 84 | .float() 85 | .numpy() 86 | ) 87 | del x_tst, x_tst_lengths, speakers, tones, lang_ids, bert, ja_bert 88 | if torch.cuda.is_available(): 89 | torch.cuda.empty_cache() 90 | return audio 91 | -------------------------------------------------------------------------------- /update_status.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gradio as gr 3 | 4 | lang_dict = {"EN(英文)": "_en", "ZH(中文)": "_zh", "JP(日语)": "_jp"} 5 | 6 | 7 | def raw_dir_convert_to_path(target_dir: str, lang): 8 | res = target_dir.rstrip("/").rstrip("\\") 9 | if (not target_dir.startswith("raw")) and (not target_dir.startswith("./raw")): 10 | res = os.path.join("./raw", res) 11 | if ( 12 | (not res.endswith("_zh")) 13 | and (not res.endswith("_jp")) 14 | and (not res.endswith("_en")) 15 | ): 16 | res += lang_dict[lang] 17 | return res 18 | 19 | 20 | def update_g_files(): 21 | g_files = [] 22 | cnt = 0 23 | for root, dirs, files in os.walk(os.path.abspath("./logs")): 24 | for file in files: 25 | if file.startswith("G_") and file.endswith(".pth"): 26 | g_files.append(os.path.join(root, file)) 27 | cnt += 1 28 | print(g_files) 29 | return f"更新模型列表完成, 共找到{cnt}个模型", gr.Dropdown.update(choices=g_files) 30 | 31 | 32 | def update_c_files(): 33 | c_files = [] 34 | cnt = 0 35 | for root, dirs, files in os.walk(os.path.abspath("./logs")): 36 | for file in files: 37 | if file.startswith("config.json"): 38 | c_files.append(os.path.join(root, file)) 39 
| cnt += 1 40 | print(c_files) 41 | return f"更新模型列表完成, 共找到{cnt}个配置文件", gr.Dropdown.update(choices=c_files) 42 | 43 | 44 | def update_model_folders(): 45 | subdirs = [] 46 | cnt = 0 47 | for root, dirs, files in os.walk(os.path.abspath("./logs")): 48 | for dir_name in dirs: 49 | if os.path.basename(dir_name) != "eval": 50 | subdirs.append(os.path.join(root, dir_name)) 51 | cnt += 1 52 | print(subdirs) 53 | return f"更新模型文件夹列表完成, 共找到{cnt}个文件夹", gr.Dropdown.update(choices=subdirs) 54 | 55 | 56 | def update_wav_lab_pairs(): 57 | wav_count = tot_count = 0 58 | for root, _, files in os.walk("./raw"): 59 | for file in files: 60 | # print(file) 61 | file_path = os.path.join(root, file) 62 | if file.lower().endswith(".wav"): 63 | lab_file = os.path.splitext(file_path)[0] + ".lab" 64 | if os.path.exists(lab_file): 65 | wav_count += 1 66 | tot_count += 1 67 | return f"{wav_count} / {tot_count}" 68 | 69 | 70 | def update_raw_folders(): 71 | subdirs = [] 72 | cnt = 0 73 | script_path = os.path.dirname(os.path.abspath(__file__)) # 获取当前脚本的绝对路径 74 | raw_path = os.path.join(script_path, "raw") 75 | print(raw_path) 76 | os.makedirs(raw_path, exist_ok=True) 77 | for root, dirs, files in os.walk(raw_path): 78 | for dir_name in dirs: 79 | relative_path = os.path.relpath( 80 | os.path.join(root, dir_name), script_path 81 | ) # 获取相对路径 82 | subdirs.append(relative_path) 83 | cnt += 1 84 | print(subdirs) 85 | return ( 86 | f"更新raw音频文件夹列表完成, 共找到{cnt}个文件夹", 87 | gr.Dropdown.update(choices=subdirs), 88 | gr.Textbox.update(value=update_wav_lab_pairs()), 89 | ) 90 | -------------------------------------------------------------------------------- /oldVersion/V101/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "I", 78 | "N", 79 | "U", 80 | "a", 81 | "b", 82 | "by", 83 | "ch", 84 | "cl", 85 | "d", 86 | "dy", 87 | "e", 88 | "f", 89 | "g", 90 | "gy", 91 | "h", 92 | "hy", 93 | "i", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "p", 103 | "py", 104 | "r", 105 | "ry", 106 | "s", 107 | "sh", 108 | "t", 109 | "ts", 110 | "u", 111 | "V", 112 | "w", 113 | "y", 114 | "z", 115 | ] 116 | num_ja_tones = 1 117 | 118 | # English 119 | en_symbols = [ 120 | "aa", 121 | "ae", 122 | "ah", 123 | "ao", 124 | "aw", 125 | "ay", 126 | "b", 127 | "ch", 128 | "d", 129 | "dh", 130 | "eh", 131 | "er", 132 | "ey", 133 | "f", 134 | "g", 135 | "hh", 136 | "ih", 137 | "iy", 138 | "jh", 139 | "k", 140 | "l", 141 | "m", 142 | "n", 143 | "ng", 144 | "ow", 145 | "oy", 146 | "p", 147 | "r", 148 | "s", 149 | "sh", 150 | "t", 151 | "th", 152 | "uh", 153 | 
"uw", 154 | "V", 155 | "w", 156 | "y", 157 | "z", 158 | "zh", 159 | ] 160 | num_en_tones = 4 161 | 162 | # combine all symbols 163 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 164 | symbols = [pad] + normal_symbols + pu_symbols 165 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 166 | 167 | # combine all tones 168 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 169 | 170 | # language maps 171 | language_id_map = {"ZH": 0, "JA": 1, "EN": 2} 172 | num_languages = len(language_id_map.keys()) 173 | 174 | language_tone_start_map = { 175 | "ZH": 0, 176 | "JA": num_zh_tones, 177 | "EN": num_zh_tones + num_ja_tones, 178 | } 179 | 180 | if __name__ == "__main__": 181 | a = set(zh_symbols) 182 | b = set(en_symbols) 183 | print(sorted(a & b)) 184 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | 
b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V110/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 1 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V111/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | 
"ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 1 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V101/text/japanese.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py 2 | import re 3 | import sys 4 | 5 | import pyopenjtalk 6 | 7 | from . 
import symbols 8 | 9 | # Regular expression matching Japanese without punctuation marks: 10 | _japanese_characters = re.compile( 11 | r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" 12 | ) 13 | 14 | # Regular expression matching non-Japanese characters or punctuation marks: 15 | _japanese_marks = re.compile( 16 | r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" 17 | ) 18 | 19 | # List of (symbol, Japanese) pairs for marks: 20 | _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]] 21 | 22 | 23 | # List of (consonant, sokuon) pairs: 24 | _real_sokuon = [ 25 | (re.compile("%s" % x[0]), x[1]) 26 | for x in [ 27 | (r"Q([↑↓]*[kg])", r"k#\1"), 28 | (r"Q([↑↓]*[tdjʧ])", r"t#\1"), 29 | (r"Q([↑↓]*[sʃ])", r"s\1"), 30 | (r"Q([↑↓]*[pb])", r"p#\1"), 31 | ] 32 | ] 33 | 34 | # List of (consonant, hatsuon) pairs: 35 | _real_hatsuon = [ 36 | (re.compile("%s" % x[0]), x[1]) 37 | for x in [ 38 | (r"N([↑↓]*[pbm])", r"m\1"), 39 | (r"N([↑↓]*[ʧʥj])", r"n^\1"), 40 | (r"N([↑↓]*[tdn])", r"n\1"), 41 | (r"N([↑↓]*[kg])", r"ŋ\1"), 42 | ] 43 | ] 44 | 45 | 46 | def post_replace_ph(ph): 47 | rep_map = { 48 | ":": ",", 49 | ";": ",", 50 | ",": ",", 51 | "。": ".", 52 | "!": "!", 53 | "?": "?", 54 | "\n": ".", 55 | "·": ",", 56 | "、": ",", 57 | "...": "…", 58 | "v": "V", 59 | } 60 | if ph in rep_map.keys(): 61 | ph = rep_map[ph] 62 | if ph in symbols: 63 | return ph 64 | if ph not in symbols: 65 | ph = "UNK" 66 | return ph 67 | 68 | 69 | def symbols_to_japanese(text): 70 | for regex, replacement in _symbols_to_japanese: 71 | text = re.sub(regex, replacement, text) 72 | return text 73 | 74 | 75 | def preprocess_jap(text): 76 | """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html""" 77 | text = symbols_to_japanese(text) 78 | sentences = re.split(_japanese_marks, text) 79 | marks = re.findall(_japanese_marks, text) 80 | text = [] 81 | for i, sentence in enumerate(sentences): 82 | if re.match(_japanese_characters, sentence): 83 | p = pyopenjtalk.g2p(sentence) 84 | text += p.split(" ") 85 | 86 | if i < len(marks): 87 | text += [marks[i].replace(" ", "")] 88 | return text 89 | 90 | 91 | def text_normalize(text): 92 | # todo: jap text normalize 93 | return text 94 | 95 | 96 | def g2p(norm_text): 97 | phones = preprocess_jap(norm_text) 98 | phones = [post_replace_ph(i) for i in phones] 99 | # todo: implement tones and word2ph 100 | tones = [0 for i in phones] 101 | word2ph = [1 for i in phones] 102 | return phones, tones, word2ph 103 | 104 | 105 | if __name__ == "__main__": 106 | for line in open("../../../Downloads/transcript_utf8.txt").readlines(): 107 | text = line.split(":")[1] 108 | phones, tones, word2ph = g2p(text) 109 | for p in phones: 110 | if p == "z": 111 | print(text, phones) 112 | sys.exit(0) 113 | -------------------------------------------------------------------------------- /tools/classify_language.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from config import config 4 | 5 | LANGUAGE_IDENTIFICATION_LIBRARY = config.webui_config.language_identification_library 6 | 7 | module = LANGUAGE_IDENTIFICATION_LIBRARY.lower() 8 | 9 | langid_languages = [ 10 | "af", 11 | "am", 12 | "an", 13 | "ar", 14 | "as", 15 | "az", 16 | "be", 17 | "bg", 18 | "bn", 19 | "br", 20 | "bs", 21 | "ca", 22 | "cs", 23 | "cy", 24 | "da", 25 | "de", 26 | "dz", 27 | "el", 28 | "en", 29 | "eo", 30 | "es", 31 | "et", 32 | "eu", 
33 | "fa", 34 | "fi", 35 | "fo", 36 | "fr", 37 | "ga", 38 | "gl", 39 | "gu", 40 | "he", 41 | "hi", 42 | "hr", 43 | "ht", 44 | "hu", 45 | "hy", 46 | "id", 47 | "is", 48 | "it", 49 | "ja", 50 | "jv", 51 | "ka", 52 | "kk", 53 | "km", 54 | "kn", 55 | "ko", 56 | "ku", 57 | "ky", 58 | "la", 59 | "lb", 60 | "lo", 61 | "lt", 62 | "lv", 63 | "mg", 64 | "mk", 65 | "ml", 66 | "mn", 67 | "mr", 68 | "ms", 69 | "mt", 70 | "nb", 71 | "ne", 72 | "nl", 73 | "nn", 74 | "no", 75 | "oc", 76 | "or", 77 | "pa", 78 | "pl", 79 | "ps", 80 | "pt", 81 | "qu", 82 | "ro", 83 | "ru", 84 | "rw", 85 | "se", 86 | "si", 87 | "sk", 88 | "sl", 89 | "sq", 90 | "sr", 91 | "sv", 92 | "sw", 93 | "ta", 94 | "te", 95 | "th", 96 | "tl", 97 | "tr", 98 | "ug", 99 | "uk", 100 | "ur", 101 | "vi", 102 | "vo", 103 | "wa", 104 | "xh", 105 | "zh", 106 | "zu", 107 | ] 108 | 109 | 110 | def classify_language(text: str, target_languages: list = None) -> str: 111 | if module == "fastlid" or module == "fasttext": 112 | from fastlid import fastlid, supported_langs 113 | 114 | classifier = fastlid 115 | if target_languages != None: 116 | target_languages = [ 117 | lang for lang in target_languages if lang in supported_langs 118 | ] 119 | fastlid.set_languages = target_languages 120 | elif module == "langid": 121 | import langid 122 | 123 | classifier = langid.classify 124 | if target_languages != None: 125 | target_languages = [ 126 | lang for lang in target_languages if lang in langid_languages 127 | ] 128 | langid.set_languages(target_languages) 129 | else: 130 | raise ValueError(f"Wrong module {module}") 131 | 132 | lang = classifier(text)[0] 133 | 134 | return lang 135 | 136 | 137 | def classify_zh_ja(text: str) -> str: 138 | for idx, char in enumerate(text): 139 | unicode_val = ord(char) 140 | 141 | # 检测日语字符 142 | if 0x3040 <= unicode_val <= 0x309F or 0x30A0 <= unicode_val <= 0x30FF: 143 | return "ja" 144 | 145 | # 检测汉字字符 146 | if 0x4E00 <= unicode_val <= 0x9FFF: 147 | # 检查周围的字符 148 | next_char = text[idx + 1] if idx + 1 < len(text) else None 149 | 150 | if next_char and ( 151 | 0x3040 <= ord(next_char) <= 0x309F or 0x30A0 <= ord(next_char) <= 0x30FF 152 | ): 153 | return "ja" 154 | 155 | return "zh" 156 | 157 | 158 | def split_alpha_nonalpha(text): 159 | return re.split( 160 | r"(?:(?<=[\u4e00-\u9fff])|(?<=[\u3040-\u30FF]))(?=[a-zA-Z])|(?<=[a-zA-Z])(?:(?=[\u4e00-\u9fff])|(?=[\u3040-\u30FF]))", 161 | text, 162 | ) 163 | 164 | 165 | if __name__ == "__main__": 166 | text = "这是一个测试文本" 167 | print(classify_language(text)) 168 | print(classify_zh_ja(text)) # "zh" 169 | 170 | text = "これはテストテキストです" 171 | print(classify_language(text)) 172 | print(classify_zh_ja(text)) # "ja" 173 | -------------------------------------------------------------------------------- /transcribe_genshin.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import librosa 4 | import numpy as np 5 | from multiprocessing import Pool, cpu_count 6 | 7 | import soundfile 8 | from scipy.io import wavfile 9 | from tqdm import tqdm 10 | from config import config 11 | 12 | global speaker_annos 13 | speaker_annos = [] 14 | 15 | def process(item): 16 | spkdir, wav_name, args = item 17 | speaker = spkdir.replace("\\", "/").split("/")[-1] 18 | wav_path = os.path.join(args.in_dir, speaker, wav_name) 19 | if os.path.exists(wav_path) and '.wav' in wav_path: 20 | os.makedirs(os.path.join(args.out_dir, speaker), exist_ok=True) 21 | wav, sr = librosa.load(wav_path, sr=args.sr) 22 | soundfile.write( 23 | 
23 |             os.path.join(args.out_dir, speaker, wav_name),
24 |             wav,
25 |             sr
26 |         )
27 | 
28 | def process_text(item):
29 |     spkdir, wav_name, args, lang = item
30 |     speaker = spkdir.replace("\\", "/").split("/")[-1]
31 |     wav_path = os.path.join(args.in_dir, speaker, wav_name)
32 |     global speaker_annos
33 |     tr_name = wav_name.replace('.wav', '')
34 |     with open(args.out_dir + '/' + speaker + '/' + tr_name + '.lab', "r", encoding="utf-8") as file:
35 |         text = file.read()
36 |     text = text.replace("{NICKNAME}", '旅行者')
37 |     text = text.replace("{M#他}{F#她}", '他')  # collapse gender-branched pronouns to one fixed reading
38 |     text = text.replace("{M#她}{F#他}", '他')
39 |     substring = "{M#妹妹}{F#哥哥}"  # gender-branched kinship term, resolved by the a/b file suffix
40 |     if substring in text:
41 |         if tr_name.endswith("a"):
42 |             text = text.replace("{M#妹妹}{F#哥哥}", '妹妹')
43 |         if tr_name.endswith("b"):
44 |             text = text.replace("{M#妹妹}{F#哥哥}", '哥哥')
45 |     text = text.replace("#", '')
46 |     text = f'{lang}|{text}\n'
47 |     speaker_annos.append(args.out_dir + '/' + speaker + '/' + wav_name + "|" + speaker + "|" + text)
48 | 
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     parser = argparse.ArgumentParser()
53 |     parser.add_argument("--sr", type=int, default=44100, help="sampling rate")
54 |     parser.add_argument("--in_dir", type=str, default=config.resample_config.in_dir, help="path to source dir")
55 |     parser.add_argument("--out_dir", type=str, default=config.resample_config.out_dir, help="path to target dir")
56 |     parent_dir = config.resample_config.in_dir
57 |     print(config.resample_config.out_dir)
58 |     speaker_names = list(os.walk(parent_dir))[0][1]
59 |     args = parser.parse_args()
60 | 
61 |     entered = False
62 |     while not entered:
63 |         print("Enter a letter to choose language.\n")
64 |         print("C = Chinese; J = Japanese; E = English;\n e.g.: C\n")
65 |         languages = input("Enter language: ")
66 |         if languages in ("C", "c"):
67 |             lang = 'ZH'
68 |             entered = True
69 |         elif languages in ("J", "j"):
70 |             lang = 'JP'
71 |             entered = True
72 |         elif languages in ("E", "e"):
73 |             lang = 'EN'
74 |             entered = True
75 |         else:
76 |             print("Illegal argument! Please try again.\n")
77 |     # num_processes = 8
78 |     num_processes = cpu_count() - 2 if cpu_count() > 4 else 1
79 |     pool = Pool(processes=num_processes)
80 | 
81 |     for speaker in os.listdir(args.in_dir):
82 |         spk_dir = os.path.join(args.in_dir, speaker)
83 |         if os.path.isdir(spk_dir):
84 |             print(spk_dir)
85 |             for _ in tqdm(pool.imap_unordered(process, [(spk_dir, i, args) for i in os.listdir(spk_dir) if i.endswith("wav")])):
86 |                 pass
87 |             for i in os.listdir(spk_dir):
88 |                 if i.endswith("wav"):
89 |                     pro = (spk_dir, i, args, lang)
90 |                     process_text(pro)
91 |     if len(speaker_annos) == 0:
92 |         print("transcribe error: no annotations were generated (speaker_annos is empty)")
93 |     else:
94 |         with open(config.preprocess_text_config.transcription_path, 'w', encoding='utf-8') as f:
95 |             for line in speaker_annos:
96 |                 f.write(line)
97 |         print("finished.")
98 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Bert-VITS2_train
2 | 
3 | ## This project is forked from https://github.com/YYuX-1145/Bert-VITS2-Integration-package/tree/2.0.2
4 | 
5 | ## Install dependencies
6 | 
7 | ```
8 | pip install -r requirements.txt
9 | ```
10 | 
11 | ## Download the BERT models and place them in the bert directory
12 | 
13 | ```
14 | Link: https://pan.baidu.com/s/11vLNEVDeP_8YhYIJUjcUeg?pwd=v3uc
15 | ```
16 | 
17 | ```
18 | E:\work\Bert-VITS2-v202\bert>tree /f
19 | Folder PATH listing for volume myssd
20 | Volume serial number is 7CE3-15AE
21 | E:.
22 | │   bert_models.json
23 | │
24 | ├───bert-base-japanese-v3
25 | │       config.json
26 | │       README.md
27 | │       tokenizer_config.json
28 | │       vocab.txt
29 | │
30 | ├───bert-large-japanese-v2
31 | │       config.json
32 | │       README.md
33 | │       tokenizer_config.json
34 | │       vocab.txt
35 | │
36 | ├───chinese-roberta-wwm-ext-large
37 | │       added_tokens.json
38 | │       config.json
39 | │       pytorch_model.bin
40 | │       README.md
41 | │       special_tokens_map.json
42 | │       tokenizer.json
43 | │       tokenizer_config.json
44 | │       vocab.txt
45 | │
46 | ├───deberta-v2-large-japanese
47 | │       config.json
48 | │       pytorch_model.bin
49 | │       README.md
50 | │       special_tokens_map.json
51 | │       tokenizer.json
52 | │       tokenizer_config.json
53 | │
54 | └───deberta-v3-large
55 |         config.json
56 |         generator_config.json
57 |         pytorch_model.bin
58 |         README.md
59 |         spm.model
60 |         tokenizer_config.json
61 | ```
62 | 
63 | ## Download the pretrained base models and place them in the pretrained_models directory
64 | 
65 | ```
66 | https://openi.pcl.ac.cn/Stardust_minus/Bert-VITS2/modelmanage/model_readme_tmpl?name=Bert-VITS2%E4%B8%AD%E6%97%A5%E8%8B%B1%E5%BA%95%E6%A8%A1-fix
67 | ```
68 | 
69 | ```
70 | E:\work\Bert-VITS2-v202\pretrained_models>tree /f
71 | Folder PATH listing for volume myssd
72 | Volume serial number is 7CE3-15AE
73 | E:.
74 |     DUR_0.pth
75 |     D_0.pth
76 |     G_0.pth
77 | 
78 | No subfolders exist
79 | ```
80 | 
81 | ## Download the dataset
82 | 
83 | ```
84 | https://pan.ai-hobbyist.org/Genshin%20Datasets/%E4%B8%AD%E6%96%87%20-%20Chinese/%E5%88%86%E8%A7%92%E8%89%B2%20-%20Single/%E8%A7%92%E8%89%B2%E8%AF%AD%E9%9F%B3%20-%20Character
85 | ```
86 | 
87 | ## Using Keqing as an example: after extracting, place the files in the project's Data/keqing/raw/keqing directory
88 | 
89 | ```
90 | E:\work\Bert-VITS2-v202\Data\keqing\raw\keqing>tree /f
91 | Folder PATH listing for volume myssd
92 | Volume serial number is 7CE3-15AE
93 | E:.
94 |     vo_card_keqing_endOfGame_fail_01.lab
95 |     vo_card_keqing_endOfGame_fail_01.wav
96 | ```
97 | 
98 | ## Transcribe the annotation files
99 | 
100 | ```
101 | 
102 | python3 transcribe_genshin.py
103 | 
104 | ```
105 | 
106 | 
107 | ## If you are building your own dataset, name the source audio after the current model as a *.wav file (e.g. meimei.wav), place it in the raw directory, then run the slicing script
108 | 
109 | ```
110 | python3 audio_slicer.py
111 | ```
112 | 
113 | ```
114 | E:\work\Bert-VITS2-v202_demo\Data\meimei\raw\meimei>tree /f
115 | Folder PATH listing for volume myssd
116 | Volume serial number is 7CE3-15AE
117 | E:.
118 |     meimei_0.wav
119 |     meimei_1.wav
120 |     meimei_2.wav
121 |     meimei_3.wav
122 |     meimei_4.wav
123 |     meimei_5.wav
124 |     meimei_6.wav
125 |     meimei_7.wav
126 |     meimei_8.wav
127 | ```
128 | 
129 | ## Preprocess the text and generate the BERT feature files:
130 | 
131 | ```
132 | python3 preprocess_text.py
133 | 
134 | python3 bert_gen.py
135 | 
136 | ```
137 | 
138 | ## Start training
139 | 
140 | ```
141 | python3 train_ms.py
142 | ```
143 | 
144 | ## Directory of the trained models
145 | 
146 | ```
147 | 
148 | E:\work\Bert-VITS2-v202\Data\keqing\models>tree /f
149 | Folder PATH listing for volume myssd
150 | Volume serial number is 7CE3-15AE
151 | E:.
152 | │   DUR_0.pth
153 | │   DUR_550.pth
154 | │   DUR_600.pth
155 | │   DUR_650.pth
156 | │   D_0.pth
157 | │   D_600.pth
158 | │   D_650.pth
159 | │   events.out.tfevents.1700625154.ly.24008.0
160 | │   events.out.tfevents.1700630428.ly.20380.0
161 | │   G_0.pth
162 | │   G_450.pth
163 | │   G_500.pth
164 | │   G_550.pth
165 | │   G_600.pth
166 | │   G_650.pth
167 | │   train.log
168 | │
169 | └───eval
170 |         events.out.tfevents.1700625154.ly.24008.1
171 |         events.out.tfevents.1700630428.ly.20380.1
172 | 
173 | ```
174 | 
175 | ## Verify the trained model by inference
176 | 
177 | ```
178 | python3 server_fastapi.py
179 | ```
180 | 
--------------------------------------------------------------------------------
/Web/assets/index-49e71a58.css:
--------------------------------------------------------------------------------
1 | html,body{width:100%;height:100%}input::-ms-clear,input::-ms-reveal{display:none}*,*:before,*:after{box-sizing:border-box}html{font-family:sans-serif;line-height:1.15;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-overflow-style:scrollbar;-webkit-tap-highlight-color:rgba(0,0,0,0)}@-ms-viewport{width:device-width}body{margin:0}[tabindex="-1"]:focus{outline:none}hr{box-sizing:content-box;height:0;overflow:visible}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5em;font-weight:500}p{margin-top:0;margin-bottom:1em}abbr[title],abbr[data-original-title]{-webkit-text-decoration:underline dotted;text-decoration:underline;text-decoration:underline dotted;border-bottom:0;cursor:help}address{margin-bottom:1em;font-style:normal;line-height:inherit}input[type=text],input[type=password],input[type=number],textarea{-webkit-appearance:none}ol,ul,dl{margin-top:0;margin-bottom:1em}ol ol,ul ul,ol ul,ul ol{margin-bottom:0}dt{font-weight:500}dd{margin-bottom:.5em;margin-left:0}blockquote{margin:0 0 1em}dfn{font-style:italic}b,strong{font-weight:bolder}small{font-size:80%}sub,sup{position:relative;font-size:75%;line-height:0;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}pre,code,kbd,samp{font-size:1em;font-family:SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace}pre{margin-top:0;margin-bottom:1em;overflow:auto}figure{margin:0 0 1em}img{vertical-align:middle;border-style:none}a,area,button,[role=button],input:not([type=range]),label,select,summary,textarea{touch-action:manipulation}table{border-collapse:collapse}caption{padding-top:.75em;padding-bottom:.3em;text-align:left;caption-side:bottom}input,button,select,optgroup,textarea{margin:0;color:inherit;font-size:inherit;font-family:inherit;line-height:inherit}button,input{overflow:visible}button,select{text-transform:none}button,html
[type=button],[type=reset],[type=submit]{-webkit-appearance:button}button::-moz-focus-inner,[type=button]::-moz-focus-inner,[type=reset]::-moz-focus-inner,[type=submit]::-moz-focus-inner{padding:0;border-style:none}input[type=radio],input[type=checkbox]{box-sizing:border-box;padding:0}input[type=date],input[type=time],input[type=datetime-local],input[type=month]{-webkit-appearance:listbox}textarea{overflow:auto;resize:vertical}fieldset{min-width:0;margin:0;padding:0;border:0}legend{display:block;width:100%;max-width:100%;margin-bottom:.5em;padding:0;color:inherit;font-size:1.5em;line-height:inherit;white-space:normal}progress{vertical-align:baseline}[type=number]::-webkit-inner-spin-button,[type=number]::-webkit-outer-spin-button{height:auto}[type=search]{outline-offset:-2px;-webkit-appearance:none}[type=search]::-webkit-search-cancel-button,[type=search]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{font:inherit;-webkit-appearance:button}output{display:inline-block}summary{display:list-item}template{display:none}[hidden]{display:none!important}mark{padding:.2em;background-color:#feffe6}pre code.hljs{display:block;overflow-x:auto;padding:1em}code.hljs{padding:3px 5px}/*! 2 | Theme: StackOverflow Light 3 | Description: Light theme as used on stackoverflow.com 4 | Author: stackoverflow.com 5 | Maintainer: @Hirse 6 | Website: https://github.com/StackExchange/Stacks 7 | License: MIT 8 | Updated: 2021-05-15 9 | 10 | Updated for @stackoverflow/stacks v0.64.0 11 | Code Blocks: /blob/v0.64.0/lib/css/components/_stacks-code-blocks.less 12 | Colors: /blob/v0.64.0/lib/css/exports/_stacks-constants-colors.less 13 | */.hljs{color:#2f3337;background:#f6f6f6}.hljs-subst{color:#2f3337}.hljs-comment{color:#656e77}.hljs-keyword,.hljs-selector-tag,.hljs-meta .hljs-keyword,.hljs-doctag,.hljs-section,.hljs-attr{color:#015692}.hljs-attribute{color:#803378}.hljs-name,.hljs-type,.hljs-number,.hljs-selector-id,.hljs-quote,.hljs-template-tag{color:#b75501}.hljs-selector-class{color:#015692}.hljs-string,.hljs-regexp,.hljs-symbol,.hljs-variable,.hljs-template-variable,.hljs-link,.hljs-selector-attr{color:#54790d}.hljs-meta,.hljs-selector-pseudo{color:#015692}.hljs-built_in,.hljs-title,.hljs-literal{color:#b75501}.hljs-bullet,.hljs-code{color:#535a60}.hljs-meta .hljs-string{color:#54790d}.hljs-deletion{color:#c02d2e}.hljs-addition{color:#2f6f44}.hljs-emphasis{font-style:italic}.hljs-strong{font-weight:700} 14 | -------------------------------------------------------------------------------- /configs/default_config.yml: -------------------------------------------------------------------------------- 1 | # 全局配置 2 | # 对于希望在同一时间使用多个配置文件的情况,例如两个GPU同时跑两个训练集:通过环境变量指定配置文件,不指定则默认为./config.yml 3 | 4 | # 拟提供通用路径配置,统一存放数据,避免数据放得很乱 5 | # 每个数据集与其对应的模型存放至统一路径下,后续所有的路径配置均为相对于datasetPath的路径 6 | # 不填或者填空则路径为相对于项目根目录的路径 7 | dataset_path: "Data/TEST" 8 | 9 | # 模型镜像源,默认huggingface,使用openi镜像源需指定openi_token 10 | mirror: "" 11 | openi_token: "" # openi token 12 | 13 | # resample 音频重采样配置 14 | # 注意, “:” 后需要加空格 15 | resample: 16 | # 目标重采样率 17 | sampling_rate: 44100 18 | # 音频文件输入路径,重采样会将该路径下所有.wav音频文件重采样 19 | # 请填入相对于datasetPath的相对路径 20 | in_dir: "audios/raw" # 相对于根目录的路径为 /datasetPath/in_dir 21 | # 音频文件重采样后输出路径 22 | out_dir: "audios/wavs" 23 | 24 | 25 | # preprocess_text 数据集预处理相关配置 26 | # 注意, “:” 后需要加空格 27 | preprocess_text: 28 | # 原始文本文件路径,文本格式应为{wav_path}|{speaker_name}|{language}|{text}。 29 | transcription_path: "filelists/short_character_anno.list" 30 | # 数据清洗后文本路径,可以不填。不填则将在原始文本目录生成 31 | cleaned_path: 
"filelists/cleaned.list" 32 | # 训练集路径 33 | train_path: "filelists/train.list" 34 | # 验证集路径 35 | val_path: "filelists/val.list" 36 | # 配置文件路径 37 | config_path: "config.json" 38 | # 每个speaker的验证集条数 39 | val_per_spk: 5 40 | # 验证集最大条数,多于的会被截断并放到训练集中 41 | max_val_total: 8 42 | # 是否进行数据清洗 43 | clean: true 44 | 45 | 46 | # bert_gen 相关配置 47 | # 注意, “:” 后需要加空格 48 | bert_gen: 49 | # 训练数据集配置文件路径 50 | config_path: "config.json" 51 | # 并行数 52 | num_processes: 2 53 | # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理 54 | # 该选项同时决定了get_bert_feature的默认设备 55 | device: "cuda" 56 | # 使用多卡推理 57 | use_multi_device: false 58 | 59 | 60 | # train 训练配置 61 | # 注意, “:” 后需要加空格 62 | train_ms: 63 | # 需要加载的环境变量,多显卡训练时RANK请手动在环境变量填写 64 | # 环境变量对应名称环境变量不存在时加载,也就是说手动添加的环境变量优先级更高,会覆盖本配置文件 65 | env: 66 | MASTER_ADDR: "localhost" 67 | MASTER_PORT: 10086 68 | WORLD_SIZE: 1 69 | RANK: 0 70 | # 可以填写任意名的环境变量 71 | THE_ENV_VAR_YOU_NEED_TO_USE: "1234567" 72 | # 底模设置 73 | base: 74 | use_base_model: false 75 | repo_id: "Stardust_minus/Bert-VITS2" 76 | model_image: "Bert-VITS2中日底模" # openi网页的模型名 77 | # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下 78 | model: "models" 79 | # 配置文件路径 80 | config_path: "config.json" 81 | 82 | 83 | # webui webui配置 84 | # 注意, “:” 后需要加空格 85 | webui: 86 | # 推理设备 87 | device: "cuda" 88 | # 模型路径 89 | model: "models/G_100.pth" 90 | # 配置文件路径 91 | config_path: "Data/TEST/config.json" 92 | # 端口号 93 | port: 7860 94 | # 是否公开部署,对外网开放 95 | share: false 96 | # 是否开启debug模式 97 | debug: false 98 | # 语种识别库,可选langid, fastlid 99 | language_identification_library: "langid" 100 | 101 | 102 | # server api配置 103 | # 注意, “:” 后需要加空格 104 | # 注意,本配置下的所有配置均为相对于根目录的路径 105 | server: 106 | # 端口号 107 | port: 7860 108 | # 模型默认使用设备:但是当前并没有实现这个配置。 109 | device: "cuda" 110 | # 需要加载的所有模型的配置 111 | # 注意,所有模型都必须正确配置model与config的路径,空路径会导致加载错误。 112 | models: 113 | - # 模型的路径 114 | model: "./Data/TEST/models/G_100.pth" 115 | # 模型config.json的路径 116 | config: "./Data/TEST/config.json" 117 | # 模型使用设备,若填写则会覆盖默认配置 118 | device: "cuda" 119 | # 模型默认使用的语言 120 | language: "ZH" 121 | # 模型人物默认参数 122 | # 不必填写所有人物,不填的使用默认值 123 | # 暂时不用填写,当前尚未实现按人区分配置 124 | speakers: 125 | - speaker: "科比" 126 | sdp_ratio: 0.2 127 | noise_scale: 0.6 128 | noise_scale_w: 0.8 129 | length_scale: 1 130 | - speaker: "五条悟" 131 | sdp_ratio: 0.3 132 | noise_scale: 0.7 133 | noise_scale_w: 0.8 134 | length_scale: 0.5 135 | - speaker: "安倍晋三" 136 | sdp_ratio: 0.2 137 | noise_scale: 0.6 138 | noise_scale_w: 0.8 139 | length_scale: 1.2 140 | - # 模型的路径 141 | model: "./Data/test/models/G_100.pth" 142 | # 模型config.json的路径 143 | config: "./Data/test/config.json" 144 | # 模型使用设备,若填写则会覆盖默认配置 145 | device: "cuda" 146 | # 模型默认使用的语言 147 | language: "JP" 148 | # 模型人物默认参数 149 | # 不必填写所有人物,不填的使用默认值 150 | speakers: [ ] # 也可以不填 151 | 152 | 153 | # 百度翻译开放平台 api配置 154 | # api接入文档 https://api.fanyi.baidu.com/doc/21 155 | # 请不要在github等网站公开分享你的app id 与 key 156 | translate: 157 | # 你的APPID 158 | "app_key": "" 159 | # 你的密钥 160 | "secret_key": "" 161 | -------------------------------------------------------------------------------- /emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: en 3 | datasets: 4 | - msp-podcast 5 | inference: true 6 | tags: 7 | - speech 8 | - audio 9 | - wav2vec2 10 | - audio-classification 11 | - emotion-recognition 12 | license: cc-by-nc-sa-4.0 13 | pipeline_tag: audio-classification 14 | --- 15 | 16 | # Model for Dimensional Speech Emotion Recognition based on 
Wav2vec 2.0 17 | 18 | The model expects a raw audio signal as input and outputs predictions for arousal, dominance and valence in a range of approximately 0...1. In addition, it also provides the pooled states of the last transformer layer. The model was created by fine-tuning [ 19 | Wav2Vec2-Large-Robust](https://huggingface.co/facebook/wav2vec2-large-robust) on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) (v1.7). The model was pruned from 24 to 12 transformer layers before fine-tuning. An [ONNX](https://onnx.ai/") export of the model is available from [doi:10.5281/zenodo.6221127](https://zenodo.org/record/6221127). Further details are given in the associated [paper](https://arxiv.org/abs/2203.07378) and [tutorial](https://github.com/audeering/w2v2-how-to). 20 | 21 | # Usage 22 | 23 | ```python 24 | import numpy as np 25 | import torch 26 | import torch.nn as nn 27 | from transformers import Wav2Vec2Processor 28 | from transformers.models.wav2vec2.modeling_wav2vec2 import ( 29 | Wav2Vec2Model, 30 | Wav2Vec2PreTrainedModel, 31 | ) 32 | 33 | 34 | class RegressionHead(nn.Module): 35 | r"""Classification head.""" 36 | 37 | def __init__(self, config): 38 | 39 | super().__init__() 40 | 41 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 42 | self.dropout = nn.Dropout(config.final_dropout) 43 | self.out_proj = nn.Linear(config.hidden_size, config.num_labels) 44 | 45 | def forward(self, features, **kwargs): 46 | 47 | x = features 48 | x = self.dropout(x) 49 | x = self.dense(x) 50 | x = torch.tanh(x) 51 | x = self.dropout(x) 52 | x = self.out_proj(x) 53 | 54 | return x 55 | 56 | 57 | class EmotionModel(Wav2Vec2PreTrainedModel): 58 | r"""Speech emotion classifier.""" 59 | 60 | def __init__(self, config): 61 | 62 | super().__init__(config) 63 | 64 | self.config = config 65 | self.wav2vec2 = Wav2Vec2Model(config) 66 | self.classifier = RegressionHead(config) 67 | self.init_weights() 68 | 69 | def forward( 70 | self, 71 | input_values, 72 | ): 73 | 74 | outputs = self.wav2vec2(input_values) 75 | hidden_states = outputs[0] 76 | hidden_states = torch.mean(hidden_states, dim=1) 77 | logits = self.classifier(hidden_states) 78 | 79 | return hidden_states, logits 80 | 81 | 82 | 83 | # load model from hub 84 | device = 'cpu' 85 | model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim' 86 | processor = Wav2Vec2Processor.from_pretrained(model_name) 87 | model = EmotionModel.from_pretrained(model_name) 88 | 89 | # dummy signal 90 | sampling_rate = 16000 91 | signal = np.zeros((1, sampling_rate), dtype=np.float32) 92 | 93 | 94 | def process_func( 95 | x: np.ndarray, 96 | sampling_rate: int, 97 | embeddings: bool = False, 98 | ) -> np.ndarray: 99 | r"""Predict emotions or extract embeddings from raw audio signal.""" 100 | 101 | # run through processor to normalize signal 102 | # always returns a batch, so we just get the first entry 103 | # then we put it on the device 104 | y = processor(x, sampling_rate=sampling_rate) 105 | y = y['input_values'][0] 106 | y = y.reshape(1, -1) 107 | y = torch.from_numpy(y).to(device) 108 | 109 | # run through model 110 | with torch.no_grad(): 111 | y = model(y)[0 if embeddings else 1] 112 | 113 | # convert to numpy 114 | y = y.detach().cpu().numpy() 115 | 116 | return y 117 | 118 | 119 | print(process_func(signal, sampling_rate)) 120 | # Arousal dominance valence 121 | # [[0.5460754 0.6062266 0.40431657]] 122 | 123 | print(process_func(signal, sampling_rate, embeddings=True)) 124 | # Pooled hidden states of 
last transformer layer 125 | # [[-0.00752167 0.0065819 -0.00746342 ... 0.00663632 0.00848748 126 | # 0.00599211]] 127 | ``` 128 | -------------------------------------------------------------------------------- /mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | from librosa.filters import mel as librosa_mel_fn 4 | import warnings 5 | 6 | # warnings.simplefilter(action='ignore', category=FutureWarning) 7 | warnings.filterwarnings(action="ignore") 8 | MAX_WAV_VALUE = 32768.0 9 | 10 | 11 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 12 | """ 13 | PARAMS 14 | ------ 15 | C: compression factor 16 | """ 17 | return torch.log(torch.clamp(x, min=clip_val) * C) 18 | 19 | 20 | def dynamic_range_decompression_torch(x, C=1): 21 | """ 22 | PARAMS 23 | ------ 24 | C: compression factor used to compress 25 | """ 26 | return torch.exp(x) / C 27 | 28 | 29 | def spectral_normalize_torch(magnitudes): 30 | output = dynamic_range_compression_torch(magnitudes) 31 | return output 32 | 33 | 34 | def spectral_de_normalize_torch(magnitudes): 35 | output = dynamic_range_decompression_torch(magnitudes) 36 | return output 37 | 38 | 39 | mel_basis = {} 40 | hann_window = {} 41 | 42 | 43 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 44 | if torch.min(y) < -1.0: 45 | print("min value is ", torch.min(y)) 46 | if torch.max(y) > 1.0: 47 | print("max value is ", torch.max(y)) 48 | 49 | global hann_window 50 | dtype_device = str(y.dtype) + "_" + str(y.device) 51 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 52 | if wnsize_dtype_device not in hann_window: 53 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 54 | dtype=y.dtype, device=y.device 55 | ) 56 | 57 | y = torch.nn.functional.pad( 58 | y.unsqueeze(1), 59 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 60 | mode="reflect", 61 | ) 62 | y = y.squeeze(1) 63 | 64 | spec = torch.stft( 65 | y, 66 | n_fft, 67 | hop_length=hop_size, 68 | win_length=win_size, 69 | window=hann_window[wnsize_dtype_device], 70 | center=center, 71 | pad_mode="reflect", 72 | normalized=False, 73 | onesided=True, 74 | return_complex=False, 75 | ) 76 | 77 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 78 | return spec 79 | 80 | 81 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 82 | global mel_basis 83 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 84 | fmax_dtype_device = str(fmax) + "_" + dtype_device 85 | if fmax_dtype_device not in mel_basis: 86 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)  # librosa >= 0.10 requires keyword arguments 87 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 88 | dtype=spec.dtype, device=spec.device 89 | ) 90 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 91 | spec = spectral_normalize_torch(spec) 92 | return spec 93 | 94 | 95 | def mel_spectrogram_torch( 96 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 97 | ): 98 | if torch.min(y) < -1.0: 99 | print("min value is ", torch.min(y)) 100 | if torch.max(y) > 1.0: 101 | print("max value is ", torch.max(y)) 102 | 103 | global mel_basis, hann_window 104 | dtype_device = str(y.dtype) + "_" + str(y.device) 105 | fmax_dtype_device = str(fmax) + "_" + dtype_device 106 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 107 | if fmax_dtype_device not in mel_basis: 108 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 109 | 
mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 110 | dtype=y.dtype, device=y.device 111 | ) 112 | if wnsize_dtype_device not in hann_window: 113 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 114 | dtype=y.dtype, device=y.device 115 | ) 116 | 117 | y = torch.nn.functional.pad( 118 | y.unsqueeze(1), 119 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 120 | mode="reflect", 121 | ) 122 | y = y.squeeze(1) 123 | 124 | spec = torch.stft( 125 | y, 126 | n_fft, 127 | hop_length=hop_size, 128 | win_length=win_size, 129 | window=hann_window[wnsize_dtype_device], 130 | center=center, 131 | pad_mode="reflect", 132 | normalized=False, 133 | onesided=True, 134 | return_complex=False, 135 | ) 136 | 137 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 138 | 139 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 140 | spec = spectral_normalize_torch(spec) 141 | 142 | return spec 143 | -------------------------------------------------------------------------------- /oldVersion/V101/text/english.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import re 4 | from g2p_en import G2p 5 | 6 | from text import symbols 7 | 8 | current_file_path = os.path.dirname(__file__) 9 | CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep") 10 | CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle") 11 | _g2p = G2p() 12 | 13 | arpa = { 14 | "AH0", 15 | "S", 16 | "AH1", 17 | "EY2", 18 | "AE2", 19 | "EH0", 20 | "OW2", 21 | "UH0", 22 | "NG", 23 | "B", 24 | "G", 25 | "AY0", 26 | "M", 27 | "AA0", 28 | "F", 29 | "AO0", 30 | "ER2", 31 | "UH1", 32 | "IY1", 33 | "AH2", 34 | "DH", 35 | "IY0", 36 | "EY1", 37 | "IH0", 38 | "K", 39 | "N", 40 | "W", 41 | "IY2", 42 | "T", 43 | "AA1", 44 | "ER1", 45 | "EH2", 46 | "OY0", 47 | "UH2", 48 | "UW1", 49 | "Z", 50 | "AW2", 51 | "AW1", 52 | "V", 53 | "UW2", 54 | "AA2", 55 | "ER", 56 | "AW0", 57 | "UW0", 58 | "R", 59 | "OW1", 60 | "EH1", 61 | "ZH", 62 | "AE0", 63 | "IH2", 64 | "IH", 65 | "Y", 66 | "JH", 67 | "P", 68 | "AY1", 69 | "EY0", 70 | "OY2", 71 | "TH", 72 | "HH", 73 | "D", 74 | "ER0", 75 | "CH", 76 | "AO1", 77 | "AE1", 78 | "AO2", 79 | "OY1", 80 | "AY2", 81 | "IH1", 82 | "OW0", 83 | "L", 84 | "SH", 85 | } 86 | 87 | 88 | def post_replace_ph(ph): 89 | rep_map = { 90 | ":": ",", 91 | ";": ",", 92 | ",": ",", 93 | "。": ".", 94 | "!": "!", 95 | "?": "?", 96 | "\n": ".", 97 | "·": ",", 98 | "、": ",", 99 | "...": "…", 100 | "v": "V", 101 | } 102 | if ph in rep_map.keys(): 103 | ph = rep_map[ph] 104 | if ph in symbols: 105 | return ph 106 | if ph not in symbols: 107 | ph = "UNK" 108 | return ph 109 | 110 | 111 | def read_dict(): 112 | g2p_dict = {} 113 | start_line = 49 114 | with open(CMU_DICT_PATH) as f: 115 | line = f.readline() 116 | line_index = 1 117 | while line: 118 | if line_index >= start_line: 119 | line = line.strip() 120 | word_split = line.split(" ") 121 | word = word_split[0] 122 | 123 | syllable_split = word_split[1].split(" - ") 124 | g2p_dict[word] = [] 125 | for syllable in syllable_split: 126 | phone_split = syllable.split(" ") 127 | g2p_dict[word].append(phone_split) 128 | 129 | line_index = line_index + 1 130 | line = f.readline() 131 | 132 | return g2p_dict 133 | 134 | 135 | def cache_dict(g2p_dict, file_path): 136 | with open(file_path, "wb") as pickle_file: 137 | pickle.dump(g2p_dict, pickle_file) 138 | 139 | 140 | def get_dict(): 141 | if os.path.exists(CACHE_PATH): 142 | with open(CACHE_PATH, "rb") as pickle_file: 143 | g2p_dict = 
pickle.load(pickle_file) 144 | else: 145 | g2p_dict = read_dict() 146 | cache_dict(g2p_dict, CACHE_PATH) 147 | 148 | return g2p_dict 149 | 150 | 151 | eng_dict = get_dict() 152 | 153 | 154 | def refine_ph(phn): 155 | tone = 0 156 | if re.search(r"\d$", phn): 157 | tone = int(phn[-1]) + 1 158 | phn = phn[:-1] 159 | return phn.lower(), tone 160 | 161 | 162 | def refine_syllables(syllables): 163 | tones = [] 164 | phonemes = [] 165 | for phn_list in syllables: 166 | for i in range(len(phn_list)): 167 | phn = phn_list[i] 168 | phn, tone = refine_ph(phn) 169 | phonemes.append(phn) 170 | tones.append(tone) 171 | return phonemes, tones 172 | 173 | 174 | def text_normalize(text): 175 | # todo: eng text normalize 176 | return text 177 | 178 | 179 | def g2p(text): 180 | phones = [] 181 | tones = [] 182 | words = re.split(r"([,;.\-\?\!\s+])", text) 183 | for w in words: 184 | if w.upper() in eng_dict: 185 | phns, tns = refine_syllables(eng_dict[w.upper()]) 186 | phones += phns 187 | tones += tns 188 | else: 189 | phone_list = list(filter(lambda p: p != " ", _g2p(w))) 190 | for ph in phone_list: 191 | if ph in arpa: 192 | ph, tn = refine_ph(ph) 193 | phones.append(ph) 194 | tones.append(tn) 195 | else: 196 | phones.append(ph) 197 | tones.append(0) 198 | # todo: implement word2ph 199 | word2ph = [1 for i in phones] 200 | 201 | phones = [post_replace_ph(i) for i in phones] 202 | return phones, tones, word2ph 203 | 204 | 205 | if __name__ == "__main__": 206 | # print(get_dict()) 207 | # print(eng_word_to_phoneme("hello")) 208 | print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder.")) 209 | # all_phones = set() 210 | # for k, syllables in eng_dict.items(): 211 | # for group in syllables: 212 | # for ph in group: 213 | # all_phones.add(ph) 214 | # print(all_phones) 215 | -------------------------------------------------------------------------------- /oldVersion/V110/text/english.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import re 4 | from g2p_en import G2p 5 | 6 | from . 
import symbols 7 | 8 | current_file_path = os.path.dirname(__file__) 9 | CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep") 10 | CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle") 11 | _g2p = G2p() 12 | 13 | arpa = { 14 | "AH0", 15 | "S", 16 | "AH1", 17 | "EY2", 18 | "AE2", 19 | "EH0", 20 | "OW2", 21 | "UH0", 22 | "NG", 23 | "B", 24 | "G", 25 | "AY0", 26 | "M", 27 | "AA0", 28 | "F", 29 | "AO0", 30 | "ER2", 31 | "UH1", 32 | "IY1", 33 | "AH2", 34 | "DH", 35 | "IY0", 36 | "EY1", 37 | "IH0", 38 | "K", 39 | "N", 40 | "W", 41 | "IY2", 42 | "T", 43 | "AA1", 44 | "ER1", 45 | "EH2", 46 | "OY0", 47 | "UH2", 48 | "UW1", 49 | "Z", 50 | "AW2", 51 | "AW1", 52 | "V", 53 | "UW2", 54 | "AA2", 55 | "ER", 56 | "AW0", 57 | "UW0", 58 | "R", 59 | "OW1", 60 | "EH1", 61 | "ZH", 62 | "AE0", 63 | "IH2", 64 | "IH", 65 | "Y", 66 | "JH", 67 | "P", 68 | "AY1", 69 | "EY0", 70 | "OY2", 71 | "TH", 72 | "HH", 73 | "D", 74 | "ER0", 75 | "CH", 76 | "AO1", 77 | "AE1", 78 | "AO2", 79 | "OY1", 80 | "AY2", 81 | "IH1", 82 | "OW0", 83 | "L", 84 | "SH", 85 | } 86 | 87 | 88 | def post_replace_ph(ph): 89 | rep_map = { 90 | ":": ",", 91 | ";": ",", 92 | ",": ",", 93 | "。": ".", 94 | "!": "!", 95 | "?": "?", 96 | "\n": ".", 97 | "·": ",", 98 | "、": ",", 99 | "...": "…", 100 | "v": "V", 101 | } 102 | if ph in rep_map.keys(): 103 | ph = rep_map[ph] 104 | if ph in symbols: 105 | return ph 106 | if ph not in symbols: 107 | ph = "UNK" 108 | return ph 109 | 110 | 111 | def read_dict(): 112 | g2p_dict = {} 113 | start_line = 49 114 | with open(CMU_DICT_PATH) as f: 115 | line = f.readline() 116 | line_index = 1 117 | while line: 118 | if line_index >= start_line: 119 | line = line.strip() 120 | word_split = line.split(" ") 121 | word = word_split[0] 122 | 123 | syllable_split = word_split[1].split(" - ") 124 | g2p_dict[word] = [] 125 | for syllable in syllable_split: 126 | phone_split = syllable.split(" ") 127 | g2p_dict[word].append(phone_split) 128 | 129 | line_index = line_index + 1 130 | line = f.readline() 131 | 132 | return g2p_dict 133 | 134 | 135 | def cache_dict(g2p_dict, file_path): 136 | with open(file_path, "wb") as pickle_file: 137 | pickle.dump(g2p_dict, pickle_file) 138 | 139 | 140 | def get_dict(): 141 | if os.path.exists(CACHE_PATH): 142 | with open(CACHE_PATH, "rb") as pickle_file: 143 | g2p_dict = pickle.load(pickle_file) 144 | else: 145 | g2p_dict = read_dict() 146 | cache_dict(g2p_dict, CACHE_PATH) 147 | 148 | return g2p_dict 149 | 150 | 151 | eng_dict = get_dict() 152 | 153 | 154 | def refine_ph(phn): 155 | tone = 0 156 | if re.search(r"\d$", phn): 157 | tone = int(phn[-1]) + 1 158 | phn = phn[:-1] 159 | return phn.lower(), tone 160 | 161 | 162 | def refine_syllables(syllables): 163 | tones = [] 164 | phonemes = [] 165 | for phn_list in syllables: 166 | for i in range(len(phn_list)): 167 | phn = phn_list[i] 168 | phn, tone = refine_ph(phn) 169 | phonemes.append(phn) 170 | tones.append(tone) 171 | return phonemes, tones 172 | 173 | 174 | def text_normalize(text): 175 | # todo: eng text normalize 176 | return text 177 | 178 | 179 | def g2p(text): 180 | phones = [] 181 | tones = [] 182 | words = re.split(r"([,;.\-\?\!\s+])", text) 183 | for w in words: 184 | if w.upper() in eng_dict: 185 | phns, tns = refine_syllables(eng_dict[w.upper()]) 186 | phones += phns 187 | tones += tns 188 | else: 189 | phone_list = list(filter(lambda p: p != " ", _g2p(w))) 190 | for ph in phone_list: 191 | if ph in arpa: 192 | ph, tn = refine_ph(ph) 193 | phones.append(ph) 194 | tones.append(tn) 195 | else: 196 | 
phones.append(ph) 197 | tones.append(0) 198 | # todo: implement word2ph 199 | word2ph = [1 for i in phones] 200 | 201 | phones = [post_replace_ph(i) for i in phones] 202 | return phones, tones, word2ph 203 | 204 | 205 | if __name__ == "__main__": 206 | # print(get_dict()) 207 | # print(eng_word_to_phoneme("hello")) 208 | print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder.")) 209 | # all_phones = set() 210 | # for k, syllables in eng_dict.items(): 211 | # for group in syllables: 212 | # for ph in group: 213 | # all_phones.add(ph) 214 | # print(all_phones) 215 | -------------------------------------------------------------------------------- /oldVersion/V111/text/english.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import re 4 | from g2p_en import G2p 5 | 6 | from . import symbols 7 | 8 | current_file_path = os.path.dirname(__file__) 9 | CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep") 10 | CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle") 11 | _g2p = G2p() 12 | 13 | arpa = { 14 | "AH0", 15 | "S", 16 | "AH1", 17 | "EY2", 18 | "AE2", 19 | "EH0", 20 | "OW2", 21 | "UH0", 22 | "NG", 23 | "B", 24 | "G", 25 | "AY0", 26 | "M", 27 | "AA0", 28 | "F", 29 | "AO0", 30 | "ER2", 31 | "UH1", 32 | "IY1", 33 | "AH2", 34 | "DH", 35 | "IY0", 36 | "EY1", 37 | "IH0", 38 | "K", 39 | "N", 40 | "W", 41 | "IY2", 42 | "T", 43 | "AA1", 44 | "ER1", 45 | "EH2", 46 | "OY0", 47 | "UH2", 48 | "UW1", 49 | "Z", 50 | "AW2", 51 | "AW1", 52 | "V", 53 | "UW2", 54 | "AA2", 55 | "ER", 56 | "AW0", 57 | "UW0", 58 | "R", 59 | "OW1", 60 | "EH1", 61 | "ZH", 62 | "AE0", 63 | "IH2", 64 | "IH", 65 | "Y", 66 | "JH", 67 | "P", 68 | "AY1", 69 | "EY0", 70 | "OY2", 71 | "TH", 72 | "HH", 73 | "D", 74 | "ER0", 75 | "CH", 76 | "AO1", 77 | "AE1", 78 | "AO2", 79 | "OY1", 80 | "AY2", 81 | "IH1", 82 | "OW0", 83 | "L", 84 | "SH", 85 | } 86 | 87 | 88 | def post_replace_ph(ph): 89 | rep_map = { 90 | ":": ",", 91 | ";": ",", 92 | ",": ",", 93 | "。": ".", 94 | "!": "!", 95 | "?": "?", 96 | "\n": ".", 97 | "·": ",", 98 | "、": ",", 99 | "...": "…", 100 | "v": "V", 101 | } 102 | if ph in rep_map.keys(): 103 | ph = rep_map[ph] 104 | if ph in symbols: 105 | return ph 106 | if ph not in symbols: 107 | ph = "UNK" 108 | return ph 109 | 110 | 111 | def read_dict(): 112 | g2p_dict = {} 113 | start_line = 49 114 | with open(CMU_DICT_PATH) as f: 115 | line = f.readline() 116 | line_index = 1 117 | while line: 118 | if line_index >= start_line: 119 | line = line.strip() 120 | word_split = line.split(" ") 121 | word = word_split[0] 122 | 123 | syllable_split = word_split[1].split(" - ") 124 | g2p_dict[word] = [] 125 | for syllable in syllable_split: 126 | phone_split = syllable.split(" ") 127 | g2p_dict[word].append(phone_split) 128 | 129 | line_index = line_index + 1 130 | line = f.readline() 131 | 132 | return g2p_dict 133 | 134 | 135 | def cache_dict(g2p_dict, file_path): 136 | with open(file_path, "wb") as pickle_file: 137 | pickle.dump(g2p_dict, pickle_file) 138 | 139 | 140 | def get_dict(): 141 | if os.path.exists(CACHE_PATH): 142 | with open(CACHE_PATH, "rb") as pickle_file: 143 | g2p_dict = pickle.load(pickle_file) 144 | else: 145 | g2p_dict = read_dict() 146 | cache_dict(g2p_dict, CACHE_PATH) 147 | 148 | return g2p_dict 149 | 150 | 151 | eng_dict = get_dict() 152 | 153 | 154 | def refine_ph(phn): 155 | tone = 0 156 | if re.search(r"\d$", phn): 157 | tone = int(phn[-1]) + 1 158 | phn = phn[:-1] 159 | return phn.lower(), tone 160 
| 161 | 162 | def refine_syllables(syllables): 163 | tones = [] 164 | phonemes = [] 165 | for phn_list in syllables: 166 | for i in range(len(phn_list)): 167 | phn = phn_list[i] 168 | phn, tone = refine_ph(phn) 169 | phonemes.append(phn) 170 | tones.append(tone) 171 | return phonemes, tones 172 | 173 | 174 | def text_normalize(text): 175 | # todo: eng text normalize 176 | return text 177 | 178 | 179 | def g2p(text): 180 | phones = [] 181 | tones = [] 182 | words = re.split(r"([,;.\-\?\!\s+])", text) 183 | for w in words: 184 | if w.upper() in eng_dict: 185 | phns, tns = refine_syllables(eng_dict[w.upper()]) 186 | phones += phns 187 | tones += tns 188 | else: 189 | phone_list = list(filter(lambda p: p != " ", _g2p(w))) 190 | for ph in phone_list: 191 | if ph in arpa: 192 | ph, tn = refine_ph(ph) 193 | phones.append(ph) 194 | tones.append(tn) 195 | else: 196 | phones.append(ph) 197 | tones.append(0) 198 | # todo: implement word2ph 199 | word2ph = [1 for i in phones] 200 | 201 | phones = [post_replace_ph(i) for i in phones] 202 | return phones, tones, word2ph 203 | 204 | 205 | if __name__ == "__main__": 206 | # print(get_dict()) 207 | # print(eng_word_to_phoneme("hello")) 208 | print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder.")) 209 | # all_phones = set() 210 | # for k, syllables in eng_dict.items(): 211 | # for group in syllables: 212 | # for ph in group: 213 | # all_phones.add(ph) 214 | # print(all_phones) 215 | -------------------------------------------------------------------------------- /emo_gen.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.utils.data import Dataset 4 | from torch.utils.data import DataLoader 5 | from transformers import Wav2Vec2Processor 6 | from transformers.models.wav2vec2.modeling_wav2vec2 import ( 7 | Wav2Vec2Model, 8 | Wav2Vec2PreTrainedModel, 9 | ) 10 | import librosa 11 | import numpy as np 12 | import argparse 13 | from config import config 14 | import utils 15 | import os 16 | from tqdm import tqdm 17 | 18 | 19 | class RegressionHead(nn.Module): 20 | r"""Classification head.""" 21 | 22 | def __init__(self, config): 23 | super().__init__() 24 | 25 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 26 | self.dropout = nn.Dropout(config.final_dropout) 27 | self.out_proj = nn.Linear(config.hidden_size, config.num_labels) 28 | 29 | def forward(self, features, **kwargs): 30 | x = features 31 | x = self.dropout(x) 32 | x = self.dense(x) 33 | x = torch.tanh(x) 34 | x = self.dropout(x) 35 | x = self.out_proj(x) 36 | 37 | return x 38 | 39 | 40 | class EmotionModel(Wav2Vec2PreTrainedModel): 41 | r"""Speech emotion classifier.""" 42 | 43 | def __init__(self, config): 44 | super().__init__(config) 45 | 46 | self.config = config 47 | self.wav2vec2 = Wav2Vec2Model(config) 48 | self.classifier = RegressionHead(config) 49 | self.init_weights() 50 | 51 | def forward( 52 | self, 53 | input_values, 54 | ): 55 | outputs = self.wav2vec2(input_values) 56 | hidden_states = outputs[0] 57 | hidden_states = torch.mean(hidden_states, dim=1) 58 | logits = self.classifier(hidden_states) 59 | 60 | return hidden_states, logits 61 | 62 | 63 | class AudioDataset(Dataset): 64 | def __init__(self, list_of_wav_files, sr, processor): 65 | self.list_of_wav_files = list_of_wav_files 66 | self.processor = processor 67 | self.sr = sr 68 | 69 | def __len__(self): 70 | return len(self.list_of_wav_files) 71 | 72 | def __getitem__(self, idx): 73 | wav_file = 
self.list_of_wav_files[idx] 74 | audio_data, _ = librosa.load(wav_file, sr=self.sr) 75 | processed_data = self.processor(audio_data, sampling_rate=self.sr)[ 76 | "input_values" 77 | ][0] 78 | return torch.from_numpy(processed_data) 79 | 80 | 81 | model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim" 82 | processor = Wav2Vec2Processor.from_pretrained(model_name) 83 | model = EmotionModel.from_pretrained(model_name) 84 | 85 | 86 | def process_func( 87 | x: np.ndarray, 88 | sampling_rate: int, 89 | model: EmotionModel, 90 | processor: Wav2Vec2Processor, 91 | device: str, 92 | embeddings: bool = False, 93 | ) -> np.ndarray: 94 | r"""Predict emotions or extract embeddings from raw audio signal.""" 95 | model = model.to(device) 96 | y = processor(x, sampling_rate=sampling_rate) 97 | y = y["input_values"][0] 98 | y = torch.from_numpy(y).unsqueeze(0).to(device) 99 | 100 | # run through model 101 | with torch.no_grad(): 102 | y = model(y)[0 if embeddings else 1] 103 | 104 | # convert to numpy 105 | y = y.detach().cpu().numpy() 106 | 107 | return y 108 | 109 | 110 | def get_emo(path): 111 | wav, sr = librosa.load(path, sr=16000)  # sr must be passed as a keyword on librosa >= 0.10 112 | device = config.bert_gen_config.device 113 | return process_func( 114 | np.expand_dims(wav, 0).astype(np.float32),  # np.float was removed in NumPy 1.24 115 | sr, 116 | model, 117 | processor, 118 | device, 119 | embeddings=True, 120 | ).squeeze(0) 121 | 122 | 123 | if __name__ == "__main__": 124 | parser = argparse.ArgumentParser() 125 | parser.add_argument( 126 | "-c", "--config", type=str, default=config.bert_gen_config.config_path 127 | ) 128 | parser.add_argument( 129 | "--num_processes", type=int, default=config.bert_gen_config.num_processes 130 | ) 131 | args, _ = parser.parse_known_args() 132 | config_path = args.config 133 | hps = utils.get_hparams_from_file(config_path) 134 | 135 | device = config.bert_gen_config.device 136 | 137 | model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim" 138 | processor = ( 139 | Wav2Vec2Processor.from_pretrained(model_name) 140 | if processor is None 141 | else processor 142 | ) 143 | model = ( 144 | EmotionModel.from_pretrained(model_name).to(device) 145 | if model is None 146 | else model.to(device) 147 | ) 148 | 149 | lines = [] 150 | with open(hps.data.training_files, encoding="utf-8") as f: 151 | lines.extend(f.readlines()) 152 | 153 | with open(hps.data.validation_files, encoding="utf-8") as f: 154 | lines.extend(f.readlines()) 155 | 156 | wavnames = [line.split("|")[0] for line in lines] 157 | dataset = AudioDataset(wavnames, 16000, processor) 158 | data_loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=16) 159 | 160 | with torch.no_grad(): 161 | for i, data in tqdm(enumerate(data_loader), total=len(data_loader)): 162 | wavname = wavnames[i] 163 | emo_path = wavname.replace(".wav", ".emo.npy") 164 | if os.path.exists(emo_path): 165 | continue 166 | emb = model(data.to(device))[0].detach().cpu().numpy() 167 | np.save(emo_path, emb) 168 | 169 | print("Emo vec 生成完毕!") 170 | -------------------------------------------------------------------------------- /preprocess_text.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import defaultdict 3 | from random import shuffle 4 | from typing import Optional 5 | import os 6 | 7 | from tqdm import tqdm 8 | import click 9 | from text.cleaner import clean_text 10 | from config import config 11 | from infer import latest_version 12 | 13 | preprocess_text_config = config.preprocess_text_config 14 | 15 | 16 
| @click.command() 17 | @click.option( 18 | "--transcription-path", 19 | default=preprocess_text_config.transcription_path, 20 | type=click.Path(exists=True, file_okay=True, dir_okay=False), 21 | ) 22 | @click.option("--cleaned-path", default=preprocess_text_config.cleaned_path) 23 | @click.option("--train-path", default=preprocess_text_config.train_path) 24 | @click.option("--val-path", default=preprocess_text_config.val_path) 25 | @click.option( 26 | "--config-path", 27 | default=preprocess_text_config.config_path, 28 | type=click.Path(exists=True, file_okay=True, dir_okay=False), 29 | ) 30 | @click.option("--val-per-spk", default=preprocess_text_config.val_per_spk) 31 | @click.option("--max-val-total", default=preprocess_text_config.max_val_total) 32 | @click.option("--clean/--no-clean", default=preprocess_text_config.clean) 33 | @click.option("-y", "--yml_config") 34 | def preprocess( 35 | transcription_path: str, 36 | cleaned_path: Optional[str], 37 | train_path: str, 38 | val_path: str, 39 | config_path: str, 40 | val_per_spk: int, 41 | max_val_total: int, 42 | clean: bool, 43 | yml_config: str, # 这个不要删 44 | ): 45 | if cleaned_path == "" or cleaned_path is None: 46 | cleaned_path = transcription_path + ".cleaned" 47 | 48 | if clean: 49 | with open(cleaned_path, "w", encoding="utf-8") as out_file: 50 | with open(transcription_path, "r", encoding="utf-8") as trans_file: 51 | lines = trans_file.readlines() 52 | # print(lines, ' ', len(lines)) 53 | if len(lines) != 0: 54 | for line in tqdm(lines): 55 | try: 56 | utt, spk, language, text = line.strip().split("|") 57 | norm_text, phones, tones, word2ph = clean_text( 58 | text, language 59 | ) 60 | out_file.write( 61 | "{}|{}|{}|{}|{}|{}|{}\n".format( 62 | utt, 63 | spk, 64 | language, 65 | norm_text, 66 | " ".join(phones), 67 | " ".join([str(i) for i in tones]), 68 | " ".join([str(i) for i in word2ph]), 69 | ) 70 | ) 71 | except Exception as e: 72 | print(line) 73 | print(f"生成训练集和验证集时发生错误!, 详细信息:\n{e}") 74 | 75 | transcription_path = cleaned_path 76 | spk_utt_map = defaultdict(list) 77 | spk_id_map = {} 78 | current_sid = 0 79 | 80 | with open(transcription_path, "r", encoding="utf-8") as f: 81 | audioPaths = set() 82 | countSame = 0 83 | countNotFound = 0 84 | for line in f.readlines(): 85 | utt, spk, language, text, phones, tones, word2ph = line.strip().split("|") 86 | if utt in audioPaths: 87 | # 过滤数据集错误:相同的音频匹配多个文本,导致后续bert出问题 88 | print(f"重复音频文本:{line}") 89 | countSame += 1 90 | continue 91 | if not os.path.isfile(utt): 92 | # 过滤数据集错误:不存在对应音频 93 | print(f"没有找到对应的音频:{utt}") 94 | countNotFound += 1 95 | continue 96 | audioPaths.add(utt) 97 | spk_utt_map[spk].append(line) 98 | 99 | if spk not in spk_id_map.keys(): 100 | spk_id_map[spk] = current_sid 101 | current_sid += 1 102 | print(f"总重复音频数:{countSame},总未找到的音频数:{countNotFound}") 103 | 104 | train_list = [] 105 | val_list = [] 106 | 107 | for spk, utts in spk_utt_map.items(): 108 | shuffle(utts) 109 | val_list += utts[:val_per_spk] 110 | train_list += utts[val_per_spk:] 111 | 112 | if len(val_list) > max_val_total: 113 | train_list += val_list[max_val_total:] 114 | val_list = val_list[:max_val_total] 115 | 116 | with open(train_path, "w", encoding="utf-8") as f: 117 | for line in train_list: 118 | f.write(line) 119 | 120 | with open(val_path, "w", encoding="utf-8") as f: 121 | for line in val_list: 122 | f.write(line) 123 | 124 | json_config = json.load(open(config_path, encoding="utf-8")) 125 | json_config["data"]["spk2id"] = spk_id_map 126 | json_config['data']["n_speakers"] = 
current_sid# 127 | # 新增写入:写入训练版本、数据集路径 128 | json_config["version"] = latest_version 129 | json_config["data"]["training_files"] = os.path.normpath(train_path).replace( 130 | "\\", "/" 131 | ) 132 | json_config["data"]["validation_files"] = os.path.normpath(val_path).replace( 133 | "\\", "/" 134 | ) 135 | with open(config_path, "w", encoding="utf-8") as f: 136 | json.dump(json_config, f, indent=2, ensure_ascii=False) 137 | print("训练集和验证集生成完成!") 138 | 139 | 140 | if __name__ == "__main__": 141 | preprocess() 142 | -------------------------------------------------------------------------------- /short_audio_transcribe.py: -------------------------------------------------------------------------------- 1 | import whisper 2 | import os 3 | import json 4 | import torchaudio 5 | import argparse 6 | import torch 7 | from config import config 8 | lang2token = { 9 | 'zh': "ZH|", 10 | 'ja': "JP|", 11 | "en": "EN|", 12 | } 13 | def transcribe_one(audio_path): 14 | # load audio and pad/trim it to fit 30 seconds 15 | audio = whisper.load_audio(audio_path) 16 | audio = whisper.pad_or_trim(audio) 17 | 18 | # make log-Mel spectrogram and move to the same device as the model 19 | mel = whisper.log_mel_spectrogram(audio).to(model.device) 20 | 21 | # detect the spoken language 22 | _, probs = model.detect_language(mel) 23 | print(f"Detected language: {max(probs, key=probs.get)}") 24 | lang = max(probs, key=probs.get) 25 | # decode the audio 26 | options = whisper.DecodingOptions(beam_size=5) 27 | result = whisper.decode(model, mel, options) 28 | 29 | # print the recognized text 30 | print(result.text) 31 | return lang, result.text 32 | if __name__ == "__main__": 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument("--languages", default="CJ") 35 | parser.add_argument("--whisper_size", default="medium") 36 | args = parser.parse_args() 37 | if args.languages == "CJE": 38 | lang2token = { 39 | 'zh': "ZH|", 40 | 'ja': "JP|", 41 | "en": "EN|", 42 | } 43 | elif args.languages == "CJ": 44 | lang2token = { 45 | 'zh': "ZH|", 46 | 'ja': "JP|", 47 | } 48 | elif args.languages == "C": 49 | lang2token = { 50 | 'zh': "ZH|", 51 | } 52 | assert (torch.cuda.is_available()), "Please enable GPU in order to run Whisper!" 
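# Sketch (not in the original script): whisper.load_model() also accepts a `device` argument, so the hard CUDA requirement above could be relaxed with a fallback such as device = "cuda" if torch.cuda.is_available() else "cpu" followed by model = whisper.load_model(args.whisper_size, device=device); CPU transcription works, it is just far slower, which is presumably why this script insists on a GPU.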
53 | model = whisper.load_model(args.whisper_size) 54 | #parent_dir = "./custom_character_voice/" 55 | parent_dir=config.resample_config.in_dir 56 | print(parent_dir) 57 | speaker_names = list(os.walk(parent_dir))[0][1] 58 | speaker_annos = [] 59 | total_files = sum([len(files) for r, d, files in os.walk(parent_dir)]) 60 | # resample audios 61 | # 2023/4/21: Get the target sampling rate 62 | with open(config.train_ms_config.config_path,'r', encoding='utf-8') as f: 63 | hps = json.load(f) 64 | target_sr = hps['data']['sampling_rate'] 65 | processed_files = 0 66 | for speaker in speaker_names: 67 | for i, wavfile in enumerate(list(os.walk(os.path.join(parent_dir,speaker)))[0][2]): 68 | # try to load file as audio 69 | if wavfile.startswith("processed_"): 70 | continue 71 | try: 72 | wav, sr = torchaudio.load(parent_dir + "/" + speaker + "/" + wavfile, frame_offset=0, num_frames=-1, normalize=True, 73 | channels_first=True) 74 | wav = wav.mean(dim=0).unsqueeze(0) 75 | if sr != target_sr: 76 | wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav) 77 | if wav.shape[1] / sr > 20: 78 | print(f"{wavfile} too long, ignoring\n") 79 | save_path = parent_dir+"/"+ speaker + "/" + f"processed_{i}.wav" 80 | torchaudio.save(save_path, wav, target_sr, channels_first=True) 81 | # transcribe text 82 | lang, text = transcribe_one(save_path) 83 | if lang not in list(lang2token.keys()): 84 | print(f"{lang} not supported, ignoring\n") 85 | continue 86 | #text = "ZH|" + text + "\n" 87 | text = lang2token[lang] + text + "\n" 88 | speaker_annos.append(save_path + "|" + speaker + "|" + text) 89 | 90 | processed_files += 1 91 | print(f"Processed: {processed_files}/{total_files}") 92 | except Exception as e: 93 | print(e) 94 | continue 95 | 96 | # # clean annotation 97 | # import argparse 98 | # import text 99 | # from utils import load_filepaths_and_text 100 | # for i, line in enumerate(speaker_annos): 101 | # path, sid, txt = line.split("|") 102 | # cleaned_text = text._clean_text(txt, ["cjke_cleaners2"]) 103 | # cleaned_text += "\n" if not cleaned_text.endswith("\n") else "" 104 | # speaker_annos[i] = path + "|" + sid + "|" + cleaned_text 105 | # write into annotation 106 | if len(speaker_annos) == 0: 107 | print("Warning: no short audios found, this IS expected if you have only uploaded long audios, videos or video links.") 108 | print("this IS NOT expected if you have uploaded a zip file of short audios. 
Please check your file structure or make sure your audio language is supported.") 109 | with open(config.preprocess_text_config.transcription_path, 'w', encoding='utf-8') as f: 110 | for line in speaker_annos: 111 | f.write(line) 112 | 113 | # import json 114 | # # generate new config 115 | # with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f: 116 | # hps = json.load(f) 117 | # # modify n_speakers 118 | # hps['data']["n_speakers"] = 1000 + len(speaker2id) 119 | # # add speaker names 120 | # for speaker in speaker_names: 121 | # hps['speakers'][speaker] = speaker2id[speaker] 122 | # # save modified config 123 | # with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f: 124 | # json.dump(hps, f, indent=2) 125 | # print("finished") 126 | -------------------------------------------------------------------------------- /commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.nn import functional as F 4 | 5 | 6 | def init_weights(m, mean=0.0, std=0.01): 7 | classname = m.__class__.__name__ 8 | if classname.find("Conv") != -1: 9 | m.weight.data.normal_(mean, std) 10 | 11 | 12 | def get_padding(kernel_size, dilation=1): 13 | return int((kernel_size * dilation - dilation) / 2) 14 | 15 | 16 | def convert_pad_shape(pad_shape): 17 | layer = pad_shape[::-1] 18 | pad_shape = [item for sublist in layer for item in sublist] 19 | return pad_shape 20 | 21 | 22 | def intersperse(lst, item): 23 | result = [item] * (len(lst) * 2 + 1) 24 | result[1::2] = lst 25 | return result 26 | 27 | 28 | def kl_divergence(m_p, logs_p, m_q, logs_q): 29 | """KL(P||Q)""" 30 | kl = (logs_q - logs_p) - 0.5 31 | kl += ( 32 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 33 | ) 34 | return kl 35 | 36 | 37 | def rand_gumbel(shape): 38 | """Sample from the Gumbel distribution, protect from overflows.""" 39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 40 | return -torch.log(-torch.log(uniform_samples)) 41 | 42 | 43 | def rand_gumbel_like(x): 44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 45 | return g 46 | 47 | 48 | def slice_segments(x, ids_str, segment_size=4): 49 | ret = torch.zeros_like(x[:, :, :segment_size]) 50 | for i in range(x.size(0)): 51 | idx_str = ids_str[i] 52 | idx_end = idx_str + segment_size 53 | if idx_str < 0: 54 | i1 = x.size(2) + idx_str 55 | r1 = x[i, :, i1:] 56 | r2 = x[i, :, :idx_end] 57 | ret[i] = torch.cat([r1, r2], dim=1) 58 | else: 59 | ret[i] = x[i, :, idx_str:idx_end] 60 | return ret 61 | 62 | 63 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 64 | b, d, t = x.size() 65 | if x_lengths is None: 66 | x_lengths = t 67 | ids_str_max = x_lengths - segment_size + 1 68 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 69 | ret = slice_segments(x, ids_str, segment_size) 70 | return ret, ids_str 71 | 72 | 73 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 74 | position = torch.arange(length, dtype=torch.float) 75 | num_timescales = channels // 2 76 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 77 | num_timescales - 1 78 | ) 79 | inv_timescales = min_timescale * torch.exp( 80 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 81 | ) 82 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 83 | signal = torch.cat([torch.sin(scaled_time), 
torch.cos(scaled_time)], 0) 84 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 85 | signal = signal.view(1, channels, length) 86 | return signal 87 | 88 | 89 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 90 | b, channels, length = x.size() 91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 92 | return x + signal.to(dtype=x.dtype, device=x.device) 93 | 94 | 95 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 96 | b, channels, length = x.size() 97 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 98 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 99 | 100 | 101 | def subsequent_mask(length): 102 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 103 | return mask 104 | 105 | 106 | @torch.jit.script 107 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 108 | n_channels_int = n_channels[0] 109 | in_act = input_a + input_b 110 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 111 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 112 | acts = t_act * s_act 113 | return acts 114 | 115 | 116 | def convert_pad_shape(pad_shape): 117 | layer = pad_shape[::-1] 118 | pad_shape = [item for sublist in layer for item in sublist] 119 | return pad_shape 120 | 121 | 122 | def shift_1d(x): 123 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 124 | return x 125 | 126 | 127 | def sequence_mask(length, max_length=None): 128 | if max_length is None: 129 | max_length = length.max() 130 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 131 | return x.unsqueeze(0) < length.unsqueeze(1) 132 | 133 | 134 | def generate_path(duration, mask): 135 | """ 136 | duration: [b, 1, t_x] 137 | mask: [b, 1, t_y, t_x] 138 | """ 139 | 140 | b, _, t_y, t_x = mask.shape 141 | cum_duration = torch.cumsum(duration, -1) 142 | 143 | cum_duration_flat = cum_duration.view(b * t_x) 144 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 145 | path = path.view(b, t_x, t_y) 146 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 147 | path = path.unsqueeze(1).transpose(2, 3) * mask 148 | return path 149 | 150 | 151 | def clip_grad_value_(parameters, clip_value, norm_type=2): 152 | if isinstance(parameters, torch.Tensor): 153 | parameters = [parameters] 154 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 155 | norm_type = float(norm_type) 156 | if clip_value is not None: 157 | clip_value = float(clip_value) 158 | 159 | total_norm = 0 160 | for p in parameters: 161 | param_norm = p.grad.data.norm(norm_type) 162 | total_norm += param_norm.item() ** norm_type 163 | if clip_value is not None: 164 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 165 | total_norm = total_norm ** (1.0 / norm_type) 166 | return total_norm 167 | -------------------------------------------------------------------------------- /tools/sentence.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import regex as re 4 | 5 | from tools.classify_language import classify_language, split_alpha_nonalpha 6 | 7 | 8 | def check_is_none(item) -> bool: 9 | """none -> True, not none -> False""" 10 | return ( 11 | item is None 12 | or (isinstance(item, str) and str(item).isspace()) 13 | or str(item) == "" 14 | ) 15 | 16 | 17 | def markup_language(text: str, target_languages: list = None) -> str: 18 | pattern = ( 19 | 
r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`" 20 | r"\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」" 21 | r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+" 22 | ) 23 | sentences = re.split(pattern, text) 24 | 25 | pre_lang = "" 26 | p = 0 27 | 28 | sorted_target_languages = sorted(target_languages) 29 | if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]: 30 | new_sentences = [] 31 | for sentence in sentences: 32 | new_sentences.extend(split_alpha_nonalpha(sentence)) 33 | sentences = new_sentences 34 | 35 | for sentence in sentences: 36 | if check_is_none(sentence): 37 | continue 38 | 39 | lang = classify_language(sentence, target_languages) 40 | 41 | if pre_lang == "": 42 | text = text[:p] + text[p:].replace( 43 | sentence, f"[{lang.upper()}]{sentence}", 1 44 | ) 45 | p += len(f"[{lang.upper()}]") 46 | elif pre_lang != lang: 47 | text = text[:p] + text[p:].replace( 48 | sentence, f"[{pre_lang.upper()}][{lang.upper()}]{sentence}", 1 49 | ) 50 | p += len(f"[{pre_lang.upper()}][{lang.upper()}]") 51 | pre_lang = lang 52 | p += text[p:].index(sentence) + len(sentence) 53 | text += f"[{pre_lang.upper()}]" 54 | 55 | return text 56 | 57 | 58 | def split_by_language(text: str, target_languages: list = None) -> list: 59 | pattern = ( 60 | r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`" 61 | r"\!?\。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」" 62 | r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+" 63 | ) 64 | sentences = re.split(pattern, text) 65 | 66 | pre_lang = "" 67 | start = 0 68 | end = 0 69 | sentences_list = [] 70 | 71 | sorted_target_languages = sorted(target_languages) 72 | if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]: 73 | new_sentences = [] 74 | for sentence in sentences: 75 | new_sentences.extend(split_alpha_nonalpha(sentence)) 76 | sentences = new_sentences 77 | 78 | for sentence in sentences: 79 | if check_is_none(sentence): 80 | continue 81 | 82 | lang = classify_language(sentence, target_languages) 83 | 84 | end += text[end:].index(sentence) 85 | if pre_lang != "" and pre_lang != lang: 86 | sentences_list.append((text[start:end], pre_lang)) 87 | start = end 88 | end += len(sentence) 89 | pre_lang = lang 90 | sentences_list.append((text[start:], pre_lang)) 91 | 92 | return sentences_list 93 | 94 | 95 | def sentence_split(text: str, max: int) -> list: 96 | pattern = r"[!(),—+\-.:;??。,、;:]+" 97 | sentences = re.split(pattern, text) 98 | discarded_chars = re.findall(pattern, text) 99 | 100 | sentences_list, count, p = [], 0, 0 101 | 102 | # 按被分割的符号遍历 103 | for i, discarded_chars in enumerate(discarded_chars): 104 | count += len(sentences[i]) + len(discarded_chars) 105 | if count >= max: 106 | sentences_list.append(text[p : p + count].strip()) 107 | p += count 108 | count = 0 109 | 110 | # 加入最后剩余的文本 111 | if p < len(text): 112 | sentences_list.append(text[p:]) 113 | 114 | return sentences_list 115 | 116 | 117 | def sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None): 118 | # 如果该speaker只支持一种语言 119 | if speaker_lang is not None and len(speaker_lang) == 1: 120 | if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]: 121 | logging.debug( 122 | f'lang "{lang}" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}' 123 | ) 124 | lang = speaker_lang[0] 125 | 126 | sentences_list = [] 127 | if lang.upper() != "MIX": 128 | if max <= 0: 129 | sentences_list.append( 130 | markup_language(text, speaker_lang) 131 | if lang.upper() == "AUTO" 132 | else 
f"[{lang.upper()}]{text}[{lang.upper()}]" 133 | ) 134 | else: 135 | for i in sentence_split(text, max): 136 | if check_is_none(i): 137 | continue 138 | sentences_list.append( 139 | markup_language(i, speaker_lang) 140 | if lang.upper() == "AUTO" 141 | else f"[{lang.upper()}]{i}[{lang.upper()}]" 142 | ) 143 | else: 144 | sentences_list.append(text) 145 | 146 | for i in sentences_list: 147 | logging.debug(i) 148 | 149 | return sentences_list 150 | 151 | 152 | if __name__ == "__main__": 153 | text = "这几天心里颇不宁静。今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。我悄悄地披了大衫,带上门出去。" 154 | print(markup_language(text, target_languages=None)) 155 | print(sentence_split(text, max=50)) 156 | print(sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None)) 157 | text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは自動ラベリングのテスト用テキストです.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了GAN Duration predictor和transformer flow,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。" 158 | print(split_by_language(text, ["zh", "ja", "en"])) 159 | -------------------------------------------------------------------------------- /oldVersion/V111/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 1.1.1版本兼容 3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.1.1 4 | """ 5 | import torch 6 | import commons 7 | from .text.cleaner import clean_text, clean_text_fix 8 | from .text import cleaned_text_to_sequence 9 | from .text import get_bert, get_bert_fix 10 | 11 | 12 | def get_text(text, language_str, hps, device): 13 | norm_text, phone, tone, word2ph = clean_text(text, language_str) 14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 15 | 16 | if hps.data.add_blank: 17 | phone = commons.intersperse(phone, 0) 18 | tone = commons.intersperse(tone, 0) 19 | language = commons.intersperse(language, 0) 20 | for i in range(len(word2ph)): 21 | word2ph[i] = word2ph[i] * 2 22 | word2ph[0] += 1 23 | bert = get_bert(norm_text, word2ph, language_str, device) 24 | del word2ph 25 | assert bert.shape[-1] == len(phone), phone 26 | 27 | if language_str == "ZH": 28 | bert = bert 29 | ja_bert = torch.zeros(768, len(phone)) 30 | elif language_str == "JP": 31 | ja_bert = bert 32 | bert = torch.zeros(1024, len(phone)) 33 | else: 34 | bert = torch.zeros(1024, len(phone)) 35 | ja_bert = torch.zeros(768, len(phone)) 36 | 37 | assert bert.shape[-1] == len( 38 | phone 39 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}" 40 | 41 | phone = torch.LongTensor(phone) 42 | tone = torch.LongTensor(tone) 43 | language = torch.LongTensor(language) 44 | return bert, ja_bert, phone, tone, language 45 | 46 | 47 | def get_text_fix(text, language_str, hps, device): 48 | norm_text, phone, tone, word2ph = clean_text_fix(text, language_str) 49 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 50 | 51 | if hps.data.add_blank: 52 | phone = commons.intersperse(phone, 0) 53 | tone = commons.intersperse(tone, 0) 54 | language = commons.intersperse(language, 0) 55 | for i in range(len(word2ph)): 56 | word2ph[i] = word2ph[i] * 2 57 | word2ph[0] += 1 58 | bert = get_bert_fix(norm_text, word2ph, language_str, device) 59 | del word2ph 60 | assert bert.shape[-1] == len(phone), phone 61 | 62 | if language_str == "ZH": 63 | bert = bert 64 | ja_bert = torch.zeros(768, len(phone)) 65 | elif language_str == "JP": 66 | ja_bert = bert 67 | bert = torch.zeros(1024, len(phone)) 68 | else: 69 | bert = torch.zeros(1024, 
len(phone)) 70 | ja_bert = torch.zeros(768, len(phone)) 71 | 72 | assert bert.shape[-1] == len( 73 | phone 74 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}" 75 | 76 | phone = torch.LongTensor(phone) 77 | tone = torch.LongTensor(tone) 78 | language = torch.LongTensor(language) 79 | return bert, ja_bert, phone, tone, language 80 | 81 | 82 | def infer( 83 | text, 84 | sdp_ratio, 85 | noise_scale, 86 | noise_scale_w, 87 | length_scale, 88 | sid, 89 | language, 90 | hps, 91 | net_g, 92 | device, 93 | ): 94 | bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps, device) 95 | with torch.no_grad(): 96 | x_tst = phones.to(device).unsqueeze(0) 97 | tones = tones.to(device).unsqueeze(0) 98 | lang_ids = lang_ids.to(device).unsqueeze(0) 99 | bert = bert.to(device).unsqueeze(0) 100 | ja_bert = ja_bert.to(device).unsqueeze(0) 101 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 102 | del phones 103 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) 104 | audio = ( 105 | net_g.infer( 106 | x_tst, 107 | x_tst_lengths, 108 | speakers, 109 | tones, 110 | lang_ids, 111 | bert, 112 | ja_bert, 113 | sdp_ratio=sdp_ratio, 114 | noise_scale=noise_scale, 115 | noise_scale_w=noise_scale_w, 116 | length_scale=length_scale, 117 | )[0][0, 0] 118 | .data.cpu() 119 | .float() 120 | .numpy() 121 | ) 122 | del x_tst, x_tst_lengths, speakers, tones, lang_ids, bert, ja_bert 123 | if torch.cuda.is_available(): 124 | torch.cuda.empty_cache() 125 | return audio 126 | 127 | 128 | def infer_fix( 129 | text, 130 | sdp_ratio, 131 | noise_scale, 132 | noise_scale_w, 133 | length_scale, 134 | sid, 135 | language, 136 | hps, 137 | net_g, 138 | device, 139 | ): 140 | bert, ja_bert, phones, tones, lang_ids = get_text_fix(text, language, hps, device) 141 | with torch.no_grad(): 142 | x_tst = phones.to(device).unsqueeze(0) 143 | tones = tones.to(device).unsqueeze(0) 144 | lang_ids = lang_ids.to(device).unsqueeze(0) 145 | bert = bert.to(device).unsqueeze(0) 146 | ja_bert = ja_bert.to(device).unsqueeze(0) 147 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 148 | del phones 149 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) 150 | audio = ( 151 | net_g.infer( 152 | x_tst, 153 | x_tst_lengths, 154 | speakers, 155 | tones, 156 | lang_ids, 157 | bert, 158 | ja_bert, 159 | sdp_ratio=sdp_ratio, 160 | noise_scale=noise_scale, 161 | noise_scale_w=noise_scale_w, 162 | length_scale=length_scale, 163 | )[0][0, 0] 164 | .data.cpu() 165 | .float() 166 | .numpy() 167 | ) 168 | del x_tst, x_tst_lengths, speakers, tones, lang_ids, bert, ja_bert 169 | if torch.cuda.is_available(): 170 | torch.cuda.empty_cache() 171 | return audio 172 | -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, Response 2 | from io import BytesIO 3 | import torch 4 | from av import open as avopen 5 | from typing import Dict, List 6 | import re_matching 7 | import utils 8 | from infer import infer, get_net_g, latest_version 9 | from scipy.io import wavfile 10 | import gradio as gr 11 | from config import config 12 | import numpy as np  # used below for the silence buffer and concatenation 13 | # Flask Init 14 | app = Flask(__name__) 15 | app.config["JSON_AS_ASCII"] = False 16 | 17 | 18 | def replace_punctuation(text, i=2): 19 | punctuation = ",。?!" 
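# Each mark in `punctuation` is repeated `i` times, presumably so the synthesizer holds the pause at that position longer; with the default i=2, replace_punctuation("你好,世界。") returns "你好,,世界。。".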
20 | for char in punctuation: 21 | text = text.replace(char, char * i) 22 | return text 23 | 24 | 25 | def wav2(i, o, format): 26 | inp = avopen(i, "rb") 27 | out = avopen(o, "wb", format=format) 28 | if format == "ogg": 29 | format = "libvorbis" 30 | 31 | ostream = out.add_stream(format) 32 | 33 | for frame in inp.decode(audio=0): 34 | for p in ostream.encode(frame): 35 | out.mux(p) 36 | 37 | for p in ostream.encode(None): 38 | out.mux(p) 39 | 40 | out.close() 41 | inp.close() 42 | 43 | 44 | net_g_List = [] 45 | hps_List = [] 46 | # Model-to-speaker dictionaries 47 | # Usage: chr_name = chrsMap[model_id][chr_id] 48 | chrsMap: List[Dict[int, str]] = list() 49 | 50 | # Load the models 51 | models = config.server_config.models 52 | for model in models: 53 | hps_List.append(utils.get_hparams_from_file(model["config"])) 54 | # Register this model's speaker dictionary 55 | chrsMap.append(dict()) 56 | for name, cid in hps_List[-1].data.spk2id.items(): 57 | chrsMap[-1][cid] = name 58 | version = ( 59 | hps_List[-1].version if hasattr(hps_List[-1], "version") else latest_version 60 | ) 61 | net_g_List.append( 62 | get_net_g( 63 | model_path=model["model"], 64 | version=version, 65 | device=model["device"], 66 | hps=hps_List[-1], 67 | ) 68 | ) 69 | 70 | 71 | def generate_audio( 72 | slices, 73 | sdp_ratio, 74 | noise_scale, 75 | noise_scale_w, 76 | length_scale, 77 | speaker, 78 | language, model_id,  # model_id: index of the loaded model in net_g_List / hps_List 79 | ): 80 | audio_list = [] 81 | silence = np.zeros(hps_List[model_id].data.sampling_rate // 2, dtype=np.int16) 82 | with torch.no_grad(): 83 | for piece in slices: 84 | audio = infer( 85 | piece, 86 | sdp_ratio=sdp_ratio, 87 | noise_scale=noise_scale, 88 | noise_scale_w=noise_scale_w, 89 | length_scale=length_scale, 90 | sid=speaker, 91 | language=language, 92 | hps=hps_List[model_id], 93 | net_g=net_g_List[model_id], 94 | device=models[model_id]["device"], 95 | ) 96 | audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio) 97 | audio_list.append(audio16bit) 98 | audio_list.append(silence)  # append the silence gap to the list 99 | return audio_list 100 | 101 | 102 | @app.route("/") 103 | def main(): 104 | try: 105 | model = int(request.args.get("model")) 106 | speaker = request.args.get("speaker", "")  # speaker name 107 | speaker_id = request.args.get("speaker_id", None)  # or give the numeric id directly 108 | text = request.args.get("text").replace("/n", "") 109 | sdp_ratio = float(request.args.get("sdp_ratio", 0.2)) 110 | noise = float(request.args.get("noise", 0.5)) 111 | noisew = float(request.args.get("noisew", 0.6)) 112 | length = float(request.args.get("length", 1.2)) 113 | language = request.args.get("language") 114 | if length >= 2: 115 | return "Too big length" 116 | if len(text) >= 250: 117 | return "Too long text" 118 | fmt = request.args.get("format", "wav") 119 | if None in (speaker, text): 120 | return "Missing Parameter" 121 | if fmt not in ("mp3", "wav", "ogg"): 122 | return "Invalid Format" 123 | if language not in ("JP", "ZH", "EN", "mix"): 124 | return "Invalid language" 125 | except Exception: 126 | return "Invalid Parameter" 127 | 128 | if speaker_id is not None: 129 | if speaker_id.isdigit(): 130 | speaker = chrsMap[model][int(speaker_id)] 131 | audio_list = [] 132 | if language == "mix": 133 | bool_valid, str_valid = re_matching.validate_text(text) 134 | if not bool_valid: 135 | return str_valid, ( 136 | hps_List[model].data.sampling_rate, 137 | np.concatenate([np.zeros(hps_List[model].data.sampling_rate // 2)]), 138 | ) 139 | result = re_matching.text_matching(text) 140 | for one in result: 141 | _speaker = one.pop() 142 | for lang, content in one: 143 | audio_list.extend( 144 | generate_audio( 145 | content.split("|"), 146 | sdp_ratio, 147 | noise, 148 | noisew, 149 | length, 150 | 
_speaker, 151 | lang, model, 152 | ) 153 | ) 154 | else: 155 | audio_list.extend( 156 | generate_audio( 157 | text.split("|"), 158 | sdp_ratio, 159 | noise, 160 | noisew, 161 | length, 162 | speaker, 163 | language, model, 164 | ) 165 | ) 166 | 167 | audio_concat = np.concatenate(audio_list) 168 | with BytesIO() as wav: 169 | wavfile.write(wav, hps_List[model].data.sampling_rate, audio_concat) 170 | torch.cuda.empty_cache() 171 | if fmt == "wav": 172 | return Response(wav.getvalue(), mimetype="audio/wav") 173 | wav.seek(0, 0) 174 | with BytesIO() as ofp: 175 | wav2(wav, ofp, fmt) 176 | return Response( 177 | ofp.getvalue(), mimetype="audio/mpeg" if fmt == "mp3" else "audio/ogg" 178 | ) 179 | 180 | 181 | if __name__ == "__main__": 182 | app.run(host="0.0.0.0", port=config.server_config.port) 183 | -------------------------------------------------------------------------------- /text/opencpop-strict.txt: -------------------------------------------------------------------------------- 1 | a AA a 2 | ai AA ai 3 | an AA an 4 | ang AA ang 5 | ao AA ao 6 | ba b a 7 | bai b ai 8 | ban b an 9 | bang b ang 10 | bao b ao 11 | bei b ei 12 | ben b en 13 | beng b eng 14 | bi b i 15 | bian b ian 16 | biao b iao 17 | bie b ie 18 | bin b in 19 | bing b ing 20 | bo b o 21 | bu b u 22 | ca c a 23 | cai c ai 24 | can c an 25 | cang c ang 26 | cao c ao 27 | ce c e 28 | cei c ei 29 | cen c en 30 | ceng c eng 31 | cha ch a 32 | chai ch ai 33 | chan ch an 34 | chang ch ang 35 | chao ch ao 36 | che ch e 37 | chen ch en 38 | cheng ch eng 39 | chi ch ir 40 | chong ch ong 41 | chou ch ou 42 | chu ch u 43 | chua ch ua 44 | chuai ch uai 45 | chuan ch uan 46 | chuang ch uang 47 | chui ch ui 48 | chun ch un 49 | chuo ch uo 50 | ci c i0 51 | cong c ong 52 | cou c ou 53 | cu c u 54 | cuan c uan 55 | cui c ui 56 | cun c un 57 | cuo c uo 58 | da d a 59 | dai d ai 60 | dan d an 61 | dang d ang 62 | dao d ao 63 | de d e 64 | dei d ei 65 | den d en 66 | deng d eng 67 | di d i 68 | dia d ia 69 | dian d ian 70 | diao d iao 71 | die d ie 72 | ding d ing 73 | diu d iu 74 | dong d ong 75 | dou d ou 76 | du d u 77 | duan d uan 78 | dui d ui 79 | dun d un 80 | duo d uo 81 | e EE e 82 | ei EE ei 83 | en EE en 84 | eng EE eng 85 | er EE er 86 | fa f a 87 | fan f an 88 | fang f ang 89 | fei f ei 90 | fen f en 91 | feng f eng 92 | fo f o 93 | fou f ou 94 | fu f u 95 | ga g a 96 | gai g ai 97 | gan g an 98 | gang g ang 99 | gao g ao 100 | ge g e 101 | gei g ei 102 | gen g en 103 | geng g eng 104 | gong g ong 105 | gou g ou 106 | gu g u 107 | gua g ua 108 | guai g uai 109 | guan g uan 110 | guang g uang 111 | gui g ui 112 | gun g un 113 | guo g uo 114 | ha h a 115 | hai h ai 116 | han h an 117 | hang h ang 118 | hao h ao 119 | he h e 120 | hei h ei 121 | hen h en 122 | heng h eng 123 | hong h ong 124 | hou h ou 125 | hu h u 126 | hua h ua 127 | huai h uai 128 | huan h uan 129 | huang h uang 130 | hui h ui 131 | hun h un 132 | huo h uo 133 | ji j i 134 | jia j ia 135 | jian j ian 136 | jiang j iang 137 | jiao j iao 138 | jie j ie 139 | jin j in 140 | jing j ing 141 | jiong j iong 142 | jiu j iu 143 | ju j v 144 | jv j v 145 | juan j van 146 | jvan j van 147 | jue j ve 148 | jve j ve 149 | jun j vn 150 | jvn j vn 151 | ka k a 152 | kai k ai 153 | kan k an 154 | kang k ang 155 | kao k ao 156 | ke k e 157 | kei k ei 158 | ken k en 159 | keng k eng 160 | kong k ong 161 | kou k ou 162 | ku k u 163 | kua k ua 164 | kuai k uai 165 | kuan k uan 166 | kuang k uang 167 | kui k ui 168 | kun k un 169 | kuo k uo 170 | la l a 171 | lai l ai 
172 | lan l an 173 | lang l ang 174 | lao l ao 175 | le l e 176 | lei l ei 177 | leng l eng 178 | li l i 179 | lia l ia 180 | lian l ian 181 | liang l iang 182 | liao l iao 183 | lie l ie 184 | lin l in 185 | ling l ing 186 | liu l iu 187 | lo l o 188 | long l ong 189 | lou l ou 190 | lu l u 191 | luan l uan 192 | lun l un 193 | luo l uo 194 | lv l v 195 | lve l ve 196 | ma m a 197 | mai m ai 198 | man m an 199 | mang m ang 200 | mao m ao 201 | me m e 202 | mei m ei 203 | men m en 204 | meng m eng 205 | mi m i 206 | mian m ian 207 | miao m iao 208 | mie m ie 209 | min m in 210 | ming m ing 211 | miu m iu 212 | mo m o 213 | mou m ou 214 | mu m u 215 | na n a 216 | nai n ai 217 | nan n an 218 | nang n ang 219 | nao n ao 220 | ne n e 221 | nei n ei 222 | nen n en 223 | neng n eng 224 | ni n i 225 | nian n ian 226 | niang n iang 227 | niao n iao 228 | nie n ie 229 | nin n in 230 | ning n ing 231 | niu n iu 232 | nong n ong 233 | nou n ou 234 | nu n u 235 | nuan n uan 236 | nun n un 237 | nuo n uo 238 | nv n v 239 | nve n ve 240 | o OO o 241 | ou OO ou 242 | pa p a 243 | pai p ai 244 | pan p an 245 | pang p ang 246 | pao p ao 247 | pei p ei 248 | pen p en 249 | peng p eng 250 | pi p i 251 | pian p ian 252 | piao p iao 253 | pie p ie 254 | pin p in 255 | ping p ing 256 | po p o 257 | pou p ou 258 | pu p u 259 | qi q i 260 | qia q ia 261 | qian q ian 262 | qiang q iang 263 | qiao q iao 264 | qie q ie 265 | qin q in 266 | qing q ing 267 | qiong q iong 268 | qiu q iu 269 | qu q v 270 | qv q v 271 | quan q van 272 | qvan q van 273 | que q ve 274 | qve q ve 275 | qun q vn 276 | qvn q vn 277 | ran r an 278 | rang r ang 279 | rao r ao 280 | re r e 281 | ren r en 282 | reng r eng 283 | ri r ir 284 | rong r ong 285 | rou r ou 286 | ru r u 287 | rua r ua 288 | ruan r uan 289 | rui r ui 290 | run r un 291 | ruo r uo 292 | sa s a 293 | sai s ai 294 | san s an 295 | sang s ang 296 | sao s ao 297 | se s e 298 | sen s en 299 | seng s eng 300 | sha sh a 301 | shai sh ai 302 | shan sh an 303 | shang sh ang 304 | shao sh ao 305 | she sh e 306 | shei sh ei 307 | shen sh en 308 | sheng sh eng 309 | shi sh ir 310 | shou sh ou 311 | shu sh u 312 | shua sh ua 313 | shuai sh uai 314 | shuan sh uan 315 | shuang sh uang 316 | shui sh ui 317 | shun sh un 318 | shuo sh uo 319 | si s i0 320 | song s ong 321 | sou s ou 322 | su s u 323 | suan s uan 324 | sui s ui 325 | sun s un 326 | suo s uo 327 | ta t a 328 | tai t ai 329 | tan t an 330 | tang t ang 331 | tao t ao 332 | te t e 333 | tei t ei 334 | teng t eng 335 | ti t i 336 | tian t ian 337 | tiao t iao 338 | tie t ie 339 | ting t ing 340 | tong t ong 341 | tou t ou 342 | tu t u 343 | tuan t uan 344 | tui t ui 345 | tun t un 346 | tuo t uo 347 | wa w a 348 | wai w ai 349 | wan w an 350 | wang w ang 351 | wei w ei 352 | wen w en 353 | weng w eng 354 | wo w o 355 | wu w u 356 | xi x i 357 | xia x ia 358 | xian x ian 359 | xiang x iang 360 | xiao x iao 361 | xie x ie 362 | xin x in 363 | xing x ing 364 | xiong x iong 365 | xiu x iu 366 | xu x v 367 | xv x v 368 | xuan x van 369 | xvan x van 370 | xue x ve 371 | xve x ve 372 | xun x vn 373 | xvn x vn 374 | ya y a 375 | yan y En 376 | yang y ang 377 | yao y ao 378 | ye y E 379 | yi y i 380 | yin y in 381 | ying y ing 382 | yo y o 383 | yong y ong 384 | you y ou 385 | yu y v 386 | yv y v 387 | yuan y van 388 | yvan y van 389 | yue y ve 390 | yve y ve 391 | yun y vn 392 | yvn y vn 393 | za z a 394 | zai z ai 395 | zan z an 396 | zang z ang 397 | zao z ao 398 | ze z e 399 | zei z ei 400 | zen z en 401 | zeng z eng 402 | zha zh a 
403 | zhai zh ai 404 | zhan zh an 405 | zhang zh ang 406 | zhao zh ao 407 | zhe zh e 408 | zhei zh ei 409 | zhen zh en 410 | zheng zh eng 411 | zhi zh ir 412 | zhong zh ong 413 | zhou zh ou 414 | zhu zh u 415 | zhua zh ua 416 | zhuai zh uai 417 | zhuan zh uan 418 | zhuang zh uang 419 | zhui zh ui 420 | zhun zh un 421 | zhuo zh uo 422 | zi z i0 423 | zong z ong 424 | zou z ou 425 | zu z u 426 | zuan z uan 427 | zui z ui 428 | zun z un 429 | zuo z uo 430 | -------------------------------------------------------------------------------- /oldVersion/V101/text/opencpop-strict.txt: -------------------------------------------------------------------------------- 1 | a AA a 2 | ai AA ai 3 | an AA an 4 | ang AA ang 5 | ao AA ao 6 | ba b a 7 | bai b ai 8 | ban b an 9 | bang b ang 10 | bao b ao 11 | bei b ei 12 | ben b en 13 | beng b eng 14 | bi b i 15 | bian b ian 16 | biao b iao 17 | bie b ie 18 | bin b in 19 | bing b ing 20 | bo b o 21 | bu b u 22 | ca c a 23 | cai c ai 24 | can c an 25 | cang c ang 26 | cao c ao 27 | ce c e 28 | cei c ei 29 | cen c en 30 | ceng c eng 31 | cha ch a 32 | chai ch ai 33 | chan ch an 34 | chang ch ang 35 | chao ch ao 36 | che ch e 37 | chen ch en 38 | cheng ch eng 39 | chi ch ir 40 | chong ch ong 41 | chou ch ou 42 | chu ch u 43 | chua ch ua 44 | chuai ch uai 45 | chuan ch uan 46 | chuang ch uang 47 | chui ch ui 48 | chun ch un 49 | chuo ch uo 50 | ci c i0 51 | cong c ong 52 | cou c ou 53 | cu c u 54 | cuan c uan 55 | cui c ui 56 | cun c un 57 | cuo c uo 58 | da d a 59 | dai d ai 60 | dan d an 61 | dang d ang 62 | dao d ao 63 | de d e 64 | dei d ei 65 | den d en 66 | deng d eng 67 | di d i 68 | dia d ia 69 | dian d ian 70 | diao d iao 71 | die d ie 72 | ding d ing 73 | diu d iu 74 | dong d ong 75 | dou d ou 76 | du d u 77 | duan d uan 78 | dui d ui 79 | dun d un 80 | duo d uo 81 | e EE e 82 | ei EE ei 83 | en EE en 84 | eng EE eng 85 | er EE er 86 | fa f a 87 | fan f an 88 | fang f ang 89 | fei f ei 90 | fen f en 91 | feng f eng 92 | fo f o 93 | fou f ou 94 | fu f u 95 | ga g a 96 | gai g ai 97 | gan g an 98 | gang g ang 99 | gao g ao 100 | ge g e 101 | gei g ei 102 | gen g en 103 | geng g eng 104 | gong g ong 105 | gou g ou 106 | gu g u 107 | gua g ua 108 | guai g uai 109 | guan g uan 110 | guang g uang 111 | gui g ui 112 | gun g un 113 | guo g uo 114 | ha h a 115 | hai h ai 116 | han h an 117 | hang h ang 118 | hao h ao 119 | he h e 120 | hei h ei 121 | hen h en 122 | heng h eng 123 | hong h ong 124 | hou h ou 125 | hu h u 126 | hua h ua 127 | huai h uai 128 | huan h uan 129 | huang h uang 130 | hui h ui 131 | hun h un 132 | huo h uo 133 | ji j i 134 | jia j ia 135 | jian j ian 136 | jiang j iang 137 | jiao j iao 138 | jie j ie 139 | jin j in 140 | jing j ing 141 | jiong j iong 142 | jiu j iu 143 | ju j v 144 | jv j v 145 | juan j van 146 | jvan j van 147 | jue j ve 148 | jve j ve 149 | jun j vn 150 | jvn j vn 151 | ka k a 152 | kai k ai 153 | kan k an 154 | kang k ang 155 | kao k ao 156 | ke k e 157 | kei k ei 158 | ken k en 159 | keng k eng 160 | kong k ong 161 | kou k ou 162 | ku k u 163 | kua k ua 164 | kuai k uai 165 | kuan k uan 166 | kuang k uang 167 | kui k ui 168 | kun k un 169 | kuo k uo 170 | la l a 171 | lai l ai 172 | lan l an 173 | lang l ang 174 | lao l ao 175 | le l e 176 | lei l ei 177 | leng l eng 178 | li l i 179 | lia l ia 180 | lian l ian 181 | liang l iang 182 | liao l iao 183 | lie l ie 184 | lin l in 185 | ling l ing 186 | liu l iu 187 | lo l o 188 | long l ong 189 | lou l ou 190 | lu l u 191 | luan l uan 192 | lun l un 193 | luo l uo 194 | 
lv l v 195 | lve l ve 196 | ma m a 197 | mai m ai 198 | man m an 199 | mang m ang 200 | mao m ao 201 | me m e 202 | mei m ei 203 | men m en 204 | meng m eng 205 | mi m i 206 | mian m ian 207 | miao m iao 208 | mie m ie 209 | min m in 210 | ming m ing 211 | miu m iu 212 | mo m o 213 | mou m ou 214 | mu m u 215 | na n a 216 | nai n ai 217 | nan n an 218 | nang n ang 219 | nao n ao 220 | ne n e 221 | nei n ei 222 | nen n en 223 | neng n eng 224 | ni n i 225 | nian n ian 226 | niang n iang 227 | niao n iao 228 | nie n ie 229 | nin n in 230 | ning n ing 231 | niu n iu 232 | nong n ong 233 | nou n ou 234 | nu n u 235 | nuan n uan 236 | nun n un 237 | nuo n uo 238 | nv n v 239 | nve n ve 240 | o OO o 241 | ou OO ou 242 | pa p a 243 | pai p ai 244 | pan p an 245 | pang p ang 246 | pao p ao 247 | pei p ei 248 | pen p en 249 | peng p eng 250 | pi p i 251 | pian p ian 252 | piao p iao 253 | pie p ie 254 | pin p in 255 | ping p ing 256 | po p o 257 | pou p ou 258 | pu p u 259 | qi q i 260 | qia q ia 261 | qian q ian 262 | qiang q iang 263 | qiao q iao 264 | qie q ie 265 | qin q in 266 | qing q ing 267 | qiong q iong 268 | qiu q iu 269 | qu q v 270 | qv q v 271 | quan q van 272 | qvan q van 273 | que q ve 274 | qve q ve 275 | qun q vn 276 | qvn q vn 277 | ran r an 278 | rang r ang 279 | rao r ao 280 | re r e 281 | ren r en 282 | reng r eng 283 | ri r ir 284 | rong r ong 285 | rou r ou 286 | ru r u 287 | rua r ua 288 | ruan r uan 289 | rui r ui 290 | run r un 291 | ruo r uo 292 | sa s a 293 | sai s ai 294 | san s an 295 | sang s ang 296 | sao s ao 297 | se s e 298 | sen s en 299 | seng s eng 300 | sha sh a 301 | shai sh ai 302 | shan sh an 303 | shang sh ang 304 | shao sh ao 305 | she sh e 306 | shei sh ei 307 | shen sh en 308 | sheng sh eng 309 | shi sh ir 310 | shou sh ou 311 | shu sh u 312 | shua sh ua 313 | shuai sh uai 314 | shuan sh uan 315 | shuang sh uang 316 | shui sh ui 317 | shun sh un 318 | shuo sh uo 319 | si s i0 320 | song s ong 321 | sou s ou 322 | su s u 323 | suan s uan 324 | sui s ui 325 | sun s un 326 | suo s uo 327 | ta t a 328 | tai t ai 329 | tan t an 330 | tang t ang 331 | tao t ao 332 | te t e 333 | tei t ei 334 | teng t eng 335 | ti t i 336 | tian t ian 337 | tiao t iao 338 | tie t ie 339 | ting t ing 340 | tong t ong 341 | tou t ou 342 | tu t u 343 | tuan t uan 344 | tui t ui 345 | tun t un 346 | tuo t uo 347 | wa w a 348 | wai w ai 349 | wan w an 350 | wang w ang 351 | wei w ei 352 | wen w en 353 | weng w eng 354 | wo w o 355 | wu w u 356 | xi x i 357 | xia x ia 358 | xian x ian 359 | xiang x iang 360 | xiao x iao 361 | xie x ie 362 | xin x in 363 | xing x ing 364 | xiong x iong 365 | xiu x iu 366 | xu x v 367 | xv x v 368 | xuan x van 369 | xvan x van 370 | xue x ve 371 | xve x ve 372 | xun x vn 373 | xvn x vn 374 | ya y a 375 | yan y En 376 | yang y ang 377 | yao y ao 378 | ye y E 379 | yi y i 380 | yin y in 381 | ying y ing 382 | yo y o 383 | yong y ong 384 | you y ou 385 | yu y v 386 | yv y v 387 | yuan y van 388 | yvan y van 389 | yue y ve 390 | yve y ve 391 | yun y vn 392 | yvn y vn 393 | za z a 394 | zai z ai 395 | zan z an 396 | zang z ang 397 | zao z ao 398 | ze z e 399 | zei z ei 400 | zen z en 401 | zeng z eng 402 | zha zh a 403 | zhai zh ai 404 | zhan zh an 405 | zhang zh ang 406 | zhao zh ao 407 | zhe zh e 408 | zhei zh ei 409 | zhen zh en 410 | zheng zh eng 411 | zhi zh ir 412 | zhong zh ong 413 | zhou zh ou 414 | zhu zh u 415 | zhua zh ua 416 | zhuai zh uai 417 | zhuan zh uan 418 | zhuang zh uang 419 | zhui zh ui 420 | zhun zh un 421 | zhuo zh uo 422 | zi z i0 
423 | zong z ong 424 | zou z ou 425 | zu z u 426 | zuan z uan 427 | zui z ui 428 | zun z un 429 | zuo z uo 430 | -------------------------------------------------------------------------------- /oldVersion/V110/text/opencpop-strict.txt: -------------------------------------------------------------------------------- 1 | a AA a 2 | ai AA ai 3 | an AA an 4 | ang AA ang 5 | ao AA ao 6 | ba b a 7 | bai b ai 8 | ban b an 9 | bang b ang 10 | bao b ao 11 | bei b ei 12 | ben b en 13 | beng b eng 14 | bi b i 15 | bian b ian 16 | biao b iao 17 | bie b ie 18 | bin b in 19 | bing b ing 20 | bo b o 21 | bu b u 22 | ca c a 23 | cai c ai 24 | can c an 25 | cang c ang 26 | cao c ao 27 | ce c e 28 | cei c ei 29 | cen c en 30 | ceng c eng 31 | cha ch a 32 | chai ch ai 33 | chan ch an 34 | chang ch ang 35 | chao ch ao 36 | che ch e 37 | chen ch en 38 | cheng ch eng 39 | chi ch ir 40 | chong ch ong 41 | chou ch ou 42 | chu ch u 43 | chua ch ua 44 | chuai ch uai 45 | chuan ch uan 46 | chuang ch uang 47 | chui ch ui 48 | chun ch un 49 | chuo ch uo 50 | ci c i0 51 | cong c ong 52 | cou c ou 53 | cu c u 54 | cuan c uan 55 | cui c ui 56 | cun c un 57 | cuo c uo 58 | da d a 59 | dai d ai 60 | dan d an 61 | dang d ang 62 | dao d ao 63 | de d e 64 | dei d ei 65 | den d en 66 | deng d eng 67 | di d i 68 | dia d ia 69 | dian d ian 70 | diao d iao 71 | die d ie 72 | ding d ing 73 | diu d iu 74 | dong d ong 75 | dou d ou 76 | du d u 77 | duan d uan 78 | dui d ui 79 | dun d un 80 | duo d uo 81 | e EE e 82 | ei EE ei 83 | en EE en 84 | eng EE eng 85 | er EE er 86 | fa f a 87 | fan f an 88 | fang f ang 89 | fei f ei 90 | fen f en 91 | feng f eng 92 | fo f o 93 | fou f ou 94 | fu f u 95 | ga g a 96 | gai g ai 97 | gan g an 98 | gang g ang 99 | gao g ao 100 | ge g e 101 | gei g ei 102 | gen g en 103 | geng g eng 104 | gong g ong 105 | gou g ou 106 | gu g u 107 | gua g ua 108 | guai g uai 109 | guan g uan 110 | guang g uang 111 | gui g ui 112 | gun g un 113 | guo g uo 114 | ha h a 115 | hai h ai 116 | han h an 117 | hang h ang 118 | hao h ao 119 | he h e 120 | hei h ei 121 | hen h en 122 | heng h eng 123 | hong h ong 124 | hou h ou 125 | hu h u 126 | hua h ua 127 | huai h uai 128 | huan h uan 129 | huang h uang 130 | hui h ui 131 | hun h un 132 | huo h uo 133 | ji j i 134 | jia j ia 135 | jian j ian 136 | jiang j iang 137 | jiao j iao 138 | jie j ie 139 | jin j in 140 | jing j ing 141 | jiong j iong 142 | jiu j iu 143 | ju j v 144 | jv j v 145 | juan j van 146 | jvan j van 147 | jue j ve 148 | jve j ve 149 | jun j vn 150 | jvn j vn 151 | ka k a 152 | kai k ai 153 | kan k an 154 | kang k ang 155 | kao k ao 156 | ke k e 157 | kei k ei 158 | ken k en 159 | keng k eng 160 | kong k ong 161 | kou k ou 162 | ku k u 163 | kua k ua 164 | kuai k uai 165 | kuan k uan 166 | kuang k uang 167 | kui k ui 168 | kun k un 169 | kuo k uo 170 | la l a 171 | lai l ai 172 | lan l an 173 | lang l ang 174 | lao l ao 175 | le l e 176 | lei l ei 177 | leng l eng 178 | li l i 179 | lia l ia 180 | lian l ian 181 | liang l iang 182 | liao l iao 183 | lie l ie 184 | lin l in 185 | ling l ing 186 | liu l iu 187 | lo l o 188 | long l ong 189 | lou l ou 190 | lu l u 191 | luan l uan 192 | lun l un 193 | luo l uo 194 | lv l v 195 | lve l ve 196 | ma m a 197 | mai m ai 198 | man m an 199 | mang m ang 200 | mao m ao 201 | me m e 202 | mei m ei 203 | men m en 204 | meng m eng 205 | mi m i 206 | mian m ian 207 | miao m iao 208 | mie m ie 209 | min m in 210 | ming m ing 211 | miu m iu 212 | mo m o 213 | mou m ou 214 | mu m u 215 | na n a 216 | nai n ai 217 | nan n 
an 218 | nang n ang 219 | nao n ao 220 | ne n e 221 | nei n ei 222 | nen n en 223 | neng n eng 224 | ni n i 225 | nian n ian 226 | niang n iang 227 | niao n iao 228 | nie n ie 229 | nin n in 230 | ning n ing 231 | niu n iu 232 | nong n ong 233 | nou n ou 234 | nu n u 235 | nuan n uan 236 | nun n un 237 | nuo n uo 238 | nv n v 239 | nve n ve 240 | o OO o 241 | ou OO ou 242 | pa p a 243 | pai p ai 244 | pan p an 245 | pang p ang 246 | pao p ao 247 | pei p ei 248 | pen p en 249 | peng p eng 250 | pi p i 251 | pian p ian 252 | piao p iao 253 | pie p ie 254 | pin p in 255 | ping p ing 256 | po p o 257 | pou p ou 258 | pu p u 259 | qi q i 260 | qia q ia 261 | qian q ian 262 | qiang q iang 263 | qiao q iao 264 | qie q ie 265 | qin q in 266 | qing q ing 267 | qiong q iong 268 | qiu q iu 269 | qu q v 270 | qv q v 271 | quan q van 272 | qvan q van 273 | que q ve 274 | qve q ve 275 | qun q vn 276 | qvn q vn 277 | ran r an 278 | rang r ang 279 | rao r ao 280 | re r e 281 | ren r en 282 | reng r eng 283 | ri r ir 284 | rong r ong 285 | rou r ou 286 | ru r u 287 | rua r ua 288 | ruan r uan 289 | rui r ui 290 | run r un 291 | ruo r uo 292 | sa s a 293 | sai s ai 294 | san s an 295 | sang s ang 296 | sao s ao 297 | se s e 298 | sen s en 299 | seng s eng 300 | sha sh a 301 | shai sh ai 302 | shan sh an 303 | shang sh ang 304 | shao sh ao 305 | she sh e 306 | shei sh ei 307 | shen sh en 308 | sheng sh eng 309 | shi sh ir 310 | shou sh ou 311 | shu sh u 312 | shua sh ua 313 | shuai sh uai 314 | shuan sh uan 315 | shuang sh uang 316 | shui sh ui 317 | shun sh un 318 | shuo sh uo 319 | si s i0 320 | song s ong 321 | sou s ou 322 | su s u 323 | suan s uan 324 | sui s ui 325 | sun s un 326 | suo s uo 327 | ta t a 328 | tai t ai 329 | tan t an 330 | tang t ang 331 | tao t ao 332 | te t e 333 | tei t ei 334 | teng t eng 335 | ti t i 336 | tian t ian 337 | tiao t iao 338 | tie t ie 339 | ting t ing 340 | tong t ong 341 | tou t ou 342 | tu t u 343 | tuan t uan 344 | tui t ui 345 | tun t un 346 | tuo t uo 347 | wa w a 348 | wai w ai 349 | wan w an 350 | wang w ang 351 | wei w ei 352 | wen w en 353 | weng w eng 354 | wo w o 355 | wu w u 356 | xi x i 357 | xia x ia 358 | xian x ian 359 | xiang x iang 360 | xiao x iao 361 | xie x ie 362 | xin x in 363 | xing x ing 364 | xiong x iong 365 | xiu x iu 366 | xu x v 367 | xv x v 368 | xuan x van 369 | xvan x van 370 | xue x ve 371 | xve x ve 372 | xun x vn 373 | xvn x vn 374 | ya y a 375 | yan y En 376 | yang y ang 377 | yao y ao 378 | ye y E 379 | yi y i 380 | yin y in 381 | ying y ing 382 | yo y o 383 | yong y ong 384 | you y ou 385 | yu y v 386 | yv y v 387 | yuan y van 388 | yvan y van 389 | yue y ve 390 | yve y ve 391 | yun y vn 392 | yvn y vn 393 | za z a 394 | zai z ai 395 | zan z an 396 | zang z ang 397 | zao z ao 398 | ze z e 399 | zei z ei 400 | zen z en 401 | zeng z eng 402 | zha zh a 403 | zhai zh ai 404 | zhan zh an 405 | zhang zh ang 406 | zhao zh ao 407 | zhe zh e 408 | zhei zh ei 409 | zhen zh en 410 | zheng zh eng 411 | zhi zh ir 412 | zhong zh ong 413 | zhou zh ou 414 | zhu zh u 415 | zhua zh ua 416 | zhuai zh uai 417 | zhuan zh uan 418 | zhuang zh uang 419 | zhui zh ui 420 | zhun zh un 421 | zhuo zh uo 422 | zi z i0 423 | zong z ong 424 | zou z ou 425 | zu z u 426 | zuan z uan 427 | zui z ui 428 | zun z un 429 | zuo z uo 430 | -------------------------------------------------------------------------------- /oldVersion/V111/text/opencpop-strict.txt: -------------------------------------------------------------------------------- 1 | a AA a 2 | ai AA ai 3 | 
an AA an 4 | ang AA ang 5 | ao AA ao 6 | ba b a 7 | bai b ai 8 | ban b an 9 | bang b ang 10 | bao b ao 11 | bei b ei 12 | ben b en 13 | beng b eng 14 | bi b i 15 | bian b ian 16 | biao b iao 17 | bie b ie 18 | bin b in 19 | bing b ing 20 | bo b o 21 | bu b u 22 | ca c a 23 | cai c ai 24 | can c an 25 | cang c ang 26 | cao c ao 27 | ce c e 28 | cei c ei 29 | cen c en 30 | ceng c eng 31 | cha ch a 32 | chai ch ai 33 | chan ch an 34 | chang ch ang 35 | chao ch ao 36 | che ch e 37 | chen ch en 38 | cheng ch eng 39 | chi ch ir 40 | chong ch ong 41 | chou ch ou 42 | chu ch u 43 | chua ch ua 44 | chuai ch uai 45 | chuan ch uan 46 | chuang ch uang 47 | chui ch ui 48 | chun ch un 49 | chuo ch uo 50 | ci c i0 51 | cong c ong 52 | cou c ou 53 | cu c u 54 | cuan c uan 55 | cui c ui 56 | cun c un 57 | cuo c uo 58 | da d a 59 | dai d ai 60 | dan d an 61 | dang d ang 62 | dao d ao 63 | de d e 64 | dei d ei 65 | den d en 66 | deng d eng 67 | di d i 68 | dia d ia 69 | dian d ian 70 | diao d iao 71 | die d ie 72 | ding d ing 73 | diu d iu 74 | dong d ong 75 | dou d ou 76 | du d u 77 | duan d uan 78 | dui d ui 79 | dun d un 80 | duo d uo 81 | e EE e 82 | ei EE ei 83 | en EE en 84 | eng EE eng 85 | er EE er 86 | fa f a 87 | fan f an 88 | fang f ang 89 | fei f ei 90 | fen f en 91 | feng f eng 92 | fo f o 93 | fou f ou 94 | fu f u 95 | ga g a 96 | gai g ai 97 | gan g an 98 | gang g ang 99 | gao g ao 100 | ge g e 101 | gei g ei 102 | gen g en 103 | geng g eng 104 | gong g ong 105 | gou g ou 106 | gu g u 107 | gua g ua 108 | guai g uai 109 | guan g uan 110 | guang g uang 111 | gui g ui 112 | gun g un 113 | guo g uo 114 | ha h a 115 | hai h ai 116 | han h an 117 | hang h ang 118 | hao h ao 119 | he h e 120 | hei h ei 121 | hen h en 122 | heng h eng 123 | hong h ong 124 | hou h ou 125 | hu h u 126 | hua h ua 127 | huai h uai 128 | huan h uan 129 | huang h uang 130 | hui h ui 131 | hun h un 132 | huo h uo 133 | ji j i 134 | jia j ia 135 | jian j ian 136 | jiang j iang 137 | jiao j iao 138 | jie j ie 139 | jin j in 140 | jing j ing 141 | jiong j iong 142 | jiu j iu 143 | ju j v 144 | jv j v 145 | juan j van 146 | jvan j van 147 | jue j ve 148 | jve j ve 149 | jun j vn 150 | jvn j vn 151 | ka k a 152 | kai k ai 153 | kan k an 154 | kang k ang 155 | kao k ao 156 | ke k e 157 | kei k ei 158 | ken k en 159 | keng k eng 160 | kong k ong 161 | kou k ou 162 | ku k u 163 | kua k ua 164 | kuai k uai 165 | kuan k uan 166 | kuang k uang 167 | kui k ui 168 | kun k un 169 | kuo k uo 170 | la l a 171 | lai l ai 172 | lan l an 173 | lang l ang 174 | lao l ao 175 | le l e 176 | lei l ei 177 | leng l eng 178 | li l i 179 | lia l ia 180 | lian l ian 181 | liang l iang 182 | liao l iao 183 | lie l ie 184 | lin l in 185 | ling l ing 186 | liu l iu 187 | lo l o 188 | long l ong 189 | lou l ou 190 | lu l u 191 | luan l uan 192 | lun l un 193 | luo l uo 194 | lv l v 195 | lve l ve 196 | ma m a 197 | mai m ai 198 | man m an 199 | mang m ang 200 | mao m ao 201 | me m e 202 | mei m ei 203 | men m en 204 | meng m eng 205 | mi m i 206 | mian m ian 207 | miao m iao 208 | mie m ie 209 | min m in 210 | ming m ing 211 | miu m iu 212 | mo m o 213 | mou m ou 214 | mu m u 215 | na n a 216 | nai n ai 217 | nan n an 218 | nang n ang 219 | nao n ao 220 | ne n e 221 | nei n ei 222 | nen n en 223 | neng n eng 224 | ni n i 225 | nian n ian 226 | niang n iang 227 | niao n iao 228 | nie n ie 229 | nin n in 230 | ning n ing 231 | niu n iu 232 | nong n ong 233 | nou n ou 234 | nu n u 235 | nuan n uan 236 | nun n un 237 | nuo n uo 238 | nv n v 239 | nve n ve 240 
| o OO o 241 | ou OO ou 242 | pa p a 243 | pai p ai 244 | pan p an 245 | pang p ang 246 | pao p ao 247 | pei p ei 248 | pen p en 249 | peng p eng 250 | pi p i 251 | pian p ian 252 | piao p iao 253 | pie p ie 254 | pin p in 255 | ping p ing 256 | po p o 257 | pou p ou 258 | pu p u 259 | qi q i 260 | qia q ia 261 | qian q ian 262 | qiang q iang 263 | qiao q iao 264 | qie q ie 265 | qin q in 266 | qing q ing 267 | qiong q iong 268 | qiu q iu 269 | qu q v 270 | qv q v 271 | quan q van 272 | qvan q van 273 | que q ve 274 | qve q ve 275 | qun q vn 276 | qvn q vn 277 | ran r an 278 | rang r ang 279 | rao r ao 280 | re r e 281 | ren r en 282 | reng r eng 283 | ri r ir 284 | rong r ong 285 | rou r ou 286 | ru r u 287 | rua r ua 288 | ruan r uan 289 | rui r ui 290 | run r un 291 | ruo r uo 292 | sa s a 293 | sai s ai 294 | san s an 295 | sang s ang 296 | sao s ao 297 | se s e 298 | sen s en 299 | seng s eng 300 | sha sh a 301 | shai sh ai 302 | shan sh an 303 | shang sh ang 304 | shao sh ao 305 | she sh e 306 | shei sh ei 307 | shen sh en 308 | sheng sh eng 309 | shi sh ir 310 | shou sh ou 311 | shu sh u 312 | shua sh ua 313 | shuai sh uai 314 | shuan sh uan 315 | shuang sh uang 316 | shui sh ui 317 | shun sh un 318 | shuo sh uo 319 | si s i0 320 | song s ong 321 | sou s ou 322 | su s u 323 | suan s uan 324 | sui s ui 325 | sun s un 326 | suo s uo 327 | ta t a 328 | tai t ai 329 | tan t an 330 | tang t ang 331 | tao t ao 332 | te t e 333 | tei t ei 334 | teng t eng 335 | ti t i 336 | tian t ian 337 | tiao t iao 338 | tie t ie 339 | ting t ing 340 | tong t ong 341 | tou t ou 342 | tu t u 343 | tuan t uan 344 | tui t ui 345 | tun t un 346 | tuo t uo 347 | wa w a 348 | wai w ai 349 | wan w an 350 | wang w ang 351 | wei w ei 352 | wen w en 353 | weng w eng 354 | wo w o 355 | wu w u 356 | xi x i 357 | xia x ia 358 | xian x ian 359 | xiang x iang 360 | xiao x iao 361 | xie x ie 362 | xin x in 363 | xing x ing 364 | xiong x iong 365 | xiu x iu 366 | xu x v 367 | xv x v 368 | xuan x van 369 | xvan x van 370 | xue x ve 371 | xve x ve 372 | xun x vn 373 | xvn x vn 374 | ya y a 375 | yan y En 376 | yang y ang 377 | yao y ao 378 | ye y E 379 | yi y i 380 | yin y in 381 | ying y ing 382 | yo y o 383 | yong y ong 384 | you y ou 385 | yu y v 386 | yv y v 387 | yuan y van 388 | yvan y van 389 | yue y ve 390 | yve y ve 391 | yun y vn 392 | yvn y vn 393 | za z a 394 | zai z ai 395 | zan z an 396 | zang z ang 397 | zao z ao 398 | ze z e 399 | zei z ei 400 | zen z en 401 | zeng z eng 402 | zha zh a 403 | zhai zh ai 404 | zhan zh an 405 | zhang zh ang 406 | zhao zh ao 407 | zhe zh e 408 | zhei zh ei 409 | zhen zh en 410 | zheng zh eng 411 | zhi zh ir 412 | zhong zh ong 413 | zhou zh ou 414 | zhu zh u 415 | zhua zh ua 416 | zhuai zh uai 417 | zhuan zh uan 418 | zhuang zh uang 419 | zhui zh ui 420 | zhun zh un 421 | zhuo zh uo 422 | zi z i0 423 | zong z ong 424 | zou z ou 425 | zu z u 426 | zuan z uan 427 | zui z ui 428 | zun z un 429 | zuo z uo 430 | -------------------------------------------------------------------------------- /text/chinese.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import cn2an 5 | from pypinyin import lazy_pinyin, Style 6 | 7 | from text.symbols import punctuation 8 | from text.tone_sandhi import ToneSandhi 9 | 10 | current_file_path = os.path.dirname(__file__) 11 | pinyin_to_symbol_map = { 12 | line.split("\t")[0]: line.strip().split("\t")[1] 13 | for line in open(os.path.join(current_file_path, 
"opencpop-strict.txt")).readlines() 14 | } 15 | 16 | import jieba.posseg as psg 17 | 18 | 19 | rep_map = { 20 | ":": ",", 21 | ";": ",", 22 | ",": ",", 23 | "。": ".", 24 | "!": "!", 25 | "?": "?", 26 | "\n": ".", 27 | "·": ",", 28 | "、": ",", 29 | "...": "…", 30 | "$": ".", 31 | "“": "'", 32 | "”": "'", 33 | "‘": "'", 34 | "’": "'", 35 | "(": "'", 36 | ")": "'", 37 | "(": "'", 38 | ")": "'", 39 | "《": "'", 40 | "》": "'", 41 | "【": "'", 42 | "】": "'", 43 | "[": "'", 44 | "]": "'", 45 | "—": "-", 46 | "~": "-", 47 | "~": "-", 48 | "「": "'", 49 | "」": "'", 50 | } 51 | 52 | tone_modifier = ToneSandhi() 53 | 54 | 55 | def replace_punctuation(text): 56 | text = text.replace("嗯", "恩").replace("呣", "母") 57 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 58 | 59 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 60 | 61 | replaced_text = re.sub( 62 | r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text 63 | ) 64 | 65 | return replaced_text 66 | 67 | 68 | def g2p(text): 69 | pattern = r"(?<=[{0}])\s*".format("".join(punctuation)) 70 | sentences = [i for i in re.split(pattern, text) if i.strip() != ""] 71 | phones, tones, word2ph = _g2p(sentences) 72 | assert sum(word2ph) == len(phones) 73 | assert len(word2ph) == len(text) # Sometimes it will crash,you can add a try-catch. 74 | phones = ["_"] + phones + ["_"] 75 | tones = [0] + tones + [0] 76 | word2ph = [1] + word2ph + [1] 77 | return phones, tones, word2ph 78 | 79 | 80 | def _get_initials_finals(word): 81 | initials = [] 82 | finals = [] 83 | orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS) 84 | orig_finals = lazy_pinyin( 85 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3 86 | ) 87 | for c, v in zip(orig_initials, orig_finals): 88 | initials.append(c) 89 | finals.append(v) 90 | return initials, finals 91 | 92 | 93 | def _g2p(segments): 94 | phones_list = [] 95 | tones_list = [] 96 | word2ph = [] 97 | for seg in segments: 98 | # Replace all English words in the sentence 99 | seg = re.sub("[a-zA-Z]+", "", seg) 100 | seg_cut = psg.lcut(seg) 101 | initials = [] 102 | finals = [] 103 | seg_cut = tone_modifier.pre_merge_for_modify(seg_cut) 104 | for word, pos in seg_cut: 105 | if pos == "eng": 106 | continue 107 | sub_initials, sub_finals = _get_initials_finals(word) 108 | sub_finals = tone_modifier.modified_tone(word, pos, sub_finals) 109 | initials.append(sub_initials) 110 | finals.append(sub_finals) 111 | 112 | # assert len(sub_initials) == len(sub_finals) == len(word) 113 | initials = sum(initials, []) 114 | finals = sum(finals, []) 115 | # 116 | for c, v in zip(initials, finals): 117 | raw_pinyin = c + v 118 | # NOTE: post process for pypinyin outputs 119 | # we discriminate i, ii and iii 120 | if c == v: 121 | assert c in punctuation 122 | phone = [c] 123 | tone = "0" 124 | word2ph.append(1) 125 | else: 126 | v_without_tone = v[:-1] 127 | tone = v[-1] 128 | 129 | pinyin = c + v_without_tone 130 | assert tone in "12345" 131 | 132 | if c: 133 | # 多音节 134 | v_rep_map = { 135 | "uei": "ui", 136 | "iou": "iu", 137 | "uen": "un", 138 | } 139 | if v_without_tone in v_rep_map.keys(): 140 | pinyin = c + v_rep_map[v_without_tone] 141 | else: 142 | # 单音节 143 | pinyin_rep_map = { 144 | "ing": "ying", 145 | "i": "yi", 146 | "in": "yin", 147 | "u": "wu", 148 | } 149 | if pinyin in pinyin_rep_map.keys(): 150 | pinyin = pinyin_rep_map[pinyin] 151 | else: 152 | single_rep_map = { 153 | "v": "yu", 154 | "e": "e", 155 | "i": "y", 156 | "u": "w", 157 | } 158 | if 
pinyin[0] in single_rep_map.keys(): 159 | pinyin = single_rep_map[pinyin[0]] + pinyin[1:] 160 | 161 | assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin) 162 | phone = pinyin_to_symbol_map[pinyin].split(" ") 163 | word2ph.append(len(phone)) 164 | 165 | phones_list += phone 166 | tones_list += [int(tone)] * len(phone) 167 | return phones_list, tones_list, word2ph 168 | 169 | 170 | def text_normalize(text): 171 | numbers = re.findall(r"\d+(?:\.?\d+)?", text) 172 | for number in numbers: 173 | text = text.replace(number, cn2an.an2cn(number), 1) 174 | text = replace_punctuation(text) 175 | return text 176 | 177 | 178 | def get_bert_feature(text, word2ph): 179 | from text import chinese_bert 180 | 181 | return chinese_bert.get_bert_feature(text, word2ph) 182 | 183 | 184 | if __name__ == "__main__": 185 | from text.chinese_bert import get_bert_feature 186 | 187 | text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏" 188 | text = text_normalize(text) 189 | print(text) 190 | phones, tones, word2ph = g2p(text) 191 | bert = get_bert_feature(text, word2ph) 192 | 193 | print(phones, tones, word2ph, bert.shape) 194 | 195 | 196 | # # 示例用法 197 | # text = "这是一个示例文本:,你好!这是一个测试...." 198 | # print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试 199 | -------------------------------------------------------------------------------- /oldVersion/V110/text/chinese.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import cn2an 5 | from pypinyin import lazy_pinyin, Style 6 | 7 | from .symbols import punctuation 8 | from .tone_sandhi import ToneSandhi 9 | 10 | current_file_path = os.path.dirname(__file__) 11 | pinyin_to_symbol_map = { 12 | line.split("\t")[0]: line.strip().split("\t")[1] 13 | for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines() 14 | } 15 | 16 | import jieba.posseg as psg 17 | 18 | 19 | rep_map = { 20 | ":": ",", 21 | ";": ",", 22 | ",": ",", 23 | "。": ".", 24 | "!": "!", 25 | "?": "?", 26 | "\n": ".", 27 | "·": ",", 28 | "、": ",", 29 | "...": "…", 30 | "$": ".", 31 | "“": "'", 32 | "”": "'", 33 | "‘": "'", 34 | "’": "'", 35 | "(": "'", 36 | ")": "'", 37 | "(": "'", 38 | ")": "'", 39 | "《": "'", 40 | "》": "'", 41 | "【": "'", 42 | "】": "'", 43 | "[": "'", 44 | "]": "'", 45 | "—": "-", 46 | "~": "-", 47 | "~": "-", 48 | "「": "'", 49 | "」": "'", 50 | } 51 | 52 | tone_modifier = ToneSandhi() 53 | 54 | 55 | def replace_punctuation(text): 56 | text = text.replace("嗯", "恩").replace("呣", "母") 57 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 58 | 59 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 60 | 61 | replaced_text = re.sub( 62 | r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text 63 | ) 64 | 65 | return replaced_text 66 | 67 | 68 | def g2p(text): 69 | pattern = r"(?<=[{0}])\s*".format("".join(punctuation)) 70 | sentences = [i for i in re.split(pattern, text) if i.strip() != ""] 71 | phones, tones, word2ph = _g2p(sentences) 72 | assert sum(word2ph) == len(phones) 73 | assert len(word2ph) == len(text) # Sometimes it will crash,you can add a try-catch. 
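The file above is the current-version Chinese front end: text_normalize expands digits through cn2an and remaps punctuation, and g2p splits on punctuation before mapping every pinyin syllable onto the opencpop-strict phone set. A minimal usage sketch of the intended call sequence (an assumption on my part: it presumes the repository root is on sys.path and that cn2an, pypinyin, and jieba are installed; the normalized string in the comment is illustrative, not a captured output):

from text.chinese import text_normalize, g2p

raw = "我有2个苹果!"
norm = text_normalize(raw)  # digits -> Chinese numerals, full-width "!" -> "!"

phones, tones, word2ph = g2p(norm)

# Invariants the asserts in g2p enforce: every phone is owned by exactly one
# input character, and every character (plus the two "_" padding slots that
# g2p appends) gets one word2ph entry.
assert sum(word2ph) == len(phones)
assert len(word2ph) == len(norm) + 2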
/oldVersion/V110/text/chinese.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | 
4 | import cn2an
5 | from pypinyin import lazy_pinyin, Style
6 | 
7 | from .symbols import punctuation
8 | from .tone_sandhi import ToneSandhi
9 | 
10 | current_file_path = os.path.dirname(__file__)
11 | pinyin_to_symbol_map = {
12 |     line.split("\t")[0]: line.strip().split("\t")[1]
13 |     for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
14 | }
15 | 
16 | import jieba.posseg as psg
17 | 
18 | 
19 | rep_map = {
20 |     ":": ",",
21 |     ";": ",",
22 |     ",": ",",
23 |     "。": ".",
24 |     "!": "!",
25 |     "?": "?",
26 |     "\n": ".",
27 |     "·": ",",
28 |     "、": ",",
29 |     "...": "…",
30 |     "$": ".",
31 |     "“": "'",
32 |     "”": "'",
33 |     "‘": "'",
34 |     "’": "'",
35 |     "(": "'",
36 |     ")": "'",
37 |     "(": "'",
38 |     ")": "'",
39 |     "《": "'",
40 |     "》": "'",
41 |     "【": "'",
42 |     "】": "'",
43 |     "[": "'",
44 |     "]": "'",
45 |     "—": "-",
46 |     "~": "-",
47 |     "~": "-",
48 |     "「": "'",
49 |     "」": "'",
50 | }
51 | 
52 | tone_modifier = ToneSandhi()
53 | 
54 | 
55 | def replace_punctuation(text):
56 |     text = text.replace("嗯", "恩").replace("呣", "母")
57 |     pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
58 | 
59 |     replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
60 | 
61 |     replaced_text = re.sub(
62 |         r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
63 |     )
64 | 
65 |     return replaced_text
66 | 
67 | 
68 | def g2p(text):
69 |     pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
70 |     sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
71 |     phones, tones, word2ph = _g2p(sentences)
72 |     assert sum(word2ph) == len(phones)
73 |     assert len(word2ph) == len(text)  # Sometimes it will crash; you can add a try-catch.
74 |     phones = ["_"] + phones + ["_"]
75 |     tones = [0] + tones + [0]
76 |     word2ph = [1] + word2ph + [1]
77 |     return phones, tones, word2ph
78 | 
79 | 
80 | def _get_initials_finals(word):
81 |     initials = []
82 |     finals = []
83 |     orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
84 |     orig_finals = lazy_pinyin(
85 |         word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
86 |     )
87 |     for c, v in zip(orig_initials, orig_finals):
88 |         initials.append(c)
89 |         finals.append(v)
90 |     return initials, finals
91 | 
92 | 
93 | def _g2p(segments):
94 |     phones_list = []
95 |     tones_list = []
96 |     word2ph = []
97 |     for seg in segments:
98 |         # Remove all English words in the sentence
99 |         seg = re.sub("[a-zA-Z]+", "", seg)
100 |         seg_cut = psg.lcut(seg)
101 |         initials = []
102 |         finals = []
103 |         seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
104 |         for word, pos in seg_cut:
105 |             if pos == "eng":
106 |                 continue
107 |             sub_initials, sub_finals = _get_initials_finals(word)
108 |             sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
109 |             initials.append(sub_initials)
110 |             finals.append(sub_finals)
111 | 
112 |             # assert len(sub_initials) == len(sub_finals) == len(word)
113 |         initials = sum(initials, [])
114 |         finals = sum(finals, [])
115 |         #
116 |         for c, v in zip(initials, finals):
117 |             raw_pinyin = c + v
118 |             # NOTE: post-process for pypinyin outputs
119 |             # we discriminate i, ii and iii
120 |             if c == v:
121 |                 assert c in punctuation
122 |                 phone = [c]
123 |                 tone = "0"
124 |                 word2ph.append(1)
125 |             else:
126 |                 v_without_tone = v[:-1]
127 |                 tone = v[-1]
128 | 
129 |                 pinyin = c + v_without_tone
130 |                 assert tone in "12345"
131 | 
132 |                 if c:
133 |                     # multi-syllable (syllable has an initial): only the final may be rewritten
134 |                     v_rep_map = {
135 |                         "uei": "ui",
136 |                         "iou": "iu",
137 |                         "uen": "un",
138 |                     }
139 |                     if v_without_tone in v_rep_map.keys():
140 |                         pinyin = c + v_rep_map[v_without_tone]
141 |                 else:
142 |                     # single-syllable (bare final): rewrite to its standalone spelling
143 |                     pinyin_rep_map = {
144 |                         "ing": "ying",
145 |                         "i": "yi",
146 |                         "in": "yin",
147 |                         "u": "wu",
148 |                     }
149 |                     if pinyin in pinyin_rep_map.keys():
150 |                         pinyin = pinyin_rep_map[pinyin]
151 |                     else:
152 |                         single_rep_map = {
153 |                             "v": "yu",
154 |                             "e": "e",
155 |                             "i": "y",
156 |                             "u": "w",
157 |                         }
158 |                         if pinyin[0] in single_rep_map.keys():
159 |                             pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
160 | 
161 |                 assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
162 |                 phone = pinyin_to_symbol_map[pinyin].split(" ")
163 |                 word2ph.append(len(phone))
164 | 
165 |             phones_list += phone
166 |             tones_list += [int(tone)] * len(phone)
167 |     return phones_list, tones_list, word2ph
168 | 
169 | 
170 | def text_normalize(text):
171 |     numbers = re.findall(r"\d+(?:\.?\d+)?", text)
172 |     for number in numbers:
173 |         text = text.replace(number, cn2an.an2cn(number), 1)
174 |     text = replace_punctuation(text)
175 |     return text
176 | 
177 | 
178 | def get_bert_feature(text, word2ph):
179 |     from text import chinese_bert
180 | 
181 |     return chinese_bert.get_bert_feature(text, word2ph)
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     from text.chinese_bert import get_bert_feature
186 | 
187 |     text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
188 |     text = text_normalize(text)
189 |     print(text)
190 |     phones, tones, word2ph = g2p(text)
191 |     bert = get_bert_feature(text, word2ph)
192 | 
193 |     print(phones, tones, word2ph, bert.shape)
194 | 
195 | 
196 | # # Example usage
197 | # text = "这是一个示例文本:,你好!这是一个测试...."
198 | # print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试
199 | 
--------------------------------------------------------------------------------
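This V110 copy (and the V111 copy that follows) differs from the current text/chinese.py only in its relative imports; the key-normalization logic in _g2p is the same. That logic is the subtle part: pypinyin emits finals such as "uei"/"iou"/"uen" and bare finals such as "in", while the opencpop-strict table is keyed by the written forms "ui"/"iu"/"un" and "yin". A self-contained sketch that mirrors those branches so they can be tested in isolation (the function name is mine, not from the repo):

def normalize_pinyin(c, v_without_tone):
    """Mirror of the v_rep_map / pinyin_rep_map / single_rep_map branches in _g2p."""
    pinyin = c + v_without_tone
    if c:
        # syllable has an initial: only the final may need rewriting
        v_rep_map = {"uei": "ui", "iou": "iu", "uen": "un"}
        if v_without_tone in v_rep_map:
            pinyin = c + v_rep_map[v_without_tone]
    else:
        # bare final: rewrite to the standalone written spelling
        pinyin_rep_map = {"ing": "ying", "i": "yi", "in": "yin", "u": "wu"}
        if pinyin in pinyin_rep_map:
            pinyin = pinyin_rep_map[pinyin]
        else:
            single_rep_map = {"v": "yu", "e": "e", "i": "y", "u": "w"}
            if pinyin[0] in single_rep_map:
                pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
    return pinyin


print(normalize_pinyin("h", "uei"))  # hui: pypinyin's "huei" becomes the table key "hui"
print(normalize_pinyin("", "in"))    # yin: a bare final maps to its standalone spelling
print(normalize_pinyin("", "ve"))    # yue: leading "v" is rewritten to "yu"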
/oldVersion/V111/text/chinese.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | 
4 | import cn2an
5 | from pypinyin import lazy_pinyin, Style
6 | 
7 | from .symbols import punctuation
8 | from .tone_sandhi import ToneSandhi
9 | 
10 | current_file_path = os.path.dirname(__file__)
11 | pinyin_to_symbol_map = {
12 |     line.split("\t")[0]: line.strip().split("\t")[1]
13 |     for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
14 | }
15 | 
16 | import jieba.posseg as psg
17 | 
18 | 
19 | rep_map = {
20 |     ":": ",",
21 |     ";": ",",
22 |     ",": ",",
23 |     "。": ".",
24 |     "!": "!",
25 |     "?": "?",
26 |     "\n": ".",
27 |     "·": ",",
28 |     "、": ",",
29 |     "...": "…",
30 |     "$": ".",
31 |     "“": "'",
32 |     "”": "'",
33 |     "‘": "'",
34 |     "’": "'",
35 |     "(": "'",
36 |     ")": "'",
37 |     "(": "'",
38 |     ")": "'",
39 |     "《": "'",
40 |     "》": "'",
41 |     "【": "'",
42 |     "】": "'",
43 |     "[": "'",
44 |     "]": "'",
45 |     "—": "-",
46 |     "~": "-",
47 |     "~": "-",
48 |     "「": "'",
49 |     "」": "'",
50 | }
51 | 
52 | tone_modifier = ToneSandhi()
53 | 
54 | 
55 | def replace_punctuation(text):
56 |     text = text.replace("嗯", "恩").replace("呣", "母")
57 |     pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
58 | 
59 |     replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
60 | 
61 |     replaced_text = re.sub(
62 |         r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
63 |     )
64 | 
65 |     return replaced_text
66 | 
67 | 
68 | def g2p(text):
69 |     pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
70 |     sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
71 |     phones, tones, word2ph = _g2p(sentences)
72 |     assert sum(word2ph) == len(phones)
73 |     assert len(word2ph) == len(text)  # Sometimes it will crash; you can add a try-catch.
74 |     phones = ["_"] + phones + ["_"]
75 |     tones = [0] + tones + [0]
76 |     word2ph = [1] + word2ph + [1]
77 |     return phones, tones, word2ph
78 | 
79 | 
80 | def _get_initials_finals(word):
81 |     initials = []
82 |     finals = []
83 |     orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
84 |     orig_finals = lazy_pinyin(
85 |         word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
86 |     )
87 |     for c, v in zip(orig_initials, orig_finals):
88 |         initials.append(c)
89 |         finals.append(v)
90 |     return initials, finals
91 | 
92 | 
93 | def _g2p(segments):
94 |     phones_list = []
95 |     tones_list = []
96 |     word2ph = []
97 |     for seg in segments:
98 |         # Remove all English words in the sentence
99 |         seg = re.sub("[a-zA-Z]+", "", seg)
100 |         seg_cut = psg.lcut(seg)
101 |         initials = []
102 |         finals = []
103 |         seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
104 |         for word, pos in seg_cut:
105 |             if pos == "eng":
106 |                 continue
107 |             sub_initials, sub_finals = _get_initials_finals(word)
108 |             sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
109 |             initials.append(sub_initials)
110 |             finals.append(sub_finals)
111 | 
112 |             # assert len(sub_initials) == len(sub_finals) == len(word)
113 |         initials = sum(initials, [])
114 |         finals = sum(finals, [])
115 |         #
116 |         for c, v in zip(initials, finals):
117 |             raw_pinyin = c + v
118 |             # NOTE: post-process for pypinyin outputs
119 |             # we discriminate i, ii and iii
120 |             if c == v:
121 |                 assert c in punctuation
122 |                 phone = [c]
123 |                 tone = "0"
124 |                 word2ph.append(1)
125 |             else:
126 |                 v_without_tone = v[:-1]
127 |                 tone = v[-1]
128 | 
129 |                 pinyin = c + v_without_tone
130 |                 assert tone in "12345"
131 | 
132 |                 if c:
133 |                     # multi-syllable (syllable has an initial): only the final may be rewritten
134 |                     v_rep_map = {
135 |                         "uei": "ui",
136 |                         "iou": "iu",
137 |                         "uen": "un",
138 |                     }
139 |                     if v_without_tone in v_rep_map.keys():
140 |                         pinyin = c + v_rep_map[v_without_tone]
141 |                 else:
142 |                     # single-syllable (bare final): rewrite to its standalone spelling
143 |                     pinyin_rep_map = {
144 |                         "ing": "ying",
145 |                         "i": "yi",
146 |                         "in": "yin",
147 |                         "u": "wu",
148 |                     }
149 |                     if pinyin in pinyin_rep_map.keys():
150 |                         pinyin = pinyin_rep_map[pinyin]
151 |                     else:
152 |                         single_rep_map = {
153 |                             "v": "yu",
154 |                             "e": "e",
155 |                             "i": "y",
156 |                             "u": "w",
157 |                         }
158 |                         if pinyin[0] in single_rep_map.keys():
159 |                             pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
160 | 
161 |                 assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
162 |                 phone = pinyin_to_symbol_map[pinyin].split(" ")
163 |                 word2ph.append(len(phone))
164 | 
165 |             phones_list += phone
166 |             tones_list += [int(tone)] * len(phone)
167 |     return phones_list, tones_list, word2ph
168 | 
169 | 
170 | def text_normalize(text):
171 |     numbers = re.findall(r"\d+(?:\.?\d+)?", text)
172 |     for number in numbers:
173 |         text = text.replace(number, cn2an.an2cn(number), 1)
174 |     text = replace_punctuation(text)
175 |     return text
176 | 
177 | 
178 | def get_bert_feature(text, word2ph):
179 |     from text import chinese_bert
180 | 
181 |     return chinese_bert.get_bert_feature(text, word2ph)
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     from text.chinese_bert import get_bert_feature
186 | 
187 |     text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
188 |     text = text_normalize(text)
189 |     print(text)
190 |     phones, tones, word2ph = g2p(text)
191 |     bert = get_bert_feature(text, word2ph)
192 | 
193 |     print(phones, tones, word2ph, bert.shape)
194 | 
195 | 
196 | # # Example usage
197 | # text = "这是一个示例文本:,你好!这是一个测试...."
198 | # print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试
199 | 
--------------------------------------------------------------------------------
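Each version of chinese.py ends by delegating to chinese_bert.get_bert_feature(text, word2ph), which aligns character-level BERT features with the phone sequence. A hedged torch sketch of the expansion such a wrapper has to perform, repeating each character's feature vector once per phone; this illustrates the alignment under my assumptions (the function name and the use of repeat_interleave are mine, not code from chinese_bert.py):

import torch


def expand_to_phone_level(char_features, word2ph):
    # char_features: (num_chars, feature_dim), one row per input character,
    # including the two "_" padding slots that g2p adds.
    assert char_features.shape[0] == len(word2ph)
    repeats = torch.tensor(word2ph)
    phone_level = char_features.repeat_interleave(repeats, dim=0)  # (sum(word2ph), feature_dim)
    return phone_level.T  # (feature_dim, sum(word2ph)), the layout these wrappers return


word2ph = [1, 2, 2, 1]  # "_" + two characters with two phones each + "_"
features = torch.randn(4, 1024)
print(expand_to_phone_level(features, word2ph).shape)  # torch.Size([1024, 6])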