2 |
3 |
4 |
5 | # Bert-VITS2
6 |
7 | VITS2 backbone with multilingual BERT
8 |
9 | For a quick guide, please refer to `webui_preprocess.py`.
12 |
13 | ## Please note: the core ideas of this project come from [anyvoiceai/MassTTS](https://github.com/anyvoiceai/MassTTS), an excellent TTS project
14 | ## A demo of MassTTS: [ai版峰哥锐评峰哥本人,并找回了在金三角失落的腰子](https://www.bilibili.com/video/BV1w24y1c7z9)
15 |
16 | [//]: # (## 本项目与[PlayVoice/vits_chinese](https://github.com/PlayVoice/vits_chinese) 没有任何关系)
17 |
18 | [//]: # ()
19 | [//]: # (本仓库来源于之前朋友分享了ai峰哥的视频,本人被其中的效果惊艳,在自己尝试MassTTS以后发现fs在音质方面与vits有一定差距,并且training的pipeline比vits更复杂,因此按照其思路将bert)
20 |
21 | ## Experienced Travelers/Trailblazers/Captains/Doctors/sensei/Witchers/喵喵露/V should read the code and learn how to train the model themselves.
22 |
23 | ### It is strictly forbidden to use this project for any purpose that violates the Constitution, the Criminal Law, the Public Security Administration Punishments Law, or the Civil Code of the People's Republic of China.
24 | ### Use for any politics-related purpose is strictly forbidden.
25 | #### Video: https://www.bilibili.com/video/BV1hp4y1K78E
26 | #### Demo: https://www.bilibili.com/video/BV1TF411k78w
27 | #### QQ Group: 815818430
28 | ## References
29 | + [anyvoiceai/MassTTS](https://github.com/anyvoiceai/MassTTS)
30 | + [jaywalnut310/vits](https://github.com/jaywalnut310/vits)
31 | + [p0p4k/vits2_pytorch](https://github.com/p0p4k/vits2_pytorch)
32 | + [svc-develop-team/so-vits-svc](https://github.com/svc-develop-team/so-vits-svc)
33 | + [PaddlePaddle/PaddleSpeech](https://github.com/PaddlePaddle/PaddleSpeech)
34 | + [emotional-vits](https://github.com/innnky/emotional-vits)
35 | + [fish-speech](https://github.com/fishaudio/fish-speech)
36 | + [Bert-VITS2-UI](https://github.com/jiangyuxiaoxiao/Bert-VITS2-UI)
37 | ## Thanks to all contributors for their efforts
38 |
39 |
40 |
41 |
42 | [//]: # (# 本项目所有代码引用均已写明,bert部分代码思路来源于[AI峰哥](https://www.bilibili.com/video/BV1w24y1c7z9),与[vits_chinese](https://github.com/PlayVoice/vits_chinese)无任何关系。欢迎各位查阅代码。同时,我们也对该开发者的[碰瓷,乃至开盒开发者的行为](https://www.bilibili.com/read/cv27101514/)表示强烈谴责。)
43 |
--------------------------------------------------------------------------------
/oldVersion/V220/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 |
3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4 |
5 |
6 | def cleaned_text_to_sequence(cleaned_text, tones, language):
7 |     """Converts cleaned phoneme text to sequences of symbol, tone, and language IDs.
8 |     Args:
9 |       cleaned_text: string of phoneme symbols; tones: per-phoneme tone indices; language: language key (e.g. "ZH")
10 |     Returns:
11 |       Three lists: phoneme IDs, tone IDs shifted by the language's tone offset, and per-phoneme language IDs
12 |     """
13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14 | tone_start = language_tone_start_map[language]
15 | tones = [i + tone_start for i in tones]
16 | lang_id = language_id_map[language]
17 |     lang_ids = [lang_id for _ in phones]
18 | return phones, tones, lang_ids
19 |
20 |
21 | def get_bert(norm_text, word2ph, language, device, style_text=None, style_weight=0.7):
22 | from .chinese_bert import get_bert_feature as zh_bert
23 | from .english_bert_mock import get_bert_feature as en_bert
24 | from .japanese_bert import get_bert_feature as jp_bert
25 |
26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
27 | bert = lang_bert_func_map[language](
28 | norm_text, word2ph, device, style_text, style_weight
29 | )
30 | return bert
31 |
32 |
33 | def check_bert_models():
34 | import json
35 | from pathlib import Path
36 |
37 | from config import config
38 | from .bert_utils import _check_bert
39 |
40 | if config.mirror.lower() == "openi":
41 | import openi
42 |
43 | kwargs = {"token": config.openi_token} if config.openi_token else {}
44 | openi.login(**kwargs)
45 |
46 | with open("./bert/bert_models.json", "r") as fp:
47 | models = json.load(fp)
48 | for k, v in models.items():
49 | local_path = Path("./bert").joinpath(k)
50 | _check_bert(v["repo_id"], v["files"], local_path)
51 |
52 |
53 | def init_openjtalk():
54 | import platform
55 |
56 |     if platform.system() == "Linux":  # platform.platform() returns e.g. "Linux-5.15-x86_64", never "Linux"
57 | import pyopenjtalk
58 |
59 |         pyopenjtalk.g2p("こんにちは,世界。")  # warm-up: the first call downloads the dictionary if needed
60 |
61 |
62 | init_openjtalk()
63 | check_bert_models()
64 |
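As an illustration, a hedged sketch of calling `cleaned_text_to_sequence`; the phoneme symbols and tone values below are hypothetical, and the real ones come from `text/symbols.py` and the language-specific cleaners:

```python
# Illustrative only: valid symbols and tone ranges are defined in .symbols.
phones, tones, lang_ids = cleaned_text_to_sequence(
    ["_", "n", "i", "h", "ao", "_"],  # cleaned phoneme symbols (hypothetical)
    [0, 2, 2, 3, 3, 0],               # per-phoneme tones, before the language tone offset
    "ZH",                             # key into language_id_map / language_tone_start_map
)
```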
--------------------------------------------------------------------------------
/tools/translate.py:
--------------------------------------------------------------------------------
1 | """
2 | 翻译api
3 | """
4 | from config import config
5 |
6 | import random
7 | import hashlib
8 | import requests
9 |
10 |
11 | def translate(Sentence: str, to_Language: str = "jp", from_Language: str = ""):
12 |     """
13 |     :param Sentence: text to translate
14 |     :param from_Language: source language (auto-detected when empty)
15 |     :param to_Language: target language
16 |     :return: translated text; the original text is returned if the request fails
17 |
18 |     Common language codes: Chinese zh, English en, Japanese jp
19 |     """
20 | appid = config.translate_config.app_key
21 | key = config.translate_config.secret_key
22 | if appid == "" or key == "":
23 |         return "Please configure app_key and secret_key in config.yml"
24 | url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
25 | texts = Sentence.splitlines()
26 | outTexts = []
27 | for t in texts:
28 | if t != "":
29 |             # signature calculation; see https://api.fanyi.baidu.com/product/113
30 | salt = str(random.randint(1, 100000))
31 | signString = appid + t + salt + key
32 | hs = hashlib.md5()
33 | hs.update(signString.encode("utf-8"))
34 | signString = hs.hexdigest()
35 | if from_Language == "":
36 | from_Language = "auto"
37 | headers = {"Content-Type": "application/x-www-form-urlencoded"}
38 | payload = {
39 | "q": t,
40 | "from": from_Language,
41 | "to": to_Language,
42 | "appid": appid,
43 | "salt": salt,
44 | "sign": signString,
45 | }
46 |             # send the request
47 | try:
48 | response = requests.post(
49 | url=url, data=payload, headers=headers, timeout=3
50 | )
51 | response = response.json()
52 | if "trans_result" in response.keys():
53 | result = response["trans_result"][0]
54 | if "dst" in result.keys():
55 | dst = result["dst"]
56 | outTexts.append(dst)
57 | except Exception:
58 | return Sentence
59 | else:
60 | outTexts.append(t)
61 | return "\n".join(outTexts)
62 |
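A minimal usage sketch, assuming `app_key` and `secret_key` are filled in under `translate_config` in `config.yml`; the output shown is illustrative:

```python
from tools.translate import translate

# from_Language defaults to "" and is auto-detected by the API.
print(translate("你好,世界", to_Language="en"))  # e.g. "Hello, world"
```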
--------------------------------------------------------------------------------
/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import DebertaV2Model, DebertaV2Tokenizer
5 |
6 | from config import config
7 |
8 |
9 | LOCAL_PATH = "./bert/deberta-v3-large"
10 |
11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(
17 | text,
18 | word2ph,
19 | device=config.bert_gen_config.device,
20 | style_text=None,
21 | style_weight=0.7,
22 | ):
23 | if (
24 | sys.platform == "darwin"
25 | and torch.backends.mps.is_available()
26 | and device == "cpu"
27 | ):
28 | device = "mps"
29 | if not device:
30 | device = "cuda"
31 | if device not in models.keys():
32 |         models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)  # lazily cache one model per device
33 | with torch.no_grad():
34 | inputs = tokenizer(text, return_tensors="pt")
35 | for i in inputs:
36 | inputs[i] = inputs[i].to(device)
37 | res = models[device](**inputs, output_hidden_states=True)
38 |         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()  # third-to-last hidden layer, shape (tokens, 1024)
39 | if style_text:
40 | style_inputs = tokenizer(style_text, return_tensors="pt")
41 | for i in style_inputs:
42 | style_inputs[i] = style_inputs[i].to(device)
43 | style_res = models[device](**style_inputs, output_hidden_states=True)
44 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
45 | style_res_mean = style_res.mean(0)
46 | assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))
47 | word2phone = word2ph
48 | phone_level_feature = []
49 | for i in range(len(word2phone)):
50 | if style_text:
51 | repeat_feature = (
52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
54 | )
55 | else:
56 | repeat_feature = res[i].repeat(word2phone[i], 1)
57 | phone_level_feature.append(repeat_feature)
58 |
59 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
60 |
61 | return phone_level_feature.T
62 |
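A hedged invocation sketch: the `word2ph` values are hypothetical and must contain one entry per tokenizer token (including special tokens), as the assertion above enforces:

```python
# Hypothetical: assumes the text tokenizes to 4 tokens ([CLS] hello world [SEP]).
feature = get_bert_feature("hello world", word2ph=[1, 3, 3, 1], device="cpu")
print(feature.shape)  # torch.Size([1024, 8]): hidden_size x sum(word2ph)
```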
--------------------------------------------------------------------------------
/oldVersion/V210/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import DebertaV2Model, DebertaV2Tokenizer
5 |
6 | from config import config
7 |
8 |
9 | LOCAL_PATH = "./bert/deberta-v3-large"
10 |
11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(
17 | text,
18 | word2ph,
19 | device=config.bert_gen_config.device,
20 | style_text=None,
21 | style_weight=0.7,
22 | ):
23 | if (
24 | sys.platform == "darwin"
25 | and torch.backends.mps.is_available()
26 | and device == "cpu"
27 | ):
28 | device = "mps"
29 | if not device:
30 | device = "cuda"
31 | if device not in models.keys():
32 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
33 | with torch.no_grad():
34 | inputs = tokenizer(text, return_tensors="pt")
35 | for i in inputs:
36 | inputs[i] = inputs[i].to(device)
37 | res = models[device](**inputs, output_hidden_states=True)
38 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
39 | if style_text:
40 | style_inputs = tokenizer(style_text, return_tensors="pt")
41 | for i in style_inputs:
42 | style_inputs[i] = style_inputs[i].to(device)
43 | style_res = models[device](**style_inputs, output_hidden_states=True)
44 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
45 | style_res_mean = style_res.mean(0)
46 | assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))
47 | word2phone = word2ph
48 | phone_level_feature = []
49 | for i in range(len(word2phone)):
50 | if style_text:
51 | repeat_feature = (
52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
54 | )
55 | else:
56 | repeat_feature = res[i].repeat(word2phone[i], 1)
57 | phone_level_feature.append(repeat_feature)
58 |
59 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
60 |
61 | return phone_level_feature.T
62 |
--------------------------------------------------------------------------------
/oldVersion/V220/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import DebertaV2Model, DebertaV2Tokenizer
5 |
6 | from config import config
7 |
8 |
9 | LOCAL_PATH = "./bert/deberta-v3-large"
10 |
11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(
17 | text,
18 | word2ph,
19 | device=config.bert_gen_config.device,
20 | style_text=None,
21 | style_weight=0.7,
22 | ):
23 | if (
24 | sys.platform == "darwin"
25 | and torch.backends.mps.is_available()
26 | and device == "cpu"
27 | ):
28 | device = "mps"
29 | if not device:
30 | device = "cuda"
31 | if device not in models.keys():
32 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
33 | with torch.no_grad():
34 | inputs = tokenizer(text, return_tensors="pt")
35 | for i in inputs:
36 | inputs[i] = inputs[i].to(device)
37 | res = models[device](**inputs, output_hidden_states=True)
38 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
39 | if style_text:
40 | style_inputs = tokenizer(style_text, return_tensors="pt")
41 | for i in style_inputs:
42 | style_inputs[i] = style_inputs[i].to(device)
43 | style_res = models[device](**style_inputs, output_hidden_states=True)
44 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
45 | style_res_mean = style_res.mean(0)
46 | assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))
47 | word2phone = word2ph
48 | phone_level_feature = []
49 | for i in range(len(word2phone)):
50 | if style_text:
51 | repeat_feature = (
52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
54 | )
55 | else:
56 | repeat_feature = res[i].repeat(word2phone[i], 1)
57 | phone_level_feature.append(repeat_feature)
58 |
59 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
60 |
61 | return phone_level_feature.T
62 |
--------------------------------------------------------------------------------
/oldVersion/V200/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 | from .japanese import text2sep_kata
8 |
9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese"
10 |
11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
17 | sep_text, _, _ = text2sep_kata(text)
18 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
19 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
20 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3]
21 | return get_bert_feature_with_token(sep_ids, word2ph, device)
22 |
23 |
24 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device):
25 | if (
26 | sys.platform == "darwin"
27 | and torch.backends.mps.is_available()
28 | and device == "cpu"
29 | ):
30 | device = "mps"
31 | if not device:
32 | device = "cuda"
33 | if device not in models.keys():
34 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
35 | with torch.no_grad():
36 | inputs = torch.tensor(tokens).to(device).unsqueeze(0)
37 | token_type_ids = torch.zeros_like(inputs).to(device)
38 | attention_mask = torch.ones_like(inputs).to(device)
39 | inputs = {
40 | "input_ids": inputs,
41 | "token_type_ids": token_type_ids,
42 | "attention_mask": attention_mask,
43 | }
44 |
45 | # for i in inputs:
46 | # inputs[i] = inputs[i].to(device)
47 | res = models[device](**inputs, output_hidden_states=True)
48 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
49 | assert inputs["input_ids"].shape[-1] == len(word2ph)
50 | word2phone = word2ph
51 | phone_level_feature = []
52 | for i in range(len(word2phone)):
53 | repeat_feature = res[i].repeat(word2phone[i], 1)
54 | phone_level_feature.append(repeat_feature)
55 |
56 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
57 |
58 | return phone_level_feature.T
59 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/fix/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoTokenizer, AutoModelForMaskedLM
3 | import sys
4 | from .japanese import text2sep_kata
5 | from config import config
6 |
7 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
8 |
9 | models = dict()
10 |
11 |
12 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
13 | sep_text, _ = text2sep_kata(text)
14 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
15 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
16 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3]
17 | return get_bert_feature_with_token(sep_ids, word2ph, device)
18 |
19 |
20 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device):
21 | if (
22 | sys.platform == "darwin"
23 | and torch.backends.mps.is_available()
24 | and device == "cpu"
25 | ):
26 | device = "mps"
27 | if not device:
28 | device = "cuda"
29 | if device not in models.keys():
30 | models[device] = AutoModelForMaskedLM.from_pretrained(
31 | "./bert/bert-base-japanese-v3"
32 | ).to(device)
33 | with torch.no_grad():
34 | inputs = torch.tensor(tokens).to(device).unsqueeze(0)
35 | token_type_ids = torch.zeros_like(inputs).to(device)
36 | attention_mask = torch.ones_like(inputs).to(device)
37 | inputs = {
38 | "input_ids": inputs,
39 | "token_type_ids": token_type_ids,
40 | "attention_mask": attention_mask,
41 | }
42 |
43 | # for i in inputs:
44 | # inputs[i] = inputs[i].to(device)
45 | res = models[device](**inputs, output_hidden_states=True)
46 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
47 | assert inputs["input_ids"].shape[-1] == len(word2ph)
48 | word2phone = word2ph
49 | phone_level_feature = []
50 | for i in range(len(word2phone)):
51 | repeat_feature = res[i].repeat(word2phone[i], 1)
52 | phone_level_feature.append(repeat_feature)
53 |
54 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
55 |
56 | return phone_level_feature.T
57 |
--------------------------------------------------------------------------------
/resample.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import librosa
4 | from multiprocessing import Pool, cpu_count
5 |
6 | import soundfile
7 | from tqdm import tqdm
8 |
9 | from config import config
10 |
11 |
12 | def process(item):
13 | spkdir, wav_name, args = item
14 | wav_path = os.path.join(args.in_dir, spkdir, wav_name)
15 | if os.path.exists(wav_path) and wav_path.lower().endswith(".wav"):
16 | wav, sr = librosa.load(wav_path, sr=args.sr)
17 | soundfile.write(os.path.join(args.out_dir, spkdir, wav_name), wav, sr)
18 |
19 |
20 | if __name__ == "__main__":
21 | parser = argparse.ArgumentParser()
22 | parser.add_argument(
23 | "--sr",
24 | type=int,
25 | default=config.resample_config.sampling_rate,
26 | help="sampling rate",
27 | )
28 | parser.add_argument(
29 | "--in_dir",
30 | type=str,
31 | default=config.resample_config.in_dir,
32 | help="path to source dir",
33 | )
34 | parser.add_argument(
35 | "--out_dir",
36 | type=str,
37 | default=config.resample_config.out_dir,
38 | help="path to target dir",
39 | )
40 | parser.add_argument(
41 | "--processes",
42 | type=int,
43 | default=0,
44 | help="cpu_processes",
45 | )
46 | args, _ = parser.parse_known_args()
47 |     # AutoDL's CPU-only mode reports 46 CPUs, so cap the worker count
48 | if args.processes == 0:
49 | processes = cpu_count() - 2 if cpu_count() > 4 else 1
50 | else:
51 | processes = args.processes
52 | pool = Pool(processes=processes)
53 |
54 | tasks = []
55 |
56 | for dirpath, _, filenames in os.walk(args.in_dir):
57 |         # speaker subdirectory relative to the input root
58 | spk_dir = os.path.relpath(dirpath, args.in_dir)
59 | spk_dir_out = os.path.join(args.out_dir, spk_dir)
60 | if not os.path.isdir(spk_dir_out):
61 | os.makedirs(spk_dir_out, exist_ok=True)
62 | for filename in filenames:
63 | if filename.lower().endswith(".wav"):
64 |                 task = (spk_dir, filename, args)
65 |                 tasks.append(task)
66 |
67 | for _ in tqdm(
68 |         pool.imap_unordered(process, tasks), total=len(tasks),
69 | ):
70 | pass
71 |
72 | pool.close()
73 | pool.join()
74 |
75 |     print("Audio resampling complete!")
76 |
--------------------------------------------------------------------------------
/onnx_modules/V200/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 | from .japanese import text2sep_kata
8 |
9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese"
10 |
11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
17 | sep_text, _, _ = text2sep_kata(text)
18 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
19 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
20 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3]
21 | return get_bert_feature_with_token(sep_ids, word2ph, device)
22 |
23 |
24 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device):
25 | if (
26 | sys.platform == "darwin"
27 | and torch.backends.mps.is_available()
28 | and device == "cpu"
29 | ):
30 | device = "mps"
31 | if not device:
32 | device = "cuda"
33 | if device not in models.keys():
34 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
35 | with torch.no_grad():
36 | inputs = torch.tensor(tokens).to(device).unsqueeze(0)
37 | token_type_ids = torch.zeros_like(inputs).to(device)
38 | attention_mask = torch.ones_like(inputs).to(device)
39 | inputs = {
40 | "input_ids": inputs,
41 | "token_type_ids": token_type_ids,
42 | "attention_mask": attention_mask,
43 | }
44 |
45 | # for i in inputs:
46 | # inputs[i] = inputs[i].to(device)
47 | res = models[device](**inputs, output_hidden_states=True)
48 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
49 | assert inputs["input_ids"].shape[-1] == len(word2ph)
50 | word2phone = word2ph
51 | phone_level_feature = []
52 | for i in range(len(word2phone)):
53 | repeat_feature = res[i].repeat(word2phone[i], 1)
54 | phone_level_feature.append(repeat_feature)
55 |
56 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
57 |
58 | return phone_level_feature.T
59 |
--------------------------------------------------------------------------------
/oldVersion/V220/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 | from text.japanese import text2sep_kata
8 |
9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm"
10 |
11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(
17 | text,
18 | word2ph,
19 | device=config.bert_gen_config.device,
20 | style_text=None,
21 | style_weight=0.7,
22 | ):
23 | text = "".join(text2sep_kata(text)[0])
24 | if (
25 | sys.platform == "darwin"
26 | and torch.backends.mps.is_available()
27 | and device == "cpu"
28 | ):
29 | device = "mps"
30 | if not device:
31 | device = "cuda"
32 | if device not in models.keys():
33 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
34 | with torch.no_grad():
35 | inputs = tokenizer(text, return_tensors="pt")
36 | for i in inputs:
37 | inputs[i] = inputs[i].to(device)
38 | res = models[device](**inputs, output_hidden_states=True)
39 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
40 | if style_text:
41 | style_inputs = tokenizer(style_text, return_tensors="pt")
42 | for i in style_inputs:
43 | style_inputs[i] = style_inputs[i].to(device)
44 | style_res = models[device](**style_inputs, output_hidden_states=True)
45 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
46 | style_res_mean = style_res.mean(0)
47 |
48 | assert len(word2ph) == len(text) + 2
49 | word2phone = word2ph
50 | phone_level_feature = []
51 | for i in range(len(word2phone)):
52 | if style_text:
53 | repeat_feature = (
54 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
55 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
56 | )
57 | else:
58 | repeat_feature = res[i].repeat(word2phone[i], 1)
59 | phone_level_feature.append(repeat_feature)
60 |
61 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
62 |
63 | return phone_level_feature.T
64 |
--------------------------------------------------------------------------------
/oldVersion/V220/clap_gen.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from multiprocessing import Pool, cpu_count
3 |
4 | import torch
5 | import torch.multiprocessing as mp
6 | from tqdm import tqdm
7 |
8 | import utils
9 | from config import config
10 | from .clap_wrapper import get_clap_audio_feature
11 | import librosa
12 | import os
13 |
14 | os.environ["OMP_NUM_THREADS"] = "1"
15 | os.environ["MKL_NUM_THREADS"] = "1"
16 |
17 |
18 | def process_line(line):
19 | device = config.emo_gen_config.device
20 | if config.emo_gen_config.use_multi_device:
21 | rank = mp.current_process()._identity
22 | rank = rank[0] if len(rank) > 0 else 0
23 | if torch.cuda.is_available():
24 | gpu_id = rank % torch.cuda.device_count()
25 | device = torch.device(f"cuda:{gpu_id}")
26 | else:
27 | device = torch.device("cpu")
28 | wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")
29 |
30 | clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.npy")
31 | if os.path.isfile(clap_path):
32 | return
33 |
34 |     audio = librosa.load(wav_path, sr=48000)[0]
35 | # audio = librosa.resample(audio, 44100, 48000)
36 |
37 | clap = get_clap_audio_feature(audio, device)
38 | torch.save(clap, clap_path)
39 |
40 |
41 | if __name__ == "__main__":
42 | parser = argparse.ArgumentParser()
43 | parser.add_argument(
44 | "-c", "--config", type=str, default=config.emo_gen_config.config_path
45 | )
46 | parser.add_argument(
47 | "--num_processes", type=int, default=config.emo_gen_config.num_processes
48 | )
49 | args, _ = parser.parse_known_args()
50 | config_path = args.config
51 | hps = utils.get_hparams_from_file(config_path)
52 | lines = []
53 | with open(hps.data.training_files, encoding="utf-8") as f:
54 | lines.extend(f.readlines())
55 |
56 | with open(hps.data.validation_files, encoding="utf-8") as f:
57 | lines.extend(f.readlines())
58 | if len(lines) != 0:
59 | num_processes = min(args.num_processes, cpu_count())
60 | with Pool(processes=num_processes) as pool:
61 | for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
62 | pass
63 |
64 |     print(f"CLAP generation finished! {len(lines)} .emo.npy files were generated!")
65 |
--------------------------------------------------------------------------------
/bert/chinese-roberta-wwm-ext-large/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | language:
3 | - zh
4 | tags:
5 | - bert
6 | license: "apache-2.0"
7 | ---
8 |
9 | # Please use 'Bert'-related functions to load this model!
10 |
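For example, a minimal loading sketch; this repo loads the same weights from its local `./bert/chinese-roberta-wwm-ext-large` copy, while the hub ID below is the upstream release:

```python
from transformers import BertTokenizer, BertModel

# Load with Bert* classes rather than AutoModel, as requested above.
tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large")
model = BertModel.from_pretrained("hfl/chinese-roberta-wwm-ext-large")
```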
11 | ## Chinese BERT with Whole Word Masking
12 | To further accelerate Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**.
13 |
14 | **[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)**
15 | Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, Ziqing Yang, Shijin Wang, Guoping Hu
16 |
17 | This repository is developed based on: https://github.com/google-research/bert
18 |
19 | You may also be interested in:
20 | - Chinese BERT series: https://github.com/ymcui/Chinese-BERT-wwm
21 | - Chinese MacBERT: https://github.com/ymcui/MacBERT
22 | - Chinese ELECTRA: https://github.com/ymcui/Chinese-ELECTRA
23 | - Chinese XLNet: https://github.com/ymcui/Chinese-XLNet
24 | - Knowledge Distillation Toolkit - TextBrewer: https://github.com/airaria/TextBrewer
25 |
26 | More resources by HFL: https://github.com/ymcui/HFL-Anthology
27 |
28 | ## Citation
29 | If you find the technical reports or resources useful, please cite the following technical reports in your paper.
30 | - Primary: https://arxiv.org/abs/2004.13922
31 | ```
32 | @inproceedings{cui-etal-2020-revisiting,
33 | title = "Revisiting Pre-Trained Models for {C}hinese Natural Language Processing",
34 | author = "Cui, Yiming and
35 | Che, Wanxiang and
36 | Liu, Ting and
37 | Qin, Bing and
38 | Wang, Shijin and
39 | Hu, Guoping",
40 | booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Findings",
41 | month = nov,
42 | year = "2020",
43 | address = "Online",
44 | publisher = "Association for Computational Linguistics",
45 | url = "https://www.aclweb.org/anthology/2020.findings-emnlp.58",
46 | pages = "657--668",
47 | }
48 | ```
49 | - Secondary: https://arxiv.org/abs/1906.08101
50 | ```
51 | @article{chinese-bert-wwm,
52 | title={Pre-Training with Whole Word Masking for Chinese BERT},
53 | author={Cui, Yiming and Che, Wanxiang and Liu, Ting and Qin, Bing and Yang, Ziqing and Wang, Shijin and Hu, Guoping},
54 | journal={arXiv preprint arXiv:1906.08101},
55 | year={2019}
56 | }
57 | ```
58 |
--------------------------------------------------------------------------------
/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 | from text.japanese import text2sep_kata
8 |
9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm"
10 |
11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(
17 | text,
18 | word2ph,
19 | device=config.bert_gen_config.device,
20 | style_text=None,
21 | style_weight=0.7,
22 | ):
23 | text = "".join(text2sep_kata(text)[0])
24 | if style_text:
25 | style_text = "".join(text2sep_kata(style_text)[0])
26 | if (
27 | sys.platform == "darwin"
28 | and torch.backends.mps.is_available()
29 | and device == "cpu"
30 | ):
31 | device = "mps"
32 | if not device:
33 | device = "cuda"
34 | if device not in models.keys():
35 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
36 | with torch.no_grad():
37 | inputs = tokenizer(text, return_tensors="pt")
38 | for i in inputs:
39 | inputs[i] = inputs[i].to(device)
40 | res = models[device](**inputs, output_hidden_states=True)
41 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
42 | if style_text:
43 | style_inputs = tokenizer(style_text, return_tensors="pt")
44 | for i in style_inputs:
45 | style_inputs[i] = style_inputs[i].to(device)
46 | style_res = models[device](**style_inputs, output_hidden_states=True)
47 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
48 | style_res_mean = style_res.mean(0)
49 |
50 | assert len(word2ph) == len(text) + 2
51 | word2phone = word2ph
52 | phone_level_feature = []
53 | for i in range(len(word2phone)):
54 | if style_text:
55 | repeat_feature = (
56 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
57 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
58 | )
59 | else:
60 | repeat_feature = res[i].repeat(word2phone[i], 1)
61 | phone_level_feature.append(repeat_feature)
62 |
63 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
64 |
65 | return phone_level_feature.T
66 |
--------------------------------------------------------------------------------
/oldVersion/V210/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 | from .japanese import text2sep_kata
8 |
9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm"
10 |
11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(
17 | text,
18 | word2ph,
19 | device=config.bert_gen_config.device,
20 | style_text=None,
21 | style_weight=0.7,
22 | ):
23 | text = "".join(text2sep_kata(text)[0])
24 | if style_text:
25 | style_text = "".join(text2sep_kata(style_text)[0])
26 | if (
27 | sys.platform == "darwin"
28 | and torch.backends.mps.is_available()
29 | and device == "cpu"
30 | ):
31 | device = "mps"
32 | if not device:
33 | device = "cuda"
34 | if device not in models.keys():
35 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
36 | with torch.no_grad():
37 | inputs = tokenizer(text, return_tensors="pt")
38 | for i in inputs:
39 | inputs[i] = inputs[i].to(device)
40 | res = models[device](**inputs, output_hidden_states=True)
41 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
42 | if style_text:
43 | style_inputs = tokenizer(style_text, return_tensors="pt")
44 | for i in style_inputs:
45 | style_inputs[i] = style_inputs[i].to(device)
46 | style_res = models[device](**style_inputs, output_hidden_states=True)
47 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
48 | style_res_mean = style_res.mean(0)
49 |
50 | assert len(word2ph) == len(text) + 2
51 | word2phone = word2ph
52 | phone_level_feature = []
53 | for i in range(len(word2phone)):
54 | if style_text:
55 | repeat_feature = (
56 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
57 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
58 | )
59 | else:
60 | repeat_feature = res[i].repeat(word2phone[i], 1)
61 | phone_level_feature.append(repeat_feature)
62 |
63 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
64 |
65 | return phone_level_feature.T
66 |
--------------------------------------------------------------------------------
/oldVersion/V101/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Compatibility with version 1.0.1
3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.0.1
4 | """
5 | import torch
6 | import commons
7 | from .text.cleaner import clean_text
8 | from .text import cleaned_text_to_sequence
9 | from oldVersion.V111.text import get_bert
10 |
11 |
12 | def get_text(text, language_str, hps, device):
13 | norm_text, phone, tone, word2ph = clean_text(text, language_str)
14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
15 |
16 | if hps.data.add_blank:
17 | phone = commons.intersperse(phone, 0)
18 | tone = commons.intersperse(tone, 0)
19 | language = commons.intersperse(language, 0)
20 | for i in range(len(word2ph)):
21 | word2ph[i] = word2ph[i] * 2
22 | word2ph[0] += 1
23 | bert = get_bert(norm_text, word2ph, language_str, device)
24 | del word2ph
25 |
26 | assert bert.shape[-1] == len(phone)
27 |
28 | phone = torch.LongTensor(phone)
29 | tone = torch.LongTensor(tone)
30 | language = torch.LongTensor(language)
31 |
32 | return bert, phone, tone, language
33 |
34 |
35 | def infer(
36 | text,
37 | sdp_ratio,
38 | noise_scale,
39 | noise_scale_w,
40 | length_scale,
41 | sid,
42 | hps,
43 | net_g,
44 | device,
45 | ):
46 | bert, phones, tones, lang_ids = get_text(text, "ZH", hps, device)
47 | with torch.no_grad():
48 | x_tst = phones.to(device).unsqueeze(0)
49 | tones = tones.to(device).unsqueeze(0)
50 | lang_ids = lang_ids.to(device).unsqueeze(0)
51 | bert = bert.to(device).unsqueeze(0)
52 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
53 | del phones
54 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
55 | audio = (
56 | net_g.infer(
57 | x_tst,
58 | x_tst_lengths,
59 | speakers,
60 | tones,
61 | lang_ids,
62 | bert,
63 | sdp_ratio=sdp_ratio,
64 | noise_scale=noise_scale,
65 | noise_scale_w=noise_scale_w,
66 | length_scale=length_scale,
67 | )[0][0, 0]
68 | .data.cpu()
69 | .float()
70 | .numpy()
71 | )
72 | del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers
73 | if torch.cuda.is_available():
74 | torch.cuda.empty_cache()
75 | return audio
76 |
--------------------------------------------------------------------------------
/oldVersion/V110/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | from transformers import AutoTokenizer, AutoModelForMaskedLM
4 |
5 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large")
6 |
7 |
8 | def get_bert_feature(text, word2ph, device=None):
9 | if (
10 | sys.platform == "darwin"
11 | and torch.backends.mps.is_available()
12 | and device == "cpu"
13 | ):
14 | device = "mps"
15 | if not device:
16 | device = "cuda"
17 | model = AutoModelForMaskedLM.from_pretrained(
18 | "./bert/chinese-roberta-wwm-ext-large"
19 | ).to(device)
20 | with torch.no_grad():
21 | inputs = tokenizer(text, return_tensors="pt")
22 | for i in inputs:
23 | inputs[i] = inputs[i].to(device)
24 | res = model(**inputs, output_hidden_states=True)
25 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
26 |
27 | assert len(word2ph) == len(text) + 2
28 | word2phone = word2ph
29 | phone_level_feature = []
30 | for i in range(len(word2phone)):
31 | repeat_feature = res[i].repeat(word2phone[i], 1)
32 | phone_level_feature.append(repeat_feature)
33 |
34 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
35 |
36 | return phone_level_feature.T
37 |
38 |
39 | if __name__ == "__main__":
40 | import torch
41 |
42 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each a 1024-dim feature
43 | word2phone = [
44 | 1,
45 | 2,
46 | 1,
47 | 2,
48 | 2,
49 | 1,
50 | 2,
51 | 2,
52 | 1,
53 | 2,
54 | 2,
55 | 1,
56 | 2,
57 | 2,
58 | 2,
59 | 2,
60 | 2,
61 | 1,
62 | 1,
63 | 2,
64 | 2,
65 | 1,
66 | 2,
67 | 2,
68 | 2,
69 | 2,
70 | 1,
71 | 2,
72 | 2,
73 | 2,
74 | 2,
75 | 2,
76 | 1,
77 | 2,
78 | 2,
79 | 2,
80 | 2,
81 | 1,
82 | ]
83 |
84 |     # total number of frames
85 | total_frames = sum(word2phone)
86 | print(word_level_feature.shape)
87 | print(word2phone)
88 | phone_level_feature = []
89 | for i in range(len(word2phone)):
90 | print(word_level_feature[i].shape)
91 |
92 |         # repeat each word's feature word2phone[i] times
93 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
94 | phone_level_feature.append(repeat_feature)
95 |
96 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
97 |     print(phone_level_feature.shape)  # torch.Size([65, 1024])
98 |
--------------------------------------------------------------------------------
/whisper_transcribe.py:
--------------------------------------------------------------------------------
1 | import whisper
2 | import os
3 | import argparse
4 | import torch
5 |
6 |
7 | def transcribe_one(audio_path):
8 | # load audio and pad/trim it to fit 30 seconds
9 | audio = whisper.load_audio(audio_path)
10 | audio = whisper.pad_or_trim(audio)
11 |
12 | # make log-Mel spectrogram and move to the same device as the model
13 | mel = whisper.log_mel_spectrogram(audio).to(model.device)
14 |
15 | # detect the spoken language
16 | _, probs = model.detect_language(mel)
17 | print(f"Detected language: {max(probs, key=probs.get)}")
18 | lang = max(probs, key=probs.get)
19 | # decode the audio
20 | options = whisper.DecodingOptions(beam_size=5)
21 | result = whisper.decode(model, mel, options)
22 |
23 | # print the recognized text
24 | print(result.text)
25 | return lang, result.text
26 |
27 |
28 | if __name__ == "__main__":
29 | parser = argparse.ArgumentParser()
30 | parser.add_argument("--languages", default="CJ")
31 | parser.add_argument("--whisper_size", default="medium")
32 | parser.add_argument("--speaker")
33 | parser.add_argument("--input_dir")
34 | parser.add_argument("--output")
35 | args = parser.parse_args()
36 |
37 | model = whisper.load_model(args.whisper_size)
38 | speaker = args.speaker
39 | input_dir = args.input_dir
40 | output = args.output
41 |
42 | if args.languages == "CJE":
43 | lang2token = {
44 | "zh": "ZH|",
45 | "ja": "JP|",
46 | "en": "EN|",
47 | }
48 | elif args.languages == "CJ":
49 | lang2token = {
50 | "zh": "ZH|",
51 | "ja": "JP|",
52 | }
53 | elif args.languages == "C":
54 | lang2token = {
55 | "zh": "ZH|",
56 | }
57 |
58 |     assert torch.cuda.is_available(), "Please enable GPU in order to run Whisper!"
59 |
60 | speaker_annos = []
61 | total_files = sum([len(files) for _, _, files in os.walk(input_dir)])
62 |
63 | for i, wavfile in enumerate(list(os.walk(input_dir))[0][2]):
64 | try:
65 | lang, text = transcribe_one(f"./data/{speaker}/raw/{wavfile}")
66 | if lang not in list(lang2token.keys()):
67 | print(f"{lang} not supported, ignoring\n")
68 | continue
69 | speaker_annos.append(f"{wavfile}|{speaker}|{lang2token[lang]}{text}")
70 | print(f"Processed: {i + 1}/{total_files}")
71 | except Exception as e:
72 | print(e)
73 | continue
74 |
75 | with open(output, "w", encoding="utf-8") as f:
76 | f.write("\n".join(speaker_annos))
--------------------------------------------------------------------------------
/re_matching.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | def extract_language_and_text_updated(speaker, dialogue):
5 |     # use a regex to match <language> tags and the text following them
6 | pattern_language_text = r"<(\S+?)>([^<]+)"
7 | matches = re.findall(pattern_language_text, dialogue, re.DOTALL)
8 | speaker = speaker[1:-1]
9 |     # clean up the text: strip surrounding whitespace
10 | matches_cleaned = [(lang.upper(), text.strip()) for lang, text in matches]
11 | matches_cleaned.append(speaker)
12 | return matches_cleaned
13 |
14 |
15 | def validate_text(input_text):
16 |     # regex for validating speaker segments
17 | pattern_speaker = r"(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)"
18 |
19 |     # re.DOTALL lets . match any character, including newlines
20 | matches = re.findall(pattern_speaker, input_text, re.DOTALL)
21 |
22 |     # further validate each matched speaker's dialogue
23 | for _, dialogue in matches:
24 | language_text_matches = extract_language_and_text_updated(_, dialogue)
25 | if not language_text_matches:
26 | return (
27 | False,
28 | "Error: Invalid format detected in dialogue content. Please check your input.",
29 | )
30 |
31 |     # no speaker segments found in the input text
32 | if not matches:
33 | return (
34 | False,
35 | "Error: No valid speaker format detected. Please check your input.",
36 | )
37 |
38 | return True, "Input is valid."
39 |
40 |
41 | def text_matching(text: str) -> list:
42 | speaker_pattern = r"(\[\S+?\])(.+?)(?=\[\S+?\]|$)"
43 | matches = re.findall(speaker_pattern, text, re.DOTALL)
44 | result = []
45 | for speaker, dialogue in matches:
46 | result.append(extract_language_and_text_updated(speaker, dialogue))
47 | return result
48 |
49 |
50 | def cut_para(text):
51 |     splitted_para = re.split("[\n]", text)  # split into paragraphs
52 | splitted_para = [
53 | sentence.strip() for sentence in splitted_para if sentence.strip()
54 |     ]  # drop empty strings
55 | return splitted_para
56 |
57 |
58 | def cut_sent(para):
59 |     para = re.sub("([。!;?\?])([^”’])", r"\1\n\2", para)  # single-character sentence terminators
60 |     para = re.sub("(\.{6})([^”’])", r"\1\n\2", para)  # English ellipsis
61 |     para = re.sub("(\…{2})([^”’])", r"\1\n\2", para)  # Chinese ellipsis
62 |     para = re.sub("([。!?\?][”’])([^,。!?\?])", r"\1\n\2", para)
63 |     para = para.rstrip()  # strip any trailing newlines at the paragraph end
64 | return para.split("\n")
65 |
66 |
67 | if __name__ == "__main__":
68 | text = """
69 | [说话人1]
70 |     [说话人2]你好吗?元気ですか?こんにちは,世界。你好吗?
71 | [说话人3]谢谢。どういたしまして。
72 | """
73 | text_matching(text)
74 |     # test the functions
75 | test_text = """
76 | [说话人1]你好,こんにちは!こんにちは,世界。
77 | [说话人2]你好吗?
78 | """
79 | text_matching(test_text)
80 | res = validate_text(test_text)
81 | print(res)
82 |
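For reference, a hedged sketch of the markup these helpers expect; the speaker names and languages are placeholders:

```python
from re_matching import text_matching, validate_text

sample = "[alice]<zh>你好!<jp>こんにちは。\n[bob]<en>Hello there."
print(validate_text(sample))  # (True, 'Input is valid.')
print(text_matching(sample))
# [[('ZH', '你好!'), ('JP', 'こんにちは。'), 'alice'], [('EN', 'Hello there.'), 'bob']]
```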
--------------------------------------------------------------------------------
/oldVersion/V101/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | from transformers import AutoTokenizer, AutoModelForMaskedLM
4 |
5 | device = torch.device(
6 | "cuda"
7 | if torch.cuda.is_available()
8 | else (
9 | "mps"
10 | if sys.platform == "darwin" and torch.backends.mps.is_available()
11 | else "cpu"
12 | )
13 | )
14 |
15 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large")
16 | model = AutoModelForMaskedLM.from_pretrained("./bert/chinese-roberta-wwm-ext-large").to(
17 | device
18 | )
19 |
20 |
21 | def get_bert_feature(text, word2ph):
22 | with torch.no_grad():
23 | inputs = tokenizer(text, return_tensors="pt")
24 | for i in inputs:
25 | inputs[i] = inputs[i].to(device)
26 | res = model(**inputs, output_hidden_states=True)
27 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
28 |
29 | assert len(word2ph) == len(text) + 2
30 | word2phone = word2ph
31 | phone_level_feature = []
32 | for i in range(len(word2phone)):
33 | repeat_feature = res[i].repeat(word2phone[i], 1)
34 | phone_level_feature.append(repeat_feature)
35 |
36 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
37 |
38 | return phone_level_feature.T
39 |
40 |
41 | if __name__ == "__main__":
42 | # feature = get_bert_feature('你好,我是说的道理。')
43 | import torch
44 |
45 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each a 1024-dim feature
46 | word2phone = [
47 | 1,
48 | 2,
49 | 1,
50 | 2,
51 | 2,
52 | 1,
53 | 2,
54 | 2,
55 | 1,
56 | 2,
57 | 2,
58 | 1,
59 | 2,
60 | 2,
61 | 2,
62 | 2,
63 | 2,
64 | 1,
65 | 1,
66 | 2,
67 | 2,
68 | 1,
69 | 2,
70 | 2,
71 | 2,
72 | 2,
73 | 1,
74 | 2,
75 | 2,
76 | 2,
77 | 2,
78 | 2,
79 | 1,
80 | 2,
81 | 2,
82 | 2,
83 | 2,
84 | 1,
85 | ]
86 |
88 |     # total number of frames
88 | total_frames = sum(word2phone)
89 | print(word_level_feature.shape)
90 | print(word2phone)
91 | phone_level_feature = []
92 | for i in range(len(word2phone)):
93 | print(word_level_feature[i].shape)
94 |
95 |         # repeat each word's feature word2phone[i] times
96 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
97 | phone_level_feature.append(repeat_feature)
98 |
99 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
100 |     print(phone_level_feature.shape)  # torch.Size([65, 1024])
101 |
--------------------------------------------------------------------------------
/compress_model.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from text.symbols import symbols
3 | import torch
4 |
5 | from tools.log import logger
6 | import utils
7 | from models import SynthesizerTrn
8 | import os
9 |
10 |
11 | def copyStateDict(state_dict):
12 | if list(state_dict.keys())[0].startswith("module"):
13 | start_idx = 1
14 | else:
15 | start_idx = 0
16 | new_state_dict = OrderedDict()
17 | for k, v in state_dict.items():
18 |         name = ".".join(k.split(".")[start_idx:])  # rejoin with ".", not ","
19 | new_state_dict[name] = v
20 | return new_state_dict
21 |
22 |
23 | def removeOptimizer(config: str, input_model: str, ishalf: bool, output_model: str):
24 | hps = utils.get_hparams_from_file(config)
25 |
26 | net_g = SynthesizerTrn(
27 | len(symbols),
28 | hps.data.filter_length // 2 + 1,
29 | hps.train.segment_size // hps.data.hop_length,
30 | n_speakers=hps.data.n_speakers,
31 | **hps.model,
32 | )
33 |
34 | optim_g = torch.optim.AdamW(
35 | net_g.parameters(),
36 | hps.train.learning_rate,
37 | betas=hps.train.betas,
38 | eps=hps.train.eps,
39 | )
40 |
41 | state_dict_g = torch.load(input_model, map_location="cpu")
42 | new_dict_g = copyStateDict(state_dict_g)
43 | keys = []
44 | for k, v in new_dict_g["model"].items():
45 | if "enc_q" in k:
46 | continue # noqa: E701
47 | keys.append(k)
48 |
49 | new_dict_g = (
50 | {k: new_dict_g["model"][k].half() for k in keys}
51 | if ishalf
52 | else {k: new_dict_g["model"][k] for k in keys}
53 | )
54 |
55 | torch.save(
56 | {
57 | "model": new_dict_g,
58 | "iteration": 0,
59 | "optimizer": optim_g.state_dict(),
60 | "learning_rate": 0.0001,
61 | },
62 | output_model,
63 | )
64 |
65 |
66 | if __name__ == "__main__":
67 | import argparse
68 |
69 | parser = argparse.ArgumentParser()
70 | parser.add_argument("-c", "--config", type=str, default="configs/config.json")
71 | parser.add_argument("-i", "--input", type=str)
72 | parser.add_argument("-o", "--output", type=str, default=None)
73 | parser.add_argument(
74 | "-hf", "--half", action="store_true", default=False, help="Save as FP16"
75 | )
76 |
77 | args = parser.parse_args()
78 |
79 | output = args.output
80 |
81 | if output is None:
82 | import os.path
83 |
84 | filename, ext = os.path.splitext(args.input)
85 | half = "_half" if args.half else ""
86 | output = filename + "_release" + half + ext
87 |
88 | removeOptimizer(args.config, args.input, args.half, output)
89 |     logger.info(f"Model compressed successfully; output model: {os.path.abspath(output)}")
90 |
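A hedged usage sketch; the checkpoint paths are placeholders:

```python
from compress_model import removeOptimizer

# Drop optimizer state and enc_q weights from a checkpoint, saving FP16 weights.
removeOptimizer(
    config="configs/config.json",
    input_model="logs/example/G_10000.pth",
    ishalf=True,
    output_model="logs/example/G_10000_release_half.pth",
)
```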
--------------------------------------------------------------------------------
/slm/wavlm-base-plus/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": "wavlm-base-plus",
3 | "activation_dropout": 0.0,
4 | "adapter_kernel_size": 3,
5 | "adapter_stride": 2,
6 | "add_adapter": false,
7 | "apply_spec_augment": true,
8 | "architectures": [
9 | "WavLMModel"
10 | ],
11 | "attention_dropout": 0.1,
12 | "bos_token_id": 1,
13 | "classifier_proj_size": 256,
14 | "codevector_dim": 256,
15 | "contrastive_logits_temperature": 0.1,
16 | "conv_bias": false,
17 | "conv_dim": [
18 | 512,
19 | 512,
20 | 512,
21 | 512,
22 | 512,
23 | 512,
24 | 512
25 | ],
26 | "conv_kernel": [
27 | 10,
28 | 3,
29 | 3,
30 | 3,
31 | 3,
32 | 2,
33 | 2
34 | ],
35 | "conv_stride": [
36 | 5,
37 | 2,
38 | 2,
39 | 2,
40 | 2,
41 | 2,
42 | 2
43 | ],
44 | "ctc_loss_reduction": "sum",
45 | "ctc_zero_infinity": false,
46 | "diversity_loss_weight": 0.1,
47 | "do_stable_layer_norm": false,
48 | "eos_token_id": 2,
49 | "feat_extract_activation": "gelu",
50 | "feat_extract_norm": "group",
51 | "feat_proj_dropout": 0.1,
52 | "feat_quantizer_dropout": 0.0,
53 | "final_dropout": 0.0,
54 | "freeze_feat_extract_train": true,
55 | "hidden_act": "gelu",
56 | "hidden_dropout": 0.1,
57 | "hidden_size": 768,
58 | "initializer_range": 0.02,
59 | "intermediate_size": 3072,
60 | "layer_norm_eps": 1e-05,
61 | "layerdrop": 0.05,
62 | "mask_channel_length": 10,
63 | "mask_channel_min_space": 1,
64 | "mask_channel_other": 0.0,
65 | "mask_channel_prob": 0.0,
66 | "mask_channel_selection": "static",
67 | "mask_feature_length": 10,
68 | "mask_feature_min_masks": 0,
69 | "mask_feature_prob": 0.0,
70 | "mask_time_length": 10,
71 | "mask_time_min_masks": 2,
72 | "mask_time_min_space": 1,
73 | "mask_time_other": 0.0,
74 | "mask_time_prob": 0.05,
75 | "mask_time_selection": "static",
76 | "model_type": "wavlm",
77 | "no_mask_channel_overlap": false,
78 | "no_mask_time_overlap": false,
79 | "num_adapter_layers": 3,
80 | "num_attention_heads": 12,
81 | "num_buckets": 320,
82 | "num_codevector_groups": 2,
83 | "num_codevectors_per_group": 320,
84 | "num_conv_pos_embedding_groups": 16,
85 | "num_conv_pos_embeddings": 128,
86 | "num_ctc_classes": 80,
87 | "num_feat_extract_layers": 7,
88 | "num_hidden_layers": 12,
89 | "num_negatives": 100,
90 | "output_hidden_size": 768,
91 | "pad_token_id": 0,
92 | "proj_codevector_dim": 256,
93 | "replace_prob": 0.5,
94 | "torch_dtype": "float32",
95 | "transformers_version": "4.13.0.dev0",
96 | "use_weighted_layer_sum": false,
97 | "vocab_size": 32,
98 | "tokenizer_class": "Wav2Vec2CTCTokenizer"
99 | }
100 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | from transformers import AutoTokenizer, AutoModelForMaskedLM
4 |
5 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large")
6 |
7 | models = dict()
8 |
9 |
10 | def get_bert_feature(text, word2ph, device=None):
11 | if (
12 | sys.platform == "darwin"
13 | and torch.backends.mps.is_available()
14 | and device == "cpu"
15 | ):
16 | device = "mps"
17 | if not device:
18 | device = "cuda"
19 | if device not in models.keys():
20 | models[device] = AutoModelForMaskedLM.from_pretrained(
21 | "./bert/chinese-roberta-wwm-ext-large"
22 | ).to(device)
23 | with torch.no_grad():
24 | inputs = tokenizer(text, return_tensors="pt")
25 | for i in inputs:
26 | inputs[i] = inputs[i].to(device)
27 | res = models[device](**inputs, output_hidden_states=True)
28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
29 |
30 | assert len(word2ph) == len(text) + 2
31 | word2phone = word2ph
32 | phone_level_feature = []
33 | for i in range(len(word2phone)):
34 | repeat_feature = res[i].repeat(word2phone[i], 1)
35 | phone_level_feature.append(repeat_feature)
36 |
37 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
38 |
39 | return phone_level_feature.T
40 |
41 |
42 | if __name__ == "__main__":
43 | import torch
44 |
45 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each a 1024-dim feature
46 | word2phone = [
47 | 1,
48 | 2,
49 | 1,
50 | 2,
51 | 2,
52 | 1,
53 | 2,
54 | 2,
55 | 1,
56 | 2,
57 | 2,
58 | 1,
59 | 2,
60 | 2,
61 | 2,
62 | 2,
63 | 2,
64 | 1,
65 | 1,
66 | 2,
67 | 2,
68 | 1,
69 | 2,
70 | 2,
71 | 2,
72 | 2,
73 | 1,
74 | 2,
75 | 2,
76 | 2,
77 | 2,
78 | 2,
79 | 1,
80 | 2,
81 | 2,
82 | 2,
83 | 2,
84 | 1,
85 | ]
86 |
88 |     # total number of frames
88 | total_frames = sum(word2phone)
89 | print(word_level_feature.shape)
90 | print(word2phone)
91 | phone_level_feature = []
92 | for i in range(len(word2phone)):
93 | print(word_level_feature[i].shape)
94 |
95 |         # repeat each word's feature word2phone[i] times
96 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
97 | phone_level_feature.append(repeat_feature)
98 |
99 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
100 |     print(phone_level_feature.shape)  # torch.Size([65, 1024])
101 |
--------------------------------------------------------------------------------
/oldVersion/V200/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 |
8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9 |
10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11 |
12 | models = dict()
13 |
14 |
15 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
16 | if (
17 | sys.platform == "darwin"
18 | and torch.backends.mps.is_available()
19 | and device == "cpu"
20 | ):
21 | device = "mps"
22 | if not device:
23 | device = "cuda"
24 | if device not in models.keys():
25 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
26 | with torch.no_grad():
27 | inputs = tokenizer(text, return_tensors="pt")
28 | for i in inputs:
29 | inputs[i] = inputs[i].to(device)
30 | res = models[device](**inputs, output_hidden_states=True)
31 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
32 |
33 | assert len(word2ph) == len(text) + 2
34 | word2phone = word2ph
35 | phone_level_feature = []
36 | for i in range(len(word2phone)):
37 | repeat_feature = res[i].repeat(word2phone[i], 1)
38 | phone_level_feature.append(repeat_feature)
39 |
40 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
41 |
42 | return phone_level_feature.T
43 |
44 |
45 | if __name__ == "__main__":
46 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each a 1024-dim feature
47 | word2phone = [
48 | 1,
49 | 2,
50 | 1,
51 | 2,
52 | 2,
53 | 1,
54 | 2,
55 | 2,
56 | 1,
57 | 2,
58 | 2,
59 | 1,
60 | 2,
61 | 2,
62 | 2,
63 | 2,
64 | 2,
65 | 1,
66 | 1,
67 | 2,
68 | 2,
69 | 1,
70 | 2,
71 | 2,
72 | 2,
73 | 2,
74 | 1,
75 | 2,
76 | 2,
77 | 2,
78 | 2,
79 | 2,
80 | 1,
81 | 2,
82 | 2,
83 | 2,
84 | 2,
85 | 1,
86 | ]
87 |
88 |     # total number of frames
89 | total_frames = sum(word2phone)
90 | print(word_level_feature.shape)
91 | print(word2phone)
92 | phone_level_feature = []
93 | for i in range(len(word2phone)):
94 | print(word_level_feature[i].shape)
95 |
96 |         # repeat each word's feature word2phone[i] times
97 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
98 | phone_level_feature.append(repeat_feature)
99 |
100 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
101 |     print(phone_level_feature.shape)  # torch.Size([65, 1024])
102 |
--------------------------------------------------------------------------------
/bert/bert-base-japanese-v3/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | license: apache-2.0
3 | datasets:
4 | - cc100
5 | - wikipedia
6 | language:
7 | - ja
8 | widget:
9 | - text: 東北大学で[MASK]の研究をしています。
10 | ---
11 |
12 | # BERT base Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102)
13 |
14 | This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language.
15 |
16 | This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization.
17 | Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective.
18 |
19 | The code for pretraining is available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/).
20 |
21 | ## Model architecture
22 |
23 | The model architecture is the same as the original BERT base model; 12 layers, 768 dimensions of hidden states, and 12 attention heads.
24 |
25 | ## Training Data
26 |
27 | The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia.
28 | For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023.
29 | The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively.
30 |
31 | For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7).
32 |
33 | ## Tokenization
34 |
35 | The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm.
36 | The vocabulary size is 32768.
37 |
38 | We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization.
39 |
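For example, loading the tokenizer (the fugashi and unidic-lite packages above must be installed; the sentence is illustrative):

```python
from transformers import AutoTokenizer

# MeCab (Unidic) word segmentation followed by WordPiece subword splitting.
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-v3")
print(tokenizer.tokenize("東北大学で自然言語処理の研究をしています。"))
```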
40 | ## Training
41 |
42 | We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps.
43 | For training of the MLM (masked language modeling) objective, we introduced whole word masking in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once.
44 |
45 | For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/).
46 |
47 | ## Licenses
48 |
49 | The pretrained models are distributed under the Apache License 2.0.
50 |
51 | ## Acknowledgments
52 |
53 | This model is trained with Cloud TPUs provided by the [TPU Research Cloud](https://sites.research.google/trc/about/) program.
54 |
--------------------------------------------------------------------------------
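
As a hedged sketch of the two-stage tokenization described above (MeCab/Unidic word split, then WordPiece), assuming `transformers`, `fugashi`, and `unidic-lite` are installed and the Hub id `cl-tohoku/bert-base-japanese-v3` resolves to this model:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-v3")
# words are split by MeCab first, then broken into WordPiece subwords
print(tokenizer.tokenize("東北大学で自然言語処理の研究をしています。"))
```
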
/bert/bert-large-japanese-v2/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | license: apache-2.0
3 | datasets:
4 | - cc100
5 | - wikipedia
6 | language:
7 | - ja
8 | widget:
9 | - text: 東北大学で[MASK]の研究をしています。
10 | ---
11 |
12 | # BERT large Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102)
13 |
14 | This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language.
15 |
16 | This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization.
17 | Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective.
18 |
19 | The code for the pretraining is available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/).
20 |
21 | ## Model architecture
22 |
23 | The model architecture is the same as the original BERT large model: 24 layers, 1024 dimensions of hidden states, and 16 attention heads.
24 |
25 | ## Training Data
26 |
27 | The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia.
28 | For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023.
29 | The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively.
30 |
31 | For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7).
32 |
33 | ## Tokenization
34 |
35 | The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm.
36 | The vocabulary size is 32768.
37 |
38 | We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization.
39 |
40 | ## Training
41 |
42 | We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps.
43 | For training of the MLM (masked language modeling) objective, we introduced whole word masking in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once.
44 |
45 | For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/).
46 |
47 | ## Licenses
48 |
49 | The pretrained models are distributed under the Apache License 2.0.
50 |
51 | ## Acknowledgments
52 |
53 | This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.
54 |
--------------------------------------------------------------------------------
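
The widget example in the front matter above can be reproduced with a fill-mask pipeline; a minimal sketch, assuming the Hub id `cl-tohoku/bert-large-japanese-v2` resolves to this model:

```python
from transformers import pipeline

fill = pipeline("fill-mask", model="cl-tohoku/bert-large-japanese-v2")
# print the top three candidates for the masked word
for pred in fill("東北大学で[MASK]の研究をしています。")[:3]:
    print(pred["token_str"], pred["score"])
```
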
/onnx_modules/V200/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 |
8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9 |
10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11 |
12 | models = dict()
13 |
14 |
15 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
16 | if (
17 | sys.platform == "darwin"
18 | and torch.backends.mps.is_available()
19 | and device == "cpu"
20 | ):
21 | device = "mps"
22 | if not device:
23 | device = "cuda"
24 | if device not in models.keys():
25 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
26 | with torch.no_grad():
27 | inputs = tokenizer(text, return_tensors="pt")
28 | for i in inputs:
29 | inputs[i] = inputs[i].to(device)
30 | res = models[device](**inputs, output_hidden_states=True)
31 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
32 |
33 | assert len(word2ph) == len(text) + 2
34 | word2phone = word2ph
35 | phone_level_feature = []
36 | for i in range(len(word2phone)):
37 | repeat_feature = res[i].repeat(word2phone[i], 1)
38 | phone_level_feature.append(repeat_feature)
39 |
40 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
41 |
42 | return phone_level_feature.T
43 |
44 |
45 | if __name__ == "__main__":
46 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each with a 1024-dim feature
47 | word2phone = [
48 | 1,
49 | 2,
50 | 1,
51 | 2,
52 | 2,
53 | 1,
54 | 2,
55 | 2,
56 | 1,
57 | 2,
58 | 2,
59 | 1,
60 | 2,
61 | 2,
62 | 2,
63 | 2,
64 | 2,
65 | 1,
66 | 1,
67 | 2,
68 | 2,
69 | 1,
70 | 2,
71 | 2,
72 | 2,
73 | 2,
74 | 1,
75 | 2,
76 | 2,
77 | 2,
78 | 2,
79 | 2,
80 | 1,
81 | 2,
82 | 2,
83 | 2,
84 | 2,
85 | 1,
86 | ]
87 |
88 |     # total number of phone-level frames
89 | total_frames = sum(word2phone)
90 | print(word_level_feature.shape)
91 | print(word2phone)
92 | phone_level_feature = []
93 | for i in range(len(word2phone)):
94 | print(word_level_feature[i].shape)
95 |
96 |         # repeat each word's feature word2phone[i] times
97 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
98 | phone_level_feature.append(repeat_feature)
99 |
100 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
101 |     print(phone_level_feature.shape)  # torch.Size([65, 1024]) == (sum(word2phone), 1024)
102 |
--------------------------------------------------------------------------------
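
A hedged usage sketch for `get_bert_feature` above: `word2ph` must cover `[CLS]`, each character, and `[SEP]` (the assert checks `len(word2ph) == len(text) + 2`), and the output stacks one 1024-dim column per phone. The phone counts below are hypothetical, and the local model under `./bert/chinese-roberta-wwm-ext-large` must be present:

```python
text = "你好"
word2ph = [1, 2, 2, 1]  # hypothetical: [CLS], 你, 好, [SEP]
feat = get_bert_feature(text, word2ph, device="cpu")
print(feat.shape)  # torch.Size([1024, 6]) == (hidden_size, sum(word2ph))
```
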
/emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": "torch",
3 | "activation_dropout": 0.1,
4 | "adapter_kernel_size": 3,
5 | "adapter_stride": 2,
6 | "add_adapter": false,
7 | "apply_spec_augment": true,
8 | "architectures": [
9 | "Wav2Vec2ForSpeechClassification"
10 | ],
11 | "attention_dropout": 0.1,
12 | "bos_token_id": 1,
13 | "classifier_proj_size": 256,
14 | "codevector_dim": 768,
15 | "contrastive_logits_temperature": 0.1,
16 | "conv_bias": true,
17 | "conv_dim": [
18 | 512,
19 | 512,
20 | 512,
21 | 512,
22 | 512,
23 | 512,
24 | 512
25 | ],
26 | "conv_kernel": [
27 | 10,
28 | 3,
29 | 3,
30 | 3,
31 | 3,
32 | 2,
33 | 2
34 | ],
35 | "conv_stride": [
36 | 5,
37 | 2,
38 | 2,
39 | 2,
40 | 2,
41 | 2,
42 | 2
43 | ],
44 | "ctc_loss_reduction": "sum",
45 | "ctc_zero_infinity": false,
46 | "diversity_loss_weight": 0.1,
47 | "do_stable_layer_norm": true,
48 | "eos_token_id": 2,
49 | "feat_extract_activation": "gelu",
50 | "feat_extract_dropout": 0.0,
51 | "feat_extract_norm": "layer",
52 | "feat_proj_dropout": 0.1,
53 | "feat_quantizer_dropout": 0.0,
54 | "final_dropout": 0.1,
55 | "finetuning_task": "wav2vec2_reg",
56 | "gradient_checkpointing": false,
57 | "hidden_act": "gelu",
58 | "hidden_dropout": 0.1,
59 | "hidden_dropout_prob": 0.1,
60 | "hidden_size": 1024,
61 | "id2label": {
62 | "0": "arousal",
63 | "1": "dominance",
64 | "2": "valence"
65 | },
66 | "initializer_range": 0.02,
67 | "intermediate_size": 4096,
68 | "label2id": {
69 | "arousal": 0,
70 | "dominance": 1,
71 | "valence": 2
72 | },
73 | "layer_norm_eps": 1e-05,
74 | "layerdrop": 0.1,
75 | "mask_feature_length": 10,
76 | "mask_feature_min_masks": 0,
77 | "mask_feature_prob": 0.0,
78 | "mask_time_length": 10,
79 | "mask_time_min_masks": 2,
80 | "mask_time_prob": 0.05,
81 | "model_type": "wav2vec2",
82 | "num_adapter_layers": 3,
83 | "num_attention_heads": 16,
84 | "num_codevector_groups": 2,
85 | "num_codevectors_per_group": 320,
86 | "num_conv_pos_embedding_groups": 16,
87 | "num_conv_pos_embeddings": 128,
88 | "num_feat_extract_layers": 7,
89 | "num_hidden_layers": 12,
90 | "num_negatives": 100,
91 | "output_hidden_size": 1024,
92 | "pad_token_id": 0,
93 | "pooling_mode": "mean",
94 | "problem_type": "regression",
95 | "proj_codevector_dim": 768,
96 | "tdnn_dilation": [
97 | 1,
98 | 2,
99 | 3,
100 | 1,
101 | 1
102 | ],
103 | "tdnn_dim": [
104 | 512,
105 | 512,
106 | 512,
107 | 512,
108 | 1500
109 | ],
110 | "tdnn_kernel": [
111 | 5,
112 | 3,
113 | 3,
114 | 1,
115 | 1
116 | ],
117 | "torch_dtype": "float32",
118 | "transformers_version": "4.17.0.dev0",
119 | "use_weighted_layer_sum": false,
120 | "vocab_size": null,
121 | "xvector_output_dim": 512
122 | }
123 |
--------------------------------------------------------------------------------
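
The config above describes a wav2vec2 regressor with three continuous outputs (arousal, dominance, valence) rather than discrete classes (`"problem_type": "regression"`). A minimal sketch that reads those fields back with `transformers`, assuming the directory is present locally:

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim")
print(cfg.id2label)      # {0: 'arousal', 1: 'dominance', 2: 'valence'}
print(cfg.problem_type)  # regression
```
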
/spec_gen.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from tqdm import tqdm
3 | from multiprocessing import Pool
4 | from mel_processing import spectrogram_torch, mel_spectrogram_torch
5 | from utils import load_wav_to_torch
6 |
7 |
8 | class AudioProcessor:
9 | def __init__(
10 | self,
11 | max_wav_value,
12 | use_mel_spec_posterior,
13 | filter_length,
14 | n_mel_channels,
15 | sampling_rate,
16 | hop_length,
17 | win_length,
18 | mel_fmin,
19 | mel_fmax,
20 | ):
21 | self.max_wav_value = max_wav_value
22 | self.use_mel_spec_posterior = use_mel_spec_posterior
23 | self.filter_length = filter_length
24 | self.n_mel_channels = n_mel_channels
25 | self.sampling_rate = sampling_rate
26 | self.hop_length = hop_length
27 | self.win_length = win_length
28 | self.mel_fmin = mel_fmin
29 | self.mel_fmax = mel_fmax
30 |
31 | def process_audio(self, filename):
32 | audio, sampling_rate = load_wav_to_torch(filename)
33 | audio_norm = audio / self.max_wav_value
34 | audio_norm = audio_norm.unsqueeze(0)
35 | spec_filename = filename.replace(".wav", ".spec.pt")
36 | if self.use_mel_spec_posterior:
37 | spec_filename = spec_filename.replace(".spec.pt", ".mel.pt")
38 | try:
39 | spec = torch.load(spec_filename)
40 |         except Exception:
41 | if self.use_mel_spec_posterior:
42 | spec = mel_spectrogram_torch(
43 | audio_norm,
44 | self.filter_length,
45 | self.n_mel_channels,
46 | self.sampling_rate,
47 | self.hop_length,
48 | self.win_length,
49 | self.mel_fmin,
50 | self.mel_fmax,
51 | center=False,
52 | )
53 | else:
54 | spec = spectrogram_torch(
55 | audio_norm,
56 | self.filter_length,
57 | self.sampling_rate,
58 | self.hop_length,
59 | self.win_length,
60 | center=False,
61 | )
62 | spec = torch.squeeze(spec, 0)
63 | torch.save(spec, spec_filename)
64 | return spec, audio_norm
65 |
66 |
67 | # usage example
68 | processor = AudioProcessor(
69 | max_wav_value=32768.0,
70 | use_mel_spec_posterior=False,
71 | filter_length=2048,
72 | n_mel_channels=128,
73 | sampling_rate=44100,
74 | hop_length=512,
75 | win_length=2048,
76 | mel_fmin=0.0,
77 |     mel_fmax=None,  # unused here since use_mel_spec_posterior is False
78 | )
79 |
80 | with open("filelists/train.list", "r") as f:
81 |     filepaths = [line.split("|")[0] for line in f]  # first |-separated field is the audio path
82 |
83 | # 使用多进程处理
84 | with Pool(processes=32) as pool:  # spawn 32 worker processes
85 | with tqdm(total=len(filepaths)) as pbar:
86 | for i, _ in enumerate(pool.imap_unordered(processor.process_audio, filepaths)):
87 | pbar.update()
88 |
--------------------------------------------------------------------------------
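
For reference, `spectrogram_torch` with the settings above yields `filter_length // 2 + 1 = 1025` frequency bins and roughly `num_samples / hop_length` frames; a quick shape check on synthetic audio, so no files are needed:

```python
import torch
from mel_processing import spectrogram_torch

audio = torch.rand(1, 44100) * 2 - 1  # one second of synthetic audio in [-1, 1] at 44.1 kHz
spec = spectrogram_torch(audio, 2048, 44100, 512, 2048, center=False)
print(spec.shape)  # (1, 1025, ~86): n_fft//2 + 1 bins, about len/hop frames
```
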
/bert_gen.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from multiprocessing import Pool
3 | import commons
4 | import utils
5 | from tqdm import tqdm
6 | from text import check_bert_models, cleaned_text_to_sequence, get_bert
7 | import argparse
8 | import torch.multiprocessing as mp
9 | from config import config
10 |
11 |
12 | def process_line(x):
13 | line, add_blank = x
14 | device = config.bert_gen_config.device
15 | if config.bert_gen_config.use_multi_device:
16 | rank = mp.current_process()._identity
17 | rank = rank[0] if len(rank) > 0 else 0
18 | if torch.cuda.is_available():
19 | gpu_id = rank % torch.cuda.device_count()
20 | device = torch.device(f"cuda:{gpu_id}")
21 | else:
22 | device = torch.device("cpu")
23 | wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")
24 | phone = phones.split(" ")
25 | tone = [int(i) for i in tone.split(" ")]
26 | word2ph = [int(i) for i in word2ph.split(" ")]
27 | word2ph = [i for i in word2ph]
28 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
29 |
30 | if add_blank:
31 | phone = commons.intersperse(phone, 0)
32 | tone = commons.intersperse(tone, 0)
33 | language = commons.intersperse(language, 0)
34 | for i in range(len(word2ph)):
35 | word2ph[i] = word2ph[i] * 2
36 | word2ph[0] += 1
37 |
38 | bert_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".bert.pt")
39 |
40 | try:
41 | bert = torch.load(bert_path)
42 | assert bert.shape[0] == 2048
43 | except Exception:
44 | bert = get_bert(text, word2ph, language_str, device)
45 | assert bert.shape[-1] == len(phone)
46 | torch.save(bert, bert_path)
47 |
48 |
49 | preprocess_text_config = config.preprocess_text_config
50 |
51 | if __name__ == "__main__":
52 | parser = argparse.ArgumentParser()
53 | parser.add_argument(
54 | "-c", "--config", type=str, default=config.bert_gen_config.config_path
55 | )
56 | parser.add_argument(
57 | "--num_processes", type=int, default=config.bert_gen_config.num_processes
58 | )
59 | args, _ = parser.parse_known_args()
60 | config_path = args.config
61 | hps = utils.get_hparams_from_file(config_path)
62 | check_bert_models()
63 | lines = []
64 | with open(hps.data.training_files, encoding="utf-8") as f:
65 | lines.extend(f.readlines())
66 |
67 | with open(hps.data.validation_files, encoding="utf-8") as f:
68 | lines.extend(f.readlines())
69 | add_blank = [hps.data.add_blank] * len(lines)
70 |
71 | if len(lines) != 0:
72 | num_processes = args.num_processes
73 | with Pool(processes=num_processes) as pool:
74 | for _ in tqdm(
75 | pool.imap_unordered(process_line, zip(lines, add_blank)),
76 | total=len(lines),
77 | ):
78 |                 # the loop body only drives the progress bar
79 |                 pass  # placeholder
80 |
81 | print(f"bert生成完毕!, 共有{len(lines)}个bert.pt生成!")
82 |
--------------------------------------------------------------------------------
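
`process_line` above unpacks seven `|`-separated fields from each filelist line. A hypothetical ZH example of the expected layout (phones are wrapped in `_` padding and `word2ph` covers `[CLS]` + each character + `[SEP]`, matching the asserts in `get_bert`):

```
# field layout (not a real header): wav_path|speaker|language|text|phones|tones|word2ph
raw/demo_zh/clip_001.wav|demo|ZH|你好|_ n i h ao _|0 2 2 3 3 0|1 2 2 1
```
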
/oldVersion/V110/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Compatibility with version 1.1
3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.1
4 | """
5 | import torch
6 | import commons
7 | from .text.cleaner import clean_text
8 | from .text import cleaned_text_to_sequence
9 | from oldVersion.V111.text import get_bert
10 |
11 |
12 | def get_text(text, language_str, hps, device):
13 | norm_text, phone, tone, word2ph = clean_text(text, language_str)
14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
15 |
16 | if hps.data.add_blank:
17 | phone = commons.intersperse(phone, 0)
18 | tone = commons.intersperse(tone, 0)
19 | language = commons.intersperse(language, 0)
20 | for i in range(len(word2ph)):
21 | word2ph[i] = word2ph[i] * 2
22 | word2ph[0] += 1
23 | bert = get_bert(norm_text, word2ph, language_str, device)
24 | del word2ph
25 | assert bert.shape[-1] == len(phone), phone
26 |
27 | if language_str == "ZH":
28 |         # bert already holds the ZH features
29 | ja_bert = torch.zeros(768, len(phone))
30 | elif language_str == "JP":
31 | ja_bert = bert
32 | bert = torch.zeros(1024, len(phone))
33 | else:
34 | bert = torch.zeros(1024, len(phone))
35 | ja_bert = torch.zeros(768, len(phone))
36 |
37 | assert bert.shape[-1] == len(
38 | phone
39 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"
40 |
41 | phone = torch.LongTensor(phone)
42 | tone = torch.LongTensor(tone)
43 | language = torch.LongTensor(language)
44 | return bert, ja_bert, phone, tone, language
45 |
46 |
47 | def infer(
48 | text,
49 | sdp_ratio,
50 | noise_scale,
51 | noise_scale_w,
52 | length_scale,
53 | sid,
54 | language,
55 | hps,
56 | net_g,
57 | device,
58 | ):
59 | bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps, device)
60 | with torch.no_grad():
61 | x_tst = phones.to(device).unsqueeze(0)
62 | tones = tones.to(device).unsqueeze(0)
63 | lang_ids = lang_ids.to(device).unsqueeze(0)
64 | bert = bert.to(device).unsqueeze(0)
65 | ja_bert = ja_bert.to(device).unsqueeze(0)
66 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
67 | del phones
68 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
69 | audio = (
70 | net_g.infer(
71 | x_tst,
72 | x_tst_lengths,
73 | speakers,
74 | tones,
75 | lang_ids,
76 | bert,
77 | ja_bert,
78 | sdp_ratio=sdp_ratio,
79 | noise_scale=noise_scale,
80 | noise_scale_w=noise_scale_w,
81 | length_scale=length_scale,
82 | )[0][0, 0]
83 | .data.cpu()
84 | .float()
85 | .numpy()
86 | )
87 | del x_tst, x_tst_lengths, speakers, tones, lang_ids, bert, ja_bert
88 | if torch.cuda.is_available():
89 | torch.cuda.empty_cache()
90 | return audio
91 |
--------------------------------------------------------------------------------
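
`get_text` above doubles `word2ph` when `add_blank` is set so that it stays aligned with the interspersed phone sequence: `commons.intersperse` turns N symbols into 2N+1, and the extra `word2ph[0] += 1` charges the leading blank to the first word. A toy check of that bookkeeping (values hypothetical):

```python
phone = [5, 9, 7]                # 3 phone ids
word2ph = [1, 2]                 # phones per word, sums to len(phone)
inter = [0, 5, 0, 9, 0, 7, 0]    # commons.intersperse(phone, 0): 2*3 + 1 symbols
word2ph = [n * 2 for n in word2ph]
word2ph[0] += 1                  # leading blank goes to the first word
assert sum(word2ph) == len(inter) == 7
```
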
/oldVersion/V101/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "I",
78 | "N",
79 | "U",
80 | "a",
81 | "b",
82 | "by",
83 | "ch",
84 | "cl",
85 | "d",
86 | "dy",
87 | "e",
88 | "f",
89 | "g",
90 | "gy",
91 | "h",
92 | "hy",
93 | "i",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "p",
103 | "py",
104 | "r",
105 | "ry",
106 | "s",
107 | "sh",
108 | "t",
109 | "ts",
110 | "u",
111 | "V",
112 | "w",
113 | "y",
114 | "z",
115 | ]
116 | num_ja_tones = 1
117 |
118 | # English
119 | en_symbols = [
120 | "aa",
121 | "ae",
122 | "ah",
123 | "ao",
124 | "aw",
125 | "ay",
126 | "b",
127 | "ch",
128 | "d",
129 | "dh",
130 | "eh",
131 | "er",
132 | "ey",
133 | "f",
134 | "g",
135 | "hh",
136 | "ih",
137 | "iy",
138 | "jh",
139 | "k",
140 | "l",
141 | "m",
142 | "n",
143 | "ng",
144 | "ow",
145 | "oy",
146 | "p",
147 | "r",
148 | "s",
149 | "sh",
150 | "t",
151 | "th",
152 | "uh",
153 | "uw",
154 | "V",
155 | "w",
156 | "y",
157 | "z",
158 | "zh",
159 | ]
160 | num_en_tones = 4
161 |
162 | # combine all symbols
163 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
164 | symbols = [pad] + normal_symbols + pu_symbols
165 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
166 |
167 | # combine all tones
168 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
169 |
170 | # language maps
171 | language_id_map = {"ZH": 0, "JA": 1, "EN": 2}
172 | num_languages = len(language_id_map.keys())
173 |
174 | language_tone_start_map = {
175 | "ZH": 0,
176 | "JA": num_zh_tones,
177 | "EN": num_zh_tones + num_ja_tones,
178 | }
179 |
180 | if __name__ == "__main__":
181 | a = set(zh_symbols)
182 | b = set(en_symbols)
183 | print(sorted(a & b))
184 |
--------------------------------------------------------------------------------
/update_status.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gradio as gr
3 |
4 | lang_dict = {"EN(英文)": "_en", "ZH(中文)": "_zh", "JP(日语)": "_jp"}
5 |
6 |
7 | def raw_dir_convert_to_path(target_dir: str, lang):
8 | res = target_dir.rstrip("/").rstrip("\\")
9 | if (not target_dir.startswith("raw")) and (not target_dir.startswith("./raw")):
10 | res = os.path.join("./raw", res)
11 | if (
12 | (not res.endswith("_zh"))
13 | and (not res.endswith("_jp"))
14 | and (not res.endswith("_en"))
15 | ):
16 | res += lang_dict[lang]
17 | return res
18 |
19 |
20 | def update_g_files():
21 | g_files = []
22 | cnt = 0
23 | for root, dirs, files in os.walk(os.path.abspath("./logs")):
24 | for file in files:
25 | if file.startswith("G_") and file.endswith(".pth"):
26 | g_files.append(os.path.join(root, file))
27 | cnt += 1
28 | print(g_files)
29 | return f"更新模型列表完成, 共找到{cnt}个模型", gr.Dropdown.update(choices=g_files)
30 |
31 |
32 | def update_c_files():
33 | c_files = []
34 | cnt = 0
35 | for root, dirs, files in os.walk(os.path.abspath("./logs")):
36 | for file in files:
37 | if file.startswith("config.json"):
38 | c_files.append(os.path.join(root, file))
39 | cnt += 1
40 | print(c_files)
41 | return f"更新模型列表完成, 共找到{cnt}个配置文件", gr.Dropdown.update(choices=c_files)
42 |
43 |
44 | def update_model_folders():
45 | subdirs = []
46 | cnt = 0
47 | for root, dirs, files in os.walk(os.path.abspath("./logs")):
48 | for dir_name in dirs:
49 | if os.path.basename(dir_name) != "eval":
50 | subdirs.append(os.path.join(root, dir_name))
51 | cnt += 1
52 | print(subdirs)
53 | return f"更新模型文件夹列表完成, 共找到{cnt}个文件夹", gr.Dropdown.update(choices=subdirs)
54 |
55 |
56 | def update_wav_lab_pairs():
57 | wav_count = tot_count = 0
58 | for root, _, files in os.walk("./raw"):
59 | for file in files:
60 | # print(file)
61 | file_path = os.path.join(root, file)
62 | if file.lower().endswith(".wav"):
63 | lab_file = os.path.splitext(file_path)[0] + ".lab"
64 | if os.path.exists(lab_file):
65 | wav_count += 1
66 | tot_count += 1
67 | return f"{wav_count} / {tot_count}"
68 |
69 |
70 | def update_raw_folders():
71 | subdirs = []
72 | cnt = 0
73 |     script_path = os.path.dirname(os.path.abspath(__file__))  # absolute path of the script's directory
74 | raw_path = os.path.join(script_path, "raw")
75 | print(raw_path)
76 | os.makedirs(raw_path, exist_ok=True)
77 | for root, dirs, files in os.walk(raw_path):
78 | for dir_name in dirs:
79 | relative_path = os.path.relpath(
80 | os.path.join(root, dir_name), script_path
81 |             )  # path relative to the script directory
82 | subdirs.append(relative_path)
83 | cnt += 1
84 | print(subdirs)
85 | return (
86 | f"更新raw音频文件夹列表完成, 共找到{cnt}个文件夹",
87 | gr.Dropdown.update(choices=subdirs),
88 | gr.Textbox.update(value=update_wav_lab_pairs()),
89 | )
90 |
--------------------------------------------------------------------------------
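
A hedged sketch of how these helpers are typically wired into a Gradio UI, assuming the Gradio 3.x API that `gr.Dropdown.update` implies (component names here are illustrative, not taken from the project's webui):

```python
import gradio as gr
from update_status import update_g_files

with gr.Blocks() as app:
    status = gr.Textbox(label="status")
    model_dd = gr.Dropdown(choices=[], label="model checkpoint")
    refresh = gr.Button("refresh model list")
    # update_g_files returns (message, Dropdown.update), matching these outputs
    refresh.click(update_g_files, outputs=[status, model_dd])

app.launch()
```
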
/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
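
Because every language's tones are packed into one global range, the offsets in `language_tone_start_map` above compose additively; a quick check of the arithmetic, using this file's values:

```python
from text.symbols import (
    num_tones, num_zh_tones, num_ja_tones, num_en_tones, language_tone_start_map,
)

# ZH occupies tones 0..5, JP 6..7, EN 8..11
assert num_tones == num_zh_tones + num_ja_tones + num_en_tones == 12
assert language_tone_start_map["JP"] == num_zh_tones == 6
assert language_tone_start_map["EN"] + num_en_tones == num_tones
```
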
/oldVersion/V110/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 1
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 1
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V200/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V210/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V220/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/onnx_modules/V220/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/onnx_modules/V230/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/onnx_modules/V220_novq_dev/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V200/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | @Desc: Compatibility with version 2.0 (covers 2.0.1 and 2.0.2-fix)
3 | """
4 | import torch
5 | import commons
6 | from .text import cleaned_text_to_sequence, get_bert
7 | from .text.cleaner import clean_text
8 |
9 |
10 | def get_text(text, language_str, hps, device):
11 |     # this version's get_text implementation
12 | norm_text, phone, tone, word2ph = clean_text(text, language_str)
13 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
14 |
15 | if hps.data.add_blank:
16 | phone = commons.intersperse(phone, 0)
17 | tone = commons.intersperse(tone, 0)
18 | language = commons.intersperse(language, 0)
19 | for i in range(len(word2ph)):
20 | word2ph[i] = word2ph[i] * 2
21 | word2ph[0] += 1
22 | bert_ori = get_bert(norm_text, word2ph, language_str, device)
23 | del word2ph
24 | assert bert_ori.shape[-1] == len(phone), phone
25 |
26 | if language_str == "ZH":
27 | bert = bert_ori
28 | ja_bert = torch.zeros(1024, len(phone))
29 | en_bert = torch.zeros(1024, len(phone))
30 | elif language_str == "JP":
31 | bert = torch.zeros(1024, len(phone))
32 | ja_bert = bert_ori
33 | en_bert = torch.zeros(1024, len(phone))
34 | elif language_str == "EN":
35 | bert = torch.zeros(1024, len(phone))
36 | ja_bert = torch.zeros(1024, len(phone))
37 | en_bert = bert_ori
38 | else:
39 | raise ValueError("language_str should be ZH, JP or EN")
40 |
41 | assert bert.shape[-1] == len(
42 | phone
43 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"
44 |
45 | phone = torch.LongTensor(phone)
46 | tone = torch.LongTensor(tone)
47 | language = torch.LongTensor(language)
48 | return bert, ja_bert, en_bert, phone, tone, language
49 |
50 |
51 | def infer(
52 | text,
53 | sdp_ratio,
54 | noise_scale,
55 | noise_scale_w,
56 | length_scale,
57 | sid,
58 | language,
59 | hps,
60 | net_g,
61 | device,
62 | ):
63 | bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
64 | text, language, hps, device
65 | )
66 | with torch.no_grad():
67 | x_tst = phones.to(device).unsqueeze(0)
68 | tones = tones.to(device).unsqueeze(0)
69 | lang_ids = lang_ids.to(device).unsqueeze(0)
70 | bert = bert.to(device).unsqueeze(0)
71 | ja_bert = ja_bert.to(device).unsqueeze(0)
72 | en_bert = en_bert.to(device).unsqueeze(0)
73 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
74 | del phones
75 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
76 | audio = (
77 | net_g.infer(
78 | x_tst,
79 | x_tst_lengths,
80 | speakers,
81 | tones,
82 | lang_ids,
83 | bert,
84 | ja_bert,
85 | en_bert,
86 | sdp_ratio=sdp_ratio,
87 | noise_scale=noise_scale,
88 | noise_scale_w=noise_scale_w,
89 | length_scale=length_scale,
90 | )[0][0, 0]
91 | .data.cpu()
92 | .float()
93 | .numpy()
94 | )
95 | del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert
96 | if torch.cuda.is_available():
97 | torch.cuda.empty_cache()
98 | return audio
99 |
--------------------------------------------------------------------------------
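
A hedged usage sketch for the V2.0-compatible `infer` above, assuming `hps` and `net_g` were loaded elsewhere (e.g. `utils.get_hparams_from_file` plus a restored checkpoint) and that `"speaker_name"` is a hypothetical key in `hps.data.spk2id`:

```python
audio = infer(
    "Hello, world.",
    sdp_ratio=0.2,
    noise_scale=0.6,
    noise_scale_w=0.8,
    length_scale=1.0,
    sid="speaker_name",  # hypothetical speaker key
    language="EN",
    hps=hps,
    net_g=net_g,
    device="cuda",
)
# audio is a float numpy array at hps.data.sampling_rate
```
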
/onnx_modules/V200/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/onnx_modules/V210/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V101/text/japanese.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
2 | import re
3 | import sys
4 |
5 | import pyopenjtalk
6 |
7 | from .symbols import symbols
8 |
9 | # Regular expression matching Japanese without punctuation marks:
10 | _japanese_characters = re.compile(
11 | r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
12 | )
13 |
14 | # Regular expression matching non-Japanese characters or punctuation marks:
15 | _japanese_marks = re.compile(
16 | r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
17 | )
18 |
19 | # List of (symbol, Japanese) pairs for marks:
20 | _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]]
21 |
22 |
23 | # List of (consonant, sokuon) pairs:
24 | _real_sokuon = [
25 | (re.compile("%s" % x[0]), x[1])
26 | for x in [
27 | (r"Q([↑↓]*[kg])", r"k#\1"),
28 | (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
29 | (r"Q([↑↓]*[sʃ])", r"s\1"),
30 | (r"Q([↑↓]*[pb])", r"p#\1"),
31 | ]
32 | ]
33 |
34 | # List of (consonant, hatsuon) pairs:
35 | _real_hatsuon = [
36 | (re.compile("%s" % x[0]), x[1])
37 | for x in [
38 | (r"N([↑↓]*[pbm])", r"m\1"),
39 | (r"N([↑↓]*[ʧʥj])", r"n^\1"),
40 | (r"N([↑↓]*[tdn])", r"n\1"),
41 | (r"N([↑↓]*[kg])", r"ŋ\1"),
42 | ]
43 | ]
44 |
45 |
46 | def post_replace_ph(ph):
47 | rep_map = {
48 | ":": ",",
49 | ";": ",",
50 | ",": ",",
51 | "。": ".",
52 | "!": "!",
53 | "?": "?",
54 | "\n": ".",
55 | "·": ",",
56 | "、": ",",
57 | "...": "…",
58 | "v": "V",
59 | }
60 | if ph in rep_map.keys():
61 | ph = rep_map[ph]
62 | if ph in symbols:
63 | return ph
64 | if ph not in symbols:
65 | ph = "UNK"
66 | return ph
67 |
68 |
69 | def symbols_to_japanese(text):
70 | for regex, replacement in _symbols_to_japanese:
71 | text = re.sub(regex, replacement, text)
72 | return text
73 |
74 |
75 | def preprocess_jap(text):
76 | """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
77 | text = symbols_to_japanese(text)
78 | sentences = re.split(_japanese_marks, text)
79 | marks = re.findall(_japanese_marks, text)
80 | text = []
81 | for i, sentence in enumerate(sentences):
82 | if re.match(_japanese_characters, sentence):
83 | p = pyopenjtalk.g2p(sentence)
84 | text += p.split(" ")
85 |
86 | if i < len(marks):
87 | text += [marks[i].replace(" ", "")]
88 | return text
89 |
90 |
91 | def text_normalize(text):
92 |     # TODO: Japanese text normalization is not yet implemented
93 | return text
94 |
95 |
96 | def g2p(norm_text):
97 | phones = preprocess_jap(norm_text)
98 | phones = [post_replace_ph(i) for i in phones]
99 |     # TODO: implement tones and word2ph
100 | tones = [0 for i in phones]
101 | word2ph = [1 for i in phones]
102 | return phones, tones, word2ph
103 |
104 |
105 | if __name__ == "__main__":
106 | for line in open("../../../Downloads/transcript_utf8.txt").readlines():
107 | text = line.split(":")[1]
108 | phones, tones, word2ph = g2p(text)
109 | for p in phones:
110 | if p == "z":
111 | print(text, phones)
112 | sys.exit(0)
113 |
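For reference, a minimal usage sketch of the `g2p` pipeline above. This is hedged: it assumes `pyopenjtalk` and its dictionary are installed and that this module is importable as part of its package; the import path and the phoneme output shown are indicative only and may vary across pyopenjtalk versions.

```python
# Hypothetical import path; adjust to wherever this package sits on sys.path.
from oldVersion.V101.text.japanese import g2p, text_normalize

text = text_normalize("こんにちは、世界。")
phones, tones, word2ph = g2p(text)
print(phones)   # e.g. ['k', 'o', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', ',', ...]
print(tones)    # all zeros: tone prediction is not implemented in this V101 version
print(word2ph)  # all ones: word2ph is a per-phone placeholder here
```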
--------------------------------------------------------------------------------
/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 |
8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9 |
10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11 |
12 | models = dict()
13 |
14 |
15 | def get_bert_feature(
16 | text,
17 | word2ph,
18 | device=config.bert_gen_config.device,
19 | style_text=None,
20 | style_weight=0.7,
21 | ):
22 | if (
23 | sys.platform == "darwin"
24 | and torch.backends.mps.is_available()
25 | and device == "cpu"
26 | ):
27 | device = "mps"
28 | if not device:
29 | device = "cuda"
30 | if device not in models.keys():
31 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
32 | with torch.no_grad():
33 | inputs = tokenizer(text, return_tensors="pt")
34 | for i in inputs:
35 | inputs[i] = inputs[i].to(device)
36 | res = models[device](**inputs, output_hidden_states=True)
37 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
38 | if style_text:
39 | style_inputs = tokenizer(style_text, return_tensors="pt")
40 | for i in style_inputs:
41 | style_inputs[i] = style_inputs[i].to(device)
42 | style_res = models[device](**style_inputs, output_hidden_states=True)
43 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
44 | style_res_mean = style_res.mean(0)
45 | assert len(word2ph) == len(text) + 2
46 | word2phone = word2ph
47 | phone_level_feature = []
48 | for i in range(len(word2phone)):
49 | if style_text:
50 | repeat_feature = (
51 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
52 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
53 | )
54 | else:
55 | repeat_feature = res[i].repeat(word2phone[i], 1)
56 | phone_level_feature.append(repeat_feature)
57 |
58 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
59 |
60 | return phone_level_feature.T
61 |
62 |
63 | if __name__ == "__main__":
64 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each with a 1024-dim feature
65 | word2phone = [
66 | 1,
67 | 2,
68 | 1,
69 | 2,
70 | 2,
71 | 1,
72 | 2,
73 | 2,
74 | 1,
75 | 2,
76 | 2,
77 | 1,
78 | 2,
79 | 2,
80 | 2,
81 | 2,
82 | 2,
83 | 1,
84 | 1,
85 | 2,
86 | 2,
87 | 1,
88 | 2,
89 | 2,
90 | 2,
91 | 2,
92 | 1,
93 | 2,
94 | 2,
95 | 2,
96 | 2,
97 | 2,
98 | 1,
99 | 2,
100 | 2,
101 | 2,
102 | 2,
103 | 1,
104 | ]
105 |
106 |     # compute the total number of frames
107 | total_frames = sum(word2phone)
108 | print(word_level_feature.shape)
109 | print(word2phone)
110 | phone_level_feature = []
111 | for i in range(len(word2phone)):
112 | print(word_level_feature[i].shape)
113 |
114 |         # repeat each word's feature word2phone[i] times
115 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
116 | phone_level_feature.append(repeat_feature)
117 |
118 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
119 |     print(phone_level_feature.shape)  # torch.Size([65, 1024]), i.e. (sum(word2phone), 1024)
120 |
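The `style_text` branch above mixes each character's BERT feature with the mean feature of a reference text, weighted by `style_weight`. Below is a self-contained sketch of that blend, with dummy tensors standing in for BERT outputs:

```python
import torch

style_weight = 0.7
res = torch.rand(5, 1024)          # per-character features of the input text
style_res_mean = torch.rand(1024)  # mean feature of the style reference text
word2phone = [1, 2, 1, 3, 1]       # phones per character

# Each character's feature is repeated once per phone, then linearly
# interpolated toward the style mean by style_weight.
phone_level = torch.cat(
    [
        res[i].repeat(word2phone[i], 1) * (1 - style_weight)
        + style_res_mean.repeat(word2phone[i], 1) * style_weight
        for i in range(len(word2phone))
    ],
    dim=0,
)
print(phone_level.shape)  # torch.Size([8, 1024]) == (sum(word2phone), 1024)
```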
--------------------------------------------------------------------------------
/oldVersion/V210/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 |
8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9 |
10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11 |
12 | models = dict()
13 |
14 |
15 | def get_bert_feature(
16 | text,
17 | word2ph,
18 | device=config.bert_gen_config.device,
19 | style_text=None,
20 | style_weight=0.7,
21 | ):
22 | if (
23 | sys.platform == "darwin"
24 | and torch.backends.mps.is_available()
25 | and device == "cpu"
26 | ):
27 | device = "mps"
28 | if not device:
29 | device = "cuda"
30 | if device not in models.keys():
31 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
32 | with torch.no_grad():
33 | inputs = tokenizer(text, return_tensors="pt")
34 | for i in inputs:
35 | inputs[i] = inputs[i].to(device)
36 | res = models[device](**inputs, output_hidden_states=True)
37 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
38 | if style_text:
39 | style_inputs = tokenizer(style_text, return_tensors="pt")
40 | for i in style_inputs:
41 | style_inputs[i] = style_inputs[i].to(device)
42 | style_res = models[device](**style_inputs, output_hidden_states=True)
43 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
44 | style_res_mean = style_res.mean(0)
45 |
46 | assert len(word2ph) == len(text) + 2
47 | word2phone = word2ph
48 | phone_level_feature = []
49 | for i in range(len(word2phone)):
50 | if style_text:
51 | repeat_feature = (
52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
54 | )
55 | else:
56 | repeat_feature = res[i].repeat(word2phone[i], 1)
57 | phone_level_feature.append(repeat_feature)
58 |
59 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
60 |
61 | return phone_level_feature.T
62 |
63 |
64 | if __name__ == "__main__":
65 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each with a 1024-dim feature
66 | word2phone = [
67 | 1,
68 | 2,
69 | 1,
70 | 2,
71 | 2,
72 | 1,
73 | 2,
74 | 2,
75 | 1,
76 | 2,
77 | 2,
78 | 1,
79 | 2,
80 | 2,
81 | 2,
82 | 2,
83 | 2,
84 | 1,
85 | 1,
86 | 2,
87 | 2,
88 | 1,
89 | 2,
90 | 2,
91 | 2,
92 | 2,
93 | 1,
94 | 2,
95 | 2,
96 | 2,
97 | 2,
98 | 2,
99 | 1,
100 | 2,
101 | 2,
102 | 2,
103 | 2,
104 | 1,
105 | ]
106 |
107 |     # compute the total number of frames
108 | total_frames = sum(word2phone)
109 | print(word_level_feature.shape)
110 | print(word2phone)
111 | phone_level_feature = []
112 | for i in range(len(word2phone)):
113 | print(word_level_feature[i].shape)
114 |
115 |         # repeat each word's feature word2phone[i] times
116 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
117 | phone_level_feature.append(repeat_feature)
118 |
119 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
120 |     print(phone_level_feature.shape)  # torch.Size([65, 1024]), i.e. (sum(word2phone), 1024)
121 |
--------------------------------------------------------------------------------
/oldVersion/V220/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 |
8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9 |
10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11 |
12 | models = dict()
13 |
14 |
15 | def get_bert_feature(
16 | text,
17 | word2ph,
18 | device=config.bert_gen_config.device,
19 | style_text=None,
20 | style_weight=0.7,
21 | ):
22 | if (
23 | sys.platform == "darwin"
24 | and torch.backends.mps.is_available()
25 | and device == "cpu"
26 | ):
27 | device = "mps"
28 | if not device:
29 | device = "cuda"
30 | if device not in models.keys():
31 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
32 | with torch.no_grad():
33 | inputs = tokenizer(text, return_tensors="pt")
34 | for i in inputs:
35 | inputs[i] = inputs[i].to(device)
36 | res = models[device](**inputs, output_hidden_states=True)
37 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
38 | if style_text:
39 | style_inputs = tokenizer(style_text, return_tensors="pt")
40 | for i in style_inputs:
41 | style_inputs[i] = style_inputs[i].to(device)
42 | style_res = models[device](**style_inputs, output_hidden_states=True)
43 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
44 | style_res_mean = style_res.mean(0)
45 |
46 | assert len(word2ph) == len(text) + 2
47 | word2phone = word2ph
48 | phone_level_feature = []
49 | for i in range(len(word2phone)):
50 | if style_text:
51 | repeat_feature = (
52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
54 | )
55 | else:
56 | repeat_feature = res[i].repeat(word2phone[i], 1)
57 | phone_level_feature.append(repeat_feature)
58 |
59 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
60 |
61 | return phone_level_feature.T
62 |
63 |
64 | if __name__ == "__main__":
65 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each with a 1024-dim feature
66 | word2phone = [
67 | 1,
68 | 2,
69 | 1,
70 | 2,
71 | 2,
72 | 1,
73 | 2,
74 | 2,
75 | 1,
76 | 2,
77 | 2,
78 | 1,
79 | 2,
80 | 2,
81 | 2,
82 | 2,
83 | 2,
84 | 1,
85 | 1,
86 | 2,
87 | 2,
88 | 1,
89 | 2,
90 | 2,
91 | 2,
92 | 2,
93 | 1,
94 | 2,
95 | 2,
96 | 2,
97 | 2,
98 | 2,
99 | 1,
100 | 2,
101 | 2,
102 | 2,
103 | 2,
104 | 1,
105 | ]
106 |
107 |     # compute the total number of frames
108 | total_frames = sum(word2phone)
109 | print(word_level_feature.shape)
110 | print(word2phone)
111 | phone_level_feature = []
112 | for i in range(len(word2phone)):
113 | print(word_level_feature[i].shape)
114 |
115 |         # repeat each word's feature word2phone[i] times
116 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
117 | phone_level_feature.append(repeat_feature)
118 |
119 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
120 |     print(phone_level_feature.shape)  # torch.Size([65, 1024]), i.e. (sum(word2phone), 1024)
121 |
--------------------------------------------------------------------------------
/bert/deberta-v3-large/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | language: en
3 | tags:
4 | - deberta
5 | - deberta-v3
6 | - fill-mask
7 | thumbnail: https://huggingface.co/front/thumbnails/microsoft.png
8 | license: mit
9 | ---
10 |
11 | ## DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing
12 |
13 | [DeBERTa](https://arxiv.org/abs/2006.03654) improves the BERT and RoBERTa models using disentangled attention and an enhanced mask decoder. With those two improvements, DeBERTa outperforms RoBERTa on a majority of NLU tasks given 80GB of training data.
14 |
15 | In [DeBERTa V3](https://arxiv.org/abs/2111.09543), we further improved the efficiency of DeBERTa using ELECTRA-style pre-training with gradient-disentangled embedding sharing. Compared to DeBERTa, our V3 version significantly improves model performance on downstream tasks. You can find more technical details about the new model in our [paper](https://arxiv.org/abs/2111.09543).
16 |
17 | Please check the [official repository](https://github.com/microsoft/DeBERTa) for more implementation details and updates.
18 |
19 | The DeBERTa V3 large model comes with 24 layers and a hidden size of 1024. It has 304M backbone parameters and a vocabulary of 128K tokens, which introduces 131M parameters in the embedding layer. This model was trained on the same 160GB of data as DeBERTa V2.
20 |
21 |
22 | #### Fine-tuning on NLU tasks
23 |
24 | We present the dev results on SQuAD 2.0 and MNLI tasks.
25 |
26 | | Model |Vocabulary(K)|Backbone #Params(M)| SQuAD 2.0(F1/EM) | MNLI-m/mm(ACC)|
27 | |-------------------|----------|-------------------|-----------|----------|
28 | | RoBERTa-large |50 |304 | 89.4/86.5 | 90.2 |
29 | | XLNet-large |32 |- | 90.6/87.9 | 90.8 |
30 | | DeBERTa-large |50 |- | 90.7/88.0 | 91.3 |
31 | | **DeBERTa-v3-large**|128|304 | **91.5/89.0**| **91.8/91.9**|
32 |
33 |
34 | #### Fine-tuning with HF transformers
35 |
36 | ```bash
37 | #!/bin/bash
38 |
39 | cd transformers/examples/pytorch/text-classification/
40 |
41 | pip install datasets
42 | export TASK_NAME=mnli
43 |
44 | output_dir="ds_results"
45 |
46 | num_gpus=8
47 |
48 | batch_size=8
49 |
50 | python -m torch.distributed.launch --nproc_per_node=${num_gpus} \
51 | run_glue.py \
52 | --model_name_or_path microsoft/deberta-v3-large \
53 | --task_name $TASK_NAME \
54 | --do_train \
55 | --do_eval \
56 | --evaluation_strategy steps \
57 | --max_seq_length 256 \
58 | --warmup_steps 50 \
59 | --per_device_train_batch_size ${batch_size} \
60 | --learning_rate 6e-6 \
61 | --num_train_epochs 2 \
62 | --output_dir $output_dir \
63 | --overwrite_output_dir \
64 | --logging_steps 1000 \
65 | --logging_dir $output_dir
66 |
67 | ```
68 |
69 | ### Citation
70 |
71 | If you find DeBERTa useful for your work, please cite the following papers:
72 |
73 | ```latex
74 | @misc{he2021debertav3,
75 | title={DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing},
76 | author={Pengcheng He and Jianfeng Gao and Weizhu Chen},
77 | year={2021},
78 | eprint={2111.09543},
79 | archivePrefix={arXiv},
80 | primaryClass={cs.CL}
81 | }
82 | ```
83 |
84 | ```latex
85 | @inproceedings{
86 | he2021deberta,
87 | title={DEBERTA: DECODING-ENHANCED BERT WITH DISENTANGLED ATTENTION},
88 | author={Pengcheng He and Xiaodong Liu and Jianfeng Gao and Weizhu Chen},
89 | booktitle={International Conference on Learning Representations},
90 | year={2021},
91 | url={https://openreview.net/forum?id=XPZIaotutsD}
92 | }
93 | ```
94 |
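Beyond fine-tuning, the checkpoint can be sanity-checked with masked-token prediction. Below is a minimal inference sketch; `microsoft/deberta-v3-large` is the upstream hub ID for these files, and this repository vendors the same weights under `./bert/deberta-v3-large`, which should also work as a local `model` path:

```python
# Requires the `sentencepiece` package for the DeBERTa-v3 tokenizer.
from transformers import pipeline

fill = pipeline("fill-mask", model="microsoft/deberta-v3-large")
for pred in fill("The goal of life is [MASK]."):
    print(f"{pred['token_str']!r}: {pred['score']:.3f}")
```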
--------------------------------------------------------------------------------
/oldVersion/V210/emo_gen.py:
--------------------------------------------------------------------------------
1 | import librosa
2 | import numpy as np
3 | import torch
4 | import torch.nn as nn
5 | from torch.utils.data import Dataset
6 |
7 | from transformers import Wav2Vec2Processor
8 | from transformers.models.wav2vec2.modeling_wav2vec2 import (
9 | Wav2Vec2Model,
10 | Wav2Vec2PreTrainedModel,
11 | )
12 |
13 | from config import config
14 |
15 |
16 | class RegressionHead(nn.Module):
17 | r"""Classification head."""
18 |
19 | def __init__(self, config):
20 | super().__init__()
21 |
22 | self.dense = nn.Linear(config.hidden_size, config.hidden_size)
23 | self.dropout = nn.Dropout(config.final_dropout)
24 | self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
25 |
26 | def forward(self, features, **kwargs):
27 | x = features
28 | x = self.dropout(x)
29 | x = self.dense(x)
30 | x = torch.tanh(x)
31 | x = self.dropout(x)
32 | x = self.out_proj(x)
33 |
34 | return x
35 |
36 |
37 | class EmotionModel(Wav2Vec2PreTrainedModel):
38 | r"""Speech emotion classifier."""
39 |
40 | def __init__(self, config):
41 | super().__init__(config)
42 |
43 | self.config = config
44 | self.wav2vec2 = Wav2Vec2Model(config)
45 | self.classifier = RegressionHead(config)
46 | self.init_weights()
47 |
48 | def forward(
49 | self,
50 | input_values,
51 | ):
52 | outputs = self.wav2vec2(input_values)
53 | hidden_states = outputs[0]
54 | hidden_states = torch.mean(hidden_states, dim=1)
55 | logits = self.classifier(hidden_states)
56 |
57 | return hidden_states, logits
58 |
59 |
60 | class AudioDataset(Dataset):
61 | def __init__(self, list_of_wav_files, sr, processor):
62 | self.list_of_wav_files = list_of_wav_files
63 | self.processor = processor
64 | self.sr = sr
65 |
66 | def __len__(self):
67 | return len(self.list_of_wav_files)
68 |
69 | def __getitem__(self, idx):
70 | wav_file = self.list_of_wav_files[idx]
71 | audio_data, _ = librosa.load(wav_file, sr=self.sr)
72 | processed_data = self.processor(audio_data, sampling_rate=self.sr)[
73 | "input_values"
74 | ][0]
75 | return torch.from_numpy(processed_data)
76 |
77 |
78 | device = config.emo_gen_config.device
79 | model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim"
80 | processor = Wav2Vec2Processor.from_pretrained(model_name)
81 | model = EmotionModel.from_pretrained(model_name).to(device)
82 |
83 |
84 | def process_func(
85 | x: np.ndarray,
86 | sampling_rate: int,
87 | model: EmotionModel,
88 | processor: Wav2Vec2Processor,
89 | device: str,
90 | embeddings: bool = False,
91 | ) -> np.ndarray:
92 | r"""Predict emotions or extract embeddings from raw audio signal."""
93 | model = model.to(device)
94 | y = processor(x, sampling_rate=sampling_rate)
95 | y = y["input_values"][0]
96 | y = torch.from_numpy(y).unsqueeze(0).to(device)
97 |
98 | # run through model
99 | with torch.no_grad():
100 | y = model(y)[0 if embeddings else 1]
101 |
102 | # convert to numpy
103 | y = y.detach().cpu().numpy()
104 |
105 | return y
106 |
107 |
108 | def get_emo(path):
109 |     wav, sr = librosa.load(path, sr=16000)
110 | return process_func(
111 | np.expand_dims(wav, 0).astype(np.float64),
112 | sr,
113 | model,
114 | processor,
115 | device,
116 | embeddings=True,
117 | ).squeeze(0)
118 |
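For reference, a minimal usage sketch of `get_emo` above. This is hedged: the wav path is a placeholder, and it assumes the module-level `model`/`processor` loaded successfully from `./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim`:

```python
# Run after importing this module; any mono wav works, since librosa
# resamples it to 16 kHz inside get_emo.
emb = get_emo("path/to/sample.wav")  # placeholder path
print(emb.shape)  # (1024,) -- mean-pooled wav2vec2 hidden states used as the emotion embedding
```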
--------------------------------------------------------------------------------