├── .gitmodules
├── oldVersion
│   ├── V111
│   │   └── text
│   │       ├── fix
│   │       │   ├── __init__.py
│   │       │   └── japanese_bert.py
│   │       ├── english_bert_mock.py
│   │       ├── japanese_bert.py
│   │       ├── cleaner.py
│   │       ├── __init__.py
│   │       ├── chinese_bert.py
│   │       └── symbols.py
│   ├── __init__.py
│   ├── V200
│   │   ├── text
│   │   │   ├── cmudict_cache.pickle
│   │   │   ├── bert_utils.py
│   │   │   ├── cleaner.py
│   │   │   ├── english_bert_mock.py
│   │   │   ├── __init__.py
│   │   │   ├── japanese_bert.py
│   │   │   ├── chinese_bert.py
│   │   │   └── symbols.py
│   │   └── __init__.py
│   ├── V210
│   │   ├── text
│   │   │   ├── cmudict_cache.pickle
│   │   │   ├── bert_utils.py
│   │   │   ├── cleaner.py
│   │   │   ├── __init__.py
│   │   │   ├── english_bert_mock.py
│   │   │   ├── japanese_bert.py
│   │   │   ├── symbols.py
│   │   │   └── chinese_bert.py
│   │   └── emo_gen.py
│   ├── V101
│   │   ├── text
│   │   │   ├── english_bert_mock.py
│   │   │   ├── cleaner.py
│   │   │   ├── __init__.py
│   │   │   ├── chinese_bert.py
│   │   │   ├── symbols.py
│   │   │   └── japanese.py
│   │   └── __init__.py
│   └── V110
│       ├── text
│       │   ├── english_bert_mock.py
│       │   ├── cleaner.py
│       │   ├── __init__.py
│       │   ├── japanese_bert.py
│       │   ├── chinese_bert.py
│       │   └── symbols.py
│       └── __init__.py
├── tools
│   ├── __init__.py
│   ├── log.py
│   └── translate.py
├── bert
│   ├── chinese-roberta-wwm-ext-large
│   │   ├── added_tokens.json
│   │   ├── tokenizer_config.json
│   │   ├── special_tokens_map.json
│   │   ├── .gitattributes
│   │   ├── config.json
│   │   └── README.md
│   ├── deberta-v3-large
│   │   ├── tokenizer_config.json
│   │   ├── generator_config.json
│   │   ├── config.json
│   │   ├── .gitattributes
│   │   └── README.md
│   ├── deberta-v2-large-japanese-char-wwm
│   │   ├── special_tokens_map.json
│   │   ├── tokenizer_config.json
│   │   ├── config.json
│   │   └── .gitattributes
│   ├── deberta-v2-large-japanese
│   │   ├── special_tokens_map.json
│   │   ├── tokenizer_config.json
│   │   ├── config.json
│   │   └── .gitattributes
│   ├── bert-base-japanese-v3
│   │   ├── tokenizer_config.json
│   │   ├── config.json
│   │   ├── .gitattributes
│   │   └── README.md
│   ├── bert-large-japanese-v2
│   │   ├── tokenizer_config.json
│   │   ├── config.json
│   │   ├── .gitattributes
│   │   └── README.md
│   └── bert_models.json
├── onnx_modules
│   ├── V200
│   │   ├── text
│   │   │   ├── __init__.py
│   │   │   ├── bert_utils.py
│   │   │   ├── cleaner.py
│   │   │   ├── english_bert_mock.py
│   │   │   ├── japanese_bert.py
│   │   │   ├── chinese_bert.py
│   │   │   └── symbols.py
│   │   └── __init__.py
│   ├── V210
│   │   ├── text
│   │   │   ├── __init__.py
│   │   │   └── symbols.py
│   │   └── __init__.py
│   ├── V220
│   │   ├── text
│   │   │   ├── __init__.py
│   │   │   └── symbols.py
│   │   └── __init__.py
│   ├── V230
│   │   ├── text
│   │   │   ├── __init__.py
│   │   │   └── symbols.py
│   │   └── __init__.py
│   ├── V220_novq_dev
│   │   ├── text
│   │   │   ├── __init__.py
│   │   │   └── symbols.py
│   │   └── __init__.py
│   └── __init__.py
├── emotional
│   ├── wav2vec2-large-robust-12-ft-emotion-msp-dim
│   │   ├── vocab.json
│   │   ├── preprocessor_config.json
│   │   ├── .gitattributes
│   │   └── config.json
│   └── clap-htsat-fused
│       ├── special_tokens_map.json
│       ├── tokenizer_config.json
│       ├── preprocessor_config.json
│       └── .gitattributes
├── img
│   ├── 宵宫.png
│   ├── yuyu.png
│   ├── 参数说明.png
│   ├── 神里绫华.png
│   ├── 纳西妲.png
│   ├── bert-vits2-e.png
│   └── 微信图片_20231010105112.png
├── text
│   ├── cmudict_cache.pickle
│   ├── bert_utils.py
│   ├── cleaner.py
│   ├── __init__.py
│   ├── english_bert_mock.py
│   ├── japanese_bert.py
│   ├── symbols.py
│   └── chinese_bert.py
├── slm
│   └── wavlm-base-plus
│       ├── preprocessor_config.json
│       ├── .gitattributes
│       └── config.json
├── css
│   └── custom.css
├── requirements.txt
├── export_onnx.py
├── monotonic_align
│   ├── __init__.py
│   └── core.py
├── .pre-commit-config.yaml
├── run_MnodesAndMgpus.sh
├── clap_wrapper.py
├── onnx_infer.py
├── resample_legacy.py
├── resample.py
├── clap_gen.py
├── .vscode
│   └── launch.json
├── motion
│   ├── record.py
│   ├── wav_to_visemes.py
│   ├── prepare_visemes.py
│   ├── visemes_tools.py
│   └── data.ipynb
├── Improvement_2025.md
├── re_matching.py
├── compress_model.py
├── configs
│   └── config.json
├── spec_gen.py
├── bert_gen.py
└── update_status.py
/.gitmodules:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/fix/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility package
3 | """
4 |
--------------------------------------------------------------------------------
/oldVersion/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Inference compatibility for legacy model versions
3 | """
4 |
--------------------------------------------------------------------------------
/bert/chinese-roberta-wwm-ext-large/added_tokens.json:
--------------------------------------------------------------------------------
1 | {}
2 |
--------------------------------------------------------------------------------
/onnx_modules/V200/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 |
--------------------------------------------------------------------------------
/onnx_modules/V210/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 |
--------------------------------------------------------------------------------
/onnx_modules/V220/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 |
--------------------------------------------------------------------------------
/onnx_modules/V230/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 |
--------------------------------------------------------------------------------
/emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/vocab.json:
--------------------------------------------------------------------------------
1 | {}
2 |
--------------------------------------------------------------------------------
/onnx_modules/V220_novq_dev/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 |
--------------------------------------------------------------------------------
/img/宵宫.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/img/宵宫.png
--------------------------------------------------------------------------------
/bert/chinese-roberta-wwm-ext-large/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {"init_inputs": []}
2 |
--------------------------------------------------------------------------------
/img/yuyu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/img/yuyu.png
--------------------------------------------------------------------------------
/img/参数说明.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/img/参数说明.png
--------------------------------------------------------------------------------
/img/神里绫华.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/img/神里绫华.png
--------------------------------------------------------------------------------
/img/纳西妲.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/img/纳西妲.png
--------------------------------------------------------------------------------
/img/bert-vits2-e.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/img/bert-vits2-e.png
--------------------------------------------------------------------------------
/text/cmudict_cache.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/text/cmudict_cache.pickle
--------------------------------------------------------------------------------
/bert/deberta-v3-large/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "do_lower_case": false,
3 | "vocab_type": "spm"
4 | }
5 |
--------------------------------------------------------------------------------
/img/微信图片_20231010105112.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/img/微信图片_20231010105112.png
--------------------------------------------------------------------------------
/oldVersion/V200/text/cmudict_cache.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/oldVersion/V200/text/cmudict_cache.pickle
--------------------------------------------------------------------------------
/oldVersion/V210/text/cmudict_cache.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/oldVersion/V210/text/cmudict_cache.pickle
--------------------------------------------------------------------------------
/oldVersion/V101/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def get_bert_feature(norm_text, word2ph):
5 | return torch.zeros(1024, sum(word2ph))
6 |
--------------------------------------------------------------------------------
/oldVersion/V110/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def get_bert_feature(norm_text, word2ph):
5 | return torch.zeros(1024, sum(word2ph))
6 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def get_bert_feature(norm_text, word2ph):
5 | return torch.zeros(1024, sum(word2ph))
6 |
--------------------------------------------------------------------------------
/onnx_modules/V200/__init__.py:
--------------------------------------------------------------------------------
1 | from .text.symbols import symbols
2 | from .models_onnx import SynthesizerTrn
3 |
4 | __all__ = ["symbols", "SynthesizerTrn"]
5 |
--------------------------------------------------------------------------------
/onnx_modules/V210/__init__.py:
--------------------------------------------------------------------------------
1 | from .text.symbols import symbols
2 | from .models_onnx import SynthesizerTrn
3 |
4 | __all__ = ["symbols", "SynthesizerTrn"]
5 |
--------------------------------------------------------------------------------
/onnx_modules/V220/__init__.py:
--------------------------------------------------------------------------------
1 | from .text.symbols import symbols
2 | from .models_onnx import SynthesizerTrn
3 |
4 | __all__ = ["symbols", "SynthesizerTrn"]
5 |
--------------------------------------------------------------------------------
/onnx_modules/V230/__init__.py:
--------------------------------------------------------------------------------
1 | from .text.symbols import symbols
2 | from .models_onnx import SynthesizerTrn
3 |
4 | __all__ = ["symbols", "SynthesizerTrn"]
5 |
--------------------------------------------------------------------------------
/onnx_modules/V220_novq_dev/__init__.py:
--------------------------------------------------------------------------------
1 | from .text.symbols import symbols
2 | from .models_onnx import SynthesizerTrn
3 |
4 | __all__ = ["symbols", "SynthesizerTrn"]
5 |
--------------------------------------------------------------------------------
/bert/chinese-roberta-wwm-ext-large/special_tokens_map.json:
--------------------------------------------------------------------------------
1 | {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
2 |
--------------------------------------------------------------------------------
/bert/deberta-v2-large-japanese-char-wwm/special_tokens_map.json:
--------------------------------------------------------------------------------
1 | {
2 | "cls_token": "[CLS]",
3 | "mask_token": "[MASK]",
4 | "pad_token": "[PAD]",
5 | "sep_token": "[SEP]",
6 | "unk_token": "[UNK]"
7 | }
8 |
--------------------------------------------------------------------------------
/bert/deberta-v2-large-japanese/special_tokens_map.json:
--------------------------------------------------------------------------------
1 | {
2 | "bos_token": "[CLS]",
3 | "cls_token": "[CLS]",
4 | "eos_token": "[SEP]",
5 | "mask_token": "[MASK]",
6 | "pad_token": "[PAD]",
7 | "sep_token": "[SEP]",
8 | "unk_token": "[UNK]"
9 | }
10 |
--------------------------------------------------------------------------------
/slm/wavlm-base-plus/preprocessor_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "do_normalize": false,
3 | "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4 | "feature_size": 1,
5 | "padding_side": "right",
6 | "padding_value": 0.0,
7 | "return_attention_mask": true,
8 | "sampling_rate": 16000
9 | }
10 |
--------------------------------------------------------------------------------
/emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/preprocessor_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "do_normalize": true,
3 | "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4 | "feature_size": 1,
5 | "padding_side": "right",
6 | "padding_value": 0.0,
7 | "return_attention_mask": true,
8 | "sampling_rate": 16000
9 | }
10 |
--------------------------------------------------------------------------------
/css/custom.css:
--------------------------------------------------------------------------------
1 |
2 | #yml_code {
3 | height: 600px;
4 | flex-grow: inherit;
5 | overflow-y: auto;
6 | }
7 |
8 | #json_code {
9 | height: 600px;
10 | flex-grow: inherit;
11 | overflow-y: auto;
12 | }
13 |
14 | #gpu_code {
15 | height: 300px;
16 | flex-grow: inherit;
17 | overflow-y: auto;
18 | }
19 |
--------------------------------------------------------------------------------
/bert/bert-base-japanese-v3/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "tokenizer_class": "BertJapaneseTokenizer",
3 | "model_max_length": 512,
4 | "do_lower_case": false,
5 | "word_tokenizer_type": "mecab",
6 | "subword_tokenizer_type": "wordpiece",
7 | "mecab_kwargs": {
8 | "mecab_dic": "unidic_lite"
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/bert/bert-large-japanese-v2/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "tokenizer_class": "BertJapaneseTokenizer",
3 | "model_max_length": 512,
4 | "do_lower_case": false,
5 | "word_tokenizer_type": "mecab",
6 | "subword_tokenizer_type": "wordpiece",
7 | "mecab_kwargs": {
8 | "mecab_dic": "unidic_lite"
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/tools/log.py:
--------------------------------------------------------------------------------
1 | """
2 | loguru logger wrapper
3 | """
4 | from loguru import logger
5 | import sys
6 |
7 |
8 | # Remove all default handlers
9 | logger.remove()
10 |
11 | # Define a custom format and add a stdout sink
12 | log_format = (
13 | "{time:MM-DD HH:mm:ss} {level:<9}| {file}:{line} | {message}"
14 | )
15 |
16 | logger.add(sys.stdout, format=log_format, backtrace=True, diagnose=True)
17 |
--------------------------------------------------------------------------------
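A minimal usage sketch (the call sites below are hypothetical, not from this repo): importing the shared logger applies the format configured above to every subsequent message.

from tools.log import logger

logger.info("loaded config from {}", "configs/config.json")
logger.warning("falling back to CPU inference")
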
/emotional/clap-htsat-fused/special_tokens_map.json:
--------------------------------------------------------------------------------
1 | {
2 | "bos_token": "",
3 | "cls_token": "",
4 | "eos_token": "",
5 | "mask_token": {
6 | "content": "",
7 | "lstrip": true,
8 | "normalized": false,
9 | "rstrip": false,
10 | "single_word": false
11 | },
12 | "pad_token": "",
13 | "sep_token": "",
14 | "unk_token": ""
15 | }
16 |
--------------------------------------------------------------------------------
/bert/deberta-v2-large-japanese/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "bos_token": "[CLS]",
3 | "cls_token": "[CLS]",
4 | "do_lower_case": false,
5 | "eos_token": "[SEP]",
6 | "keep_accents": true,
7 | "mask_token": "[MASK]",
8 | "pad_token": "[PAD]",
9 | "sep_token": "[SEP]",
10 | "sp_model_kwargs": {},
11 | "special_tokens_map_file": null,
12 | "split_by_punct": false,
13 | "tokenizer_class": "DebertaV2Tokenizer",
14 | "unk_token": "[UNK]"
15 | }
16 |
--------------------------------------------------------------------------------
/bert/chinese-roberta-wwm-ext-large/.gitattributes:
--------------------------------------------------------------------------------
1 | *.bin.* filter=lfs diff=lfs merge=lfs -text
2 | *.lfs.* filter=lfs diff=lfs merge=lfs -text
3 | *.bin filter=lfs diff=lfs merge=lfs -text
4 | *.h5 filter=lfs diff=lfs merge=lfs -text
5 | *.tflite filter=lfs diff=lfs merge=lfs -text
6 | *.tar.gz filter=lfs diff=lfs merge=lfs -text
7 | *.ot filter=lfs diff=lfs merge=lfs -text
8 | *.onnx filter=lfs diff=lfs merge=lfs -text
9 | *.msgpack filter=lfs diff=lfs merge=lfs -text
10 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | librosa==0.9.2
2 | matplotlib
3 | numpy
4 | numba
5 | phonemizer
6 | scipy
7 | tensorboard
8 | Unidecode
9 | amfm_decompy
10 | jieba
11 | transformers
12 | pypinyin
13 | cn2an
14 | gradio
15 | av
16 | mecab-python3
17 | loguru
18 | unidic-lite
19 | cmudict
20 | fugashi
21 | num2words
22 | PyYAML
23 | requests
24 | pyopenjtalk-prebuilt
25 | jaconv
26 | psutil
27 | GPUtil
28 | vector_quantize_pytorch
29 | g2p_en
30 | sentencepiece
31 | pykakasi
32 | langid
33 |
--------------------------------------------------------------------------------
/emotional/clap-htsat-fused/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "add_prefix_space": false,
3 | "bos_token": "",
4 | "cls_token": "",
5 | "eos_token": "",
6 | "errors": "replace",
7 | "mask_token": "",
8 | "model_max_length": 512,
9 | "pad_token": "",
10 | "processor_class": "ClapProcessor",
11 | "sep_token": "",
12 | "special_tokens_map_file": null,
13 | "tokenizer_class": "RobertaTokenizer",
14 | "trim_offsets": true,
15 | "unk_token": ""
16 | }
17 |
--------------------------------------------------------------------------------
/export_onnx.py:
--------------------------------------------------------------------------------
1 | from onnx_modules import export_onnx
2 | import os
3 |
4 | if __name__ == "__main__":
5 | export_path = "BertVits2.2PT"
6 | model_path = "model\\G_0.pth"
7 | config_path = "model\\config.json"
8 | novq = False
9 | dev = False
10 | if not os.path.exists("onnx"):
11 | os.makedirs("onnx")
12 | if not os.path.exists(f"onnx/{export_path}"):
13 | os.makedirs(f"onnx/{export_path}")
14 | export_onnx(export_path, model_path, config_path, novq, dev)
15 |
--------------------------------------------------------------------------------
/bert/bert_models.json:
--------------------------------------------------------------------------------
1 | {
2 | "deberta-v2-large-japanese-char-wwm": {
3 | "repo_id": "ku-nlp/deberta-v2-large-japanese-char-wwm",
4 | "files": ["pytorch_model.bin"]
5 | },
6 | "chinese-roberta-wwm-ext-large": {
7 | "repo_id": "hfl/chinese-roberta-wwm-ext-large",
8 | "files": ["pytorch_model.bin"]
9 | },
10 | "deberta-v3-large": {
11 | "repo_id": "microsoft/deberta-v3-large",
12 | "files": ["spm.model", "pytorch_model.bin"]
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/bert/bert-base-japanese-v3/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertForPreTraining"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "hidden_act": "gelu",
7 | "hidden_dropout_prob": 0.1,
8 | "hidden_size": 768,
9 | "initializer_range": 0.02,
10 | "intermediate_size": 3072,
11 | "layer_norm_eps": 1e-12,
12 | "max_position_embeddings": 512,
13 | "model_type": "bert",
14 | "num_attention_heads": 12,
15 | "num_hidden_layers": 12,
16 | "pad_token_id": 0,
17 | "type_vocab_size": 2,
18 | "vocab_size": 32768
19 | }
20 |
--------------------------------------------------------------------------------
/bert/bert-large-japanese-v2/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertForPreTraining"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "hidden_act": "gelu",
7 | "hidden_dropout_prob": 0.1,
8 | "hidden_size": 1024,
9 | "initializer_range": 0.02,
10 | "intermediate_size": 4096,
11 | "layer_norm_eps": 1e-12,
12 | "max_position_embeddings": 512,
13 | "model_type": "bert",
14 | "num_attention_heads": 16,
15 | "num_hidden_layers": 24,
16 | "pad_token_id": 0,
17 | "type_vocab_size": 2,
18 | "vocab_size": 32768
19 | }
20 |
--------------------------------------------------------------------------------
/monotonic_align/__init__.py:
--------------------------------------------------------------------------------
1 | from numpy import zeros, int32, float32
2 | from torch import from_numpy
3 |
4 | from .core import maximum_path_jit
5 |
6 |
7 | def maximum_path(neg_cent, mask):
8 | device = neg_cent.device
9 | dtype = neg_cent.dtype
10 | neg_cent = neg_cent.data.cpu().numpy().astype(float32)
11 | path = zeros(neg_cent.shape, dtype=int32)
12 |
13 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(int32)
14 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(int32)
15 | maximum_path_jit(path, neg_cent, t_t_max, t_s_max)
16 | return from_numpy(path).to(device=device, dtype=dtype)
17 |
--------------------------------------------------------------------------------
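For reference, a self-contained toy call of maximum_path (the shapes are assumptions read off the code above: neg_cent and mask are both [batch, T_frames, T_text] with T_frames >= T_text, and mask holds 0/1 values):

import torch
from monotonic_align import maximum_path

batch, t_frames, t_text = 1, 6, 4
neg_cent = torch.randn(batch, t_frames, t_text)  # alignment scores (higher = more likely)
mask = torch.ones(batch, t_frames, t_text)       # no padding in this toy example
path = maximum_path(neg_cent, mask)              # hard monotonic alignment, 0/1 entries, same shape
print(path.shape)                                # torch.Size([1, 6, 4])
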
/bert/deberta-v2-large-japanese-char-wwm/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "cls_token": "[CLS]",
3 | "do_lower_case": false,
4 | "do_subword_tokenize": true,
5 | "do_word_tokenize": true,
6 | "jumanpp_kwargs": null,
7 | "mask_token": "[MASK]",
8 | "mecab_kwargs": null,
9 | "model_max_length": 1000000000000000019884624838656,
10 | "never_split": null,
11 | "pad_token": "[PAD]",
12 | "sep_token": "[SEP]",
13 | "special_tokens_map_file": null,
14 | "subword_tokenizer_type": "character",
15 | "sudachi_kwargs": null,
16 | "tokenizer_class": "BertJapaneseTokenizer",
17 | "unk_token": "[UNK]",
18 | "word_tokenizer_type": "basic"
19 | }
20 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v4.5.0
4 | hooks:
5 | - id: check-yaml
6 | - id: end-of-file-fixer
7 | - id: trailing-whitespace
8 |
9 | - repo: https://github.com/astral-sh/ruff-pre-commit
10 | rev: v0.1.8
11 | hooks:
12 | - id: ruff
13 | args: [ --fix ]
14 |
15 | - repo: https://github.com/psf/black
16 | rev: 23.12.0
17 | hooks:
18 | - id: black
19 |
20 | - repo: https://github.com/codespell-project/codespell
21 | rev: v2.2.6
22 | hooks:
23 | - id: codespell
24 | files: ^.*\.(py|md|rst|yml)$
25 | args: [-L=fro]
26 |
--------------------------------------------------------------------------------
/emotional/clap-htsat-fused/preprocessor_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "chunk_length_s": 10,
3 | "feature_extractor_type": "ClapFeatureExtractor",
4 | "feature_size": 64,
5 | "fft_window_size": 1024,
6 | "frequency_max": 14000,
7 | "frequency_min": 50,
8 | "hop_length": 480,
9 | "max_length_s": 10,
10 | "n_fft": 1024,
11 | "nb_frequency_bins": 513,
12 | "nb_max_frames": 1000,
13 | "nb_max_samples": 480000,
14 | "padding": "repeatpad",
15 | "padding_side": "right",
16 | "padding_value": 0.0,
17 | "processor_class": "ClapProcessor",
18 | "return_attention_mask": false,
19 | "sampling_rate": 48000,
20 | "top_db": null,
21 | "truncation": "fusion"
22 | }
23 |
--------------------------------------------------------------------------------
/bert/deberta-v3-large/generator_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_type": "deberta-v2",
3 | "attention_probs_dropout_prob": 0.1,
4 | "hidden_act": "gelu",
5 | "hidden_dropout_prob": 0.1,
6 | "hidden_size": 1024,
7 | "initializer_range": 0.02,
8 | "intermediate_size": 4096,
9 | "max_position_embeddings": 512,
10 | "relative_attention": true,
11 | "position_buckets": 256,
12 | "norm_rel_ebd": "layer_norm",
13 | "share_att_key": true,
14 | "pos_att_type": "p2c|c2p",
15 | "layer_norm_eps": 1e-7,
16 | "max_relative_positions": -1,
17 | "position_biased_input": false,
18 | "num_attention_heads": 16,
19 | "num_hidden_layers": 12,
20 | "type_vocab_size": 0,
21 | "vocab_size": 128100
22 | }
23 |
--------------------------------------------------------------------------------
/bert/deberta-v3-large/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_type": "deberta-v2",
3 | "attention_probs_dropout_prob": 0.1,
4 | "hidden_act": "gelu",
5 | "hidden_dropout_prob": 0.1,
6 | "hidden_size": 1024,
7 | "initializer_range": 0.02,
8 | "intermediate_size": 4096,
9 | "max_position_embeddings": 512,
10 | "relative_attention": true,
11 | "position_buckets": 256,
12 | "norm_rel_ebd": "layer_norm",
13 | "share_att_key": true,
14 | "pos_att_type": "p2c|c2p",
15 | "layer_norm_eps": 1e-7,
16 | "max_relative_positions": -1,
17 | "position_biased_input": false,
18 | "num_attention_heads": 16,
19 | "num_hidden_layers": 24,
20 | "type_vocab_size": 0,
21 | "vocab_size": 128100
22 | }
23 |
--------------------------------------------------------------------------------
/text/bert_utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from huggingface_hub import hf_hub_download
4 |
5 | from config import config
6 |
7 |
8 | MIRROR: str = config.mirror
9 |
10 |
11 | def _check_bert(repo_id, files, local_path):
12 | for file in files:
13 | if not Path(local_path).joinpath(file).exists():
14 | if MIRROR.lower() == "openi":
15 | import openi
16 |
17 | openi.model.download_model(
18 | "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert"
19 | )
20 | else:
21 | hf_hub_download(
22 | repo_id, file, local_dir=local_path, local_dir_use_symlinks=False
23 | )
24 |
--------------------------------------------------------------------------------
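A minimal sketch (not code from this repo) of wiring the bert_models.json entries shown earlier to _check_bert, so that each model's weights are fetched on first use:

import json
from pathlib import Path

from text.bert_utils import _check_bert

with open("bert/bert_models.json", encoding="utf-8") as f:
    bert_models = json.load(f)

for local_name, info in bert_models.items():
    # e.g. repo_id "hfl/chinese-roberta-wwm-ext-large", files ["pytorch_model.bin"]
    _check_bert(info["repo_id"], info["files"], Path("bert") / local_name)
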
/oldVersion/V200/text/bert_utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from huggingface_hub import hf_hub_download
4 |
5 | from config import config
6 |
7 |
8 | MIRROR: str = config.mirror
9 |
10 |
11 | def _check_bert(repo_id, files, local_path):
12 | for file in files:
13 | if not Path(local_path).joinpath(file).exists():
14 | if MIRROR.lower() == "openi":
15 | import openi
16 |
17 | openi.model.download_model(
18 | "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert"
19 | )
20 | else:
21 | hf_hub_download(
22 | repo_id, file, local_dir=local_path, local_dir_use_symlinks=False
23 | )
24 |
--------------------------------------------------------------------------------
/oldVersion/V210/text/bert_utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from huggingface_hub import hf_hub_download
4 |
5 | from config import config
6 |
7 |
8 | MIRROR: str = config.mirror
9 |
10 |
11 | def _check_bert(repo_id, files, local_path):
12 | for file in files:
13 | if not Path(local_path).joinpath(file).exists():
14 | if MIRROR.lower() == "openi":
15 | import openi
16 |
17 | openi.model.download_model(
18 | "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert"
19 | )
20 | else:
21 | hf_hub_download(
22 | repo_id, file, local_dir=local_path, local_dir_use_symlinks=False
23 | )
24 |
--------------------------------------------------------------------------------
/onnx_modules/V200/text/bert_utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from huggingface_hub import hf_hub_download
4 |
5 | from config import config
6 |
7 |
8 | MIRROR: str = config.mirror
9 |
10 |
11 | def _check_bert(repo_id, files, local_path):
12 | for file in files:
13 | if not Path(local_path).joinpath(file).exists():
14 | if MIRROR.lower() == "openi":
15 | import openi
16 |
17 | openi.model.download_model(
18 | "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert"
19 | )
20 | else:
21 | hf_hub_download(
22 | repo_id, file, local_dir=local_path, local_dir_use_symlinks=False
23 | )
24 |
--------------------------------------------------------------------------------
/run_MnodesAndMgpus.sh:
--------------------------------------------------------------------------------
1 | # Multi-node, multi-GPU training
2 |
3 | # --nnodes=1:3 means use one to three machines, with resources allocated elastically
4 | # --nnodes=<min nodes>:<max nodes>
5 | # --nproc_per_node=number of GPUs available on each machine
6 | # --rdzv_endpoint=ip:port of the master node (the one started first)
7 | # Nothing else needs to change
8 |
9 | # Note: distributed training in this version is data-parallel, so multi-node multi-GPU amounts to a larger batch size and epochs iterate faster.
10 | # However, since this version saves checkpoints by global step, the wall-clock time between checkpoints does not speed up noticeably;
11 | # instead, each checkpoint has gone through more epochs than before, i.e. "better results in fewer steps".
12 |
13 | #*************************
14 | # torchrun \
15 | # --nnodes=1:3\
16 | # --nproc_per_node=2\
17 | # --rdzv_id=1\
18 | # --rdzv_backend=c10d\
19 | # --rdzv_endpoint="inspur1:8880"\
20 | # train_ms.py
21 | #****************************
22 |
23 | # Multi-GPU training on a single node
24 | # nproc_per_node = number of GPUs available on the machine
25 |
26 | #*************************
27 | torchrun \
28 | --nnodes=1\
29 | --nproc_per_node=2\
30 | train_ms.py
31 | #*************************
32 |
--------------------------------------------------------------------------------
/bert/chinese-roberta-wwm-ext-large/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertForMaskedLM"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "bos_token_id": 0,
7 | "directionality": "bidi",
8 | "eos_token_id": 2,
9 | "hidden_act": "gelu",
10 | "hidden_dropout_prob": 0.1,
11 | "hidden_size": 1024,
12 | "initializer_range": 0.02,
13 | "intermediate_size": 4096,
14 | "layer_norm_eps": 1e-12,
15 | "max_position_embeddings": 512,
16 | "model_type": "bert",
17 | "num_attention_heads": 16,
18 | "num_hidden_layers": 24,
19 | "output_past": true,
20 | "pad_token_id": 0,
21 | "pooler_fc_size": 768,
22 | "pooler_num_attention_heads": 12,
23 | "pooler_num_fc_layers": 3,
24 | "pooler_size_per_head": 128,
25 | "pooler_type": "first_token_transform",
26 | "type_vocab_size": 2,
27 | "vocab_size": 21128
28 | }
29 |
--------------------------------------------------------------------------------
/oldVersion/V101/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from . import chinese, cleaned_text_to_sequence
2 |
3 |
4 | language_module_map = {"ZH": chinese}
5 |
6 |
7 | def clean_text(text, language):
8 | language_module = language_module_map[language]
9 | norm_text = language_module.text_normalize(text)
10 | phones, tones, word2ph = language_module.g2p(norm_text)
11 | return norm_text, phones, tones, word2ph
12 |
13 |
14 | def clean_text_bert(text, language):
15 | language_module = language_module_map[language]
16 | norm_text = language_module.text_normalize(text)
17 | phones, tones, word2ph = language_module.g2p(norm_text)
18 | bert = language_module.get_bert_feature(norm_text, word2ph)
19 | return phones, tones, bert
20 |
21 |
22 | def text_to_sequence(text, language):
23 | norm_text, phones, tones, word2ph = clean_text(text, language)
24 | return cleaned_text_to_sequence(phones, tones, language)
25 |
26 |
27 | if __name__ == "__main__":
28 | pass
29 |
--------------------------------------------------------------------------------
/oldVersion/V110/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from . import chinese, japanese, cleaned_text_to_sequence
2 |
3 |
4 | language_module_map = {"ZH": chinese, "JP": japanese}
5 |
6 |
7 | def clean_text(text, language):
8 | language_module = language_module_map[language]
9 | norm_text = language_module.text_normalize(text)
10 | phones, tones, word2ph = language_module.g2p(norm_text)
11 | return norm_text, phones, tones, word2ph
12 |
13 |
14 | def clean_text_bert(text, language):
15 | language_module = language_module_map[language]
16 | norm_text = language_module.text_normalize(text)
17 | phones, tones, word2ph = language_module.g2p(norm_text)
18 | bert = language_module.get_bert_feature(norm_text, word2ph)
19 | return phones, tones, bert
20 |
21 |
22 | def text_to_sequence(text, language):
23 | norm_text, phones, tones, word2ph = clean_text(text, language)
24 | return cleaned_text_to_sequence(phones, tones, language)
25 |
26 |
27 | if __name__ == "__main__":
28 | pass
29 |
--------------------------------------------------------------------------------
/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from text import chinese, japanese, english, cleaned_text_to_sequence
2 |
3 |
4 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english}
5 |
6 |
7 | def clean_text(text, language):
8 | language_module = language_module_map[language]
9 | norm_text = language_module.text_normalize(text)
10 | phones, tones, word2ph = language_module.g2p(norm_text)
11 | return norm_text, phones, tones, word2ph
12 |
13 |
14 | def clean_text_bert(text, language):
15 | language_module = language_module_map[language]
16 | norm_text = language_module.text_normalize(text)
17 | phones, tones, word2ph = language_module.g2p(norm_text)
18 | bert = language_module.get_bert_feature(norm_text, word2ph)
19 | return phones, tones, bert
20 |
21 |
22 | def text_to_sequence(text, language):
23 | norm_text, phones, tones, word2ph = clean_text(text, language)
24 | return cleaned_text_to_sequence(phones, tones, language)
25 |
26 |
27 | if __name__ == "__main__":
28 | pass
29 |
--------------------------------------------------------------------------------
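A hypothetical call (the input sentence is only an example) showing what clean_text and cleaned_text_to_sequence return for Chinese input:

from text.cleaner import clean_text
from text import cleaned_text_to_sequence

norm_text, phones, tones, word2ph = clean_text("你好，世界。", "ZH")
# phones and tones are aligned per-phoneme lists; word2ph records how many
# phonemes each normalized character expands to
phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "ZH")
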
/oldVersion/V200/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from . import chinese, japanese, english, cleaned_text_to_sequence
2 |
3 |
4 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english}
5 |
6 |
7 | def clean_text(text, language):
8 | language_module = language_module_map[language]
9 | norm_text = language_module.text_normalize(text)
10 | phones, tones, word2ph = language_module.g2p(norm_text)
11 | return norm_text, phones, tones, word2ph
12 |
13 |
14 | def clean_text_bert(text, language):
15 | language_module = language_module_map[language]
16 | norm_text = language_module.text_normalize(text)
17 | phones, tones, word2ph = language_module.g2p(norm_text)
18 | bert = language_module.get_bert_feature(norm_text, word2ph)
19 | return phones, tones, bert
20 |
21 |
22 | def text_to_sequence(text, language):
23 | norm_text, phones, tones, word2ph = clean_text(text, language)
24 | return cleaned_text_to_sequence(phones, tones, language)
25 |
26 |
27 | if __name__ == "__main__":
28 | pass
29 |
--------------------------------------------------------------------------------
/oldVersion/V210/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from . import chinese, japanese, english, cleaned_text_to_sequence
2 |
3 |
4 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english}
5 |
6 |
7 | def clean_text(text, language):
8 | language_module = language_module_map[language]
9 | norm_text = language_module.text_normalize(text)
10 | phones, tones, word2ph = language_module.g2p(norm_text)
11 | return norm_text, phones, tones, word2ph
12 |
13 |
14 | def clean_text_bert(text, language):
15 | language_module = language_module_map[language]
16 | norm_text = language_module.text_normalize(text)
17 | phones, tones, word2ph = language_module.g2p(norm_text)
18 | bert = language_module.get_bert_feature(norm_text, word2ph)
19 | return phones, tones, bert
20 |
21 |
22 | def text_to_sequence(text, language):
23 | norm_text, phones, tones, word2ph = clean_text(text, language)
24 | return cleaned_text_to_sequence(phones, tones, language)
25 |
26 |
27 | if __name__ == "__main__":
28 | pass
29 |
--------------------------------------------------------------------------------
/onnx_modules/V200/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from . import chinese, japanese, english, cleaned_text_to_sequence
2 |
3 |
4 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english}
5 |
6 |
7 | def clean_text(text, language):
8 | language_module = language_module_map[language]
9 | norm_text = language_module.text_normalize(text)
10 | phones, tones, word2ph = language_module.g2p(norm_text)
11 | return norm_text, phones, tones, word2ph
12 |
13 |
14 | def clean_text_bert(text, language):
15 | language_module = language_module_map[language]
16 | norm_text = language_module.text_normalize(text)
17 | phones, tones, word2ph = language_module.g2p(norm_text)
18 | bert = language_module.get_bert_feature(norm_text, word2ph)
19 | return phones, tones, bert
20 |
21 |
22 | def text_to_sequence(text, language):
23 | norm_text, phones, tones, word2ph = clean_text(text, language)
24 | return cleaned_text_to_sequence(phones, tones, language)
25 |
26 |
27 | if __name__ == "__main__":
28 | pass
29 |
--------------------------------------------------------------------------------
/oldVersion/V101/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 |
3 |
4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
5 |
6 |
7 | def cleaned_text_to_sequence(cleaned_text, tones, language):
8 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
9 | Args:
10 | text: string to convert to a sequence
11 | Returns:
12 | List of integers corresponding to the symbols in the text
13 | """
14 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
15 | tone_start = language_tone_start_map[language]
16 | tones = [i + tone_start for i in tones]
17 | lang_id = language_id_map[language]
18 | lang_ids = [lang_id for i in phones]
19 | return phones, tones, lang_ids
20 |
21 |
22 | def get_bert(norm_text, word2ph, language):
23 | from .chinese_bert import get_bert_feature as zh_bert
24 | from .english_bert_mock import get_bert_feature as en_bert
25 |
26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert}
27 | bert = lang_bert_func_map[language](norm_text, word2ph)
28 | return bert
29 |
--------------------------------------------------------------------------------
/bert/deberta-v2-large-japanese-char-wwm/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "DebertaV2ForMaskedLM"
4 | ],
5 | "attention_head_size": 64,
6 | "attention_probs_dropout_prob": 0.1,
7 | "conv_act": "gelu",
8 | "conv_kernel_size": 3,
9 | "hidden_act": "gelu",
10 | "hidden_dropout_prob": 0.1,
11 | "hidden_size": 1024,
12 | "initializer_range": 0.02,
13 | "intermediate_size": 4096,
14 | "layer_norm_eps": 1e-07,
15 | "max_position_embeddings": 512,
16 | "max_relative_positions": -1,
17 | "model_type": "deberta-v2",
18 | "norm_rel_ebd": "layer_norm",
19 | "num_attention_heads": 16,
20 | "num_hidden_layers": 24,
21 | "pad_token_id": 0,
22 | "pooler_dropout": 0,
23 | "pooler_hidden_act": "gelu",
24 | "pooler_hidden_size": 1024,
25 | "pos_att_type": [
26 | "p2c",
27 | "c2p"
28 | ],
29 | "position_biased_input": false,
30 | "position_buckets": 256,
31 | "relative_attention": true,
32 | "share_att_key": true,
33 | "torch_dtype": "float16",
34 | "transformers_version": "4.25.1",
35 | "type_vocab_size": 0,
36 | "vocab_size": 22012
37 | }
38 |
--------------------------------------------------------------------------------
/bert/deberta-v2-large-japanese/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": "configs/deberta_v2_large.json",
3 | "architectures": [
4 | "DebertaV2ForMaskedLM"
5 | ],
6 | "attention_head_size": 64,
7 | "attention_probs_dropout_prob": 0.1,
8 | "conv_act": "gelu",
9 | "conv_kernel_size": 3,
10 | "hidden_act": "gelu",
11 | "hidden_dropout_prob": 0.1,
12 | "hidden_size": 1024,
13 | "initializer_range": 0.02,
14 | "intermediate_size": 4096,
15 | "layer_norm_eps": 1e-07,
16 | "max_position_embeddings": 512,
17 | "max_relative_positions": -1,
18 | "model_type": "deberta-v2",
19 | "norm_rel_ebd": "layer_norm",
20 | "num_attention_heads": 16,
21 | "num_hidden_layers": 24,
22 | "pad_token_id": 0,
23 | "pooler_dropout": 0,
24 | "pooler_hidden_act": "gelu",
25 | "pooler_hidden_size": 1024,
26 | "pos_att_type": [
27 | "p2c",
28 | "c2p"
29 | ],
30 | "position_biased_input": false,
31 | "position_buckets": 256,
32 | "relative_attention": true,
33 | "share_att_key": true,
34 | "torch_dtype": "float32",
35 | "transformers_version": "4.23.1",
36 | "type_vocab_size": 0,
37 | "vocab_size": 32000
38 | }
39 |
--------------------------------------------------------------------------------
/oldVersion/V110/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 |
3 |
4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
5 |
6 |
7 | def cleaned_text_to_sequence(cleaned_text, tones, language):
8 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
9 | Args:
10 | text: string to convert to a sequence
11 | Returns:
12 | List of integers corresponding to the symbols in the text
13 | """
14 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
15 | tone_start = language_tone_start_map[language]
16 | tones = [i + tone_start for i in tones]
17 | lang_id = language_id_map[language]
18 | lang_ids = [lang_id for i in phones]
19 | return phones, tones, lang_ids
20 |
21 |
22 | def get_bert(norm_text, word2ph, language, device):
23 | from .chinese_bert import get_bert_feature as zh_bert
24 | from .english_bert_mock import get_bert_feature as en_bert
25 | from .japanese_bert import get_bert_feature as jp_bert
26 |
27 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
28 | bert = lang_bert_func_map[language](norm_text, word2ph, device)
29 | return bert
30 |
--------------------------------------------------------------------------------
/bert/deberta-v3-large/.gitattributes:
--------------------------------------------------------------------------------
1 | *.7z filter=lfs diff=lfs merge=lfs -text
2 | *.arrow filter=lfs diff=lfs merge=lfs -text
3 | *.bin filter=lfs diff=lfs merge=lfs -text
4 | *.bin.* filter=lfs diff=lfs merge=lfs -text
5 | *.bz2 filter=lfs diff=lfs merge=lfs -text
6 | *.ftz filter=lfs diff=lfs merge=lfs -text
7 | *.gz filter=lfs diff=lfs merge=lfs -text
8 | *.h5 filter=lfs diff=lfs merge=lfs -text
9 | *.joblib filter=lfs diff=lfs merge=lfs -text
10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text
11 | *.model filter=lfs diff=lfs merge=lfs -text
12 | *.msgpack filter=lfs diff=lfs merge=lfs -text
13 | *.onnx filter=lfs diff=lfs merge=lfs -text
14 | *.ot filter=lfs diff=lfs merge=lfs -text
15 | *.parquet filter=lfs diff=lfs merge=lfs -text
16 | *.pb filter=lfs diff=lfs merge=lfs -text
17 | *.pt filter=lfs diff=lfs merge=lfs -text
18 | *.pth filter=lfs diff=lfs merge=lfs -text
19 | *.rar filter=lfs diff=lfs merge=lfs -text
20 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21 | *.tar.* filter=lfs diff=lfs merge=lfs -text
22 | *.tflite filter=lfs diff=lfs merge=lfs -text
23 | *.tgz filter=lfs diff=lfs merge=lfs -text
24 | *.xz filter=lfs diff=lfs merge=lfs -text
25 | *.zip filter=lfs diff=lfs merge=lfs -text
26 | *.zstandard filter=lfs diff=lfs merge=lfs -text
27 | *tfevents* filter=lfs diff=lfs merge=lfs -text
28 |
--------------------------------------------------------------------------------
/oldVersion/V110/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoTokenizer, AutoModelForMaskedLM
3 | import sys
4 |
5 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
6 |
7 |
8 | def get_bert_feature(text, word2ph, device=None):
9 | if (
10 | sys.platform == "darwin"
11 | and torch.backends.mps.is_available()
12 | and device == "cpu"
13 | ):
14 | device = "mps"
15 | if not device:
16 | device = "cuda"
17 | model = AutoModelForMaskedLM.from_pretrained("./bert/bert-base-japanese-v3").to(
18 | device
19 | )
20 | with torch.no_grad():
21 | inputs = tokenizer(text, return_tensors="pt")
22 | for i in inputs:
23 | inputs[i] = inputs[i].to(device)
24 | res = model(**inputs, output_hidden_states=True)
25 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
26 | assert inputs["input_ids"].shape[-1] == len(word2ph)
27 | word2phone = word2ph
28 | phone_level_feature = []
29 | for i in range(len(word2phone)):
30 | repeat_feature = res[i].repeat(word2phone[i], 1)
31 | phone_level_feature.append(repeat_feature)
32 |
33 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
34 |
35 | return phone_level_feature.T
36 |
--------------------------------------------------------------------------------
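A toy illustration (the tensors are made up) of the word2ph expansion performed at the end of get_bert_feature above: each token-level hidden vector is repeated once per phoneme it covers, then the results are concatenated.

import torch

res = torch.randn(4, 768)      # 4 tokens, hidden size 768 (bert-base-japanese-v3)
word2ph = [1, 2, 3, 1]         # phonemes per token; one entry per token
phone_level_feature = torch.cat(
    [res[i].repeat(word2ph[i], 1) for i in range(len(word2ph))], dim=0
)
print(phone_level_feature.shape)  # torch.Size([7, 768]) == (sum(word2ph), hidden_size)
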
/slm/wavlm-base-plus/.gitattributes:
--------------------------------------------------------------------------------
1 | *.7z filter=lfs diff=lfs merge=lfs -text
2 | *.arrow filter=lfs diff=lfs merge=lfs -text
3 | *.bin filter=lfs diff=lfs merge=lfs -text
4 | *.bin.* filter=lfs diff=lfs merge=lfs -text
5 | *.bz2 filter=lfs diff=lfs merge=lfs -text
6 | *.ftz filter=lfs diff=lfs merge=lfs -text
7 | *.gz filter=lfs diff=lfs merge=lfs -text
8 | *.h5 filter=lfs diff=lfs merge=lfs -text
9 | *.joblib filter=lfs diff=lfs merge=lfs -text
10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text
11 | *.model filter=lfs diff=lfs merge=lfs -text
12 | *.msgpack filter=lfs diff=lfs merge=lfs -text
13 | *.onnx filter=lfs diff=lfs merge=lfs -text
14 | *.ot filter=lfs diff=lfs merge=lfs -text
15 | *.parquet filter=lfs diff=lfs merge=lfs -text
16 | *.pb filter=lfs diff=lfs merge=lfs -text
17 | *.pt filter=lfs diff=lfs merge=lfs -text
18 | *.pth filter=lfs diff=lfs merge=lfs -text
19 | *.rar filter=lfs diff=lfs merge=lfs -text
20 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21 | *.tar.* filter=lfs diff=lfs merge=lfs -text
22 | *.tflite filter=lfs diff=lfs merge=lfs -text
23 | *.tgz filter=lfs diff=lfs merge=lfs -text
24 | *.xz filter=lfs diff=lfs merge=lfs -text
25 | *.zip filter=lfs diff=lfs merge=lfs -text
26 | *.zstandard filter=lfs diff=lfs merge=lfs -text
27 | *tfevents* filter=lfs diff=lfs merge=lfs -text
28 |
--------------------------------------------------------------------------------
/emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/.gitattributes:
--------------------------------------------------------------------------------
1 | *.7z filter=lfs diff=lfs merge=lfs -text
2 | *.arrow filter=lfs diff=lfs merge=lfs -text
3 | *.bin filter=lfs diff=lfs merge=lfs -text
4 | *.bin.* filter=lfs diff=lfs merge=lfs -text
5 | *.bz2 filter=lfs diff=lfs merge=lfs -text
6 | *.ftz filter=lfs diff=lfs merge=lfs -text
7 | *.gz filter=lfs diff=lfs merge=lfs -text
8 | *.h5 filter=lfs diff=lfs merge=lfs -text
9 | *.joblib filter=lfs diff=lfs merge=lfs -text
10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text
11 | *.model filter=lfs diff=lfs merge=lfs -text
12 | *.msgpack filter=lfs diff=lfs merge=lfs -text
13 | *.onnx filter=lfs diff=lfs merge=lfs -text
14 | *.ot filter=lfs diff=lfs merge=lfs -text
15 | *.parquet filter=lfs diff=lfs merge=lfs -text
16 | *.pb filter=lfs diff=lfs merge=lfs -text
17 | *.pt filter=lfs diff=lfs merge=lfs -text
18 | *.pth filter=lfs diff=lfs merge=lfs -text
19 | *.rar filter=lfs diff=lfs merge=lfs -text
20 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21 | *.tar.* filter=lfs diff=lfs merge=lfs -text
22 | *.tflite filter=lfs diff=lfs merge=lfs -text
23 | *.tgz filter=lfs diff=lfs merge=lfs -text
24 | *.wasm filter=lfs diff=lfs merge=lfs -text
25 | *.xz filter=lfs diff=lfs merge=lfs -text
26 | *.zip filter=lfs diff=lfs merge=lfs -text
27 | *.zstandard filter=lfs diff=lfs merge=lfs -text
28 | *tfevents* filter=lfs diff=lfs merge=lfs -text
29 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoTokenizer, AutoModelForMaskedLM
3 | import sys
4 |
5 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
6 |
7 | models = dict()
8 |
9 |
10 | def get_bert_feature(text, word2ph, device=None):
11 | if (
12 | sys.platform == "darwin"
13 | and torch.backends.mps.is_available()
14 | and device == "cpu"
15 | ):
16 | device = "mps"
17 | if not device:
18 | device = "cuda"
19 | if device not in models.keys():
20 | models[device] = AutoModelForMaskedLM.from_pretrained(
21 | "./bert/bert-base-japanese-v3"
22 | ).to(device)
23 | with torch.no_grad():
24 | inputs = tokenizer(text, return_tensors="pt")
25 | for i in inputs:
26 | inputs[i] = inputs[i].to(device)
27 | res = models[device](**inputs, output_hidden_states=True)
28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
29 | assert inputs["input_ids"].shape[-1] == len(word2ph)
30 | word2phone = word2ph
31 | phone_level_feature = []
32 | for i in range(len(word2phone)):
33 | repeat_feature = res[i].repeat(word2phone[i], 1)
34 | phone_level_feature.append(repeat_feature)
35 |
36 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
37 |
38 | return phone_level_feature.T
39 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from . import chinese, japanese, cleaned_text_to_sequence
2 | from .fix import japanese as japanese_fix
3 |
4 |
5 | language_module_map = {"ZH": chinese, "JP": japanese}
6 | language_module_map_fix = {"ZH": chinese, "JP": japanese_fix}
7 |
8 |
9 | def clean_text(text, language):
10 | language_module = language_module_map[language]
11 | norm_text = language_module.text_normalize(text)
12 | phones, tones, word2ph = language_module.g2p(norm_text)
13 | return norm_text, phones, tones, word2ph
14 |
15 |
16 | def clean_text_fix(text, language):
17 | """使用dev分支修复"""
18 | language_module = language_module_map_fix[language]
19 | norm_text = language_module.text_normalize(text)
20 | phones, tones, word2ph = language_module.g2p(norm_text)
21 | return norm_text, phones, tones, word2ph
22 |
23 |
24 | def clean_text_bert(text, language):
25 | language_module = language_module_map[language]
26 | norm_text = language_module.text_normalize(text)
27 | phones, tones, word2ph = language_module.g2p(norm_text)
28 | bert = language_module.get_bert_feature(norm_text, word2ph)
29 | return phones, tones, bert
30 |
31 |
32 | def text_to_sequence(text, language):
33 | norm_text, phones, tones, word2ph = clean_text(text, language)
34 | return cleaned_text_to_sequence(phones, tones, language)
35 |
36 |
37 | if __name__ == "__main__":
38 | pass
39 |
--------------------------------------------------------------------------------
/monotonic_align/core.py:
--------------------------------------------------------------------------------
1 | import numba
2 |
3 |
4 | @numba.jit(
5 | numba.void(
6 | numba.int32[:, :, ::1],
7 | numba.float32[:, :, ::1],
8 | numba.int32[::1],
9 | numba.int32[::1],
10 | ),
11 | nopython=True,
12 | nogil=True,
13 | )
14 | def maximum_path_jit(paths, values, t_ys, t_xs):
15 | b = paths.shape[0]
16 | max_neg_val = -1e9
17 | for i in range(int(b)):
18 | path = paths[i]
19 | value = values[i]
20 | t_y = t_ys[i]
21 | t_x = t_xs[i]
22 |
23 | v_prev = v_cur = 0.0
24 | index = t_x - 1
25 |
26 | for y in range(t_y):
27 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
28 | if x == y:
29 | v_cur = max_neg_val
30 | else:
31 | v_cur = value[y - 1, x]
32 | if x == 0:
33 | if y == 0:
34 | v_prev = 0.0
35 | else:
36 | v_prev = max_neg_val
37 | else:
38 | v_prev = value[y - 1, x - 1]
39 | value[y, x] += max(v_prev, v_cur)
40 |
41 | for y in range(t_y - 1, -1, -1):
42 | path[y, index] = 1
43 | if index != 0 and (
44 | index == y or value[y - 1, index] < value[y - 1, index - 1]
45 | ):
46 | index = index - 1
47 |
--------------------------------------------------------------------------------
/oldVersion/V200/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import DebertaV2Model, DebertaV2Tokenizer
5 |
6 | from config import config
7 |
8 |
9 | LOCAL_PATH = "./bert/deberta-v3-large"
10 |
11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
17 | if (
18 | sys.platform == "darwin"
19 | and torch.backends.mps.is_available()
20 | and device == "cpu"
21 | ):
22 | device = "mps"
23 | if not device:
24 | device = "cuda"
25 | if device not in models.keys():
26 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
27 | with torch.no_grad():
28 | inputs = tokenizer(text, return_tensors="pt")
29 | for i in inputs:
30 | inputs[i] = inputs[i].to(device)
31 | res = models[device](**inputs, output_hidden_states=True)
32 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
33 | # assert len(word2ph) == len(text)+2
34 | word2phone = word2ph
35 | phone_level_feature = []
36 | for i in range(len(word2phone)):
37 | repeat_feature = res[i].repeat(word2phone[i], 1)
38 | phone_level_feature.append(repeat_feature)
39 |
40 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
41 |
42 | return phone_level_feature.T
43 |
--------------------------------------------------------------------------------
/onnx_modules/V200/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import DebertaV2Model, DebertaV2Tokenizer
5 |
6 | from config import config
7 |
8 |
9 | LOCAL_PATH = "./bert/deberta-v3-large"
10 |
11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
17 | if (
18 | sys.platform == "darwin"
19 | and torch.backends.mps.is_available()
20 | and device == "cpu"
21 | ):
22 | device = "mps"
23 | if not device:
24 | device = "cuda"
25 | if device not in models.keys():
26 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
27 | with torch.no_grad():
28 | inputs = tokenizer(text, return_tensors="pt")
29 | for i in inputs:
30 | inputs[i] = inputs[i].to(device)
31 | res = models[device](**inputs, output_hidden_states=True)
32 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
33 | # assert len(word2ph) == len(text)+2
34 | word2phone = word2ph
35 | phone_level_feature = []
36 | for i in range(len(word2phone)):
37 | repeat_feature = res[i].repeat(word2phone[i], 1)
38 | phone_level_feature.append(repeat_feature)
39 |
40 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
41 |
42 | return phone_level_feature.T
43 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 |
3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4 |
5 |
6 | def cleaned_text_to_sequence(cleaned_text, tones, language):
7 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8 | Args:
9 | text: string to convert to a sequence
10 | Returns:
11 | List of integers corresponding to the symbols in the text
12 | """
13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14 | tone_start = language_tone_start_map[language]
15 | tones = [i + tone_start for i in tones]
16 | lang_id = language_id_map[language]
17 | lang_ids = [lang_id for i in phones]
18 | return phones, tones, lang_ids
19 |
20 |
21 | def get_bert(norm_text, word2ph, language, device):
22 | from .chinese_bert import get_bert_feature as zh_bert
23 | from .english_bert_mock import get_bert_feature as en_bert
24 | from .japanese_bert import get_bert_feature as jp_bert
25 |
26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
27 | bert = lang_bert_func_map[language](norm_text, word2ph, device)
28 | return bert
29 |
30 |
31 | def get_bert_fix(norm_text, word2ph, language, device):
32 | from .chinese_bert import get_bert_feature as zh_bert
33 | from .english_bert_mock import get_bert_feature as en_bert
34 | from .fix.japanese_bert import get_bert_feature as jp_bert
35 |
36 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
37 | bert = lang_bert_func_map[language](norm_text, word2ph, device)
38 | return bert
39 |
--------------------------------------------------------------------------------
/clap_wrapper.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import ClapModel, ClapProcessor
5 |
6 | from config import config
7 |
8 | models = dict()
9 | processor = ClapProcessor.from_pretrained("./emotional/clap-htsat-fused")
10 |
11 |
12 | def get_clap_audio_feature(audio_data, device=config.bert_gen_config.device):
13 | if (
14 | sys.platform == "darwin"
15 | and torch.backends.mps.is_available()
16 | and device == "cpu"
17 | ):
18 | device = "mps"
19 | if not device:
20 | device = "cuda"
21 | if device not in models.keys():
22 | models[device] = ClapModel.from_pretrained("./emotional/clap-htsat-fused").to(
23 | device
24 | )
25 | with torch.no_grad():
26 | inputs = processor(
27 | audios=audio_data, return_tensors="pt", sampling_rate=48000
28 | ).to(device)
29 | emb = models[device].get_audio_features(**inputs)
30 | return emb.T
31 |
32 |
33 | def get_clap_text_feature(text, device=config.bert_gen_config.device):
34 | if (
35 | sys.platform == "darwin"
36 | and torch.backends.mps.is_available()
37 | and device == "cpu"
38 | ):
39 | device = "mps"
40 | if not device:
41 | device = "cuda"
42 | if device not in models.keys():
43 | models[device] = ClapModel.from_pretrained("./emotional/clap-htsat-fused").to(
44 | device
45 | )
46 | with torch.no_grad():
47 | inputs = processor(text=text, return_tensors="pt").to(device)
48 | emb = models[device].get_text_features(**inputs)
49 | return emb.T
50 |
--------------------------------------------------------------------------------
/bert/bert-base-japanese-v3/.gitattributes:
--------------------------------------------------------------------------------
1 | *.7z filter=lfs diff=lfs merge=lfs -text
2 | *.arrow filter=lfs diff=lfs merge=lfs -text
3 | *.bin filter=lfs diff=lfs merge=lfs -text
4 | *.bz2 filter=lfs diff=lfs merge=lfs -text
5 | *.ckpt filter=lfs diff=lfs merge=lfs -text
6 | *.ftz filter=lfs diff=lfs merge=lfs -text
7 | *.gz filter=lfs diff=lfs merge=lfs -text
8 | *.h5 filter=lfs diff=lfs merge=lfs -text
9 | *.joblib filter=lfs diff=lfs merge=lfs -text
10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text
11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text
12 | *.model filter=lfs diff=lfs merge=lfs -text
13 | *.msgpack filter=lfs diff=lfs merge=lfs -text
14 | *.npy filter=lfs diff=lfs merge=lfs -text
15 | *.npz filter=lfs diff=lfs merge=lfs -text
16 | *.onnx filter=lfs diff=lfs merge=lfs -text
17 | *.ot filter=lfs diff=lfs merge=lfs -text
18 | *.parquet filter=lfs diff=lfs merge=lfs -text
19 | *.pb filter=lfs diff=lfs merge=lfs -text
20 | *.pickle filter=lfs diff=lfs merge=lfs -text
21 | *.pkl filter=lfs diff=lfs merge=lfs -text
22 | *.pt filter=lfs diff=lfs merge=lfs -text
23 | *.pth filter=lfs diff=lfs merge=lfs -text
24 | *.rar filter=lfs diff=lfs merge=lfs -text
25 | *.safetensors filter=lfs diff=lfs merge=lfs -text
26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27 | *.tar.* filter=lfs diff=lfs merge=lfs -text
28 | *.tflite filter=lfs diff=lfs merge=lfs -text
29 | *.tgz filter=lfs diff=lfs merge=lfs -text
30 | *.wasm filter=lfs diff=lfs merge=lfs -text
31 | *.xz filter=lfs diff=lfs merge=lfs -text
32 | *.zip filter=lfs diff=lfs merge=lfs -text
33 | *.zst filter=lfs diff=lfs merge=lfs -text
34 | *tfevents* filter=lfs diff=lfs merge=lfs -text
35 |
--------------------------------------------------------------------------------
/bert/bert-large-japanese-v2/.gitattributes:
--------------------------------------------------------------------------------
1 | *.7z filter=lfs diff=lfs merge=lfs -text
2 | *.arrow filter=lfs diff=lfs merge=lfs -text
3 | *.bin filter=lfs diff=lfs merge=lfs -text
4 | *.bz2 filter=lfs diff=lfs merge=lfs -text
5 | *.ckpt filter=lfs diff=lfs merge=lfs -text
6 | *.ftz filter=lfs diff=lfs merge=lfs -text
7 | *.gz filter=lfs diff=lfs merge=lfs -text
8 | *.h5 filter=lfs diff=lfs merge=lfs -text
9 | *.joblib filter=lfs diff=lfs merge=lfs -text
10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text
11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text
12 | *.model filter=lfs diff=lfs merge=lfs -text
13 | *.msgpack filter=lfs diff=lfs merge=lfs -text
14 | *.npy filter=lfs diff=lfs merge=lfs -text
15 | *.npz filter=lfs diff=lfs merge=lfs -text
16 | *.onnx filter=lfs diff=lfs merge=lfs -text
17 | *.ot filter=lfs diff=lfs merge=lfs -text
18 | *.parquet filter=lfs diff=lfs merge=lfs -text
19 | *.pb filter=lfs diff=lfs merge=lfs -text
20 | *.pickle filter=lfs diff=lfs merge=lfs -text
21 | *.pkl filter=lfs diff=lfs merge=lfs -text
22 | *.pt filter=lfs diff=lfs merge=lfs -text
23 | *.pth filter=lfs diff=lfs merge=lfs -text
24 | *.rar filter=lfs diff=lfs merge=lfs -text
25 | *.safetensors filter=lfs diff=lfs merge=lfs -text
26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27 | *.tar.* filter=lfs diff=lfs merge=lfs -text
28 | *.tflite filter=lfs diff=lfs merge=lfs -text
29 | *.tgz filter=lfs diff=lfs merge=lfs -text
30 | *.wasm filter=lfs diff=lfs merge=lfs -text
31 | *.xz filter=lfs diff=lfs merge=lfs -text
32 | *.zip filter=lfs diff=lfs merge=lfs -text
33 | *.zst filter=lfs diff=lfs merge=lfs -text
34 | *tfevents* filter=lfs diff=lfs merge=lfs -text
35 |
--------------------------------------------------------------------------------
/bert/deberta-v2-large-japanese/.gitattributes:
--------------------------------------------------------------------------------
1 | *.7z filter=lfs diff=lfs merge=lfs -text
2 | *.arrow filter=lfs diff=lfs merge=lfs -text
3 | *.bin filter=lfs diff=lfs merge=lfs -text
4 | *.bz2 filter=lfs diff=lfs merge=lfs -text
5 | *.ckpt filter=lfs diff=lfs merge=lfs -text
6 | *.ftz filter=lfs diff=lfs merge=lfs -text
7 | *.gz filter=lfs diff=lfs merge=lfs -text
8 | *.h5 filter=lfs diff=lfs merge=lfs -text
9 | *.joblib filter=lfs diff=lfs merge=lfs -text
10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text
11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text
12 | *.model filter=lfs diff=lfs merge=lfs -text
13 | *.msgpack filter=lfs diff=lfs merge=lfs -text
14 | *.npy filter=lfs diff=lfs merge=lfs -text
15 | *.npz filter=lfs diff=lfs merge=lfs -text
16 | *.onnx filter=lfs diff=lfs merge=lfs -text
17 | *.ot filter=lfs diff=lfs merge=lfs -text
18 | *.parquet filter=lfs diff=lfs merge=lfs -text
19 | *.pb filter=lfs diff=lfs merge=lfs -text
20 | *.pickle filter=lfs diff=lfs merge=lfs -text
21 | *.pkl filter=lfs diff=lfs merge=lfs -text
22 | *.pt filter=lfs diff=lfs merge=lfs -text
23 | *.pth filter=lfs diff=lfs merge=lfs -text
24 | *.rar filter=lfs diff=lfs merge=lfs -text
25 | *.safetensors filter=lfs diff=lfs merge=lfs -text
26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27 | *.tar.* filter=lfs diff=lfs merge=lfs -text
28 | *.tflite filter=lfs diff=lfs merge=lfs -text
29 | *.tgz filter=lfs diff=lfs merge=lfs -text
30 | *.wasm filter=lfs diff=lfs merge=lfs -text
31 | *.xz filter=lfs diff=lfs merge=lfs -text
32 | *.zip filter=lfs diff=lfs merge=lfs -text
33 | *.zst filter=lfs diff=lfs merge=lfs -text
34 | *tfevents* filter=lfs diff=lfs merge=lfs -text
35 |
--------------------------------------------------------------------------------
/emotional/clap-htsat-fused/.gitattributes:
--------------------------------------------------------------------------------
1 | *.7z filter=lfs diff=lfs merge=lfs -text
2 | *.arrow filter=lfs diff=lfs merge=lfs -text
3 | *.bin filter=lfs diff=lfs merge=lfs -text
4 | *.bz2 filter=lfs diff=lfs merge=lfs -text
5 | *.ckpt filter=lfs diff=lfs merge=lfs -text
6 | *.ftz filter=lfs diff=lfs merge=lfs -text
7 | *.gz filter=lfs diff=lfs merge=lfs -text
8 | *.h5 filter=lfs diff=lfs merge=lfs -text
9 | *.joblib filter=lfs diff=lfs merge=lfs -text
10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text
11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text
12 | *.model filter=lfs diff=lfs merge=lfs -text
13 | *.msgpack filter=lfs diff=lfs merge=lfs -text
14 | *.npy filter=lfs diff=lfs merge=lfs -text
15 | *.npz filter=lfs diff=lfs merge=lfs -text
16 | *.onnx filter=lfs diff=lfs merge=lfs -text
17 | *.ot filter=lfs diff=lfs merge=lfs -text
18 | *.parquet filter=lfs diff=lfs merge=lfs -text
19 | *.pb filter=lfs diff=lfs merge=lfs -text
20 | *.pickle filter=lfs diff=lfs merge=lfs -text
21 | *.pkl filter=lfs diff=lfs merge=lfs -text
22 | *.pt filter=lfs diff=lfs merge=lfs -text
23 | *.pth filter=lfs diff=lfs merge=lfs -text
24 | *.rar filter=lfs diff=lfs merge=lfs -text
25 | *.safetensors filter=lfs diff=lfs merge=lfs -text
26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27 | *.tar.* filter=lfs diff=lfs merge=lfs -text
28 | *.tflite filter=lfs diff=lfs merge=lfs -text
29 | *.tgz filter=lfs diff=lfs merge=lfs -text
30 | *.wasm filter=lfs diff=lfs merge=lfs -text
31 | *.xz filter=lfs diff=lfs merge=lfs -text
32 | *.zip filter=lfs diff=lfs merge=lfs -text
33 | *.zst filter=lfs diff=lfs merge=lfs -text
34 | *tfevents* filter=lfs diff=lfs merge=lfs -text
35 |
--------------------------------------------------------------------------------
/bert/deberta-v2-large-japanese-char-wwm/.gitattributes:
--------------------------------------------------------------------------------
1 | *.7z filter=lfs diff=lfs merge=lfs -text
2 | *.arrow filter=lfs diff=lfs merge=lfs -text
3 | *.bin filter=lfs diff=lfs merge=lfs -text
4 | *.bz2 filter=lfs diff=lfs merge=lfs -text
5 | *.ckpt filter=lfs diff=lfs merge=lfs -text
6 | *.ftz filter=lfs diff=lfs merge=lfs -text
7 | *.gz filter=lfs diff=lfs merge=lfs -text
8 | *.h5 filter=lfs diff=lfs merge=lfs -text
9 | *.joblib filter=lfs diff=lfs merge=lfs -text
10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text
11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text
12 | *.model filter=lfs diff=lfs merge=lfs -text
13 | *.msgpack filter=lfs diff=lfs merge=lfs -text
14 | *.npy filter=lfs diff=lfs merge=lfs -text
15 | *.npz filter=lfs diff=lfs merge=lfs -text
16 | *.onnx filter=lfs diff=lfs merge=lfs -text
17 | *.ot filter=lfs diff=lfs merge=lfs -text
18 | *.parquet filter=lfs diff=lfs merge=lfs -text
19 | *.pb filter=lfs diff=lfs merge=lfs -text
20 | *.pickle filter=lfs diff=lfs merge=lfs -text
21 | *.pkl filter=lfs diff=lfs merge=lfs -text
22 | *.pt filter=lfs diff=lfs merge=lfs -text
23 | *.pth filter=lfs diff=lfs merge=lfs -text
24 | *.rar filter=lfs diff=lfs merge=lfs -text
25 | *.safetensors filter=lfs diff=lfs merge=lfs -text
26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27 | *.tar.* filter=lfs diff=lfs merge=lfs -text
28 | *.tflite filter=lfs diff=lfs merge=lfs -text
29 | *.tgz filter=lfs diff=lfs merge=lfs -text
30 | *.wasm filter=lfs diff=lfs merge=lfs -text
31 | *.xz filter=lfs diff=lfs merge=lfs -text
32 | *.zip filter=lfs diff=lfs merge=lfs -text
33 | *.zst filter=lfs diff=lfs merge=lfs -text
34 | *tfevents* filter=lfs diff=lfs merge=lfs -text
35 |
--------------------------------------------------------------------------------
/onnx_infer.py:
--------------------------------------------------------------------------------
1 | from onnx_modules.V220_OnnxInference import OnnxInferenceSession
2 | import numpy as np
3 | Session = OnnxInferenceSession(
4 | {
5 | "enc" : "onnx/BertVits2.2PT/BertVits2.2PT_enc_p.onnx",
6 | "emb_g" : "onnx/BertVits2.2PT/BertVits2.2PT_emb.onnx",
7 | "dp" : "onnx/BertVits2.2PT/BertVits2.2PT_dp.onnx",
8 | "sdp" : "onnx/BertVits2.2PT/BertVits2.2PT_sdp.onnx",
9 | "flow" : "onnx/BertVits2.2PT/BertVits2.2PT_flow.onnx",
10 | "dec" : "onnx/BertVits2.2PT/BertVits2.2PT_dec.onnx"
11 | },
12 | Providers = ["CPUExecutionProvider"]
13 | )
14 |
15 | # The inputs here are the same as in the original version; just call .numpy() on the original preprocessing results before passing them in.
16 | x = np.array(
17 | [
18 | 0,
19 | 97,
20 | 0,
21 | 8,
22 | 0,
23 | 78,
24 | 0,
25 | 8,
26 | 0,
27 | 76,
28 | 0,
29 | 37,
30 | 0,
31 | 40,
32 | 0,
33 | 97,
34 | 0,
35 | 8,
36 | 0,
37 | 23,
38 | 0,
39 | 8,
40 | 0,
41 | 74,
42 | 0,
43 | 26,
44 | 0,
45 | 104,
46 | 0,
47 | ]
48 | )
49 | tone = np.zeros_like(x)
50 | language = np.zeros_like(x)
51 | sid = np.array([0])
52 | bert = np.random.randn(x.shape[0], 1024)
53 | ja_bert = np.random.randn(x.shape[0], 1024)
54 | en_bert = np.random.randn(x.shape[0], 1024)
55 | emo = np.random.randn(512, 1)
56 |
57 | audio = Session(
58 | x,
59 | tone,
60 | language,
61 | bert,
62 | ja_bert,
63 | en_bert,
64 | emo,
65 | sid
66 | )
67 |
68 | print(audio)
69 |
--------------------------------------------------------------------------------
/oldVersion/V200/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 |
3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4 |
5 |
6 | def cleaned_text_to_sequence(cleaned_text, tones, language):
7 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8 | Args:
9 | text: string to convert to a sequence
10 | Returns:
11 | List of integers corresponding to the symbols in the text
12 | """
13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14 | tone_start = language_tone_start_map[language]
15 | tones = [i + tone_start for i in tones]
16 | lang_id = language_id_map[language]
17 | lang_ids = [lang_id for i in phones]
18 | return phones, tones, lang_ids
19 |
20 |
21 | def get_bert(norm_text, word2ph, language, device):
22 | from .chinese_bert import get_bert_feature as zh_bert
23 | from .english_bert_mock import get_bert_feature as en_bert
24 | from .japanese_bert import get_bert_feature as jp_bert
25 |
26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
27 | bert = lang_bert_func_map[language](norm_text, word2ph, device)
28 | return bert
29 |
30 |
31 | def check_bert_models():
32 | import json
33 | from pathlib import Path
34 |
35 | from config import config
36 | from .bert_utils import _check_bert
37 |
38 | if config.mirror.lower() == "openi":
39 | import openi
40 |
41 | kwargs = {"token": config.openi_token} if config.openi_token else {}
42 | openi.login(**kwargs)
43 |
44 | with open("./bert/bert_models.json", "r") as fp:
45 | models = json.load(fp)
46 | for k, v in models.items():
47 | local_path = Path("./bert").joinpath(k)
48 | _check_bert(v["repo_id"], v["files"], local_path)
49 |
--------------------------------------------------------------------------------
/oldVersion/V210/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 |
3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4 |
5 |
6 | def cleaned_text_to_sequence(cleaned_text, tones, language):
7 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8 | Args:
9 | text: string to convert to a sequence
10 | Returns:
11 | List of integers corresponding to the symbols in the text
12 | """
13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14 | tone_start = language_tone_start_map[language]
15 | tones = [i + tone_start for i in tones]
16 | lang_id = language_id_map[language]
17 | lang_ids = [lang_id for i in phones]
18 | return phones, tones, lang_ids
19 |
20 |
21 | def get_bert(norm_text, word2ph, language, device, style_text, style_weight):
22 | from .chinese_bert import get_bert_feature as zh_bert
23 | from .english_bert_mock import get_bert_feature as en_bert
24 | from .japanese_bert import get_bert_feature as jp_bert
25 |
26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
27 | bert = lang_bert_func_map[language](
28 | norm_text, word2ph, device, style_text, style_weight
29 | )
30 | return bert
31 |
32 |
33 | def check_bert_models():
34 | import json
35 | from pathlib import Path
36 |
37 | from config import config
38 | from .bert_utils import _check_bert
39 |
40 | if config.mirror.lower() == "openi":
41 | import openi
42 |
43 | kwargs = {"token": config.openi_token} if config.openi_token else {}
44 | openi.login(**kwargs)
45 |
46 | with open("./bert/bert_models.json", "r") as fp:
47 | models = json.load(fp)
48 | for k, v in models.items():
49 | local_path = Path("./bert").joinpath(k)
50 | _check_bert(v["repo_id"], v["files"], local_path)
51 |
52 |
53 | check_bert_models()
54 |
--------------------------------------------------------------------------------
/onnx_modules/__init__.py:
--------------------------------------------------------------------------------
1 | from utils import get_hparams_from_file, load_checkpoint
2 | import json
3 |
4 |
5 | def export_onnx(export_path, model_path, config_path, novq, dev):
6 | hps = get_hparams_from_file(config_path)
7 | version = hps.version[0:3]
8 | if version == "2.0" or (version == "2.1" and novq):
9 | from .V200 import SynthesizerTrn, symbols
10 | elif version == "2.1" and (not novq):
11 | from .V210 import SynthesizerTrn, symbols
12 | elif version == "2.2":
13 | if novq and dev:
14 | from .V220_novq_dev import SynthesizerTrn, symbols
15 | else:
16 | from .V220 import SynthesizerTrn, symbols
17 | elif version == "2.3":
18 | from .V230 import SynthesizerTrn, symbols
19 | net_g = SynthesizerTrn(
20 | len(symbols),
21 | hps.data.filter_length // 2 + 1,
22 | hps.train.segment_size // hps.data.hop_length,
23 | n_speakers=hps.data.n_speakers,
24 | **hps.model,
25 | )
26 | _ = net_g.eval()
27 | _ = load_checkpoint(model_path, net_g, None, skip_optimizer=True)
28 | net_g.cpu()
29 | net_g.export_onnx(export_path)
30 |
31 | spklist = []
32 | for key in hps.data.spk2id.keys():
33 | spklist.append(key)
34 |
35 | MoeVSConf = {
36 | "Folder": f"{export_path}",
37 | "Name": f"{export_path}",
38 | "Type": "BertVits",
39 | "Symbol": symbols,
40 | "Cleaner": "",
41 | "Rate": hps.data.sampling_rate,
42 | "CharaMix": True,
43 | "Characters": spklist,
44 | "LanguageMap": {"ZH": [0, 0], "JP": [1, 6], "EN": [2, 8]},
45 | "Dict": "BasicDict",
46 | "BertPath": [
47 | "chinese-roberta-wwm-ext-large",
48 | "deberta-v2-large-japanese",
49 | "bert-base-japanese-v3",
50 | ],
51 | "Clap": "clap-htsat-fused",
52 | }
53 |
54 | with open(f"onnx/{export_path}.json", "w") as MoeVsConfFile:
55 | json.dump(MoeVSConf, MoeVsConfFile, indent=4)
56 |
--------------------------------------------------------------------------------
/resample_legacy.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import librosa
4 | from multiprocessing import Pool, cpu_count
5 |
6 | import soundfile
7 | from tqdm import tqdm
8 |
9 | from config import config
10 |
11 |
12 | def process(item):
13 | wav_name, args = item
14 | wav_path = os.path.join(args.in_dir, wav_name)
15 | if os.path.exists(wav_path) and wav_path.lower().endswith(".wav"):
16 | wav, sr = librosa.load(wav_path, sr=args.sr)
17 | soundfile.write(os.path.join(args.out_dir, wav_name), wav, sr)
18 |
19 |
20 | if __name__ == "__main__":
21 | parser = argparse.ArgumentParser()
22 | parser.add_argument(
23 | "--sr",
24 | type=int,
25 | default=config.resample_config.sampling_rate,
26 | help="sampling rate",
27 | )
28 | parser.add_argument(
29 | "--in_dir",
30 | type=str,
31 | default=config.resample_config.in_dir,
32 | help="path to source dir",
33 | )
34 | parser.add_argument(
35 | "--out_dir",
36 | type=str,
37 | default=config.resample_config.out_dir,
38 | help="path to target dir",
39 | )
40 | parser.add_argument(
41 | "--processes",
42 | type=int,
43 | default=0,
44 | help="cpu_processes",
45 | )
46 | args, _ = parser.parse_known_args()
47 | # autodl CPU-only (no GPU) instances report 46 CPUs
48 | if args.processes == 0:
49 | processes = cpu_count() - 2 if cpu_count() > 4 else 1
50 | else:
51 | processes = args.processes
52 | pool = Pool(processes=processes)
53 |
54 | tasks = []
55 |
56 | for dirpath, _, filenames in os.walk(args.in_dir):
57 | if not os.path.isdir(args.out_dir):
58 | os.makedirs(args.out_dir, exist_ok=True)
59 | for filename in filenames:
60 | if filename.lower().endswith(".wav"):
61 | tasks.append((filename, args))
62 |
63 | for _ in tqdm(
64 | pool.imap_unordered(process, tasks),
65 | ):
66 | pass
67 |
68 | pool.close()
69 | pool.join()
70 |
71 | print("Audio resampling finished!")
72 |
--------------------------------------------------------------------------------
/text/__init__.py:
--------------------------------------------------------------------------------
1 | from text.symbols import *
2 |
3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4 |
5 |
6 | def cleaned_text_to_sequence(cleaned_text, tones, language):
7 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8 | Args:
9 | text: string to convert to a sequence
10 | Returns:
11 | List of integers corresponding to the symbols in the text
12 | """
13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14 | tone_start = language_tone_start_map[language]
15 | tones = [i + tone_start for i in tones]
16 | lang_id = language_id_map[language]
17 | lang_ids = [lang_id for i in phones]
18 | return phones, tones, lang_ids
19 |
20 |
21 | def get_bert(norm_text, word2ph, language, device, style_text=None, style_weight=0.7):
22 | from .chinese_bert import get_bert_feature as zh_bert
23 | from .english_bert_mock import get_bert_feature as en_bert
24 | from .japanese_bert import get_bert_feature as jp_bert
25 |
26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
27 | bert = lang_bert_func_map[language](
28 | norm_text, word2ph, device, style_text, style_weight
29 | )
30 | return bert
31 |
32 |
33 | def check_bert_models():
34 | import json
35 | from pathlib import Path
36 |
37 | from config import config
38 | from .bert_utils import _check_bert
39 |
40 | if config.mirror.lower() == "openi":
41 | import openi
42 |
43 | kwargs = {"token": config.openi_token} if config.openi_token else {}
44 | openi.login(**kwargs)
45 |
46 | with open("./bert/bert_models.json", "r") as fp:
47 | models = json.load(fp)
48 | for k, v in models.items():
49 | local_path = Path("./bert").joinpath(k)
50 | _check_bert(v["repo_id"], v["files"], local_path)
51 |
52 |
53 | def init_openjtalk():
54 | import platform
55 |
56 | if platform.system() == "Linux":
57 | import pyopenjtalk
58 |
59 | pyopenjtalk.g2p("こんにちは,世界。")
60 |
61 |
62 | init_openjtalk()
63 | check_bert_models()
64 |
--------------------------------------------------------------------------------
/tools/translate.py:
--------------------------------------------------------------------------------
1 | """
2 | Translation API
3 | """
4 | from config import config
5 |
6 | import random
7 | import hashlib
8 | import requests
9 |
10 |
11 | def translate(Sentence: str, to_Language: str = "jp", from_Language: str = ""):
12 | """
13 | :param Sentence: the sentence to translate
14 | :param from_Language: language of the source sentence
15 | :param to_Language: target language
16 | :return: the translated sentence; the original text is returned if the request fails
17 |
18 | Common language codes: Chinese zh, English en, Japanese jp
19 | """
20 | appid = config.translate_config.app_key
21 | key = config.translate_config.secret_key
22 | if appid == "" or key == "":
23 | return "Please configure app_key and secret_key in config.yml"
24 | url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
25 | texts = Sentence.splitlines()
26 | outTexts = []
27 | for t in texts:
28 | if t != "":
29 | # Signature calculation; see the docs at https://api.fanyi.baidu.com/product/113
30 | salt = str(random.randint(1, 100000))
31 | signString = appid + t + salt + key
32 | hs = hashlib.md5()
33 | hs.update(signString.encode("utf-8"))
34 | signString = hs.hexdigest()
35 | if from_Language == "":
36 | from_Language = "auto"
37 | headers = {"Content-Type": "application/x-www-form-urlencoded"}
38 | payload = {
39 | "q": t,
40 | "from": from_Language,
41 | "to": to_Language,
42 | "appid": appid,
43 | "salt": salt,
44 | "sign": signString,
45 | }
46 | # Send the request
47 | try:
48 | response = requests.post(
49 | url=url, data=payload, headers=headers, timeout=3
50 | )
51 | response = response.json()
52 | if "trans_result" in response.keys():
53 | result = response["trans_result"][0]
54 | if "dst" in result.keys():
55 | dst = result["dst"]
56 | outTexts.append(dst)
57 | except Exception:
58 | return Sentence
59 | else:
60 | outTexts.append(t)
61 | return "\n".join(outTexts)
62 |
--------------------------------------------------------------------------------
/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import DebertaV2Model, DebertaV2Tokenizer
5 |
6 | from config import config
7 |
8 |
9 | LOCAL_PATH = "./bert/deberta-v3-large"
10 |
11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(
17 | text,
18 | word2ph,
19 | device=config.bert_gen_config.device,
20 | style_text=None,
21 | style_weight=0.7,
22 | ):
23 | if (
24 | sys.platform == "darwin"
25 | and torch.backends.mps.is_available()
26 | and device == "cpu"
27 | ):
28 | device = "mps"
29 | if not device:
30 | device = "cuda"
31 | if device not in models.keys():
32 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
33 | with torch.no_grad():
34 | inputs = tokenizer(text, return_tensors="pt")
35 | for i in inputs:
36 | inputs[i] = inputs[i].to(device)
37 | res = models[device](**inputs, output_hidden_states=True)
38 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
39 | if style_text:
40 | style_inputs = tokenizer(style_text, return_tensors="pt")
41 | for i in style_inputs:
42 | style_inputs[i] = style_inputs[i].to(device)
43 | style_res = models[device](**style_inputs, output_hidden_states=True)
44 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
45 | style_res_mean = style_res.mean(0)
46 | assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))
47 | word2phone = word2ph
48 | phone_level_feature = []
49 | for i in range(len(word2phone)):
50 | if style_text:
51 | repeat_feature = (
52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
54 | )
55 | else:
56 | repeat_feature = res[i].repeat(word2phone[i], 1)
57 | phone_level_feature.append(repeat_feature)
58 |
59 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
60 |
61 | return phone_level_feature.T
62 |
--------------------------------------------------------------------------------
/oldVersion/V210/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import DebertaV2Model, DebertaV2Tokenizer
5 |
6 | from config import config
7 |
8 |
9 | LOCAL_PATH = "./bert/deberta-v3-large"
10 |
11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(
17 | text,
18 | word2ph,
19 | device=config.bert_gen_config.device,
20 | style_text=None,
21 | style_weight=0.7,
22 | ):
23 | if (
24 | sys.platform == "darwin"
25 | and torch.backends.mps.is_available()
26 | and device == "cpu"
27 | ):
28 | device = "mps"
29 | if not device:
30 | device = "cuda"
31 | if device not in models.keys():
32 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
33 | with torch.no_grad():
34 | inputs = tokenizer(text, return_tensors="pt")
35 | for i in inputs:
36 | inputs[i] = inputs[i].to(device)
37 | res = models[device](**inputs, output_hidden_states=True)
38 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
39 | if style_text:
40 | style_inputs = tokenizer(style_text, return_tensors="pt")
41 | for i in style_inputs:
42 | style_inputs[i] = style_inputs[i].to(device)
43 | style_res = models[device](**style_inputs, output_hidden_states=True)
44 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
45 | style_res_mean = style_res.mean(0)
46 | assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))
47 | word2phone = word2ph
48 | phone_level_feature = []
49 | for i in range(len(word2phone)):
50 | if style_text:
51 | repeat_feature = (
52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
54 | )
55 | else:
56 | repeat_feature = res[i].repeat(word2phone[i], 1)
57 | phone_level_feature.append(repeat_feature)
58 |
59 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
60 |
61 | return phone_level_feature.T
62 |
--------------------------------------------------------------------------------
/oldVersion/V200/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 | from .japanese import text2sep_kata
8 |
9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese"
10 |
11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
17 | sep_text, _, _ = text2sep_kata(text)
18 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
19 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
20 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3]
21 | return get_bert_feature_with_token(sep_ids, word2ph, device)
22 |
23 |
24 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device):
25 | if (
26 | sys.platform == "darwin"
27 | and torch.backends.mps.is_available()
28 | and device == "cpu"
29 | ):
30 | device = "mps"
31 | if not device:
32 | device = "cuda"
33 | if device not in models.keys():
34 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
35 | with torch.no_grad():
36 | inputs = torch.tensor(tokens).to(device).unsqueeze(0)
37 | token_type_ids = torch.zeros_like(inputs).to(device)
38 | attention_mask = torch.ones_like(inputs).to(device)
39 | inputs = {
40 | "input_ids": inputs,
41 | "token_type_ids": token_type_ids,
42 | "attention_mask": attention_mask,
43 | }
44 |
45 | # for i in inputs:
46 | # inputs[i] = inputs[i].to(device)
47 | res = models[device](**inputs, output_hidden_states=True)
48 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
49 | assert inputs["input_ids"].shape[-1] == len(word2ph)
50 | word2phone = word2ph
51 | phone_level_feature = []
52 | for i in range(len(word2phone)):
53 | repeat_feature = res[i].repeat(word2phone[i], 1)
54 | phone_level_feature.append(repeat_feature)
55 |
56 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
57 |
58 | return phone_level_feature.T
59 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/fix/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoTokenizer, AutoModelForMaskedLM
3 | import sys
4 | from .japanese import text2sep_kata
5 | from config import config
6 |
7 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
8 |
9 | models = dict()
10 |
11 |
12 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
13 | sep_text, _ = text2sep_kata(text)
14 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
15 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
16 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3]
17 | return get_bert_feature_with_token(sep_ids, word2ph, device)
18 |
19 |
20 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device):
21 | if (
22 | sys.platform == "darwin"
23 | and torch.backends.mps.is_available()
24 | and device == "cpu"
25 | ):
26 | device = "mps"
27 | if not device:
28 | device = "cuda"
29 | if device not in models.keys():
30 | models[device] = AutoModelForMaskedLM.from_pretrained(
31 | "./bert/bert-base-japanese-v3"
32 | ).to(device)
33 | with torch.no_grad():
34 | inputs = torch.tensor(tokens).to(device).unsqueeze(0)
35 | token_type_ids = torch.zeros_like(inputs).to(device)
36 | attention_mask = torch.ones_like(inputs).to(device)
37 | inputs = {
38 | "input_ids": inputs,
39 | "token_type_ids": token_type_ids,
40 | "attention_mask": attention_mask,
41 | }
42 |
43 | # for i in inputs:
44 | # inputs[i] = inputs[i].to(device)
45 | res = models[device](**inputs, output_hidden_states=True)
46 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
47 | assert inputs["input_ids"].shape[-1] == len(word2ph)
48 | word2phone = word2ph
49 | phone_level_feature = []
50 | for i in range(len(word2phone)):
51 | repeat_feature = res[i].repeat(word2phone[i], 1)
52 | phone_level_feature.append(repeat_feature)
53 |
54 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
55 |
56 | return phone_level_feature.T
57 |
--------------------------------------------------------------------------------
/onnx_modules/V200/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 | from .japanese import text2sep_kata
8 |
9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese"
10 |
11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
17 | sep_text, _, _ = text2sep_kata(text)
18 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
19 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
20 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3]
21 | return get_bert_feature_with_token(sep_ids, word2ph, device)
22 |
23 |
24 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device):
25 | if (
26 | sys.platform == "darwin"
27 | and torch.backends.mps.is_available()
28 | and device == "cpu"
29 | ):
30 | device = "mps"
31 | if not device:
32 | device = "cuda"
33 | if device not in models.keys():
34 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
35 | with torch.no_grad():
36 | inputs = torch.tensor(tokens).to(device).unsqueeze(0)
37 | token_type_ids = torch.zeros_like(inputs).to(device)
38 | attention_mask = torch.ones_like(inputs).to(device)
39 | inputs = {
40 | "input_ids": inputs,
41 | "token_type_ids": token_type_ids,
42 | "attention_mask": attention_mask,
43 | }
44 |
45 | # for i in inputs:
46 | # inputs[i] = inputs[i].to(device)
47 | res = models[device](**inputs, output_hidden_states=True)
48 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
49 | assert inputs["input_ids"].shape[-1] == len(word2ph)
50 | word2phone = word2ph
51 | phone_level_feature = []
52 | for i in range(len(word2phone)):
53 | repeat_feature = res[i].repeat(word2phone[i], 1)
54 | phone_level_feature.append(repeat_feature)
55 |
56 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
57 |
58 | return phone_level_feature.T
59 |
--------------------------------------------------------------------------------
/resample.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import librosa
4 | from multiprocessing import Pool, cpu_count
5 |
6 | import soundfile
7 | from tqdm import tqdm
8 |
9 | from config import config
10 |
11 |
12 | def process(item):
13 | spkdir, wav_name, args = item
14 | wav_path = os.path.join(args.in_dir, spkdir, wav_name)
15 | if os.path.exists(wav_path) and wav_path.lower().endswith(".wav"):
16 | wav, sr = librosa.load(wav_path, sr=args.sr)
17 | soundfile.write(os.path.join(args.out_dir, spkdir, wav_name), wav, sr)
18 |
19 |
20 | if __name__ == "__main__":
21 | parser = argparse.ArgumentParser()
22 | parser.add_argument(
23 | "--sr",
24 | type=int,
25 | default=config.resample_config.sampling_rate,
26 | help="sampling rate",
27 | )
28 | parser.add_argument(
29 | "--in_dir",
30 | type=str,
31 | default=config.resample_config.in_dir,
32 | help="path to source dir",
33 | )
34 | parser.add_argument(
35 | "--out_dir",
36 | type=str,
37 | default=config.resample_config.out_dir,
38 | help="path to target dir",
39 | )
40 | parser.add_argument(
41 | "--processes",
42 | type=int,
43 | default=0,
44 | help="cpu_processes",
45 | )
46 | args, _ = parser.parse_known_args()
47 | # autodl CPU-only (no GPU) instances report 46 CPUs
48 | if args.processes == 0:
49 | processes = cpu_count() - 2 if cpu_count() > 4 else 1
50 | else:
51 | processes = args.processes
52 | pool = Pool(processes=processes)
53 |
54 | tasks = []
55 |
56 | for dirpath, _, filenames in os.walk(args.in_dir):
57 | # speaker sub-directory
58 | spk_dir = os.path.relpath(dirpath, args.in_dir)
59 | spk_dir_out = os.path.join(args.out_dir, spk_dir)
60 | if not os.path.isdir(spk_dir_out):
61 | os.makedirs(spk_dir_out, exist_ok=True)
62 | for filename in filenames:
63 | if filename.lower().endswith(".wav"):
64 | twople = (spk_dir, filename, args)
65 | tasks.append(twople)
66 |
67 | for _ in tqdm(
68 | pool.imap_unordered(process, tasks),
69 | ):
70 | pass
71 |
72 | pool.close()
73 | pool.join()
74 |
75 | print("Audio resampling finished!")
76 |
--------------------------------------------------------------------------------
/clap_gen.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from multiprocessing import Pool, cpu_count
3 |
4 | import torch
5 | import torch.multiprocessing as mp
6 | from tqdm import tqdm
7 |
8 | import utils
9 | from config import config
10 | from clap_wrapper import get_clap_audio_feature
11 | import librosa
12 | import os
13 |
14 | os.environ["OMP_NUM_THREADS"] = "1"
15 | os.environ["MKL_NUM_THREADS"] = "1"
16 |
17 |
18 | def process_line(line):
19 | device = config.emo_gen_config.device
20 | if config.emo_gen_config.use_multi_device:
21 | rank = mp.current_process()._identity
22 | rank = rank[0] if len(rank) > 0 else 0
23 | if torch.cuda.is_available():
24 | gpu_id = rank % torch.cuda.device_count()
25 | device = torch.device(f"cuda:{gpu_id}")
26 | else:
27 | device = torch.device("cpu")
28 | wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")
29 |
30 | clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.pt")
31 | if os.path.isfile(clap_path):
32 | return
33 |
34 | audio = librosa.load(wav_path, sr=48000)[0]
35 | # audio = librosa.resample(audio, 44100, 48000)
36 |
37 | clap = get_clap_audio_feature(audio, device)
38 | torch.save(clap, clap_path)
39 |
40 |
41 | if __name__ == "__main__":
42 | parser = argparse.ArgumentParser()
43 | parser.add_argument(
44 | "-c", "--config", type=str, default=config.emo_gen_config.config_path
45 | )
46 | parser.add_argument(
47 | "--num_processes", type=int, default=config.emo_gen_config.num_processes
48 | )
49 | args, _ = parser.parse_known_args()
50 | config_path = args.config
51 | hps = utils.get_hparams_from_file(config_path)
52 | lines = []
53 | with open(hps.data.training_files, encoding="utf-8") as f:
54 | lines.extend(f.readlines())
55 |
56 | with open(hps.data.validation_files, encoding="utf-8") as f:
57 | lines.extend(f.readlines())
58 | if len(lines) != 0:
59 | num_processes = min(args.num_processes, cpu_count())
60 | with Pool(processes=num_processes) as pool:
61 | for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
62 | pass
63 |
64 | print(f"clap生成完毕!, 共有{len(lines)}个emo.pt生成!")
65 |
--------------------------------------------------------------------------------
/bert/chinese-roberta-wwm-ext-large/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | language:
3 | - zh
4 | tags:
5 | - bert
6 | license: "apache-2.0"
7 | ---
8 |
9 | # Please use 'Bert' related functions to load this model!
10 |
11 | ## Chinese BERT with Whole Word Masking
12 | For further accelerating Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**.
13 |
14 | **[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)**
15 | Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, Ziqing Yang, Shijin Wang, Guoping Hu
16 |
17 | This repository is developed based on: https://github.com/google-research/bert
18 |
19 | You may also be interested in:
20 | - Chinese BERT series: https://github.com/ymcui/Chinese-BERT-wwm
21 | - Chinese MacBERT: https://github.com/ymcui/MacBERT
22 | - Chinese ELECTRA: https://github.com/ymcui/Chinese-ELECTRA
23 | - Chinese XLNet: https://github.com/ymcui/Chinese-XLNet
24 | - Knowledge Distillation Toolkit - TextBrewer: https://github.com/airaria/TextBrewer
25 |
26 | More resources by HFL: https://github.com/ymcui/HFL-Anthology
27 |
28 | ## Citation
29 | If you find the technical report or resource is useful, please cite the following technical report in your paper.
30 | - Primary: https://arxiv.org/abs/2004.13922
31 | ```
32 | @inproceedings{cui-etal-2020-revisiting,
33 | title = "Revisiting Pre-Trained Models for {C}hinese Natural Language Processing",
34 | author = "Cui, Yiming and
35 | Che, Wanxiang and
36 | Liu, Ting and
37 | Qin, Bing and
38 | Wang, Shijin and
39 | Hu, Guoping",
40 | booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Findings",
41 | month = nov,
42 | year = "2020",
43 | address = "Online",
44 | publisher = "Association for Computational Linguistics",
45 | url = "https://www.aclweb.org/anthology/2020.findings-emnlp.58",
46 | pages = "657--668",
47 | }
48 | ```
49 | - Secondary: https://arxiv.org/abs/1906.08101
50 | ```
51 | @article{chinese-bert-wwm,
52 | title={Pre-Training with Whole Word Masking for Chinese BERT},
53 | author={Cui, Yiming and Che, Wanxiang and Liu, Ting and Qin, Bing and Yang, Ziqing and Wang, Shijin and Hu, Guoping},
54 | journal={arXiv preprint arXiv:1906.08101},
55 | year={2019}
56 | }
57 | ```
58 |
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": "0.2.0",
3 | "configurations": [
4 | {
5 | "name": "Python: WebUI",
6 | "type": "python",
7 | "request": "launch",
8 | "program": "${file}",
9 | "console": "integratedTerminal",
10 | "args": [
11 | "--config_dir",
12 | "${workspaceFolder}/configs/config.json",
13 | "--debug"
14 | ],
15 | "justMyCode": false
16 | },
17 | {
18 | "name": "Train: Init",
19 | "type": "python",
20 | "request": "launch",
21 | "program": "${file}",
22 | "console": "integratedTerminal",
23 | "args": [
24 | "-m",
25 | "OUTPUT_MODEL",
26 | "--config",
27 | "${workspaceFolder}/configs/config.json",
28 | ],
29 | "justMyCode": false
30 | },
31 | {
32 | "name": "Train: Visemes",
33 | "type": "python",
34 | "request": "launch",
35 | "program": "${file}",
36 | "console": "integratedTerminal",
37 | "args": [
38 | "-m",
39 | "OUTPUT_MODEL",
40 | "--config",
41 | "${workspaceFolder}/configs/config.json",
42 | "--visemes",
43 | ],
44 | "justMyCode": false
45 | },
46 | {
47 | "name": "prepare: Visemes",
48 | "type": "python",
49 | "request": "launch",
50 | "program": "${file}",
51 | "console": "integratedTerminal",
52 | "justMyCode": false
53 | },
54 | {
55 | "name": "motion: VMC",
56 | "type": "python",
57 | "request": "launch",
58 | "program": "${file}",
59 | "console": "integratedTerminal",
60 | "args": [
61 | "--a2p",
62 | "a2p_rotations.npy",
63 | "--positions_files",
64 | "a2p_motions.npy",
65 | "--do_linear_interpolation",
66 | "False",
67 | "--fps",
68 | "30",
69 | ],
70 | "justMyCode": false
71 | },
72 | ]
73 | }
--------------------------------------------------------------------------------
/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 | from text.japanese import text2sep_kata
8 |
9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm"
10 |
11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(
17 | text,
18 | word2ph,
19 | device=config.bert_gen_config.device,
20 | style_text=None,
21 | style_weight=0.7,
22 | ):
23 | text = "".join(text2sep_kata(text)[0])
24 | if style_text:
25 | style_text = "".join(text2sep_kata(style_text)[0])
26 | if (
27 | sys.platform == "darwin"
28 | and torch.backends.mps.is_available()
29 | and device == "cpu"
30 | ):
31 | device = "mps"
32 | if not device:
33 | device = "cuda"
34 | if device not in models.keys():
35 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
36 | with torch.no_grad():
37 | inputs = tokenizer(text, return_tensors="pt")
38 | for i in inputs:
39 | inputs[i] = inputs[i].to(device)
40 | res = models[device](**inputs, output_hidden_states=True)
41 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
42 | if style_text:
43 | style_inputs = tokenizer(style_text, return_tensors="pt")
44 | for i in style_inputs:
45 | style_inputs[i] = style_inputs[i].to(device)
46 | style_res = models[device](**style_inputs, output_hidden_states=True)
47 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
48 | style_res_mean = style_res.mean(0)
49 |
50 | assert len(word2ph) == len(text) + 2
51 | word2phone = word2ph
52 | phone_level_feature = []
53 | for i in range(len(word2phone)):
54 | if style_text:
55 | repeat_feature = (
56 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
57 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
58 | )
59 | else:
60 | repeat_feature = res[i].repeat(word2phone[i], 1)
61 | phone_level_feature.append(repeat_feature)
62 |
63 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
64 |
65 | return phone_level_feature.T
66 |
--------------------------------------------------------------------------------
/oldVersion/V210/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 | from .japanese import text2sep_kata
8 |
9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm"
10 |
11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(
17 | text,
18 | word2ph,
19 | device=config.bert_gen_config.device,
20 | style_text=None,
21 | style_weight=0.7,
22 | ):
23 | text = "".join(text2sep_kata(text)[0])
24 | if style_text:
25 | style_text = "".join(text2sep_kata(style_text)[0])
26 | if (
27 | sys.platform == "darwin"
28 | and torch.backends.mps.is_available()
29 | and device == "cpu"
30 | ):
31 | device = "mps"
32 | if not device:
33 | device = "cuda"
34 | if device not in models.keys():
35 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
36 | with torch.no_grad():
37 | inputs = tokenizer(text, return_tensors="pt")
38 | for i in inputs:
39 | inputs[i] = inputs[i].to(device)
40 | res = models[device](**inputs, output_hidden_states=True)
41 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
42 | if style_text:
43 | style_inputs = tokenizer(style_text, return_tensors="pt")
44 | for i in style_inputs:
45 | style_inputs[i] = style_inputs[i].to(device)
46 | style_res = models[device](**style_inputs, output_hidden_states=True)
47 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
48 | style_res_mean = style_res.mean(0)
49 |
50 | assert len(word2ph) == len(text) + 2
51 | word2phone = word2ph
52 | phone_level_feature = []
53 | for i in range(len(word2phone)):
54 | if style_text:
55 | repeat_feature = (
56 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
57 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
58 | )
59 | else:
60 | repeat_feature = res[i].repeat(word2phone[i], 1)
61 | phone_level_feature.append(repeat_feature)
62 |
63 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
64 |
65 | return phone_level_feature.T
66 |
--------------------------------------------------------------------------------
/oldVersion/V101/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Compatibility with version 1.0.1
3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.0.1
4 | """
5 | import torch
6 | import commons
7 | from .text.cleaner import clean_text
8 | from .text import cleaned_text_to_sequence
9 | from oldVersion.V111.text import get_bert
10 |
11 |
12 | def get_text(text, language_str, hps, device):
13 | norm_text, phone, tone, word2ph = clean_text(text, language_str)
14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
15 |
16 | if hps.data.add_blank:
17 | phone = commons.intersperse(phone, 0)
18 | tone = commons.intersperse(tone, 0)
19 | language = commons.intersperse(language, 0)
20 | for i in range(len(word2ph)):
21 | word2ph[i] = word2ph[i] * 2
22 | word2ph[0] += 1
23 | bert = get_bert(norm_text, word2ph, language_str, device)
24 | del word2ph
25 |
26 | assert bert.shape[-1] == len(phone)
27 |
28 | phone = torch.LongTensor(phone)
29 | tone = torch.LongTensor(tone)
30 | language = torch.LongTensor(language)
31 |
32 | return bert, phone, tone, language
33 |
34 |
35 | def infer(
36 | text,
37 | sdp_ratio,
38 | noise_scale,
39 | noise_scale_w,
40 | length_scale,
41 | sid,
42 | hps,
43 | net_g,
44 | device,
45 | ):
46 | bert, phones, tones, lang_ids = get_text(text, "ZH", hps, device)
47 | with torch.no_grad():
48 | x_tst = phones.to(device).unsqueeze(0)
49 | tones = tones.to(device).unsqueeze(0)
50 | lang_ids = lang_ids.to(device).unsqueeze(0)
51 | bert = bert.to(device).unsqueeze(0)
52 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
53 | del phones
54 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
55 | audio = (
56 | net_g.infer(
57 | x_tst,
58 | x_tst_lengths,
59 | speakers,
60 | tones,
61 | lang_ids,
62 | bert,
63 | sdp_ratio=sdp_ratio,
64 | noise_scale=noise_scale,
65 | noise_scale_w=noise_scale_w,
66 | length_scale=length_scale,
67 | )[0][0, 0]
68 | .data.cpu()
69 | .float()
70 | .numpy()
71 | )
72 | del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers
73 | if torch.cuda.is_available():
74 | torch.cuda.empty_cache()
75 | return audio
76 |
--------------------------------------------------------------------------------
/oldVersion/V110/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | from transformers import AutoTokenizer, AutoModelForMaskedLM
4 |
5 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large")
6 |
7 |
8 | def get_bert_feature(text, word2ph, device=None):
9 | if (
10 | sys.platform == "darwin"
11 | and torch.backends.mps.is_available()
12 | and device == "cpu"
13 | ):
14 | device = "mps"
15 | if not device:
16 | device = "cuda"
17 | model = AutoModelForMaskedLM.from_pretrained(
18 | "./bert/chinese-roberta-wwm-ext-large"
19 | ).to(device)
20 | with torch.no_grad():
21 | inputs = tokenizer(text, return_tensors="pt")
22 | for i in inputs:
23 | inputs[i] = inputs[i].to(device)
24 | res = model(**inputs, output_hidden_states=True)
25 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
26 |
27 | assert len(word2ph) == len(text) + 2
28 | word2phone = word2ph
29 | phone_level_feature = []
30 | for i in range(len(word2phone)):
31 | repeat_feature = res[i].repeat(word2phone[i], 1)
32 | phone_level_feature.append(repeat_feature)
33 |
34 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
35 |
36 | return phone_level_feature.T
37 |
38 |
39 | if __name__ == "__main__":
40 | import torch
41 |
42 | word_level_feature = torch.rand(38, 1024)  # 38 words, each with a 1024-dim feature
43 | word2phone = [
44 | 1,
45 | 2,
46 | 1,
47 | 2,
48 | 2,
49 | 1,
50 | 2,
51 | 2,
52 | 1,
53 | 2,
54 | 2,
55 | 1,
56 | 2,
57 | 2,
58 | 2,
59 | 2,
60 | 2,
61 | 1,
62 | 1,
63 | 2,
64 | 2,
65 | 1,
66 | 2,
67 | 2,
68 | 2,
69 | 2,
70 | 1,
71 | 2,
72 | 2,
73 | 2,
74 | 2,
75 | 2,
76 | 1,
77 | 2,
78 | 2,
79 | 2,
80 | 2,
81 | 1,
82 | ]
83 |
84 | # total number of frames
85 | total_frames = sum(word2phone)
86 | print(word_level_feature.shape)
87 | print(word2phone)
88 | phone_level_feature = []
89 | for i in range(len(word2phone)):
90 | print(word_level_feature[i].shape)
91 |
92 | # repeat each word's feature word2phone[i] times
93 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
94 | phone_level_feature.append(repeat_feature)
95 |
96 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
97 | print(phone_level_feature.shape)  # torch.Size([sum(word2phone), 1024])
98 |
--------------------------------------------------------------------------------
/motion/record.py:
--------------------------------------------------------------------------------
1 | # Record audio while listening on UDP for the data sent by ARKit, then save each to its own file
2 | import wave
3 | import threading
4 | import pyaudio
5 | import datetime
6 | import live_link
7 |
8 | class AudioRecorder():
9 | '''
10 | Record audio with pyaudio and save it to a wav file
11 | '''
12 | def __init__(self, filename=None, save_path= './records', channels=1, rate=44100, chunk=1024):
13 | if filename is None:
14 | # timestamped filename: yyyy-mm-dd-hh-mm-ss.wav
15 | filename = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + '.wav'
16 | else:
17 | filename += '.wav'
18 | self.isrecording = False
19 | self.save_path = save_path
20 | self.filename = filename
21 | self.channels = channels
22 | self.rate = rate
23 | self.chunk = chunk
24 | self.p = pyaudio.PyAudio()
25 | self.stream = self.p.open(
26 | format=pyaudio.paInt16,
27 | channels=self.channels,
28 | rate=self.rate,
29 | input=True,
30 | frames_per_buffer=self.chunk
31 | )
32 |
33 | def close(self):
34 | # self.stream.stop_stream()
35 | self.stream.close()
36 | self.p.terminate()
37 |
38 | def save(self):
39 | wf = wave.open(self.save_path + '/' + self.filename, 'wb')
40 | wf.setnchannels(self.channels)
41 | wf.setsampwidth(self.p.get_sample_size(pyaudio.paInt16))
42 | wf.setframerate(self.rate)
43 | wf.writeframes(b''.join(self.frames))
44 | wf.close()
45 |
46 | def __recording(self):
47 | while self.isrecording:
48 | self.frames.append(self.stream.read(self.chunk))
49 |
50 | def start(self):
51 | print('Start recording')
52 | self.frames = []
53 | self.isrecording = True
54 | self.t = threading.Thread(target=self.__recording)
55 | self.t.daemon = True
56 | self.t.start()
57 |
58 |
59 | if __name__ == '__main__':
60 | filename_prefix = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
61 | recorder = AudioRecorder(filename=filename_prefix)
62 | # arkit 127.0.0.1 11111
63 | arkit_recorder = live_link.UdpRecvHandlerForArkit(
64 | (
65 | "0.0.0.0",
66 | 11111
67 | ),
68 | filename_prefix=filename_prefix
69 | )
70 | arkit_recorder.start()
71 | recorder.start()
72 | print('Recording audio...')
73 | # record for 20 seconds
74 | import time
75 | time.sleep(20)
76 | recorder.save()
77 | arkit_recorder.save()
78 | print('End audio recording')
79 | recorder.close()
80 | arkit_recorder.close()
81 | print('End arkit recording')
82 |
--------------------------------------------------------------------------------
/Improvement_2025.md:
--------------------------------------------------------------------------------
1 | ### Model improvement idea: a CRNN (convolutional recurrent neural network)
2 |
3 | The old model uses the latent variable `z` from the TTS network directly as input, which relies on `z` fully capturing every speech feature relevant to facial expression. A more direct, and likely more effective, approach is to learn straight from the audio's mel-spectrogram. This turns the problem into a standard audio-to-sequence task that can be handled with a classic convolutional recurrent neural network (CRNN) architecture.
4 |
5 | The steps below describe the PyTorch changes for the CRNN model, with a GRU handling the temporal information:
6 |
7 | 1. **Input adjustment**:
8 |
9 | - The model input is no longer `z` (`[B, C, T]`) but the audio's mel-spectrogram.
10 | - The input shape is `[B, 1, n_mels, n_frames]` (batch, channel, frequency, time). `Conv2d` is used here because it can model local structure along both the time and frequency axes.
11 |
12 | 2. **Convolutional feature extraction (CNN front-end)**:
13 |
14 | - Add a few 2D convolution layers (`nn.Conv2d`) at the front of the model to extract high-level features from the spectrogram.
15 | - A typical stack is `Conv2d` -> `BatchNorm2d` -> `LeakyReLU` -> `MaxPool2d`, which shrinks the feature map while enlarging the receptive field.
16 |
17 | ```python
 18 |     # pseudocode
19 | self.cnn = nn.Sequential(
20 | nn.Conv2d(1, 32, kernel_size=3, padding=1),
21 | nn.BatchNorm2d(32),
22 | nn.LeakyReLU(0.2),
23 | nn.MaxPool2d(2), # (B, 32, n_mels/2, n_frames/2)
24 |
25 | nn.Conv2d(32, 64, kernel_size=3, padding=1),
26 | nn.BatchNorm2d(64),
27 | nn.LeakyReLU(0.2),
28 | nn.MaxPool2d(2) # (B, 64, n_mels/4, n_frames/4)
29 | )
30 | ```
31 |
32 | 3. **Reshape**:
33 |
 34 |    - Reshape the feature maps produced by the CNN from `[B, C, H, W]` into the `[B, T, F]` layout expected by the recurrent layer.
 35 |    - `T` is the sequence length in time and `F` is the feature dimension per time step.
36 |
37 | ```python
38 | # x is output from self.cnn
39 | B, C, H, W = x.shape
40 | x = x.permute(0, 3, 1, 2) # [B, W, C, H]
41 | x = x.reshape(B, W, C * H) # [B, T, F], where T=W, F=C*H
42 | ```
43 |
 44 | 4. **Temporal modelling (GRU)**:
 45 |
 46 |    - Use an `nn.GRU` layer to capture the temporal dependencies in the feature sequence.
 47 |    - A bidirectional GRU (`bidirectional=True`) usually performs better because it can use both past and future context.
 48 |    - The GRU outputs a hidden state for every time step. If only a single feature vector is needed for the whole sequence (e.g. for classification or single-frame expression prediction), take the output of the last time step or pool over all time steps (e.g. global average pooling along the time axis).
49 |
50 | ```python
 51 |     # pseudocode
52 | self.gru = nn.GRU(input_size=C*H, hidden_size=128, num_layers=2, bidirectional=True, batch_first=True)
53 | # x from reshape
54 | x, _ = self.gru(x)
55 | ```
56 |
 57 | 5. **Output head**:
 58 |    - After the GRU, attach one or more fully connected layers (`nn.Linear`) that map the GRU output to the final ARKit expression-parameter dimension.
 59 |    - The last layer uses a `Sigmoid` activation so the output values are normalized to the `[0, 1]` range.
60 |
61 | ```python
 62 |     # pseudocode
63 | # x is output from self.gru
64 | self.fc = nn.Sequential(
65 | nn.Linear(128 * 2, 128), # *2 because of bidirectional
66 | nn.ReLU(),
67 | nn.Dropout(0.3),
68 | nn.Linear(128, 61), # 61 is n_arkit_outputs
69 | nn.Sigmoid()
70 | )
 71 |    # nn.GRU already returns the output for every time step, so reduce the time
 72 |    # axis if a single vector is needed, e.g. take the last step: x = x[:, -1, :]
73 | y = self.fc(x)
74 | ```
75 |
 76 | The advantage of this CRNN architecture is that it decouples feature extraction (CNN) from temporal modelling (GRU), which keeps the model structure clear and lets it learn facial animation more effectively from the raw audio signal.
77 |
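For reference, here is a minimal end-to-end sketch that assembles the pieces above into one module. It is illustrative only: `n_mels=128` mirrors the repo's mel configuration, the layer sizes follow the snippets above, and the class name `CRNNVisemePredictor` is made up.

```python
import torch
import torch.nn as nn


class CRNNVisemePredictor(nn.Module):
    """CNN front-end + bidirectional GRU + per-frame ARKit head (sketch)."""

    def __init__(self, n_mels=128, n_arkit_outputs=61, gru_hidden=128):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.LeakyReLU(0.2),
            nn.MaxPool2d(2),                       # (B, 32, n_mels/2, T/2)
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.2),
            nn.MaxPool2d(2),                       # (B, 64, n_mels/4, T/4)
        )
        feat_dim = 64 * (n_mels // 4)              # F = C * H after the CNN
        self.gru = nn.GRU(
            input_size=feat_dim,
            hidden_size=gru_hidden,
            num_layers=2,
            bidirectional=True,
            batch_first=True,
        )
        self.fc = nn.Sequential(
            nn.Linear(gru_hidden * 2, 128),        # *2 because of bidirectional
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, n_arkit_outputs),
            nn.Sigmoid(),
        )

    def forward(self, mel):                        # mel: [B, 1, n_mels, n_frames]
        x = self.cnn(mel)                          # [B, C, H, W]
        b, c, h, w = x.shape
        x = x.permute(0, 3, 1, 2).reshape(b, w, c * h)  # [B, T, F]
        x, _ = self.gru(x)                         # [B, T, 2 * gru_hidden]
        return self.fc(x)                          # [B, T, n_arkit_outputs]


if __name__ == "__main__":
    model = CRNNVisemePredictor()
    dummy = torch.randn(2, 1, 128, 200)            # 2 clips, 128 mel bins, 200 frames
    print(model(dummy).shape)                      # torch.Size([2, 50, 61])
```

The per-frame outputs in `[0, 1]` can then be trained against the recorded ARKit curves with, for example, an MSE loss.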
--------------------------------------------------------------------------------
/re_matching.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | def extract_language_and_text_updated(speaker, dialogue):
  5 |     # Use a regex to match <language> tags and the text following each one
6 | pattern_language_text = r"<(\S+?)>([^<]+)"
7 | matches = re.findall(pattern_language_text, dialogue, re.DOTALL)
8 | speaker = speaker[1:-1]
  9 |     # Clean the text: strip surrounding whitespace
10 | matches_cleaned = [(lang.upper(), text.strip()) for lang, text in matches]
11 | matches_cleaned.append(speaker)
12 | return matches_cleaned
13 |
14 |
15 | def validate_text(input_text):
 16 |     # Regex for validating the speaker blocks
17 | pattern_speaker = r"(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)"
18 |
 19 |     # re.DOTALL makes "." match every character, including newlines
20 | matches = re.findall(pattern_speaker, input_text, re.DOTALL)
21 |
 22 |     # Further validate the dialogue content of every matched speaker
23 | for _, dialogue in matches:
24 | language_text_matches = extract_language_and_text_updated(_, dialogue)
25 | if not language_text_matches:
26 | return (
27 | False,
28 | "Error: Invalid format detected in dialogue content. Please check your input.",
29 | )
30 |
 31 |     # No speaker match was found anywhere in the input text
32 | if not matches:
33 | return (
34 | False,
35 | "Error: No valid speaker format detected. Please check your input.",
36 | )
37 |
38 | return True, "Input is valid."
39 |
40 |
41 | def text_matching(text: str) -> list:
42 | speaker_pattern = r"(\[\S+?\])(.+?)(?=\[\S+?\]|$)"
43 | matches = re.findall(speaker_pattern, text, re.DOTALL)
44 | result = []
45 | for speaker, dialogue in matches:
46 | result.append(extract_language_and_text_updated(speaker, dialogue))
47 | return result
48 |
49 |
50 | def cut_para(text):
 51 |     splitted_para = re.split("[\n]", text)  # split into paragraphs
 52 |     splitted_para = [
 53 |         sentence.strip() for sentence in splitted_para if sentence.strip()
 54 |     ]  # drop empty strings
55 | return splitted_para
56 |
57 |
58 | def cut_sent(para):
 59 |     para = re.sub("([。!;?\?])([^”’])", r"\1\n\2", para)  # single-character sentence terminators
 60 |     para = re.sub("(\.{6})([^”’])", r"\1\n\2", para)  # English ellipsis (six dots)
 61 |     para = re.sub("(\…{2})([^”’])", r"\1\n\2", para)  # Chinese ellipsis (……)
 62 |     para = re.sub("([。!?\?][”’])([^,。!?\?])", r"\1\n\2", para)
 63 |     para = para.rstrip()  # drop any extra trailing \n at the end of the paragraph
64 | return para.split("\n")
65 |
66 |
67 | if __name__ == "__main__":
68 | text = """
69 | [说话人1]
70 | [说话人2]你好吗?元気ですか?こんにちは,世界。你好吗?
71 | [说话人3]谢谢。どういたしまして。
72 | """
73 | text_matching(text)
 74 |     # test the functions
75 | test_text = """
76 | [说话人1]你好,こんにちは!こんにちは,世界。
77 | [说话人2]你好吗?
78 | """
79 | text_matching(test_text)
80 | res = validate_text(test_text)
81 | print(res)
82 |
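# Usage sketch (illustrative) for the mini markup format handled above, assuming
# the module is importable as re_matching; the speaker name and text are made up.
#
#     from re_matching import text_matching, validate_text
#
#     line = "[speaker1]<ZH>你好<JP>こんにちは"
#     print(text_matching(line))
#     # -> [[('ZH', '你好'), ('JP', 'こんにちは'), 'speaker1']]
#     print(validate_text(line))
#     # -> (True, 'Input is valid.')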
--------------------------------------------------------------------------------
/oldVersion/V101/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | from transformers import AutoTokenizer, AutoModelForMaskedLM
4 |
5 | device = torch.device(
6 | "cuda"
7 | if torch.cuda.is_available()
8 | else (
9 | "mps"
10 | if sys.platform == "darwin" and torch.backends.mps.is_available()
11 | else "cpu"
12 | )
13 | )
14 |
15 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large")
16 | model = AutoModelForMaskedLM.from_pretrained("./bert/chinese-roberta-wwm-ext-large").to(
17 | device
18 | )
19 |
20 |
21 | def get_bert_feature(text, word2ph):
22 | with torch.no_grad():
23 | inputs = tokenizer(text, return_tensors="pt")
24 | for i in inputs:
25 | inputs[i] = inputs[i].to(device)
26 | res = model(**inputs, output_hidden_states=True)
27 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
28 |
29 | assert len(word2ph) == len(text) + 2
30 | word2phone = word2ph
31 | phone_level_feature = []
32 | for i in range(len(word2phone)):
33 | repeat_feature = res[i].repeat(word2phone[i], 1)
34 | phone_level_feature.append(repeat_feature)
35 |
36 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
37 |
38 | return phone_level_feature.T
39 |
40 |
41 | if __name__ == "__main__":
42 | # feature = get_bert_feature('你好,我是说的道理。')
43 | import torch
44 |
 45 |     word_level_feature = torch.rand(38, 1024)  # word-level features: one 1024-dim vector per word
46 | word2phone = [
47 | 1,
48 | 2,
49 | 1,
50 | 2,
51 | 2,
52 | 1,
53 | 2,
54 | 2,
55 | 1,
56 | 2,
57 | 2,
58 | 1,
59 | 2,
60 | 2,
61 | 2,
62 | 2,
63 | 2,
64 | 1,
65 | 1,
66 | 2,
67 | 2,
68 | 1,
69 | 2,
70 | 2,
71 | 2,
72 | 2,
73 | 1,
74 | 2,
75 | 2,
76 | 2,
77 | 2,
78 | 2,
79 | 1,
80 | 2,
81 | 2,
82 | 2,
83 | 2,
84 | 1,
85 | ]
86 |
 87 |     # total number of phone frames
88 | total_frames = sum(word2phone)
89 | print(word_level_feature.shape)
90 | print(word2phone)
91 | phone_level_feature = []
92 | for i in range(len(word2phone)):
93 | print(word_level_feature[i].shape)
94 |
 95 |         # repeat each word's feature word2phone[i] times
96 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
97 | phone_level_feature.append(repeat_feature)
98 |
99 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
100 | print(phone_level_feature.shape) # torch.Size([36, 1024])
101 |
--------------------------------------------------------------------------------
/compress_model.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from text.symbols import symbols
3 | import torch
4 |
5 | from tools.log import logger
6 | import utils
7 | from models import SynthesizerTrn
8 | import os
9 |
10 |
11 | def copyStateDict(state_dict):
12 | if list(state_dict.keys())[0].startswith("module"):
13 | start_idx = 1
14 | else:
15 | start_idx = 0
16 | new_state_dict = OrderedDict()
17 | for k, v in state_dict.items():
 18 |         name = ".".join(k.split(".")[start_idx:])  # re-join with "." after dropping the "module." prefix
19 | new_state_dict[name] = v
20 | return new_state_dict
21 |
22 |
23 | def removeOptimizer(config: str, input_model: str, ishalf: bool, output_model: str):
24 | hps = utils.get_hparams_from_file(config)
25 |
26 | net_g = SynthesizerTrn(
27 | len(symbols),
28 | hps.data.filter_length // 2 + 1,
29 | hps.train.segment_size // hps.data.hop_length,
30 | n_speakers=hps.data.n_speakers,
31 | **hps.model,
32 | )
33 |
34 | optim_g = torch.optim.AdamW(
35 | net_g.parameters(),
36 | hps.train.learning_rate,
37 | betas=hps.train.betas,
38 | eps=hps.train.eps,
39 | )
40 |
41 | state_dict_g = torch.load(input_model, map_location="cpu")
42 | new_dict_g = copyStateDict(state_dict_g)
43 | keys = []
44 | for k, v in new_dict_g["model"].items():
45 | if "enc_q" in k:
46 | continue # noqa: E701
47 | keys.append(k)
48 |
49 | new_dict_g = (
50 | {k: new_dict_g["model"][k].half() for k in keys}
51 | if ishalf
52 | else {k: new_dict_g["model"][k] for k in keys}
53 | )
54 |
55 | torch.save(
56 | {
57 | "model": new_dict_g,
58 | "iteration": 0,
59 | "optimizer": optim_g.state_dict(),
60 | "learning_rate": 0.0001,
61 | },
62 | output_model,
63 | )
64 |
65 |
66 | if __name__ == "__main__":
67 | import argparse
68 |
69 | parser = argparse.ArgumentParser()
70 | parser.add_argument("-c", "--config", type=str, default="configs/config.json")
71 | parser.add_argument("-i", "--input", type=str)
72 | parser.add_argument("-o", "--output", type=str, default=None)
73 | parser.add_argument(
74 | "-hf", "--half", action="store_true", default=False, help="Save as FP16"
75 | )
76 |
77 | args = parser.parse_args()
78 |
79 | output = args.output
80 |
81 | if output is None:
82 | import os.path
83 |
84 | filename, ext = os.path.splitext(args.input)
85 | half = "_half" if args.half else ""
86 | output = filename + "_release" + half + ext
87 |
88 | removeOptimizer(args.config, args.input, args.half, output)
89 | logger.info(f"压缩模型成功, 输出模型: {os.path.abspath(output)}")
90 |
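# Example invocation (hypothetical paths), using the flags defined above:
#
#     python compress_model.py -c configs/config.json -i logs/example/G_8000.pth -hf
#
# This writes logs/example/G_8000_release_half.pth: enc_q weights are dropped,
# the remaining tensors are cast to FP16, and the optimizer state is replaced
# by a freshly initialized one.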
--------------------------------------------------------------------------------
/slm/wavlm-base-plus/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": "wavlm-base-plus",
3 | "activation_dropout": 0.0,
4 | "adapter_kernel_size": 3,
5 | "adapter_stride": 2,
6 | "add_adapter": false,
7 | "apply_spec_augment": true,
8 | "architectures": [
9 | "WavLMModel"
10 | ],
11 | "attention_dropout": 0.1,
12 | "bos_token_id": 1,
13 | "classifier_proj_size": 256,
14 | "codevector_dim": 256,
15 | "contrastive_logits_temperature": 0.1,
16 | "conv_bias": false,
17 | "conv_dim": [
18 | 512,
19 | 512,
20 | 512,
21 | 512,
22 | 512,
23 | 512,
24 | 512
25 | ],
26 | "conv_kernel": [
27 | 10,
28 | 3,
29 | 3,
30 | 3,
31 | 3,
32 | 2,
33 | 2
34 | ],
35 | "conv_stride": [
36 | 5,
37 | 2,
38 | 2,
39 | 2,
40 | 2,
41 | 2,
42 | 2
43 | ],
44 | "ctc_loss_reduction": "sum",
45 | "ctc_zero_infinity": false,
46 | "diversity_loss_weight": 0.1,
47 | "do_stable_layer_norm": false,
48 | "eos_token_id": 2,
49 | "feat_extract_activation": "gelu",
50 | "feat_extract_norm": "group",
51 | "feat_proj_dropout": 0.1,
52 | "feat_quantizer_dropout": 0.0,
53 | "final_dropout": 0.0,
54 | "freeze_feat_extract_train": true,
55 | "hidden_act": "gelu",
56 | "hidden_dropout": 0.1,
57 | "hidden_size": 768,
58 | "initializer_range": 0.02,
59 | "intermediate_size": 3072,
60 | "layer_norm_eps": 1e-05,
61 | "layerdrop": 0.05,
62 | "mask_channel_length": 10,
63 | "mask_channel_min_space": 1,
64 | "mask_channel_other": 0.0,
65 | "mask_channel_prob": 0.0,
66 | "mask_channel_selection": "static",
67 | "mask_feature_length": 10,
68 | "mask_feature_min_masks": 0,
69 | "mask_feature_prob": 0.0,
70 | "mask_time_length": 10,
71 | "mask_time_min_masks": 2,
72 | "mask_time_min_space": 1,
73 | "mask_time_other": 0.0,
74 | "mask_time_prob": 0.05,
75 | "mask_time_selection": "static",
76 | "model_type": "wavlm",
77 | "no_mask_channel_overlap": false,
78 | "no_mask_time_overlap": false,
79 | "num_adapter_layers": 3,
80 | "num_attention_heads": 12,
81 | "num_buckets": 320,
82 | "num_codevector_groups": 2,
83 | "num_codevectors_per_group": 320,
84 | "num_conv_pos_embedding_groups": 16,
85 | "num_conv_pos_embeddings": 128,
86 | "num_ctc_classes": 80,
87 | "num_feat_extract_layers": 7,
88 | "num_hidden_layers": 12,
89 | "num_negatives": 100,
90 | "output_hidden_size": 768,
91 | "pad_token_id": 0,
92 | "proj_codevector_dim": 256,
93 | "replace_prob": 0.5,
94 | "torch_dtype": "float32",
95 | "transformers_version": "4.13.0.dev0",
96 | "use_weighted_layer_sum": false,
97 | "vocab_size": 32,
98 | "tokenizer_class": "Wav2Vec2CTCTokenizer"
99 | }
100 |
--------------------------------------------------------------------------------
/configs/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 20,
4 | "eval_interval": 10,
5 | "seed": 42,
6 | "epochs": 3000,
7 | "learning_rate": 0.0002,
8 | "betas": [
9 | 0.8,
10 | 0.99
11 | ],
12 | "eps": 1e-09,
13 | "batch_size": 16,
14 | "bf16_run": false,
15 | "lr_decay": 0.99995,
16 | "segment_size": 16384,
17 | "init_lr_ratio": 1,
18 | "warmup_epochs": 0,
19 | "c_mel": 45,
20 | "c_kl": 1.0,
21 | "c_commit": 100,
22 | "skip_optimizer": true,
23 | "freeze_ZH_bert": false,
24 | "freeze_JP_bert": false,
25 | "freeze_EN_bert": false,
26 | "freeze_emo": false
27 | },
28 | "data": {
29 | "training_files": "filelists/train.list",
30 | "validation_files": "filelists/val.list",
31 | "training_visemes_files": "filelists/train_visemes.list",
32 | "validation_visemes_files": "filelists/val_visemes.list",
33 | "max_wav_value": 32768.0,
34 | "sampling_rate": 44100,
35 | "filter_length": 2048,
36 | "hop_length": 512,
37 | "win_length": 2048,
38 | "n_mel_channels": 128,
39 | "mel_fmin": 0.0,
40 | "mel_fmax": null,
41 | "add_blank": true,
42 | "n_speakers": 4,
43 | "cleaned_text": true,
44 | "spk2id": {
45 | "hualing": 0,
46 | "good": 1,
47 | "ailing": 2,
48 | "lady": 3
49 | }
50 | },
51 | "model": {
52 | "use_spk_conditioned_encoder": true,
53 | "use_noise_scaled_mas": true,
54 | "use_mel_posterior_encoder": false,
55 | "use_duration_discriminator": true,
56 | "inter_channels": 192,
57 | "hidden_channels": 192,
58 | "filter_channels": 768,
59 | "n_heads": 2,
60 | "n_layers": 6,
61 | "kernel_size": 3,
62 | "p_dropout": 0.1,
63 | "resblock": "1",
64 | "resblock_kernel_sizes": [
65 | 3,
66 | 7,
67 | 11
68 | ],
69 | "resblock_dilation_sizes": [
70 | [
71 | 1,
72 | 3,
73 | 5
74 | ],
75 | [
76 | 1,
77 | 3,
78 | 5
79 | ],
80 | [
81 | 1,
82 | 3,
83 | 5
84 | ]
85 | ],
86 | "upsample_rates": [
87 | 8,
88 | 8,
89 | 2,
90 | 2,
91 | 2
92 | ],
93 | "upsample_initial_channel": 512,
94 | "upsample_kernel_sizes": [
95 | 16,
96 | 16,
97 | 8,
98 | 2,
99 | 2
100 | ],
101 | "n_layers_q": 3,
102 | "use_spectral_norm": false,
103 | "gin_channels": 512,
104 | "slm": {
105 | "model": "./slm/wavlm-base-plus",
106 | "sr": 16000,
107 | "hidden": 768,
108 | "nlayers": 13,
109 | "initial_channel": 64
110 | }
111 | },
112 | "version": "2.3"
113 | }
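# Sanity-check arithmetic implied by the values above:
#   product of model.upsample_rates = 8 * 8 * 2 * 2 * 2 = 512 = data.hop_length
#   frames per training segment     = train.segment_size / hop_length = 16384 / 512 = 32
#   linear-spectrogram bins         = data.filter_length // 2 + 1 = 1025
#   duration of one frame           = 512 / 44100 s ≈ 11.6 ms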
--------------------------------------------------------------------------------
/motion/wav_to_visemes.py:
--------------------------------------------------------------------------------
1 | import os,sys
2 | import torch
3 | import numpy as np
4 | sys.path.insert(0, os.path.abspath('.'))
5 | import utils
6 | from models import VisemesNet
7 | from mel_processing import spectrogram_torch
8 | import torchaudio
9 | from config import config
10 | from visemes_tools import load_post_enc_dec_model, get_device
11 |
12 |
 13 | # Test converting a wav file to visemes
 14 | if __name__ == '__main__':
 15 |     # get the wav file from the command-line arguments
16 | if sys.argv.__len__() < 2:
17 | print('python wav_to_visemes.py wav_file')
18 | exit(1)
19 | wav_file = sys.argv[1]
20 | if not os.path.exists(wav_file):
21 | print('wav_file not exists')
22 | exit(1)
23 | # load hps
24 | hps = utils.get_hparams_from_file('./configs/config.json')
25 | device = get_device()
26 | # load enc, dec, v_model
27 | enc, dec = load_post_enc_dec_model(hps, device=device)
28 | print('net_g loaded')
29 |
30 | net_v = VisemesNet(hps.model.hidden_channels).to(device)
31 | _ = net_v.eval()
32 | _ = utils.load_checkpoint(config.webui_config.v_model, net_v, None, skip_optimizer=True)
33 | print("load v_model from", config.webui_config.v_model)
34 |
35 | if wav_file.endswith('z.npy'):
36 | print('load z from npy file')
37 | z = np.load(wav_file)
38 | z = torch.from_numpy(z).to(device)
39 | # if type is half, convert to float
40 | if z.dtype == torch.float16:
41 | z = z.float()
42 | visemes = net_v(z)
43 | else:
44 | # load wav file
45 | audio_norm, sampling_rate = torchaudio.load(wav_file, frame_offset=0, num_frames=-1, normalize=True, channels_first=True)
46 | # check sampling_rate == 44100
47 | if sampling_rate != 44100:
48 | print('sampling_rate error:', sampling_rate)
49 | print('ffmpeg -i input.wav -ar 44100 output.wav')
50 | exit(1)
51 | spec = spectrogram_torch(audio_norm, hps.data.filter_length,
52 | hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
53 | center=False)
54 | spec = spec.to(device=get_device())
55 | audio_norm = audio_norm.unsqueeze(0)
56 | x_lengths = torch.clamp_min(torch.sum(spec, [1, 2]), 1).long()
57 |
58 | # get z
59 | z, m_q, logs_q, y_mask = enc(spec, x_lengths=x_lengths, g=None)
60 | print('get z of wav file: ', wav_file)
61 |
62 | visemes_file_path = wav_file[:-4] + '.v.npy'
63 | # generate visemes
64 | visemes = net_v(z)
65 | visemes = visemes.squeeze(0)
66 | visemes = visemes.transpose(0, 1)
67 | visemes = visemes.data.cpu().float().numpy()
68 | print('visemes shape:', visemes.shape)
69 |
70 | # save visemes
71 | np.save(visemes_file_path, visemes)
72 | print('visemes saved to ', visemes_file_path)
73 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | from transformers import AutoTokenizer, AutoModelForMaskedLM
4 |
5 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large")
6 |
7 | models = dict()
8 |
9 |
10 | def get_bert_feature(text, word2ph, device=None):
11 | if (
12 | sys.platform == "darwin"
13 | and torch.backends.mps.is_available()
14 | and device == "cpu"
15 | ):
16 | device = "mps"
17 | if not device:
18 | device = "cuda"
19 | if device not in models.keys():
20 | models[device] = AutoModelForMaskedLM.from_pretrained(
21 | "./bert/chinese-roberta-wwm-ext-large"
22 | ).to(device)
23 | with torch.no_grad():
24 | inputs = tokenizer(text, return_tensors="pt")
25 | for i in inputs:
26 | inputs[i] = inputs[i].to(device)
27 | res = models[device](**inputs, output_hidden_states=True)
28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
29 |
30 | assert len(word2ph) == len(text) + 2
31 | word2phone = word2ph
32 | phone_level_feature = []
33 | for i in range(len(word2phone)):
34 | repeat_feature = res[i].repeat(word2phone[i], 1)
35 | phone_level_feature.append(repeat_feature)
36 |
37 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
38 |
39 | return phone_level_feature.T
40 |
41 |
42 | if __name__ == "__main__":
43 | import torch
44 |
 45 |     word_level_feature = torch.rand(38, 1024)  # word-level features: one 1024-dim vector per word
46 | word2phone = [
47 | 1,
48 | 2,
49 | 1,
50 | 2,
51 | 2,
52 | 1,
53 | 2,
54 | 2,
55 | 1,
56 | 2,
57 | 2,
58 | 1,
59 | 2,
60 | 2,
61 | 2,
62 | 2,
63 | 2,
64 | 1,
65 | 1,
66 | 2,
67 | 2,
68 | 1,
69 | 2,
70 | 2,
71 | 2,
72 | 2,
73 | 1,
74 | 2,
75 | 2,
76 | 2,
77 | 2,
78 | 2,
79 | 1,
80 | 2,
81 | 2,
82 | 2,
83 | 2,
84 | 1,
85 | ]
86 |
 87 |     # total number of phone frames
88 | total_frames = sum(word2phone)
89 | print(word_level_feature.shape)
90 | print(word2phone)
91 | phone_level_feature = []
92 | for i in range(len(word2phone)):
93 | print(word_level_feature[i].shape)
94 |
 95 |         # repeat each word's feature word2phone[i] times
96 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
97 | phone_level_feature.append(repeat_feature)
98 |
99 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
100 | print(phone_level_feature.shape) # torch.Size([36, 1024])
101 |
--------------------------------------------------------------------------------
/oldVersion/V200/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 |
8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9 |
10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11 |
12 | models = dict()
13 |
14 |
15 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
16 | if (
17 | sys.platform == "darwin"
18 | and torch.backends.mps.is_available()
19 | and device == "cpu"
20 | ):
21 | device = "mps"
22 | if not device:
23 | device = "cuda"
24 | if device not in models.keys():
25 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
26 | with torch.no_grad():
27 | inputs = tokenizer(text, return_tensors="pt")
28 | for i in inputs:
29 | inputs[i] = inputs[i].to(device)
30 | res = models[device](**inputs, output_hidden_states=True)
31 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
32 |
33 | assert len(word2ph) == len(text) + 2
34 | word2phone = word2ph
35 | phone_level_feature = []
36 | for i in range(len(word2phone)):
37 | repeat_feature = res[i].repeat(word2phone[i], 1)
38 | phone_level_feature.append(repeat_feature)
39 |
40 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
41 |
42 | return phone_level_feature.T
43 |
44 |
45 | if __name__ == "__main__":
 46 |     word_level_feature = torch.rand(38, 1024)  # word-level features: one 1024-dim vector per word
47 | word2phone = [
48 | 1,
49 | 2,
50 | 1,
51 | 2,
52 | 2,
53 | 1,
54 | 2,
55 | 2,
56 | 1,
57 | 2,
58 | 2,
59 | 1,
60 | 2,
61 | 2,
62 | 2,
63 | 2,
64 | 2,
65 | 1,
66 | 1,
67 | 2,
68 | 2,
69 | 1,
70 | 2,
71 | 2,
72 | 2,
73 | 2,
74 | 1,
75 | 2,
76 | 2,
77 | 2,
78 | 2,
79 | 2,
80 | 1,
81 | 2,
82 | 2,
83 | 2,
84 | 2,
85 | 1,
86 | ]
87 |
 88 |     # total number of phone frames
89 | total_frames = sum(word2phone)
90 | print(word_level_feature.shape)
91 | print(word2phone)
92 | phone_level_feature = []
93 | for i in range(len(word2phone)):
94 | print(word_level_feature[i].shape)
95 |
 96 |         # repeat each word's feature word2phone[i] times
97 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
98 | phone_level_feature.append(repeat_feature)
99 |
100 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
101 | print(phone_level_feature.shape) # torch.Size([36, 1024])
102 |
--------------------------------------------------------------------------------
/onnx_modules/V200/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 |
8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9 |
10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11 |
12 | models = dict()
13 |
14 |
15 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
16 | if (
17 | sys.platform == "darwin"
18 | and torch.backends.mps.is_available()
19 | and device == "cpu"
20 | ):
21 | device = "mps"
22 | if not device:
23 | device = "cuda"
24 | if device not in models.keys():
25 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
26 | with torch.no_grad():
27 | inputs = tokenizer(text, return_tensors="pt")
28 | for i in inputs:
29 | inputs[i] = inputs[i].to(device)
30 | res = models[device](**inputs, output_hidden_states=True)
31 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
32 |
33 | assert len(word2ph) == len(text) + 2
34 | word2phone = word2ph
35 | phone_level_feature = []
36 | for i in range(len(word2phone)):
37 | repeat_feature = res[i].repeat(word2phone[i], 1)
38 | phone_level_feature.append(repeat_feature)
39 |
40 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
41 |
42 | return phone_level_feature.T
43 |
44 |
45 | if __name__ == "__main__":
 46 |     word_level_feature = torch.rand(38, 1024)  # word-level features: one 1024-dim vector per word
47 | word2phone = [
48 | 1,
49 | 2,
50 | 1,
51 | 2,
52 | 2,
53 | 1,
54 | 2,
55 | 2,
56 | 1,
57 | 2,
58 | 2,
59 | 1,
60 | 2,
61 | 2,
62 | 2,
63 | 2,
64 | 2,
65 | 1,
66 | 1,
67 | 2,
68 | 2,
69 | 1,
70 | 2,
71 | 2,
72 | 2,
73 | 2,
74 | 1,
75 | 2,
76 | 2,
77 | 2,
78 | 2,
79 | 2,
80 | 1,
81 | 2,
82 | 2,
83 | 2,
84 | 2,
85 | 1,
86 | ]
87 |
 88 |     # total number of phone frames
89 | total_frames = sum(word2phone)
90 | print(word_level_feature.shape)
91 | print(word2phone)
92 | phone_level_feature = []
93 | for i in range(len(word2phone)):
94 | print(word_level_feature[i].shape)
95 |
 96 |         # repeat each word's feature word2phone[i] times
97 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
98 | phone_level_feature.append(repeat_feature)
99 |
100 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
101 | print(phone_level_feature.shape) # torch.Size([36, 1024])
102 |
--------------------------------------------------------------------------------
/bert/bert-base-japanese-v3/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | license: apache-2.0
3 | datasets:
4 | - cc100
5 | - wikipedia
6 | language:
7 | - ja
8 | widget:
9 | - text: 東北大学で[MASK]の研究をしています。
10 | ---
11 |
12 | # BERT base Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102)
13 |
14 | This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language.
15 |
16 | This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization.
17 | Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective.
18 |
19 | The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/).
20 |
21 | ## Model architecture
22 |
23 | The model architecture is the same as the original BERT base model; 12 layers, 768 dimensions of hidden states, and 12 attention heads.
24 |
25 | ## Training Data
26 |
27 | The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia.
28 | For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023.
29 | The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively.
30 |
31 | For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7).
32 |
33 | ## Tokenization
34 |
35 | The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm.
36 | The vocabulary size is 32768.
37 |
38 | We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization.
39 |
40 | ## Training
41 |
42 | We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps.
43 | For training of the MLM (masked language modeling) objective, we introduced whole word masking in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once.
44 |
45 | For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/).
46 |
47 | ## Licenses
48 |
49 | The pretrained models are distributed under the Apache License 2.0.
50 |
51 | ## Acknowledgments
52 |
53 | This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.
54 |
--------------------------------------------------------------------------------
/bert/bert-large-japanese-v2/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | license: apache-2.0
3 | datasets:
4 | - cc100
5 | - wikipedia
6 | language:
7 | - ja
8 | widget:
9 | - text: 東北大学で[MASK]の研究をしています。
10 | ---
11 |
12 | # BERT large Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102)
13 |
14 | This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language.
15 |
16 | This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization.
17 | Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective.
18 |
19 | The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/).
20 |
21 | ## Model architecture
22 |
23 | The model architecture is the same as the original BERT large model; 24 layers, 1024 dimensions of hidden states, and 16 attention heads.
24 |
25 | ## Training Data
26 |
27 | The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia.
28 | For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023.
29 | The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively.
30 |
31 | For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7).
32 |
33 | ## Tokenization
34 |
35 | The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm.
36 | The vocabulary size is 32768.
37 |
38 | We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization.
39 |
40 | ## Training
41 |
42 | We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps.
43 | For training of the MLM (masked language modeling) objective, we introduced whole word masking in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once.
44 |
45 | For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/).
46 |
47 | ## Licenses
48 |
49 | The pretrained models are distributed under the Apache License 2.0.
50 |
51 | ## Acknowledgments
52 |
53 | This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.
54 |
--------------------------------------------------------------------------------
/motion/prepare_visemes.py:
--------------------------------------------------------------------------------
1 | import os,sys
2 | import torch
3 | sys.path.append('./')
4 | import utils
5 | from text.symbols import symbols
6 | from models import SynthesizerTrn, PosteriorEncoder, Generator
7 | from mel_processing import spectrogram_torch, mel_spectrogram_torch, spec_to_mel_torch
8 | import torchaudio
9 | from visemes_tools import load_post_enc_dec_model, get_device
10 |
11 |
12 |
 13 | # Read the *.wav audio files and the *.npy expression data [n, 61] under the records directory; files with the same stem form one pair.
 14 | # The first 5 pairs [file1.wav, file1.npy] go into the validation list val_visemes.list
 15 | # and the remaining pairs go into the training list train_visemes.list
16 | def gen_visemes_train_val_list(hps, input_dir='./records/', output_dir = './filelists/'):
17 | enc, dec = load_post_enc_dec_model(hps, device=get_device())
18 | print('enc, dec loaded')
19 | # read all files in input_dir
20 | files = os.listdir(input_dir)
21 | # filter wav files
22 | wav_files = filter(lambda x: x.endswith('.wav'), files)
23 | wav_files = sorted(wav_files)
24 | # overwrite the list file
25 | with open(output_dir + 'val_visemes.list', 'w') as f:
26 | f.write('')
27 | with open(output_dir + 'train_visemes.list', 'w') as f:
28 | f.write('')
29 | # iterate wav files
30 | for i, wav_file in enumerate(wav_files):
31 | # get the corresponding npy file and make sure it exists
32 | wav_file = input_dir + wav_file
33 | print('processing wav file: ', wav_file)
34 | npy_file = wav_file[:-4] + '.npy'
35 | if not os.path.exists(npy_file):
36 | print('npy file {} does not exist'.format(npy_file))
37 | continue
38 | audio_norm, sampling_rate = torchaudio.load(wav_file, frame_offset=0, num_frames=-1, normalize=True, channels_first=True)
39 | spec = spectrogram_torch(audio_norm, hps.data.filter_length,
40 | hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
41 | center=False)
42 | spec = spec.to(device=get_device())
43 | audio_norm = audio_norm.unsqueeze(0)
44 | x_lengths = torch.clamp_min(torch.sum(spec, [1, 2]), 1).long()
45 | z, m_q, logs_q, y_mask = enc(spec, x_lengths=x_lengths, g=None)
46 | print('get z of wav file: ', wav_file)
47 | z_file_path = wav_file[:-4] + '.z.npy'
48 | z = z.to(device='cpu')
49 | # save z
50 | torch.save(z, z_file_path)
51 | print('z saved to ', z_file_path)
52 |
53 |
54 | # generate the line for the list file
55 | line = z_file_path + '|' + npy_file + '\n'
56 | # write the line to the list file
57 | if i < 5:
58 | with open(output_dir + 'val_visemes.list', 'a') as f:
59 | f.write(line)
60 | else:
61 | with open(output_dir + 'train_visemes.list', 'a') as f:
62 | f.write(line)
63 |
64 |
65 | if __name__ == '__main__':
66 | hps = utils.get_hparams_from_file('./configs/config.json')
67 | gen_visemes_train_val_list(hps)
--------------------------------------------------------------------------------
/emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": "torch",
3 | "activation_dropout": 0.1,
4 | "adapter_kernel_size": 3,
5 | "adapter_stride": 2,
6 | "add_adapter": false,
7 | "apply_spec_augment": true,
8 | "architectures": [
9 | "Wav2Vec2ForSpeechClassification"
10 | ],
11 | "attention_dropout": 0.1,
12 | "bos_token_id": 1,
13 | "classifier_proj_size": 256,
14 | "codevector_dim": 768,
15 | "contrastive_logits_temperature": 0.1,
16 | "conv_bias": true,
17 | "conv_dim": [
18 | 512,
19 | 512,
20 | 512,
21 | 512,
22 | 512,
23 | 512,
24 | 512
25 | ],
26 | "conv_kernel": [
27 | 10,
28 | 3,
29 | 3,
30 | 3,
31 | 3,
32 | 2,
33 | 2
34 | ],
35 | "conv_stride": [
36 | 5,
37 | 2,
38 | 2,
39 | 2,
40 | 2,
41 | 2,
42 | 2
43 | ],
44 | "ctc_loss_reduction": "sum",
45 | "ctc_zero_infinity": false,
46 | "diversity_loss_weight": 0.1,
47 | "do_stable_layer_norm": true,
48 | "eos_token_id": 2,
49 | "feat_extract_activation": "gelu",
50 | "feat_extract_dropout": 0.0,
51 | "feat_extract_norm": "layer",
52 | "feat_proj_dropout": 0.1,
53 | "feat_quantizer_dropout": 0.0,
54 | "final_dropout": 0.1,
55 | "finetuning_task": "wav2vec2_reg",
56 | "gradient_checkpointing": false,
57 | "hidden_act": "gelu",
58 | "hidden_dropout": 0.1,
59 | "hidden_dropout_prob": 0.1,
60 | "hidden_size": 1024,
61 | "id2label": {
62 | "0": "arousal",
63 | "1": "dominance",
64 | "2": "valence"
65 | },
66 | "initializer_range": 0.02,
67 | "intermediate_size": 4096,
68 | "label2id": {
69 | "arousal": 0,
70 | "dominance": 1,
71 | "valence": 2
72 | },
73 | "layer_norm_eps": 1e-05,
74 | "layerdrop": 0.1,
75 | "mask_feature_length": 10,
76 | "mask_feature_min_masks": 0,
77 | "mask_feature_prob": 0.0,
78 | "mask_time_length": 10,
79 | "mask_time_min_masks": 2,
80 | "mask_time_prob": 0.05,
81 | "model_type": "wav2vec2",
82 | "num_adapter_layers": 3,
83 | "num_attention_heads": 16,
84 | "num_codevector_groups": 2,
85 | "num_codevectors_per_group": 320,
86 | "num_conv_pos_embedding_groups": 16,
87 | "num_conv_pos_embeddings": 128,
88 | "num_feat_extract_layers": 7,
89 | "num_hidden_layers": 12,
90 | "num_negatives": 100,
91 | "output_hidden_size": 1024,
92 | "pad_token_id": 0,
93 | "pooling_mode": "mean",
94 | "problem_type": "regression",
95 | "proj_codevector_dim": 768,
96 | "tdnn_dilation": [
97 | 1,
98 | 2,
99 | 3,
100 | 1,
101 | 1
102 | ],
103 | "tdnn_dim": [
104 | 512,
105 | 512,
106 | 512,
107 | 512,
108 | 1500
109 | ],
110 | "tdnn_kernel": [
111 | 5,
112 | 3,
113 | 3,
114 | 1,
115 | 1
116 | ],
117 | "torch_dtype": "float32",
118 | "transformers_version": "4.17.0.dev0",
119 | "use_weighted_layer_sum": false,
120 | "vocab_size": null,
121 | "xvector_output_dim": 512
122 | }
123 |
--------------------------------------------------------------------------------
/spec_gen.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from tqdm import tqdm
3 | from multiprocessing import Pool
4 | from mel_processing import spectrogram_torch, mel_spectrogram_torch
5 | from utils import load_wav_to_torch
6 |
7 |
8 | class AudioProcessor:
9 | def __init__(
10 | self,
11 | max_wav_value,
12 | use_mel_spec_posterior,
13 | filter_length,
14 | n_mel_channels,
15 | sampling_rate,
16 | hop_length,
17 | win_length,
18 | mel_fmin,
19 | mel_fmax,
20 | ):
21 | self.max_wav_value = max_wav_value
22 | self.use_mel_spec_posterior = use_mel_spec_posterior
23 | self.filter_length = filter_length
24 | self.n_mel_channels = n_mel_channels
25 | self.sampling_rate = sampling_rate
26 | self.hop_length = hop_length
27 | self.win_length = win_length
28 | self.mel_fmin = mel_fmin
29 | self.mel_fmax = mel_fmax
30 |
31 | def process_audio(self, filename):
32 | audio, sampling_rate = load_wav_to_torch(filename)
33 | audio_norm = audio / self.max_wav_value
34 | audio_norm = audio_norm.unsqueeze(0)
35 | spec_filename = filename.replace(".wav", ".spec.pt")
36 | if self.use_mel_spec_posterior:
37 | spec_filename = spec_filename.replace(".spec.pt", ".mel.pt")
38 | try:
39 | spec = torch.load(spec_filename)
 40 |         except Exception:  # no cached spectrogram yet, compute it below
41 | if self.use_mel_spec_posterior:
42 | spec = mel_spectrogram_torch(
43 | audio_norm,
44 | self.filter_length,
45 | self.n_mel_channels,
46 | self.sampling_rate,
47 | self.hop_length,
48 | self.win_length,
49 | self.mel_fmin,
50 | self.mel_fmax,
51 | center=False,
52 | )
53 | else:
54 | spec = spectrogram_torch(
55 | audio_norm,
56 | self.filter_length,
57 | self.sampling_rate,
58 | self.hop_length,
59 | self.win_length,
60 | center=False,
61 | )
62 | spec = torch.squeeze(spec, 0)
63 | torch.save(spec, spec_filename)
64 | return spec, audio_norm
65 |
66 |
 67 | # Usage example
68 | processor = AudioProcessor(
69 | max_wav_value=32768.0,
70 | use_mel_spec_posterior=False,
71 | filter_length=2048,
72 | n_mel_channels=128,
73 | sampling_rate=44100,
74 | hop_length=512,
75 | win_length=2048,
76 | mel_fmin=0.0,
 77 |     mel_fmax=None,  # null in the JSON config maps to Python None
78 | )
79 |
80 | with open("filelists/train.list", "r") as f:
 81 |     filepaths = [line.split("|")[0] for line in f]  # the first field of each line is the audio path
82 |
 83 | # Process the files with a multiprocessing pool
 84 | with Pool(processes=32) as pool:  # 32 worker processes
85 | with tqdm(total=len(filepaths)) as pbar:
86 | for i, _ in enumerate(pool.imap_unordered(processor.process_audio, filepaths)):
87 | pbar.update()
88 |
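# Rough shape check for the cached linear spectrograms, assuming the usual
# torch.stft framing with center=False:
#   frequency bins = filter_length // 2 + 1 = 2048 // 2 + 1 = 1025
#   frames for 1 s of 44.1 kHz audio ≈ (44100 - 2048) // 512 + 1 = 83
# so each *.spec.pt holds roughly a [1025, n_frames] tensor.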
--------------------------------------------------------------------------------
/bert_gen.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from multiprocessing import Pool
3 | import commons
4 | import utils
5 | from tqdm import tqdm
6 | from text import check_bert_models, cleaned_text_to_sequence, get_bert
7 | import argparse
8 | import torch.multiprocessing as mp
9 | from config import config
10 |
11 |
12 | def process_line(x):
13 | line, add_blank = x
14 | device = config.bert_gen_config.device
15 | if config.bert_gen_config.use_multi_device:
16 | rank = mp.current_process()._identity
17 | rank = rank[0] if len(rank) > 0 else 0
18 | if torch.cuda.is_available():
19 | gpu_id = rank % torch.cuda.device_count()
20 | device = torch.device(f"cuda:{gpu_id}")
21 | else:
22 | device = torch.device("cpu")
23 | wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")
24 | phone = phones.split(" ")
25 | tone = [int(i) for i in tone.split(" ")]
26 | word2ph = [int(i) for i in word2ph.split(" ")]
27 | word2ph = [i for i in word2ph]
28 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
29 |
30 | if add_blank:
31 | phone = commons.intersperse(phone, 0)
32 | tone = commons.intersperse(tone, 0)
33 | language = commons.intersperse(language, 0)
34 | for i in range(len(word2ph)):
35 | word2ph[i] = word2ph[i] * 2
36 | word2ph[0] += 1
37 |
38 | bert_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".bert.pt")
39 |
40 | try:
41 | bert = torch.load(bert_path)
42 | assert bert.shape[-1] == len(phone)
43 | except Exception:
44 | bert = get_bert(text, word2ph, language_str, device)
45 | assert bert.shape[-1] == len(phone)
46 | torch.save(bert, bert_path)
47 |
48 |
49 | preprocess_text_config = config.preprocess_text_config
50 |
51 | if __name__ == "__main__":
52 | parser = argparse.ArgumentParser()
53 | parser.add_argument(
54 | "-c", "--config", type=str, default=config.bert_gen_config.config_path
55 | )
56 | parser.add_argument(
57 | "--num_processes", type=int, default=config.bert_gen_config.num_processes
58 | )
59 | args, _ = parser.parse_known_args()
60 | config_path = args.config
61 | hps = utils.get_hparams_from_file(config_path)
62 | check_bert_models()
63 | lines = []
64 | with open(hps.data.training_files, encoding="utf-8") as f:
65 | lines.extend(f.readlines())
66 |
67 | with open(hps.data.validation_files, encoding="utf-8") as f:
68 | lines.extend(f.readlines())
69 | add_blank = [hps.data.add_blank] * len(lines)
70 |
71 | if len(lines) != 0:
72 | num_processes = args.num_processes
73 | with Pool(processes=num_processes) as pool:
74 | for _ in tqdm(
75 | pool.imap_unordered(process_line, zip(lines, add_blank)),
76 | total=len(lines),
77 | ):
 78 |                 # the loop body is intentionally empty; iterating drives the pool
 79 |                 pass  # placeholder statement
80 |
81 | print(f"bert生成完毕!, 共有{len(lines)}个bert.pt生成!")
82 |
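# Worked example of the add_blank bookkeeping above, assuming commons.intersperse
# inserts the given item between and around every element (as in VITS):
#   phone   = [p1, p2, p3]        -> [0, p1, 0, p2, 0, p3, 0]      (length 2n + 1)
#   word2ph = [2, 1] (sums to 3)  -> doubled to [4, 2], then word2ph[0] += 1
#                                    -> [5, 2], which sums to 7 = 2 * 3 + 1
# so the per-word phone counts stay aligned with the blank-interspersed sequence
# and the assertion bert.shape[-1] == len(phone) keeps holding.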
--------------------------------------------------------------------------------
/oldVersion/V110/__init__.py:
--------------------------------------------------------------------------------
1 | """
  2 | Compatibility with version 1.1
3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.1
4 | """
5 | import torch
6 | import commons
7 | from .text.cleaner import clean_text
8 | from .text import cleaned_text_to_sequence
9 | from oldVersion.V111.text import get_bert
10 |
11 |
12 | def get_text(text, language_str, hps, device):
13 | norm_text, phone, tone, word2ph = clean_text(text, language_str)
14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
15 |
16 | if hps.data.add_blank:
17 | phone = commons.intersperse(phone, 0)
18 | tone = commons.intersperse(tone, 0)
19 | language = commons.intersperse(language, 0)
20 | for i in range(len(word2ph)):
21 | word2ph[i] = word2ph[i] * 2
22 | word2ph[0] += 1
23 | bert = get_bert(norm_text, word2ph, language_str, device)
24 | del word2ph
25 | assert bert.shape[-1] == len(phone), phone
26 |
27 | if language_str == "ZH":
28 | bert = bert
29 | ja_bert = torch.zeros(768, len(phone))
30 | elif language_str == "JP":
31 | ja_bert = bert
32 | bert = torch.zeros(1024, len(phone))
33 | else:
34 | bert = torch.zeros(1024, len(phone))
35 | ja_bert = torch.zeros(768, len(phone))
36 |
37 | assert bert.shape[-1] == len(
38 | phone
39 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"
40 |
41 | phone = torch.LongTensor(phone)
42 | tone = torch.LongTensor(tone)
43 | language = torch.LongTensor(language)
44 | return bert, ja_bert, phone, tone, language
45 |
46 |
47 | def infer(
48 | text,
49 | sdp_ratio,
50 | noise_scale,
51 | noise_scale_w,
52 | length_scale,
53 | sid,
54 | language,
55 | hps,
56 | net_g,
57 | device,
58 | ):
59 | bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps, device)
60 | with torch.no_grad():
61 | x_tst = phones.to(device).unsqueeze(0)
62 | tones = tones.to(device).unsqueeze(0)
63 | lang_ids = lang_ids.to(device).unsqueeze(0)
64 | bert = bert.to(device).unsqueeze(0)
65 | ja_bert = ja_bert.to(device).unsqueeze(0)
66 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
67 | del phones
68 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
69 | audio = (
70 | net_g.infer(
71 | x_tst,
72 | x_tst_lengths,
73 | speakers,
74 | tones,
75 | lang_ids,
76 | bert,
77 | ja_bert,
78 | sdp_ratio=sdp_ratio,
79 | noise_scale=noise_scale,
80 | noise_scale_w=noise_scale_w,
81 | length_scale=length_scale,
82 | )[0][0, 0]
83 | .data.cpu()
84 | .float()
85 | .numpy()
86 | )
87 | del x_tst, x_tst_lengths, speakers, tones, lang_ids, bert, ja_bert
88 | if torch.cuda.is_available():
89 | torch.cuda.empty_cache()
90 | return audio
91 |
--------------------------------------------------------------------------------
/motion/visemes_tools.py:
--------------------------------------------------------------------------------
1 |
2 | import os,sys
3 | import torch
4 | sys.path.insert(0, os.path.abspath('.'))
5 | import utils
6 | from text.symbols import symbols
7 | from models import SynthesizerTrn, PosteriorEncoder, Generator
8 | from mel_processing import spectrogram_torch, mel_spectrogram_torch, spec_to_mel_torch
9 | import torchaudio
10 |
11 | def get_device():
12 | device = (
13 | "cuda:0"
14 | if torch.cuda.is_available()
15 | else (
16 | "mps"
17 | if sys.platform == "darwin" and torch.backends.mps.is_available()
18 | else "cpu"
19 | )
20 | )
21 | print("Using device: {}".format(device))
22 | return device
23 |
24 | def load_post_enc_dec_model(hps, model_path = './OUTPUT_MODEL/models/G_3000.pth', device='cpu'):
25 | # load the model
26 | print('Loading model from {}'.format(model_path))
27 | net_g = SynthesizerTrn(
28 | len(symbols),
29 | hps.data.filter_length // 2 + 1,
30 | hps.train.segment_size // hps.data.hop_length,
31 | n_speakers=hps.data.n_speakers,
32 | **hps.model).to(device)
33 | _ = net_g.eval()
34 |
35 | _ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
36 | print('Model loaded')
37 |
38 | return net_g.get_post_enc_dec()
39 |
40 | def test_wav_enc_dec(hps, input_file='test_in.wav', output_file='test_out.wav', enc = None, dec = None):
 41 |     if enc is None or dec is None:
42 | enc, dec = load_post_enc_dec_model(hps, device=get_device())
43 | audio_norm, sampling_rate = torchaudio.load(input_file, frame_offset=0, num_frames=-1, normalize=True, channels_first=True)
 44 |     # short-time Fourier transform (linear spectrogram, not mel)
45 | spec = spectrogram_torch(audio_norm, hps.data.filter_length,
46 | hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
47 | center=False)
48 | spec = spec.to(device=get_device())
49 | audio_norm = audio_norm.unsqueeze(0)
50 | print('audio_norm.shape: ', audio_norm.shape, 'spec.shape', spec.shape, 'file: ', input_file)
51 | x_lengths = torch.clamp_min(torch.sum(spec, [1, 2]), 1).long()
52 | z, m_q, logs_q, y_mask = enc(spec, x_lengths=x_lengths, g=None)
53 | print('z.shape: ', z.shape)
54 | y = dec(z)
55 | print('y.shape: ', y.shape)
56 | y = y.squeeze(0).data.cpu()
57 | #save y to output_file
58 | torchaudio.save(output_file, y, sampling_rate)
59 | print('output_file: ', output_file, 'saved')
60 |
61 | def save_post_enc_model(hps, model_path = './OUTPUT_MODEL/models/G_3000.pth', device='cpu'):
62 | # load the model
63 | print('Loading model from {}'.format(model_path))
64 | enc, _ = load_post_enc_dec_model(hps, model_path, device)
65 | print('Model loaded')
66 | post_enc_path = os.path.join(os.path.dirname(model_path), 'post_enc.pth')
67 | torch.save(enc.state_dict(), post_enc_path)
68 | print('Post-encoder saved to {}'.format(post_enc_path))
69 |
70 |
71 | if __name__ == '__main__':
72 | hps = utils.get_hparams_from_file('./configs/config.json')
73 | # test_wav_enc_dec(hps)
74 | save_post_enc_model(hps)
--------------------------------------------------------------------------------
/update_status.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gradio as gr
3 |
4 | lang_dict = {"EN(英文)": "_en", "ZH(中文)": "_zh", "JP(日语)": "_jp"}
5 |
6 |
7 | def raw_dir_convert_to_path(target_dir: str, lang):
8 | res = target_dir.rstrip("/").rstrip("\\")
9 | if (not target_dir.startswith("raw")) and (not target_dir.startswith("./raw")):
10 | res = os.path.join("./raw", res)
11 | if (
12 | (not res.endswith("_zh"))
13 | and (not res.endswith("_jp"))
14 | and (not res.endswith("_en"))
15 | ):
16 | res += lang_dict[lang]
17 | return res
18 |
19 |
20 | def update_g_files():
21 | g_files = []
22 | cnt = 0
23 | for root, dirs, files in os.walk(os.path.abspath("./logs")):
24 | for file in files:
25 | if file.startswith("G_") and file.endswith(".pth"):
26 | g_files.append(os.path.join(root, file))
27 | cnt += 1
28 | print(g_files)
29 | return f"更新模型列表完成, 共找到{cnt}个模型", gr.Dropdown.update(choices=g_files)
30 |
31 |
32 | def update_c_files():
33 | c_files = []
34 | cnt = 0
35 | for root, dirs, files in os.walk(os.path.abspath("./logs")):
36 | for file in files:
37 | if file.startswith("config.json"):
38 | c_files.append(os.path.join(root, file))
39 | cnt += 1
40 | print(c_files)
41 | return f"更新模型列表完成, 共找到{cnt}个配置文件", gr.Dropdown.update(choices=c_files)
42 |
43 |
44 | def update_model_folders():
45 | subdirs = []
46 | cnt = 0
47 | for root, dirs, files in os.walk(os.path.abspath("./logs")):
48 | for dir_name in dirs:
49 | if os.path.basename(dir_name) != "eval":
50 | subdirs.append(os.path.join(root, dir_name))
51 | cnt += 1
52 | print(subdirs)
53 | return f"更新模型文件夹列表完成, 共找到{cnt}个文件夹", gr.Dropdown.update(choices=subdirs)
54 |
55 |
56 | def update_wav_lab_pairs():
57 | wav_count = tot_count = 0
58 | for root, _, files in os.walk("./raw"):
59 | for file in files:
60 | # print(file)
61 | file_path = os.path.join(root, file)
62 | if file.lower().endswith(".wav"):
63 | lab_file = os.path.splitext(file_path)[0] + ".lab"
64 | if os.path.exists(lab_file):
65 | wav_count += 1
66 | tot_count += 1
67 | return f"{wav_count} / {tot_count}"
68 |
69 |
70 | def update_raw_folders():
71 | subdirs = []
72 | cnt = 0
 73 |     script_path = os.path.dirname(os.path.abspath(__file__))  # absolute path of the directory containing this script
74 | raw_path = os.path.join(script_path, "raw")
75 | print(raw_path)
76 | os.makedirs(raw_path, exist_ok=True)
77 | for root, dirs, files in os.walk(raw_path):
78 | for dir_name in dirs:
79 | relative_path = os.path.relpath(
80 | os.path.join(root, dir_name), script_path
 81 |             )  # path relative to the script directory
82 | subdirs.append(relative_path)
83 | cnt += 1
84 | print(subdirs)
85 | return (
86 | f"更新raw音频文件夹列表完成, 共找到{cnt}个文件夹",
87 | gr.Dropdown.update(choices=subdirs),
88 | gr.Textbox.update(value=update_wav_lab_pairs()),
89 | )
90 |
--------------------------------------------------------------------------------
/oldVersion/V101/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "I",
78 | "N",
79 | "U",
80 | "a",
81 | "b",
82 | "by",
83 | "ch",
84 | "cl",
85 | "d",
86 | "dy",
87 | "e",
88 | "f",
89 | "g",
90 | "gy",
91 | "h",
92 | "hy",
93 | "i",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "p",
103 | "py",
104 | "r",
105 | "ry",
106 | "s",
107 | "sh",
108 | "t",
109 | "ts",
110 | "u",
111 | "V",
112 | "w",
113 | "y",
114 | "z",
115 | ]
116 | num_ja_tones = 1
117 |
118 | # English
119 | en_symbols = [
120 | "aa",
121 | "ae",
122 | "ah",
123 | "ao",
124 | "aw",
125 | "ay",
126 | "b",
127 | "ch",
128 | "d",
129 | "dh",
130 | "eh",
131 | "er",
132 | "ey",
133 | "f",
134 | "g",
135 | "hh",
136 | "ih",
137 | "iy",
138 | "jh",
139 | "k",
140 | "l",
141 | "m",
142 | "n",
143 | "ng",
144 | "ow",
145 | "oy",
146 | "p",
147 | "r",
148 | "s",
149 | "sh",
150 | "t",
151 | "th",
152 | "uh",
153 | "uw",
154 | "V",
155 | "w",
156 | "y",
157 | "z",
158 | "zh",
159 | ]
160 | num_en_tones = 4
161 |
162 | # combine all symbols
163 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
164 | symbols = [pad] + normal_symbols + pu_symbols
165 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
166 |
167 | # combine all tones
168 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
169 |
170 | # language maps
171 | language_id_map = {"ZH": 0, "JA": 1, "EN": 2}
172 | num_languages = len(language_id_map.keys())
173 |
174 | language_tone_start_map = {
175 | "ZH": 0,
176 | "JA": num_zh_tones,
177 | "EN": num_zh_tones + num_ja_tones,
178 | }
179 |
180 | if __name__ == "__main__":
181 | a = set(zh_symbols)
182 | b = set(en_symbols)
183 | print(sorted(a & b))
184 |
--------------------------------------------------------------------------------
/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
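# How the tone offsets above are typically consumed (an assumption; the actual
# lookup lives in text/__init__.py): a language-local tone t of language L maps
# to the global tone id language_tone_start_map[L] + t, e.g. a JP tone 1 becomes
# num_zh_tones + 1 = 7, and num_tones = 6 + 2 + 4 = 12.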
--------------------------------------------------------------------------------
/oldVersion/V110/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 1
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 1
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V200/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V210/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/onnx_modules/V200/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/onnx_modules/V210/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/onnx_modules/V220/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/onnx_modules/V230/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/onnx_modules/V220_novq_dev/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V200/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | @Desc: Version 2.0 compatibility layer, covering 2.0.1 and 2.0.2-fix
3 | """
4 | import torch
5 | import commons
6 | from .text import cleaned_text_to_sequence, get_bert
7 | from .text.cleaner import clean_text
8 |
9 |
10 | def get_text(text, language_str, hps, device):
11 | # implement the current version's get_text here
12 | norm_text, phone, tone, word2ph = clean_text(text, language_str)
13 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
14 |
15 | if hps.data.add_blank:
16 | phone = commons.intersperse(phone, 0)
17 | tone = commons.intersperse(tone, 0)
18 | language = commons.intersperse(language, 0)
19 | for i in range(len(word2ph)):
20 | word2ph[i] = word2ph[i] * 2
21 | word2ph[0] += 1
22 | bert_ori = get_bert(norm_text, word2ph, language_str, device)
23 | del word2ph
24 | assert bert_ori.shape[-1] == len(phone), phone
25 |
26 | if language_str == "ZH":
27 | bert = bert_ori
28 | ja_bert = torch.zeros(1024, len(phone))
29 | en_bert = torch.zeros(1024, len(phone))
30 | elif language_str == "JP":
31 | bert = torch.zeros(1024, len(phone))
32 | ja_bert = bert_ori
33 | en_bert = torch.zeros(1024, len(phone))
34 | elif language_str == "EN":
35 | bert = torch.zeros(1024, len(phone))
36 | ja_bert = torch.zeros(1024, len(phone))
37 | en_bert = bert_ori
38 | else:
39 | raise ValueError("language_str should be ZH, JP or EN")
40 |
41 | assert bert.shape[-1] == len(
42 | phone
43 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"
44 |
45 | phone = torch.LongTensor(phone)
46 | tone = torch.LongTensor(tone)
47 | language = torch.LongTensor(language)
48 | return bert, ja_bert, en_bert, phone, tone, language
49 |
50 |
51 | def infer(
52 | text,
53 | sdp_ratio,
54 | noise_scale,
55 | noise_scale_w,
56 | length_scale,
57 | sid,
58 | language,
59 | hps,
60 | net_g,
61 | device,
62 | ):
63 | bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
64 | text, language, hps, device
65 | )
66 | with torch.no_grad():
67 | x_tst = phones.to(device).unsqueeze(0)
68 | tones = tones.to(device).unsqueeze(0)
69 | lang_ids = lang_ids.to(device).unsqueeze(0)
70 | bert = bert.to(device).unsqueeze(0)
71 | ja_bert = ja_bert.to(device).unsqueeze(0)
72 | en_bert = en_bert.to(device).unsqueeze(0)
73 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
74 | del phones
75 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
76 | audio = (
77 | net_g.infer(
78 | x_tst,
79 | x_tst_lengths,
80 | speakers,
81 | tones,
82 | lang_ids,
83 | bert,
84 | ja_bert,
85 | en_bert,
86 | sdp_ratio=sdp_ratio,
87 | noise_scale=noise_scale,
88 | noise_scale_w=noise_scale_w,
89 | length_scale=length_scale,
90 | )[0][0, 0]
91 | .data.cpu()
92 | .float()
93 | .numpy()
94 | )
95 | del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert
96 | if torch.cuda.is_available():
97 | torch.cuda.empty_cache()
98 | return audio
99 |
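When `hps.data.add_blank` is set, `get_text` above interleaves a blank (id 0) between phones and then doubles `word2ph` (plus one on the first word) so that it still sums to the new sequence length. A minimal sketch of that bookkeeping; `intersperse` is re-implemented here following the usual VITS convention, since the real code calls `commons.intersperse`:

```python
# Hypothetical toy inputs; only the length arithmetic matters here.
def intersperse(lst, item):
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result

phone = [11, 22, 33, 44]            # 4 phone ids
word2ph = [1, 3]                    # phones per word, sum == len(phone)

phone = intersperse(phone, 0)       # length becomes 2 * 4 + 1 = 9
word2ph = [n * 2 for n in word2ph]  # [2, 6]
word2ph[0] += 1                     # absorb the leading blank -> [3, 6]
assert sum(word2ph) == len(phone)   # 9 == 9
```

The three BERT tensors returned afterwards all share this final length: only the slot matching `language_str` holds real features, while the other two stay `torch.zeros(1024, len(phone))`.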
--------------------------------------------------------------------------------
/oldVersion/V101/text/japanese.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
2 | import re
3 | import sys
4 |
5 | import pyopenjtalk
6 |
7 | from .symbols import symbols
8 |
9 | # Regular expression matching Japanese without punctuation marks:
10 | _japanese_characters = re.compile(
11 | r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
12 | )
13 |
14 | # Regular expression matching non-Japanese characters or punctuation marks:
15 | _japanese_marks = re.compile(
16 | r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
17 | )
18 |
19 | # List of (symbol, Japanese) pairs for marks:
20 | _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]]
21 |
22 |
23 | # List of (consonant, sokuon) pairs:
24 | _real_sokuon = [
25 | (re.compile("%s" % x[0]), x[1])
26 | for x in [
27 | (r"Q([↑↓]*[kg])", r"k#\1"),
28 | (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
29 | (r"Q([↑↓]*[sʃ])", r"s\1"),
30 | (r"Q([↑↓]*[pb])", r"p#\1"),
31 | ]
32 | ]
33 |
34 | # List of (consonant, hatsuon) pairs:
35 | _real_hatsuon = [
36 | (re.compile("%s" % x[0]), x[1])
37 | for x in [
38 | (r"N([↑↓]*[pbm])", r"m\1"),
39 | (r"N([↑↓]*[ʧʥj])", r"n^\1"),
40 | (r"N([↑↓]*[tdn])", r"n\1"),
41 | (r"N([↑↓]*[kg])", r"ŋ\1"),
42 | ]
43 | ]
44 |
45 |
46 | def post_replace_ph(ph):
47 | rep_map = {
48 | ":": ",",
49 | ";": ",",
50 | ",": ",",
51 | "。": ".",
52 | "!": "!",
53 | "?": "?",
54 | "\n": ".",
55 | "·": ",",
56 | "、": ",",
57 | "...": "…",
58 | "v": "V",
59 | }
60 | if ph in rep_map.keys():
61 | ph = rep_map[ph]
62 | if ph in symbols:
63 | return ph
64 | if ph not in symbols:
65 | ph = "UNK"
66 | return ph
67 |
68 |
69 | def symbols_to_japanese(text):
70 | for regex, replacement in _symbols_to_japanese:
71 | text = re.sub(regex, replacement, text)
72 | return text
73 |
74 |
75 | def preprocess_jap(text):
76 | """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
77 | text = symbols_to_japanese(text)
78 | sentences = re.split(_japanese_marks, text)
79 | marks = re.findall(_japanese_marks, text)
80 | text = []
81 | for i, sentence in enumerate(sentences):
82 | if re.match(_japanese_characters, sentence):
83 | p = pyopenjtalk.g2p(sentence)
84 | text += p.split(" ")
85 |
86 | if i < len(marks):
87 | text += [marks[i].replace(" ", "")]
88 | return text
89 |
90 |
91 | def text_normalize(text):
92 | # todo: jap text normalize
93 | return text
94 |
95 |
96 | def g2p(norm_text):
97 | phones = preprocess_jap(norm_text)
98 | phones = [post_replace_ph(i) for i in phones]
99 | # todo: implement tones and word2ph
100 | tones = [0 for i in phones]
101 | word2ph = [1 for i in phones]
102 | return phones, tones, word2ph
103 |
104 |
105 | if __name__ == "__main__":
106 | for line in open("../../../Downloads/transcript_utf8.txt").readlines():
107 | text = line.split(":")[1]
108 | phones, tones, word2ph = g2p(text)
109 | for p in phones:
110 | if p == "z":
111 | print(text, phones)
112 | sys.exit(0)
113 |
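`preprocess_jap` above splits the text on non-Japanese marks and feeds each Japanese chunk to `pyopenjtalk.g2p`, which returns space-separated phonemes that are then split and post-processed. A minimal sketch of the underlying call, assuming pyopenjtalk and its dictionary are installed (the exact output string may vary by version):

```python
import pyopenjtalk

# g2p returns a space-separated phoneme string, e.g. "k o N n i ch i w a"
print(pyopenjtalk.g2p("こんにちは"))
```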
--------------------------------------------------------------------------------
/motion/data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "blend shape:"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import numpy as np\n",
17 | "import matplotlib.pyplot as plt\n",
18 | "np_file_path = r'../records/2023-12-23-17-19-54.npy'\n",
19 | "bs = np.load(np_file_path, allow_pickle=True)\n",
20 | "print(bs.shape)\n",
21 | "# draw lines from bs\n",
22 | "for i in range(bs.shape[1]):\n",
23 | " line_data = bs[:120, i]\n",
24 | " plt.plot(line_data)\n"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "Rotation test"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 1,
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "name": "stdout",
41 | "output_type": "stream",
42 | "text": [
43 | "New rotation quaternion: [0 1 0 0]\n"
44 | ]
45 | }
46 | ],
47 | "source": [
48 | "import numpy as np\n",
49 | "\n",
50 | "# original rotation quaternion\n",
51 | "x3, y3, z3, w3 = (0, 0, 0, 1)\n",
52 | "\n",
53 | "# quaternion for a 180-degree rotation about the Y axis\n",
54 | "r = np.array([0, 1, 0, 0])\n",
55 | "\n",
56 | "# quaternion multiplication\n",
57 | "def quat_multiply(q1, q2):\n",
58 | " x1, y1, z1, w1 = q1\n",
59 | " x2, y2, z2, w2 = q2\n",
60 | " \n",
61 | " w = w1*w2 - x1*x2 - y1*y2 - z1*z2\n",
62 | " x = x1*w2 + w1*x2 + y1*z2 - z1*y2\n",
63 | " y = w1*y2 - x1*z2 + y1*w2 + z1*x2\n",
64 | " z = w1*z2 + x1*y2 - y1*x2 + z1*w2\n",
65 | " \n",
66 | " return np.array([x, y, z, w])\n",
67 | "\n",
68 | "# compute the new rotation quaternion\n",
69 | "new_quaternion = quat_multiply(r, [x3, y3, z3, w3])\n",
70 | "\n",
71 | "print(\"New rotation quaternion:\", new_quaternion)"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "Merge files"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 8,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "name": "stdout",
88 | "output_type": "stream",
89 | "text": [
90 | "(1270, 61)\n",
91 | "(1253, 61)\n",
92 | "(2523, 61)\n"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "# load tmp_cn.npy and tmp_en.npy\n",
98 | "import numpy as np\n",
99 | "bs1 = np.load(r'../tmp_cn.npy', allow_pickle=True)\n",
100 | "bs2 = np.load(r'../tmp_en.npy', allow_pickle=True)\n",
101 | "print(bs1.shape)\n",
102 | "print(bs2.shape)\n",
103 | "# concatenate along axis 0\n",
104 | "bs = np.concatenate((bs1, bs2), axis=0)\n",
105 | "print(bs.shape)\n",
106 | "# save only the first 1500 rows along axis 0\n",
107 | "# np.save(r'../tmp_16.npy', bs)\n",
108 | "np.save(r'../tmp_16.npy', bs[:1500])"
109 | ]
110 | }
111 | ],
112 | "metadata": {
113 | "kernelspec": {
114 | "display_name": "Python 3",
115 | "language": "python",
116 | "name": "python3"
117 | },
118 | "language_info": {
119 | "codemirror_mode": {
120 | "name": "ipython",
121 | "version": 3
122 | },
123 | "file_extension": ".py",
124 | "mimetype": "text/x-python",
125 | "name": "python",
126 | "nbconvert_exporter": "python",
127 | "pygments_lexer": "ipython3",
128 | "version": "3.10.8"
129 | }
130 | },
131 | "nbformat": 4,
132 | "nbformat_minor": 2
133 | }
134 |
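The notebook's `quat_multiply` is the Hamilton product written in (x, y, z, w) order, so `r = [0, 1, 0, 0]` is a 180-degree rotation about Y and multiplying it into the identity quaternion returns `r` itself, as the stored output shows. A standalone sanity check of the same convention (not part of the notebook):

```python
import numpy as np

# Hamilton product in (x, y, z, w) order, copied from the notebook cell above.
def quat_multiply(q1, q2):
    x1, y1, z1, w1 = q1
    x2, y2, z2, w2 = q2
    w = w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2
    x = x1 * w2 + w1 * x2 + y1 * z2 - z1 * y2
    y = w1 * y2 - x1 * z2 + y1 * w2 + z1 * x2
    z = w1 * z2 + x1 * y2 - y1 * x2 + z1 * w2
    return np.array([x, y, z, w])

r = np.array([0, 1, 0, 0])   # 180 degrees about Y
print(quat_multiply(r, r))   # [0 0 0 -1]: minus the identity quaternion, i.e. the identity rotation
```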
--------------------------------------------------------------------------------
/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 |
8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9 |
10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11 |
12 | models = dict()
13 |
14 |
15 | def get_bert_feature(
16 | text,
17 | word2ph,
18 | device=config.bert_gen_config.device,
19 | style_text=None,
20 | style_weight=0.7,
21 | ):
22 | if (
23 | sys.platform == "darwin"
24 | and torch.backends.mps.is_available()
25 | and device == "cpu"
26 | ):
27 | device = "mps"
28 | if not device:
29 | device = "cuda"
30 | if device not in models.keys():
31 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
32 | with torch.no_grad():
33 | inputs = tokenizer(text, return_tensors="pt")
34 | for i in inputs:
35 | inputs[i] = inputs[i].to(device)
36 | res = models[device](**inputs, output_hidden_states=True)
37 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
38 | if style_text:
39 | style_inputs = tokenizer(style_text, return_tensors="pt")
40 | for i in style_inputs:
41 | style_inputs[i] = style_inputs[i].to(device)
42 | style_res = models[device](**style_inputs, output_hidden_states=True)
43 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
44 | style_res_mean = style_res.mean(0)
45 | assert len(word2ph) == len(text) + 2
46 | word2phone = word2ph
47 | phone_level_feature = []
48 | for i in range(len(word2phone)):
49 | if style_text:
50 | repeat_feature = (
51 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
52 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
53 | )
54 | else:
55 | repeat_feature = res[i].repeat(word2phone[i], 1)
56 | phone_level_feature.append(repeat_feature)
57 |
58 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
59 |
60 | return phone_level_feature.T
61 |
62 |
63 | if __name__ == "__main__":
64 | word_level_feature = torch.rand(38, 1024)  # 38 words, each with a 1024-dim feature
65 | word2phone = [
66 | 1,
67 | 2,
68 | 1,
69 | 2,
70 | 2,
71 | 1,
72 | 2,
73 | 2,
74 | 1,
75 | 2,
76 | 2,
77 | 1,
78 | 2,
79 | 2,
80 | 2,
81 | 2,
82 | 2,
83 | 1,
84 | 1,
85 | 2,
86 | 2,
87 | 1,
88 | 2,
89 | 2,
90 | 2,
91 | 2,
92 | 1,
93 | 2,
94 | 2,
95 | 2,
96 | 2,
97 | 2,
98 | 1,
99 | 2,
100 | 2,
101 | 2,
102 | 2,
103 | 1,
104 | ]
105 |
106 | # total number of frames
107 | total_frames = sum(word2phone)
108 | print(word_level_feature.shape)
109 | print(word2phone)
110 | phone_level_feature = []
111 | for i in range(len(word2phone)):
112 | print(word_level_feature[i].shape)
113 |
114 | # repeat each word word2phone[i] times
115 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
116 | phone_level_feature.append(repeat_feature)
117 |
118 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
119 | print(phone_level_feature.shape)  # torch.Size([65, 1024])
120 |
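When `style_text` is given, `get_bert_feature` above mixes each token's hidden state with the mean hidden state of the style prompt before repeating it `word2phone[i]` times. A minimal sketch of that blending, with made-up tensors standing in for real BERT outputs:

```python
import torch

style_weight = 0.7
res_i = torch.randn(1024)           # hidden state of one input token (stand-in)
style_res_mean = torch.randn(1024)  # mean hidden state of the style prompt (stand-in)
n_phones = 3                        # word2phone[i] for this token

blended = (
    res_i.repeat(n_phones, 1) * (1 - style_weight)
    + style_res_mean.repeat(n_phones, 1) * style_weight
)
print(blended.shape)  # torch.Size([3, 1024])
```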
--------------------------------------------------------------------------------
/oldVersion/V210/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 |
8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9 |
10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11 |
12 | models = dict()
13 |
14 |
15 | def get_bert_feature(
16 | text,
17 | word2ph,
18 | device=config.bert_gen_config.device,
19 | style_text=None,
20 | style_weight=0.7,
21 | ):
22 | if (
23 | sys.platform == "darwin"
24 | and torch.backends.mps.is_available()
25 | and device == "cpu"
26 | ):
27 | device = "mps"
28 | if not device:
29 | device = "cuda"
30 | if device not in models.keys():
31 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
32 | with torch.no_grad():
33 | inputs = tokenizer(text, return_tensors="pt")
34 | for i in inputs:
35 | inputs[i] = inputs[i].to(device)
36 | res = models[device](**inputs, output_hidden_states=True)
37 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
38 | if style_text:
39 | style_inputs = tokenizer(style_text, return_tensors="pt")
40 | for i in style_inputs:
41 | style_inputs[i] = style_inputs[i].to(device)
42 | style_res = models[device](**style_inputs, output_hidden_states=True)
43 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
44 | style_res_mean = style_res.mean(0)
45 |
46 | assert len(word2ph) == len(text) + 2
47 | word2phone = word2ph
48 | phone_level_feature = []
49 | for i in range(len(word2phone)):
50 | if style_text:
51 | repeat_feature = (
52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
54 | )
55 | else:
56 | repeat_feature = res[i].repeat(word2phone[i], 1)
57 | phone_level_feature.append(repeat_feature)
58 |
59 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
60 |
61 | return phone_level_feature.T
62 |
63 |
64 | if __name__ == "__main__":
65 | word_level_feature = torch.rand(38, 1024)  # 38 words, each with a 1024-dim feature
66 | word2phone = [
67 | 1,
68 | 2,
69 | 1,
70 | 2,
71 | 2,
72 | 1,
73 | 2,
74 | 2,
75 | 1,
76 | 2,
77 | 2,
78 | 1,
79 | 2,
80 | 2,
81 | 2,
82 | 2,
83 | 2,
84 | 1,
85 | 1,
86 | 2,
87 | 2,
88 | 1,
89 | 2,
90 | 2,
91 | 2,
92 | 2,
93 | 1,
94 | 2,
95 | 2,
96 | 2,
97 | 2,
98 | 2,
99 | 1,
100 | 2,
101 | 2,
102 | 2,
103 | 2,
104 | 1,
105 | ]
106 |
107 | # total number of frames
108 | total_frames = sum(word2phone)
109 | print(word_level_feature.shape)
110 | print(word2phone)
111 | phone_level_feature = []
112 | for i in range(len(word2phone)):
113 | print(word_level_feature[i].shape)
114 |
115 | # repeat each word word2phone[i] times
116 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
117 | phone_level_feature.append(repeat_feature)
118 |
119 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
120 | print(phone_level_feature.shape)  # torch.Size([65, 1024])
121 |
--------------------------------------------------------------------------------
/bert/deberta-v3-large/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | language: en
3 | tags:
4 | - deberta
5 | - deberta-v3
6 | - fill-mask
7 | thumbnail: https://huggingface.co/front/thumbnails/microsoft.png
8 | license: mit
9 | ---
10 |
11 | ## DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing
12 |
13 | [DeBERTa](https://arxiv.org/abs/2006.03654) improves the BERT and RoBERTa models using disentangled attention and an enhanced mask decoder. With those two improvements, DeBERTa outperforms RoBERTa on a majority of NLU tasks with 80GB of training data.
14 |
15 | In [DeBERTa V3](https://arxiv.org/abs/2111.09543), we further improved the efficiency of DeBERTa using ELECTRA-Style pre-training with Gradient Disentangled Embedding Sharing. Compared to DeBERTa, our V3 version significantly improves model performance on downstream tasks. You can find more technical details about the new model in our [paper](https://arxiv.org/abs/2111.09543).
16 |
17 | Please check the [official repository](https://github.com/microsoft/DeBERTa) for more implementation details and updates.
18 |
19 | The DeBERTa V3 large model comes with 24 layers and a hidden size of 1024. It has 304M backbone parameters and a vocabulary of 128K tokens, which introduces 131M parameters in the embedding layer. This model was trained on the same 160GB of data as DeBERTa V2.
20 |
21 |
22 | #### Fine-tuning on NLU tasks
23 |
24 | We present the dev results on SQuAD 2.0 and MNLI tasks.
25 |
26 | | Model |Vocabulary(K)|Backbone #Params(M)| SQuAD 2.0(F1/EM) | MNLI-m/mm(ACC)|
27 | |-------------------|----------|-------------------|-----------|----------|
28 | | RoBERTa-large |50 |304 | 89.4/86.5 | 90.2 |
29 | | XLNet-large |32 |- | 90.6/87.9 | 90.8 |
30 | | DeBERTa-large |50 |- | 90.7/88.0 | 91.3 |
31 | | **DeBERTa-v3-large**|128|304 | **91.5/89.0**| **91.8/91.9**|
32 |
33 |
34 | #### Fine-tuning with HF transformers
35 |
36 | ```bash
37 | #!/bin/bash
38 |
39 | cd transformers/examples/pytorch/text-classification/
40 |
41 | pip install datasets
42 | export TASK_NAME=mnli
43 |
44 | output_dir="ds_results"
45 |
46 | num_gpus=8
47 |
48 | batch_size=8
49 |
50 | python -m torch.distributed.launch --nproc_per_node=${num_gpus} \
51 | run_glue.py \
52 | --model_name_or_path microsoft/deberta-v3-large \
53 | --task_name $TASK_NAME \
54 | --do_train \
55 | --do_eval \
56 | --evaluation_strategy steps \
57 | --max_seq_length 256 \
58 | --warmup_steps 50 \
59 | --per_device_train_batch_size ${batch_size} \
60 | --learning_rate 6e-6 \
61 | --num_train_epochs 2 \
62 | --output_dir $output_dir \
63 | --overwrite_output_dir \
64 | --logging_steps 1000 \
65 | --logging_dir $output_dir
66 |
67 | ```
68 |
69 | ### Citation
70 |
71 | If you find DeBERTa useful for your work, please cite the following papers:
72 |
73 | ``` latex
74 | @misc{he2021debertav3,
75 | title={DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing},
76 | author={Pengcheng He and Jianfeng Gao and Weizhu Chen},
77 | year={2021},
78 | eprint={2111.09543},
79 | archivePrefix={arXiv},
80 | primaryClass={cs.CL}
81 | }
82 | ```
83 |
84 | ``` latex
85 | @inproceedings{
86 | he2021deberta,
87 | title={DEBERTA: DECODING-ENHANCED BERT WITH DISENTANGLED ATTENTION},
88 | author={Pengcheng He and Xiaodong Liu and Jianfeng Gao and Weizhu Chen},
89 | booktitle={International Conference on Learning Representations},
90 | year={2021},
91 | url={https://openreview.net/forum?id=XPZIaotutsD}
92 | }
93 | ```
94 |
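For orientation, a minimal masked-LM usage sketch, assuming the `microsoft/deberta-v3-large` checkpoint is available from the Hub or mirrored locally:

```python
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
model = AutoModelForMaskedLM.from_pretrained("microsoft/deberta-v3-large")

inputs = tokenizer("DeBERTa improves the [MASK] and RoBERTa models.", return_tensors="pt")
outputs = model(**inputs, output_hidden_states=True)
print(outputs.hidden_states[-1].shape)  # (1, seq_len, 1024) for the large model
```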
--------------------------------------------------------------------------------
/oldVersion/V210/emo_gen.py:
--------------------------------------------------------------------------------
1 | import librosa
2 | import numpy as np
3 | import torch
4 | import torch.nn as nn
5 | from torch.utils.data import Dataset
6 |
7 | from transformers import Wav2Vec2Processor
8 | from transformers.models.wav2vec2.modeling_wav2vec2 import (
9 | Wav2Vec2Model,
10 | Wav2Vec2PreTrainedModel,
11 | )
12 |
13 | from config import config
14 |
15 |
16 | class RegressionHead(nn.Module):
17 | r"""Regression head."""
18 |
19 | def __init__(self, config):
20 | super().__init__()
21 |
22 | self.dense = nn.Linear(config.hidden_size, config.hidden_size)
23 | self.dropout = nn.Dropout(config.final_dropout)
24 | self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
25 |
26 | def forward(self, features, **kwargs):
27 | x = features
28 | x = self.dropout(x)
29 | x = self.dense(x)
30 | x = torch.tanh(x)
31 | x = self.dropout(x)
32 | x = self.out_proj(x)
33 |
34 | return x
35 |
36 |
37 | class EmotionModel(Wav2Vec2PreTrainedModel):
38 | r"""Speech emotion classifier."""
39 |
40 | def __init__(self, config):
41 | super().__init__(config)
42 |
43 | self.config = config
44 | self.wav2vec2 = Wav2Vec2Model(config)
45 | self.classifier = RegressionHead(config)
46 | self.init_weights()
47 |
48 | def forward(
49 | self,
50 | input_values,
51 | ):
52 | outputs = self.wav2vec2(input_values)
53 | hidden_states = outputs[0]
54 | hidden_states = torch.mean(hidden_states, dim=1)
55 | logits = self.classifier(hidden_states)
56 |
57 | return hidden_states, logits
58 |
59 |
60 | class AudioDataset(Dataset):
61 | def __init__(self, list_of_wav_files, sr, processor):
62 | self.list_of_wav_files = list_of_wav_files
63 | self.processor = processor
64 | self.sr = sr
65 |
66 | def __len__(self):
67 | return len(self.list_of_wav_files)
68 |
69 | def __getitem__(self, idx):
70 | wav_file = self.list_of_wav_files[idx]
71 | audio_data, _ = librosa.load(wav_file, sr=self.sr)
72 | processed_data = self.processor(audio_data, sampling_rate=self.sr)[
73 | "input_values"
74 | ][0]
75 | return torch.from_numpy(processed_data)
76 |
77 |
78 | device = config.emo_gen_config.device
79 | model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim"
80 | processor = Wav2Vec2Processor.from_pretrained(model_name)
81 | model = EmotionModel.from_pretrained(model_name).to(device)
82 |
83 |
84 | def process_func(
85 | x: np.ndarray,
86 | sampling_rate: int,
87 | model: EmotionModel,
88 | processor: Wav2Vec2Processor,
89 | device: str,
90 | embeddings: bool = False,
91 | ) -> np.ndarray:
92 | r"""Predict emotions or extract embeddings from raw audio signal."""
93 | model = model.to(device)
94 | y = processor(x, sampling_rate=sampling_rate)
95 | y = y["input_values"][0]
96 | y = torch.from_numpy(y).unsqueeze(0).to(device)
97 |
98 | # run through model
99 | with torch.no_grad():
100 | y = model(y)[0 if embeddings else 1]
101 |
102 | # convert to numpy
103 | y = y.detach().cpu().numpy()
104 |
105 | return y
106 |
107 |
108 | def get_emo(path):
109 | wav, sr = librosa.load(path, sr=16000)
110 | return process_func(
111 | np.expand_dims(wav, 0).astype(np.float64),
112 | sr,
113 | model,
114 | processor,
115 | device,
116 | embeddings=True,
117 | ).squeeze(0)
118 |
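`process_func` above returns either the time-pooled wav2vec2 hidden state (`embeddings=True`) or the regression logits from the head. A minimal sketch that reuses the module-level `model`, `processor` and `device`, assuming the checkpoint referenced by `model_name` has been downloaded; the exact output sizes depend on that checkpoint:

```python
import numpy as np

signal = np.zeros((1, 16000), dtype=np.float32)  # 1 second of silence at 16 kHz

emb = process_func(signal, 16000, model, processor, device, embeddings=True)
preds = process_func(signal, 16000, model, processor, device, embeddings=False)
print(emb.shape, preds.shape)  # e.g. (1, 1024) and (1, 3) for this checkpoint
```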
--------------------------------------------------------------------------------