├── .gitmodules ├── oldVersion ├── V111 │ └── text │ │ ├── fix │ │ ├── __init__.py │ │ └── japanese_bert.py │ │ ├── english_bert_mock.py │ │ ├── japanese_bert.py │ │ ├── cleaner.py │ │ ├── __init__.py │ │ ├── chinese_bert.py │ │ └── symbols.py ├── __init__.py ├── V200 │ ├── text │ │ ├── cmudict_cache.pickle │ │ ├── bert_utils.py │ │ ├── cleaner.py │ │ ├── english_bert_mock.py │ │ ├── __init__.py │ │ ├── japanese_bert.py │ │ ├── chinese_bert.py │ │ └── symbols.py │ └── __init__.py ├── V210 │ ├── text │ │ ├── cmudict_cache.pickle │ │ ├── bert_utils.py │ │ ├── cleaner.py │ │ ├── __init__.py │ │ ├── english_bert_mock.py │ │ ├── japanese_bert.py │ │ ├── symbols.py │ │ └── chinese_bert.py │ └── emo_gen.py ├── V101 │ ├── text │ │ ├── english_bert_mock.py │ │ ├── cleaner.py │ │ ├── __init__.py │ │ ├── chinese_bert.py │ │ ├── symbols.py │ │ └── japanese.py │ └── __init__.py └── V110 │ ├── text │ ├── english_bert_mock.py │ ├── cleaner.py │ ├── __init__.py │ ├── japanese_bert.py │ ├── chinese_bert.py │ └── symbols.py │ └── __init__.py ├── tools ├── __init__.py ├── log.py └── translate.py ├── bert ├── chinese-roberta-wwm-ext-large │ ├── added_tokens.json │ ├── tokenizer_config.json │ ├── special_tokens_map.json │ ├── .gitattributes │ ├── config.json │ └── README.md ├── deberta-v3-large │ ├── tokenizer_config.json │ ├── generator_config.json │ ├── config.json │ ├── .gitattributes │ └── README.md ├── deberta-v2-large-japanese-char-wwm │ ├── special_tokens_map.json │ ├── tokenizer_config.json │ ├── config.json │ └── .gitattributes ├── deberta-v2-large-japanese │ ├── special_tokens_map.json │ ├── tokenizer_config.json │ ├── config.json │ └── .gitattributes ├── bert-base-japanese-v3 │ ├── tokenizer_config.json │ ├── config.json │ ├── .gitattributes │ └── README.md ├── bert-large-japanese-v2 │ ├── tokenizer_config.json │ ├── config.json │ ├── .gitattributes │ └── README.md └── bert_models.json ├── onnx_modules ├── V200 │ ├── text │ │ ├── __init__.py │ │ ├── bert_utils.py │ │ ├── cleaner.py │ │ ├── english_bert_mock.py │ │ ├── japanese_bert.py │ │ ├── chinese_bert.py │ │ └── symbols.py │ └── __init__.py ├── V210 │ ├── text │ │ ├── __init__.py │ │ └── symbols.py │ └── __init__.py ├── V220 │ ├── text │ │ ├── __init__.py │ │ └── symbols.py │ └── __init__.py ├── V230 │ ├── text │ │ ├── __init__.py │ │ └── symbols.py │ └── __init__.py ├── V220_novq_dev │ ├── text │ │ ├── __init__.py │ │ └── symbols.py │ └── __init__.py └── __init__.py ├── emotional ├── wav2vec2-large-robust-12-ft-emotion-msp-dim │ ├── vocab.json │ ├── preprocessor_config.json │ ├── .gitattributes │ └── config.json └── clap-htsat-fused │ ├── special_tokens_map.json │ ├── tokenizer_config.json │ ├── preprocessor_config.json │ └── .gitattributes ├── img ├── 宵宫.png ├── yuyu.png ├── 参数说明.png ├── 神里绫华.png ├── 纳西妲.png ├── bert-vits2-e.png └── 微信图片_20231010105112.png ├── text ├── cmudict_cache.pickle ├── bert_utils.py ├── cleaner.py ├── __init__.py ├── english_bert_mock.py ├── japanese_bert.py ├── symbols.py └── chinese_bert.py ├── slm └── wavlm-base-plus │ ├── preprocessor_config.json │ ├── .gitattributes │ └── config.json ├── css └── custom.css ├── requirements.txt ├── export_onnx.py ├── monotonic_align ├── __init__.py └── core.py ├── .pre-commit-config.yaml ├── run_MnodesAndMgpus.sh ├── clap_wrapper.py ├── onnx_infer.py ├── resample_legacy.py ├── resample.py ├── clap_gen.py ├── .vscode └── launch.json ├── motion ├── record.py ├── wav_to_visemes.py ├── prepare_visemes.py ├── visemes_tools.py └── data.ipynb ├── Improvement_2025.md ├── re_matching.py ├── 
compress_model.py ├── configs └── config.json ├── spec_gen.py ├── bert_gen.py └── update_status.py /.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /oldVersion/V111/text/fix/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 工具包 3 | """ 4 | -------------------------------------------------------------------------------- /oldVersion/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 老版本模型推理兼容 3 | """ 4 | -------------------------------------------------------------------------------- /bert/chinese-roberta-wwm-ext-large/added_tokens.json: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /onnx_modules/V200/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | -------------------------------------------------------------------------------- /onnx_modules/V210/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | -------------------------------------------------------------------------------- /onnx_modules/V220/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | -------------------------------------------------------------------------------- /onnx_modules/V230/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | -------------------------------------------------------------------------------- /emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/vocab.json: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /onnx_modules/V220_novq_dev/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | -------------------------------------------------------------------------------- /img/宵宫.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/img/宵宫.png -------------------------------------------------------------------------------- /bert/chinese-roberta-wwm-ext-large/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | {"init_inputs": []} 2 | -------------------------------------------------------------------------------- /img/yuyu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/img/yuyu.png -------------------------------------------------------------------------------- /img/参数说明.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/img/参数说明.png -------------------------------------------------------------------------------- 
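The per-version text packages above expose their phoneme inventory through these __init__.py stubs ("from .symbols import *"). As a rough illustration of how such a symbols table is typically consumed (the same _symbol_to_id mapping appears verbatim in the oldVersion/*/text/__init__.py files later in this listing), here is a minimal, self-contained sketch; the tiny symbols list is a hypothetical stand-in, not the real table from symbols.py:

# Minimal sketch, assuming a made-up symbol inventory; the real one lives in each version's symbols.py.
symbols = ["_", "a", "i", "u", "e", "o", "N"]

# Same pattern as the text/__init__.py modules in this repo: map each symbol to its index.
_symbol_to_id = {s: i for i, s in enumerate(symbols)}


def phonemes_to_ids(cleaned_phonemes):
    # Convert cleaned phoneme symbols into the integer IDs that downstream code consumes.
    return [_symbol_to_id[p] for p in cleaned_phonemes]


print(phonemes_to_ids(["a", "i", "N"]))  # -> [1, 2, 6]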
/img/神里绫华.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/img/神里绫华.png -------------------------------------------------------------------------------- /img/纳西妲.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/img/纳西妲.png -------------------------------------------------------------------------------- /img/bert-vits2-e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/img/bert-vits2-e.png -------------------------------------------------------------------------------- /text/cmudict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/text/cmudict_cache.pickle -------------------------------------------------------------------------------- /bert/deberta-v3-large/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "do_lower_case": false, 3 | "vocab_type": "spm" 4 | } 5 | -------------------------------------------------------------------------------- /img/微信图片_20231010105112.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/img/微信图片_20231010105112.png -------------------------------------------------------------------------------- /oldVersion/V200/text/cmudict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/oldVersion/V200/text/cmudict_cache.pickle -------------------------------------------------------------------------------- /oldVersion/V210/text/cmudict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/see2023/Bert-VITS2-ext/HEAD/oldVersion/V210/text/cmudict_cache.pickle -------------------------------------------------------------------------------- /oldVersion/V101/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_bert_feature(norm_text, word2ph): 5 | return torch.zeros(1024, sum(word2ph)) 6 | -------------------------------------------------------------------------------- /oldVersion/V110/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_bert_feature(norm_text, word2ph): 5 | return torch.zeros(1024, sum(word2ph)) 6 | -------------------------------------------------------------------------------- /oldVersion/V111/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_bert_feature(norm_text, word2ph): 5 | return torch.zeros(1024, sum(word2ph)) 6 | -------------------------------------------------------------------------------- /onnx_modules/V200/__init__.py: -------------------------------------------------------------------------------- 1 | from .text.symbols import symbols 2 | from .models_onnx import SynthesizerTrn 3 | 4 | __all__ = ["symbols", "SynthesizerTrn"] 5 | -------------------------------------------------------------------------------- 
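Each english_bert_mock.py above returns torch.zeros(1024, sum(word2ph)), i.e. one 1024-dimensional feature column per phone, where word2ph records how many phones each text position expands to. A small self-contained check of that shape contract (only torch is required; the word2ph values below are made up for illustration):

import torch


def get_bert_feature(norm_text, word2ph):
    # Mirrors the mock modules above: a zero 1024-dim feature per phone.
    return torch.zeros(1024, sum(word2ph))


word2ph = [1, 2, 3, 1]  # hypothetical phone counts per text position
feat = get_bert_feature("demo", word2ph)
assert feat.shape == (1024, sum(word2ph))  # torch.Size([1024, 7])
print(feat.shape)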
/onnx_modules/V210/__init__.py: -------------------------------------------------------------------------------- 1 | from .text.symbols import symbols 2 | from .models_onnx import SynthesizerTrn 3 | 4 | __all__ = ["symbols", "SynthesizerTrn"] 5 | -------------------------------------------------------------------------------- /onnx_modules/V220/__init__.py: -------------------------------------------------------------------------------- 1 | from .text.symbols import symbols 2 | from .models_onnx import SynthesizerTrn 3 | 4 | __all__ = ["symbols", "SynthesizerTrn"] 5 | -------------------------------------------------------------------------------- /onnx_modules/V230/__init__.py: -------------------------------------------------------------------------------- 1 | from .text.symbols import symbols 2 | from .models_onnx import SynthesizerTrn 3 | 4 | __all__ = ["symbols", "SynthesizerTrn"] 5 | -------------------------------------------------------------------------------- /onnx_modules/V220_novq_dev/__init__.py: -------------------------------------------------------------------------------- 1 | from .text.symbols import symbols 2 | from .models_onnx import SynthesizerTrn 3 | 4 | __all__ = ["symbols", "SynthesizerTrn"] 5 | -------------------------------------------------------------------------------- /bert/chinese-roberta-wwm-ext-large/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"} 2 | -------------------------------------------------------------------------------- /bert/deberta-v2-large-japanese-char-wwm/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "mask_token": "[MASK]", 4 | "pad_token": "[PAD]", 5 | "sep_token": "[SEP]", 6 | "unk_token": "[UNK]" 7 | } 8 | -------------------------------------------------------------------------------- /bert/deberta-v2-large-japanese/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": "[CLS]", 3 | "cls_token": "[CLS]", 4 | "eos_token": "[SEP]", 5 | "mask_token": "[MASK]", 6 | "pad_token": "[PAD]", 7 | "sep_token": "[SEP]", 8 | "unk_token": "[UNK]" 9 | } 10 | -------------------------------------------------------------------------------- /slm/wavlm-base-plus/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "do_normalize": false, 3 | "feature_extractor_type": "Wav2Vec2FeatureExtractor", 4 | "feature_size": 1, 5 | "padding_side": "right", 6 | "padding_value": 0.0, 7 | "return_attention_mask": true, 8 | "sampling_rate": 16000 9 | } 10 | -------------------------------------------------------------------------------- /emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "do_normalize": true, 3 | "feature_extractor_type": "Wav2Vec2FeatureExtractor", 4 | "feature_size": 1, 5 | "padding_side": "right", 6 | "padding_value": 0.0, 7 | "return_attention_mask": true, 8 | "sampling_rate": 16000 9 | } 10 | -------------------------------------------------------------------------------- /css/custom.css: -------------------------------------------------------------------------------- 1 | 2 | #yml_code { 3 | height: 600px; 4 | flex-grow: inherit; 
5 | overflow-y: auto; 6 | } 7 | 8 | #json_code { 9 | height: 600px; 10 | flex-grow: inherit; 11 | overflow-y: auto; 12 | } 13 | 14 | #gpu_code { 15 | height: 300px; 16 | flex-grow: inherit; 17 | overflow-y: auto; 18 | } 19 |
--------------------------------------------------------------------------------
/bert/bert-base-japanese-v3/tokenizer_config.json:
--------------------------------------------------------------------------------
{
  "tokenizer_class": "BertJapaneseTokenizer",
  "model_max_length": 512,
  "do_lower_case": false,
  "word_tokenizer_type": "mecab",
  "subword_tokenizer_type": "wordpiece",
  "mecab_kwargs": {
    "mecab_dic": "unidic_lite"
  }
}

--------------------------------------------------------------------------------
/bert/bert-large-japanese-v2/tokenizer_config.json:
--------------------------------------------------------------------------------
{
  "tokenizer_class": "BertJapaneseTokenizer",
  "model_max_length": 512,
  "do_lower_case": false,
  "word_tokenizer_type": "mecab",
  "subword_tokenizer_type": "wordpiece",
  "mecab_kwargs": {
    "mecab_dic": "unidic_lite"
  }
}

--------------------------------------------------------------------------------
/tools/log.py:
--------------------------------------------------------------------------------
"""
Logger wrapper
"""
from loguru import logger
import sys


# Remove all default handlers
logger.remove()

# Custom format, added to standard output
log_format = (
    "{time:MM-DD HH:mm:ss} {level:<9}| {file}:{line} | {message}"
)

logger.add(sys.stdout, format=log_format, backtrace=True, diagnose=True)

--------------------------------------------------------------------------------
/emotional/clap-htsat-fused/special_tokens_map.json:
--------------------------------------------------------------------------------
{
  "bos_token": "",
  "cls_token": "",
  "eos_token": "",
  "mask_token": {
    "content": "",
    "lstrip": true,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "",
  "sep_token": "",
  "unk_token": ""
}

--------------------------------------------------------------------------------
/bert/deberta-v2-large-japanese/tokenizer_config.json:
--------------------------------------------------------------------------------
{
  "bos_token": "[CLS]",
  "cls_token": "[CLS]",
  "do_lower_case": false,
  "eos_token": "[SEP]",
  "keep_accents": true,
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "sp_model_kwargs": {},
  "special_tokens_map_file": null,
  "split_by_punct": false,
  "tokenizer_class": "DebertaV2Tokenizer",
  "unk_token": "[UNK]"
}

--------------------------------------------------------------------------------
/bert/chinese-roberta-wwm-ext-large/.gitattributes:
--------------------------------------------------------------------------------
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text

-------------------------------------------------------------------------------- /requirements.txt:
-------------------------------------------------------------------------------- 1 | librosa==0.9.2 2 | matplotlib 3 | numpy 4 | numba 5 | phonemizer 6 | scipy 7 | tensorboard 8 | Unidecode 9 | amfm_decompy 10 | jieba 11 | transformers 12 | pypinyin 13 | cn2an 14 | gradio 15 | av 16 | mecab-python3 17 | loguru 18 | unidic-lite 19 | cmudict 20 | fugashi 21 | num2words 22 | PyYAML 23 | requests 24 | pyopenjtalk-prebuilt 25 | jaconv 26 | psutil 27 | GPUtil 28 | vector_quantize_pytorch 29 | g2p_en 30 | sentencepiece 31 | pykakasi 32 | langid 33 | -------------------------------------------------------------------------------- /emotional/clap-htsat-fused/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "bos_token": "", 4 | "cls_token": "", 5 | "eos_token": "", 6 | "errors": "replace", 7 | "mask_token": "", 8 | "model_max_length": 512, 9 | "pad_token": "", 10 | "processor_class": "ClapProcessor", 11 | "sep_token": "", 12 | "special_tokens_map_file": null, 13 | "tokenizer_class": "RobertaTokenizer", 14 | "trim_offsets": true, 15 | "unk_token": "" 16 | } 17 | -------------------------------------------------------------------------------- /export_onnx.py: -------------------------------------------------------------------------------- 1 | from onnx_modules import export_onnx 2 | import os 3 | 4 | if __name__ == "__main__": 5 | export_path = "BertVits2.2PT" 6 | model_path = "model\\G_0.pth" 7 | config_path = "model\\config.json" 8 | novq = False 9 | dev = False 10 | if not os.path.exists("onnx"): 11 | os.makedirs("onnx") 12 | if not os.path.exists(f"onnx/{export_path}"): 13 | os.makedirs(f"onnx/{export_path}") 14 | export_onnx(export_path, model_path, config_path, novq, dev) 15 | -------------------------------------------------------------------------------- /bert/bert_models.json: -------------------------------------------------------------------------------- 1 | { 2 | "deberta-v2-large-japanese-char-wwm": { 3 | "repo_id": "ku-nlp/deberta-v2-large-japanese-char-wwm", 4 | "files": ["pytorch_model.bin"] 5 | }, 6 | "chinese-roberta-wwm-ext-large": { 7 | "repo_id": "hfl/chinese-roberta-wwm-ext-large", 8 | "files": ["pytorch_model.bin"] 9 | }, 10 | "deberta-v3-large": { 11 | "repo_id": "microsoft/deberta-v3-large", 12 | "files": ["spm.model", "pytorch_model.bin"] 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /bert/bert-base-japanese-v3/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForPreTraining" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 32768 19 | } 20 | -------------------------------------------------------------------------------- /bert/bert-large-japanese-v2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForPreTraining" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 1024, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 4096, 
11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 16, 15 | "num_hidden_layers": 24, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 32768 19 | } 20 | -------------------------------------------------------------------------------- /monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | from numpy import zeros, int32, float32 2 | from torch import from_numpy 3 | 4 | from .core import maximum_path_jit 5 | 6 | 7 | def maximum_path(neg_cent, mask): 8 | device = neg_cent.device 9 | dtype = neg_cent.dtype 10 | neg_cent = neg_cent.data.cpu().numpy().astype(float32) 11 | path = zeros(neg_cent.shape, dtype=int32) 12 | 13 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(int32) 14 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(int32) 15 | maximum_path_jit(path, neg_cent, t_t_max, t_s_max) 16 | return from_numpy(path).to(device=device, dtype=dtype) 17 | -------------------------------------------------------------------------------- /bert/deberta-v2-large-japanese-char-wwm/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "do_lower_case": false, 4 | "do_subword_tokenize": true, 5 | "do_word_tokenize": true, 6 | "jumanpp_kwargs": null, 7 | "mask_token": "[MASK]", 8 | "mecab_kwargs": null, 9 | "model_max_length": 1000000000000000019884624838656, 10 | "never_split": null, 11 | "pad_token": "[PAD]", 12 | "sep_token": "[SEP]", 13 | "special_tokens_map_file": null, 14 | "subword_tokenizer_type": "character", 15 | "sudachi_kwargs": null, 16 | "tokenizer_class": "BertJapaneseTokenizer", 17 | "unk_token": "[UNK]", 18 | "word_tokenizer_type": "basic" 19 | } 20 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | 9 | - repo: https://github.com/astral-sh/ruff-pre-commit 10 | rev: v0.1.8 11 | hooks: 12 | - id: ruff 13 | args: [ --fix ] 14 | 15 | - repo: https://github.com/psf/black 16 | rev: 23.12.0 17 | hooks: 18 | - id: black 19 | 20 | - repo: https://github.com/codespell-project/codespell 21 | rev: v2.2.6 22 | hooks: 23 | - id: codespell 24 | files: ^.*\.(py|md|rst|yml)$ 25 | args: [-L=fro] 26 | -------------------------------------------------------------------------------- /emotional/clap-htsat-fused/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "chunk_length_s": 10, 3 | "feature_extractor_type": "ClapFeatureExtractor", 4 | "feature_size": 64, 5 | "fft_window_size": 1024, 6 | "frequency_max": 14000, 7 | "frequency_min": 50, 8 | "hop_length": 480, 9 | "max_length_s": 10, 10 | "n_fft": 1024, 11 | "nb_frequency_bins": 513, 12 | "nb_max_frames": 1000, 13 | "nb_max_samples": 480000, 14 | "padding": "repeatpad", 15 | "padding_side": "right", 16 | "padding_value": 0.0, 17 | "processor_class": "ClapProcessor", 18 | "return_attention_mask": false, 19 | "sampling_rate": 48000, 20 | "top_db": null, 21 | "truncation": "fusion" 22 | } 23 | -------------------------------------------------------------------------------- /bert/deberta-v3-large/generator_config.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "model_type": "deberta-v2", 3 | "attention_probs_dropout_prob": 0.1, 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.1, 6 | "hidden_size": 1024, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 4096, 9 | "max_position_embeddings": 512, 10 | "relative_attention": true, 11 | "position_buckets": 256, 12 | "norm_rel_ebd": "layer_norm", 13 | "share_att_key": true, 14 | "pos_att_type": "p2c|c2p", 15 | "layer_norm_eps": 1e-7, 16 | "max_relative_positions": -1, 17 | "position_biased_input": false, 18 | "num_attention_heads": 16, 19 | "num_hidden_layers": 12, 20 | "type_vocab_size": 0, 21 | "vocab_size": 128100 22 | } 23 | -------------------------------------------------------------------------------- /bert/deberta-v3-large/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "deberta-v2", 3 | "attention_probs_dropout_prob": 0.1, 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.1, 6 | "hidden_size": 1024, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 4096, 9 | "max_position_embeddings": 512, 10 | "relative_attention": true, 11 | "position_buckets": 256, 12 | "norm_rel_ebd": "layer_norm", 13 | "share_att_key": true, 14 | "pos_att_type": "p2c|c2p", 15 | "layer_norm_eps": 1e-7, 16 | "max_relative_positions": -1, 17 | "position_biased_input": false, 18 | "num_attention_heads": 16, 19 | "num_hidden_layers": 24, 20 | "type_vocab_size": 0, 21 | "vocab_size": 128100 22 | } 23 | -------------------------------------------------------------------------------- /text/bert_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from huggingface_hub import hf_hub_download 4 | 5 | from config import config 6 | 7 | 8 | MIRROR: str = config.mirror 9 | 10 | 11 | def _check_bert(repo_id, files, local_path): 12 | for file in files: 13 | if not Path(local_path).joinpath(file).exists(): 14 | if MIRROR.lower() == "openi": 15 | import openi 16 | 17 | openi.model.download_model( 18 | "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert" 19 | ) 20 | else: 21 | hf_hub_download( 22 | repo_id, file, local_dir=local_path, local_dir_use_symlinks=False 23 | ) 24 | -------------------------------------------------------------------------------- /oldVersion/V200/text/bert_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from huggingface_hub import hf_hub_download 4 | 5 | from config import config 6 | 7 | 8 | MIRROR: str = config.mirror 9 | 10 | 11 | def _check_bert(repo_id, files, local_path): 12 | for file in files: 13 | if not Path(local_path).joinpath(file).exists(): 14 | if MIRROR.lower() == "openi": 15 | import openi 16 | 17 | openi.model.download_model( 18 | "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert" 19 | ) 20 | else: 21 | hf_hub_download( 22 | repo_id, file, local_dir=local_path, local_dir_use_symlinks=False 23 | ) 24 | -------------------------------------------------------------------------------- /oldVersion/V210/text/bert_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from huggingface_hub import hf_hub_download 4 | 5 | from config import config 6 | 7 | 8 | MIRROR: str = config.mirror 9 | 10 | 11 | def _check_bert(repo_id, files, local_path): 12 | for file in files: 13 | if not 
Path(local_path).joinpath(file).exists(): 14 | if MIRROR.lower() == "openi": 15 | import openi 16 | 17 | openi.model.download_model( 18 | "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert" 19 | ) 20 | else: 21 | hf_hub_download( 22 | repo_id, file, local_dir=local_path, local_dir_use_symlinks=False 23 | ) 24 |
--------------------------------------------------------------------------------
/onnx_modules/V200/text/bert_utils.py:
--------------------------------------------------------------------------------
from pathlib import Path

from huggingface_hub import hf_hub_download

from config import config


MIRROR: str = config.mirror


def _check_bert(repo_id, files, local_path):
    for file in files:
        if not Path(local_path).joinpath(file).exists():
            if MIRROR.lower() == "openi":
                import openi

                openi.model.download_model(
                    "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert"
                )
            else:
                hf_hub_download(
                    repo_id, file, local_dir=local_path, local_dir_use_symlinks=False
                )

--------------------------------------------------------------------------------
/run_MnodesAndMgpus.sh:
--------------------------------------------------------------------------------
# Multi-node, multi-GPU training

# --nnodes=1:3 means use one to three machines, with resources allocated elastically
# --nnodes=<min node count>:<max node count>
# --nproc_per_node=number of GPUs available on each machine
# --rdzv_endpoint=ip:port of the master node (the one started first)
# Nothing else needs to change

# Note: distributed training in this version is data-parallel, so multi-node multi-GPU amounts to a larger effective batch size, and epochs iterate faster.
# However, because this version of the code saves checkpoints by global step, checkpoint saving will not get noticeably faster in wall-clock time,
# but each saved checkpoint will have iterated through more epochs than before, i.e. "better results in fewer steps".

#*************************
# torchrun \
# --nnodes=1:3\
# --nproc_per_node=2\
# --rdzv_id=1\
# --rdzv_backend=c10d\
# --rdzv_endpoint="inspur1:8880"\
# train_ms.py
#****************************

# Multi-GPU training on a single node
# nproc_per_node = number of GPUs available on the machine

#*************************
torchrun \
  --nnodes=1\
  --nproc_per_node=2\
  train_ms.py
#*************************

--------------------------------------------------------------------------------
/bert/chinese-roberta-wwm-ext-large/config.json:
--------------------------------------------------------------------------------
{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "directionality": "bidi",
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 21128
}

-------------------------------------------------------------------------------- /oldVersion/V101/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from .
import chinese, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /oldVersion/V110/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . import chinese, japanese, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese, "JP": japanese} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /text/cleaner.py: -------------------------------------------------------------------------------- 1 | from text import chinese, japanese, english, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /oldVersion/V200/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . 
import chinese, japanese, english, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /oldVersion/V210/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . import chinese, japanese, english, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /onnx_modules/V200/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . 
import chinese, japanese, english, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /oldVersion/V101/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | 4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 5 | 6 | 7 | def cleaned_text_to_sequence(cleaned_text, tones, language): 8 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 9 | Args: 10 | text: string to convert to a sequence 11 | Returns: 12 | List of integers corresponding to the symbols in the text 13 | """ 14 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 15 | tone_start = language_tone_start_map[language] 16 | tones = [i + tone_start for i in tones] 17 | lang_id = language_id_map[language] 18 | lang_ids = [lang_id for i in phones] 19 | return phones, tones, lang_ids 20 | 21 | 22 | def get_bert(norm_text, word2ph, language): 23 | from .chinese_bert import get_bert_feature as zh_bert 24 | from .english_bert_mock import get_bert_feature as en_bert 25 | 26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert} 27 | bert = lang_bert_func_map[language](norm_text, word2ph) 28 | return bert 29 | -------------------------------------------------------------------------------- /bert/deberta-v2-large-japanese-char-wwm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "DebertaV2ForMaskedLM" 4 | ], 5 | "attention_head_size": 64, 6 | "attention_probs_dropout_prob": 0.1, 7 | "conv_act": "gelu", 8 | "conv_kernel_size": 3, 9 | "hidden_act": "gelu", 10 | "hidden_dropout_prob": 0.1, 11 | "hidden_size": 1024, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 4096, 14 | "layer_norm_eps": 1e-07, 15 | "max_position_embeddings": 512, 16 | "max_relative_positions": -1, 17 | "model_type": "deberta-v2", 18 | "norm_rel_ebd": "layer_norm", 19 | "num_attention_heads": 16, 20 | "num_hidden_layers": 24, 21 | "pad_token_id": 0, 22 | "pooler_dropout": 0, 23 | "pooler_hidden_act": "gelu", 24 | "pooler_hidden_size": 1024, 25 | "pos_att_type": [ 26 | "p2c", 27 | "c2p" 28 | ], 29 | "position_biased_input": false, 30 | "position_buckets": 256, 31 | "relative_attention": true, 32 | "share_att_key": true, 33 | "torch_dtype": "float16", 34 | "transformers_version": "4.25.1", 35 | "type_vocab_size": 0, 36 | "vocab_size": 22012 37 | } 38 | -------------------------------------------------------------------------------- /bert/deberta-v2-large-japanese/config.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "configs/deberta_v2_large.json", 3 | "architectures": [ 4 | "DebertaV2ForMaskedLM" 5 | ], 6 | "attention_head_size": 64, 7 | "attention_probs_dropout_prob": 0.1, 8 | "conv_act": "gelu", 9 | "conv_kernel_size": 3, 10 | "hidden_act": "gelu", 11 | "hidden_dropout_prob": 0.1, 12 | "hidden_size": 1024, 13 | "initializer_range": 0.02, 14 | "intermediate_size": 4096, 15 | "layer_norm_eps": 1e-07, 16 | "max_position_embeddings": 512, 17 | "max_relative_positions": -1, 18 | "model_type": "deberta-v2", 19 | "norm_rel_ebd": "layer_norm", 20 | "num_attention_heads": 16, 21 | "num_hidden_layers": 24, 22 | "pad_token_id": 0, 23 | "pooler_dropout": 0, 24 | "pooler_hidden_act": "gelu", 25 | "pooler_hidden_size": 1024, 26 | "pos_att_type": [ 27 | "p2c", 28 | "c2p" 29 | ], 30 | "position_biased_input": false, 31 | "position_buckets": 256, 32 | "relative_attention": true, 33 | "share_att_key": true, 34 | "torch_dtype": "float32", 35 | "transformers_version": "4.23.1", 36 | "type_vocab_size": 0, 37 | "vocab_size": 32000 38 | } 39 | -------------------------------------------------------------------------------- /oldVersion/V110/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | 4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 5 | 6 | 7 | def cleaned_text_to_sequence(cleaned_text, tones, language): 8 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 9 | Args: 10 | text: string to convert to a sequence 11 | Returns: 12 | List of integers corresponding to the symbols in the text 13 | """ 14 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 15 | tone_start = language_tone_start_map[language] 16 | tones = [i + tone_start for i in tones] 17 | lang_id = language_id_map[language] 18 | lang_ids = [lang_id for i in phones] 19 | return phones, tones, lang_ids 20 | 21 | 22 | def get_bert(norm_text, word2ph, language, device): 23 | from .chinese_bert import get_bert_feature as zh_bert 24 | from .english_bert_mock import get_bert_feature as en_bert 25 | from .japanese_bert import get_bert_feature as jp_bert 26 | 27 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 28 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 29 | return bert 30 | -------------------------------------------------------------------------------- /bert/deberta-v3-large/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bin.* filter=lfs diff=lfs merge=lfs -text 5 | *.bz2 filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.model filter=lfs diff=lfs merge=lfs -text 12 | *.msgpack filter=lfs diff=lfs merge=lfs -text 13 | *.onnx filter=lfs diff=lfs merge=lfs -text 14 | *.ot filter=lfs diff=lfs merge=lfs -text 15 | *.parquet filter=lfs diff=lfs merge=lfs -text 16 | *.pb filter=lfs diff=lfs merge=lfs -text 17 | *.pt filter=lfs diff=lfs merge=lfs -text 18 | *.pth filter=lfs diff=lfs merge=lfs -text 19 | *.rar filter=lfs diff=lfs merge=lfs -text 20 | saved_model/**/* 
filter=lfs diff=lfs merge=lfs -text 21 | *.tar.* filter=lfs diff=lfs merge=lfs -text 22 | *.tflite filter=lfs diff=lfs merge=lfs -text 23 | *.tgz filter=lfs diff=lfs merge=lfs -text 24 | *.xz filter=lfs diff=lfs merge=lfs -text 25 | *.zip filter=lfs diff=lfs merge=lfs -text 26 | *.zstandard filter=lfs diff=lfs merge=lfs -text 27 | *tfevents* filter=lfs diff=lfs merge=lfs -text 28 | -------------------------------------------------------------------------------- /oldVersion/V110/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | 5 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3") 6 | 7 | 8 | def get_bert_feature(text, word2ph, device=None): 9 | if ( 10 | sys.platform == "darwin" 11 | and torch.backends.mps.is_available() 12 | and device == "cpu" 13 | ): 14 | device = "mps" 15 | if not device: 16 | device = "cuda" 17 | model = AutoModelForMaskedLM.from_pretrained("./bert/bert-base-japanese-v3").to( 18 | device 19 | ) 20 | with torch.no_grad(): 21 | inputs = tokenizer(text, return_tensors="pt") 22 | for i in inputs: 23 | inputs[i] = inputs[i].to(device) 24 | res = model(**inputs, output_hidden_states=True) 25 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 26 | assert inputs["input_ids"].shape[-1] == len(word2ph) 27 | word2phone = word2ph 28 | phone_level_feature = [] 29 | for i in range(len(word2phone)): 30 | repeat_feature = res[i].repeat(word2phone[i], 1) 31 | phone_level_feature.append(repeat_feature) 32 | 33 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 34 | 35 | return phone_level_feature.T 36 | -------------------------------------------------------------------------------- /slm/wavlm-base-plus/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bin.* filter=lfs diff=lfs merge=lfs -text 5 | *.bz2 filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.model filter=lfs diff=lfs merge=lfs -text 12 | *.msgpack filter=lfs diff=lfs merge=lfs -text 13 | *.onnx filter=lfs diff=lfs merge=lfs -text 14 | *.ot filter=lfs diff=lfs merge=lfs -text 15 | *.parquet filter=lfs diff=lfs merge=lfs -text 16 | *.pb filter=lfs diff=lfs merge=lfs -text 17 | *.pt filter=lfs diff=lfs merge=lfs -text 18 | *.pth filter=lfs diff=lfs merge=lfs -text 19 | *.rar filter=lfs diff=lfs merge=lfs -text 20 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 21 | *.tar.* filter=lfs diff=lfs merge=lfs -text 22 | *.tflite filter=lfs diff=lfs merge=lfs -text 23 | *.tgz filter=lfs diff=lfs merge=lfs -text 24 | *.xz filter=lfs diff=lfs merge=lfs -text 25 | *.zip filter=lfs diff=lfs merge=lfs -text 26 | *.zstandard filter=lfs diff=lfs merge=lfs -text 27 | *tfevents* filter=lfs diff=lfs merge=lfs -text 28 | -------------------------------------------------------------------------------- /emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs 
merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bin.* filter=lfs diff=lfs merge=lfs -text 5 | *.bz2 filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.model filter=lfs diff=lfs merge=lfs -text 12 | *.msgpack filter=lfs diff=lfs merge=lfs -text 13 | *.onnx filter=lfs diff=lfs merge=lfs -text 14 | *.ot filter=lfs diff=lfs merge=lfs -text 15 | *.parquet filter=lfs diff=lfs merge=lfs -text 16 | *.pb filter=lfs diff=lfs merge=lfs -text 17 | *.pt filter=lfs diff=lfs merge=lfs -text 18 | *.pth filter=lfs diff=lfs merge=lfs -text 19 | *.rar filter=lfs diff=lfs merge=lfs -text 20 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 21 | *.tar.* filter=lfs diff=lfs merge=lfs -text 22 | *.tflite filter=lfs diff=lfs merge=lfs -text 23 | *.tgz filter=lfs diff=lfs merge=lfs -text 24 | *.wasm filter=lfs diff=lfs merge=lfs -text 25 | *.xz filter=lfs diff=lfs merge=lfs -text 26 | *.zip filter=lfs diff=lfs merge=lfs -text 27 | *.zstandard filter=lfs diff=lfs merge=lfs -text 28 | *tfevents* filter=lfs diff=lfs merge=lfs -text 29 | -------------------------------------------------------------------------------- /oldVersion/V111/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | 5 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3") 6 | 7 | models = dict() 8 | 9 | 10 | def get_bert_feature(text, word2ph, device=None): 11 | if ( 12 | sys.platform == "darwin" 13 | and torch.backends.mps.is_available() 14 | and device == "cpu" 15 | ): 16 | device = "mps" 17 | if not device: 18 | device = "cuda" 19 | if device not in models.keys(): 20 | models[device] = AutoModelForMaskedLM.from_pretrained( 21 | "./bert/bert-base-japanese-v3" 22 | ).to(device) 23 | with torch.no_grad(): 24 | inputs = tokenizer(text, return_tensors="pt") 25 | for i in inputs: 26 | inputs[i] = inputs[i].to(device) 27 | res = models[device](**inputs, output_hidden_states=True) 28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 29 | assert inputs["input_ids"].shape[-1] == len(word2ph) 30 | word2phone = word2ph 31 | phone_level_feature = [] 32 | for i in range(len(word2phone)): 33 | repeat_feature = res[i].repeat(word2phone[i], 1) 34 | phone_level_feature.append(repeat_feature) 35 | 36 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 37 | 38 | return phone_level_feature.T 39 | -------------------------------------------------------------------------------- /oldVersion/V111/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . 
import chinese, japanese, cleaned_text_to_sequence 2 | from .fix import japanese as japanese_fix 3 | 4 | 5 | language_module_map = {"ZH": chinese, "JP": japanese} 6 | language_module_map_fix = {"ZH": chinese, "JP": japanese_fix} 7 | 8 | 9 | def clean_text(text, language): 10 | language_module = language_module_map[language] 11 | norm_text = language_module.text_normalize(text) 12 | phones, tones, word2ph = language_module.g2p(norm_text) 13 | return norm_text, phones, tones, word2ph 14 | 15 | 16 | def clean_text_fix(text, language): 17 | """使用dev分支修复""" 18 | language_module = language_module_map_fix[language] 19 | norm_text = language_module.text_normalize(text) 20 | phones, tones, word2ph = language_module.g2p(norm_text) 21 | return norm_text, phones, tones, word2ph 22 | 23 | 24 | def clean_text_bert(text, language): 25 | language_module = language_module_map[language] 26 | norm_text = language_module.text_normalize(text) 27 | phones, tones, word2ph = language_module.g2p(norm_text) 28 | bert = language_module.get_bert_feature(norm_text, word2ph) 29 | return phones, tones, bert 30 | 31 | 32 | def text_to_sequence(text, language): 33 | norm_text, phones, tones, word2ph = clean_text(text, language) 34 | return cleaned_text_to_sequence(phones, tones, language) 35 | 36 | 37 | if __name__ == "__main__": 38 | pass 39 | -------------------------------------------------------------------------------- /monotonic_align/core.py: -------------------------------------------------------------------------------- 1 | import numba 2 | 3 | 4 | @numba.jit( 5 | numba.void( 6 | numba.int32[:, :, ::1], 7 | numba.float32[:, :, ::1], 8 | numba.int32[::1], 9 | numba.int32[::1], 10 | ), 11 | nopython=True, 12 | nogil=True, 13 | ) 14 | def maximum_path_jit(paths, values, t_ys, t_xs): 15 | b = paths.shape[0] 16 | max_neg_val = -1e9 17 | for i in range(int(b)): 18 | path = paths[i] 19 | value = values[i] 20 | t_y = t_ys[i] 21 | t_x = t_xs[i] 22 | 23 | v_prev = v_cur = 0.0 24 | index = t_x - 1 25 | 26 | for y in range(t_y): 27 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 28 | if x == y: 29 | v_cur = max_neg_val 30 | else: 31 | v_cur = value[y - 1, x] 32 | if x == 0: 33 | if y == 0: 34 | v_prev = 0.0 35 | else: 36 | v_prev = max_neg_val 37 | else: 38 | v_prev = value[y - 1, x - 1] 39 | value[y, x] += max(v_prev, v_cur) 40 | 41 | for y in range(t_y - 1, -1, -1): 42 | path[y, index] = 1 43 | if index != 0 and ( 44 | index == y or value[y - 1, index] < value[y - 1, index - 1] 45 | ): 46 | index = index - 1 47 | -------------------------------------------------------------------------------- /oldVersion/V200/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import DebertaV2Model, DebertaV2Tokenizer 5 | 6 | from config import config 7 | 8 | 9 | LOCAL_PATH = "./bert/deberta-v3-large" 10 | 11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 17 | if ( 18 | sys.platform == "darwin" 19 | and torch.backends.mps.is_available() 20 | and device == "cpu" 21 | ): 22 | device = "mps" 23 | if not device: 24 | device = "cuda" 25 | if device not in models.keys(): 26 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device) 27 | with torch.no_grad(): 28 | inputs = tokenizer(text, return_tensors="pt") 29 | for i in inputs: 30 | inputs[i] = inputs[i].to(device) 31 | res = 
models[device](**inputs, output_hidden_states=True) 32 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 33 | # assert len(word2ph) == len(text)+2 34 | word2phone = word2ph 35 | phone_level_feature = [] 36 | for i in range(len(word2phone)): 37 | repeat_feature = res[i].repeat(word2phone[i], 1) 38 | phone_level_feature.append(repeat_feature) 39 | 40 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 41 | 42 | return phone_level_feature.T 43 | -------------------------------------------------------------------------------- /onnx_modules/V200/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import DebertaV2Model, DebertaV2Tokenizer 5 | 6 | from config import config 7 | 8 | 9 | LOCAL_PATH = "./bert/deberta-v3-large" 10 | 11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 17 | if ( 18 | sys.platform == "darwin" 19 | and torch.backends.mps.is_available() 20 | and device == "cpu" 21 | ): 22 | device = "mps" 23 | if not device: 24 | device = "cuda" 25 | if device not in models.keys(): 26 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device) 27 | with torch.no_grad(): 28 | inputs = tokenizer(text, return_tensors="pt") 29 | for i in inputs: 30 | inputs[i] = inputs[i].to(device) 31 | res = models[device](**inputs, output_hidden_states=True) 32 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 33 | # assert len(word2ph) == len(text)+2 34 | word2phone = word2ph 35 | phone_level_feature = [] 36 | for i in range(len(word2phone)): 37 | repeat_feature = res[i].repeat(word2phone[i], 1) 38 | phone_level_feature.append(repeat_feature) 39 | 40 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 41 | 42 | return phone_level_feature.T 43 | -------------------------------------------------------------------------------- /oldVersion/V111/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 4 | 5 | 6 | def cleaned_text_to_sequence(cleaned_text, tones, language): 7 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
8 | Args: 9 | text: string to convert to a sequence 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | """ 13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 14 | tone_start = language_tone_start_map[language] 15 | tones = [i + tone_start for i in tones] 16 | lang_id = language_id_map[language] 17 | lang_ids = [lang_id for i in phones] 18 | return phones, tones, lang_ids 19 | 20 | 21 | def get_bert(norm_text, word2ph, language, device): 22 | from .chinese_bert import get_bert_feature as zh_bert 23 | from .english_bert_mock import get_bert_feature as en_bert 24 | from .japanese_bert import get_bert_feature as jp_bert 25 | 26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 27 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 28 | return bert 29 | 30 | 31 | def get_bert_fix(norm_text, word2ph, language, device): 32 | from .chinese_bert import get_bert_feature as zh_bert 33 | from .english_bert_mock import get_bert_feature as en_bert 34 | from .fix.japanese_bert import get_bert_feature as jp_bert 35 | 36 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 37 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 38 | return bert 39 | -------------------------------------------------------------------------------- /clap_wrapper.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import ClapModel, ClapProcessor 5 | 6 | from config import config 7 | 8 | models = dict() 9 | processor = ClapProcessor.from_pretrained("./emotional/clap-htsat-fused") 10 | 11 | 12 | def get_clap_audio_feature(audio_data, device=config.bert_gen_config.device): 13 | if ( 14 | sys.platform == "darwin" 15 | and torch.backends.mps.is_available() 16 | and device == "cpu" 17 | ): 18 | device = "mps" 19 | if not device: 20 | device = "cuda" 21 | if device not in models.keys(): 22 | models[device] = ClapModel.from_pretrained("./emotional/clap-htsat-fused").to( 23 | device 24 | ) 25 | with torch.no_grad(): 26 | inputs = processor( 27 | audios=audio_data, return_tensors="pt", sampling_rate=48000 28 | ).to(device) 29 | emb = models[device].get_audio_features(**inputs) 30 | return emb.T 31 | 32 | 33 | def get_clap_text_feature(text, device=config.bert_gen_config.device): 34 | if ( 35 | sys.platform == "darwin" 36 | and torch.backends.mps.is_available() 37 | and device == "cpu" 38 | ): 39 | device = "mps" 40 | if not device: 41 | device = "cuda" 42 | if device not in models.keys(): 43 | models[device] = ClapModel.from_pretrained("./emotional/clap-htsat-fused").to( 44 | device 45 | ) 46 | with torch.no_grad(): 47 | inputs = processor(text=text, return_tensors="pt").to(device) 48 | emb = models[device].get_text_features(**inputs) 49 | return emb.T 50 | -------------------------------------------------------------------------------- /bert/bert-base-japanese-v3/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs 
diff=lfs merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | *.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.pth filter=lfs diff=lfs merge=lfs -text 24 | *.rar filter=lfs diff=lfs merge=lfs -text 25 | *.safetensors filter=lfs diff=lfs merge=lfs -text 26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 27 | *.tar.* filter=lfs diff=lfs merge=lfs -text 28 | *.tflite filter=lfs diff=lfs merge=lfs -text 29 | *.tgz filter=lfs diff=lfs merge=lfs -text 30 | *.wasm filter=lfs diff=lfs merge=lfs -text 31 | *.xz filter=lfs diff=lfs merge=lfs -text 32 | *.zip filter=lfs diff=lfs merge=lfs -text 33 | *.zst filter=lfs diff=lfs merge=lfs -text 34 | *tfevents* filter=lfs diff=lfs merge=lfs -text 35 | -------------------------------------------------------------------------------- /bert/bert-large-japanese-v2/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | *.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.pth filter=lfs diff=lfs merge=lfs -text 24 | *.rar filter=lfs diff=lfs merge=lfs -text 25 | *.safetensors filter=lfs diff=lfs merge=lfs -text 26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 27 | *.tar.* filter=lfs diff=lfs merge=lfs -text 28 | *.tflite filter=lfs diff=lfs merge=lfs -text 29 | *.tgz filter=lfs diff=lfs merge=lfs -text 30 | *.wasm filter=lfs diff=lfs merge=lfs -text 31 | *.xz filter=lfs diff=lfs merge=lfs -text 32 | *.zip filter=lfs diff=lfs merge=lfs -text 33 | *.zst filter=lfs diff=lfs merge=lfs -text 34 | *tfevents* filter=lfs diff=lfs merge=lfs -text 35 | -------------------------------------------------------------------------------- /bert/deberta-v2-large-japanese/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs 
diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | *.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.pth filter=lfs diff=lfs merge=lfs -text 24 | *.rar filter=lfs diff=lfs merge=lfs -text 25 | *.safetensors filter=lfs diff=lfs merge=lfs -text 26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 27 | *.tar.* filter=lfs diff=lfs merge=lfs -text 28 | *.tflite filter=lfs diff=lfs merge=lfs -text 29 | *.tgz filter=lfs diff=lfs merge=lfs -text 30 | *.wasm filter=lfs diff=lfs merge=lfs -text 31 | *.xz filter=lfs diff=lfs merge=lfs -text 32 | *.zip filter=lfs diff=lfs merge=lfs -text 33 | *.zst filter=lfs diff=lfs merge=lfs -text 34 | *tfevents* filter=lfs diff=lfs merge=lfs -text 35 | -------------------------------------------------------------------------------- /emotional/clap-htsat-fused/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | *.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.pth filter=lfs diff=lfs merge=lfs -text 24 | *.rar filter=lfs diff=lfs merge=lfs -text 25 | *.safetensors filter=lfs diff=lfs merge=lfs -text 26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 27 | *.tar.* filter=lfs diff=lfs merge=lfs -text 28 | *.tflite filter=lfs diff=lfs merge=lfs -text 29 | *.tgz filter=lfs diff=lfs merge=lfs -text 30 | *.wasm filter=lfs diff=lfs merge=lfs -text 31 | *.xz filter=lfs diff=lfs merge=lfs -text 32 | *.zip filter=lfs diff=lfs merge=lfs -text 33 | *.zst filter=lfs diff=lfs merge=lfs -text 34 | *tfevents* filter=lfs diff=lfs merge=lfs -text 35 | -------------------------------------------------------------------------------- /bert/deberta-v2-large-japanese-char-wwm/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 
| *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | *.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.pth filter=lfs diff=lfs merge=lfs -text 24 | *.rar filter=lfs diff=lfs merge=lfs -text 25 | *.safetensors filter=lfs diff=lfs merge=lfs -text 26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 27 | *.tar.* filter=lfs diff=lfs merge=lfs -text 28 | *.tflite filter=lfs diff=lfs merge=lfs -text 29 | *.tgz filter=lfs diff=lfs merge=lfs -text 30 | *.wasm filter=lfs diff=lfs merge=lfs -text 31 | *.xz filter=lfs diff=lfs merge=lfs -text 32 | *.zip filter=lfs diff=lfs merge=lfs -text 33 | *.zst filter=lfs diff=lfs merge=lfs -text 34 | *tfevents* filter=lfs diff=lfs merge=lfs -text 35 | -------------------------------------------------------------------------------- /onnx_infer.py: -------------------------------------------------------------------------------- 1 | from onnx_modules.V220_OnnxInference import OnnxInferenceSession 2 | import numpy as np 3 | Session = OnnxInferenceSession( 4 | { 5 | "enc" : "onnx/BertVits2.2PT/BertVits2.2PT_enc_p.onnx", 6 | "emb_g" : "onnx/BertVits2.2PT/BertVits2.2PT_emb.onnx", 7 | "dp" : "onnx/BertVits2.2PT/BertVits2.2PT_dp.onnx", 8 | "sdp" : "onnx/BertVits2.2PT/BertVits2.2PT_sdp.onnx", 9 | "flow" : "onnx/BertVits2.2PT/BertVits2.2PT_flow.onnx", 10 | "dec" : "onnx/BertVits2.2PT/BertVits2.2PT_dec.onnx" 11 | }, 12 | Providers = ["CPUExecutionProvider"] 13 | ) 14 | 15 | #这里的输入和原版是一样的,只需要在原版预处理结果出来之后加上.numpy()即可 16 | x = np.array( 17 | [ 18 | 0, 19 | 97, 20 | 0, 21 | 8, 22 | 0, 23 | 78, 24 | 0, 25 | 8, 26 | 0, 27 | 76, 28 | 0, 29 | 37, 30 | 0, 31 | 40, 32 | 0, 33 | 97, 34 | 0, 35 | 8, 36 | 0, 37 | 23, 38 | 0, 39 | 8, 40 | 0, 41 | 74, 42 | 0, 43 | 26, 44 | 0, 45 | 104, 46 | 0, 47 | ] 48 | ) 49 | tone = np.zeros_like(x) 50 | language = np.zeros_like(x) 51 | sid = np.array([0]) 52 | bert = np.random.randn(x.shape[0], 1024) 53 | ja_bert = np.random.randn(x.shape[0], 1024) 54 | en_bert = np.random.randn(x.shape[0], 1024) 55 | emo = np.random.randn(512, 1) 56 | 57 | audio = Session( 58 | x, 59 | tone, 60 | language, 61 | bert, 62 | ja_bert, 63 | en_bert, 64 | emo, 65 | sid 66 | ) 67 | 68 | print(audio) 69 | -------------------------------------------------------------------------------- /oldVersion/V200/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 4 | 5 | 6 | def cleaned_text_to_sequence(cleaned_text, tones, language): 7 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
8 | Args: 9 | text: string to convert to a sequence 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | """ 13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 14 | tone_start = language_tone_start_map[language] 15 | tones = [i + tone_start for i in tones] 16 | lang_id = language_id_map[language] 17 | lang_ids = [lang_id for i in phones] 18 | return phones, tones, lang_ids 19 | 20 | 21 | def get_bert(norm_text, word2ph, language, device): 22 | from .chinese_bert import get_bert_feature as zh_bert 23 | from .english_bert_mock import get_bert_feature as en_bert 24 | from .japanese_bert import get_bert_feature as jp_bert 25 | 26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 27 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 28 | return bert 29 | 30 | 31 | def check_bert_models(): 32 | import json 33 | from pathlib import Path 34 | 35 | from config import config 36 | from .bert_utils import _check_bert 37 | 38 | if config.mirror.lower() == "openi": 39 | import openi 40 | 41 | kwargs = {"token": config.openi_token} if config.openi_token else {} 42 | openi.login(**kwargs) 43 | 44 | with open("./bert/bert_models.json", "r") as fp: 45 | models = json.load(fp) 46 | for k, v in models.items(): 47 | local_path = Path("./bert").joinpath(k) 48 | _check_bert(v["repo_id"], v["files"], local_path) 49 | -------------------------------------------------------------------------------- /oldVersion/V210/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 4 | 5 | 6 | def cleaned_text_to_sequence(cleaned_text, tones, language): 7 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
8 | Args: 9 | text: string to convert to a sequence 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | """ 13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 14 | tone_start = language_tone_start_map[language] 15 | tones = [i + tone_start for i in tones] 16 | lang_id = language_id_map[language] 17 | lang_ids = [lang_id for i in phones] 18 | return phones, tones, lang_ids 19 | 20 | 21 | def get_bert(norm_text, word2ph, language, device, style_text, style_weight): 22 | from .chinese_bert import get_bert_feature as zh_bert 23 | from .english_bert_mock import get_bert_feature as en_bert 24 | from .japanese_bert import get_bert_feature as jp_bert 25 | 26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 27 | bert = lang_bert_func_map[language]( 28 | norm_text, word2ph, device, style_text, style_weight 29 | ) 30 | return bert 31 | 32 | 33 | def check_bert_models(): 34 | import json 35 | from pathlib import Path 36 | 37 | from config import config 38 | from .bert_utils import _check_bert 39 | 40 | if config.mirror.lower() == "openi": 41 | import openi 42 | 43 | kwargs = {"token": config.openi_token} if config.openi_token else {} 44 | openi.login(**kwargs) 45 | 46 | with open("./bert/bert_models.json", "r") as fp: 47 | models = json.load(fp) 48 | for k, v in models.items(): 49 | local_path = Path("./bert").joinpath(k) 50 | _check_bert(v["repo_id"], v["files"], local_path) 51 | 52 | 53 | check_bert_models() 54 | -------------------------------------------------------------------------------- /onnx_modules/__init__.py: -------------------------------------------------------------------------------- 1 | from utils import get_hparams_from_file, load_checkpoint 2 | import json 3 | 4 | 5 | def export_onnx(export_path, model_path, config_path, novq, dev): 6 | hps = get_hparams_from_file(config_path) 7 | version = hps.version[0:3] 8 | if version == "2.0" or (version == "2.1" and novq): 9 | from .V200 import SynthesizerTrn, symbols 10 | elif version == "2.1" and (not novq): 11 | from .V210 import SynthesizerTrn, symbols 12 | elif version == "2.2": 13 | if novq and dev: 14 | from .V220_novq_dev import SynthesizerTrn, symbols 15 | else: 16 | from .V220 import SynthesizerTrn, symbols 17 | elif version == "2.3": 18 | from .V230 import SynthesizerTrn, symbols 19 | net_g = SynthesizerTrn( 20 | len(symbols), 21 | hps.data.filter_length // 2 + 1, 22 | hps.train.segment_size // hps.data.hop_length, 23 | n_speakers=hps.data.n_speakers, 24 | **hps.model, 25 | ) 26 | _ = net_g.eval() 27 | _ = load_checkpoint(model_path, net_g, None, skip_optimizer=True) 28 | net_g.cpu() 29 | net_g.export_onnx(export_path) 30 | 31 | spklist = [] 32 | for key in hps.data.spk2id.keys(): 33 | spklist.append(key) 34 | 35 | MoeVSConf = { 36 | "Folder": f"{export_path}", 37 | "Name": f"{export_path}", 38 | "Type": "BertVits", 39 | "Symbol": symbols, 40 | "Cleaner": "", 41 | "Rate": hps.data.sampling_rate, 42 | "CharaMix": True, 43 | "Characters": spklist, 44 | "LanguageMap": {"ZH": [0, 0], "JP": [1, 6], "EN": [2, 8]}, 45 | "Dict": "BasicDict", 46 | "BertPath": [ 47 | "chinese-roberta-wwm-ext-large", 48 | "deberta-v2-large-japanese", 49 | "bert-base-japanese-v3", 50 | ], 51 | "Clap": "clap-htsat-fused", 52 | } 53 | 54 | with open(f"onnx/{export_path}.json", "w") as MoeVsConfFile: 55 | json.dump(MoeVSConf, MoeVsConfFile, indent=4) 56 | -------------------------------------------------------------------------------- /resample_legacy.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import librosa 4 | from multiprocessing import Pool, cpu_count 5 | 6 | import soundfile 7 | from tqdm import tqdm 8 | 9 | from config import config 10 | 11 | 12 | def process(item): 13 | wav_name, args = item 14 | wav_path = os.path.join(args.in_dir, wav_name) 15 | if os.path.exists(wav_path) and wav_path.lower().endswith(".wav"): 16 | wav, sr = librosa.load(wav_path, sr=args.sr) 17 | soundfile.write(os.path.join(args.out_dir, wav_name), wav, sr) 18 | 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument( 23 | "--sr", 24 | type=int, 25 | default=config.resample_config.sampling_rate, 26 | help="sampling rate", 27 | ) 28 | parser.add_argument( 29 | "--in_dir", 30 | type=str, 31 | default=config.resample_config.in_dir, 32 | help="path to source dir", 33 | ) 34 | parser.add_argument( 35 | "--out_dir", 36 | type=str, 37 | default=config.resample_config.out_dir, 38 | help="path to target dir", 39 | ) 40 | parser.add_argument( 41 | "--processes", 42 | type=int, 43 | default=0, 44 | help="cpu_processes", 45 | ) 46 | args, _ = parser.parse_known_args() 47 | # autodl 无卡模式会识别出46个cpu 48 | if args.processes == 0: 49 | processes = cpu_count() - 2 if cpu_count() > 4 else 1 50 | else: 51 | processes = args.processes 52 | pool = Pool(processes=processes) 53 | 54 | tasks = [] 55 | 56 | for dirpath, _, filenames in os.walk(args.in_dir): 57 | if not os.path.isdir(args.out_dir): 58 | os.makedirs(args.out_dir, exist_ok=True) 59 | for filename in filenames: 60 | if filename.lower().endswith(".wav"): 61 | tasks.append((filename, args)) 62 | 63 | for _ in tqdm( 64 | pool.imap_unordered(process, tasks), 65 | ): 66 | pass 67 | 68 | pool.close() 69 | pool.join() 70 | 71 | print("音频重采样完毕!") 72 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | from text.symbols import * 2 | 3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 4 | 5 | 6 | def cleaned_text_to_sequence(cleaned_text, tones, language): 7 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
8 | Args: 9 | text: string to convert to a sequence 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | """ 13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 14 | tone_start = language_tone_start_map[language] 15 | tones = [i + tone_start for i in tones] 16 | lang_id = language_id_map[language] 17 | lang_ids = [lang_id for i in phones] 18 | return phones, tones, lang_ids 19 | 20 | 21 | def get_bert(norm_text, word2ph, language, device, style_text=None, style_weight=0.7): 22 | from .chinese_bert import get_bert_feature as zh_bert 23 | from .english_bert_mock import get_bert_feature as en_bert 24 | from .japanese_bert import get_bert_feature as jp_bert 25 | 26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 27 | bert = lang_bert_func_map[language]( 28 | norm_text, word2ph, device, style_text, style_weight 29 | ) 30 | return bert 31 | 32 | 33 | def check_bert_models(): 34 | import json 35 | from pathlib import Path 36 | 37 | from config import config 38 | from .bert_utils import _check_bert 39 | 40 | if config.mirror.lower() == "openi": 41 | import openi 42 | 43 | kwargs = {"token": config.openi_token} if config.openi_token else {} 44 | openi.login(**kwargs) 45 | 46 | with open("./bert/bert_models.json", "r") as fp: 47 | models = json.load(fp) 48 | for k, v in models.items(): 49 | local_path = Path("./bert").joinpath(k) 50 | _check_bert(v["repo_id"], v["files"], local_path) 51 | 52 | 53 | def init_openjtalk(): 54 | import platform 55 | 56 | if platform.platform() == "Linux": 57 | import pyopenjtalk 58 | 59 | pyopenjtalk.g2p("こんにちは,世界。") 60 | 61 | 62 | init_openjtalk() 63 | check_bert_models() 64 | -------------------------------------------------------------------------------- /tools/translate.py: -------------------------------------------------------------------------------- 1 | """ 2 | 翻译api 3 | """ 4 | from config import config 5 | 6 | import random 7 | import hashlib 8 | import requests 9 | 10 | 11 | def translate(Sentence: str, to_Language: str = "jp", from_Language: str = ""): 12 | """ 13 | :param Sentence: 待翻译语句 14 | :param from_Language: 待翻译语句语言 15 | :param to_Language: 目标语言 16 | :return: 翻译后语句 出错时返回None 17 | 18 | 常见语言代码:中文 zh 英语 en 日语 jp 19 | """ 20 | appid = config.translate_config.app_key 21 | key = config.translate_config.secret_key 22 | if appid == "" or key == "": 23 | return "请开发者在config.yml中配置app_key与secret_key" 24 | url = "https://fanyi-api.baidu.com/api/trans/vip/translate" 25 | texts = Sentence.splitlines() 26 | outTexts = [] 27 | for t in texts: 28 | if t != "": 29 | # 签名计算 参考文档 https://api.fanyi.baidu.com/product/113 30 | salt = str(random.randint(1, 100000)) 31 | signString = appid + t + salt + key 32 | hs = hashlib.md5() 33 | hs.update(signString.encode("utf-8")) 34 | signString = hs.hexdigest() 35 | if from_Language == "": 36 | from_Language = "auto" 37 | headers = {"Content-Type": "application/x-www-form-urlencoded"} 38 | payload = { 39 | "q": t, 40 | "from": from_Language, 41 | "to": to_Language, 42 | "appid": appid, 43 | "salt": salt, 44 | "sign": signString, 45 | } 46 | # 发送请求 47 | try: 48 | response = requests.post( 49 | url=url, data=payload, headers=headers, timeout=3 50 | ) 51 | response = response.json() 52 | if "trans_result" in response.keys(): 53 | result = response["trans_result"][0] 54 | if "dst" in result.keys(): 55 | dst = result["dst"] 56 | outTexts.append(dst) 57 | except Exception: 58 | return Sentence 59 | else: 60 | outTexts.append(t) 61 | return "\n".join(outTexts) 62 | 
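# A minimal usage sketch for the translate() helper above, assuming app_key and
# secret_key are already configured in config.yml; the sentence and language
# codes below are example values, not taken from the repository.
#
#     from tools.translate import translate
#     print(translate("你好,世界。", to_Language="en", from_Language="zh"))
#
# If the credentials are empty, translate() returns a reminder string; if the
# Baidu API request fails, it falls back to returning the original Sentence.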
-------------------------------------------------------------------------------- /text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import DebertaV2Model, DebertaV2Tokenizer 5 | 6 | from config import config 7 | 8 | 9 | LOCAL_PATH = "./bert/deberta-v3-large" 10 | 11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature( 17 | text, 18 | word2ph, 19 | device=config.bert_gen_config.device, 20 | style_text=None, 21 | style_weight=0.7, 22 | ): 23 | if ( 24 | sys.platform == "darwin" 25 | and torch.backends.mps.is_available() 26 | and device == "cpu" 27 | ): 28 | device = "mps" 29 | if not device: 30 | device = "cuda" 31 | if device not in models.keys(): 32 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device) 33 | with torch.no_grad(): 34 | inputs = tokenizer(text, return_tensors="pt") 35 | for i in inputs: 36 | inputs[i] = inputs[i].to(device) 37 | res = models[device](**inputs, output_hidden_states=True) 38 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 39 | if style_text: 40 | style_inputs = tokenizer(style_text, return_tensors="pt") 41 | for i in style_inputs: 42 | style_inputs[i] = style_inputs[i].to(device) 43 | style_res = models[device](**style_inputs, output_hidden_states=True) 44 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() 45 | style_res_mean = style_res.mean(0) 46 | assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph)) 47 | word2phone = word2ph 48 | phone_level_feature = [] 49 | for i in range(len(word2phone)): 50 | if style_text: 51 | repeat_feature = ( 52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight) 53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight 54 | ) 55 | else: 56 | repeat_feature = res[i].repeat(word2phone[i], 1) 57 | phone_level_feature.append(repeat_feature) 58 | 59 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 60 | 61 | return phone_level_feature.T 62 | -------------------------------------------------------------------------------- /oldVersion/V210/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import DebertaV2Model, DebertaV2Tokenizer 5 | 6 | from config import config 7 | 8 | 9 | LOCAL_PATH = "./bert/deberta-v3-large" 10 | 11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature( 17 | text, 18 | word2ph, 19 | device=config.bert_gen_config.device, 20 | style_text=None, 21 | style_weight=0.7, 22 | ): 23 | if ( 24 | sys.platform == "darwin" 25 | and torch.backends.mps.is_available() 26 | and device == "cpu" 27 | ): 28 | device = "mps" 29 | if not device: 30 | device = "cuda" 31 | if device not in models.keys(): 32 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device) 33 | with torch.no_grad(): 34 | inputs = tokenizer(text, return_tensors="pt") 35 | for i in inputs: 36 | inputs[i] = inputs[i].to(device) 37 | res = models[device](**inputs, output_hidden_states=True) 38 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 39 | if style_text: 40 | style_inputs = tokenizer(style_text, return_tensors="pt") 41 | for i in style_inputs: 42 | style_inputs[i] = style_inputs[i].to(device) 43 | style_res = models[device](**style_inputs, output_hidden_states=True) 44 | style_res = 
torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() 45 | style_res_mean = style_res.mean(0) 46 | assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph)) 47 | word2phone = word2ph 48 | phone_level_feature = [] 49 | for i in range(len(word2phone)): 50 | if style_text: 51 | repeat_feature = ( 52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight) 53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight 54 | ) 55 | else: 56 | repeat_feature = res[i].repeat(word2phone[i], 1) 57 | phone_level_feature.append(repeat_feature) 58 | 59 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 60 | 61 | return phone_level_feature.T 62 | -------------------------------------------------------------------------------- /oldVersion/V200/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | from .japanese import text2sep_kata 8 | 9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese" 10 | 11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 17 | sep_text, _, _ = text2sep_kata(text) 18 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text] 19 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens] 20 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3] 21 | return get_bert_feature_with_token(sep_ids, word2ph, device) 22 | 23 | 24 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device): 25 | if ( 26 | sys.platform == "darwin" 27 | and torch.backends.mps.is_available() 28 | and device == "cpu" 29 | ): 30 | device = "mps" 31 | if not device: 32 | device = "cuda" 33 | if device not in models.keys(): 34 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 35 | with torch.no_grad(): 36 | inputs = torch.tensor(tokens).to(device).unsqueeze(0) 37 | token_type_ids = torch.zeros_like(inputs).to(device) 38 | attention_mask = torch.ones_like(inputs).to(device) 39 | inputs = { 40 | "input_ids": inputs, 41 | "token_type_ids": token_type_ids, 42 | "attention_mask": attention_mask, 43 | } 44 | 45 | # for i in inputs: 46 | # inputs[i] = inputs[i].to(device) 47 | res = models[device](**inputs, output_hidden_states=True) 48 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 49 | assert inputs["input_ids"].shape[-1] == len(word2ph) 50 | word2phone = word2ph 51 | phone_level_feature = [] 52 | for i in range(len(word2phone)): 53 | repeat_feature = res[i].repeat(word2phone[i], 1) 54 | phone_level_feature.append(repeat_feature) 55 | 56 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 57 | 58 | return phone_level_feature.T 59 | -------------------------------------------------------------------------------- /oldVersion/V111/text/fix/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | from .japanese import text2sep_kata 5 | from config import config 6 | 7 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3") 8 | 9 | models = dict() 10 | 11 | 12 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 13 | sep_text, _ = text2sep_kata(text) 14 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text] 
15 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens] 16 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3] 17 | return get_bert_feature_with_token(sep_ids, word2ph, device) 18 | 19 | 20 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device): 21 | if ( 22 | sys.platform == "darwin" 23 | and torch.backends.mps.is_available() 24 | and device == "cpu" 25 | ): 26 | device = "mps" 27 | if not device: 28 | device = "cuda" 29 | if device not in models.keys(): 30 | models[device] = AutoModelForMaskedLM.from_pretrained( 31 | "./bert/bert-base-japanese-v3" 32 | ).to(device) 33 | with torch.no_grad(): 34 | inputs = torch.tensor(tokens).to(device).unsqueeze(0) 35 | token_type_ids = torch.zeros_like(inputs).to(device) 36 | attention_mask = torch.ones_like(inputs).to(device) 37 | inputs = { 38 | "input_ids": inputs, 39 | "token_type_ids": token_type_ids, 40 | "attention_mask": attention_mask, 41 | } 42 | 43 | # for i in inputs: 44 | # inputs[i] = inputs[i].to(device) 45 | res = models[device](**inputs, output_hidden_states=True) 46 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 47 | assert inputs["input_ids"].shape[-1] == len(word2ph) 48 | word2phone = word2ph 49 | phone_level_feature = [] 50 | for i in range(len(word2phone)): 51 | repeat_feature = res[i].repeat(word2phone[i], 1) 52 | phone_level_feature.append(repeat_feature) 53 | 54 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 55 | 56 | return phone_level_feature.T 57 | -------------------------------------------------------------------------------- /onnx_modules/V200/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | from .japanese import text2sep_kata 8 | 9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese" 10 | 11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 17 | sep_text, _, _ = text2sep_kata(text) 18 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text] 19 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens] 20 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3] 21 | return get_bert_feature_with_token(sep_ids, word2ph, device) 22 | 23 | 24 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device): 25 | if ( 26 | sys.platform == "darwin" 27 | and torch.backends.mps.is_available() 28 | and device == "cpu" 29 | ): 30 | device = "mps" 31 | if not device: 32 | device = "cuda" 33 | if device not in models.keys(): 34 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 35 | with torch.no_grad(): 36 | inputs = torch.tensor(tokens).to(device).unsqueeze(0) 37 | token_type_ids = torch.zeros_like(inputs).to(device) 38 | attention_mask = torch.ones_like(inputs).to(device) 39 | inputs = { 40 | "input_ids": inputs, 41 | "token_type_ids": token_type_ids, 42 | "attention_mask": attention_mask, 43 | } 44 | 45 | # for i in inputs: 46 | # inputs[i] = inputs[i].to(device) 47 | res = models[device](**inputs, output_hidden_states=True) 48 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 49 | assert inputs["input_ids"].shape[-1] == len(word2ph) 50 | word2phone = word2ph 51 | phone_level_feature = [] 52 | for i in range(len(word2phone)): 53 | 
repeat_feature = res[i].repeat(word2phone[i], 1) 54 | phone_level_feature.append(repeat_feature) 55 | 56 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 57 | 58 | return phone_level_feature.T 59 | -------------------------------------------------------------------------------- /resample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import librosa 4 | from multiprocessing import Pool, cpu_count 5 | 6 | import soundfile 7 | from tqdm import tqdm 8 | 9 | from config import config 10 | 11 | 12 | def process(item): 13 | spkdir, wav_name, args = item 14 | wav_path = os.path.join(args.in_dir, spkdir, wav_name) 15 | if os.path.exists(wav_path) and wav_path.lower().endswith(".wav"): 16 | wav, sr = librosa.load(wav_path, sr=args.sr) 17 | soundfile.write(os.path.join(args.out_dir, spkdir, wav_name), wav, sr) 18 | 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument( 23 | "--sr", 24 | type=int, 25 | default=config.resample_config.sampling_rate, 26 | help="sampling rate", 27 | ) 28 | parser.add_argument( 29 | "--in_dir", 30 | type=str, 31 | default=config.resample_config.in_dir, 32 | help="path to source dir", 33 | ) 34 | parser.add_argument( 35 | "--out_dir", 36 | type=str, 37 | default=config.resample_config.out_dir, 38 | help="path to target dir", 39 | ) 40 | parser.add_argument( 41 | "--processes", 42 | type=int, 43 | default=0, 44 | help="cpu_processes", 45 | ) 46 | args, _ = parser.parse_known_args() 47 | # autodl 无卡模式会识别出46个cpu 48 | if args.processes == 0: 49 | processes = cpu_count() - 2 if cpu_count() > 4 else 1 50 | else: 51 | processes = args.processes 52 | pool = Pool(processes=processes) 53 | 54 | tasks = [] 55 | 56 | for dirpath, _, filenames in os.walk(args.in_dir): 57 | # 子级目录 58 | spk_dir = os.path.relpath(dirpath, args.in_dir) 59 | spk_dir_out = os.path.join(args.out_dir, spk_dir) 60 | if not os.path.isdir(spk_dir_out): 61 | os.makedirs(spk_dir_out, exist_ok=True) 62 | for filename in filenames: 63 | if filename.lower().endswith(".wav"): 64 | twople = (spk_dir, filename, args) 65 | tasks.append(twople) 66 | 67 | for _ in tqdm( 68 | pool.imap_unordered(process, tasks), 69 | ): 70 | pass 71 | 72 | pool.close() 73 | pool.join() 74 | 75 | print("音频重采样完毕!") 76 | -------------------------------------------------------------------------------- /clap_gen.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from multiprocessing import Pool, cpu_count 3 | 4 | import torch 5 | import torch.multiprocessing as mp 6 | from tqdm import tqdm 7 | 8 | import utils 9 | from config import config 10 | from clap_wrapper import get_clap_audio_feature 11 | import librosa 12 | import os 13 | 14 | os.environ["OMP_NUM_THREADS"] = "1" 15 | os.environ["MKL_NUM_THREADS"] = "1" 16 | 17 | 18 | def process_line(line): 19 | device = config.emo_gen_config.device 20 | if config.emo_gen_config.use_multi_device: 21 | rank = mp.current_process()._identity 22 | rank = rank[0] if len(rank) > 0 else 0 23 | if torch.cuda.is_available(): 24 | gpu_id = rank % torch.cuda.device_count() 25 | device = torch.device(f"cuda:{gpu_id}") 26 | else: 27 | device = torch.device("cpu") 28 | wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|") 29 | 30 | clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.pt") 31 | if os.path.isfile(clap_path): 32 | return 33 | 34 | audio = librosa.load(wav_path, 48000)[0] 35 
| # audio = librosa.resample(audio, 44100, 48000) 36 | 37 | clap = get_clap_audio_feature(audio, device) 38 | torch.save(clap, clap_path) 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument( 44 | "-c", "--config", type=str, default=config.emo_gen_config.config_path 45 | ) 46 | parser.add_argument( 47 | "--num_processes", type=int, default=config.emo_gen_config.num_processes 48 | ) 49 | args, _ = parser.parse_known_args() 50 | config_path = args.config 51 | hps = utils.get_hparams_from_file(config_path) 52 | lines = [] 53 | with open(hps.data.training_files, encoding="utf-8") as f: 54 | lines.extend(f.readlines()) 55 | 56 | with open(hps.data.validation_files, encoding="utf-8") as f: 57 | lines.extend(f.readlines()) 58 | if len(lines) != 0: 59 | num_processes = min(args.num_processes, cpu_count()) 60 | with Pool(processes=num_processes) as pool: 61 | for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)): 62 | pass 63 | 64 | print(f"clap生成完毕!, 共有{len(lines)}个emo.pt生成!") 65 | -------------------------------------------------------------------------------- /bert/chinese-roberta-wwm-ext-large/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - zh 4 | tags: 5 | - bert 6 | license: "apache-2.0" 7 | --- 8 | 9 | # Please use 'Bert' related functions to load this model! 10 | 11 | ## Chinese BERT with Whole Word Masking 12 | For further accelerating Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**. 13 | 14 | **[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)** 15 | Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, Ziqing Yang, Shijin Wang, Guoping Hu 16 | 17 | This repository is developed based on:https://github.com/google-research/bert 18 | 19 | You may also interested in, 20 | - Chinese BERT series: https://github.com/ymcui/Chinese-BERT-wwm 21 | - Chinese MacBERT: https://github.com/ymcui/MacBERT 22 | - Chinese ELECTRA: https://github.com/ymcui/Chinese-ELECTRA 23 | - Chinese XLNet: https://github.com/ymcui/Chinese-XLNet 24 | - Knowledge Distillation Toolkit - TextBrewer: https://github.com/airaria/TextBrewer 25 | 26 | More resources by HFL: https://github.com/ymcui/HFL-Anthology 27 | 28 | ## Citation 29 | If you find the technical report or resource is useful, please cite the following technical report in your paper. 
30 | - Primary: https://arxiv.org/abs/2004.13922 31 | ``` 32 | @inproceedings{cui-etal-2020-revisiting, 33 | title = "Revisiting Pre-Trained Models for {C}hinese Natural Language Processing", 34 | author = "Cui, Yiming and 35 | Che, Wanxiang and 36 | Liu, Ting and 37 | Qin, Bing and 38 | Wang, Shijin and 39 | Hu, Guoping", 40 | booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Findings", 41 | month = nov, 42 | year = "2020", 43 | address = "Online", 44 | publisher = "Association for Computational Linguistics", 45 | url = "https://www.aclweb.org/anthology/2020.findings-emnlp.58", 46 | pages = "657--668", 47 | } 48 | ``` 49 | - Secondary: https://arxiv.org/abs/1906.08101 50 | ``` 51 | @article{chinese-bert-wwm, 52 | title={Pre-Training with Whole Word Masking for Chinese BERT}, 53 | author={Cui, Yiming and Che, Wanxiang and Liu, Ting and Qin, Bing and Yang, Ziqing and Wang, Shijin and Hu, Guoping}, 54 | journal={arXiv preprint arXiv:1906.08101}, 55 | year={2019} 56 | } 57 | ``` 58 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Python: WebUI", 6 | "type": "python", 7 | "request": "launch", 8 | "program": "${file}", 9 | "console": "integratedTerminal", 10 | "args": [ 11 | "--config_dir", 12 | "${workspaceFolder}/configs/config.json", 13 | "--debug" 14 | ], 15 | "justMyCode": false 16 | }, 17 | { 18 | "name": "Train: Init", 19 | "type": "python", 20 | "request": "launch", 21 | "program": "${file}", 22 | "console": "integratedTerminal", 23 | "args": [ 24 | "-m", 25 | "OUTPUT_MODEL", 26 | "--config", 27 | "${workspaceFolder}/configs/config.json", 28 | ], 29 | "justMyCode": false 30 | }, 31 | { 32 | "name": "Train: Visemes", 33 | "type": "python", 34 | "request": "launch", 35 | "program": "${file}", 36 | "console": "integratedTerminal", 37 | "args": [ 38 | "-m", 39 | "OUTPUT_MODEL", 40 | "--config", 41 | "${workspaceFolder}/configs/config.json", 42 | "--visemes", 43 | ], 44 | "justMyCode": false 45 | }, 46 | { 47 | "name": "prepare: Visemes", 48 | "type": "python", 49 | "request": "launch", 50 | "program": "${file}", 51 | "console": "integratedTerminal", 52 | "justMyCode": false 53 | }, 54 | { 55 | "name": "motion: VMC", 56 | "type": "python", 57 | "request": "launch", 58 | "program": "${file}", 59 | "console": "integratedTerminal", 60 | "args": [ 61 | "--a2p", 62 | "a2p_rotations.npy", 63 | "--positions_files", 64 | "a2p_motions.npy", 65 | "--do_linear_interpolation", 66 | "False", 67 | "--fps", 68 | "30", 69 | ], 70 | "justMyCode": false 71 | }, 72 | ] 73 | } -------------------------------------------------------------------------------- /text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | from text.japanese import text2sep_kata 8 | 9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm" 10 | 11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature( 17 | text, 18 | word2ph, 19 | device=config.bert_gen_config.device, 20 | style_text=None, 21 | style_weight=0.7, 22 | ): 23 | text = "".join(text2sep_kata(text)[0]) 24 | if style_text: 25 | style_text = 
"".join(text2sep_kata(style_text)[0]) 26 | if ( 27 | sys.platform == "darwin" 28 | and torch.backends.mps.is_available() 29 | and device == "cpu" 30 | ): 31 | device = "mps" 32 | if not device: 33 | device = "cuda" 34 | if device not in models.keys(): 35 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 36 | with torch.no_grad(): 37 | inputs = tokenizer(text, return_tensors="pt") 38 | for i in inputs: 39 | inputs[i] = inputs[i].to(device) 40 | res = models[device](**inputs, output_hidden_states=True) 41 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 42 | if style_text: 43 | style_inputs = tokenizer(style_text, return_tensors="pt") 44 | for i in style_inputs: 45 | style_inputs[i] = style_inputs[i].to(device) 46 | style_res = models[device](**style_inputs, output_hidden_states=True) 47 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() 48 | style_res_mean = style_res.mean(0) 49 | 50 | assert len(word2ph) == len(text) + 2 51 | word2phone = word2ph 52 | phone_level_feature = [] 53 | for i in range(len(word2phone)): 54 | if style_text: 55 | repeat_feature = ( 56 | res[i].repeat(word2phone[i], 1) * (1 - style_weight) 57 | + style_res_mean.repeat(word2phone[i], 1) * style_weight 58 | ) 59 | else: 60 | repeat_feature = res[i].repeat(word2phone[i], 1) 61 | phone_level_feature.append(repeat_feature) 62 | 63 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 64 | 65 | return phone_level_feature.T 66 | -------------------------------------------------------------------------------- /oldVersion/V210/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | from .japanese import text2sep_kata 8 | 9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm" 10 | 11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature( 17 | text, 18 | word2ph, 19 | device=config.bert_gen_config.device, 20 | style_text=None, 21 | style_weight=0.7, 22 | ): 23 | text = "".join(text2sep_kata(text)[0]) 24 | if style_text: 25 | style_text = "".join(text2sep_kata(style_text)[0]) 26 | if ( 27 | sys.platform == "darwin" 28 | and torch.backends.mps.is_available() 29 | and device == "cpu" 30 | ): 31 | device = "mps" 32 | if not device: 33 | device = "cuda" 34 | if device not in models.keys(): 35 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 36 | with torch.no_grad(): 37 | inputs = tokenizer(text, return_tensors="pt") 38 | for i in inputs: 39 | inputs[i] = inputs[i].to(device) 40 | res = models[device](**inputs, output_hidden_states=True) 41 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 42 | if style_text: 43 | style_inputs = tokenizer(style_text, return_tensors="pt") 44 | for i in style_inputs: 45 | style_inputs[i] = style_inputs[i].to(device) 46 | style_res = models[device](**style_inputs, output_hidden_states=True) 47 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() 48 | style_res_mean = style_res.mean(0) 49 | 50 | assert len(word2ph) == len(text) + 2 51 | word2phone = word2ph 52 | phone_level_feature = [] 53 | for i in range(len(word2phone)): 54 | if style_text: 55 | repeat_feature = ( 56 | res[i].repeat(word2phone[i], 1) * (1 - style_weight) 57 | + style_res_mean.repeat(word2phone[i], 1) * style_weight 58 | ) 59 | else: 60 | repeat_feature = 
res[i].repeat(word2phone[i], 1) 61 | phone_level_feature.append(repeat_feature) 62 | 63 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 64 | 65 | return phone_level_feature.T 66 | -------------------------------------------------------------------------------- /oldVersion/V101/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 1.0.1 版本兼容 3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.0.1 4 | """ 5 | import torch 6 | import commons 7 | from .text.cleaner import clean_text 8 | from .text import cleaned_text_to_sequence 9 | from oldVersion.V111.text import get_bert 10 | 11 | 12 | def get_text(text, language_str, hps, device): 13 | norm_text, phone, tone, word2ph = clean_text(text, language_str) 14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 15 | 16 | if hps.data.add_blank: 17 | phone = commons.intersperse(phone, 0) 18 | tone = commons.intersperse(tone, 0) 19 | language = commons.intersperse(language, 0) 20 | for i in range(len(word2ph)): 21 | word2ph[i] = word2ph[i] * 2 22 | word2ph[0] += 1 23 | bert = get_bert(norm_text, word2ph, language_str, device) 24 | del word2ph 25 | 26 | assert bert.shape[-1] == len(phone) 27 | 28 | phone = torch.LongTensor(phone) 29 | tone = torch.LongTensor(tone) 30 | language = torch.LongTensor(language) 31 | 32 | return bert, phone, tone, language 33 | 34 | 35 | def infer( 36 | text, 37 | sdp_ratio, 38 | noise_scale, 39 | noise_scale_w, 40 | length_scale, 41 | sid, 42 | hps, 43 | net_g, 44 | device, 45 | ): 46 | bert, phones, tones, lang_ids = get_text(text, "ZH", hps, device) 47 | with torch.no_grad(): 48 | x_tst = phones.to(device).unsqueeze(0) 49 | tones = tones.to(device).unsqueeze(0) 50 | lang_ids = lang_ids.to(device).unsqueeze(0) 51 | bert = bert.to(device).unsqueeze(0) 52 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 53 | del phones 54 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) 55 | audio = ( 56 | net_g.infer( 57 | x_tst, 58 | x_tst_lengths, 59 | speakers, 60 | tones, 61 | lang_ids, 62 | bert, 63 | sdp_ratio=sdp_ratio, 64 | noise_scale=noise_scale, 65 | noise_scale_w=noise_scale_w, 66 | length_scale=length_scale, 67 | )[0][0, 0] 68 | .data.cpu() 69 | .float() 70 | .numpy() 71 | ) 72 | del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers 73 | if torch.cuda.is_available(): 74 | torch.cuda.empty_cache() 75 | return audio 76 | -------------------------------------------------------------------------------- /oldVersion/V110/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | from transformers import AutoTokenizer, AutoModelForMaskedLM 4 | 5 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large") 6 | 7 | 8 | def get_bert_feature(text, word2ph, device=None): 9 | if ( 10 | sys.platform == "darwin" 11 | and torch.backends.mps.is_available() 12 | and device == "cpu" 13 | ): 14 | device = "mps" 15 | if not device: 16 | device = "cuda" 17 | model = AutoModelForMaskedLM.from_pretrained( 18 | "./bert/chinese-roberta-wwm-ext-large" 19 | ).to(device) 20 | with torch.no_grad(): 21 | inputs = tokenizer(text, return_tensors="pt") 22 | for i in inputs: 23 | inputs[i] = inputs[i].to(device) 24 | res = model(**inputs, output_hidden_states=True) 25 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 26 | 27 | assert len(word2ph) == len(text) + 2 28 | word2phone = word2ph 29 | phone_level_feature = [] 
30 | for i in range(len(word2phone)): 31 | repeat_feature = res[i].repeat(word2phone[i], 1) 32 | phone_level_feature.append(repeat_feature) 33 | 34 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 35 | 36 | return phone_level_feature.T 37 | 38 | 39 | if __name__ == "__main__": 40 | import torch 41 | 42 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 43 | word2phone = [ 44 | 1, 45 | 2, 46 | 1, 47 | 2, 48 | 2, 49 | 1, 50 | 2, 51 | 2, 52 | 1, 53 | 2, 54 | 2, 55 | 1, 56 | 2, 57 | 2, 58 | 2, 59 | 2, 60 | 2, 61 | 1, 62 | 1, 63 | 2, 64 | 2, 65 | 1, 66 | 2, 67 | 2, 68 | 2, 69 | 2, 70 | 1, 71 | 2, 72 | 2, 73 | 2, 74 | 2, 75 | 2, 76 | 1, 77 | 2, 78 | 2, 79 | 2, 80 | 2, 81 | 1, 82 | ] 83 | 84 | # 计算总帧数 85 | total_frames = sum(word2phone) 86 | print(word_level_feature.shape) 87 | print(word2phone) 88 | phone_level_feature = [] 89 | for i in range(len(word2phone)): 90 | print(word_level_feature[i].shape) 91 | 92 | # 对每个词重复word2phone[i]次 93 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 94 | phone_level_feature.append(repeat_feature) 95 | 96 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 97 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 98 | -------------------------------------------------------------------------------- /motion/record.py: -------------------------------------------------------------------------------- 1 | # 录音的同时开启udp监听,获取arkit发送过来的数据,然后分别保存到文件里 2 | import wave 3 | import threading 4 | import pyaudio 5 | import datetime 6 | import live_link 7 | 8 | class AudioRecorder(): 9 | ''' 10 | 用pyaudio录音,并保存到 wav 文件 11 | ''' 12 | def __init__(self, filename=None, save_path= './records', channels=1, rate=44100, chunk=1024): 13 | if filename is None: 14 | # 时间戳 yyyy-mm-dd-hh-mm-ss.wav 15 | filename = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + '.wav' 16 | else: 17 | filename += '.wav' 18 | self.isrecording = False 19 | self.save_path = save_path 20 | self.filename = filename 21 | self.channels = channels 22 | self.rate = rate 23 | self.chunk = chunk 24 | self.p = pyaudio.PyAudio() 25 | self.stream = self.p.open( 26 | format=pyaudio.paInt16, 27 | channels=self.channels, 28 | rate=self.rate, 29 | input=True, 30 | frames_per_buffer=self.chunk 31 | ) 32 | 33 | def close(self): 34 | # self.stream.stop_stream() 35 | self.stream.close() 36 | self.p.terminate() 37 | 38 | def save(self): 39 | wf = wave.open(self.save_path + '/' + self.filename, 'wb') 40 | wf.setnchannels(self.channels) 41 | wf.setsampwidth(self.p.get_sample_size(pyaudio.paInt16)) 42 | wf.setframerate(self.rate) 43 | wf.writeframes(b''.join(self.frames)) 44 | wf.close() 45 | 46 | def __recording(self): 47 | while self.isrecording: 48 | self.frames.append(self.stream.read(self.chunk)) 49 | 50 | def start(self): 51 | print('Start recording') 52 | self.frames = [] 53 | self.isrecording = True 54 | self.t = threading.Thread(target=self.__recording) 55 | self.t.daemon = True 56 | self.t.start() 57 | 58 | 59 | if __name__ == '__main__': 60 | filename_prefix = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 61 | recorder = AudioRecorder(filename=filename_prefix) 62 | # arkit 127.0.0.1 11111 63 | arkit_recorder = live_link.UdpRecvHandlerForArkit( 64 | ( 65 | "0.0.0.0", 66 | 11111 67 | ), 68 | filename_prefix=filename_prefix 69 | ) 70 | arkit_recorder.start() 71 | recorder.start() 72 | print('Recording audio...') 73 | # wait 10 seconds 74 | import time 75 | time.sleep(20) 76 | recorder.save() 77 | arkit_recorder.save() 78 | print('End audio recording') 79 | 
recorder.close() 80 | arkit_recorder.close() 81 | print('End arkit recording') 82 | -------------------------------------------------------------------------------- /Improvement_2025.md: -------------------------------------------------------------------------------- 1 | ### 模型改进思路: CRNN (卷积循环神经网络) 2 | 3 | 旧版模型直接使用 TTS 网络中的隐变量 `z` 作为输入,这依赖于 `z` 能够充分捕捉所有与表情相关的语音特征。一种更直接、可能更有效的方法是直接从音频的梅尔频谱图(mel-spectrogram)中学习。这种方法将问题转化为一个标准的音频到序列的任务,可以使用经典的卷积循环神经网络(CRNN)架构。 4 | 5 | 以下是基于 PyTorch 的 CRNN 模型修改步骤,重点使用 GRU 来处理时序信息: 6 | 7 | 1. **输入调整**: 8 | 9 | - 模型的输入不再是 `z` (`[B, C, T]`),而是音频的梅尔频谱图。 10 | - 输入形状为 `[B, 1, n_mels, n_frames]` (分别对应 批次, 通道, 频率, 时间)。这里我们以 `Conv2D` 为例,因为它能同时处理时间和频率维度上的局部特征。 11 | 12 | 2. **卷积特征提取 (CNN Front-end)**: 13 | 14 | - 在模型前端添加几层 2D 卷积层 (`nn.Conv2d`) 来从频谱图中提取高级特征。 15 | - 典型的结构是 `Conv2d` -> `BatchNorm2d` -> `LeakyReLU` -> `MaxPool2d` 的堆叠。这可以有效降低特征图的维度,同时扩大感受野。 16 | 17 | ```python 18 | # 伪代码 19 | self.cnn = nn.Sequential( 20 | nn.Conv2d(1, 32, kernel_size=3, padding=1), 21 | nn.BatchNorm2d(32), 22 | nn.LeakyReLU(0.2), 23 | nn.MaxPool2d(2), # (B, 32, n_mels/2, n_frames/2) 24 | 25 | nn.Conv2d(32, 64, kernel_size=3, padding=1), 26 | nn.BatchNorm2d(64), 27 | nn.LeakyReLU(0.2), 28 | nn.MaxPool2d(2) # (B, 64, n_mels/4, n_frames/4) 29 | ) 30 | ``` 31 | 32 | 3. **Reshape**: 33 | 34 | - 将 CNN 模块提取的特征图从 `[B, C, H, W]` 调整为适合循环层处理的 `[B, T, F]` 格式。 35 | - `T` 代表时间序列长度, `F` 代表每个时间步的特征维度。 36 | 37 | ```python 38 | # x is output from self.cnn 39 | B, C, H, W = x.shape 40 | x = x.permute(0, 3, 1, 2) # [B, W, C, H] 41 | x = x.reshape(B, W, C * H) # [B, T, F], where T=W, F=C*H 42 | ``` 43 | 44 | 4. **时序特征建模 (GRU)**: 45 | 46 | - 使用 `nn.GRU` 层来捕捉特征序列中的时间依赖关系。 47 | - 使用 `bidirectional=True` 的双向 GRU 通常能获得更好的性能,因为它能同时考虑过去和未来的上下文。 48 | - GRU 的输出是每个时间步的隐藏状态。如果只需要一个最终的特征向量来代表整个序列(例如,用于分类或单帧表情预测),可以只取最后一个时间步的输出,或者对所有时间步的输出进行池化(如 `GlobalAveragePooling1D`)。 49 | 50 | ```python 51 | # 伪代码 52 | self.gru = nn.GRU(input_size=C*H, hidden_size=128, num_layers=2, bidirectional=True, batch_first=True) 53 | # x from reshape 54 | x, _ = self.gru(x) 55 | ``` 56 | 57 | 5. 
**输出层 (Output Head)**: 58 | _ 在 GRU 之后,连接一个或多个全连接层 (`nn.Linear`),将 GRU 的输出映射到最终的 ARKit 表情参数维度。 59 | _ 最后一层使用 `Sigmoid` 激活函数,将输出值归一化到 `[0, 1]` 范围。 60 | 61 | ```python 62 | # 伪代码 63 | # x is output from self.gru 64 | self.fc = nn.Sequential( 65 | nn.Linear(128 * 2, 128), # *2 because of bidirectional 66 | nn.ReLU(), 67 | nn.Dropout(0.3), 68 | nn.Linear(128, 61), # 61 is n_arkit_outputs 69 | nn.Sigmoid() 70 | ) 71 | # 如果GRU的return_sequences=True,则需要对时间维度处理 72 | # 例如取最后一个时间步的输出: x = x[:, -1, :] 73 | y = self.fc(x) 74 | ``` 75 | 76 | 这种 CRNN 架构的优势在于它解耦了特征提取(CNN)和时序建模(GRU),使得模型结构更清晰,并且能更有效地从原始音频信号中学习面部动画。 77 | -------------------------------------------------------------------------------- /re_matching.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def extract_language_and_text_updated(speaker, dialogue): 5 | # 使用正则表达式匹配<语言>标签和其后的文本 6 | pattern_language_text = r"<(\S+?)>([^<]+)" 7 | matches = re.findall(pattern_language_text, dialogue, re.DOTALL) 8 | speaker = speaker[1:-1] 9 | # 清理文本:去除两边的空白字符 10 | matches_cleaned = [(lang.upper(), text.strip()) for lang, text in matches] 11 | matches_cleaned.append(speaker) 12 | return matches_cleaned 13 | 14 | 15 | def validate_text(input_text): 16 | # 验证说话人的正则表达式 17 | pattern_speaker = r"(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)" 18 | 19 | # 使用re.DOTALL标志使.匹配包括换行符在内的所有字符 20 | matches = re.findall(pattern_speaker, input_text, re.DOTALL) 21 | 22 | # 对每个匹配到的说话人内容进行进一步验证 23 | for _, dialogue in matches: 24 | language_text_matches = extract_language_and_text_updated(_, dialogue) 25 | if not language_text_matches: 26 | return ( 27 | False, 28 | "Error: Invalid format detected in dialogue content. Please check your input.", 29 | ) 30 | 31 | # 如果输入的文本中没有找到任何匹配项 32 | if not matches: 33 | return ( 34 | False, 35 | "Error: No valid speaker format detected. Please check your input.", 36 | ) 37 | 38 | return True, "Input is valid." 39 | 40 | 41 | def text_matching(text: str) -> list: 42 | speaker_pattern = r"(\[\S+?\])(.+?)(?=\[\S+?\]|$)" 43 | matches = re.findall(speaker_pattern, text, re.DOTALL) 44 | result = [] 45 | for speaker, dialogue in matches: 46 | result.append(extract_language_and_text_updated(speaker, dialogue)) 47 | return result 48 | 49 | 50 | def cut_para(text): 51 | splitted_para = re.split("[\n]", text) # 按段分 52 | splitted_para = [ 53 | sentence.strip() for sentence in splitted_para if sentence.strip() 54 | ] # 删除空字符串 55 | return splitted_para 56 | 57 | 58 | def cut_sent(para): 59 | para = re.sub("([。!;?\?])([^”’])", r"\1\n\2", para) # 单字符断句符 60 | para = re.sub("(\.{6})([^”’])", r"\1\n\2", para) # 英文省略号 61 | para = re.sub("(\…{2})([^”’])", r"\1\n\2", para) # 中文省略号 62 | para = re.sub("([。!?\?][”’])([^,。!?\?])", r"\1\n\2", para) 63 | para = para.rstrip() # 段尾如果有多余的\n就去掉它 64 | return para.split("\n") 65 | 66 | 67 | if __name__ == "__main__": 68 | text = """ 69 | [说话人1] 70 | [说话人2]你好吗?元気ですか?こんにちは,世界。你好吗? 71 | [说话人3]谢谢。どういたしまして。 72 | """ 73 | text_matching(text) 74 | # 测试函数 75 | test_text = """ 76 | [说话人1]你好,こんにちは!こんにちは,世界。 77 | [说话人2]你好吗? 
78 | """ 79 | text_matching(test_text) 80 | res = validate_text(test_text) 81 | print(res) 82 | -------------------------------------------------------------------------------- /oldVersion/V101/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | from transformers import AutoTokenizer, AutoModelForMaskedLM 4 | 5 | device = torch.device( 6 | "cuda" 7 | if torch.cuda.is_available() 8 | else ( 9 | "mps" 10 | if sys.platform == "darwin" and torch.backends.mps.is_available() 11 | else "cpu" 12 | ) 13 | ) 14 | 15 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large") 16 | model = AutoModelForMaskedLM.from_pretrained("./bert/chinese-roberta-wwm-ext-large").to( 17 | device 18 | ) 19 | 20 | 21 | def get_bert_feature(text, word2ph): 22 | with torch.no_grad(): 23 | inputs = tokenizer(text, return_tensors="pt") 24 | for i in inputs: 25 | inputs[i] = inputs[i].to(device) 26 | res = model(**inputs, output_hidden_states=True) 27 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 28 | 29 | assert len(word2ph) == len(text) + 2 30 | word2phone = word2ph 31 | phone_level_feature = [] 32 | for i in range(len(word2phone)): 33 | repeat_feature = res[i].repeat(word2phone[i], 1) 34 | phone_level_feature.append(repeat_feature) 35 | 36 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 37 | 38 | return phone_level_feature.T 39 | 40 | 41 | if __name__ == "__main__": 42 | # feature = get_bert_feature('你好,我是说的道理。') 43 | import torch 44 | 45 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 46 | word2phone = [ 47 | 1, 48 | 2, 49 | 1, 50 | 2, 51 | 2, 52 | 1, 53 | 2, 54 | 2, 55 | 1, 56 | 2, 57 | 2, 58 | 1, 59 | 2, 60 | 2, 61 | 2, 62 | 2, 63 | 2, 64 | 1, 65 | 1, 66 | 2, 67 | 2, 68 | 1, 69 | 2, 70 | 2, 71 | 2, 72 | 2, 73 | 1, 74 | 2, 75 | 2, 76 | 2, 77 | 2, 78 | 2, 79 | 1, 80 | 2, 81 | 2, 82 | 2, 83 | 2, 84 | 1, 85 | ] 86 | 87 | # 计算总帧数 88 | total_frames = sum(word2phone) 89 | print(word_level_feature.shape) 90 | print(word2phone) 91 | phone_level_feature = [] 92 | for i in range(len(word2phone)): 93 | print(word_level_feature[i].shape) 94 | 95 | # 对每个词重复word2phone[i]次 96 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 97 | phone_level_feature.append(repeat_feature) 98 | 99 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 100 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 101 | -------------------------------------------------------------------------------- /compress_model.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from text.symbols import symbols 3 | import torch 4 | 5 | from tools.log import logger 6 | import utils 7 | from models import SynthesizerTrn 8 | import os 9 | 10 | 11 | def copyStateDict(state_dict): 12 | if list(state_dict.keys())[0].startswith("module"): 13 | start_idx = 1 14 | else: 15 | start_idx = 0 16 | new_state_dict = OrderedDict() 17 | for k, v in state_dict.items(): 18 | name = ",".join(k.split(".")[start_idx:]) 19 | new_state_dict[name] = v 20 | return new_state_dict 21 | 22 | 23 | def removeOptimizer(config: str, input_model: str, ishalf: bool, output_model: str): 24 | hps = utils.get_hparams_from_file(config) 25 | 26 | net_g = SynthesizerTrn( 27 | len(symbols), 28 | hps.data.filter_length // 2 + 1, 29 | hps.train.segment_size // hps.data.hop_length, 30 | n_speakers=hps.data.n_speakers, 31 | **hps.model, 32 | ) 33 | 34 | 
optim_g = torch.optim.AdamW( 35 | net_g.parameters(), 36 | hps.train.learning_rate, 37 | betas=hps.train.betas, 38 | eps=hps.train.eps, 39 | ) 40 | 41 | state_dict_g = torch.load(input_model, map_location="cpu") 42 | new_dict_g = copyStateDict(state_dict_g) 43 | keys = [] 44 | for k, v in new_dict_g["model"].items(): 45 | if "enc_q" in k: 46 | continue # noqa: E701 47 | keys.append(k) 48 | 49 | new_dict_g = ( 50 | {k: new_dict_g["model"][k].half() for k in keys} 51 | if ishalf 52 | else {k: new_dict_g["model"][k] for k in keys} 53 | ) 54 | 55 | torch.save( 56 | { 57 | "model": new_dict_g, 58 | "iteration": 0, 59 | "optimizer": optim_g.state_dict(), 60 | "learning_rate": 0.0001, 61 | }, 62 | output_model, 63 | ) 64 | 65 | 66 | if __name__ == "__main__": 67 | import argparse 68 | 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument("-c", "--config", type=str, default="configs/config.json") 71 | parser.add_argument("-i", "--input", type=str) 72 | parser.add_argument("-o", "--output", type=str, default=None) 73 | parser.add_argument( 74 | "-hf", "--half", action="store_true", default=False, help="Save as FP16" 75 | ) 76 | 77 | args = parser.parse_args() 78 | 79 | output = args.output 80 | 81 | if output is None: 82 | import os.path 83 | 84 | filename, ext = os.path.splitext(args.input) 85 | half = "_half" if args.half else "" 86 | output = filename + "_release" + half + ext 87 | 88 | removeOptimizer(args.config, args.input, args.half, output) 89 | logger.info(f"压缩模型成功, 输出模型: {os.path.abspath(output)}") 90 | -------------------------------------------------------------------------------- /slm/wavlm-base-plus/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "wavlm-base-plus", 3 | "activation_dropout": 0.0, 4 | "adapter_kernel_size": 3, 5 | "adapter_stride": 2, 6 | "add_adapter": false, 7 | "apply_spec_augment": true, 8 | "architectures": [ 9 | "WavLMModel" 10 | ], 11 | "attention_dropout": 0.1, 12 | "bos_token_id": 1, 13 | "classifier_proj_size": 256, 14 | "codevector_dim": 256, 15 | "contrastive_logits_temperature": 0.1, 16 | "conv_bias": false, 17 | "conv_dim": [ 18 | 512, 19 | 512, 20 | 512, 21 | 512, 22 | 512, 23 | 512, 24 | 512 25 | ], 26 | "conv_kernel": [ 27 | 10, 28 | 3, 29 | 3, 30 | 3, 31 | 3, 32 | 2, 33 | 2 34 | ], 35 | "conv_stride": [ 36 | 5, 37 | 2, 38 | 2, 39 | 2, 40 | 2, 41 | 2, 42 | 2 43 | ], 44 | "ctc_loss_reduction": "sum", 45 | "ctc_zero_infinity": false, 46 | "diversity_loss_weight": 0.1, 47 | "do_stable_layer_norm": false, 48 | "eos_token_id": 2, 49 | "feat_extract_activation": "gelu", 50 | "feat_extract_norm": "group", 51 | "feat_proj_dropout": 0.1, 52 | "feat_quantizer_dropout": 0.0, 53 | "final_dropout": 0.0, 54 | "freeze_feat_extract_train": true, 55 | "hidden_act": "gelu", 56 | "hidden_dropout": 0.1, 57 | "hidden_size": 768, 58 | "initializer_range": 0.02, 59 | "intermediate_size": 3072, 60 | "layer_norm_eps": 1e-05, 61 | "layerdrop": 0.05, 62 | "mask_channel_length": 10, 63 | "mask_channel_min_space": 1, 64 | "mask_channel_other": 0.0, 65 | "mask_channel_prob": 0.0, 66 | "mask_channel_selection": "static", 67 | "mask_feature_length": 10, 68 | "mask_feature_min_masks": 0, 69 | "mask_feature_prob": 0.0, 70 | "mask_time_length": 10, 71 | "mask_time_min_masks": 2, 72 | "mask_time_min_space": 1, 73 | "mask_time_other": 0.0, 74 | "mask_time_prob": 0.05, 75 | "mask_time_selection": "static", 76 | "model_type": "wavlm", 77 | "no_mask_channel_overlap": false, 78 | "no_mask_time_overlap": false, 
79 | "num_adapter_layers": 3, 80 | "num_attention_heads": 12, 81 | "num_buckets": 320, 82 | "num_codevector_groups": 2, 83 | "num_codevectors_per_group": 320, 84 | "num_conv_pos_embedding_groups": 16, 85 | "num_conv_pos_embeddings": 128, 86 | "num_ctc_classes": 80, 87 | "num_feat_extract_layers": 7, 88 | "num_hidden_layers": 12, 89 | "num_negatives": 100, 90 | "output_hidden_size": 768, 91 | "pad_token_id": 0, 92 | "proj_codevector_dim": 256, 93 | "replace_prob": 0.5, 94 | "torch_dtype": "float32", 95 | "transformers_version": "4.13.0.dev0", 96 | "use_weighted_layer_sum": false, 97 | "vocab_size": 32, 98 | "tokenizer_class": "Wav2Vec2CTCTokenizer" 99 | } 100 | -------------------------------------------------------------------------------- /configs/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 20, 4 | "eval_interval": 10, 5 | "seed": 42, 6 | "epochs": 3000, 7 | "learning_rate": 0.0002, 8 | "betas": [ 9 | 0.8, 10 | 0.99 11 | ], 12 | "eps": 1e-09, 13 | "batch_size": 16, 14 | "bf16_run": false, 15 | "lr_decay": 0.99995, 16 | "segment_size": 16384, 17 | "init_lr_ratio": 1, 18 | "warmup_epochs": 0, 19 | "c_mel": 45, 20 | "c_kl": 1.0, 21 | "c_commit": 100, 22 | "skip_optimizer": true, 23 | "freeze_ZH_bert": false, 24 | "freeze_JP_bert": false, 25 | "freeze_EN_bert": false, 26 | "freeze_emo": false 27 | }, 28 | "data": { 29 | "training_files": "filelists/train.list", 30 | "validation_files": "filelists/val.list", 31 | "training_visemes_files": "filelists/train_visemes.list", 32 | "validation_visemes_files": "filelists/val_visemes.list", 33 | "max_wav_value": 32768.0, 34 | "sampling_rate": 44100, 35 | "filter_length": 2048, 36 | "hop_length": 512, 37 | "win_length": 2048, 38 | "n_mel_channels": 128, 39 | "mel_fmin": 0.0, 40 | "mel_fmax": null, 41 | "add_blank": true, 42 | "n_speakers": 4, 43 | "cleaned_text": true, 44 | "spk2id": { 45 | "hualing": 0, 46 | "good": 1, 47 | "ailing": 2, 48 | "lady": 3 49 | } 50 | }, 51 | "model": { 52 | "use_spk_conditioned_encoder": true, 53 | "use_noise_scaled_mas": true, 54 | "use_mel_posterior_encoder": false, 55 | "use_duration_discriminator": true, 56 | "inter_channels": 192, 57 | "hidden_channels": 192, 58 | "filter_channels": 768, 59 | "n_heads": 2, 60 | "n_layers": 6, 61 | "kernel_size": 3, 62 | "p_dropout": 0.1, 63 | "resblock": "1", 64 | "resblock_kernel_sizes": [ 65 | 3, 66 | 7, 67 | 11 68 | ], 69 | "resblock_dilation_sizes": [ 70 | [ 71 | 1, 72 | 3, 73 | 5 74 | ], 75 | [ 76 | 1, 77 | 3, 78 | 5 79 | ], 80 | [ 81 | 1, 82 | 3, 83 | 5 84 | ] 85 | ], 86 | "upsample_rates": [ 87 | 8, 88 | 8, 89 | 2, 90 | 2, 91 | 2 92 | ], 93 | "upsample_initial_channel": 512, 94 | "upsample_kernel_sizes": [ 95 | 16, 96 | 16, 97 | 8, 98 | 2, 99 | 2 100 | ], 101 | "n_layers_q": 3, 102 | "use_spectral_norm": false, 103 | "gin_channels": 512, 104 | "slm": { 105 | "model": "./slm/wavlm-base-plus", 106 | "sr": 16000, 107 | "hidden": 768, 108 | "nlayers": 13, 109 | "initial_channel": 64 110 | } 111 | }, 112 | "version": "2.3" 113 | } -------------------------------------------------------------------------------- /motion/wav_to_visemes.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | import torch 3 | import numpy as np 4 | sys.path.insert(0, os.path.abspath('.')) 5 | import utils 6 | from models import VisemesNet 7 | from mel_processing import spectrogram_torch 8 | import torchaudio 9 | from config import config 10 | from visemes_tools 
import load_post_enc_dec_model, get_device 11 | 12 | 13 | #测试wav文件到visemes 14 | if __name__ == '__main__': 15 | # 从入参获取wav文件 16 | if sys.argv.__len__() < 2: 17 | print('python wav_to_visemes.py wav_file') 18 | exit(1) 19 | wav_file = sys.argv[1] 20 | if not os.path.exists(wav_file): 21 | print('wav_file not exists') 22 | exit(1) 23 | # load hps 24 | hps = utils.get_hparams_from_file('./configs/config.json') 25 | device = get_device() 26 | # load enc, dec, v_model 27 | enc, dec = load_post_enc_dec_model(hps, device=device) 28 | print('net_g loaded') 29 | 30 | net_v = VisemesNet(hps.model.hidden_channels).to(device) 31 | _ = net_v.eval() 32 | _ = utils.load_checkpoint(config.webui_config.v_model, net_v, None, skip_optimizer=True) 33 | print("load v_model from", config.webui_config.v_model) 34 | 35 | if wav_file.endswith('z.npy'): 36 | print('load z from npy file') 37 | z = np.load(wav_file) 38 | z = torch.from_numpy(z).to(device) 39 | # if type is half, convert to float 40 | if z.dtype == torch.float16: 41 | z = z.float() 42 | visemes = net_v(z) 43 | else: 44 | # load wav file 45 | audio_norm, sampling_rate = torchaudio.load(wav_file, frame_offset=0, num_frames=-1, normalize=True, channels_first=True) 46 | # check sampling_rate == 44100 47 | if sampling_rate != 44100: 48 | print('sampling_rate error:', sampling_rate) 49 | print('ffmpeg -i input.wav -ar 44100 output.wav') 50 | exit(1) 51 | spec = spectrogram_torch(audio_norm, hps.data.filter_length, 52 | hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length, 53 | center=False) 54 | spec = spec.to(device=get_device()) 55 | audio_norm = audio_norm.unsqueeze(0) 56 | x_lengths = torch.clamp_min(torch.sum(spec, [1, 2]), 1).long() 57 | 58 | # get z 59 | z, m_q, logs_q, y_mask = enc(spec, x_lengths=x_lengths, g=None) 60 | print('get z of wav file: ', wav_file) 61 | 62 | visemes_file_path = wav_file[:-4] + '.v.npy' 63 | # generate visemes 64 | visemes = net_v(z) 65 | visemes = visemes.squeeze(0) 66 | visemes = visemes.transpose(0, 1) 67 | visemes = visemes.data.cpu().float().numpy() 68 | print('visemes shape:', visemes.shape) 69 | 70 | # save visemes 71 | np.save(visemes_file_path, visemes) 72 | print('visemes saved to ', visemes_file_path) 73 | -------------------------------------------------------------------------------- /oldVersion/V111/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | from transformers import AutoTokenizer, AutoModelForMaskedLM 4 | 5 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large") 6 | 7 | models = dict() 8 | 9 | 10 | def get_bert_feature(text, word2ph, device=None): 11 | if ( 12 | sys.platform == "darwin" 13 | and torch.backends.mps.is_available() 14 | and device == "cpu" 15 | ): 16 | device = "mps" 17 | if not device: 18 | device = "cuda" 19 | if device not in models.keys(): 20 | models[device] = AutoModelForMaskedLM.from_pretrained( 21 | "./bert/chinese-roberta-wwm-ext-large" 22 | ).to(device) 23 | with torch.no_grad(): 24 | inputs = tokenizer(text, return_tensors="pt") 25 | for i in inputs: 26 | inputs[i] = inputs[i].to(device) 27 | res = models[device](**inputs, output_hidden_states=True) 28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 29 | 30 | assert len(word2ph) == len(text) + 2 31 | word2phone = word2ph 32 | phone_level_feature = [] 33 | for i in range(len(word2phone)): 34 | repeat_feature = res[i].repeat(word2phone[i], 1) 35 | phone_level_feature.append(repeat_feature) 
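# note: res[i] above is the i-th token's 1024-dim BERT feature; repeating it word2phone[i]
# times expands the character-level features into one row per phone of that character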
36 | 37 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 38 | 39 | return phone_level_feature.T 40 | 41 | 42 | if __name__ == "__main__": 43 | import torch 44 | 45 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 46 | word2phone = [ 47 | 1, 48 | 2, 49 | 1, 50 | 2, 51 | 2, 52 | 1, 53 | 2, 54 | 2, 55 | 1, 56 | 2, 57 | 2, 58 | 1, 59 | 2, 60 | 2, 61 | 2, 62 | 2, 63 | 2, 64 | 1, 65 | 1, 66 | 2, 67 | 2, 68 | 1, 69 | 2, 70 | 2, 71 | 2, 72 | 2, 73 | 1, 74 | 2, 75 | 2, 76 | 2, 77 | 2, 78 | 2, 79 | 1, 80 | 2, 81 | 2, 82 | 2, 83 | 2, 84 | 1, 85 | ] 86 | 87 | # 计算总帧数 88 | total_frames = sum(word2phone) 89 | print(word_level_feature.shape) 90 | print(word2phone) 91 | phone_level_feature = [] 92 | for i in range(len(word2phone)): 93 | print(word_level_feature[i].shape) 94 | 95 | # 对每个词重复word2phone[i]次 96 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 97 | phone_level_feature.append(repeat_feature) 98 | 99 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 100 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 101 | -------------------------------------------------------------------------------- /oldVersion/V200/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | 8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large" 9 | 10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 11 | 12 | models = dict() 13 | 14 | 15 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 16 | if ( 17 | sys.platform == "darwin" 18 | and torch.backends.mps.is_available() 19 | and device == "cpu" 20 | ): 21 | device = "mps" 22 | if not device: 23 | device = "cuda" 24 | if device not in models.keys(): 25 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 26 | with torch.no_grad(): 27 | inputs = tokenizer(text, return_tensors="pt") 28 | for i in inputs: 29 | inputs[i] = inputs[i].to(device) 30 | res = models[device](**inputs, output_hidden_states=True) 31 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 32 | 33 | assert len(word2ph) == len(text) + 2 34 | word2phone = word2ph 35 | phone_level_feature = [] 36 | for i in range(len(word2phone)): 37 | repeat_feature = res[i].repeat(word2phone[i], 1) 38 | phone_level_feature.append(repeat_feature) 39 | 40 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 41 | 42 | return phone_level_feature.T 43 | 44 | 45 | if __name__ == "__main__": 46 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 47 | word2phone = [ 48 | 1, 49 | 2, 50 | 1, 51 | 2, 52 | 2, 53 | 1, 54 | 2, 55 | 2, 56 | 1, 57 | 2, 58 | 2, 59 | 1, 60 | 2, 61 | 2, 62 | 2, 63 | 2, 64 | 2, 65 | 1, 66 | 1, 67 | 2, 68 | 2, 69 | 1, 70 | 2, 71 | 2, 72 | 2, 73 | 2, 74 | 1, 75 | 2, 76 | 2, 77 | 2, 78 | 2, 79 | 2, 80 | 1, 81 | 2, 82 | 2, 83 | 2, 84 | 2, 85 | 1, 86 | ] 87 | 88 | # 计算总帧数 89 | total_frames = sum(word2phone) 90 | print(word_level_feature.shape) 91 | print(word2phone) 92 | phone_level_feature = [] 93 | for i in range(len(word2phone)): 94 | print(word_level_feature[i].shape) 95 | 96 | # 对每个词重复word2phone[i]次 97 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 98 | phone_level_feature.append(repeat_feature) 99 | 100 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 101 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 102 | 
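# Minimal usage sketch of get_bert_feature (hypothetical inputs, not part of the upstream file):
# the assert above requires len(word2ph) == len(text) + 2, i.e. one entry per character plus
# one each for the [CLS]/[SEP] tokens added by the tokenizer.
#
#     text = "你好"
#     word2ph = [1, 2, 2, 1]                          # phones per position: [CLS], 你, 好, [SEP]
#     feature = get_bert_feature(text, word2ph, device="cpu")
#     print(feature.shape)                            # torch.Size([1024, 6]) == [1024, sum(word2ph)]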
-------------------------------------------------------------------------------- /onnx_modules/V200/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | 8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large" 9 | 10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 11 | 12 | models = dict() 13 | 14 | 15 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 16 | if ( 17 | sys.platform == "darwin" 18 | and torch.backends.mps.is_available() 19 | and device == "cpu" 20 | ): 21 | device = "mps" 22 | if not device: 23 | device = "cuda" 24 | if device not in models.keys(): 25 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 26 | with torch.no_grad(): 27 | inputs = tokenizer(text, return_tensors="pt") 28 | for i in inputs: 29 | inputs[i] = inputs[i].to(device) 30 | res = models[device](**inputs, output_hidden_states=True) 31 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 32 | 33 | assert len(word2ph) == len(text) + 2 34 | word2phone = word2ph 35 | phone_level_feature = [] 36 | for i in range(len(word2phone)): 37 | repeat_feature = res[i].repeat(word2phone[i], 1) 38 | phone_level_feature.append(repeat_feature) 39 | 40 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 41 | 42 | return phone_level_feature.T 43 | 44 | 45 | if __name__ == "__main__": 46 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 47 | word2phone = [ 48 | 1, 49 | 2, 50 | 1, 51 | 2, 52 | 2, 53 | 1, 54 | 2, 55 | 2, 56 | 1, 57 | 2, 58 | 2, 59 | 1, 60 | 2, 61 | 2, 62 | 2, 63 | 2, 64 | 2, 65 | 1, 66 | 1, 67 | 2, 68 | 2, 69 | 1, 70 | 2, 71 | 2, 72 | 2, 73 | 2, 74 | 1, 75 | 2, 76 | 2, 77 | 2, 78 | 2, 79 | 2, 80 | 1, 81 | 2, 82 | 2, 83 | 2, 84 | 2, 85 | 1, 86 | ] 87 | 88 | # 计算总帧数 89 | total_frames = sum(word2phone) 90 | print(word_level_feature.shape) 91 | print(word2phone) 92 | phone_level_feature = [] 93 | for i in range(len(word2phone)): 94 | print(word_level_feature[i].shape) 95 | 96 | # 对每个词重复word2phone[i]次 97 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 98 | phone_level_feature.append(repeat_feature) 99 | 100 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 101 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 102 | -------------------------------------------------------------------------------- /bert/bert-base-japanese-v3/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | license: apache-2.0 3 | datasets: 4 | - cc100 5 | - wikipedia 6 | language: 7 | - ja 8 | widget: 9 | - text: 東北大学で[MASK]の研究をしています。 10 | --- 11 | 12 | # BERT base Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102) 13 | 14 | This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language. 15 | 16 | This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization. 17 | Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective. 18 | 19 | The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/). 
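As a quick illustration of the masked language modeling usage described above, the following is a minimal sketch using the Hugging Face Transformers `fill-mask` pipeline (assuming the `cl-tohoku/bert-base-japanese-v3` hub ID, with `fugashi` and `unidic-lite` installed for the MeCab-based tokenizer):

```python
from transformers import pipeline

# The tokenizer segments the text with MeCab (Unidic 2.1.2) and then applies WordPiece.
fill_mask = pipeline("fill-mask", model="cl-tohoku/bert-base-japanese-v3")
print(fill_mask("東北大学で[MASK]の研究をしています。")[0])
```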
20 | 21 | ## Model architecture 22 | 23 | The model architecture is the same as the original BERT base model; 12 layers, 768 dimensions of hidden states, and 12 attention heads. 24 | 25 | ## Training Data 26 | 27 | The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia. 28 | For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023. 29 | The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively. 30 | 31 | For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7). 32 | 33 | ## Tokenization 34 | 35 | The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm. 36 | The vocabulary size is 32768. 37 | 38 | We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization. 39 | 40 | ## Training 41 | 42 | We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps. 43 | For training of the MLM (masked language modeling) objective, we introduced whole word masking in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once. 44 | 45 | For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/). 46 | 47 | ## Licenses 48 | 49 | The pretrained models are distributed under the Apache License 2.0. 50 | 51 | ## Acknowledgments 52 | 53 | This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program. 54 | -------------------------------------------------------------------------------- /bert/bert-large-japanese-v2/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | license: apache-2.0 3 | datasets: 4 | - cc100 5 | - wikipedia 6 | language: 7 | - ja 8 | widget: 9 | - text: 東北大学で[MASK]の研究をしています。 10 | --- 11 | 12 | # BERT large Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102) 13 | 14 | This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language. 15 | 16 | This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization. 17 | Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective. 18 | 19 | The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/). 20 | 21 | ## Model architecture 22 | 23 | The model architecture is the same as the original BERT large model; 24 layers, 1024 dimensions of hidden states, and 16 attention heads. 24 | 25 | ## Training Data 26 | 27 | The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia. 
28 | For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023. 29 | The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively. 30 | 31 | For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7). 32 | 33 | ## Tokenization 34 | 35 | The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm. 36 | The vocabulary size is 32768. 37 | 38 | We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization. 39 | 40 | ## Training 41 | 42 | We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps. 43 | For training of the MLM (masked language modeling) objective, we introduced whole word masking in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once. 44 | 45 | For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/). 46 | 47 | ## Licenses 48 | 49 | The pretrained models are distributed under the Apache License 2.0. 50 | 51 | ## Acknowledgments 52 | 53 | This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program. 54 | -------------------------------------------------------------------------------- /motion/prepare_visemes.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | import torch 3 | sys.path.append('./') 4 | import utils 5 | from text.symbols import symbols 6 | from models import SynthesizerTrn, PosteriorEncoder, Generator 7 | from mel_processing import spectrogram_torch, mel_spectrogram_torch, spec_to_mel_torch 8 | import torchaudio 9 | from visemes_tools import load_post_enc_dec_model, get_device 10 | 11 | 12 | 13 | # 读取records目录下的 *.wav 音频文件和 *.npy 表情数据[n, 61],相同的文件名为一组。 14 | # 前5组[file1.wav, file1.npy]生成训练数据列表 val_visemes.list 15 | # 剩余的组生成测试数据列表 train_visemes.list 16 | def gen_visemes_train_val_list(hps, input_dir='./records/', output_dir = './filelists/'): 17 | enc, dec = load_post_enc_dec_model(hps, device=get_device()) 18 | print('enc, dec loaded') 19 | # read all files in input_dir 20 | files = os.listdir(input_dir) 21 | # filter wav files 22 | wav_files = filter(lambda x: x.endswith('.wav'), files) 23 | wav_files = sorted(wav_files) 24 | # overwrite the list file 25 | with open(output_dir + 'val_visemes.list', 'w') as f: 26 | f.write('') 27 | with open(output_dir + 'train_visemes.list', 'w') as f: 28 | f.write('') 29 | # iterate wav files 30 | for i, wav_file in enumerate(wav_files): 31 | # get the corresponding npy file and make sure it exists 32 | wav_file = input_dir + wav_file 33 | print('processing wav file: ', wav_file) 34 | npy_file = wav_file[:-4] + '.npy' 35 | if not os.path.exists(npy_file): 36 | print('npy file {} does not exist'.format(npy_file)) 37 | continue 38 | audio_norm, sampling_rate = torchaudio.load(wav_file, frame_offset=0, num_frames=-1, normalize=True, channels_first=True) 39 | spec = spectrogram_torch(audio_norm, hps.data.filter_length, 40 | hps.data.sampling_rate, hps.data.hop_length, 
hps.data.win_length, 41 | center=False) 42 | spec = spec.to(device=get_device()) 43 | audio_norm = audio_norm.unsqueeze(0) 44 | x_lengths = torch.clamp_min(torch.sum(spec, [1, 2]), 1).long() 45 | z, m_q, logs_q, y_mask = enc(spec, x_lengths=x_lengths, g=None) 46 | print('get z of wav file: ', wav_file) 47 | z_file_path = wav_file[:-4] + '.z.npy' 48 | z = z.to(device='cpu') 49 | # save z 50 | torch.save(z, z_file_path) 51 | print('z saved to ', z_file_path) 52 | 53 | 54 | # generate the line for the list file 55 | line = z_file_path + '|' + npy_file + '\n' 56 | # write the line to the list file 57 | if i < 5: 58 | with open(output_dir + 'val_visemes.list', 'a') as f: 59 | f.write(line) 60 | else: 61 | with open(output_dir + 'train_visemes.list', 'a') as f: 62 | f.write(line) 63 | 64 | 65 | if __name__ == '__main__': 66 | hps = utils.get_hparams_from_file('./configs/config.json') 67 | gen_visemes_train_val_list(hps) -------------------------------------------------------------------------------- /emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "torch", 3 | "activation_dropout": 0.1, 4 | "adapter_kernel_size": 3, 5 | "adapter_stride": 2, 6 | "add_adapter": false, 7 | "apply_spec_augment": true, 8 | "architectures": [ 9 | "Wav2Vec2ForSpeechClassification" 10 | ], 11 | "attention_dropout": 0.1, 12 | "bos_token_id": 1, 13 | "classifier_proj_size": 256, 14 | "codevector_dim": 768, 15 | "contrastive_logits_temperature": 0.1, 16 | "conv_bias": true, 17 | "conv_dim": [ 18 | 512, 19 | 512, 20 | 512, 21 | 512, 22 | 512, 23 | 512, 24 | 512 25 | ], 26 | "conv_kernel": [ 27 | 10, 28 | 3, 29 | 3, 30 | 3, 31 | 3, 32 | 2, 33 | 2 34 | ], 35 | "conv_stride": [ 36 | 5, 37 | 2, 38 | 2, 39 | 2, 40 | 2, 41 | 2, 42 | 2 43 | ], 44 | "ctc_loss_reduction": "sum", 45 | "ctc_zero_infinity": false, 46 | "diversity_loss_weight": 0.1, 47 | "do_stable_layer_norm": true, 48 | "eos_token_id": 2, 49 | "feat_extract_activation": "gelu", 50 | "feat_extract_dropout": 0.0, 51 | "feat_extract_norm": "layer", 52 | "feat_proj_dropout": 0.1, 53 | "feat_quantizer_dropout": 0.0, 54 | "final_dropout": 0.1, 55 | "finetuning_task": "wav2vec2_reg", 56 | "gradient_checkpointing": false, 57 | "hidden_act": "gelu", 58 | "hidden_dropout": 0.1, 59 | "hidden_dropout_prob": 0.1, 60 | "hidden_size": 1024, 61 | "id2label": { 62 | "0": "arousal", 63 | "1": "dominance", 64 | "2": "valence" 65 | }, 66 | "initializer_range": 0.02, 67 | "intermediate_size": 4096, 68 | "label2id": { 69 | "arousal": 0, 70 | "dominance": 1, 71 | "valence": 2 72 | }, 73 | "layer_norm_eps": 1e-05, 74 | "layerdrop": 0.1, 75 | "mask_feature_length": 10, 76 | "mask_feature_min_masks": 0, 77 | "mask_feature_prob": 0.0, 78 | "mask_time_length": 10, 79 | "mask_time_min_masks": 2, 80 | "mask_time_prob": 0.05, 81 | "model_type": "wav2vec2", 82 | "num_adapter_layers": 3, 83 | "num_attention_heads": 16, 84 | "num_codevector_groups": 2, 85 | "num_codevectors_per_group": 320, 86 | "num_conv_pos_embedding_groups": 16, 87 | "num_conv_pos_embeddings": 128, 88 | "num_feat_extract_layers": 7, 89 | "num_hidden_layers": 12, 90 | "num_negatives": 100, 91 | "output_hidden_size": 1024, 92 | "pad_token_id": 0, 93 | "pooling_mode": "mean", 94 | "problem_type": "regression", 95 | "proj_codevector_dim": 768, 96 | "tdnn_dilation": [ 97 | 1, 98 | 2, 99 | 3, 100 | 1, 101 | 1 102 | ], 103 | "tdnn_dim": [ 104 | 512, 105 | 512, 106 | 512, 107 | 512, 108 | 1500 109 | ], 110 | 
"tdnn_kernel": [ 111 | 5, 112 | 3, 113 | 3, 114 | 1, 115 | 1 116 | ], 117 | "torch_dtype": "float32", 118 | "transformers_version": "4.17.0.dev0", 119 | "use_weighted_layer_sum": false, 120 | "vocab_size": null, 121 | "xvector_output_dim": 512 122 | } 123 | -------------------------------------------------------------------------------- /spec_gen.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | from multiprocessing import Pool 4 | from mel_processing import spectrogram_torch, mel_spectrogram_torch 5 | from utils import load_wav_to_torch 6 | 7 | 8 | class AudioProcessor: 9 | def __init__( 10 | self, 11 | max_wav_value, 12 | use_mel_spec_posterior, 13 | filter_length, 14 | n_mel_channels, 15 | sampling_rate, 16 | hop_length, 17 | win_length, 18 | mel_fmin, 19 | mel_fmax, 20 | ): 21 | self.max_wav_value = max_wav_value 22 | self.use_mel_spec_posterior = use_mel_spec_posterior 23 | self.filter_length = filter_length 24 | self.n_mel_channels = n_mel_channels 25 | self.sampling_rate = sampling_rate 26 | self.hop_length = hop_length 27 | self.win_length = win_length 28 | self.mel_fmin = mel_fmin 29 | self.mel_fmax = mel_fmax 30 | 31 | def process_audio(self, filename): 32 | audio, sampling_rate = load_wav_to_torch(filename) 33 | audio_norm = audio / self.max_wav_value 34 | audio_norm = audio_norm.unsqueeze(0) 35 | spec_filename = filename.replace(".wav", ".spec.pt") 36 | if self.use_mel_spec_posterior: 37 | spec_filename = spec_filename.replace(".spec.pt", ".mel.pt") 38 | try: 39 | spec = torch.load(spec_filename) 40 | except: 41 | if self.use_mel_spec_posterior: 42 | spec = mel_spectrogram_torch( 43 | audio_norm, 44 | self.filter_length, 45 | self.n_mel_channels, 46 | self.sampling_rate, 47 | self.hop_length, 48 | self.win_length, 49 | self.mel_fmin, 50 | self.mel_fmax, 51 | center=False, 52 | ) 53 | else: 54 | spec = spectrogram_torch( 55 | audio_norm, 56 | self.filter_length, 57 | self.sampling_rate, 58 | self.hop_length, 59 | self.win_length, 60 | center=False, 61 | ) 62 | spec = torch.squeeze(spec, 0) 63 | torch.save(spec, spec_filename) 64 | return spec, audio_norm 65 | 66 | 67 | # 使用示例 68 | processor = AudioProcessor( 69 | max_wav_value=32768.0, 70 | use_mel_spec_posterior=False, 71 | filter_length=2048, 72 | n_mel_channels=128, 73 | sampling_rate=44100, 74 | hop_length=512, 75 | win_length=2048, 76 | mel_fmin=0.0, 77 | mel_fmax="null", 78 | ) 79 | 80 | with open("filelists/train.list", "r") as f: 81 | filepaths = [line.split("|")[0] for line in f] # 取每一行的第一部分作为audiopath 82 | 83 | # 使用多进程处理 84 | with Pool(processes=32) as pool: # 使用4个进程 85 | with tqdm(total=len(filepaths)) as pbar: 86 | for i, _ in enumerate(pool.imap_unordered(processor.process_audio, filepaths)): 87 | pbar.update() 88 | -------------------------------------------------------------------------------- /bert_gen.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from multiprocessing import Pool 3 | import commons 4 | import utils 5 | from tqdm import tqdm 6 | from text import check_bert_models, cleaned_text_to_sequence, get_bert 7 | import argparse 8 | import torch.multiprocessing as mp 9 | from config import config 10 | 11 | 12 | def process_line(x): 13 | line, add_blank = x 14 | device = config.bert_gen_config.device 15 | if config.bert_gen_config.use_multi_device: 16 | rank = mp.current_process()._identity 17 | rank = rank[0] if len(rank) > 0 else 0 18 | if torch.cuda.is_available(): 
19 | gpu_id = rank % torch.cuda.device_count() 20 | device = torch.device(f"cuda:{gpu_id}") 21 | else: 22 | device = torch.device("cpu") 23 | wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|") 24 | phone = phones.split(" ") 25 | tone = [int(i) for i in tone.split(" ")] 26 | word2ph = [int(i) for i in word2ph.split(" ")] 27 | word2ph = [i for i in word2ph] 28 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 29 | 30 | if add_blank: 31 | phone = commons.intersperse(phone, 0) 32 | tone = commons.intersperse(tone, 0) 33 | language = commons.intersperse(language, 0) 34 | for i in range(len(word2ph)): 35 | word2ph[i] = word2ph[i] * 2 36 | word2ph[0] += 1 37 | 38 | bert_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".bert.pt") 39 | 40 | try: 41 | bert = torch.load(bert_path) 42 | assert bert.shape[-1] == len(phone) 43 | except Exception: 44 | bert = get_bert(text, word2ph, language_str, device) 45 | assert bert.shape[-1] == len(phone) 46 | torch.save(bert, bert_path) 47 | 48 | 49 | preprocess_text_config = config.preprocess_text_config 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument( 54 | "-c", "--config", type=str, default=config.bert_gen_config.config_path 55 | ) 56 | parser.add_argument( 57 | "--num_processes", type=int, default=config.bert_gen_config.num_processes 58 | ) 59 | args, _ = parser.parse_known_args() 60 | config_path = args.config 61 | hps = utils.get_hparams_from_file(config_path) 62 | check_bert_models() 63 | lines = [] 64 | with open(hps.data.training_files, encoding="utf-8") as f: 65 | lines.extend(f.readlines()) 66 | 67 | with open(hps.data.validation_files, encoding="utf-8") as f: 68 | lines.extend(f.readlines()) 69 | add_blank = [hps.data.add_blank] * len(lines) 70 | 71 | if len(lines) != 0: 72 | num_processes = args.num_processes 73 | with Pool(processes=num_processes) as pool: 74 | for _ in tqdm( 75 | pool.imap_unordered(process_line, zip(lines, add_blank)), 76 | total=len(lines), 77 | ): 78 | # 这里是缩进的代码块,表示循环体 79 | pass # 使用pass语句作为占位符 80 | 81 | print(f"bert生成完毕!, 共有{len(lines)}个bert.pt生成!") 82 | -------------------------------------------------------------------------------- /oldVersion/V110/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 1.1 版本兼容 3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.1 4 | """ 5 | import torch 6 | import commons 7 | from .text.cleaner import clean_text 8 | from .text import cleaned_text_to_sequence 9 | from oldVersion.V111.text import get_bert 10 | 11 | 12 | def get_text(text, language_str, hps, device): 13 | norm_text, phone, tone, word2ph = clean_text(text, language_str) 14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 15 | 16 | if hps.data.add_blank: 17 | phone = commons.intersperse(phone, 0) 18 | tone = commons.intersperse(tone, 0) 19 | language = commons.intersperse(language, 0) 20 | for i in range(len(word2ph)): 21 | word2ph[i] = word2ph[i] * 2 22 | word2ph[0] += 1 23 | bert = get_bert(norm_text, word2ph, language_str, device) 24 | del word2ph 25 | assert bert.shape[-1] == len(phone), phone 26 | 27 | if language_str == "ZH": 28 | bert = bert 29 | ja_bert = torch.zeros(768, len(phone)) 30 | elif language_str == "JP": 31 | ja_bert = bert 32 | bert = torch.zeros(1024, len(phone)) 33 | else: 34 | bert = torch.zeros(1024, len(phone)) 35 | ja_bert = torch.zeros(768, len(phone)) 36 | 37 | assert bert.shape[-1] == len( 38 | 
phone 39 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}" 40 | 41 | phone = torch.LongTensor(phone) 42 | tone = torch.LongTensor(tone) 43 | language = torch.LongTensor(language) 44 | return bert, ja_bert, phone, tone, language 45 | 46 | 47 | def infer( 48 | text, 49 | sdp_ratio, 50 | noise_scale, 51 | noise_scale_w, 52 | length_scale, 53 | sid, 54 | language, 55 | hps, 56 | net_g, 57 | device, 58 | ): 59 | bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps, device) 60 | with torch.no_grad(): 61 | x_tst = phones.to(device).unsqueeze(0) 62 | tones = tones.to(device).unsqueeze(0) 63 | lang_ids = lang_ids.to(device).unsqueeze(0) 64 | bert = bert.to(device).unsqueeze(0) 65 | ja_bert = ja_bert.to(device).unsqueeze(0) 66 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 67 | del phones 68 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) 69 | audio = ( 70 | net_g.infer( 71 | x_tst, 72 | x_tst_lengths, 73 | speakers, 74 | tones, 75 | lang_ids, 76 | bert, 77 | ja_bert, 78 | sdp_ratio=sdp_ratio, 79 | noise_scale=noise_scale, 80 | noise_scale_w=noise_scale_w, 81 | length_scale=length_scale, 82 | )[0][0, 0] 83 | .data.cpu() 84 | .float() 85 | .numpy() 86 | ) 87 | del x_tst, x_tst_lengths, speakers, tones, lang_ids, bert, ja_bert 88 | if torch.cuda.is_available(): 89 | torch.cuda.empty_cache() 90 | return audio 91 | -------------------------------------------------------------------------------- /motion/visemes_tools.py: -------------------------------------------------------------------------------- 1 | 2 | import os,sys 3 | import torch 4 | sys.path.insert(0, os.path.abspath('.')) 5 | import utils 6 | from text.symbols import symbols 7 | from models import SynthesizerTrn, PosteriorEncoder, Generator 8 | from mel_processing import spectrogram_torch, mel_spectrogram_torch, spec_to_mel_torch 9 | import torchaudio 10 | 11 | def get_device(): 12 | device = ( 13 | "cuda:0" 14 | if torch.cuda.is_available() 15 | else ( 16 | "mps" 17 | if sys.platform == "darwin" and torch.backends.mps.is_available() 18 | else "cpu" 19 | ) 20 | ) 21 | print("Using device: {}".format(device)) 22 | return device 23 | 24 | def load_post_enc_dec_model(hps, model_path = './OUTPUT_MODEL/models/G_3000.pth', device='cpu'): 25 | # load the model 26 | print('Loading model from {}'.format(model_path)) 27 | net_g = SynthesizerTrn( 28 | len(symbols), 29 | hps.data.filter_length // 2 + 1, 30 | hps.train.segment_size // hps.data.hop_length, 31 | n_speakers=hps.data.n_speakers, 32 | **hps.model).to(device) 33 | _ = net_g.eval() 34 | 35 | _ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True) 36 | print('Model loaded') 37 | 38 | return net_g.get_post_enc_dec() 39 | 40 | def test_wav_enc_dec(hps, input_file='test_in.wav', output_file='test_out.wav', enc = None, dec = None): 41 | if enc == None or dec == None: 42 | enc, dec = load_post_enc_dec_model(hps, device=get_device()) 43 | audio_norm, sampling_rate = torchaudio.load(input_file, frame_offset=0, num_frames=-1, normalize=True, channels_first=True) 44 | # 短时傅里叶变换, 非 mel普 45 | spec = spectrogram_torch(audio_norm, hps.data.filter_length, 46 | hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length, 47 | center=False) 48 | spec = spec.to(device=get_device()) 49 | audio_norm = audio_norm.unsqueeze(0) 50 | print('audio_norm.shape: ', audio_norm.shape, 'spec.shape', spec.shape, 'file: ', input_file) 51 | x_lengths = torch.clamp_min(torch.sum(spec, [1, 2]), 1).long() 52 | z, m_q, logs_q, y_mask = enc(spec, 
x_lengths=x_lengths, g=None) 53 | print('z.shape: ', z.shape) 54 | y = dec(z) 55 | print('y.shape: ', y.shape) 56 | y = y.squeeze(0).data.cpu() 57 | #save y to output_file 58 | torchaudio.save(output_file, y, sampling_rate) 59 | print('output_file: ', output_file, 'saved') 60 | 61 | def save_post_enc_model(hps, model_path = './OUTPUT_MODEL/models/G_3000.pth', device='cpu'): 62 | # load the model 63 | print('Loading model from {}'.format(model_path)) 64 | enc, _ = load_post_enc_dec_model(hps, model_path, device) 65 | print('Model loaded') 66 | post_enc_path = os.path.join(os.path.dirname(model_path), 'post_enc.pth') 67 | torch.save(enc.state_dict(), post_enc_path) 68 | print('Post-encoder saved to {}'.format(post_enc_path)) 69 | 70 | 71 | if __name__ == '__main__': 72 | hps = utils.get_hparams_from_file('./configs/config.json') 73 | # test_wav_enc_dec(hps) 74 | save_post_enc_model(hps) -------------------------------------------------------------------------------- /update_status.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gradio as gr 3 | 4 | lang_dict = {"EN(英文)": "_en", "ZH(中文)": "_zh", "JP(日语)": "_jp"} 5 | 6 | 7 | def raw_dir_convert_to_path(target_dir: str, lang): 8 | res = target_dir.rstrip("/").rstrip("\\") 9 | if (not target_dir.startswith("raw")) and (not target_dir.startswith("./raw")): 10 | res = os.path.join("./raw", res) 11 | if ( 12 | (not res.endswith("_zh")) 13 | and (not res.endswith("_jp")) 14 | and (not res.endswith("_en")) 15 | ): 16 | res += lang_dict[lang] 17 | return res 18 | 19 | 20 | def update_g_files(): 21 | g_files = [] 22 | cnt = 0 23 | for root, dirs, files in os.walk(os.path.abspath("./logs")): 24 | for file in files: 25 | if file.startswith("G_") and file.endswith(".pth"): 26 | g_files.append(os.path.join(root, file)) 27 | cnt += 1 28 | print(g_files) 29 | return f"更新模型列表完成, 共找到{cnt}个模型", gr.Dropdown.update(choices=g_files) 30 | 31 | 32 | def update_c_files(): 33 | c_files = [] 34 | cnt = 0 35 | for root, dirs, files in os.walk(os.path.abspath("./logs")): 36 | for file in files: 37 | if file.startswith("config.json"): 38 | c_files.append(os.path.join(root, file)) 39 | cnt += 1 40 | print(c_files) 41 | return f"更新模型列表完成, 共找到{cnt}个配置文件", gr.Dropdown.update(choices=c_files) 42 | 43 | 44 | def update_model_folders(): 45 | subdirs = [] 46 | cnt = 0 47 | for root, dirs, files in os.walk(os.path.abspath("./logs")): 48 | for dir_name in dirs: 49 | if os.path.basename(dir_name) != "eval": 50 | subdirs.append(os.path.join(root, dir_name)) 51 | cnt += 1 52 | print(subdirs) 53 | return f"更新模型文件夹列表完成, 共找到{cnt}个文件夹", gr.Dropdown.update(choices=subdirs) 54 | 55 | 56 | def update_wav_lab_pairs(): 57 | wav_count = tot_count = 0 58 | for root, _, files in os.walk("./raw"): 59 | for file in files: 60 | # print(file) 61 | file_path = os.path.join(root, file) 62 | if file.lower().endswith(".wav"): 63 | lab_file = os.path.splitext(file_path)[0] + ".lab" 64 | if os.path.exists(lab_file): 65 | wav_count += 1 66 | tot_count += 1 67 | return f"{wav_count} / {tot_count}" 68 | 69 | 70 | def update_raw_folders(): 71 | subdirs = [] 72 | cnt = 0 73 | script_path = os.path.dirname(os.path.abspath(__file__)) # 获取当前脚本的绝对路径 74 | raw_path = os.path.join(script_path, "raw") 75 | print(raw_path) 76 | os.makedirs(raw_path, exist_ok=True) 77 | for root, dirs, files in os.walk(raw_path): 78 | for dir_name in dirs: 79 | relative_path = os.path.relpath( 80 | os.path.join(root, dir_name), script_path 81 | ) # 获取相对路径 82 | 
subdirs.append(relative_path) 83 | cnt += 1 84 | print(subdirs) 85 | return ( 86 | f"更新raw音频文件夹列表完成, 共找到{cnt}个文件夹", 87 | gr.Dropdown.update(choices=subdirs), 88 | gr.Textbox.update(value=update_wav_lab_pairs()), 89 | ) 90 | -------------------------------------------------------------------------------- /oldVersion/V101/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "I", 78 | "N", 79 | "U", 80 | "a", 81 | "b", 82 | "by", 83 | "ch", 84 | "cl", 85 | "d", 86 | "dy", 87 | "e", 88 | "f", 89 | "g", 90 | "gy", 91 | "h", 92 | "hy", 93 | "i", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "p", 103 | "py", 104 | "r", 105 | "ry", 106 | "s", 107 | "sh", 108 | "t", 109 | "ts", 110 | "u", 111 | "V", 112 | "w", 113 | "y", 114 | "z", 115 | ] 116 | num_ja_tones = 1 117 | 118 | # English 119 | en_symbols = [ 120 | "aa", 121 | "ae", 122 | "ah", 123 | "ao", 124 | "aw", 125 | "ay", 126 | "b", 127 | "ch", 128 | "d", 129 | "dh", 130 | "eh", 131 | "er", 132 | "ey", 133 | "f", 134 | "g", 135 | "hh", 136 | "ih", 137 | "iy", 138 | "jh", 139 | "k", 140 | "l", 141 | "m", 142 | "n", 143 | "ng", 144 | "ow", 145 | "oy", 146 | "p", 147 | "r", 148 | "s", 149 | "sh", 150 | "t", 151 | "th", 152 | "uh", 153 | "uw", 154 | "V", 155 | "w", 156 | "y", 157 | "z", 158 | "zh", 159 | ] 160 | num_en_tones = 4 161 | 162 | # combine all symbols 163 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 164 | symbols = [pad] + normal_symbols + pu_symbols 165 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 166 | 167 | # combine all tones 168 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 169 | 170 | # language maps 171 | language_id_map = {"ZH": 0, "JA": 1, "EN": 2} 172 | num_languages = len(language_id_map.keys()) 173 | 174 | language_tone_start_map = { 175 | "ZH": 0, 176 | "JA": num_zh_tones, 177 | "EN": num_zh_tones + num_ja_tones, 178 | } 179 | 180 | if __name__ == "__main__": 181 | a = set(zh_symbols) 182 | b = set(en_symbols) 183 | print(sorted(a & b)) 184 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 
28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V110/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | 
"i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 1 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V111/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 1 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 
| "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V200/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == 
"__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V210/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /onnx_modules/V200/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 
33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /onnx_modules/V210/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 
99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /onnx_modules/V220/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 
160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /onnx_modules/V230/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = 
set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /onnx_modules/V220_novq_dev/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V200/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | @Desc: 2.0版本兼容 对应2.0.1 2.0.2-fix 3 | """ 4 | import torch 5 | import commons 6 | from .text import cleaned_text_to_sequence, get_bert 7 | from .text.cleaner import clean_text 8 | 9 | 10 | def get_text(text, language_str, hps, device): 11 | # 在此处实现当前版本的get_text 12 | norm_text, phone, tone, word2ph = clean_text(text, language_str) 13 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 14 | 15 | if hps.data.add_blank: 16 | phone = 
commons.intersperse(phone, 0) 17 | tone = commons.intersperse(tone, 0) 18 | language = commons.intersperse(language, 0) 19 | for i in range(len(word2ph)): 20 | word2ph[i] = word2ph[i] * 2 21 | word2ph[0] += 1 22 | bert_ori = get_bert(norm_text, word2ph, language_str, device) 23 | del word2ph 24 | assert bert_ori.shape[-1] == len(phone), phone 25 | 26 | if language_str == "ZH": 27 | bert = bert_ori 28 | ja_bert = torch.zeros(1024, len(phone)) 29 | en_bert = torch.zeros(1024, len(phone)) 30 | elif language_str == "JP": 31 | bert = torch.zeros(1024, len(phone)) 32 | ja_bert = bert_ori 33 | en_bert = torch.zeros(1024, len(phone)) 34 | elif language_str == "EN": 35 | bert = torch.zeros(1024, len(phone)) 36 | ja_bert = torch.zeros(1024, len(phone)) 37 | en_bert = bert_ori 38 | else: 39 | raise ValueError("language_str should be ZH, JP or EN") 40 | 41 | assert bert.shape[-1] == len( 42 | phone 43 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}" 44 | 45 | phone = torch.LongTensor(phone) 46 | tone = torch.LongTensor(tone) 47 | language = torch.LongTensor(language) 48 | return bert, ja_bert, en_bert, phone, tone, language 49 | 50 | 51 | def infer( 52 | text, 53 | sdp_ratio, 54 | noise_scale, 55 | noise_scale_w, 56 | length_scale, 57 | sid, 58 | language, 59 | hps, 60 | net_g, 61 | device, 62 | ): 63 | bert, ja_bert, en_bert, phones, tones, lang_ids = get_text( 64 | text, language, hps, device 65 | ) 66 | with torch.no_grad(): 67 | x_tst = phones.to(device).unsqueeze(0) 68 | tones = tones.to(device).unsqueeze(0) 69 | lang_ids = lang_ids.to(device).unsqueeze(0) 70 | bert = bert.to(device).unsqueeze(0) 71 | ja_bert = ja_bert.to(device).unsqueeze(0) 72 | en_bert = en_bert.to(device).unsqueeze(0) 73 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 74 | del phones 75 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) 76 | audio = ( 77 | net_g.infer( 78 | x_tst, 79 | x_tst_lengths, 80 | speakers, 81 | tones, 82 | lang_ids, 83 | bert, 84 | ja_bert, 85 | en_bert, 86 | sdp_ratio=sdp_ratio, 87 | noise_scale=noise_scale, 88 | noise_scale_w=noise_scale_w, 89 | length_scale=length_scale, 90 | )[0][0, 0] 91 | .data.cpu() 92 | .float() 93 | .numpy() 94 | ) 95 | del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert 96 | if torch.cuda.is_available(): 97 | torch.cuda.empty_cache() 98 | return audio 99 | -------------------------------------------------------------------------------- /oldVersion/V101/text/japanese.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py 2 | import re 3 | import sys 4 | 5 | import pyopenjtalk 6 | 7 | from . 
import symbols 8 | 9 | # Regular expression matching Japanese without punctuation marks: 10 | _japanese_characters = re.compile( 11 | r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" 12 | ) 13 | 14 | # Regular expression matching non-Japanese characters or punctuation marks: 15 | _japanese_marks = re.compile( 16 | r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" 17 | ) 18 | 19 | # List of (symbol, Japanese) pairs for marks: 20 | _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]] 21 | 22 | 23 | # List of (consonant, sokuon) pairs: 24 | _real_sokuon = [ 25 | (re.compile("%s" % x[0]), x[1]) 26 | for x in [ 27 | (r"Q([↑↓]*[kg])", r"k#\1"), 28 | (r"Q([↑↓]*[tdjʧ])", r"t#\1"), 29 | (r"Q([↑↓]*[sʃ])", r"s\1"), 30 | (r"Q([↑↓]*[pb])", r"p#\1"), 31 | ] 32 | ] 33 | 34 | # List of (consonant, hatsuon) pairs: 35 | _real_hatsuon = [ 36 | (re.compile("%s" % x[0]), x[1]) 37 | for x in [ 38 | (r"N([↑↓]*[pbm])", r"m\1"), 39 | (r"N([↑↓]*[ʧʥj])", r"n^\1"), 40 | (r"N([↑↓]*[tdn])", r"n\1"), 41 | (r"N([↑↓]*[kg])", r"ŋ\1"), 42 | ] 43 | ] 44 | 45 | 46 | def post_replace_ph(ph): 47 | rep_map = { 48 | ":": ",", 49 | ";": ",", 50 | ",": ",", 51 | "。": ".", 52 | "!": "!", 53 | "?": "?", 54 | "\n": ".", 55 | "·": ",", 56 | "、": ",", 57 | "...": "…", 58 | "v": "V", 59 | } 60 | if ph in rep_map.keys(): 61 | ph = rep_map[ph] 62 | if ph in symbols: 63 | return ph 64 | if ph not in symbols: 65 | ph = "UNK" 66 | return ph 67 | 68 | 69 | def symbols_to_japanese(text): 70 | for regex, replacement in _symbols_to_japanese: 71 | text = re.sub(regex, replacement, text) 72 | return text 73 | 74 | 75 | def preprocess_jap(text): 76 | """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html""" 77 | text = symbols_to_japanese(text) 78 | sentences = re.split(_japanese_marks, text) 79 | marks = re.findall(_japanese_marks, text) 80 | text = [] 81 | for i, sentence in enumerate(sentences): 82 | if re.match(_japanese_characters, sentence): 83 | p = pyopenjtalk.g2p(sentence) 84 | text += p.split(" ") 85 | 86 | if i < len(marks): 87 | text += [marks[i].replace(" ", "")] 88 | return text 89 | 90 | 91 | def text_normalize(text): 92 | # todo: jap text normalize 93 | return text 94 | 95 | 96 | def g2p(norm_text): 97 | phones = preprocess_jap(norm_text) 98 | phones = [post_replace_ph(i) for i in phones] 99 | # todo: implement tones and word2ph 100 | tones = [0 for i in phones] 101 | word2ph = [1 for i in phones] 102 | return phones, tones, word2ph 103 | 104 | 105 | if __name__ == "__main__": 106 | for line in open("../../../Downloads/transcript_utf8.txt").readlines(): 107 | text = line.split(":")[1] 108 | phones, tones, word2ph = g2p(text) 109 | for p in phones: 110 | if p == "z": 111 | print(text, phones) 112 | sys.exit(0) 113 | -------------------------------------------------------------------------------- /motion/data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "blend shape:" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "np_file_path = r'../records/2023-12-23-17-19-54.npy'\n", 19 | "bs = np.load(np_file_path, allow_pickle=True)\n", 20 | "print(bs.shape)\n", 21 | "# draw lines from bs\n", 
22 | "for i in range(bs.shape[1]):\n", 23 | " line_data = bs[:120]\n", 24 | " plt.plot(line_data)\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "旋转测试" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 1, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "新的旋转四元数: [0 1 0 0]\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "import numpy as np\n", 49 | "\n", 50 | "# 原始的旋转四元数\n", 51 | "x3, y3, z3, w3 = (0, 0, 0, 1)\n", 52 | "\n", 53 | "# 180度绕 Y 轴的四元数表示\n", 54 | "r = np.array([0, 1, 0, 0])\n", 55 | "\n", 56 | "# 四元数乘法函数\n", 57 | "def quat_multiply(q1, q2):\n", 58 | " x1, y1, z1, w1 = q1\n", 59 | " x2, y2, z2, w2 = q2\n", 60 | " \n", 61 | " w = w1*w2 - x1*x2 - y1*y2 - z1*z2\n", 62 | " x = x1*w2 + w1*x2 + y1*z2 - z1*y2\n", 63 | " y = w1*y2 - x1*z2 + y1*w2 + z1*x2\n", 64 | " z = w1*z2 + x1*y2 - y1*x2 + z1*w2\n", 65 | " \n", 66 | " return np.array([x, y, z, w])\n", 67 | "\n", 68 | "# 现在找到新的旋转四元数表示\n", 69 | "new_quaternion = quat_multiply(r, [x3, y3, z3, w3])\n", 70 | "\n", 71 | "print(\"新的旋转四元数:\", new_quaternion)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "合并文件" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 8, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "(1270, 61)\n", 91 | "(1253, 61)\n", 92 | "(2523, 61)\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "#读取 tmp_cn.npy tmp_en.npy\n", 98 | "import numpy as np\n", 99 | "bs1 = np.load(r'../tmp_cn.npy', allow_pickle=True)\n", 100 | "bs2 = np.load(r'../tmp_en.npy', allow_pickle=True)\n", 101 | "print(bs1.shape)\n", 102 | "print(bs2.shape)\n", 103 | "# 在维度0上合并\n", 104 | "bs = np.concatenate((bs1, bs2), axis=0)\n", 105 | "print(bs.shape)\n", 106 | "#保存维度0的前1500个数值\n", 107 | "# np.save(r'../tmp_16.npy', bs)\n", 108 | "np.save(r'../tmp_16.npy', bs[:1500])" 109 | ] 110 | } 111 | ], 112 | "metadata": { 113 | "kernelspec": { 114 | "display_name": "Python 3", 115 | "language": "python", 116 | "name": "python3" 117 | }, 118 | "language_info": { 119 | "codemirror_mode": { 120 | "name": "ipython", 121 | "version": 3 122 | }, 123 | "file_extension": ".py", 124 | "mimetype": "text/x-python", 125 | "name": "python", 126 | "nbconvert_exporter": "python", 127 | "pygments_lexer": "ipython3", 128 | "version": "3.10.8" 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 2 133 | } 134 | -------------------------------------------------------------------------------- /text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | 8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large" 9 | 10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 11 | 12 | models = dict() 13 | 14 | 15 | def get_bert_feature( 16 | text, 17 | word2ph, 18 | device=config.bert_gen_config.device, 19 | style_text=None, 20 | style_weight=0.7, 21 | ): 22 | if ( 23 | sys.platform == "darwin" 24 | and torch.backends.mps.is_available() 25 | and device == "cpu" 26 | ): 27 | device = "mps" 28 | if not device: 29 | device = "cuda" 30 | if device not in models.keys(): 31 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 32 | with torch.no_grad(): 33 | inputs = tokenizer(text, return_tensors="pt") 34 | 
for i in inputs: 35 | inputs[i] = inputs[i].to(device) 36 | res = models[device](**inputs, output_hidden_states=True) 37 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 38 | if style_text: 39 | style_inputs = tokenizer(style_text, return_tensors="pt") 40 | for i in style_inputs: 41 | style_inputs[i] = style_inputs[i].to(device) 42 | style_res = models[device](**style_inputs, output_hidden_states=True) 43 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() 44 | style_res_mean = style_res.mean(0) 45 | assert len(word2ph) == len(text) + 2 46 | word2phone = word2ph 47 | phone_level_feature = [] 48 | for i in range(len(word2phone)): 49 | if style_text: 50 | repeat_feature = ( 51 | res[i].repeat(word2phone[i], 1) * (1 - style_weight) 52 | + style_res_mean.repeat(word2phone[i], 1) * style_weight 53 | ) 54 | else: 55 | repeat_feature = res[i].repeat(word2phone[i], 1) 56 | phone_level_feature.append(repeat_feature) 57 | 58 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 59 | 60 | return phone_level_feature.T 61 | 62 | 63 | if __name__ == "__main__": 64 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 65 | word2phone = [ 66 | 1, 67 | 2, 68 | 1, 69 | 2, 70 | 2, 71 | 1, 72 | 2, 73 | 2, 74 | 1, 75 | 2, 76 | 2, 77 | 1, 78 | 2, 79 | 2, 80 | 2, 81 | 2, 82 | 2, 83 | 1, 84 | 1, 85 | 2, 86 | 2, 87 | 1, 88 | 2, 89 | 2, 90 | 2, 91 | 2, 92 | 1, 93 | 2, 94 | 2, 95 | 2, 96 | 2, 97 | 2, 98 | 1, 99 | 2, 100 | 2, 101 | 2, 102 | 2, 103 | 1, 104 | ] 105 | 106 | # 计算总帧数 107 | total_frames = sum(word2phone) 108 | print(word_level_feature.shape) 109 | print(word2phone) 110 | phone_level_feature = [] 111 | for i in range(len(word2phone)): 112 | print(word_level_feature[i].shape) 113 | 114 | # 对每个词重复word2phone[i]次 115 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 116 | phone_level_feature.append(repeat_feature) 117 | 118 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 119 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 120 | -------------------------------------------------------------------------------- /oldVersion/V210/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | 8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large" 9 | 10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 11 | 12 | models = dict() 13 | 14 | 15 | def get_bert_feature( 16 | text, 17 | word2ph, 18 | device=config.bert_gen_config.device, 19 | style_text=None, 20 | style_weight=0.7, 21 | ): 22 | if ( 23 | sys.platform == "darwin" 24 | and torch.backends.mps.is_available() 25 | and device == "cpu" 26 | ): 27 | device = "mps" 28 | if not device: 29 | device = "cuda" 30 | if device not in models.keys(): 31 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 32 | with torch.no_grad(): 33 | inputs = tokenizer(text, return_tensors="pt") 34 | for i in inputs: 35 | inputs[i] = inputs[i].to(device) 36 | res = models[device](**inputs, output_hidden_states=True) 37 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 38 | if style_text: 39 | style_inputs = tokenizer(style_text, return_tensors="pt") 40 | for i in style_inputs: 41 | style_inputs[i] = style_inputs[i].to(device) 42 | style_res = models[device](**style_inputs, output_hidden_states=True) 43 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() 44 | 
style_res_mean = style_res.mean(0) 45 | 46 | assert len(word2ph) == len(text) + 2 47 | word2phone = word2ph 48 | phone_level_feature = [] 49 | for i in range(len(word2phone)): 50 | if style_text: 51 | repeat_feature = ( 52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight) 53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight 54 | ) 55 | else: 56 | repeat_feature = res[i].repeat(word2phone[i], 1) 57 | phone_level_feature.append(repeat_feature) 58 | 59 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 60 | 61 | return phone_level_feature.T 62 | 63 | 64 | if __name__ == "__main__": 65 | word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 66 | word2phone = [ 67 | 1, 68 | 2, 69 | 1, 70 | 2, 71 | 2, 72 | 1, 73 | 2, 74 | 2, 75 | 1, 76 | 2, 77 | 2, 78 | 1, 79 | 2, 80 | 2, 81 | 2, 82 | 2, 83 | 2, 84 | 1, 85 | 1, 86 | 2, 87 | 2, 88 | 1, 89 | 2, 90 | 2, 91 | 2, 92 | 2, 93 | 1, 94 | 2, 95 | 2, 96 | 2, 97 | 2, 98 | 2, 99 | 1, 100 | 2, 101 | 2, 102 | 2, 103 | 2, 104 | 1, 105 | ] 106 | 107 | # 计算总帧数 108 | total_frames = sum(word2phone) 109 | print(word_level_feature.shape) 110 | print(word2phone) 111 | phone_level_feature = [] 112 | for i in range(len(word2phone)): 113 | print(word_level_feature[i].shape) 114 | 115 | # 对每个词重复word2phone[i]次 116 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 117 | phone_level_feature.append(repeat_feature) 118 | 119 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 120 | print(phone_level_feature.shape) # torch.Size([36, 1024]) 121 | -------------------------------------------------------------------------------- /bert/deberta-v3-large/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: en 3 | tags: 4 | - deberta 5 | - deberta-v3 6 | - fill-mask 7 | thumbnail: https://huggingface.co/front/thumbnails/microsoft.png 8 | license: mit 9 | --- 10 | 11 | ## DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing 12 | 13 | [DeBERTa](https://arxiv.org/abs/2006.03654) improves the BERT and RoBERTa models using disentangled attention and enhanced mask decoder. With those two improvements, DeBERTa out perform RoBERTa on a majority of NLU tasks with 80GB training data. 14 | 15 | In [DeBERTa V3](https://arxiv.org/abs/2111.09543), we further improved the efficiency of DeBERTa using ELECTRA-Style pre-training with Gradient Disentangled Embedding Sharing. Compared to DeBERTa, our V3 version significantly improves the model performance on downstream tasks. You can find more technique details about the new model from our [paper](https://arxiv.org/abs/2111.09543). 16 | 17 | Please check the [official repository](https://github.com/microsoft/DeBERTa) for more implementation details and updates. 18 | 19 | The DeBERTa V3 large model comes with 24 layers and a hidden size of 1024. It has 304M backbone parameters with a vocabulary containing 128K tokens which introduces 131M parameters in the Embedding layer. This model was trained using the 160GB data as DeBERTa V2. 20 | 21 | 22 | #### Fine-tuning on NLU tasks 23 | 24 | We present the dev results on SQuAD 2.0 and MNLI tasks. 
25 | 26 | | Model |Vocabulary(K)|Backbone #Params(M)| SQuAD 2.0(F1/EM) | MNLI-m/mm(ACC)| 27 | |-------------------|----------|-------------------|-----------|----------| 28 | | RoBERTa-large |50 |304 | 89.4/86.5 | 90.2 | 29 | | XLNet-large |32 |- | 90.6/87.9 | 90.8 | 30 | | DeBERTa-large |50 |- | 90.7/88.0 | 91.3 | 31 | | **DeBERTa-v3-large**|128|304 | **91.5/89.0**| **91.8/91.9**| 32 | 33 | 34 | #### Fine-tuning with HF transformers 35 | 36 | ```bash 37 | #!/bin/bash 38 | 39 | cd transformers/examples/pytorch/text-classification/ 40 | 41 | pip install datasets 42 | export TASK_NAME=mnli 43 | 44 | output_dir="ds_results" 45 | 46 | num_gpus=8 47 | 48 | batch_size=8 49 | 50 | python -m torch.distributed.launch --nproc_per_node=${num_gpus} \ 51 | run_glue.py \ 52 | --model_name_or_path microsoft/deberta-v3-large \ 53 | --task_name $TASK_NAME \ 54 | --do_train \ 55 | --do_eval \ 56 | --evaluation_strategy steps \ 57 | --max_seq_length 256 \ 58 | --warmup_steps 50 \ 59 | --per_device_train_batch_size ${batch_size} \ 60 | --learning_rate 6e-6 \ 61 | --num_train_epochs 2 \ 62 | --output_dir $output_dir \ 63 | --overwrite_output_dir \ 64 | --logging_steps 1000 \ 65 | --logging_dir $output_dir 66 | 67 | ``` 68 | 69 | ### Citation 70 | 71 | If you find DeBERTa useful for your work, please cite the following papers: 72 | 73 | ``` latex 74 | @misc{he2021debertav3, 75 | title={DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing}, 76 | author={Pengcheng He and Jianfeng Gao and Weizhu Chen}, 77 | year={2021}, 78 | eprint={2111.09543}, 79 | archivePrefix={arXiv}, 80 | primaryClass={cs.CL} 81 | } 82 | ``` 83 | 84 | ``` latex 85 | @inproceedings{ 86 | he2021deberta, 87 | title={DEBERTA: DECODING-ENHANCED BERT WITH DISENTANGLED ATTENTION}, 88 | author={Pengcheng He and Xiaodong Liu and Jianfeng Gao and Weizhu Chen}, 89 | booktitle={International Conference on Learning Representations}, 90 | year={2021}, 91 | url={https://openreview.net/forum?id=XPZIaotutsD} 92 | } 93 | ``` 94 | -------------------------------------------------------------------------------- /oldVersion/V210/emo_gen.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from torch.utils.data import Dataset 6 | from torch.utils.data import Dataset 7 | from transformers import Wav2Vec2Processor 8 | from transformers.models.wav2vec2.modeling_wav2vec2 import ( 9 | Wav2Vec2Model, 10 | Wav2Vec2PreTrainedModel, 11 | ) 12 | 13 | from config import config 14 | 15 | 16 | class RegressionHead(nn.Module): 17 | r"""Classification head.""" 18 | 19 | def __init__(self, config): 20 | super().__init__() 21 | 22 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 23 | self.dropout = nn.Dropout(config.final_dropout) 24 | self.out_proj = nn.Linear(config.hidden_size, config.num_labels) 25 | 26 | def forward(self, features, **kwargs): 27 | x = features 28 | x = self.dropout(x) 29 | x = self.dense(x) 30 | x = torch.tanh(x) 31 | x = self.dropout(x) 32 | x = self.out_proj(x) 33 | 34 | return x 35 | 36 | 37 | class EmotionModel(Wav2Vec2PreTrainedModel): 38 | r"""Speech emotion classifier.""" 39 | 40 | def __init__(self, config): 41 | super().__init__(config) 42 | 43 | self.config = config 44 | self.wav2vec2 = Wav2Vec2Model(config) 45 | self.classifier = RegressionHead(config) 46 | self.init_weights() 47 | 48 | def forward( 49 | self, 50 | input_values, 51 | ): 52 | outputs = 
self.wav2vec2(input_values) 53 | hidden_states = outputs[0] 54 | hidden_states = torch.mean(hidden_states, dim=1) 55 | logits = self.classifier(hidden_states) 56 | 57 | return hidden_states, logits 58 | 59 | 60 | class AudioDataset(Dataset): 61 | def __init__(self, list_of_wav_files, sr, processor): 62 | self.list_of_wav_files = list_of_wav_files 63 | self.processor = processor 64 | self.sr = sr 65 | 66 | def __len__(self): 67 | return len(self.list_of_wav_files) 68 | 69 | def __getitem__(self, idx): 70 | wav_file = self.list_of_wav_files[idx] 71 | audio_data, _ = librosa.load(wav_file, sr=self.sr) 72 | processed_data = self.processor(audio_data, sampling_rate=self.sr)[ 73 | "input_values" 74 | ][0] 75 | return torch.from_numpy(processed_data) 76 | 77 | 78 | device = config.emo_gen_config.device 79 | model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim" 80 | processor = Wav2Vec2Processor.from_pretrained(model_name) 81 | model = EmotionModel.from_pretrained(model_name).to(device) 82 | 83 | 84 | def process_func( 85 | x: np.ndarray, 86 | sampling_rate: int, 87 | model: EmotionModel, 88 | processor: Wav2Vec2Processor, 89 | device: str, 90 | embeddings: bool = False, 91 | ) -> np.ndarray: 92 | r"""Predict emotions or extract embeddings from raw audio signal.""" 93 | model = model.to(device) 94 | y = processor(x, sampling_rate=sampling_rate) 95 | y = y["input_values"][0] 96 | y = torch.from_numpy(y).unsqueeze(0).to(device) 97 | 98 | # run through model 99 | with torch.no_grad(): 100 | y = model(y)[0 if embeddings else 1] 101 | 102 | # convert to numpy 103 | y = y.detach().cpu().numpy() 104 | 105 | return y 106 | 107 | 108 | def get_emo(path): 109 | wav, sr = librosa.load(path, 16000) 110 | return process_func( 111 | np.expand_dims(wav, 0).astype(np.float64), 112 | sr, 113 | model, 114 | processor, 115 | device, 116 | embeddings=True, 117 | ).squeeze(0) 118 | --------------------------------------------------------------------------------
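Editorial usage notes (sketches, not files from the repository):

The eight symbols.py variants above all build the shared tone table the same way; only num_ja_tones differs (1 in the V1.1.x files shown above, 2 from V2.0.0 on). A minimal standalone sketch of how language_tone_start_map is meant to be applied — to_global_tone is a hypothetical helper, and the constants simply mirror the V2.x values shown above:

```python
# Sketch only: mirrors the constants defined in the V2.x symbols.py files above.
num_zh_tones, num_ja_tones, num_en_tones = 6, 2, 4

language_tone_start_map = {
    "ZH": 0,                              # Chinese tones occupy slots 0-5
    "JP": num_zh_tones,                   # Japanese tones follow at offset 6
    "EN": num_zh_tones + num_ja_tones,    # English tones follow at offset 8
}
num_tones = num_zh_tones + num_ja_tones + num_en_tones  # 12 rows in the tone embedding


def to_global_tone(tone: int, lang: str) -> int:
    """Hypothetical helper: shift a per-language tone index into the shared table."""
    return language_tone_start_map[lang] + tone


assert to_global_tone(1, "JP") == 7
assert to_global_tone(3, "EN") == 11
assert all(to_global_tone(t, "ZH") < num_tones for t in range(num_zh_tones))
```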
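get_bert_feature in text/chinese_bert.py (and the V2.1.0 copy above) turns a normalized Chinese sentence into phone-level BERT features, optionally blended with a reference "style" sentence via style_weight. A hedged sketch, assuming it is run from the repository root with the chinese-roberta-wwm-ext-large weights downloaded into ./bert/ — the sentence and word2ph values are toy placeholders:

```python
# Sketch only. word2ph must have len(text) + 2 entries (CLS and SEP included),
# and its sum determines the number of phone columns in the returned feature.
from text.chinese_bert import get_bert_feature  # loads the tokenizer on import

text = "你好"
word2ph = [1, 2, 2, 1]                      # 4 == len(text) + 2, sums to 6 phones

feat = get_bert_feature(text, word2ph, device="cpu")
print(feat.shape)                           # torch.Size([1024, 6])

# Optional style mixing: with style_weight=0.7, 70% of each phone feature comes
# from the mean embedding of the style sentence and 30% from the target text.
styled = get_bert_feature(
    text,
    word2ph,
    device="cpu",
    style_text="今天天气不错",
    style_weight=0.7,
)
```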
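Finally, oldVersion/V210/emo_gen.py exposes get_emo for pulling a wav2vec2 emotion embedding from a wav file (the module loads the checkpoint at import time). A sketch under the assumption that the wav2vec2-large-robust-12-ft-emotion-msp-dim checkpoint is present under ./emotional/ and that example.wav is any speech clip on disk — both paths are placeholders:

```python
# Sketch only; run from the repository root so `from config import config` resolves.
import librosa
import numpy as np

from oldVersion.V210.emo_gen import device, get_emo, model, process_func, processor

emb = get_emo("example.wav")   # hidden-state embedding (embeddings=True path)
print(emb.shape)               # (1024,)

# The same model also yields its three emotion-regression outputs when
# process_func is called with embeddings=False.
wav, sr = librosa.load("example.wav", sr=16000)
logits = process_func(
    np.expand_dims(wav, 0).astype(np.float64),
    sr,
    model,
    processor,
    device,
    embeddings=False,
)
print(logits.shape)            # (1, 3)
```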