├── .gitmodules
├── oldVersion
│   ├── V111
│   │   └── text
│   │       ├── fix
│   │       │   ├── __init__.py
│   │       │   └── japanese_bert.py
│   │       ├── english_bert_mock.py
│   │       ├── japanese_bert.py
│   │       ├── cleaner.py
│   │       ├── __init__.py
│   │       ├── chinese_bert.py
│   │       └── symbols.py
│   ├── __init__.py
│   ├── V200
│   │   ├── text
│   │   │   ├── cmudict_cache.pickle
│   │   │   ├── bert_utils.py
│   │   │   ├── cleaner.py
│   │   │   ├── english_bert_mock.py
│   │   │   ├── __init__.py
│   │   │   ├── japanese_bert.py
│   │   │   ├── chinese_bert.py
│   │   │   └── symbols.py
│   │   └── __init__.py
│   ├── V210
│   │   ├── text
│   │   │   ├── cmudict_cache.pickle
│   │   │   ├── bert_utils.py
│   │   │   ├── cleaner.py
│   │   │   ├── __init__.py
│   │   │   ├── english_bert_mock.py
│   │   │   ├── japanese_bert.py
│   │   │   ├── symbols.py
│   │   │   └── chinese_bert.py
│   │   └── emo_gen.py
│   ├── V220
│   │   ├── text
│   │   │   ├── cmudict_cache.pickle
│   │   │   ├── bert_utils.py
│   │   │   ├── cleaner.py
│   │   │   ├── __init__.py
│   │   │   ├── english_bert_mock.py
│   │   │   ├── japanese_bert.py
│   │   │   ├── symbols.py
│   │   │   └── chinese_bert.py
│   │   ├── clap_wrapper.py
│   │   └── clap_gen.py
│   ├── V101
│   │   ├── text
│   │   │   ├── english_bert_mock.py
│   │   │   ├── cleaner.py
│   │   │   ├── __init__.py
│   │   │   ├── chinese_bert.py
│   │   │   ├── symbols.py
│   │   │   └── japanese.py
│   │   └── __init__.py
│   └── V110
│       ├── text
│       │   ├── english_bert_mock.py
│       │   ├── cleaner.py
│       │   ├── __init__.py
│       │   ├── japanese_bert.py
│       │   ├── chinese_bert.py
│       │   └── symbols.py
│       └── __init__.py
├── tools
│   ├── __init__.py
│   ├── log.py
│   └── translate.py
├── bert
│   ├── chinese-roberta-wwm-ext-large
│   │   ├── added_tokens.json
│   │   ├── tokenizer_config.json
│   │   ├── special_tokens_map.json
│   │   ├── .gitattributes
│   │   ├── config.json
│   │   └── README.md
│   ├── deberta-v3-large
│   │   ├── tokenizer_config.json
│   │   ├── generator_config.json
│   │   ├── config.json
│   │   ├── .gitattributes
│   │   └── README.md
│   ├── deberta-v2-large-japanese-char-wwm
│   │   ├── special_tokens_map.json
│   │   ├── tokenizer_config.json
│   │   ├── config.json
│   │   └── .gitattributes
│   ├── deberta-v2-large-japanese
│   │   ├── special_tokens_map.json
│   │   ├── tokenizer_config.json
│   │   ├── config.json
│   │   └── .gitattributes
│   ├── bert-base-japanese-v3
│   │   ├── tokenizer_config.json
│   │   ├── config.json
│   │   ├── .gitattributes
│   │   └── README.md
│   ├── bert-large-japanese-v2
│   │   ├── tokenizer_config.json
│   │   ├── config.json
│   │   ├── .gitattributes
│   │   └── README.md
│   └── bert_models.json
├── onnx_modules
│   ├── V200
│   │   ├── text
│   │   │   ├── __init__.py
│   │   │   ├── bert_utils.py
│   │   │   ├── cleaner.py
│   │   │   ├── english_bert_mock.py
│   │   │   ├── japanese_bert.py
│   │   │   ├── chinese_bert.py
│   │   │   └── symbols.py
│   │   └── __init__.py
│   ├── V210
│   │   ├── text
│   │   │   ├── __init__.py
│   │   │   └── symbols.py
│   │   └── __init__.py
│   ├── V220
│   │   ├── text
│   │   │   ├── __init__.py
│   │   │   └── symbols.py
│   │   └── __init__.py
│   ├── V230
│   │   ├── text
│   │   │   ├── __init__.py
│   │   │   └── symbols.py
│   │   └── __init__.py
│   ├── V220_novq_dev
│   │   ├── text
│   │   │   ├── __init__.py
│   │   │   └── symbols.py
│   │   └── __init__.py
│   └── __init__.py
├── emotional
│   ├── wav2vec2-large-robust-12-ft-emotion-msp-dim
│   │   ├── vocab.json
│   │   ├── preprocessor_config.json
│   │   ├── .gitattributes
│   │   └── config.json
│   └── clap-htsat-fused
│       ├── special_tokens_map.json
│       ├── tokenizer_config.json
│       ├── preprocessor_config.json
│       └── .gitattributes
├── img
│   ├── 宵宫.png
│   ├── 纳西妲.png
│   ├── yuyu.png
│   ├── 参数说明.png
│   ├── 神里绫华.png
│   └── 微信图片_20231010105112.png
├── text
│   ├── cmudict_cache.pickle
│   ├── bert_utils.py
│   ├── cleaner.py
│   ├── __init__.py
│   ├── english_bert_mock.py
│   ├── japanese_bert.py
│   ├── symbols.py
│   └── chinese_bert.py
├── filelists
│   └── sample.list
├── slm
│   └── wavlm-base-plus
│       ├── preprocessor_config.json
│       ├── .gitattributes
│       └── config.json
├── css
│   └── custom.css
├── requirements.txt
├── export_onnx.py
├── monotonic_align
│   ├── __init__.py
│   └── core.py
├── .pre-commit-config.yaml
├── run_MnodesAndMgpus.sh
├── .github
│   └── workflows
│       ├── pull_format.yml
│       └── push_format.yml
├── onnx_infer.py
├── resample_legacy.py
├── README.md
├── resample.py
├── whisper_transcribe.py
├── re_matching.py
├── compress_model.py
├── spec_gen.py
├── bert_gen.py
└── update_status.py

/.gitmodules:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/oldVersion/V111/text/fix/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility package
3 | """
4 | 
--------------------------------------------------------------------------------
/oldVersion/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Inference compatibility for old model versions
3 | """
4 | 
--------------------------------------------------------------------------------
/bert/chinese-roberta-wwm-ext-large/added_tokens.json:
--------------------------------------------------------------------------------
1 | {}
2 | 
--------------------------------------------------------------------------------
/onnx_modules/V200/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 | 
--------------------------------------------------------------------------------
/onnx_modules/V210/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 | 
--------------------------------------------------------------------------------
/onnx_modules/V220/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 | 
--------------------------------------------------------------------------------
/onnx_modules/V230/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 | 
--------------------------------------------------------------------------------
/emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/vocab.json:
--------------------------------------------------------------------------------
1 | {}
2 | 
--------------------------------------------------------------------------------
/onnx_modules/V220_novq_dev/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 | 
--------------------------------------------------------------------------------
/bert/chinese-roberta-wwm-ext-large/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {"init_inputs": []}
2 | 
--------------------------------------------------------------------------------
/img/宵宫.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrisKimZHT/Bert-VITS2/master/img/宵宫.png
--------------------------------------------------------------------------------
/img/纳西妲.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrisKimZHT/Bert-VITS2/master/img/纳西妲.png
--------------------------------------------------------------------------------
/img/yuyu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrisKimZHT/Bert-VITS2/master/img/yuyu.png -------------------------------------------------------------------------------- /img/参数说明.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrisKimZHT/Bert-VITS2/master/img/参数说明.png -------------------------------------------------------------------------------- /img/神里绫华.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrisKimZHT/Bert-VITS2/master/img/神里绫华.png -------------------------------------------------------------------------------- /bert/deberta-v3-large/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "do_lower_case": false, 3 | "vocab_type": "spm" 4 | } 5 | -------------------------------------------------------------------------------- /text/cmudict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrisKimZHT/Bert-VITS2/master/text/cmudict_cache.pickle -------------------------------------------------------------------------------- /filelists/sample.list: -------------------------------------------------------------------------------- 1 | Example: 2 | {wav_path}|{speaker_name}|{language}|{text} 3 | 派蒙_1.wav|派蒙|ZH|前面的区域,以后再来探索吧! 4 | -------------------------------------------------------------------------------- /img/微信图片_20231010105112.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrisKimZHT/Bert-VITS2/master/img/微信图片_20231010105112.png -------------------------------------------------------------------------------- /oldVersion/V200/text/cmudict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrisKimZHT/Bert-VITS2/master/oldVersion/V200/text/cmudict_cache.pickle -------------------------------------------------------------------------------- /oldVersion/V210/text/cmudict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrisKimZHT/Bert-VITS2/master/oldVersion/V210/text/cmudict_cache.pickle -------------------------------------------------------------------------------- /oldVersion/V220/text/cmudict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrisKimZHT/Bert-VITS2/master/oldVersion/V220/text/cmudict_cache.pickle -------------------------------------------------------------------------------- /oldVersion/V101/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_bert_feature(norm_text, word2ph): 5 | return torch.zeros(1024, sum(word2ph)) 6 | -------------------------------------------------------------------------------- /oldVersion/V110/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_bert_feature(norm_text, word2ph): 5 | return torch.zeros(1024, sum(word2ph)) 6 | -------------------------------------------------------------------------------- /oldVersion/V111/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def 
get_bert_feature(norm_text, word2ph): 5 | return torch.zeros(1024, sum(word2ph)) 6 | -------------------------------------------------------------------------------- /onnx_modules/V200/__init__.py: -------------------------------------------------------------------------------- 1 | from .text.symbols import symbols 2 | from .models_onnx import SynthesizerTrn 3 | 4 | __all__ = ["symbols", "SynthesizerTrn"] 5 | -------------------------------------------------------------------------------- /onnx_modules/V210/__init__.py: -------------------------------------------------------------------------------- 1 | from .text.symbols import symbols 2 | from .models_onnx import SynthesizerTrn 3 | 4 | __all__ = ["symbols", "SynthesizerTrn"] 5 | -------------------------------------------------------------------------------- /onnx_modules/V220/__init__.py: -------------------------------------------------------------------------------- 1 | from .text.symbols import symbols 2 | from .models_onnx import SynthesizerTrn 3 | 4 | __all__ = ["symbols", "SynthesizerTrn"] 5 | -------------------------------------------------------------------------------- /onnx_modules/V230/__init__.py: -------------------------------------------------------------------------------- 1 | from .text.symbols import symbols 2 | from .models_onnx import SynthesizerTrn 3 | 4 | __all__ = ["symbols", "SynthesizerTrn"] 5 | -------------------------------------------------------------------------------- /onnx_modules/V220_novq_dev/__init__.py: -------------------------------------------------------------------------------- 1 | from .text.symbols import symbols 2 | from .models_onnx import SynthesizerTrn 3 | 4 | __all__ = ["symbols", "SynthesizerTrn"] 5 | -------------------------------------------------------------------------------- /bert/chinese-roberta-wwm-ext-large/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"} 2 | -------------------------------------------------------------------------------- /bert/deberta-v2-large-japanese-char-wwm/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "mask_token": "[MASK]", 4 | "pad_token": "[PAD]", 5 | "sep_token": "[SEP]", 6 | "unk_token": "[UNK]" 7 | } 8 | -------------------------------------------------------------------------------- /bert/deberta-v2-large-japanese/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": "[CLS]", 3 | "cls_token": "[CLS]", 4 | "eos_token": "[SEP]", 5 | "mask_token": "[MASK]", 6 | "pad_token": "[PAD]", 7 | "sep_token": "[SEP]", 8 | "unk_token": "[UNK]" 9 | } 10 | -------------------------------------------------------------------------------- /slm/wavlm-base-plus/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "do_normalize": false, 3 | "feature_extractor_type": "Wav2Vec2FeatureExtractor", 4 | "feature_size": 1, 5 | "padding_side": "right", 6 | "padding_value": 0.0, 7 | "return_attention_mask": true, 8 | "sampling_rate": 16000 9 | } 10 | -------------------------------------------------------------------------------- /emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/preprocessor_config.json: 
--------------------------------------------------------------------------------
1 | {
2 |   "do_normalize": true,
3 |   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4 |   "feature_size": 1,
5 |   "padding_side": "right",
6 |   "padding_value": 0.0,
7 |   "return_attention_mask": true,
8 |   "sampling_rate": 16000
9 | }
10 | 
--------------------------------------------------------------------------------
/css/custom.css:
--------------------------------------------------------------------------------
1 | 
2 | #yml_code {
3 |   height: 600px;
4 |   flex-grow: inherit;
5 |   overflow-y: auto;
6 | }
7 | 
8 | #json_code {
9 |   height: 600px;
10 |   flex-grow: inherit;
11 |   overflow-y: auto;
12 | }
13 | 
14 | #gpu_code {
15 |   height: 300px;
16 |   flex-grow: inherit;
17 |   overflow-y: auto;
18 | }
19 | 
--------------------------------------------------------------------------------
/bert/bert-base-japanese-v3/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "tokenizer_class": "BertJapaneseTokenizer",
3 |   "model_max_length": 512,
4 |   "do_lower_case": false,
5 |   "word_tokenizer_type": "mecab",
6 |   "subword_tokenizer_type": "wordpiece",
7 |   "mecab_kwargs": {
8 |     "mecab_dic": "unidic_lite"
9 |   }
10 | }
11 | 
--------------------------------------------------------------------------------
/bert/bert-large-japanese-v2/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "tokenizer_class": "BertJapaneseTokenizer",
3 |   "model_max_length": 512,
4 |   "do_lower_case": false,
5 |   "word_tokenizer_type": "mecab",
6 |   "subword_tokenizer_type": "wordpiece",
7 |   "mecab_kwargs": {
8 |     "mecab_dic": "unidic_lite"
9 |   }
10 | }
11 | 
--------------------------------------------------------------------------------
/tools/log.py:
--------------------------------------------------------------------------------
1 | """
2 | Logger wrapper
3 | """
4 | from loguru import logger
5 | import sys
6 | 
7 | 
8 | # Remove all default handlers
9 | logger.remove()
10 | 
11 | # Custom format, added to standard output
12 | log_format = (
13 |     "{time:MM-DD HH:mm:ss} {level:<9}| {file}:{line} | {message}"
14 | )
15 | 
16 | logger.add(sys.stdout, format=log_format, backtrace=True, diagnose=True)
17 | 
--------------------------------------------------------------------------------
/emotional/clap-htsat-fused/special_tokens_map.json:
--------------------------------------------------------------------------------
1 | {
2 |   "bos_token": "<s>",
3 |   "cls_token": "<s>",
4 |   "eos_token": "</s>",
5 |   "mask_token": {
6 |     "content": "<mask>",
7 |     "lstrip": true,
8 |     "normalized": false,
9 |     "rstrip": false,
10 |     "single_word": false
11 |   },
12 |   "pad_token": "<pad>",
13 |   "sep_token": "</s>",
14 |   "unk_token": "<unk>"
15 | }
16 | 
--------------------------------------------------------------------------------
/bert/deberta-v2-large-japanese/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "bos_token": "[CLS]",
3 |   "cls_token": "[CLS]",
4 |   "do_lower_case": false,
5 |   "eos_token": "[SEP]",
6 |   "keep_accents": true,
7 |   "mask_token": "[MASK]",
8 |   "pad_token": "[PAD]",
9 |   "sep_token": "[SEP]",
10 |   "sp_model_kwargs": {},
11 |   "special_tokens_map_file": null,
12 |   "split_by_punct": false,
13 |   "tokenizer_class": "DebertaV2Tokenizer",
14 |   "unk_token": "[UNK]"
15 | }
16 | 
--------------------------------------------------------------------------------
/bert/chinese-roberta-wwm-ext-large/.gitattributes:
--------------------------------------------------------------------------------
1 | *.bin.* filter=lfs diff=lfs merge=lfs -text
2 | *.lfs.* filter=lfs diff=lfs merge=lfs -text
3 | *.bin filter=lfs diff=lfs merge=lfs -text
4 | *.h5 filter=lfs diff=lfs merge=lfs -text
5 | *.tflite filter=lfs diff=lfs merge=lfs -text
6 | *.tar.gz filter=lfs diff=lfs merge=lfs -text
7 | *.ot filter=lfs diff=lfs merge=lfs -text
8 | *.onnx filter=lfs diff=lfs merge=lfs -text
9 | *.msgpack filter=lfs diff=lfs merge=lfs -text
10 | 
--------------------------------------------------------------------------------
/emotional/clap-htsat-fused/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "add_prefix_space": false,
3 |   "bos_token": "<s>",
4 |   "cls_token": "<s>",
5 |   "eos_token": "</s>",
6 |   "errors": "replace",
7 |   "mask_token": "<mask>",
8 |   "model_max_length": 512,
9 |   "pad_token": "<pad>",
10 |   "processor_class": "ClapProcessor",
11 |   "sep_token": "</s>",
12 |   "special_tokens_map_file": null,
13 |   "tokenizer_class": "RobertaTokenizer",
14 |   "trim_offsets": true,
15 |   "unk_token": "<unk>"
16 | }
17 | 
--------------------------------------------------------------------------------
/bert/bert_models.json:
--------------------------------------------------------------------------------
1 | {
2 |   "deberta-v2-large-japanese-char-wwm": {
3 |     "repo_id": "ku-nlp/deberta-v2-large-japanese-char-wwm",
4 |     "files": ["pytorch_model.bin"]
5 |   },
6 |   "chinese-roberta-wwm-ext-large": {
7 |     "repo_id": "hfl/chinese-roberta-wwm-ext-large",
8 |     "files": ["pytorch_model.bin"]
9 |   },
10 |   "deberta-v3-large": {
11 |     "repo_id": "microsoft/deberta-v3-large",
12 |     "files": ["spm.model", "pytorch_model.bin"]
13 |   }
14 | }
15 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | librosa==0.9.2
2 | matplotlib
3 | numpy
4 | numba
5 | phonemizer
6 | scipy
7 | tensorboard
8 | Unidecode
9 | amfm_decompy
10 | jieba
11 | transformers
12 | pypinyin
13 | cn2an
14 | gradio==3.50.2
15 | av
16 | mecab-python3
17 | loguru
18 | unidic-lite
19 | cmudict
20 | fugashi
21 | num2words
22 | PyYAML
23 | requests
24 | pyopenjtalk-prebuilt
25 | jaconv
26 | psutil
27 | GPUtil
28 | vector_quantize_pytorch
29 | g2p_en
30 | sentencepiece
31 | pykakasi
32 | langid
33 | WeTextProcessing>=0.1.10
34 | 
--------------------------------------------------------------------------------
/export_onnx.py:
--------------------------------------------------------------------------------
1 | from onnx_modules import export_onnx
2 | import os
3 | 
4 | if __name__ == "__main__":
5 |     export_path = "BertVits2.2PT"
6 |     model_path = "model\\G_0.pth"
7 |     config_path = "model\\config.json"
8 |     novq = False
9 |     dev = False
10 |     if not os.path.exists("onnx"):
11 |         os.makedirs("onnx")
12 |     if not os.path.exists(f"onnx/{export_path}"):
13 |         os.makedirs(f"onnx/{export_path}")
14 |     export_onnx(export_path, model_path, config_path, novq, dev)
15 | 
--------------------------------------------------------------------------------
/bert/bert-base-japanese-v3/config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "architectures": [
3 |     "BertForPreTraining"
4 |   ],
5 |   "attention_probs_dropout_prob": 0.1,
6 |   "hidden_act": "gelu",
7 |   "hidden_dropout_prob": 0.1,
8 |   "hidden_size": 768,
9 |   "initializer_range": 0.02,
10 |   "intermediate_size": 3072,
11 |   "layer_norm_eps": 1e-12,
12 | 
"max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 32768 19 | } 20 | -------------------------------------------------------------------------------- /bert/bert-large-japanese-v2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForPreTraining" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 1024, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 4096, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 16, 15 | "num_hidden_layers": 24, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 32768 19 | } 20 | -------------------------------------------------------------------------------- /monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | from numpy import zeros, int32, float32 2 | from torch import from_numpy 3 | 4 | from .core import maximum_path_jit 5 | 6 | 7 | def maximum_path(neg_cent, mask): 8 | device = neg_cent.device 9 | dtype = neg_cent.dtype 10 | neg_cent = neg_cent.data.cpu().numpy().astype(float32) 11 | path = zeros(neg_cent.shape, dtype=int32) 12 | 13 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(int32) 14 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(int32) 15 | maximum_path_jit(path, neg_cent, t_t_max, t_s_max) 16 | return from_numpy(path).to(device=device, dtype=dtype) 17 | -------------------------------------------------------------------------------- /bert/deberta-v2-large-japanese-char-wwm/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "do_lower_case": false, 4 | "do_subword_tokenize": true, 5 | "do_word_tokenize": true, 6 | "jumanpp_kwargs": null, 7 | "mask_token": "[MASK]", 8 | "mecab_kwargs": null, 9 | "model_max_length": 1000000000000000019884624838656, 10 | "never_split": null, 11 | "pad_token": "[PAD]", 12 | "sep_token": "[SEP]", 13 | "special_tokens_map_file": null, 14 | "subword_tokenizer_type": "character", 15 | "sudachi_kwargs": null, 16 | "tokenizer_class": "BertJapaneseTokenizer", 17 | "unk_token": "[UNK]", 18 | "word_tokenizer_type": "basic" 19 | } 20 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | 9 | - repo: https://github.com/astral-sh/ruff-pre-commit 10 | rev: v0.1.11 11 | hooks: 12 | - id: ruff 13 | args: [ --fix ] 14 | 15 | - repo: https://github.com/psf/black 16 | rev: 23.12.1 17 | hooks: 18 | - id: black 19 | 20 | - repo: https://github.com/codespell-project/codespell 21 | rev: v2.2.6 22 | hooks: 23 | - id: codespell 24 | files: ^.*\.(py|md|rst|yml)$ 25 | args: [-L=fro] 26 | -------------------------------------------------------------------------------- /emotional/clap-htsat-fused/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "chunk_length_s": 10, 3 | "feature_extractor_type": "ClapFeatureExtractor", 4 | 
"feature_size": 64, 5 | "fft_window_size": 1024, 6 | "frequency_max": 14000, 7 | "frequency_min": 50, 8 | "hop_length": 480, 9 | "max_length_s": 10, 10 | "n_fft": 1024, 11 | "nb_frequency_bins": 513, 12 | "nb_max_frames": 1000, 13 | "nb_max_samples": 480000, 14 | "padding": "repeatpad", 15 | "padding_side": "right", 16 | "padding_value": 0.0, 17 | "processor_class": "ClapProcessor", 18 | "return_attention_mask": false, 19 | "sampling_rate": 48000, 20 | "top_db": null, 21 | "truncation": "fusion" 22 | } 23 | -------------------------------------------------------------------------------- /bert/deberta-v3-large/generator_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "deberta-v2", 3 | "attention_probs_dropout_prob": 0.1, 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.1, 6 | "hidden_size": 1024, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 4096, 9 | "max_position_embeddings": 512, 10 | "relative_attention": true, 11 | "position_buckets": 256, 12 | "norm_rel_ebd": "layer_norm", 13 | "share_att_key": true, 14 | "pos_att_type": "p2c|c2p", 15 | "layer_norm_eps": 1e-7, 16 | "max_relative_positions": -1, 17 | "position_biased_input": false, 18 | "num_attention_heads": 16, 19 | "num_hidden_layers": 12, 20 | "type_vocab_size": 0, 21 | "vocab_size": 128100 22 | } 23 | -------------------------------------------------------------------------------- /bert/deberta-v3-large/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "deberta-v2", 3 | "attention_probs_dropout_prob": 0.1, 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.1, 6 | "hidden_size": 1024, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 4096, 9 | "max_position_embeddings": 512, 10 | "relative_attention": true, 11 | "position_buckets": 256, 12 | "norm_rel_ebd": "layer_norm", 13 | "share_att_key": true, 14 | "pos_att_type": "p2c|c2p", 15 | "layer_norm_eps": 1e-7, 16 | "max_relative_positions": -1, 17 | "position_biased_input": false, 18 | "num_attention_heads": 16, 19 | "num_hidden_layers": 24, 20 | "type_vocab_size": 0, 21 | "vocab_size": 128100 22 | } 23 | -------------------------------------------------------------------------------- /text/bert_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from huggingface_hub import hf_hub_download 4 | 5 | from config import config 6 | 7 | 8 | MIRROR: str = config.mirror 9 | 10 | 11 | def _check_bert(repo_id, files, local_path): 12 | for file in files: 13 | if not Path(local_path).joinpath(file).exists(): 14 | if MIRROR.lower() == "openi": 15 | import openi 16 | 17 | openi.model.download_model( 18 | "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert" 19 | ) 20 | else: 21 | hf_hub_download( 22 | repo_id, file, local_dir=local_path, local_dir_use_symlinks=False 23 | ) 24 | -------------------------------------------------------------------------------- /oldVersion/V200/text/bert_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from huggingface_hub import hf_hub_download 4 | 5 | from config import config 6 | 7 | 8 | MIRROR: str = config.mirror 9 | 10 | 11 | def _check_bert(repo_id, files, local_path): 12 | for file in files: 13 | if not Path(local_path).joinpath(file).exists(): 14 | if MIRROR.lower() == "openi": 15 | import openi 16 | 17 | openi.model.download_model( 18 | 
"Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert" 19 | ) 20 | else: 21 | hf_hub_download( 22 | repo_id, file, local_dir=local_path, local_dir_use_symlinks=False 23 | ) 24 | -------------------------------------------------------------------------------- /oldVersion/V210/text/bert_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from huggingface_hub import hf_hub_download 4 | 5 | from config import config 6 | 7 | 8 | MIRROR: str = config.mirror 9 | 10 | 11 | def _check_bert(repo_id, files, local_path): 12 | for file in files: 13 | if not Path(local_path).joinpath(file).exists(): 14 | if MIRROR.lower() == "openi": 15 | import openi 16 | 17 | openi.model.download_model( 18 | "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert" 19 | ) 20 | else: 21 | hf_hub_download( 22 | repo_id, file, local_dir=local_path, local_dir_use_symlinks=False 23 | ) 24 | -------------------------------------------------------------------------------- /oldVersion/V220/text/bert_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from huggingface_hub import hf_hub_download 4 | 5 | from config import config 6 | 7 | 8 | MIRROR: str = config.mirror 9 | 10 | 11 | def _check_bert(repo_id, files, local_path): 12 | for file in files: 13 | if not Path(local_path).joinpath(file).exists(): 14 | if MIRROR.lower() == "openi": 15 | import openi 16 | 17 | openi.model.download_model( 18 | "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert" 19 | ) 20 | else: 21 | hf_hub_download( 22 | repo_id, file, local_dir=local_path, local_dir_use_symlinks=False 23 | ) 24 | -------------------------------------------------------------------------------- /onnx_modules/V200/text/bert_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from huggingface_hub import hf_hub_download 4 | 5 | from config import config 6 | 7 | 8 | MIRROR: str = config.mirror 9 | 10 | 11 | def _check_bert(repo_id, files, local_path): 12 | for file in files: 13 | if not Path(local_path).joinpath(file).exists(): 14 | if MIRROR.lower() == "openi": 15 | import openi 16 | 17 | openi.model.download_model( 18 | "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert" 19 | ) 20 | else: 21 | hf_hub_download( 22 | repo_id, file, local_dir=local_path, local_dir_use_symlinks=False 23 | ) 24 | -------------------------------------------------------------------------------- /run_MnodesAndMgpus.sh: -------------------------------------------------------------------------------- 1 | #多机多卡训练 2 | 3 | #--nnodes=1:3 表示 使用一到三台机器 弹性分配资源 4 | #--nnodes=<最小节点数>:<最大节点数> 5 | #--nproc_per_node=每台机器上可用的GPU数 6 | #--rdzv_endpoint=主节点(最先启动的)ip:端口号 7 | #其他不需要变 8 | 9 | #注意: 此版本的分布式训练是基于数据并行的,多机多卡相当于开更大的batchsize,此时epoch迭代速度会增加, 10 | #但由于 该版本的代码中 保存模型是按照global step来计算的,所以会出现的效果就是 : 保存模型的时间不会有明显加速, 11 | #但每次保存模型时epoch都比之前迭代了更多次,也就是 “更少的步数,实现更好的效果” 12 | 13 | #************************* 14 | # torchrun \ 15 | # --nnodes=1:3\ 16 | # --nproc_per_node=2\ 17 | # --rdzv_id=1\ 18 | # --rdzv_backend=c10d\ 19 | # --rdzv_endpoint="inspur1:8880"\ 20 | # train_ms.py 21 | #**************************** 22 | 23 | #多卡训练 24 | #nproc_per_node = 机器上可用的GPU数 25 | 26 | #************************* 27 | torchrun \ 28 | --nnodes=1\ 29 | --nproc_per_node=2\ 30 | train_ms.py 31 | #************************* 32 | 
-------------------------------------------------------------------------------- /bert/chinese-roberta-wwm-ext-large/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "bos_token_id": 0, 7 | "directionality": "bidi", 8 | "eos_token_id": 2, 9 | "hidden_act": "gelu", 10 | "hidden_dropout_prob": 0.1, 11 | "hidden_size": 1024, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 4096, 14 | "layer_norm_eps": 1e-12, 15 | "max_position_embeddings": 512, 16 | "model_type": "bert", 17 | "num_attention_heads": 16, 18 | "num_hidden_layers": 24, 19 | "output_past": true, 20 | "pad_token_id": 0, 21 | "pooler_fc_size": 768, 22 | "pooler_num_attention_heads": 12, 23 | "pooler_num_fc_layers": 3, 24 | "pooler_size_per_head": 128, 25 | "pooler_type": "first_token_transform", 26 | "type_vocab_size": 2, 27 | "vocab_size": 21128 28 | } 29 | -------------------------------------------------------------------------------- /oldVersion/V101/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . import chinese, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /oldVersion/V110/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . 
import chinese, japanese, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese, "JP": japanese} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /text/cleaner.py: -------------------------------------------------------------------------------- 1 | from text import chinese, japanese, english, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /oldVersion/V200/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . import chinese, japanese, english, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /oldVersion/V210/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . 
import chinese, japanese, english, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /oldVersion/V220/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . import chinese, japanese, english, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /onnx_modules/V200/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . 
import chinese, japanese, english, cleaned_text_to_sequence 2 | 3 | 4 | language_module_map = {"ZH": chinese, "JP": japanese, "EN": english} 5 | 6 | 7 | def clean_text(text, language): 8 | language_module = language_module_map[language] 9 | norm_text = language_module.text_normalize(text) 10 | phones, tones, word2ph = language_module.g2p(norm_text) 11 | return norm_text, phones, tones, word2ph 12 | 13 | 14 | def clean_text_bert(text, language): 15 | language_module = language_module_map[language] 16 | norm_text = language_module.text_normalize(text) 17 | phones, tones, word2ph = language_module.g2p(norm_text) 18 | bert = language_module.get_bert_feature(norm_text, word2ph) 19 | return phones, tones, bert 20 | 21 | 22 | def text_to_sequence(text, language): 23 | norm_text, phones, tones, word2ph = clean_text(text, language) 24 | return cleaned_text_to_sequence(phones, tones, language) 25 | 26 | 27 | if __name__ == "__main__": 28 | pass 29 | -------------------------------------------------------------------------------- /oldVersion/V101/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | 4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 5 | 6 | 7 | def cleaned_text_to_sequence(cleaned_text, tones, language): 8 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 9 | Args: 10 | text: string to convert to a sequence 11 | Returns: 12 | List of integers corresponding to the symbols in the text 13 | """ 14 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 15 | tone_start = language_tone_start_map[language] 16 | tones = [i + tone_start for i in tones] 17 | lang_id = language_id_map[language] 18 | lang_ids = [lang_id for i in phones] 19 | return phones, tones, lang_ids 20 | 21 | 22 | def get_bert(norm_text, word2ph, language): 23 | from .chinese_bert import get_bert_feature as zh_bert 24 | from .english_bert_mock import get_bert_feature as en_bert 25 | 26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert} 27 | bert = lang_bert_func_map[language](norm_text, word2ph) 28 | return bert 29 | -------------------------------------------------------------------------------- /bert/deberta-v2-large-japanese-char-wwm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "DebertaV2ForMaskedLM" 4 | ], 5 | "attention_head_size": 64, 6 | "attention_probs_dropout_prob": 0.1, 7 | "conv_act": "gelu", 8 | "conv_kernel_size": 3, 9 | "hidden_act": "gelu", 10 | "hidden_dropout_prob": 0.1, 11 | "hidden_size": 1024, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 4096, 14 | "layer_norm_eps": 1e-07, 15 | "max_position_embeddings": 512, 16 | "max_relative_positions": -1, 17 | "model_type": "deberta-v2", 18 | "norm_rel_ebd": "layer_norm", 19 | "num_attention_heads": 16, 20 | "num_hidden_layers": 24, 21 | "pad_token_id": 0, 22 | "pooler_dropout": 0, 23 | "pooler_hidden_act": "gelu", 24 | "pooler_hidden_size": 1024, 25 | "pos_att_type": [ 26 | "p2c", 27 | "c2p" 28 | ], 29 | "position_biased_input": false, 30 | "position_buckets": 256, 31 | "relative_attention": true, 32 | "share_att_key": true, 33 | "torch_dtype": "float16", 34 | "transformers_version": "4.25.1", 35 | "type_vocab_size": 0, 36 | "vocab_size": 22012 37 | } 38 | -------------------------------------------------------------------------------- /.github/workflows/pull_format.yml: 
-------------------------------------------------------------------------------- 1 | name: pull format 2 | 3 | on: [pull_request] 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | pull_format: 10 | runs-on: ${{ matrix.os }} 11 | 12 | strategy: 13 | matrix: 14 | python-version: ["3.10"] 15 | os: [ubuntu-latest] 16 | fail-fast: false 17 | 18 | continue-on-error: true 19 | 20 | steps: 21 | - name: checkout 22 | continue-on-error: true 23 | uses: actions/checkout@v3 24 | with: 25 | ref: ${{ github.head_ref }} 26 | fetch-depth: 0 27 | 28 | - name: Set up Python ${{ matrix.python-version }} 29 | uses: actions/setup-python@v4 30 | with: 31 | python-version: ${{ matrix.python-version }} 32 | 33 | - name: Install Black 34 | run: pip install "black[jupyter]" 35 | 36 | - name: Run Black 37 | # run: black $(git ls-files '*.py') 38 | run: black . 39 | 40 | - name: Commit Back 41 | uses: stefanzweifel/git-auto-commit-action@v4 42 | with: 43 | commit_message: Apply Code Formatter Change 44 | -------------------------------------------------------------------------------- /bert/deberta-v2-large-japanese/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "configs/deberta_v2_large.json", 3 | "architectures": [ 4 | "DebertaV2ForMaskedLM" 5 | ], 6 | "attention_head_size": 64, 7 | "attention_probs_dropout_prob": 0.1, 8 | "conv_act": "gelu", 9 | "conv_kernel_size": 3, 10 | "hidden_act": "gelu", 11 | "hidden_dropout_prob": 0.1, 12 | "hidden_size": 1024, 13 | "initializer_range": 0.02, 14 | "intermediate_size": 4096, 15 | "layer_norm_eps": 1e-07, 16 | "max_position_embeddings": 512, 17 | "max_relative_positions": -1, 18 | "model_type": "deberta-v2", 19 | "norm_rel_ebd": "layer_norm", 20 | "num_attention_heads": 16, 21 | "num_hidden_layers": 24, 22 | "pad_token_id": 0, 23 | "pooler_dropout": 0, 24 | "pooler_hidden_act": "gelu", 25 | "pooler_hidden_size": 1024, 26 | "pos_att_type": [ 27 | "p2c", 28 | "c2p" 29 | ], 30 | "position_biased_input": false, 31 | "position_buckets": 256, 32 | "relative_attention": true, 33 | "share_att_key": true, 34 | "torch_dtype": "float32", 35 | "transformers_version": "4.23.1", 36 | "type_vocab_size": 0, 37 | "vocab_size": 32000 38 | } 39 | -------------------------------------------------------------------------------- /oldVersion/V110/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | 4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 5 | 6 | 7 | def cleaned_text_to_sequence(cleaned_text, tones, language): 8 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
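    Each symbol is looked up in _symbol_to_id, each tone is shifted by the
    language's tone_start offset, and a parallel list of language IDs of the
    same length is returned alongside the phones and tones.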
9 | Args: 10 | text: string to convert to a sequence 11 | Returns: 12 | List of integers corresponding to the symbols in the text 13 | """ 14 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 15 | tone_start = language_tone_start_map[language] 16 | tones = [i + tone_start for i in tones] 17 | lang_id = language_id_map[language] 18 | lang_ids = [lang_id for i in phones] 19 | return phones, tones, lang_ids 20 | 21 | 22 | def get_bert(norm_text, word2ph, language, device): 23 | from .chinese_bert import get_bert_feature as zh_bert 24 | from .english_bert_mock import get_bert_feature as en_bert 25 | from .japanese_bert import get_bert_feature as jp_bert 26 | 27 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 28 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 29 | return bert 30 | -------------------------------------------------------------------------------- /bert/deberta-v3-large/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bin.* filter=lfs diff=lfs merge=lfs -text 5 | *.bz2 filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.model filter=lfs diff=lfs merge=lfs -text 12 | *.msgpack filter=lfs diff=lfs merge=lfs -text 13 | *.onnx filter=lfs diff=lfs merge=lfs -text 14 | *.ot filter=lfs diff=lfs merge=lfs -text 15 | *.parquet filter=lfs diff=lfs merge=lfs -text 16 | *.pb filter=lfs diff=lfs merge=lfs -text 17 | *.pt filter=lfs diff=lfs merge=lfs -text 18 | *.pth filter=lfs diff=lfs merge=lfs -text 19 | *.rar filter=lfs diff=lfs merge=lfs -text 20 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 21 | *.tar.* filter=lfs diff=lfs merge=lfs -text 22 | *.tflite filter=lfs diff=lfs merge=lfs -text 23 | *.tgz filter=lfs diff=lfs merge=lfs -text 24 | *.xz filter=lfs diff=lfs merge=lfs -text 25 | *.zip filter=lfs diff=lfs merge=lfs -text 26 | *.zstandard filter=lfs diff=lfs merge=lfs -text 27 | *tfevents* filter=lfs diff=lfs merge=lfs -text 28 | -------------------------------------------------------------------------------- /oldVersion/V110/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | 5 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3") 6 | 7 | 8 | def get_bert_feature(text, word2ph, device=None): 9 | if ( 10 | sys.platform == "darwin" 11 | and torch.backends.mps.is_available() 12 | and device == "cpu" 13 | ): 14 | device = "mps" 15 | if not device: 16 | device = "cuda" 17 | model = AutoModelForMaskedLM.from_pretrained("./bert/bert-base-japanese-v3").to( 18 | device 19 | ) 20 | with torch.no_grad(): 21 | inputs = tokenizer(text, return_tensors="pt") 22 | for i in inputs: 23 | inputs[i] = inputs[i].to(device) 24 | res = model(**inputs, output_hidden_states=True) 25 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 26 | assert inputs["input_ids"].shape[-1] == len(word2ph) 27 | word2phone = word2ph 28 | phone_level_feature = [] 29 | for i in range(len(word2phone)): 30 | repeat_feature = res[i].repeat(word2phone[i], 1) 31 | 
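        # word2ph[i] holds the number of phonemes for token i, so the token's
        # hidden vector is repeated once per phoneme; concatenating the pieces
        # yields the (sum(word2ph), 1024) matrix that is transposed and
        # returned below.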
phone_level_feature.append(repeat_feature) 32 | 33 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 34 | 35 | return phone_level_feature.T 36 | -------------------------------------------------------------------------------- /slm/wavlm-base-plus/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bin.* filter=lfs diff=lfs merge=lfs -text 5 | *.bz2 filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.model filter=lfs diff=lfs merge=lfs -text 12 | *.msgpack filter=lfs diff=lfs merge=lfs -text 13 | *.onnx filter=lfs diff=lfs merge=lfs -text 14 | *.ot filter=lfs diff=lfs merge=lfs -text 15 | *.parquet filter=lfs diff=lfs merge=lfs -text 16 | *.pb filter=lfs diff=lfs merge=lfs -text 17 | *.pt filter=lfs diff=lfs merge=lfs -text 18 | *.pth filter=lfs diff=lfs merge=lfs -text 19 | *.rar filter=lfs diff=lfs merge=lfs -text 20 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 21 | *.tar.* filter=lfs diff=lfs merge=lfs -text 22 | *.tflite filter=lfs diff=lfs merge=lfs -text 23 | *.tgz filter=lfs diff=lfs merge=lfs -text 24 | *.xz filter=lfs diff=lfs merge=lfs -text 25 | *.zip filter=lfs diff=lfs merge=lfs -text 26 | *.zstandard filter=lfs diff=lfs merge=lfs -text 27 | *tfevents* filter=lfs diff=lfs merge=lfs -text 28 | -------------------------------------------------------------------------------- /emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bin.* filter=lfs diff=lfs merge=lfs -text 5 | *.bz2 filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.model filter=lfs diff=lfs merge=lfs -text 12 | *.msgpack filter=lfs diff=lfs merge=lfs -text 13 | *.onnx filter=lfs diff=lfs merge=lfs -text 14 | *.ot filter=lfs diff=lfs merge=lfs -text 15 | *.parquet filter=lfs diff=lfs merge=lfs -text 16 | *.pb filter=lfs diff=lfs merge=lfs -text 17 | *.pt filter=lfs diff=lfs merge=lfs -text 18 | *.pth filter=lfs diff=lfs merge=lfs -text 19 | *.rar filter=lfs diff=lfs merge=lfs -text 20 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 21 | *.tar.* filter=lfs diff=lfs merge=lfs -text 22 | *.tflite filter=lfs diff=lfs merge=lfs -text 23 | *.tgz filter=lfs diff=lfs merge=lfs -text 24 | *.wasm filter=lfs diff=lfs merge=lfs -text 25 | *.xz filter=lfs diff=lfs merge=lfs -text 26 | *.zip filter=lfs diff=lfs merge=lfs -text 27 | *.zstandard filter=lfs diff=lfs merge=lfs -text 28 | *tfevents* filter=lfs diff=lfs merge=lfs -text 29 | -------------------------------------------------------------------------------- /oldVersion/V111/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | 5 | 
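# Unlike the V110 version of this module, which reloaded the model on every
# call, V111 caches one AutoModelForMaskedLM instance per device in the
# `models` dict below and reuses it across get_bert_feature() calls.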
tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
6 | 
7 | models = dict()
8 | 
9 | 
10 | def get_bert_feature(text, word2ph, device=None):
11 |     if (
12 |         sys.platform == "darwin"
13 |         and torch.backends.mps.is_available()
14 |         and device == "cpu"
15 |     ):
16 |         device = "mps"
17 |     if not device:
18 |         device = "cuda"
19 |     if device not in models.keys():
20 |         models[device] = AutoModelForMaskedLM.from_pretrained(
21 |             "./bert/bert-base-japanese-v3"
22 |         ).to(device)
23 |     with torch.no_grad():
24 |         inputs = tokenizer(text, return_tensors="pt")
25 |         for i in inputs:
26 |             inputs[i] = inputs[i].to(device)
27 |         res = models[device](**inputs, output_hidden_states=True)
28 |         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
29 |     assert inputs["input_ids"].shape[-1] == len(word2ph)
30 |     word2phone = word2ph
31 |     phone_level_feature = []
32 |     for i in range(len(word2phone)):
33 |         repeat_feature = res[i].repeat(word2phone[i], 1)
34 |         phone_level_feature.append(repeat_feature)
35 | 
36 |     phone_level_feature = torch.cat(phone_level_feature, dim=0)
37 | 
38 |     return phone_level_feature.T
39 | 
--------------------------------------------------------------------------------
/oldVersion/V111/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from . import chinese, japanese, cleaned_text_to_sequence
2 | from .fix import japanese as japanese_fix
3 | 
4 | 
5 | language_module_map = {"ZH": chinese, "JP": japanese}
6 | language_module_map_fix = {"ZH": chinese, "JP": japanese_fix}
7 | 
8 | 
9 | def clean_text(text, language):
10 |     language_module = language_module_map[language]
11 |     norm_text = language_module.text_normalize(text)
12 |     phones, tones, word2ph = language_module.g2p(norm_text)
13 |     return norm_text, phones, tones, word2ph
14 | 
15 | 
16 | def clean_text_fix(text, language):
17 |     """Use the fix from the dev branch"""
18 |     language_module = language_module_map_fix[language]
19 |     norm_text = language_module.text_normalize(text)
20 |     phones, tones, word2ph = language_module.g2p(norm_text)
21 |     return norm_text, phones, tones, word2ph
22 | 
23 | 
24 | def clean_text_bert(text, language):
25 |     language_module = language_module_map[language]
26 |     norm_text = language_module.text_normalize(text)
27 |     phones, tones, word2ph = language_module.g2p(norm_text)
28 |     bert = language_module.get_bert_feature(norm_text, word2ph)
29 |     return phones, tones, bert
30 | 
31 | 
32 | def text_to_sequence(text, language):
33 |     norm_text, phones, tones, word2ph = clean_text(text, language)
34 |     return cleaned_text_to_sequence(phones, tones, language)
35 | 
36 | 
37 | if __name__ == "__main__":
38 |     pass
39 | 
--------------------------------------------------------------------------------
/oldVersion/V200/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import sys
2 | 
3 | import torch
4 | from transformers import DebertaV2Model, DebertaV2Tokenizer
5 | 
6 | from config import config
7 | 
8 | 
9 | LOCAL_PATH = "./bert/deberta-v3-large"
10 | 
11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
12 | 
13 | models = dict()
14 | 
15 | 
16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
17 |     if (
18 |         sys.platform == "darwin"
19 |         and torch.backends.mps.is_available()
20 |         and device == "cpu"
21 |     ):
22 |         device = "mps"
23 |     if not device:
24 |         device = "cuda"
25 |     if device not in models.keys():
26 |         models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
27 |     with torch.no_grad():
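        # The text is tokenized, DeBERTa runs with output_hidden_states=True,
        # and only hidden_states[-3:-2] -- the third-from-last layer -- is
        # kept as the 1024-dimensional token-level feature.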
28 | inputs = tokenizer(text, return_tensors="pt") 29 | for i in inputs: 30 | inputs[i] = inputs[i].to(device) 31 | res = models[device](**inputs, output_hidden_states=True) 32 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 33 | # assert len(word2ph) == len(text)+2 34 | word2phone = word2ph 35 | phone_level_feature = [] 36 | for i in range(len(word2phone)): 37 | repeat_feature = res[i].repeat(word2phone[i], 1) 38 | phone_level_feature.append(repeat_feature) 39 | 40 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 41 | 42 | return phone_level_feature.T 43 | -------------------------------------------------------------------------------- /monotonic_align/core.py: -------------------------------------------------------------------------------- 1 | import numba 2 | 3 | 4 | @numba.jit( 5 | numba.void( 6 | numba.int32[:, :, ::1], 7 | numba.float32[:, :, ::1], 8 | numba.int32[::1], 9 | numba.int32[::1], 10 | ), 11 | nopython=True, 12 | nogil=True, 13 | ) 14 | def maximum_path_jit(paths, values, t_ys, t_xs): 15 | b = paths.shape[0] 16 | max_neg_val = -1e9 17 | for i in range(int(b)): 18 | path = paths[i] 19 | value = values[i] 20 | t_y = t_ys[i] 21 | t_x = t_xs[i] 22 | 23 | v_prev = v_cur = 0.0 24 | index = t_x - 1 25 | 26 | for y in range(t_y): 27 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 28 | if x == y: 29 | v_cur = max_neg_val 30 | else: 31 | v_cur = value[y - 1, x] 32 | if x == 0: 33 | if y == 0: 34 | v_prev = 0.0 35 | else: 36 | v_prev = max_neg_val 37 | else: 38 | v_prev = value[y - 1, x - 1] 39 | value[y, x] += max(v_prev, v_cur) 40 | 41 | for y in range(t_y - 1, -1, -1): 42 | path[y, index] = 1 43 | if index != 0 and ( 44 | index == y or value[y - 1, index] < value[y - 1, index - 1] 45 | ): 46 | index = index - 1 47 | -------------------------------------------------------------------------------- /onnx_modules/V200/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import DebertaV2Model, DebertaV2Tokenizer 5 | 6 | from config import config 7 | 8 | 9 | LOCAL_PATH = "./bert/deberta-v3-large" 10 | 11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 17 | if ( 18 | sys.platform == "darwin" 19 | and torch.backends.mps.is_available() 20 | and device == "cpu" 21 | ): 22 | device = "mps" 23 | if not device: 24 | device = "cuda" 25 | if device not in models.keys(): 26 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device) 27 | with torch.no_grad(): 28 | inputs = tokenizer(text, return_tensors="pt") 29 | for i in inputs: 30 | inputs[i] = inputs[i].to(device) 31 | res = models[device](**inputs, output_hidden_states=True) 32 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 33 | # assert len(word2ph) == len(text)+2 34 | word2phone = word2ph 35 | phone_level_feature = [] 36 | for i in range(len(word2phone)): 37 | repeat_feature = res[i].repeat(word2phone[i], 1) 38 | phone_level_feature.append(repeat_feature) 39 | 40 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 41 | 42 | return phone_level_feature.T 43 | -------------------------------------------------------------------------------- /onnx_infer.py: -------------------------------------------------------------------------------- 1 | from onnx_modules.V220_OnnxInference import OnnxInferenceSession 2 | import numpy as np 3 | 4 | 
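# The exported model is split into six ONNX graphs -- text encoder (enc),
# speaker embedding (emb_g), duration predictors (dp/sdp), flow, and decoder
# (dec) -- and OnnxInferenceSession chains them for inference.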
Session = OnnxInferenceSession( 5 | { 6 | "enc": "onnx/BertVits2.2PT/BertVits2.2PT_enc_p.onnx", 7 | "emb_g": "onnx/BertVits2.2PT/BertVits2.2PT_emb.onnx", 8 | "dp": "onnx/BertVits2.2PT/BertVits2.2PT_dp.onnx", 9 | "sdp": "onnx/BertVits2.2PT/BertVits2.2PT_sdp.onnx", 10 | "flow": "onnx/BertVits2.2PT/BertVits2.2PT_flow.onnx", 11 | "dec": "onnx/BertVits2.2PT/BertVits2.2PT_dec.onnx", 12 | }, 13 | Providers=["CPUExecutionProvider"], 14 | ) 15 | 16 | # 这里的输入和原版是一样的,只需要在原版预处理结果出来之后加上.numpy()即可 17 | x = np.array( 18 | [ 19 | 0, 20 | 97, 21 | 0, 22 | 8, 23 | 0, 24 | 78, 25 | 0, 26 | 8, 27 | 0, 28 | 76, 29 | 0, 30 | 37, 31 | 0, 32 | 40, 33 | 0, 34 | 97, 35 | 0, 36 | 8, 37 | 0, 38 | 23, 39 | 0, 40 | 8, 41 | 0, 42 | 74, 43 | 0, 44 | 26, 45 | 0, 46 | 104, 47 | 0, 48 | ] 49 | ) 50 | tone = np.zeros_like(x) 51 | language = np.zeros_like(x) 52 | sid = np.array([0]) 53 | bert = np.random.randn(x.shape[0], 1024) 54 | ja_bert = np.random.randn(x.shape[0], 1024) 55 | en_bert = np.random.randn(x.shape[0], 1024) 56 | emo = np.random.randn(512, 1) 57 | 58 | audio = Session(x, tone, language, bert, ja_bert, en_bert, emo, sid) 59 | 60 | print(audio) 61 | -------------------------------------------------------------------------------- /oldVersion/V111/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 4 | 5 | 6 | def cleaned_text_to_sequence(cleaned_text, tones, language): 7 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 8 | Args: 9 | text: string to convert to a sequence 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | """ 13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 14 | tone_start = language_tone_start_map[language] 15 | tones = [i + tone_start for i in tones] 16 | lang_id = language_id_map[language] 17 | lang_ids = [lang_id for i in phones] 18 | return phones, tones, lang_ids 19 | 20 | 21 | def get_bert(norm_text, word2ph, language, device): 22 | from .chinese_bert import get_bert_feature as zh_bert 23 | from .english_bert_mock import get_bert_feature as en_bert 24 | from .japanese_bert import get_bert_feature as jp_bert 25 | 26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 27 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 28 | return bert 29 | 30 | 31 | def get_bert_fix(norm_text, word2ph, language, device): 32 | from .chinese_bert import get_bert_feature as zh_bert 33 | from .english_bert_mock import get_bert_feature as en_bert 34 | from .fix.japanese_bert import get_bert_feature as jp_bert 35 | 36 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 37 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 38 | return bert 39 | -------------------------------------------------------------------------------- /.github/workflows/push_format.yml: -------------------------------------------------------------------------------- 1 | name: push format 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | - dev 8 | 9 | permissions: 10 | contents: write 11 | pull-requests: write 12 | 13 | jobs: 14 | push_format: 15 | runs-on: ${{ matrix.os }} 16 | 17 | strategy: 18 | matrix: 19 | python-version: ["3.10"] 20 | os: [ubuntu-latest] 21 | fail-fast: false 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | with: 26 | ref: ${{github.ref_name}} 27 | 28 | - name: Set up Python ${{ matrix.python-version }} 29 | uses: 
actions/setup-python@v4 30 | with: 31 | python-version: ${{ matrix.python-version }} 32 | 33 | - name: Install Black 34 | run: pip install "black[jupyter]" 35 | 36 | - name: Run Black 37 | # run: black $(git ls-files '*.py') 38 | run: black . 39 | 40 | - name: Commit Back 41 | continue-on-error: true 42 | id: commitback 43 | run: | 44 | git config --local user.email "github-actions[bot]@users.noreply.github.com" 45 | git config --local user.name "github-actions[bot]" 46 | git add --all 47 | git commit -m "Format code" 48 | 49 | - name: Create Pull Request 50 | if: steps.commitback.outcome == 'success' 51 | continue-on-error: true 52 | uses: peter-evans/create-pull-request@v5 53 | with: 54 | delete-branch: true 55 | body: Apply Code Formatter Change 56 | title: Apply Code Formatter Change 57 | commit-message: Automatic code format 58 | -------------------------------------------------------------------------------- /bert/bert-base-japanese-v3/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | *.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.pth filter=lfs diff=lfs merge=lfs -text 24 | *.rar filter=lfs diff=lfs merge=lfs -text 25 | *.safetensors filter=lfs diff=lfs merge=lfs -text 26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 27 | *.tar.* filter=lfs diff=lfs merge=lfs -text 28 | *.tflite filter=lfs diff=lfs merge=lfs -text 29 | *.tgz filter=lfs diff=lfs merge=lfs -text 30 | *.wasm filter=lfs diff=lfs merge=lfs -text 31 | *.xz filter=lfs diff=lfs merge=lfs -text 32 | *.zip filter=lfs diff=lfs merge=lfs -text 33 | *.zst filter=lfs diff=lfs merge=lfs -text 34 | *tfevents* filter=lfs diff=lfs merge=lfs -text 35 | -------------------------------------------------------------------------------- /bert/bert-large-japanese-v2/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | *.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs 
merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.pth filter=lfs diff=lfs merge=lfs -text 24 | *.rar filter=lfs diff=lfs merge=lfs -text 25 | *.safetensors filter=lfs diff=lfs merge=lfs -text 26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 27 | *.tar.* filter=lfs diff=lfs merge=lfs -text 28 | *.tflite filter=lfs diff=lfs merge=lfs -text 29 | *.tgz filter=lfs diff=lfs merge=lfs -text 30 | *.wasm filter=lfs diff=lfs merge=lfs -text 31 | *.xz filter=lfs diff=lfs merge=lfs -text 32 | *.zip filter=lfs diff=lfs merge=lfs -text 33 | *.zst filter=lfs diff=lfs merge=lfs -text 34 | *tfevents* filter=lfs diff=lfs merge=lfs -text 35 | -------------------------------------------------------------------------------- /bert/deberta-v2-large-japanese/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | *.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.pth filter=lfs diff=lfs merge=lfs -text 24 | *.rar filter=lfs diff=lfs merge=lfs -text 25 | *.safetensors filter=lfs diff=lfs merge=lfs -text 26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 27 | *.tar.* filter=lfs diff=lfs merge=lfs -text 28 | *.tflite filter=lfs diff=lfs merge=lfs -text 29 | *.tgz filter=lfs diff=lfs merge=lfs -text 30 | *.wasm filter=lfs diff=lfs merge=lfs -text 31 | *.xz filter=lfs diff=lfs merge=lfs -text 32 | *.zip filter=lfs diff=lfs merge=lfs -text 33 | *.zst filter=lfs diff=lfs merge=lfs -text 34 | *tfevents* filter=lfs diff=lfs merge=lfs -text 35 | -------------------------------------------------------------------------------- /emotional/clap-htsat-fused/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs diff=lfs 
merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | *.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.pth filter=lfs diff=lfs merge=lfs -text 24 | *.rar filter=lfs diff=lfs merge=lfs -text 25 | *.safetensors filter=lfs diff=lfs merge=lfs -text 26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 27 | *.tar.* filter=lfs diff=lfs merge=lfs -text 28 | *.tflite filter=lfs diff=lfs merge=lfs -text 29 | *.tgz filter=lfs diff=lfs merge=lfs -text 30 | *.wasm filter=lfs diff=lfs merge=lfs -text 31 | *.xz filter=lfs diff=lfs merge=lfs -text 32 | *.zip filter=lfs diff=lfs merge=lfs -text 33 | *.zst filter=lfs diff=lfs merge=lfs -text 34 | *tfevents* filter=lfs diff=lfs merge=lfs -text 35 | -------------------------------------------------------------------------------- /oldVersion/V220/clap_wrapper.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import ClapModel, ClapProcessor 5 | 6 | from config import config 7 | 8 | models = dict() 9 | processor = ClapProcessor.from_pretrained("./emotional/clap-htsat-fused") 10 | 11 | 12 | def get_clap_audio_feature(audio_data, device=config.bert_gen_config.device): 13 | if ( 14 | sys.platform == "darwin" 15 | and torch.backends.mps.is_available() 16 | and device == "cpu" 17 | ): 18 | device = "mps" 19 | if not device: 20 | device = "cuda" 21 | if device not in models.keys(): 22 | models[device] = ClapModel.from_pretrained("./emotional/clap-htsat-fused").to( 23 | device 24 | ) 25 | with torch.no_grad(): 26 | inputs = processor( 27 | audios=audio_data, return_tensors="pt", sampling_rate=48000 28 | ).to(device) 29 | emb = models[device].get_audio_features(**inputs) 30 | return emb.T 31 | 32 | 33 | def get_clap_text_feature(text, device=config.bert_gen_config.device): 34 | if ( 35 | sys.platform == "darwin" 36 | and torch.backends.mps.is_available() 37 | and device == "cpu" 38 | ): 39 | device = "mps" 40 | if not device: 41 | device = "cuda" 42 | if device not in models.keys(): 43 | models[device] = ClapModel.from_pretrained("./emotional/clap-htsat-fused").to( 44 | device 45 | ) 46 | with torch.no_grad(): 47 | inputs = processor(text=text, return_tensors="pt").to(device) 48 | emb = models[device].get_text_features(**inputs) 49 | return emb.T 50 | -------------------------------------------------------------------------------- /bert/deberta-v2-large-japanese-char-wwm/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | 
*.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.pth filter=lfs diff=lfs merge=lfs -text 24 | *.rar filter=lfs diff=lfs merge=lfs -text 25 | *.safetensors filter=lfs diff=lfs merge=lfs -text 26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 27 | *.tar.* filter=lfs diff=lfs merge=lfs -text 28 | *.tflite filter=lfs diff=lfs merge=lfs -text 29 | *.tgz filter=lfs diff=lfs merge=lfs -text 30 | *.wasm filter=lfs diff=lfs merge=lfs -text 31 | *.xz filter=lfs diff=lfs merge=lfs -text 32 | *.zip filter=lfs diff=lfs merge=lfs -text 33 | *.zst filter=lfs diff=lfs merge=lfs -text 34 | *tfevents* filter=lfs diff=lfs merge=lfs -text 35 | -------------------------------------------------------------------------------- /oldVersion/V200/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 4 | 5 | 6 | def cleaned_text_to_sequence(cleaned_text, tones, language): 7 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 8 | Args: 9 | text: string to convert to a sequence 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | """ 13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 14 | tone_start = language_tone_start_map[language] 15 | tones = [i + tone_start for i in tones] 16 | lang_id = language_id_map[language] 17 | lang_ids = [lang_id for i in phones] 18 | return phones, tones, lang_ids 19 | 20 | 21 | def get_bert(norm_text, word2ph, language, device): 22 | from .chinese_bert import get_bert_feature as zh_bert 23 | from .english_bert_mock import get_bert_feature as en_bert 24 | from .japanese_bert import get_bert_feature as jp_bert 25 | 26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 27 | bert = lang_bert_func_map[language](norm_text, word2ph, device) 28 | return bert 29 | 30 | 31 | def check_bert_models(): 32 | import json 33 | from pathlib import Path 34 | 35 | from config import config 36 | from .bert_utils import _check_bert 37 | 38 | if config.mirror.lower() == "openi": 39 | import openi 40 | 41 | kwargs = {"token": config.openi_token} if config.openi_token else {} 42 | openi.login(**kwargs) 43 | 44 | with open("./bert/bert_models.json", "r") as fp: 45 | models = json.load(fp) 46 | for k, v in models.items(): 47 | local_path = Path("./bert").joinpath(k) 48 | _check_bert(v["repo_id"], v["files"], local_path) 49 | -------------------------------------------------------------------------------- /oldVersion/V210/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 4 | 5 | 6 | def cleaned_text_to_sequence(cleaned_text, tones, language): 7 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
8 | Args: 9 | text: string to convert to a sequence 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | """ 13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 14 | tone_start = language_tone_start_map[language] 15 | tones = [i + tone_start for i in tones] 16 | lang_id = language_id_map[language] 17 | lang_ids = [lang_id for i in phones] 18 | return phones, tones, lang_ids 19 | 20 | 21 | def get_bert(norm_text, word2ph, language, device, style_text, style_weight): 22 | from .chinese_bert import get_bert_feature as zh_bert 23 | from .english_bert_mock import get_bert_feature as en_bert 24 | from .japanese_bert import get_bert_feature as jp_bert 25 | 26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} 27 | bert = lang_bert_func_map[language]( 28 | norm_text, word2ph, device, style_text, style_weight 29 | ) 30 | return bert 31 | 32 | 33 | def check_bert_models(): 34 | import json 35 | from pathlib import Path 36 | 37 | from config import config 38 | from .bert_utils import _check_bert 39 | 40 | if config.mirror.lower() == "openi": 41 | import openi 42 | 43 | kwargs = {"token": config.openi_token} if config.openi_token else {} 44 | openi.login(**kwargs) 45 | 46 | with open("./bert/bert_models.json", "r") as fp: 47 | models = json.load(fp) 48 | for k, v in models.items(): 49 | local_path = Path("./bert").joinpath(k) 50 | _check_bert(v["repo_id"], v["files"], local_path) 51 | 52 | 53 | check_bert_models() 54 | -------------------------------------------------------------------------------- /onnx_modules/__init__.py: -------------------------------------------------------------------------------- 1 | from utils import get_hparams_from_file, load_checkpoint 2 | import json 3 | 4 | 5 | def export_onnx(export_path, model_path, config_path, novq, dev): 6 | hps = get_hparams_from_file(config_path) 7 | version = hps.version[0:3] 8 | if version == "2.0" or (version == "2.1" and novq): 9 | from .V200 import SynthesizerTrn, symbols 10 | elif version == "2.1" and (not novq): 11 | from .V210 import SynthesizerTrn, symbols 12 | elif version == "2.2": 13 | if novq and dev: 14 | from .V220_novq_dev import SynthesizerTrn, symbols 15 | else: 16 | from .V220 import SynthesizerTrn, symbols 17 | elif version == "2.3": 18 | from .V230 import SynthesizerTrn, symbols 19 | net_g = SynthesizerTrn( 20 | len(symbols), 21 | hps.data.filter_length // 2 + 1, 22 | hps.train.segment_size // hps.data.hop_length, 23 | n_speakers=hps.data.n_speakers, 24 | **hps.model, 25 | ) 26 | _ = net_g.eval() 27 | _ = load_checkpoint(model_path, net_g, None, skip_optimizer=True) 28 | net_g.cpu() 29 | net_g.export_onnx(export_path) 30 | 31 | spklist = [] 32 | for key in hps.data.spk2id.keys(): 33 | spklist.append(key) 34 | 35 | MoeVSConf = { 36 | "Folder": f"{export_path}", 37 | "Name": f"{export_path}", 38 | "Type": "BertVits", 39 | "Symbol": symbols, 40 | "Cleaner": "", 41 | "Rate": hps.data.sampling_rate, 42 | "CharaMix": True, 43 | "Characters": spklist, 44 | "LanguageMap": {"ZH": [0, 0], "JP": [1, 6], "EN": [2, 8]}, 45 | "Dict": "BasicDict", 46 | "BertPath": [ 47 | "chinese-roberta-wwm-ext-large", 48 | "deberta-v2-large-japanese", 49 | "bert-base-japanese-v3", 50 | ], 51 | "Clap": "clap-htsat-fused", 52 | } 53 | 54 | with open(f"onnx/{export_path}.json", "w") as MoeVsConfFile: 55 | json.dump(MoeVSConf, MoeVsConfFile, indent=4) 56 | -------------------------------------------------------------------------------- /resample_legacy.py: 
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import librosa
4 | from multiprocessing import Pool, cpu_count
5 | 
6 | import soundfile
7 | from tqdm import tqdm
8 | 
9 | from config import config
10 | 
11 | 
12 | def process(item):
13 |     wav_name, args = item
14 |     wav_path = os.path.join(args.in_dir, wav_name)
15 |     if os.path.exists(wav_path) and wav_path.lower().endswith(".wav"):
16 |         wav, sr = librosa.load(wav_path, sr=args.sr)
17 |         soundfile.write(os.path.join(args.out_dir, wav_name), wav, sr)
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     parser = argparse.ArgumentParser()
22 |     parser.add_argument(
23 |         "--sr",
24 |         type=int,
25 |         default=config.resample_config.sampling_rate,
26 |         help="sampling rate",
27 |     )
28 |     parser.add_argument(
29 |         "--in_dir",
30 |         type=str,
31 |         default=config.resample_config.in_dir,
32 |         help="path to source dir",
33 |     )
34 |     parser.add_argument(
35 |         "--out_dir",
36 |         type=str,
37 |         default=config.resample_config.out_dir,
38 |         help="path to target dir",
39 |     )
40 |     parser.add_argument(
41 |         "--processes",
42 |         type=int,
43 |         default=0,
44 |         help="cpu_processes",
45 |     )
46 |     args, _ = parser.parse_known_args()
47 |     # AutoDL's CPU-only mode reports 46 CPUs, so leave some headroom
48 |     if args.processes == 0:
49 |         processes = cpu_count() - 2 if cpu_count() > 4 else 1
50 |     else:
51 |         processes = args.processes
52 |     pool = Pool(processes=processes)
53 | 
54 |     tasks = []
55 | 
56 |     for dirpath, _, filenames in os.walk(args.in_dir):
57 |         if not os.path.isdir(args.out_dir):
58 |             os.makedirs(args.out_dir, exist_ok=True)
59 |         for filename in filenames:
60 |             if filename.lower().endswith(".wav"):
61 |                 tasks.append((filename, args))
62 | 
63 |     for _ in tqdm(
64 |         pool.imap_unordered(process, tasks),
65 |     ):
66 |         pass
67 | 
68 |     pool.close()
69 |     pool.join()
70 | 
71 |     print("Audio resampling finished!")
72 | 
--------------------------------------------------------------------------------
/text/__init__.py:
--------------------------------------------------------------------------------
1 | from text.symbols import *
2 | 
3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4 | 
5 | 
6 | def cleaned_text_to_sequence(cleaned_text, tones, language):
7 |     """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8 |     Args:
9 |       text: string to convert to a sequence
10 |     Returns:
11 |       List of integers corresponding to the symbols in the text
12 |     """
13 |     phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14 |     tone_start = language_tone_start_map[language]
15 |     tones = [i + tone_start for i in tones]
16 |     lang_id = language_id_map[language]
17 |     lang_ids = [lang_id for i in phones]
18 |     return phones, tones, lang_ids
19 | 
20 | 
21 | def get_bert(norm_text, word2ph, language, device, style_text=None, style_weight=0.7):
22 |     from .chinese_bert import get_bert_feature as zh_bert
23 |     from .english_bert_mock import get_bert_feature as en_bert
24 |     from .japanese_bert import get_bert_feature as jp_bert
25 | 
26 |     lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
27 |     bert = lang_bert_func_map[language](
28 |         norm_text, word2ph, device, style_text, style_weight
29 |     )
30 |     return bert
31 | 
32 | 
33 | def check_bert_models():
34 |     import json
35 |     from pathlib import Path
36 | 
37 |     from config import config
38 |     from .bert_utils import _check_bert
39 | 
40 |     if config.mirror.lower() == "openi":
41 |         import openi
42 | 
43 |         kwargs = {"token": config.openi_token} if config.openi_token else {}
44 |         openi.login(**kwargs)
45 | 
46 |     with open("./bert/bert_models.json", "r") as fp:
47 |         models = json.load(fp)
48 |     for k, v in models.items():
49 |         local_path = Path("./bert").joinpath(k)
50 |         _check_bert(v["repo_id"], v["files"], local_path)
51 | 
52 | 
53 | def init_openjtalk():
54 |     import platform
55 | 
56 |     if platform.system() == "Linux":
57 |         import pyopenjtalk
58 | 
59 |         pyopenjtalk.g2p("こんにちは,世界。")
60 | 
61 | 
62 | init_openjtalk()
63 | check_bert_models()
64 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | LOGO
4 | 
5 | # Bert-VITS2
6 | 
7 | VITS2 Backbone with multilingual bert
8 | 
9 | For a quick guide, please refer to `webui_preprocess.py`.
10 | 
11 | 
12 | 
13 | ## Please note: the core idea of this project comes from [anyvoiceai/MassTTS](https://github.com/anyvoiceai/MassTTS), an excellent TTS project
14 | ## A demo of MassTTS: [an AI Fengge roasting the real Fengge, and recovering the kidney lost in the Golden Triangle](https://www.bilibili.com/video/BV1w24y1c7z9)
15 | 
16 | [//]: # (## This project has nothing to do with [PlayVoice/vits_chinese](https://github.com/PlayVoice/vits_chinese))
17 | 
18 | [//]: # ()
19 | [//]: # (This repository grew out of a friend sharing the AI-Fengge video; the author was impressed by the results, and after trying MassTTS found that FastSpeech still trails VITS in audio quality and that its training pipeline is more complex, so the BERT idea was adopted here following that approach)
20 | 
21 | ## Seasoned Travelers / Trailblazers / Captains / Doctors / sensei / Witchers / Miao-miao-lu / V are expected to read the code and learn how to train by themselves.
22 | 
23 | ### It is strictly forbidden to use this project for any purpose that violates the Constitution, the Criminal Law, the Public Security Administration Punishment Law, or the Civil Code of the People's Republic of China.
24 | ### It is strictly forbidden to use it for any politics-related purpose.
25 | #### Video: https://www.bilibili.com/video/BV1hp4y1K78E
26 | #### Demo: https://www.bilibili.com/video/BV1TF411k78w
27 | #### QQ Group: 815818430
28 | ## References
29 | + [anyvoiceai/MassTTS](https://github.com/anyvoiceai/MassTTS)
30 | + [jaywalnut310/vits](https://github.com/jaywalnut310/vits)
31 | + [p0p4k/vits2_pytorch](https://github.com/p0p4k/vits2_pytorch)
32 | + [svc-develop-team/so-vits-svc](https://github.com/svc-develop-team/so-vits-svc)
33 | + [PaddlePaddle/PaddleSpeech](https://github.com/PaddlePaddle/PaddleSpeech)
34 | + [emotional-vits](https://github.com/innnky/emotional-vits)
35 | + [fish-speech](https://github.com/fishaudio/fish-speech)
36 | + [Bert-VITS2-UI](https://github.com/jiangyuxiaoxiao/Bert-VITS2-UI)
37 | ## Thanks to all contributors for their efforts
38 | 
39 | 
40 | 
41 | 
42 | [//]: # (# Every code reference in this project is clearly credited; the idea for the BERT part comes from [AI Fengge](https://www.bilibili.com/video/BV1w24y1c7z9) and has nothing to do with [vits_chinese](https://github.com/PlayVoice/vits_chinese). Everyone is welcome to inspect the code. We also strongly condemn that developer's [smearing, and even doxxing, of developers](https://www.bilibili.com/read/cv27101514/).)
43 | 
--------------------------------------------------------------------------------
/oldVersion/V220/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 | 
3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4 | 
5 | 
6 | def cleaned_text_to_sequence(cleaned_text, tones, language):
7 |     """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8 |     Args:
9 |       text: string to convert to a sequence
10 |     Returns:
11 |       List of integers corresponding to the symbols in the text
12 |     """
13 |     phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14 |     tone_start = language_tone_start_map[language]
15 |     tones = [i + tone_start for i in tones]
16 |     lang_id = language_id_map[language]
17 |     lang_ids = [lang_id for i in phones]
18 |     return phones, tones, lang_ids
19 | 
20 | 
21 | def get_bert(norm_text, word2ph, language, device, style_text=None, style_weight=0.7):
22 |     from .chinese_bert import get_bert_feature as zh_bert
23 |     from .english_bert_mock import get_bert_feature as en_bert
24 |     from .japanese_bert import get_bert_feature as jp_bert
25 | 
26 |     lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
27 |     bert = lang_bert_func_map[language](
28 |         norm_text, word2ph, device, style_text, style_weight
29 |     )
30 |     return bert
31 | 
32 | 
33 | def check_bert_models():
34 |     import json
35 |     from pathlib import Path
36 | 
37 |     from config import config
38 |     from .bert_utils import _check_bert
39 | 
40 |     if config.mirror.lower() == "openi":
41 |         import openi
42 | 
43 |         kwargs = {"token": config.openi_token} if config.openi_token else {}
44 |         openi.login(**kwargs)
45 | 
46 |     with open("./bert/bert_models.json", "r") as fp:
47 |         models = json.load(fp)
48 |     for k, v in models.items():
49 |         local_path = Path("./bert").joinpath(k)
50 |         _check_bert(v["repo_id"], v["files"], local_path)
51 | 
52 | 
53 | def init_openjtalk():
54 |     import platform
55 | 
56 |     if platform.system() == "Linux":
57 |         import pyopenjtalk
58 | 
59 |         pyopenjtalk.g2p("こんにちは,世界。")
60 | 
61 | 
62 | init_openjtalk()
63 | check_bert_models()
64 | 
--------------------------------------------------------------------------------
/tools/translate.py:
--------------------------------------------------------------------------------
1 | """
2 | Translation API
3 | """
4 | from config import config
5 | 
6 | import random
7 | import hashlib
8 | import requests
9 | 
10 | 
11 | def translate(Sentence: str, to_Language: str = "jp", from_Language: str = ""):
12 |     """
13 |     :param Sentence: the text to translate
14 |     :param from_Language: language of the source text
15 |     :param to_Language: target language
16 |     :return: the translated text; the original Sentence is returned if the request fails
17 | 
18 |     Common language codes: Chinese zh, English en, Japanese jp
19 |     """
20 |     appid = config.translate_config.app_key
21 |     key = config.translate_config.secret_key
22 |     if appid == "" or key == "":
23 |         return "Please configure app_key and secret_key in config.yml"
24 |     url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
25 |     texts = Sentence.splitlines()
26 |     outTexts = []
27 |     for t in texts:
28 |         if t != "":
29 |             # signature calculation; reference: https://api.fanyi.baidu.com/product/113
30 |             salt = str(random.randint(1, 100000))
31 |             signString = appid + t + salt + key
32 |             hs = hashlib.md5()
33 |             hs.update(signString.encode("utf-8"))
34 |             signString = hs.hexdigest()
35 |             if from_Language == "":
36 |                 from_Language = "auto"
37 |             headers = {"Content-Type": "application/x-www-form-urlencoded"}
38 |             payload = {
39 |                 "q": t,
40 |                 "from": from_Language,
41 |                 "to": to_Language,
42 |                 "appid": appid,
43 |                 "salt": salt,
44 |                 "sign": signString,
45 |             }
46 |             # send the request
47 |             try:
48 |                 response = requests.post(
49 |                     url=url, data=payload, headers=headers, timeout=3
50 |                 )
51 |                 response = response.json()
52 |                 if "trans_result" in response.keys():
53 |                     result = response["trans_result"][0]
54 |                     if "dst" in result.keys():
55 |                         dst = result["dst"]
56 |                         outTexts.append(dst)
57 |             except Exception:
58 |                 return Sentence
59 |         else:
60 |             outTexts.append(t)
61 |     return "\n".join(outTexts)
62 | 
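A minimal usage sketch for translate() above; the strings are made-up examples, and it assumes app_key and secret_key are already filled in under translate_config in config.yml:

from tools.translate import translate

# Translate two lines of Chinese into Japanese; blank lines pass through as-is.
print(translate("你好\n今天天气不错", to_Language="jp", from_Language="zh"))
# Without credentials the function returns a configuration reminder instead of
# raising, and on a request error it returns the input Sentence unchanged.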
-------------------------------------------------------------------------------- /text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import DebertaV2Model, DebertaV2Tokenizer 5 | 6 | from config import config 7 | 8 | 9 | LOCAL_PATH = "./bert/deberta-v3-large" 10 | 11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature( 17 | text, 18 | word2ph, 19 | device=config.bert_gen_config.device, 20 | style_text=None, 21 | style_weight=0.7, 22 | ): 23 | if ( 24 | sys.platform == "darwin" 25 | and torch.backends.mps.is_available() 26 | and device == "cpu" 27 | ): 28 | device = "mps" 29 | if not device: 30 | device = "cuda" 31 | if device not in models.keys(): 32 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device) 33 | with torch.no_grad(): 34 | inputs = tokenizer(text, return_tensors="pt") 35 | for i in inputs: 36 | inputs[i] = inputs[i].to(device) 37 | res = models[device](**inputs, output_hidden_states=True) 38 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 39 | if style_text: 40 | style_inputs = tokenizer(style_text, return_tensors="pt") 41 | for i in style_inputs: 42 | style_inputs[i] = style_inputs[i].to(device) 43 | style_res = models[device](**style_inputs, output_hidden_states=True) 44 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() 45 | style_res_mean = style_res.mean(0) 46 | assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph)) 47 | word2phone = word2ph 48 | phone_level_feature = [] 49 | for i in range(len(word2phone)): 50 | if style_text: 51 | repeat_feature = ( 52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight) 53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight 54 | ) 55 | else: 56 | repeat_feature = res[i].repeat(word2phone[i], 1) 57 | phone_level_feature.append(repeat_feature) 58 | 59 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 60 | 61 | return phone_level_feature.T 62 | -------------------------------------------------------------------------------- /oldVersion/V210/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import DebertaV2Model, DebertaV2Tokenizer 5 | 6 | from config import config 7 | 8 | 9 | LOCAL_PATH = "./bert/deberta-v3-large" 10 | 11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature( 17 | text, 18 | word2ph, 19 | device=config.bert_gen_config.device, 20 | style_text=None, 21 | style_weight=0.7, 22 | ): 23 | if ( 24 | sys.platform == "darwin" 25 | and torch.backends.mps.is_available() 26 | and device == "cpu" 27 | ): 28 | device = "mps" 29 | if not device: 30 | device = "cuda" 31 | if device not in models.keys(): 32 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device) 33 | with torch.no_grad(): 34 | inputs = tokenizer(text, return_tensors="pt") 35 | for i in inputs: 36 | inputs[i] = inputs[i].to(device) 37 | res = models[device](**inputs, output_hidden_states=True) 38 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 39 | if style_text: 40 | style_inputs = tokenizer(style_text, return_tensors="pt") 41 | for i in style_inputs: 42 | style_inputs[i] = style_inputs[i].to(device) 43 | style_res = models[device](**style_inputs, output_hidden_states=True) 44 | style_res = 
torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() 45 | style_res_mean = style_res.mean(0) 46 | assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph)) 47 | word2phone = word2ph 48 | phone_level_feature = [] 49 | for i in range(len(word2phone)): 50 | if style_text: 51 | repeat_feature = ( 52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight) 53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight 54 | ) 55 | else: 56 | repeat_feature = res[i].repeat(word2phone[i], 1) 57 | phone_level_feature.append(repeat_feature) 58 | 59 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 60 | 61 | return phone_level_feature.T 62 | -------------------------------------------------------------------------------- /oldVersion/V220/text/english_bert_mock.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import DebertaV2Model, DebertaV2Tokenizer 5 | 6 | from config import config 7 | 8 | 9 | LOCAL_PATH = "./bert/deberta-v3-large" 10 | 11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature( 17 | text, 18 | word2ph, 19 | device=config.bert_gen_config.device, 20 | style_text=None, 21 | style_weight=0.7, 22 | ): 23 | if ( 24 | sys.platform == "darwin" 25 | and torch.backends.mps.is_available() 26 | and device == "cpu" 27 | ): 28 | device = "mps" 29 | if not device: 30 | device = "cuda" 31 | if device not in models.keys(): 32 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device) 33 | with torch.no_grad(): 34 | inputs = tokenizer(text, return_tensors="pt") 35 | for i in inputs: 36 | inputs[i] = inputs[i].to(device) 37 | res = models[device](**inputs, output_hidden_states=True) 38 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 39 | if style_text: 40 | style_inputs = tokenizer(style_text, return_tensors="pt") 41 | for i in style_inputs: 42 | style_inputs[i] = style_inputs[i].to(device) 43 | style_res = models[device](**style_inputs, output_hidden_states=True) 44 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() 45 | style_res_mean = style_res.mean(0) 46 | assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph)) 47 | word2phone = word2ph 48 | phone_level_feature = [] 49 | for i in range(len(word2phone)): 50 | if style_text: 51 | repeat_feature = ( 52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight) 53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight 54 | ) 55 | else: 56 | repeat_feature = res[i].repeat(word2phone[i], 1) 57 | phone_level_feature.append(repeat_feature) 58 | 59 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 60 | 61 | return phone_level_feature.T 62 | -------------------------------------------------------------------------------- /oldVersion/V200/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | from .japanese import text2sep_kata 8 | 9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese" 10 | 11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 17 | sep_text, _, _ = text2sep_kata(text) 18 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text] 19 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for 
t in sep_tokens] 20 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3] 21 | return get_bert_feature_with_token(sep_ids, word2ph, device) 22 | 23 | 24 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device): 25 | if ( 26 | sys.platform == "darwin" 27 | and torch.backends.mps.is_available() 28 | and device == "cpu" 29 | ): 30 | device = "mps" 31 | if not device: 32 | device = "cuda" 33 | if device not in models.keys(): 34 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 35 | with torch.no_grad(): 36 | inputs = torch.tensor(tokens).to(device).unsqueeze(0) 37 | token_type_ids = torch.zeros_like(inputs).to(device) 38 | attention_mask = torch.ones_like(inputs).to(device) 39 | inputs = { 40 | "input_ids": inputs, 41 | "token_type_ids": token_type_ids, 42 | "attention_mask": attention_mask, 43 | } 44 | 45 | # for i in inputs: 46 | # inputs[i] = inputs[i].to(device) 47 | res = models[device](**inputs, output_hidden_states=True) 48 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 49 | assert inputs["input_ids"].shape[-1] == len(word2ph) 50 | word2phone = word2ph 51 | phone_level_feature = [] 52 | for i in range(len(word2phone)): 53 | repeat_feature = res[i].repeat(word2phone[i], 1) 54 | phone_level_feature.append(repeat_feature) 55 | 56 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 57 | 58 | return phone_level_feature.T 59 | -------------------------------------------------------------------------------- /oldVersion/V111/text/fix/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForMaskedLM 3 | import sys 4 | from .japanese import text2sep_kata 5 | from config import config 6 | 7 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3") 8 | 9 | models = dict() 10 | 11 | 12 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 13 | sep_text, _ = text2sep_kata(text) 14 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text] 15 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens] 16 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3] 17 | return get_bert_feature_with_token(sep_ids, word2ph, device) 18 | 19 | 20 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device): 21 | if ( 22 | sys.platform == "darwin" 23 | and torch.backends.mps.is_available() 24 | and device == "cpu" 25 | ): 26 | device = "mps" 27 | if not device: 28 | device = "cuda" 29 | if device not in models.keys(): 30 | models[device] = AutoModelForMaskedLM.from_pretrained( 31 | "./bert/bert-base-japanese-v3" 32 | ).to(device) 33 | with torch.no_grad(): 34 | inputs = torch.tensor(tokens).to(device).unsqueeze(0) 35 | token_type_ids = torch.zeros_like(inputs).to(device) 36 | attention_mask = torch.ones_like(inputs).to(device) 37 | inputs = { 38 | "input_ids": inputs, 39 | "token_type_ids": token_type_ids, 40 | "attention_mask": attention_mask, 41 | } 42 | 43 | # for i in inputs: 44 | # inputs[i] = inputs[i].to(device) 45 | res = models[device](**inputs, output_hidden_states=True) 46 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 47 | assert inputs["input_ids"].shape[-1] == len(word2ph) 48 | word2phone = word2ph 49 | phone_level_feature = [] 50 | for i in range(len(word2phone)): 51 | repeat_feature = res[i].repeat(word2phone[i], 1) 52 | phone_level_feature.append(repeat_feature) 53 | 54 | 
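# Each token vector is repeated word2ph[i] times, so the concatenated matrix
# has sum(word2ph) rows, one row per phone.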
phone_level_feature = torch.cat(phone_level_feature, dim=0) 55 | 56 | return phone_level_feature.T 57 | -------------------------------------------------------------------------------- /resample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import librosa 4 | from multiprocessing import Pool, cpu_count 5 | 6 | import soundfile 7 | from tqdm import tqdm 8 | 9 | from config import config 10 | 11 | 12 | def process(item): 13 | spkdir, wav_name, args = item 14 | wav_path = os.path.join(args.in_dir, spkdir, wav_name) 15 | if os.path.exists(wav_path) and wav_path.lower().endswith(".wav"): 16 | wav, sr = librosa.load(wav_path, sr=args.sr) 17 | soundfile.write(os.path.join(args.out_dir, spkdir, wav_name), wav, sr) 18 | 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument( 23 | "--sr", 24 | type=int, 25 | default=config.resample_config.sampling_rate, 26 | help="sampling rate", 27 | ) 28 | parser.add_argument( 29 | "--in_dir", 30 | type=str, 31 | default=config.resample_config.in_dir, 32 | help="path to source dir", 33 | ) 34 | parser.add_argument( 35 | "--out_dir", 36 | type=str, 37 | default=config.resample_config.out_dir, 38 | help="path to target dir", 39 | ) 40 | parser.add_argument( 41 | "--processes", 42 | type=int, 43 | default=0, 44 | help="cpu_processes", 45 | ) 46 | args, _ = parser.parse_known_args() 47 | # autodl 无卡模式会识别出46个cpu 48 | if args.processes == 0: 49 | processes = cpu_count() - 2 if cpu_count() > 4 else 1 50 | else: 51 | processes = args.processes 52 | pool = Pool(processes=processes) 53 | 54 | tasks = [] 55 | 56 | for dirpath, _, filenames in os.walk(args.in_dir): 57 | # 子级目录 58 | spk_dir = os.path.relpath(dirpath, args.in_dir) 59 | spk_dir_out = os.path.join(args.out_dir, spk_dir) 60 | if not os.path.isdir(spk_dir_out): 61 | os.makedirs(spk_dir_out, exist_ok=True) 62 | for filename in filenames: 63 | if filename.lower().endswith(".wav"): 64 | twople = (spk_dir, filename, args) 65 | tasks.append(twople) 66 | 67 | for _ in tqdm( 68 | pool.imap_unordered(process, tasks), 69 | ): 70 | pass 71 | 72 | pool.close() 73 | pool.join() 74 | 75 | print("音频重采样完毕!") 76 | -------------------------------------------------------------------------------- /onnx_modules/V200/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | from .japanese import text2sep_kata 8 | 9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese" 10 | 11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 17 | sep_text, _, _ = text2sep_kata(text) 18 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text] 19 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens] 20 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3] 21 | return get_bert_feature_with_token(sep_ids, word2ph, device) 22 | 23 | 24 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device): 25 | if ( 26 | sys.platform == "darwin" 27 | and torch.backends.mps.is_available() 28 | and device == "cpu" 29 | ): 30 | device = "mps" 31 | if not device: 32 | device = "cuda" 33 | if device not in models.keys(): 34 | models[device] = 
AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 35 | with torch.no_grad(): 36 | inputs = torch.tensor(tokens).to(device).unsqueeze(0) 37 | token_type_ids = torch.zeros_like(inputs).to(device) 38 | attention_mask = torch.ones_like(inputs).to(device) 39 | inputs = { 40 | "input_ids": inputs, 41 | "token_type_ids": token_type_ids, 42 | "attention_mask": attention_mask, 43 | } 44 | 45 | # for i in inputs: 46 | # inputs[i] = inputs[i].to(device) 47 | res = models[device](**inputs, output_hidden_states=True) 48 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 49 | assert inputs["input_ids"].shape[-1] == len(word2ph) 50 | word2phone = word2ph 51 | phone_level_feature = [] 52 | for i in range(len(word2phone)): 53 | repeat_feature = res[i].repeat(word2phone[i], 1) 54 | phone_level_feature.append(repeat_feature) 55 | 56 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 57 | 58 | return phone_level_feature.T 59 | -------------------------------------------------------------------------------- /oldVersion/V220/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | from text.japanese import text2sep_kata 8 | 9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm" 10 | 11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature( 17 | text, 18 | word2ph, 19 | device=config.bert_gen_config.device, 20 | style_text=None, 21 | style_weight=0.7, 22 | ): 23 | text = "".join(text2sep_kata(text)[0]) 24 | if ( 25 | sys.platform == "darwin" 26 | and torch.backends.mps.is_available() 27 | and device == "cpu" 28 | ): 29 | device = "mps" 30 | if not device: 31 | device = "cuda" 32 | if device not in models.keys(): 33 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 34 | with torch.no_grad(): 35 | inputs = tokenizer(text, return_tensors="pt") 36 | for i in inputs: 37 | inputs[i] = inputs[i].to(device) 38 | res = models[device](**inputs, output_hidden_states=True) 39 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 40 | if style_text: 41 | style_inputs = tokenizer(style_text, return_tensors="pt") 42 | for i in style_inputs: 43 | style_inputs[i] = style_inputs[i].to(device) 44 | style_res = models[device](**style_inputs, output_hidden_states=True) 45 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() 46 | style_res_mean = style_res.mean(0) 47 | 48 | assert len(word2ph) == len(text) + 2 49 | word2phone = word2ph 50 | phone_level_feature = [] 51 | for i in range(len(word2phone)): 52 | if style_text: 53 | repeat_feature = ( 54 | res[i].repeat(word2phone[i], 1) * (1 - style_weight) 55 | + style_res_mean.repeat(word2phone[i], 1) * style_weight 56 | ) 57 | else: 58 | repeat_feature = res[i].repeat(word2phone[i], 1) 59 | phone_level_feature.append(repeat_feature) 60 | 61 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 62 | 63 | return phone_level_feature.T 64 | -------------------------------------------------------------------------------- /oldVersion/V220/clap_gen.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from multiprocessing import Pool, cpu_count 3 | 4 | import torch 5 | import torch.multiprocessing as mp 6 | from tqdm import tqdm 7 | 8 | import utils 9 | from config import config 10 | from 
.clap_wrapper import get_clap_audio_feature
11 | import librosa
12 | import os
13 | 
14 | os.environ["OMP_NUM_THREADS"] = "1"
15 | os.environ["MKL_NUM_THREADS"] = "1"
16 | 
17 | 
18 | def process_line(line):
19 |     device = config.emo_gen_config.device
20 |     if config.emo_gen_config.use_multi_device:
21 |         rank = mp.current_process()._identity
22 |         rank = rank[0] if len(rank) > 0 else 0
23 |         if torch.cuda.is_available():
24 |             gpu_id = rank % torch.cuda.device_count()
25 |             device = torch.device(f"cuda:{gpu_id}")
26 |         else:
27 |             device = torch.device("cpu")
28 |     wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")
29 | 
30 |     clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.npy")
31 |     if os.path.isfile(clap_path):
32 |         return
33 | 
34 |     audio = librosa.load(wav_path, sr=48000)[0]
35 |     # audio = librosa.resample(audio, 44100, 48000)
36 | 
37 |     clap = get_clap_audio_feature(audio, device)
38 |     torch.save(clap, clap_path)
39 | 
40 | 
41 | if __name__ == "__main__":
42 |     parser = argparse.ArgumentParser()
43 |     parser.add_argument(
44 |         "-c", "--config", type=str, default=config.emo_gen_config.config_path
45 |     )
46 |     parser.add_argument(
47 |         "--num_processes", type=int, default=config.emo_gen_config.num_processes
48 |     )
49 |     args, _ = parser.parse_known_args()
50 |     config_path = args.config
51 |     hps = utils.get_hparams_from_file(config_path)
52 |     lines = []
53 |     with open(hps.data.training_files, encoding="utf-8") as f:
54 |         lines.extend(f.readlines())
55 | 
56 |     with open(hps.data.validation_files, encoding="utf-8") as f:
57 |         lines.extend(f.readlines())
58 |     if len(lines) != 0:
59 |         num_processes = min(args.num_processes, cpu_count())
60 |         with Pool(processes=num_processes) as pool:
61 |             for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
62 |                 pass
63 | 
64 |     print(f"CLAP feature generation finished! {len(lines)} .emo.npy files in total!")
65 | 
--------------------------------------------------------------------------------
/bert/chinese-roberta-wwm-ext-large/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | language: 
3 | - zh
4 | tags:
5 | - bert
6 | license: "apache-2.0"
7 | ---
8 | 
9 | # Please use 'Bert' related functions to load this model!
10 | 
11 | ## Chinese BERT with Whole Word Masking
12 | For further accelerating Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**. 
13 | 
14 | **[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)** 
15 | Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, Ziqing Yang, Shijin Wang, Guoping Hu
16 | 
17 | This repository is developed based on: https://github.com/google-research/bert
18 | 
19 | You may also be interested in:
20 | - Chinese BERT series: https://github.com/ymcui/Chinese-BERT-wwm
21 | - Chinese MacBERT: https://github.com/ymcui/MacBERT
22 | - Chinese ELECTRA: https://github.com/ymcui/Chinese-ELECTRA
23 | - Chinese XLNet: https://github.com/ymcui/Chinese-XLNet
24 | - Knowledge Distillation Toolkit - TextBrewer: https://github.com/airaria/TextBrewer
25 | 
26 | More resources by HFL: https://github.com/ymcui/HFL-Anthology
27 | 
28 | ## Citation
29 | If you find the technical report or resources useful, please cite the following technical report in your paper.
30 | - Primary: https://arxiv.org/abs/2004.13922 31 | ``` 32 | @inproceedings{cui-etal-2020-revisiting, 33 | title = "Revisiting Pre-Trained Models for {C}hinese Natural Language Processing", 34 | author = "Cui, Yiming and 35 | Che, Wanxiang and 36 | Liu, Ting and 37 | Qin, Bing and 38 | Wang, Shijin and 39 | Hu, Guoping", 40 | booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Findings", 41 | month = nov, 42 | year = "2020", 43 | address = "Online", 44 | publisher = "Association for Computational Linguistics", 45 | url = "https://www.aclweb.org/anthology/2020.findings-emnlp.58", 46 | pages = "657--668", 47 | } 48 | ``` 49 | - Secondary: https://arxiv.org/abs/1906.08101 50 | ``` 51 | @article{chinese-bert-wwm, 52 | title={Pre-Training with Whole Word Masking for Chinese BERT}, 53 | author={Cui, Yiming and Che, Wanxiang and Liu, Ting and Qin, Bing and Yang, Ziqing and Wang, Shijin and Hu, Guoping}, 54 | journal={arXiv preprint arXiv:1906.08101}, 55 | year={2019} 56 | } 57 | ``` 58 | -------------------------------------------------------------------------------- /text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | from text.japanese import text2sep_kata 8 | 9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm" 10 | 11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature( 17 | text, 18 | word2ph, 19 | device=config.bert_gen_config.device, 20 | style_text=None, 21 | style_weight=0.7, 22 | ): 23 | text = "".join(text2sep_kata(text)[0]) 24 | if style_text: 25 | style_text = "".join(text2sep_kata(style_text)[0]) 26 | if ( 27 | sys.platform == "darwin" 28 | and torch.backends.mps.is_available() 29 | and device == "cpu" 30 | ): 31 | device = "mps" 32 | if not device: 33 | device = "cuda" 34 | if device not in models.keys(): 35 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 36 | with torch.no_grad(): 37 | inputs = tokenizer(text, return_tensors="pt") 38 | for i in inputs: 39 | inputs[i] = inputs[i].to(device) 40 | res = models[device](**inputs, output_hidden_states=True) 41 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 42 | if style_text: 43 | style_inputs = tokenizer(style_text, return_tensors="pt") 44 | for i in style_inputs: 45 | style_inputs[i] = style_inputs[i].to(device) 46 | style_res = models[device](**style_inputs, output_hidden_states=True) 47 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() 48 | style_res_mean = style_res.mean(0) 49 | 50 | assert len(word2ph) == len(text) + 2 51 | word2phone = word2ph 52 | phone_level_feature = [] 53 | for i in range(len(word2phone)): 54 | if style_text: 55 | repeat_feature = ( 56 | res[i].repeat(word2phone[i], 1) * (1 - style_weight) 57 | + style_res_mean.repeat(word2phone[i], 1) * style_weight 58 | ) 59 | else: 60 | repeat_feature = res[i].repeat(word2phone[i], 1) 61 | phone_level_feature.append(repeat_feature) 62 | 63 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 64 | 65 | return phone_level_feature.T 66 | -------------------------------------------------------------------------------- /oldVersion/V210/text/japanese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers 
import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | from .japanese import text2sep_kata 8 | 9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm" 10 | 11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 12 | 13 | models = dict() 14 | 15 | 16 | def get_bert_feature( 17 | text, 18 | word2ph, 19 | device=config.bert_gen_config.device, 20 | style_text=None, 21 | style_weight=0.7, 22 | ): 23 | text = "".join(text2sep_kata(text)[0]) 24 | if style_text: 25 | style_text = "".join(text2sep_kata(style_text)[0]) 26 | if ( 27 | sys.platform == "darwin" 28 | and torch.backends.mps.is_available() 29 | and device == "cpu" 30 | ): 31 | device = "mps" 32 | if not device: 33 | device = "cuda" 34 | if device not in models.keys(): 35 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 36 | with torch.no_grad(): 37 | inputs = tokenizer(text, return_tensors="pt") 38 | for i in inputs: 39 | inputs[i] = inputs[i].to(device) 40 | res = models[device](**inputs, output_hidden_states=True) 41 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 42 | if style_text: 43 | style_inputs = tokenizer(style_text, return_tensors="pt") 44 | for i in style_inputs: 45 | style_inputs[i] = style_inputs[i].to(device) 46 | style_res = models[device](**style_inputs, output_hidden_states=True) 47 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() 48 | style_res_mean = style_res.mean(0) 49 | 50 | assert len(word2ph) == len(text) + 2 51 | word2phone = word2ph 52 | phone_level_feature = [] 53 | for i in range(len(word2phone)): 54 | if style_text: 55 | repeat_feature = ( 56 | res[i].repeat(word2phone[i], 1) * (1 - style_weight) 57 | + style_res_mean.repeat(word2phone[i], 1) * style_weight 58 | ) 59 | else: 60 | repeat_feature = res[i].repeat(word2phone[i], 1) 61 | phone_level_feature.append(repeat_feature) 62 | 63 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 64 | 65 | return phone_level_feature.T 66 | -------------------------------------------------------------------------------- /oldVersion/V101/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 1.0.1 版本兼容 3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.0.1 4 | """ 5 | import torch 6 | import commons 7 | from .text.cleaner import clean_text 8 | from .text import cleaned_text_to_sequence 9 | from oldVersion.V111.text import get_bert 10 | 11 | 12 | def get_text(text, language_str, hps, device): 13 | norm_text, phone, tone, word2ph = clean_text(text, language_str) 14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 15 | 16 | if hps.data.add_blank: 17 | phone = commons.intersperse(phone, 0) 18 | tone = commons.intersperse(tone, 0) 19 | language = commons.intersperse(language, 0) 20 | for i in range(len(word2ph)): 21 | word2ph[i] = word2ph[i] * 2 22 | word2ph[0] += 1 23 | bert = get_bert(norm_text, word2ph, language_str, device) 24 | del word2ph 25 | 26 | assert bert.shape[-1] == len(phone) 27 | 28 | phone = torch.LongTensor(phone) 29 | tone = torch.LongTensor(tone) 30 | language = torch.LongTensor(language) 31 | 32 | return bert, phone, tone, language 33 | 34 | 35 | def infer( 36 | text, 37 | sdp_ratio, 38 | noise_scale, 39 | noise_scale_w, 40 | length_scale, 41 | sid, 42 | hps, 43 | net_g, 44 | device, 45 | ): 46 | bert, phones, tones, lang_ids = get_text(text, "ZH", hps, device) 47 | with torch.no_grad(): 48 | x_tst = phones.to(device).unsqueeze(0) 49 | tones = 
tones.to(device).unsqueeze(0) 50 | lang_ids = lang_ids.to(device).unsqueeze(0) 51 | bert = bert.to(device).unsqueeze(0) 52 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 53 | del phones 54 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) 55 | audio = ( 56 | net_g.infer( 57 | x_tst, 58 | x_tst_lengths, 59 | speakers, 60 | tones, 61 | lang_ids, 62 | bert, 63 | sdp_ratio=sdp_ratio, 64 | noise_scale=noise_scale, 65 | noise_scale_w=noise_scale_w, 66 | length_scale=length_scale, 67 | )[0][0, 0] 68 | .data.cpu() 69 | .float() 70 | .numpy() 71 | ) 72 | del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers 73 | if torch.cuda.is_available(): 74 | torch.cuda.empty_cache() 75 | return audio 76 | -------------------------------------------------------------------------------- /oldVersion/V110/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | from transformers import AutoTokenizer, AutoModelForMaskedLM 4 | 5 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large") 6 | 7 | 8 | def get_bert_feature(text, word2ph, device=None): 9 | if ( 10 | sys.platform == "darwin" 11 | and torch.backends.mps.is_available() 12 | and device == "cpu" 13 | ): 14 | device = "mps" 15 | if not device: 16 | device = "cuda" 17 | model = AutoModelForMaskedLM.from_pretrained( 18 | "./bert/chinese-roberta-wwm-ext-large" 19 | ).to(device) 20 | with torch.no_grad(): 21 | inputs = tokenizer(text, return_tensors="pt") 22 | for i in inputs: 23 | inputs[i] = inputs[i].to(device) 24 | res = model(**inputs, output_hidden_states=True) 25 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 26 | 27 | assert len(word2ph) == len(text) + 2 28 | word2phone = word2ph 29 | phone_level_feature = [] 30 | for i in range(len(word2phone)): 31 | repeat_feature = res[i].repeat(word2phone[i], 1) 32 | phone_level_feature.append(repeat_feature) 33 | 34 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 35 | 36 | return phone_level_feature.T 37 | 38 | 39 | if __name__ == "__main__": 40 | import torch 41 | 42 | word_level_feature = torch.rand(38, 1024) # 38 words, each with a 1024-dim feature vector 43 | word2phone = [ 44 | 1, 45 | 2, 46 | 1, 47 | 2, 48 | 2, 49 | 1, 50 | 2, 51 | 2, 52 | 1, 53 | 2, 54 | 2, 55 | 1, 56 | 2, 57 | 2, 58 | 2, 59 | 2, 60 | 2, 61 | 1, 62 | 1, 63 | 2, 64 | 2, 65 | 1, 66 | 2, 67 | 2, 68 | 2, 69 | 2, 70 | 1, 71 | 2, 72 | 2, 73 | 2, 74 | 2, 75 | 2, 76 | 1, 77 | 2, 78 | 2, 79 | 2, 80 | 2, 81 | 1, 82 | ] 83 | 84 | # compute the total number of frames 85 | total_frames = sum(word2phone) 86 | print(word_level_feature.shape) 87 | print(word2phone) 88 | phone_level_feature = [] 89 | for i in range(len(word2phone)): 90 | print(word_level_feature[i].shape) 91 | 92 | # repeat each word's feature word2phone[i] times 93 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 94 | phone_level_feature.append(repeat_feature) 95 | 96 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 97 | print(phone_level_feature.shape) # torch.Size([65, 1024]) (sum(word2phone) frames) 98 | -------------------------------------------------------------------------------- /whisper_transcribe.py: -------------------------------------------------------------------------------- 1 | import whisper 2 | import os 3 | import argparse 4 | import torch 5 | 6 | 7 | def transcribe_one(audio_path): 8 | # load audio and pad/trim it to fit 30 seconds 9 | audio = whisper.load_audio(audio_path) 10 | audio = whisper.pad_or_trim(audio) 11 | 12 | # make log-Mel spectrogram and move to the same device as the
model 13 | mel = whisper.log_mel_spectrogram(audio).to(model.device) 14 | 15 | # detect the spoken language 16 | _, probs = model.detect_language(mel) 17 | print(f"Detected language: {max(probs, key=probs.get)}") 18 | lang = max(probs, key=probs.get) 19 | # decode the audio 20 | options = whisper.DecodingOptions(beam_size=5) 21 | result = whisper.decode(model, mel, options) 22 | 23 | # print the recognized text 24 | print(result.text) 25 | return lang, result.text 26 | 27 | 28 | if __name__ == "__main__": 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--languages", default="CJ") 31 | parser.add_argument("--whisper_size", default="medium") 32 | parser.add_argument("--speaker") 33 | parser.add_argument("--input_dir") 34 | parser.add_argument("--output") 35 | args = parser.parse_args() 36 | 37 | model = whisper.load_model(args.whisper_size) 38 | speaker = args.speaker 39 | input_dir = args.input_dir 40 | output = args.output 41 | 42 | if args.languages == "CJE": 43 | lang2token = { 44 | "zh": "ZH|", 45 | "ja": "JP|", 46 | "en": "EN|", 47 | } 48 | elif args.languages == "CJ": 49 | lang2token = { 50 | "zh": "ZH|", 51 | "ja": "JP|", 52 | } 53 | elif args.languages == "C": 54 | lang2token = { 55 | "zh": "ZH|", 56 | } 57 | 58 | assert torch.cuda.is_available(), "Please enable GPU in order to run Whisper!" 59 | 60 | speaker_annos = [] 61 | total_files = len(next(os.walk(input_dir))[2]) # count only the top-level files iterated below 62 | 63 | for i, wavfile in enumerate(list(os.walk(input_dir))[0][2]): 64 | try: 65 | lang, text = transcribe_one(os.path.join(input_dir, wavfile)) # read each wav from input_dir 66 | if lang not in list(lang2token.keys()): 67 | print(f"{lang} not supported, ignoring\n") 68 | continue 69 | speaker_annos.append(f"{wavfile}|{speaker}|{lang2token[lang]}{text}") 70 | print(f"Processed: {i + 1}/{total_files}") 71 | except Exception as e: 72 | print(e) 73 | continue 74 | 75 | with open(output, "w", encoding="utf-8") as f: 76 | f.write("\n".join(speaker_annos)) -------------------------------------------------------------------------------- /re_matching.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def extract_language_and_text_updated(speaker, dialogue): 5 | # use a regex to match <language> tags and the text that follows them 6 | pattern_language_text = r"<(\S+?)>([^<]+)" 7 | matches = re.findall(pattern_language_text, dialogue, re.DOTALL) 8 | speaker = speaker[1:-1] 9 | # clean up the text: strip surrounding whitespace 10 | matches_cleaned = [(lang.upper(), text.strip()) for lang, text in matches] 11 | matches_cleaned.append(speaker) 12 | return matches_cleaned 13 | 14 | 15 | def validate_text(input_text): 16 | # regex for validating the speaker tag 17 | pattern_speaker = r"(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)" 18 | 19 | # use the re.DOTALL flag so that . matches any character, including newlines 20 | matches = re.findall(pattern_speaker, input_text, re.DOTALL) 21 | 22 | # further validate the dialogue content of each matched speaker 23 | for _, dialogue in matches: 24 | language_text_matches = extract_language_and_text_updated(_, dialogue) 25 | if not language_text_matches: 26 | return ( 27 | False, 28 | "Error: Invalid format detected in dialogue content. Please check your input.", 29 | ) 30 | 31 | # no matches were found in the input text 32 | if not matches: 33 | return ( 34 | False, 35 | "Error: No valid speaker format detected. Please check your input.", 36 | ) 37 | 38 | return True, "Input is valid."
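Editor's note: the `[speaker]<lang>text` markup that `validate_text` and `text_matching` handle is easiest to see from a concrete call. A minimal usage sketch of our own (not part of the repo), assuming `re_matching.py` is importable from the repo root:

```python
# Usage sketch for the [speaker]<lang>text markup parsed above.
from re_matching import text_matching, validate_text

sample = "[speaker1]<zh>你好<en>hello"
ok, msg = validate_text(sample)
print(ok, msg)  # True Input is valid.

# Each speaker block becomes [(LANG, text), ..., speaker_name]:
print(text_matching(sample))
# [[('ZH', '你好'), ('EN', 'hello'), 'speaker1']]
```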
39 | 40 | 41 | def text_matching(text: str) -> list: 42 | speaker_pattern = r"(\[\S+?\])(.+?)(?=\[\S+?\]|$)" 43 | matches = re.findall(speaker_pattern, text, re.DOTALL) 44 | result = [] 45 | for speaker, dialogue in matches: 46 | result.append(extract_language_and_text_updated(speaker, dialogue)) 47 | return result 48 | 49 | 50 | def cut_para(text): 51 | splitted_para = re.split("[\n]", text) # split into paragraphs 52 | splitted_para = [ 53 | sentence.strip() for sentence in splitted_para if sentence.strip() 54 | ] # drop empty strings 55 | return splitted_para 56 | 57 | 58 | def cut_sent(para): 59 | para = re.sub(r"([。!;?\?])([^”’])", r"\1\n\2", para) # single-character sentence terminators 60 | para = re.sub(r"(\.{6})([^”’])", r"\1\n\2", para) # English ellipsis 61 | para = re.sub(r"(\…{2})([^”’])", r"\1\n\2", para) # Chinese ellipsis 62 | para = re.sub(r"([。!?\?][”’])([^,。!?\?])", r"\1\n\2", para) 63 | para = para.rstrip() # strip any extra trailing \n at the end of the paragraph 64 | return para.split("\n") 65 | 66 | 67 | if __name__ == "__main__": 68 | text = """ 69 | [说话人1] 70 | [说话人2]你好吗?元気ですか?こんにちは,世界。你好吗? 71 | [说话人3]谢谢。どういたしまして。 72 | """ 73 | text_matching(text) 74 | # test the function 75 | test_text = """ 76 | [说话人1]你好,こんにちは!こんにちは,世界。 77 | [说话人2]你好吗? 78 | """ 79 | text_matching(test_text) 80 | res = validate_text(test_text) 81 | print(res) 82 | -------------------------------------------------------------------------------- /oldVersion/V101/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | from transformers import AutoTokenizer, AutoModelForMaskedLM 4 | 5 | device = torch.device( 6 | "cuda" 7 | if torch.cuda.is_available() 8 | else ( 9 | "mps" 10 | if sys.platform == "darwin" and torch.backends.mps.is_available() 11 | else "cpu" 12 | ) 13 | ) 14 | 15 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large") 16 | model = AutoModelForMaskedLM.from_pretrained("./bert/chinese-roberta-wwm-ext-large").to( 17 | device 18 | ) 19 | 20 | 21 | def get_bert_feature(text, word2ph): 22 | with torch.no_grad(): 23 | inputs = tokenizer(text, return_tensors="pt") 24 | for i in inputs: 25 | inputs[i] = inputs[i].to(device) 26 | res = model(**inputs, output_hidden_states=True) 27 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 28 | 29 | assert len(word2ph) == len(text) + 2 30 | word2phone = word2ph 31 | phone_level_feature = [] 32 | for i in range(len(word2phone)): 33 | repeat_feature = res[i].repeat(word2phone[i], 1) 34 | phone_level_feature.append(repeat_feature) 35 | 36 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 37 | 38 | return phone_level_feature.T 39 | 40 | 41 | if __name__ == "__main__": 42 | # feature = get_bert_feature('你好,我是说的道理。') 43 | import torch 44 | 45 | word_level_feature = torch.rand(38, 1024) # 38 words, each with a 1024-dim feature vector 46 | word2phone = [ 47 | 1, 48 | 2, 49 | 1, 50 | 2, 51 | 2, 52 | 1, 53 | 2, 54 | 2, 55 | 1, 56 | 2, 57 | 2, 58 | 1, 59 | 2, 60 | 2, 61 | 2, 62 | 2, 63 | 2, 64 | 1, 65 | 1, 66 | 2, 67 | 2, 68 | 1, 69 | 2, 70 | 2, 71 | 2, 72 | 2, 73 | 1, 74 | 2, 75 | 2, 76 | 2, 77 | 2, 78 | 2, 79 | 1, 80 | 2, 81 | 2, 82 | 2, 83 | 2, 84 | 1, 85 | ] 86 | 87 | # compute the total number of frames 88 | total_frames = sum(word2phone) 89 | print(word_level_feature.shape) 90 | print(word2phone) 91 | phone_level_feature = [] 92 | for i in range(len(word2phone)): 93 | print(word_level_feature[i].shape) 94 | 95 | # repeat each word's feature word2phone[i] times 96 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 97 | phone_level_feature.append(repeat_feature) 98 | 99 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 100 |
print(phone_level_feature.shape) # torch.Size([65, 1024]) 101 | -------------------------------------------------------------------------------- /compress_model.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from text.symbols import symbols 3 | import torch 4 | 5 | from tools.log import logger 6 | import utils 7 | from models import SynthesizerTrn 8 | import os 9 | 10 | 11 | def copyStateDict(state_dict): 12 | if list(state_dict.keys())[0].startswith("module"): 13 | start_idx = 1 14 | else: 15 | start_idx = 0 16 | new_state_dict = OrderedDict() 17 | for k, v in state_dict.items(): 18 | name = ".".join(k.split(".")[start_idx:]) # re-join the remaining key components with "." 19 | new_state_dict[name] = v 20 | return new_state_dict 21 | 22 | 23 | def removeOptimizer(config: str, input_model: str, ishalf: bool, output_model: str): 24 | hps = utils.get_hparams_from_file(config) 25 | 26 | net_g = SynthesizerTrn( 27 | len(symbols), 28 | hps.data.filter_length // 2 + 1, 29 | hps.train.segment_size // hps.data.hop_length, 30 | n_speakers=hps.data.n_speakers, 31 | **hps.model, 32 | ) 33 | 34 | optim_g = torch.optim.AdamW( 35 | net_g.parameters(), 36 | hps.train.learning_rate, 37 | betas=hps.train.betas, 38 | eps=hps.train.eps, 39 | ) 40 | 41 | state_dict_g = torch.load(input_model, map_location="cpu") 42 | new_dict_g = copyStateDict(state_dict_g) 43 | keys = [] 44 | for k, v in new_dict_g["model"].items(): 45 | if "enc_q" in k: 46 | continue # noqa: E701 47 | keys.append(k) 48 | 49 | new_dict_g = ( 50 | {k: new_dict_g["model"][k].half() for k in keys} 51 | if ishalf 52 | else {k: new_dict_g["model"][k] for k in keys} 53 | ) 54 | 55 | torch.save( 56 | { 57 | "model": new_dict_g, 58 | "iteration": 0, 59 | "optimizer": optim_g.state_dict(), 60 | "learning_rate": 0.0001, 61 | }, 62 | output_model, 63 | ) 64 | 65 | 66 | if __name__ == "__main__": 67 | import argparse 68 | 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument("-c", "--config", type=str, default="configs/config.json") 71 | parser.add_argument("-i", "--input", type=str) 72 | parser.add_argument("-o", "--output", type=str, default=None) 73 | parser.add_argument( 74 | "-hf", "--half", action="store_true", default=False, help="Save as FP16" 75 | ) 76 | 77 | args = parser.parse_args() 78 | 79 | output = args.output 80 | 81 | if output is None: 82 | import os.path 83 | 84 | filename, ext = os.path.splitext(args.input) 85 | half = "_half" if args.half else "" 86 | output = filename + "_release" + half + ext 87 | 88 | removeOptimizer(args.config, args.input, args.half, output) 89 | logger.info(f"压缩模型成功, 输出模型: {os.path.abspath(output)}") 90 | -------------------------------------------------------------------------------- /slm/wavlm-base-plus/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "wavlm-base-plus", 3 | "activation_dropout": 0.0, 4 | "adapter_kernel_size": 3, 5 | "adapter_stride": 2, 6 | "add_adapter": false, 7 | "apply_spec_augment": true, 8 | "architectures": [ 9 | "WavLMModel" 10 | ], 11 | "attention_dropout": 0.1, 12 | "bos_token_id": 1, 13 | "classifier_proj_size": 256, 14 | "codevector_dim": 256, 15 | "contrastive_logits_temperature": 0.1, 16 | "conv_bias": false, 17 | "conv_dim": [ 18 | 512, 19 | 512, 20 | 512, 21 | 512, 22 | 512, 23 | 512, 24 | 512 25 | ], 26 | "conv_kernel": [ 27 | 10, 28 | 3, 29 | 3, 30 | 3, 31 | 3, 32 | 2, 33 | 2 34 | ], 35 | "conv_stride": [ 36 | 5, 37 | 2, 38 | 2, 39 | 2, 40 | 2, 41 | 2, 42 |
2 43 | ], 44 | "ctc_loss_reduction": "sum", 45 | "ctc_zero_infinity": false, 46 | "diversity_loss_weight": 0.1, 47 | "do_stable_layer_norm": false, 48 | "eos_token_id": 2, 49 | "feat_extract_activation": "gelu", 50 | "feat_extract_norm": "group", 51 | "feat_proj_dropout": 0.1, 52 | "feat_quantizer_dropout": 0.0, 53 | "final_dropout": 0.0, 54 | "freeze_feat_extract_train": true, 55 | "hidden_act": "gelu", 56 | "hidden_dropout": 0.1, 57 | "hidden_size": 768, 58 | "initializer_range": 0.02, 59 | "intermediate_size": 3072, 60 | "layer_norm_eps": 1e-05, 61 | "layerdrop": 0.05, 62 | "mask_channel_length": 10, 63 | "mask_channel_min_space": 1, 64 | "mask_channel_other": 0.0, 65 | "mask_channel_prob": 0.0, 66 | "mask_channel_selection": "static", 67 | "mask_feature_length": 10, 68 | "mask_feature_min_masks": 0, 69 | "mask_feature_prob": 0.0, 70 | "mask_time_length": 10, 71 | "mask_time_min_masks": 2, 72 | "mask_time_min_space": 1, 73 | "mask_time_other": 0.0, 74 | "mask_time_prob": 0.05, 75 | "mask_time_selection": "static", 76 | "model_type": "wavlm", 77 | "no_mask_channel_overlap": false, 78 | "no_mask_time_overlap": false, 79 | "num_adapter_layers": 3, 80 | "num_attention_heads": 12, 81 | "num_buckets": 320, 82 | "num_codevector_groups": 2, 83 | "num_codevectors_per_group": 320, 84 | "num_conv_pos_embedding_groups": 16, 85 | "num_conv_pos_embeddings": 128, 86 | "num_ctc_classes": 80, 87 | "num_feat_extract_layers": 7, 88 | "num_hidden_layers": 12, 89 | "num_negatives": 100, 90 | "output_hidden_size": 768, 91 | "pad_token_id": 0, 92 | "proj_codevector_dim": 256, 93 | "replace_prob": 0.5, 94 | "torch_dtype": "float32", 95 | "transformers_version": "4.13.0.dev0", 96 | "use_weighted_layer_sum": false, 97 | "vocab_size": 32, 98 | "tokenizer_class": "Wav2Vec2CTCTokenizer" 99 | } 100 | -------------------------------------------------------------------------------- /oldVersion/V111/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | from transformers import AutoTokenizer, AutoModelForMaskedLM 4 | 5 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large") 6 | 7 | models = dict() 8 | 9 | 10 | def get_bert_feature(text, word2ph, device=None): 11 | if ( 12 | sys.platform == "darwin" 13 | and torch.backends.mps.is_available() 14 | and device == "cpu" 15 | ): 16 | device = "mps" 17 | if not device: 18 | device = "cuda" 19 | if device not in models.keys(): 20 | models[device] = AutoModelForMaskedLM.from_pretrained( 21 | "./bert/chinese-roberta-wwm-ext-large" 22 | ).to(device) 23 | with torch.no_grad(): 24 | inputs = tokenizer(text, return_tensors="pt") 25 | for i in inputs: 26 | inputs[i] = inputs[i].to(device) 27 | res = models[device](**inputs, output_hidden_states=True) 28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 29 | 30 | assert len(word2ph) == len(text) + 2 31 | word2phone = word2ph 32 | phone_level_feature = [] 33 | for i in range(len(word2phone)): 34 | repeat_feature = res[i].repeat(word2phone[i], 1) 35 | phone_level_feature.append(repeat_feature) 36 | 37 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 38 | 39 | return phone_level_feature.T 40 | 41 | 42 | if __name__ == "__main__": 43 | import torch 44 | 45 | word_level_feature = torch.rand(38, 1024) # 38 words, each with a 1024-dim feature vector 46 | word2phone = [ 47 | 1, 48 | 2, 49 | 1, 50 | 2, 51 | 2, 52 | 1, 53 | 2, 54 | 2, 55 | 1, 56 | 2, 57 | 2, 58 | 1, 59 | 2, 60 | 2, 61 | 2, 62 | 2, 63 | 2, 64 | 1, 65 | 1, 66 | 2, 67 | 2, 68 |
1, 69 | 2, 70 | 2, 71 | 2, 72 | 2, 73 | 1, 74 | 2, 75 | 2, 76 | 2, 77 | 2, 78 | 2, 79 | 1, 80 | 2, 81 | 2, 82 | 2, 83 | 2, 84 | 1, 85 | ] 86 | 87 | # compute the total number of frames 88 | total_frames = sum(word2phone) 89 | print(word_level_feature.shape) 90 | print(word2phone) 91 | phone_level_feature = [] 92 | for i in range(len(word2phone)): 93 | print(word_level_feature[i].shape) 94 | 95 | # repeat each word's feature word2phone[i] times 96 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 97 | phone_level_feature.append(repeat_feature) 98 | 99 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 100 | print(phone_level_feature.shape) # torch.Size([65, 1024]) 101 | -------------------------------------------------------------------------------- /oldVersion/V200/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | 8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large" 9 | 10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 11 | 12 | models = dict() 13 | 14 | 15 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 16 | if ( 17 | sys.platform == "darwin" 18 | and torch.backends.mps.is_available() 19 | and device == "cpu" 20 | ): 21 | device = "mps" 22 | if not device: 23 | device = "cuda" 24 | if device not in models.keys(): 25 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 26 | with torch.no_grad(): 27 | inputs = tokenizer(text, return_tensors="pt") 28 | for i in inputs: 29 | inputs[i] = inputs[i].to(device) 30 | res = models[device](**inputs, output_hidden_states=True) 31 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 32 | 33 | assert len(word2ph) == len(text) + 2 34 | word2phone = word2ph 35 | phone_level_feature = [] 36 | for i in range(len(word2phone)): 37 | repeat_feature = res[i].repeat(word2phone[i], 1) 38 | phone_level_feature.append(repeat_feature) 39 | 40 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 41 | 42 | return phone_level_feature.T 43 | 44 | 45 | if __name__ == "__main__": 46 | word_level_feature = torch.rand(38, 1024) # 38 words, each with a 1024-dim feature vector 47 | word2phone = [ 48 | 1, 49 | 2, 50 | 1, 51 | 2, 52 | 2, 53 | 1, 54 | 2, 55 | 2, 56 | 1, 57 | 2, 58 | 2, 59 | 1, 60 | 2, 61 | 2, 62 | 2, 63 | 2, 64 | 2, 65 | 1, 66 | 1, 67 | 2, 68 | 2, 69 | 1, 70 | 2, 71 | 2, 72 | 2, 73 | 2, 74 | 1, 75 | 2, 76 | 2, 77 | 2, 78 | 2, 79 | 2, 80 | 1, 81 | 2, 82 | 2, 83 | 2, 84 | 2, 85 | 1, 86 | ] 87 | 88 | # compute the total number of frames 89 | total_frames = sum(word2phone) 90 | print(word_level_feature.shape) 91 | print(word2phone) 92 | phone_level_feature = [] 93 | for i in range(len(word2phone)): 94 | print(word_level_feature[i].shape) 95 | 96 | # repeat each word's feature word2phone[i] times 97 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 98 | phone_level_feature.append(repeat_feature) 99 | 100 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 101 | print(phone_level_feature.shape) # torch.Size([65, 1024]) 102 | -------------------------------------------------------------------------------- /bert/bert-base-japanese-v3/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | license: apache-2.0 3 | datasets: 4 | - cc100 5 | - wikipedia 6 | language: 7 | - ja 8 | widget: 9 | - text: 東北大学で[MASK]の研究をしています。 10 | --- 11 | 12 | # BERT base Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102) 13 | 14 | This
is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language. 15 | 16 | This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization. 17 | Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective. 18 | 19 | The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/). 20 | 21 | ## Model architecture 22 | 23 | The model architecture is the same as the original BERT base model; 12 layers, 768 dimensions of hidden states, and 12 attention heads. 24 | 25 | ## Training Data 26 | 27 | The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia. 28 | For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023. 29 | The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively. 30 | 31 | For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7). 32 | 33 | ## Tokenization 34 | 35 | The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm. 36 | The vocabulary size is 32768. 37 | 38 | We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization. 39 | 40 | ## Training 41 | 42 | We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps. 43 | For training of the MLM (masked language modeling) objective, we introduced whole word masking in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once. 44 | 45 | For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/). 46 | 47 | ## Licenses 48 | 49 | The pretrained models are distributed under the Apache License 2.0. 50 | 51 | ## Acknowledgments 52 | 53 | This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program. 54 | -------------------------------------------------------------------------------- /bert/bert-large-japanese-v2/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | license: apache-2.0 3 | datasets: 4 | - cc100 5 | - wikipedia 6 | language: 7 | - ja 8 | widget: 9 | - text: 東北大学で[MASK]の研究をしています。 10 | --- 11 | 12 | # BERT large Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102) 13 | 14 | This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language. 15 | 16 | This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization. 17 | Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective. 
18 | 19 | The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/). 20 | 21 | ## Model architecture 22 | 23 | The model architecture is the same as the original BERT large model; 24 layers, 1024 dimensions of hidden states, and 16 attention heads. 24 | 25 | ## Training Data 26 | 27 | The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia. 28 | For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023. 29 | The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively. 30 | 31 | For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7). 32 | 33 | ## Tokenization 34 | 35 | The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm. 36 | The vocabulary size is 32768. 37 | 38 | We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization. 39 | 40 | ## Training 41 | 42 | We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps. 43 | For training of the MLM (masked language modeling) objective, we introduced whole word masking in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once. 44 | 45 | For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/). 46 | 47 | ## Licenses 48 | 49 | The pretrained models are distributed under the Apache License 2.0. 50 | 51 | ## Acknowledgments 52 | 53 | This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program. 
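Editor's note: both Japanese BERT checkpoints documented above are consumed through Hugging Face `transformers` elsewhere in this repo. A minimal sketch of loading one and filling the README's `[MASK]` example; the local path mirrors this repo's `bert/` layout, and it is an assumption that the weights have been downloaded there (the tokenizer also needs the `fugashi` and `unidic-lite` packages mentioned above):

```python
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

LOCAL_PATH = "./bert/bert-large-japanese-v2"  # assumed local checkpoint directory

tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
model = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH)

inputs = tokenizer("東北大学で[MASK]の研究をしています。", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# locate the [MASK] position and decode the highest-scoring token for it
mask_pos = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero()[0]
print(tokenizer.decode(logits[0, mask_pos].argmax(-1)))
```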
54 | -------------------------------------------------------------------------------- /onnx_modules/V200/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | 8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large" 9 | 10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 11 | 12 | models = dict() 13 | 14 | 15 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): 16 | if ( 17 | sys.platform == "darwin" 18 | and torch.backends.mps.is_available() 19 | and device == "cpu" 20 | ): 21 | device = "mps" 22 | if not device: 23 | device = "cuda" 24 | if device not in models.keys(): 25 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 26 | with torch.no_grad(): 27 | inputs = tokenizer(text, return_tensors="pt") 28 | for i in inputs: 29 | inputs[i] = inputs[i].to(device) 30 | res = models[device](**inputs, output_hidden_states=True) 31 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 32 | 33 | assert len(word2ph) == len(text) + 2 34 | word2phone = word2ph 35 | phone_level_feature = [] 36 | for i in range(len(word2phone)): 37 | repeat_feature = res[i].repeat(word2phone[i], 1) 38 | phone_level_feature.append(repeat_feature) 39 | 40 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 41 | 42 | return phone_level_feature.T 43 | 44 | 45 | if __name__ == "__main__": 46 | word_level_feature = torch.rand(38, 1024) # 38 words, each with a 1024-dim feature vector 47 | word2phone = [ 48 | 1, 49 | 2, 50 | 1, 51 | 2, 52 | 2, 53 | 1, 54 | 2, 55 | 2, 56 | 1, 57 | 2, 58 | 2, 59 | 1, 60 | 2, 61 | 2, 62 | 2, 63 | 2, 64 | 2, 65 | 1, 66 | 1, 67 | 2, 68 | 2, 69 | 1, 70 | 2, 71 | 2, 72 | 2, 73 | 2, 74 | 1, 75 | 2, 76 | 2, 77 | 2, 78 | 2, 79 | 2, 80 | 1, 81 | 2, 82 | 2, 83 | 2, 84 | 2, 85 | 1, 86 | ] 87 | 88 | # compute the total number of frames 89 | total_frames = sum(word2phone) 90 | print(word_level_feature.shape) 91 | print(word2phone) 92 | phone_level_feature = [] 93 | for i in range(len(word2phone)): 94 | print(word_level_feature[i].shape) 95 | 96 | # repeat each word's feature word2phone[i] times 97 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 98 | phone_level_feature.append(repeat_feature) 99 | 100 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 101 | print(phone_level_feature.shape) # torch.Size([65, 1024]) 102 | -------------------------------------------------------------------------------- /emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "torch", 3 | "activation_dropout": 0.1, 4 | "adapter_kernel_size": 3, 5 | "adapter_stride": 2, 6 | "add_adapter": false, 7 | "apply_spec_augment": true, 8 | "architectures": [ 9 | "Wav2Vec2ForSpeechClassification" 10 | ], 11 | "attention_dropout": 0.1, 12 | "bos_token_id": 1, 13 | "classifier_proj_size": 256, 14 | "codevector_dim": 768, 15 | "contrastive_logits_temperature": 0.1, 16 | "conv_bias": true, 17 | "conv_dim": [ 18 | 512, 19 | 512, 20 | 512, 21 | 512, 22 | 512, 23 | 512, 24 | 512 25 | ], 26 | "conv_kernel": [ 27 | 10, 28 | 3, 29 | 3, 30 | 3, 31 | 3, 32 | 2, 33 | 2 34 | ], 35 | "conv_stride": [ 36 | 5, 37 | 2, 38 | 2, 39 | 2, 40 | 2, 41 | 2, 42 | 2 43 | ], 44 | "ctc_loss_reduction": "sum", 45 | "ctc_zero_infinity": false, 46 | "diversity_loss_weight": 0.1, 47 | "do_stable_layer_norm": true, 48 | "eos_token_id": 2, 49 |
"feat_extract_activation": "gelu", 50 | "feat_extract_dropout": 0.0, 51 | "feat_extract_norm": "layer", 52 | "feat_proj_dropout": 0.1, 53 | "feat_quantizer_dropout": 0.0, 54 | "final_dropout": 0.1, 55 | "finetuning_task": "wav2vec2_reg", 56 | "gradient_checkpointing": false, 57 | "hidden_act": "gelu", 58 | "hidden_dropout": 0.1, 59 | "hidden_dropout_prob": 0.1, 60 | "hidden_size": 1024, 61 | "id2label": { 62 | "0": "arousal", 63 | "1": "dominance", 64 | "2": "valence" 65 | }, 66 | "initializer_range": 0.02, 67 | "intermediate_size": 4096, 68 | "label2id": { 69 | "arousal": 0, 70 | "dominance": 1, 71 | "valence": 2 72 | }, 73 | "layer_norm_eps": 1e-05, 74 | "layerdrop": 0.1, 75 | "mask_feature_length": 10, 76 | "mask_feature_min_masks": 0, 77 | "mask_feature_prob": 0.0, 78 | "mask_time_length": 10, 79 | "mask_time_min_masks": 2, 80 | "mask_time_prob": 0.05, 81 | "model_type": "wav2vec2", 82 | "num_adapter_layers": 3, 83 | "num_attention_heads": 16, 84 | "num_codevector_groups": 2, 85 | "num_codevectors_per_group": 320, 86 | "num_conv_pos_embedding_groups": 16, 87 | "num_conv_pos_embeddings": 128, 88 | "num_feat_extract_layers": 7, 89 | "num_hidden_layers": 12, 90 | "num_negatives": 100, 91 | "output_hidden_size": 1024, 92 | "pad_token_id": 0, 93 | "pooling_mode": "mean", 94 | "problem_type": "regression", 95 | "proj_codevector_dim": 768, 96 | "tdnn_dilation": [ 97 | 1, 98 | 2, 99 | 3, 100 | 1, 101 | 1 102 | ], 103 | "tdnn_dim": [ 104 | 512, 105 | 512, 106 | 512, 107 | 512, 108 | 1500 109 | ], 110 | "tdnn_kernel": [ 111 | 5, 112 | 3, 113 | 3, 114 | 1, 115 | 1 116 | ], 117 | "torch_dtype": "float32", 118 | "transformers_version": "4.17.0.dev0", 119 | "use_weighted_layer_sum": false, 120 | "vocab_size": null, 121 | "xvector_output_dim": 512 122 | } 123 | -------------------------------------------------------------------------------- /spec_gen.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | from multiprocessing import Pool 4 | from mel_processing import spectrogram_torch, mel_spectrogram_torch 5 | from utils import load_wav_to_torch 6 | 7 | 8 | class AudioProcessor: 9 | def __init__( 10 | self, 11 | max_wav_value, 12 | use_mel_spec_posterior, 13 | filter_length, 14 | n_mel_channels, 15 | sampling_rate, 16 | hop_length, 17 | win_length, 18 | mel_fmin, 19 | mel_fmax, 20 | ): 21 | self.max_wav_value = max_wav_value 22 | self.use_mel_spec_posterior = use_mel_spec_posterior 23 | self.filter_length = filter_length 24 | self.n_mel_channels = n_mel_channels 25 | self.sampling_rate = sampling_rate 26 | self.hop_length = hop_length 27 | self.win_length = win_length 28 | self.mel_fmin = mel_fmin 29 | self.mel_fmax = mel_fmax 30 | 31 | def process_audio(self, filename): 32 | audio, sampling_rate = load_wav_to_torch(filename) 33 | audio_norm = audio / self.max_wav_value 34 | audio_norm = audio_norm.unsqueeze(0) 35 | spec_filename = filename.replace(".wav", ".spec.pt") 36 | if self.use_mel_spec_posterior: 37 | spec_filename = spec_filename.replace(".spec.pt", ".mel.pt") 38 | try: 39 | spec = torch.load(spec_filename) 40 | except: 41 | if self.use_mel_spec_posterior: 42 | spec = mel_spectrogram_torch( 43 | audio_norm, 44 | self.filter_length, 45 | self.n_mel_channels, 46 | self.sampling_rate, 47 | self.hop_length, 48 | self.win_length, 49 | self.mel_fmin, 50 | self.mel_fmax, 51 | center=False, 52 | ) 53 | else: 54 | spec = spectrogram_torch( 55 | audio_norm, 56 | self.filter_length, 57 | self.sampling_rate, 58 | 
self.hop_length, 59 | self.win_length, 60 | center=False, 61 | ) 62 | spec = torch.squeeze(spec, 0) 63 | torch.save(spec, spec_filename) 64 | return spec, audio_norm 65 | 66 | 67 | # usage example 68 | processor = AudioProcessor( 69 | max_wav_value=32768.0, 70 | use_mel_spec_posterior=False, 71 | filter_length=2048, 72 | n_mel_channels=128, 73 | sampling_rate=44100, 74 | hop_length=512, 75 | win_length=2048, 76 | mel_fmin=0.0, 77 | mel_fmax=None, # no upper mel-frequency bound; a string here would break mel computation 78 | ) 79 | 80 | with open("filelists/train.list", "r") as f: 81 | filepaths = [line.split("|")[0] for line in f] # the first |-separated field of each line is the audio path 82 | 83 | # process the files with a multiprocessing pool 84 | with Pool(processes=32) as pool: # 32 worker processes 85 | with tqdm(total=len(filepaths)) as pbar: 86 | for i, _ in enumerate(pool.imap_unordered(processor.process_audio, filepaths)): 87 | pbar.update() 88 | -------------------------------------------------------------------------------- /bert_gen.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from multiprocessing import Pool 3 | import commons 4 | import utils 5 | from tqdm import tqdm 6 | from text import check_bert_models, cleaned_text_to_sequence, get_bert 7 | import argparse 8 | import torch.multiprocessing as mp 9 | from config import config 10 | 11 | 12 | def process_line(x): 13 | line, add_blank = x 14 | device = config.bert_gen_config.device 15 | if config.bert_gen_config.use_multi_device: 16 | rank = mp.current_process()._identity 17 | rank = rank[0] if len(rank) > 0 else 0 18 | if torch.cuda.is_available(): 19 | gpu_id = rank % torch.cuda.device_count() 20 | device = torch.device(f"cuda:{gpu_id}") 21 | else: 22 | device = torch.device("cpu") 23 | wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|") 24 | phone = phones.split(" ") 25 | tone = [int(i) for i in tone.split(" ")] 26 | word2ph = [int(i) for i in word2ph.split(" ")] 27 | word2ph = [i for i in word2ph] 28 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 29 | 30 | if add_blank: 31 | phone = commons.intersperse(phone, 0) 32 | tone = commons.intersperse(tone, 0) 33 | language = commons.intersperse(language, 0) 34 | for i in range(len(word2ph)): 35 | word2ph[i] = word2ph[i] * 2 36 | word2ph[0] += 1 37 | 38 | bert_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".bert.pt") 39 | 40 | try: 41 | bert = torch.load(bert_path) 42 | assert bert.shape[0] == 2048 43 | except Exception: 44 | bert = get_bert(text, word2ph, language_str, device) 45 | assert bert.shape[-1] == len(phone) 46 | torch.save(bert, bert_path) 47 | 48 | 49 | preprocess_text_config = config.preprocess_text_config 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument( 54 | "-c", "--config", type=str, default=config.bert_gen_config.config_path 55 | ) 56 | parser.add_argument( 57 | "--num_processes", type=int, default=config.bert_gen_config.num_processes 58 | ) 59 | args, _ = parser.parse_known_args() 60 | config_path = args.config 61 | hps = utils.get_hparams_from_file(config_path) 62 | check_bert_models() 63 | lines = [] 64 | with open(hps.data.training_files, encoding="utf-8") as f: 65 | lines.extend(f.readlines()) 66 | 67 | with open(hps.data.validation_files, encoding="utf-8") as f: 68 | lines.extend(f.readlines()) 69 | add_blank = [hps.data.add_blank] * len(lines) 70 | 71 | if len(lines) != 0: 72 | num_processes = args.num_processes 73 | with Pool(processes=num_processes) as pool: 74 | for _ in tqdm( 75 | pool.imap_unordered(process_line, zip(lines,
add_blank)), 76 | total=len(lines), 77 | ): 78 | # the work happens inside process_line; the loop only drives the progress bar 79 | pass # placeholder loop body 80 | 81 | print(f"bert生成完毕!, 共有{len(lines)}个bert.pt生成!") 82 | -------------------------------------------------------------------------------- /oldVersion/V110/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compatibility with version 1.1 3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.1 4 | """ 5 | import torch 6 | import commons 7 | from .text.cleaner import clean_text 8 | from .text import cleaned_text_to_sequence 9 | from oldVersion.V111.text import get_bert 10 | 11 | 12 | def get_text(text, language_str, hps, device): 13 | norm_text, phone, tone, word2ph = clean_text(text, language_str) 14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 15 | 16 | if hps.data.add_blank: 17 | phone = commons.intersperse(phone, 0) 18 | tone = commons.intersperse(tone, 0) 19 | language = commons.intersperse(language, 0) 20 | for i in range(len(word2ph)): 21 | word2ph[i] = word2ph[i] * 2 22 | word2ph[0] += 1 23 | bert = get_bert(norm_text, word2ph, language_str, device) 24 | del word2ph 25 | assert bert.shape[-1] == len(phone), phone 26 | 27 | if language_str == "ZH": 28 | bert = bert 29 | ja_bert = torch.zeros(768, len(phone)) 30 | elif language_str == "JP": 31 | ja_bert = bert 32 | bert = torch.zeros(1024, len(phone)) 33 | else: 34 | bert = torch.zeros(1024, len(phone)) 35 | ja_bert = torch.zeros(768, len(phone)) 36 | 37 | assert bert.shape[-1] == len( 38 | phone 39 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}" 40 | 41 | phone = torch.LongTensor(phone) 42 | tone = torch.LongTensor(tone) 43 | language = torch.LongTensor(language) 44 | return bert, ja_bert, phone, tone, language 45 | 46 | 47 | def infer( 48 | text, 49 | sdp_ratio, 50 | noise_scale, 51 | noise_scale_w, 52 | length_scale, 53 | sid, 54 | language, 55 | hps, 56 | net_g, 57 | device, 58 | ): 59 | bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps, device) 60 | with torch.no_grad(): 61 | x_tst = phones.to(device).unsqueeze(0) 62 | tones = tones.to(device).unsqueeze(0) 63 | lang_ids = lang_ids.to(device).unsqueeze(0) 64 | bert = bert.to(device).unsqueeze(0) 65 | ja_bert = ja_bert.to(device).unsqueeze(0) 66 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 67 | del phones 68 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) 69 | audio = ( 70 | net_g.infer( 71 | x_tst, 72 | x_tst_lengths, 73 | speakers, 74 | tones, 75 | lang_ids, 76 | bert, 77 | ja_bert, 78 | sdp_ratio=sdp_ratio, 79 | noise_scale=noise_scale, 80 | noise_scale_w=noise_scale_w, 81 | length_scale=length_scale, 82 | )[0][0, 0] 83 | .data.cpu() 84 | .float() 85 | .numpy() 86 | ) 87 | del x_tst, x_tst_lengths, speakers, tones, lang_ids, bert, ja_bert 88 | if torch.cuda.is_available(): 89 | torch.cuda.empty_cache() 90 | return audio 91 | -------------------------------------------------------------------------------- /oldVersion/V101/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 |
"ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "I", 78 | "N", 79 | "U", 80 | "a", 81 | "b", 82 | "by", 83 | "ch", 84 | "cl", 85 | "d", 86 | "dy", 87 | "e", 88 | "f", 89 | "g", 90 | "gy", 91 | "h", 92 | "hy", 93 | "i", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "p", 103 | "py", 104 | "r", 105 | "ry", 106 | "s", 107 | "sh", 108 | "t", 109 | "ts", 110 | "u", 111 | "V", 112 | "w", 113 | "y", 114 | "z", 115 | ] 116 | num_ja_tones = 1 117 | 118 | # English 119 | en_symbols = [ 120 | "aa", 121 | "ae", 122 | "ah", 123 | "ao", 124 | "aw", 125 | "ay", 126 | "b", 127 | "ch", 128 | "d", 129 | "dh", 130 | "eh", 131 | "er", 132 | "ey", 133 | "f", 134 | "g", 135 | "hh", 136 | "ih", 137 | "iy", 138 | "jh", 139 | "k", 140 | "l", 141 | "m", 142 | "n", 143 | "ng", 144 | "ow", 145 | "oy", 146 | "p", 147 | "r", 148 | "s", 149 | "sh", 150 | "t", 151 | "th", 152 | "uh", 153 | "uw", 154 | "V", 155 | "w", 156 | "y", 157 | "z", 158 | "zh", 159 | ] 160 | num_en_tones = 4 161 | 162 | # combine all symbols 163 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 164 | symbols = [pad] + normal_symbols + pu_symbols 165 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 166 | 167 | # combine all tones 168 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 169 | 170 | # language maps 171 | language_id_map = {"ZH": 0, "JA": 1, "EN": 2} 172 | num_languages = len(language_id_map.keys()) 173 | 174 | language_tone_start_map = { 175 | "ZH": 0, 176 | "JA": num_zh_tones, 177 | "EN": num_zh_tones + num_ja_tones, 178 | } 179 | 180 | if __name__ == "__main__": 181 | a = set(zh_symbols) 182 | b = set(en_symbols) 183 | print(sorted(a & b)) 184 | -------------------------------------------------------------------------------- /update_status.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gradio as gr 3 | 4 | lang_dict = {"EN(英文)": "_en", "ZH(中文)": "_zh", "JP(日语)": "_jp"} 5 | 6 | 7 | def raw_dir_convert_to_path(target_dir: str, lang): 8 | res = target_dir.rstrip("/").rstrip("\\") 9 | if (not target_dir.startswith("raw")) and (not target_dir.startswith("./raw")): 10 | res = os.path.join("./raw", res) 11 | if ( 12 | (not res.endswith("_zh")) 13 | and (not res.endswith("_jp")) 14 | and (not res.endswith("_en")) 15 | ): 16 | res += lang_dict[lang] 17 | return res 18 | 19 | 20 | def update_g_files(): 21 | g_files = [] 22 | cnt = 0 23 | for root, dirs, files in os.walk(os.path.abspath("./logs")): 24 | for file in files: 25 | if file.startswith("G_") and file.endswith(".pth"): 26 | g_files.append(os.path.join(root, file)) 27 | cnt += 1 28 | print(g_files) 29 | return f"更新模型列表完成, 共找到{cnt}个模型", gr.Dropdown.update(choices=g_files) 30 | 31 | 32 | def update_c_files(): 33 | c_files = [] 34 | cnt = 0 35 | for root, dirs, files in os.walk(os.path.abspath("./logs")): 36 | for file in files: 37 | if file.startswith("config.json"): 38 | c_files.append(os.path.join(root, file)) 39 | cnt += 1 40 | print(c_files) 41 | return f"更新模型列表完成, 共找到{cnt}个配置文件", gr.Dropdown.update(choices=c_files) 42 | 
43 | 44 | def update_model_folders(): 45 | subdirs = [] 46 | cnt = 0 47 | for root, dirs, files in os.walk(os.path.abspath("./logs")): 48 | for dir_name in dirs: 49 | if os.path.basename(dir_name) != "eval": 50 | subdirs.append(os.path.join(root, dir_name)) 51 | cnt += 1 52 | print(subdirs) 53 | return f"更新模型文件夹列表完成, 共找到{cnt}个文件夹", gr.Dropdown.update(choices=subdirs) 54 | 55 | 56 | def update_wav_lab_pairs(): 57 | wav_count = tot_count = 0 58 | for root, _, files in os.walk("./raw"): 59 | for file in files: 60 | # print(file) 61 | file_path = os.path.join(root, file) 62 | if file.lower().endswith(".wav"): 63 | lab_file = os.path.splitext(file_path)[0] + ".lab" 64 | if os.path.exists(lab_file): 65 | wav_count += 1 66 | tot_count += 1 67 | return f"{wav_count} / {tot_count}" 68 | 69 | 70 | def update_raw_folders(): 71 | subdirs = [] 72 | cnt = 0 73 | script_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of this script 74 | raw_path = os.path.join(script_path, "raw") 75 | print(raw_path) 76 | os.makedirs(raw_path, exist_ok=True) 77 | for root, dirs, files in os.walk(raw_path): 78 | for dir_name in dirs: 79 | relative_path = os.path.relpath( 80 | os.path.join(root, dir_name), script_path 81 | ) # path relative to the script directory 82 | subdirs.append(relative_path) 83 | cnt += 1 84 | print(subdirs) 85 | return ( 86 | f"更新raw音频文件夹列表完成, 共找到{cnt}个文件夹", 87 | gr.Dropdown.update(choices=subdirs), 88 | gr.Textbox.update(value=update_wav_lab_pairs()), 89 | ) 90 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 |
num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V110/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 1 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | 
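Editor's note: every version of `symbols.py` in this repo builds the same `language_tone_start_map`; its job is to pack each language's local tone indices into one shared global range. A small sketch of ours showing that offset arithmetic, using the values from `oldVersion/V110/text/symbols.py` just above (`num_zh_tones=6`, `num_ja_tones=1`, `num_en_tones=4`); the helper name is our own:

```python
num_zh_tones, num_ja_tones, num_en_tones = 6, 1, 4
language_tone_start_map = {
    "ZH": 0,
    "JP": num_zh_tones,                 # Japanese tones start after the 6 Chinese ones
    "EN": num_zh_tones + num_ja_tones,  # English tones start after Chinese + Japanese
}

def to_global_tone(language: str, local_tone: int) -> int:
    """Shift a language-local tone index into the shared global tone range."""
    return language_tone_start_map[language] + local_tone

assert to_global_tone("ZH", 5) == 5    # last Chinese tone
assert to_global_tone("JP", 0) == 6    # the single Japanese tone
assert to_global_tone("EN", 3) == 10   # last English tone
```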
-------------------------------------------------------------------------------- /oldVersion/V111/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 1 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V200/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 
42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V210/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | 
"ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V220/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /onnx_modules/V220/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | 
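Note (editor's sketch): every symbols.py variant in this dump exposes the same tables — symbols, num_zh_tones/num_ja_tones/num_en_tones, sil_phonemes_ids, language_id_map, and language_tone_start_map. The following minimal sketch shows how downstream code consumes those tables; it mirrors the cleaned_text_to_sequence helper that the version packages import from their text module (see the `from .text import cleaned_text_to_sequence, get_bert` line in /oldVersion/V200/__init__.py below), but the exact signature is an assumption and may differ slightly between versions.

```python
# Sketch, not a file from this repo: turning cleaned phonemes into model inputs
# using the tables defined in symbols.py.
from text.symbols import language_id_map, language_tone_start_map, symbols

_symbol_to_id = {s: i for i, s in enumerate(symbols)}


def cleaned_text_to_sequence(cleaned_text, tones, language):
    """Map phoneme strings, per-phoneme tones, and a language tag to integer IDs."""
    phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
    # The three languages share one tone-embedding table, so JP tones are offset
    # past ZH's 6 tones and EN tones past ZH's 6 + JP's 2 (language_tone_start_map).
    tone_start = language_tone_start_map[language]
    tones = [t + tone_start for t in tones]
    lang_id = language_id_map[language]
    lang_ids = [lang_id] * len(phones)
    return phones, tones, lang_ids
```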
-------------------------------------------------------------------------------- /onnx_modules/V230/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /onnx_modules/V220_novq_dev/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | 
"l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V200/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | @Desc: 2.0版本兼容 对应2.0.1 2.0.2-fix 3 | """ 4 | import torch 5 | import commons 6 | from .text import cleaned_text_to_sequence, get_bert 7 | from .text.cleaner import clean_text 8 | 9 | 10 | def get_text(text, language_str, hps, device): 11 | # 在此处实现当前版本的get_text 12 | norm_text, phone, tone, word2ph = clean_text(text, language_str) 13 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) 14 | 15 | if hps.data.add_blank: 16 | phone = commons.intersperse(phone, 0) 17 | tone = commons.intersperse(tone, 0) 18 | language = commons.intersperse(language, 0) 19 | for i in range(len(word2ph)): 20 | word2ph[i] = word2ph[i] * 2 21 | word2ph[0] += 1 22 | bert_ori = get_bert(norm_text, word2ph, language_str, device) 23 | del word2ph 24 | assert bert_ori.shape[-1] == len(phone), phone 25 | 26 | if language_str == "ZH": 27 | bert = bert_ori 28 | ja_bert = torch.zeros(1024, len(phone)) 29 | en_bert = torch.zeros(1024, len(phone)) 30 | elif language_str == "JP": 31 | bert = torch.zeros(1024, len(phone)) 32 | ja_bert = bert_ori 33 | en_bert = torch.zeros(1024, len(phone)) 34 | elif language_str == "EN": 35 | bert = torch.zeros(1024, len(phone)) 36 | ja_bert = torch.zeros(1024, len(phone)) 37 | en_bert = 
bert_ori 38 | else: 39 | raise ValueError("language_str should be ZH, JP or EN") 40 | 41 | assert bert.shape[-1] == len( 42 | phone 43 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}" 44 | 45 | phone = torch.LongTensor(phone) 46 | tone = torch.LongTensor(tone) 47 | language = torch.LongTensor(language) 48 | return bert, ja_bert, en_bert, phone, tone, language 49 | 50 | 51 | def infer( 52 | text, 53 | sdp_ratio, 54 | noise_scale, 55 | noise_scale_w, 56 | length_scale, 57 | sid, 58 | language, 59 | hps, 60 | net_g, 61 | device, 62 | ): 63 | bert, ja_bert, en_bert, phones, tones, lang_ids = get_text( 64 | text, language, hps, device 65 | ) 66 | with torch.no_grad(): 67 | x_tst = phones.to(device).unsqueeze(0) 68 | tones = tones.to(device).unsqueeze(0) 69 | lang_ids = lang_ids.to(device).unsqueeze(0) 70 | bert = bert.to(device).unsqueeze(0) 71 | ja_bert = ja_bert.to(device).unsqueeze(0) 72 | en_bert = en_bert.to(device).unsqueeze(0) 73 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) 74 | del phones 75 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) 76 | audio = ( 77 | net_g.infer( 78 | x_tst, 79 | x_tst_lengths, 80 | speakers, 81 | tones, 82 | lang_ids, 83 | bert, 84 | ja_bert, 85 | en_bert, 86 | sdp_ratio=sdp_ratio, 87 | noise_scale=noise_scale, 88 | noise_scale_w=noise_scale_w, 89 | length_scale=length_scale, 90 | )[0][0, 0] 91 | .data.cpu() 92 | .float() 93 | .numpy() 94 | ) 95 | del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert 96 | if torch.cuda.is_available(): 97 | torch.cuda.empty_cache() 98 | return audio 99 | -------------------------------------------------------------------------------- /onnx_modules/V200/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | 
"oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | "EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /onnx_modules/V210/text/symbols.py: -------------------------------------------------------------------------------- 1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"] 2 | pu_symbols = punctuation + ["SP", "UNK"] 3 | pad = "_" 4 | 5 | # chinese 6 | zh_symbols = [ 7 | "E", 8 | "En", 9 | "a", 10 | "ai", 11 | "an", 12 | "ang", 13 | "ao", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "e", 19 | "ei", 20 | "en", 21 | "eng", 22 | "er", 23 | "f", 24 | "g", 25 | "h", 26 | "i", 27 | "i0", 28 | "ia", 29 | "ian", 30 | "iang", 31 | "iao", 32 | "ie", 33 | "in", 34 | "ing", 35 | "iong", 36 | "ir", 37 | "iu", 38 | "j", 39 | "k", 40 | "l", 41 | "m", 42 | "n", 43 | "o", 44 | "ong", 45 | "ou", 46 | "p", 47 | "q", 48 | "r", 49 | "s", 50 | "sh", 51 | "t", 52 | "u", 53 | "ua", 54 | "uai", 55 | "uan", 56 | "uang", 57 | "ui", 58 | "un", 59 | "uo", 60 | "v", 61 | "van", 62 | "ve", 63 | "vn", 64 | "w", 65 | "x", 66 | "y", 67 | "z", 68 | "zh", 69 | "AA", 70 | "EE", 71 | "OO", 72 | ] 73 | num_zh_tones = 6 74 | 75 | # japanese 76 | ja_symbols = [ 77 | "N", 78 | "a", 79 | "a:", 80 | "b", 81 | "by", 82 | "ch", 83 | "d", 84 | "dy", 85 | "e", 86 | "e:", 87 | "f", 88 | "g", 89 | "gy", 90 | "h", 91 | "hy", 92 | "i", 93 | "i:", 94 | "j", 95 | "k", 96 | "ky", 97 | "m", 98 | "my", 99 | "n", 100 | "ny", 101 | "o", 102 | "o:", 103 | "p", 104 | "py", 105 | "q", 106 | "r", 107 | "ry", 108 | "s", 109 | "sh", 110 | "t", 111 | "ts", 112 | "ty", 113 | "u", 114 | "u:", 115 | "w", 116 | "y", 117 | "z", 118 | "zy", 119 | ] 120 | num_ja_tones = 2 121 | 122 | # English 123 | en_symbols = [ 124 | "aa", 125 | "ae", 126 | "ah", 127 | "ao", 128 | "aw", 129 | "ay", 130 | "b", 131 | "ch", 132 | "d", 133 | "dh", 134 | "eh", 135 | "er", 136 | "ey", 137 | "f", 138 | "g", 139 | "hh", 140 | "ih", 141 | "iy", 142 | "jh", 143 | "k", 144 | "l", 145 | "m", 146 | "n", 147 | "ng", 148 | "ow", 149 | "oy", 150 | "p", 151 | "r", 152 | "s", 153 | "sh", 154 | "t", 155 | "th", 156 | "uh", 157 | "uw", 158 | "V", 159 | "w", 160 | "y", 161 | "z", 162 | "zh", 163 | ] 164 | num_en_tones = 4 165 | 166 | # combine all symbols 167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) 168 | symbols = [pad] + normal_symbols + pu_symbols 169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] 170 | 171 | # combine all tones 172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones 173 | 174 | # language maps 175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2} 176 | num_languages = len(language_id_map.keys()) 177 | 178 | language_tone_start_map = { 179 | "ZH": 0, 180 | "JP": num_zh_tones, 181 | 
"EN": num_zh_tones + num_ja_tones, 182 | } 183 | 184 | if __name__ == "__main__": 185 | a = set(zh_symbols) 186 | b = set(en_symbols) 187 | print(sorted(a & b)) 188 | -------------------------------------------------------------------------------- /oldVersion/V101/text/japanese.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py 2 | import re 3 | import sys 4 | 5 | import pyopenjtalk 6 | 7 | from . import symbols 8 | 9 | # Regular expression matching Japanese without punctuation marks: 10 | _japanese_characters = re.compile( 11 | r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" 12 | ) 13 | 14 | # Regular expression matching non-Japanese characters or punctuation marks: 15 | _japanese_marks = re.compile( 16 | r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" 17 | ) 18 | 19 | # List of (symbol, Japanese) pairs for marks: 20 | _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]] 21 | 22 | 23 | # List of (consonant, sokuon) pairs: 24 | _real_sokuon = [ 25 | (re.compile("%s" % x[0]), x[1]) 26 | for x in [ 27 | (r"Q([↑↓]*[kg])", r"k#\1"), 28 | (r"Q([↑↓]*[tdjʧ])", r"t#\1"), 29 | (r"Q([↑↓]*[sʃ])", r"s\1"), 30 | (r"Q([↑↓]*[pb])", r"p#\1"), 31 | ] 32 | ] 33 | 34 | # List of (consonant, hatsuon) pairs: 35 | _real_hatsuon = [ 36 | (re.compile("%s" % x[0]), x[1]) 37 | for x in [ 38 | (r"N([↑↓]*[pbm])", r"m\1"), 39 | (r"N([↑↓]*[ʧʥj])", r"n^\1"), 40 | (r"N([↑↓]*[tdn])", r"n\1"), 41 | (r"N([↑↓]*[kg])", r"ŋ\1"), 42 | ] 43 | ] 44 | 45 | 46 | def post_replace_ph(ph): 47 | rep_map = { 48 | ":": ",", 49 | ";": ",", 50 | ",": ",", 51 | "。": ".", 52 | "!": "!", 53 | "?": "?", 54 | "\n": ".", 55 | "·": ",", 56 | "、": ",", 57 | "...": "…", 58 | "v": "V", 59 | } 60 | if ph in rep_map.keys(): 61 | ph = rep_map[ph] 62 | if ph in symbols: 63 | return ph 64 | if ph not in symbols: 65 | ph = "UNK" 66 | return ph 67 | 68 | 69 | def symbols_to_japanese(text): 70 | for regex, replacement in _symbols_to_japanese: 71 | text = re.sub(regex, replacement, text) 72 | return text 73 | 74 | 75 | def preprocess_jap(text): 76 | """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html""" 77 | text = symbols_to_japanese(text) 78 | sentences = re.split(_japanese_marks, text) 79 | marks = re.findall(_japanese_marks, text) 80 | text = [] 81 | for i, sentence in enumerate(sentences): 82 | if re.match(_japanese_characters, sentence): 83 | p = pyopenjtalk.g2p(sentence) 84 | text += p.split(" ") 85 | 86 | if i < len(marks): 87 | text += [marks[i].replace(" ", "")] 88 | return text 89 | 90 | 91 | def text_normalize(text): 92 | # todo: jap text normalize 93 | return text 94 | 95 | 96 | def g2p(norm_text): 97 | phones = preprocess_jap(norm_text) 98 | phones = [post_replace_ph(i) for i in phones] 99 | # todo: implement tones and word2ph 100 | tones = [0 for i in phones] 101 | word2ph = [1 for i in phones] 102 | return phones, tones, word2ph 103 | 104 | 105 | if __name__ == "__main__": 106 | for line in open("../../../Downloads/transcript_utf8.txt").readlines(): 107 | text = line.split(":")[1] 108 | phones, tones, word2ph = g2p(text) 109 | for p in phones: 110 | if p == "z": 111 | print(text, phones) 112 | sys.exit(0) 113 | -------------------------------------------------------------------------------- /text/chinese_bert.py: 
-------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | 8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large" 9 | 10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 11 | 12 | models = dict() 13 | 14 | 15 | def get_bert_feature( 16 | text, 17 | word2ph, 18 | device=config.bert_gen_config.device, 19 | style_text=None, 20 | style_weight=0.7, 21 | ): 22 | if ( 23 | sys.platform == "darwin" 24 | and torch.backends.mps.is_available() 25 | and device == "cpu" 26 | ): 27 | device = "mps" 28 | if not device: 29 | device = "cuda" 30 | if device not in models.keys(): 31 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 32 | with torch.no_grad(): 33 | inputs = tokenizer(text, return_tensors="pt") 34 | for i in inputs: 35 | inputs[i] = inputs[i].to(device) 36 | res = models[device](**inputs, output_hidden_states=True) 37 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 38 | if style_text: 39 | style_inputs = tokenizer(style_text, return_tensors="pt") 40 | for i in style_inputs: 41 | style_inputs[i] = style_inputs[i].to(device) 42 | style_res = models[device](**style_inputs, output_hidden_states=True) 43 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() 44 | style_res_mean = style_res.mean(0) 45 | assert len(word2ph) == len(text) + 2 46 | word2phone = word2ph 47 | phone_level_feature = [] 48 | for i in range(len(word2phone)): 49 | if style_text: 50 | repeat_feature = ( 51 | res[i].repeat(word2phone[i], 1) * (1 - style_weight) 52 | + style_res_mean.repeat(word2phone[i], 1) * style_weight 53 | ) 54 | else: 55 | repeat_feature = res[i].repeat(word2phone[i], 1) 56 | phone_level_feature.append(repeat_feature) 57 | 58 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 59 | 60 | return phone_level_feature.T 61 | 62 | 63 | if __name__ == "__main__": 64 | word_level_feature = torch.rand(38, 1024) # 38 words, 1024-dim feature per word 65 | word2phone = [ 66 | 1, 67 | 2, 68 | 1, 69 | 2, 70 | 2, 71 | 1, 72 | 2, 73 | 2, 74 | 1, 75 | 2, 76 | 2, 77 | 1, 78 | 2, 79 | 2, 80 | 2, 81 | 2, 82 | 2, 83 | 1, 84 | 1, 85 | 2, 86 | 2, 87 | 1, 88 | 2, 89 | 2, 90 | 2, 91 | 2, 92 | 1, 93 | 2, 94 | 2, 95 | 2, 96 | 2, 97 | 2, 98 | 1, 99 | 2, 100 | 2, 101 | 2, 102 | 2, 103 | 1, 104 | ] 105 | 106 | # compute the total number of frames 107 | total_frames = sum(word2phone) 108 | print(word_level_feature.shape) 109 | print(word2phone) 110 | phone_level_feature = [] 111 | for i in range(len(word2phone)): 112 | print(word_level_feature[i].shape) 113 | 114 | # repeat each word's feature word2phone[i] times 115 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 116 | phone_level_feature.append(repeat_feature) 117 | 118 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 119 | print(phone_level_feature.shape) # torch.Size([65, 1024]) 120 | -------------------------------------------------------------------------------- /oldVersion/V210/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | 8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large" 9 | 10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 11 | 12 | models = dict() 13 | 14 | 15 | def get_bert_feature( 16 | text, 17 | word2ph, 18 | device=config.bert_gen_config.device, 19 | style_text=None, 20 | 
style_weight=0.7, 21 | ): 22 | if ( 23 | sys.platform == "darwin" 24 | and torch.backends.mps.is_available() 25 | and device == "cpu" 26 | ): 27 | device = "mps" 28 | if not device: 29 | device = "cuda" 30 | if device not in models.keys(): 31 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 32 | with torch.no_grad(): 33 | inputs = tokenizer(text, return_tensors="pt") 34 | for i in inputs: 35 | inputs[i] = inputs[i].to(device) 36 | res = models[device](**inputs, output_hidden_states=True) 37 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 38 | if style_text: 39 | style_inputs = tokenizer(style_text, return_tensors="pt") 40 | for i in style_inputs: 41 | style_inputs[i] = style_inputs[i].to(device) 42 | style_res = models[device](**style_inputs, output_hidden_states=True) 43 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() 44 | style_res_mean = style_res.mean(0) 45 | 46 | assert len(word2ph) == len(text) + 2 47 | word2phone = word2ph 48 | phone_level_feature = [] 49 | for i in range(len(word2phone)): 50 | if style_text: 51 | repeat_feature = ( 52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight) 53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight 54 | ) 55 | else: 56 | repeat_feature = res[i].repeat(word2phone[i], 1) 57 | phone_level_feature.append(repeat_feature) 58 | 59 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 60 | 61 | return phone_level_feature.T 62 | 63 | 64 | if __name__ == "__main__": 65 | word_level_feature = torch.rand(38, 1024) # 38 words, 1024-dim feature per word 66 | word2phone = [ 67 | 1, 68 | 2, 69 | 1, 70 | 2, 71 | 2, 72 | 1, 73 | 2, 74 | 2, 75 | 1, 76 | 2, 77 | 2, 78 | 1, 79 | 2, 80 | 2, 81 | 2, 82 | 2, 83 | 2, 84 | 1, 85 | 1, 86 | 2, 87 | 2, 88 | 1, 89 | 2, 90 | 2, 91 | 2, 92 | 2, 93 | 1, 94 | 2, 95 | 2, 96 | 2, 97 | 2, 98 | 2, 99 | 1, 100 | 2, 101 | 2, 102 | 2, 103 | 2, 104 | 1, 105 | ] 106 | 107 | # compute the total number of frames 108 | total_frames = sum(word2phone) 109 | print(word_level_feature.shape) 110 | print(word2phone) 111 | phone_level_feature = [] 112 | for i in range(len(word2phone)): 113 | print(word_level_feature[i].shape) 114 | 115 | # repeat each word's feature word2phone[i] times 116 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 117 | phone_level_feature.append(repeat_feature) 118 | 119 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 120 | print(phone_level_feature.shape) # torch.Size([65, 1024]) 121 | -------------------------------------------------------------------------------- /oldVersion/V220/text/chinese_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | from transformers import AutoModelForMaskedLM, AutoTokenizer 5 | 6 | from config import config 7 | 8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large" 9 | 10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) 11 | 12 | models = dict() 13 | 14 | 15 | def get_bert_feature( 16 | text, 17 | word2ph, 18 | device=config.bert_gen_config.device, 19 | style_text=None, 20 | style_weight=0.7, 21 | ): 22 | if ( 23 | sys.platform == "darwin" 24 | and torch.backends.mps.is_available() 25 | and device == "cpu" 26 | ): 27 | device = "mps" 28 | if not device: 29 | device = "cuda" 30 | if device not in models.keys(): 31 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device) 32 | with torch.no_grad(): 33 | inputs = tokenizer(text, return_tensors="pt") 34 | for i in inputs: 35 | inputs[i] = inputs[i].to(device) 36 | res = models[device](**inputs, 
output_hidden_states=True) 37 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() 38 | if style_text: 39 | style_inputs = tokenizer(style_text, return_tensors="pt") 40 | for i in style_inputs: 41 | style_inputs[i] = style_inputs[i].to(device) 42 | style_res = models[device](**style_inputs, output_hidden_states=True) 43 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() 44 | style_res_mean = style_res.mean(0) 45 | 46 | assert len(word2ph) == len(text) + 2 47 | word2phone = word2ph 48 | phone_level_feature = [] 49 | for i in range(len(word2phone)): 50 | if style_text: 51 | repeat_feature = ( 52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight) 53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight 54 | ) 55 | else: 56 | repeat_feature = res[i].repeat(word2phone[i], 1) 57 | phone_level_feature.append(repeat_feature) 58 | 59 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 60 | 61 | return phone_level_feature.T 62 | 63 | 64 | if __name__ == "__main__": 65 | word_level_feature = torch.rand(38, 1024) # 38 words, 1024-dim feature per word 66 | word2phone = [ 67 | 1, 68 | 2, 69 | 1, 70 | 2, 71 | 2, 72 | 1, 73 | 2, 74 | 2, 75 | 1, 76 | 2, 77 | 2, 78 | 1, 79 | 2, 80 | 2, 81 | 2, 82 | 2, 83 | 2, 84 | 1, 85 | 1, 86 | 2, 87 | 2, 88 | 1, 89 | 2, 90 | 2, 91 | 2, 92 | 2, 93 | 1, 94 | 2, 95 | 2, 96 | 2, 97 | 2, 98 | 2, 99 | 1, 100 | 2, 101 | 2, 102 | 2, 103 | 2, 104 | 1, 105 | ] 106 | 107 | # compute the total number of frames 108 | total_frames = sum(word2phone) 109 | print(word_level_feature.shape) 110 | print(word2phone) 111 | phone_level_feature = [] 112 | for i in range(len(word2phone)): 113 | print(word_level_feature[i].shape) 114 | 115 | # repeat each word's feature word2phone[i] times 116 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1) 117 | phone_level_feature.append(repeat_feature) 118 | 119 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 120 | print(phone_level_feature.shape) # torch.Size([65, 1024]) 121 | -------------------------------------------------------------------------------- /bert/deberta-v3-large/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: en 3 | tags: 4 | - deberta 5 | - deberta-v3 6 | - fill-mask 7 | thumbnail: https://huggingface.co/front/thumbnails/microsoft.png 8 | license: mit 9 | --- 10 | 11 | ## DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing 12 | 13 | [DeBERTa](https://arxiv.org/abs/2006.03654) improves the BERT and RoBERTa models using disentangled attention and an enhanced mask decoder. With those two improvements, DeBERTa outperforms RoBERTa on a majority of NLU tasks with 80GB of training data. 14 | 15 | In [DeBERTa V3](https://arxiv.org/abs/2111.09543), we further improved the efficiency of DeBERTa using ELECTRA-Style pre-training with Gradient Disentangled Embedding Sharing. Compared to DeBERTa, our V3 version significantly improves the model performance on downstream tasks. You can find more technical details about the new model in our [paper](https://arxiv.org/abs/2111.09543). 16 | 17 | Please check the [official repository](https://github.com/microsoft/DeBERTa) for more implementation details and updates. 18 | 19 | The DeBERTa V3 large model comes with 24 layers and a hidden size of 1024. It has 304M backbone parameters with a vocabulary containing 128K tokens, which introduces 131M parameters in the Embedding layer. This model was trained using the same 160GB data as DeBERTa V2. 
20 | 21 | 22 | #### Fine-tuning on NLU tasks 23 | 24 | We present the dev results on SQuAD 2.0 and MNLI tasks. 25 | 26 | | Model |Vocabulary(K)|Backbone #Params(M)| SQuAD 2.0(F1/EM) | MNLI-m/mm(ACC)| 27 | |-------------------|----------|-------------------|-----------|----------| 28 | | RoBERTa-large |50 |304 | 89.4/86.5 | 90.2 | 29 | | XLNet-large |32 |- | 90.6/87.9 | 90.8 | 30 | | DeBERTa-large |50 |- | 90.7/88.0 | 91.3 | 31 | | **DeBERTa-v3-large**|128|304 | **91.5/89.0**| **91.8/91.9**| 32 | 33 | 34 | #### Fine-tuning with HF transformers 35 | 36 | ```bash 37 | #!/bin/bash 38 | 39 | cd transformers/examples/pytorch/text-classification/ 40 | 41 | pip install datasets 42 | export TASK_NAME=mnli 43 | 44 | output_dir="ds_results" 45 | 46 | num_gpus=8 47 | 48 | batch_size=8 49 | 50 | python -m torch.distributed.launch --nproc_per_node=${num_gpus} \ 51 | run_glue.py \ 52 | --model_name_or_path microsoft/deberta-v3-large \ 53 | --task_name $TASK_NAME \ 54 | --do_train \ 55 | --do_eval \ 56 | --evaluation_strategy steps \ 57 | --max_seq_length 256 \ 58 | --warmup_steps 50 \ 59 | --per_device_train_batch_size ${batch_size} \ 60 | --learning_rate 6e-6 \ 61 | --num_train_epochs 2 \ 62 | --output_dir $output_dir \ 63 | --overwrite_output_dir \ 64 | --logging_steps 1000 \ 65 | --logging_dir $output_dir 66 | 67 | ``` 68 | 69 | ### Citation 70 | 71 | If you find DeBERTa useful for your work, please cite the following papers: 72 | 73 | ``` latex 74 | @misc{he2021debertav3, 75 | title={DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing}, 76 | author={Pengcheng He and Jianfeng Gao and Weizhu Chen}, 77 | year={2021}, 78 | eprint={2111.09543}, 79 | archivePrefix={arXiv}, 80 | primaryClass={cs.CL} 81 | } 82 | ``` 83 | 84 | ``` latex 85 | @inproceedings{ 86 | he2021deberta, 87 | title={DEBERTA: DECODING-ENHANCED BERT WITH DISENTANGLED ATTENTION}, 88 | author={Pengcheng He and Xiaodong Liu and Jianfeng Gao and Weizhu Chen}, 89 | booktitle={International Conference on Learning Representations}, 90 | year={2021}, 91 | url={https://openreview.net/forum?id=XPZIaotutsD} 92 | } 93 | ``` 94 | -------------------------------------------------------------------------------- /oldVersion/V210/emo_gen.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from torch.utils.data import Dataset 6 | 7 | from transformers import Wav2Vec2Processor 8 | from transformers.models.wav2vec2.modeling_wav2vec2 import ( 9 | Wav2Vec2Model, 10 | Wav2Vec2PreTrainedModel, 11 | ) 12 | 13 | from config import config 14 | 15 | 16 | class RegressionHead(nn.Module): 17 | r"""Regression head.""" 18 | 19 | def __init__(self, config): 20 | super().__init__() 21 | 22 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 23 | self.dropout = nn.Dropout(config.final_dropout) 24 | self.out_proj = nn.Linear(config.hidden_size, config.num_labels) 25 | 26 | def forward(self, features, **kwargs): 27 | x = features 28 | x = self.dropout(x) 29 | x = self.dense(x) 30 | x = torch.tanh(x) 31 | x = self.dropout(x) 32 | x = self.out_proj(x) 33 | 34 | return x 35 | 36 | 37 | class EmotionModel(Wav2Vec2PreTrainedModel): 38 | r"""Speech emotion classifier.""" 39 | 40 | def __init__(self, config): 41 | super().__init__(config) 42 | 43 | self.config = config 44 | self.wav2vec2 = Wav2Vec2Model(config) 45 | self.classifier = 
RegressionHead(config) 46 | self.init_weights() 47 | 48 | def forward( 49 | self, 50 | input_values, 51 | ): 52 | outputs = self.wav2vec2(input_values) 53 | hidden_states = outputs[0] 54 | hidden_states = torch.mean(hidden_states, dim=1) 55 | logits = self.classifier(hidden_states) 56 | 57 | return hidden_states, logits 58 | 59 | 60 | class AudioDataset(Dataset): 61 | def __init__(self, list_of_wav_files, sr, processor): 62 | self.list_of_wav_files = list_of_wav_files 63 | self.processor = processor 64 | self.sr = sr 65 | 66 | def __len__(self): 67 | return len(self.list_of_wav_files) 68 | 69 | def __getitem__(self, idx): 70 | wav_file = self.list_of_wav_files[idx] 71 | audio_data, _ = librosa.load(wav_file, sr=self.sr) 72 | processed_data = self.processor(audio_data, sampling_rate=self.sr)[ 73 | "input_values" 74 | ][0] 75 | return torch.from_numpy(processed_data) 76 | 77 | 78 | device = config.emo_gen_config.device 79 | model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim" 80 | processor = Wav2Vec2Processor.from_pretrained(model_name) 81 | model = EmotionModel.from_pretrained(model_name).to(device) 82 | 83 | 84 | def process_func( 85 | x: np.ndarray, 86 | sampling_rate: int, 87 | model: EmotionModel, 88 | processor: Wav2Vec2Processor, 89 | device: str, 90 | embeddings: bool = False, 91 | ) -> np.ndarray: 92 | r"""Predict emotions or extract embeddings from raw audio signal.""" 93 | model = model.to(device) 94 | y = processor(x, sampling_rate=sampling_rate) 95 | y = y["input_values"][0] 96 | y = torch.from_numpy(y).unsqueeze(0).to(device) 97 | 98 | # run through model 99 | with torch.no_grad(): 100 | y = model(y)[0 if embeddings else 1] 101 | 102 | # convert to numpy 103 | y = y.detach().cpu().numpy() 104 | 105 | return y 106 | 107 | 108 | def get_emo(path): 109 | wav, sr = librosa.load(path, sr=16000) 110 | return process_func( 111 | np.expand_dims(wav, 0).astype(np.float64), 112 | sr, 113 | model, 114 | processor, 115 | device, 116 | embeddings=True, 117 | ).squeeze(0) 118 | --------------------------------------------------------------------------------
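Note (editor's sketch): emo_gen.py loads the Wav2Vec2Processor and EmotionModel at module import time (lines 78-81 above), so merely importing it requires the ./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim checkpoint on disk. A typical call site then reduces to get_emo; a minimal usage sketch follows, where the wav path is hypothetical.

```python
# Usage sketch for oldVersion/V210/emo_gen.py; "sample.wav" is a hypothetical file.
from oldVersion.V210.emo_gen import get_emo

# get_emo loads the audio at 16 kHz and returns the mean-pooled wav2vec2 hidden
# state (embeddings=True), not the valence/arousal/dominance logits.
emb = get_emo("sample.wav")
print(emb.shape)  # should be (1024,) for this wav2vec2-large checkpoint
```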