2 |
3 |
4 |
5 | # Bert-VITS2
6 |
7 | VITS2 backbone with multilingual BERT
8 |
9 | For a quick guide, please refer to `webui_preprocess.py`.
12 |
13 | ## Please note: the core ideas of this project come from [anyvoiceai/MassTTS](https://github.com/anyvoiceai/MassTTS), an excellent TTS project
14 | ## A demo of MassTTS: [ai版峰哥锐评峰哥本人,并找回了在金三角失落的腰子](https://www.bilibili.com/video/BV1w24y1c7z9)
15 |
16 | [//]: # (## 本项目与[PlayVoice/vits_chinese](https://github.com/PlayVoice/vits_chinese) 没有任何关系)
17 |
18 | [//]: # ()
19 | [//]: # (本仓库来源于之前朋友分享了ai峰哥的视频,本人被其中的效果惊艳,在自己尝试MassTTS以后发现fs在音质方面与vits有一定差距,并且training的pipeline比vits更复杂,因此按照其思路将bert)
20 |
21 | ## Experienced Travelers/Trailblazers/Captains/Doctors/sensei/Witchers/喵喵露/V should read the code and learn how to train the model themselves.
22 |
23 | ### It is strictly forbidden to use this project for any purpose that violates the Constitution, the Criminal Law, the Public Security Administration Punishments Law, or the Civil Code of the People's Republic of China.
24 | ### Use for any politics-related purpose is strictly forbidden.
25 | #### Video: https://www.bilibili.com/video/BV1hp4y1K78E
26 | #### Demo: https://www.bilibili.com/video/BV1TF411k78w
27 | #### QQ Group: 815818430
28 | ## References
29 | + [anyvoiceai/MassTTS](https://github.com/anyvoiceai/MassTTS)
30 | + [jaywalnut310/vits](https://github.com/jaywalnut310/vits)
31 | + [p0p4k/vits2_pytorch](https://github.com/p0p4k/vits2_pytorch)
32 | + [svc-develop-team/so-vits-svc](https://github.com/svc-develop-team/so-vits-svc)
33 | + [PaddlePaddle/PaddleSpeech](https://github.com/PaddlePaddle/PaddleSpeech)
34 | + [emotional-vits](https://github.com/innnky/emotional-vits)
35 | + [fish-speech](https://github.com/fishaudio/fish-speech)
36 | + [Bert-VITS2-UI](https://github.com/jiangyuxiaoxiao/Bert-VITS2-UI)
37 | ## Thanks to all contributors for their efforts
38 |
39 |
40 |
41 |
42 | [//]: # (# 本项目所有代码引用均已写明,bert部分代码思路来源于[AI峰哥](https://www.bilibili.com/video/BV1w24y1c7z9),与[vits_chinese](https://github.com/PlayVoice/vits_chinese)无任何关系。欢迎各位查阅代码。同时,我们也对该开发者的[碰瓷,乃至开盒开发者的行为](https://www.bilibili.com/read/cv27101514/)表示强烈谴责。)
43 |
--------------------------------------------------------------------------------
/oldVersion/V220/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import *
2 |
3 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4 |
5 |
6 | def cleaned_text_to_sequence(cleaned_text, tones, language):
7 |     """Converts cleaned phoneme text to sequences of symbol, tone, and language IDs.
8 |     Args:
9 |       cleaned_text: string of phoneme symbols; tones: per-phoneme tone indices; language: language key (e.g. "ZH")
10 |     Returns:
11 |       Three lists: phoneme IDs, tone IDs shifted by the language's tone offset, and per-phoneme language IDs
12 |     """
13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14 | tone_start = language_tone_start_map[language]
15 | tones = [i + tone_start for i in tones]
16 | lang_id = language_id_map[language]
17 |     lang_ids = [lang_id for _ in phones]
18 | return phones, tones, lang_ids
19 |
20 |
21 | def get_bert(norm_text, word2ph, language, device, style_text=None, style_weight=0.7):
22 | from .chinese_bert import get_bert_feature as zh_bert
23 | from .english_bert_mock import get_bert_feature as en_bert
24 | from .japanese_bert import get_bert_feature as jp_bert
25 |
26 | lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
27 | bert = lang_bert_func_map[language](
28 | norm_text, word2ph, device, style_text, style_weight
29 | )
30 | return bert
31 |
32 |
33 | def check_bert_models():
34 | import json
35 | from pathlib import Path
36 |
37 | from config import config
38 | from .bert_utils import _check_bert
39 |
40 | if config.mirror.lower() == "openi":
41 | import openi
42 |
43 | kwargs = {"token": config.openi_token} if config.openi_token else {}
44 | openi.login(**kwargs)
45 |
46 | with open("./bert/bert_models.json", "r") as fp:
47 | models = json.load(fp)
48 | for k, v in models.items():
49 | local_path = Path("./bert").joinpath(k)
50 | _check_bert(v["repo_id"], v["files"], local_path)
51 |
52 |
53 | def init_openjtalk():
54 | import platform
55 |
56 |     if platform.system() == "Linux":  # platform.platform() returns e.g. "Linux-5.15-x86_64", never "Linux"
57 | import pyopenjtalk
58 |
59 |         pyopenjtalk.g2p("こんにちは,世界。")  # warm-up: the first call downloads the dictionary if needed
60 |
61 |
62 | init_openjtalk()
63 | check_bert_models()
64 |
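As an illustration, a hedged sketch of calling `cleaned_text_to_sequence`; the phoneme symbols and tone values below are hypothetical, and the real ones come from `text/symbols.py` and the language-specific cleaners:

```python
# Illustrative only: valid symbols and tone ranges are defined in .symbols.
phones, tones, lang_ids = cleaned_text_to_sequence(
    ["_", "n", "i", "h", "ao", "_"],  # cleaned phoneme symbols (hypothetical)
    [0, 2, 2, 3, 3, 0],               # per-phoneme tones, before the language tone offset
    "ZH",                             # key into language_id_map / language_tone_start_map
)
```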
--------------------------------------------------------------------------------
/tools/translate.py:
--------------------------------------------------------------------------------
1 | """
2 | 翻译api
3 | """
4 | from config import config
5 |
6 | import random
7 | import hashlib
8 | import requests
9 |
10 |
11 | def translate(Sentence: str, to_Language: str = "jp", from_Language: str = ""):
12 |     """
13 |     :param Sentence: text to translate
14 |     :param from_Language: source language (auto-detected when empty)
15 |     :param to_Language: target language
16 |     :return: translated text; the original text is returned if the request fails
17 |
18 |     Common language codes: Chinese zh, English en, Japanese jp
19 |     """
20 | appid = config.translate_config.app_key
21 | key = config.translate_config.secret_key
22 | if appid == "" or key == "":
23 |         return "Please configure app_key and secret_key in config.yml"
24 | url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
25 | texts = Sentence.splitlines()
26 | outTexts = []
27 | for t in texts:
28 | if t != "":
29 |             # signature calculation; see https://api.fanyi.baidu.com/product/113
30 | salt = str(random.randint(1, 100000))
31 | signString = appid + t + salt + key
32 | hs = hashlib.md5()
33 | hs.update(signString.encode("utf-8"))
34 | signString = hs.hexdigest()
35 | if from_Language == "":
36 | from_Language = "auto"
37 | headers = {"Content-Type": "application/x-www-form-urlencoded"}
38 | payload = {
39 | "q": t,
40 | "from": from_Language,
41 | "to": to_Language,
42 | "appid": appid,
43 | "salt": salt,
44 | "sign": signString,
45 | }
46 |             # send the request
47 | try:
48 | response = requests.post(
49 | url=url, data=payload, headers=headers, timeout=3
50 | )
51 | response = response.json()
52 | if "trans_result" in response.keys():
53 | result = response["trans_result"][0]
54 | if "dst" in result.keys():
55 | dst = result["dst"]
56 | outTexts.append(dst)
57 | except Exception:
58 | return Sentence
59 | else:
60 | outTexts.append(t)
61 | return "\n".join(outTexts)
62 |
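A minimal usage sketch, assuming `app_key` and `secret_key` are filled in under `translate_config` in `config.yml`; the output shown is illustrative:

```python
from tools.translate import translate

# from_Language defaults to "" and is auto-detected by the API.
print(translate("你好,世界", to_Language="en"))  # e.g. "Hello, world"
```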
--------------------------------------------------------------------------------
/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import DebertaV2Model, DebertaV2Tokenizer
5 |
6 | from config import config
7 |
8 |
9 | LOCAL_PATH = "./bert/deberta-v3-large"
10 |
11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(
17 | text,
18 | word2ph,
19 | device=config.bert_gen_config.device,
20 | style_text=None,
21 | style_weight=0.7,
22 | ):
23 | if (
24 | sys.platform == "darwin"
25 | and torch.backends.mps.is_available()
26 | and device == "cpu"
27 | ):
28 | device = "mps"
29 | if not device:
30 | device = "cuda"
31 | if device not in models.keys():
32 |         models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)  # lazily cache one model per device
33 | with torch.no_grad():
34 | inputs = tokenizer(text, return_tensors="pt")
35 | for i in inputs:
36 | inputs[i] = inputs[i].to(device)
37 | res = models[device](**inputs, output_hidden_states=True)
38 |         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()  # third-to-last hidden layer, shape (tokens, 1024)
39 | if style_text:
40 | style_inputs = tokenizer(style_text, return_tensors="pt")
41 | for i in style_inputs:
42 | style_inputs[i] = style_inputs[i].to(device)
43 | style_res = models[device](**style_inputs, output_hidden_states=True)
44 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
45 | style_res_mean = style_res.mean(0)
46 | assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))
47 | word2phone = word2ph
48 | phone_level_feature = []
49 | for i in range(len(word2phone)):
50 | if style_text:
51 | repeat_feature = (
52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
54 | )
55 | else:
56 | repeat_feature = res[i].repeat(word2phone[i], 1)
57 | phone_level_feature.append(repeat_feature)
58 |
59 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
60 |
61 | return phone_level_feature.T
62 |
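A hedged invocation sketch: the `word2ph` values are hypothetical and must contain one entry per tokenizer token (including special tokens), as the assertion above enforces:

```python
# Hypothetical: assumes the text tokenizes to 4 tokens ([CLS] hello world [SEP]).
feature = get_bert_feature("hello world", word2ph=[1, 3, 3, 1], device="cpu")
print(feature.shape)  # torch.Size([1024, 8]): hidden_size x sum(word2ph)
```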
--------------------------------------------------------------------------------
/oldVersion/V210/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import DebertaV2Model, DebertaV2Tokenizer
5 |
6 | from config import config
7 |
8 |
9 | LOCAL_PATH = "./bert/deberta-v3-large"
10 |
11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(
17 | text,
18 | word2ph,
19 | device=config.bert_gen_config.device,
20 | style_text=None,
21 | style_weight=0.7,
22 | ):
23 | if (
24 | sys.platform == "darwin"
25 | and torch.backends.mps.is_available()
26 | and device == "cpu"
27 | ):
28 | device = "mps"
29 | if not device:
30 | device = "cuda"
31 | if device not in models.keys():
32 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
33 | with torch.no_grad():
34 | inputs = tokenizer(text, return_tensors="pt")
35 | for i in inputs:
36 | inputs[i] = inputs[i].to(device)
37 | res = models[device](**inputs, output_hidden_states=True)
38 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
39 | if style_text:
40 | style_inputs = tokenizer(style_text, return_tensors="pt")
41 | for i in style_inputs:
42 | style_inputs[i] = style_inputs[i].to(device)
43 | style_res = models[device](**style_inputs, output_hidden_states=True)
44 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
45 | style_res_mean = style_res.mean(0)
46 | assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))
47 | word2phone = word2ph
48 | phone_level_feature = []
49 | for i in range(len(word2phone)):
50 | if style_text:
51 | repeat_feature = (
52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
54 | )
55 | else:
56 | repeat_feature = res[i].repeat(word2phone[i], 1)
57 | phone_level_feature.append(repeat_feature)
58 |
59 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
60 |
61 | return phone_level_feature.T
62 |
--------------------------------------------------------------------------------
/oldVersion/V220/text/english_bert_mock.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import DebertaV2Model, DebertaV2Tokenizer
5 |
6 | from config import config
7 |
8 |
9 | LOCAL_PATH = "./bert/deberta-v3-large"
10 |
11 | tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(
17 | text,
18 | word2ph,
19 | device=config.bert_gen_config.device,
20 | style_text=None,
21 | style_weight=0.7,
22 | ):
23 | if (
24 | sys.platform == "darwin"
25 | and torch.backends.mps.is_available()
26 | and device == "cpu"
27 | ):
28 | device = "mps"
29 | if not device:
30 | device = "cuda"
31 | if device not in models.keys():
32 | models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
33 | with torch.no_grad():
34 | inputs = tokenizer(text, return_tensors="pt")
35 | for i in inputs:
36 | inputs[i] = inputs[i].to(device)
37 | res = models[device](**inputs, output_hidden_states=True)
38 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
39 | if style_text:
40 | style_inputs = tokenizer(style_text, return_tensors="pt")
41 | for i in style_inputs:
42 | style_inputs[i] = style_inputs[i].to(device)
43 | style_res = models[device](**style_inputs, output_hidden_states=True)
44 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
45 | style_res_mean = style_res.mean(0)
46 | assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))
47 | word2phone = word2ph
48 | phone_level_feature = []
49 | for i in range(len(word2phone)):
50 | if style_text:
51 | repeat_feature = (
52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
54 | )
55 | else:
56 | repeat_feature = res[i].repeat(word2phone[i], 1)
57 | phone_level_feature.append(repeat_feature)
58 |
59 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
60 |
61 | return phone_level_feature.T
62 |
--------------------------------------------------------------------------------
/oldVersion/V200/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 | from .japanese import text2sep_kata
8 |
9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese"
10 |
11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
17 | sep_text, _, _ = text2sep_kata(text)
18 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
19 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
20 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3]
21 | return get_bert_feature_with_token(sep_ids, word2ph, device)
22 |
23 |
24 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device):
25 | if (
26 | sys.platform == "darwin"
27 | and torch.backends.mps.is_available()
28 | and device == "cpu"
29 | ):
30 | device = "mps"
31 | if not device:
32 | device = "cuda"
33 | if device not in models.keys():
34 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
35 | with torch.no_grad():
36 | inputs = torch.tensor(tokens).to(device).unsqueeze(0)
37 | token_type_ids = torch.zeros_like(inputs).to(device)
38 | attention_mask = torch.ones_like(inputs).to(device)
39 | inputs = {
40 | "input_ids": inputs,
41 | "token_type_ids": token_type_ids,
42 | "attention_mask": attention_mask,
43 | }
44 |
45 | # for i in inputs:
46 | # inputs[i] = inputs[i].to(device)
47 | res = models[device](**inputs, output_hidden_states=True)
48 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
49 | assert inputs["input_ids"].shape[-1] == len(word2ph)
50 | word2phone = word2ph
51 | phone_level_feature = []
52 | for i in range(len(word2phone)):
53 | repeat_feature = res[i].repeat(word2phone[i], 1)
54 | phone_level_feature.append(repeat_feature)
55 |
56 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
57 |
58 | return phone_level_feature.T
59 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/fix/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoTokenizer, AutoModelForMaskedLM
3 | import sys
4 | from .japanese import text2sep_kata
5 | from config import config
6 |
7 | tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
8 |
9 | models = dict()
10 |
11 |
12 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
13 | sep_text, _ = text2sep_kata(text)
14 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
15 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
16 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3]
17 | return get_bert_feature_with_token(sep_ids, word2ph, device)
18 |
19 |
20 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device):
21 | if (
22 | sys.platform == "darwin"
23 | and torch.backends.mps.is_available()
24 | and device == "cpu"
25 | ):
26 | device = "mps"
27 | if not device:
28 | device = "cuda"
29 | if device not in models.keys():
30 | models[device] = AutoModelForMaskedLM.from_pretrained(
31 | "./bert/bert-base-japanese-v3"
32 | ).to(device)
33 | with torch.no_grad():
34 | inputs = torch.tensor(tokens).to(device).unsqueeze(0)
35 | token_type_ids = torch.zeros_like(inputs).to(device)
36 | attention_mask = torch.ones_like(inputs).to(device)
37 | inputs = {
38 | "input_ids": inputs,
39 | "token_type_ids": token_type_ids,
40 | "attention_mask": attention_mask,
41 | }
42 |
43 | # for i in inputs:
44 | # inputs[i] = inputs[i].to(device)
45 | res = models[device](**inputs, output_hidden_states=True)
46 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
47 | assert inputs["input_ids"].shape[-1] == len(word2ph)
48 | word2phone = word2ph
49 | phone_level_feature = []
50 | for i in range(len(word2phone)):
51 | repeat_feature = res[i].repeat(word2phone[i], 1)
52 | phone_level_feature.append(repeat_feature)
53 |
54 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
55 |
56 | return phone_level_feature.T
57 |
--------------------------------------------------------------------------------
/resample.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import librosa
4 | from multiprocessing import Pool, cpu_count
5 |
6 | import soundfile
7 | from tqdm import tqdm
8 |
9 | from config import config
10 |
11 |
12 | def process(item):
13 | spkdir, wav_name, args = item
14 | wav_path = os.path.join(args.in_dir, spkdir, wav_name)
15 | if os.path.exists(wav_path) and wav_path.lower().endswith(".wav"):
16 | wav, sr = librosa.load(wav_path, sr=args.sr)
17 | soundfile.write(os.path.join(args.out_dir, spkdir, wav_name), wav, sr)
18 |
19 |
20 | if __name__ == "__main__":
21 | parser = argparse.ArgumentParser()
22 | parser.add_argument(
23 | "--sr",
24 | type=int,
25 | default=config.resample_config.sampling_rate,
26 | help="sampling rate",
27 | )
28 | parser.add_argument(
29 | "--in_dir",
30 | type=str,
31 | default=config.resample_config.in_dir,
32 | help="path to source dir",
33 | )
34 | parser.add_argument(
35 | "--out_dir",
36 | type=str,
37 | default=config.resample_config.out_dir,
38 | help="path to target dir",
39 | )
40 | parser.add_argument(
41 | "--processes",
42 | type=int,
43 | default=0,
44 | help="cpu_processes",
45 | )
46 | args, _ = parser.parse_known_args()
47 |     # AutoDL's CPU-only mode reports 46 CPUs, so cap the worker count
48 | if args.processes == 0:
49 | processes = cpu_count() - 2 if cpu_count() > 4 else 1
50 | else:
51 | processes = args.processes
52 | pool = Pool(processes=processes)
53 |
54 | tasks = []
55 |
56 | for dirpath, _, filenames in os.walk(args.in_dir):
57 |         # speaker subdirectory relative to the input root
58 | spk_dir = os.path.relpath(dirpath, args.in_dir)
59 | spk_dir_out = os.path.join(args.out_dir, spk_dir)
60 | if not os.path.isdir(spk_dir_out):
61 | os.makedirs(spk_dir_out, exist_ok=True)
62 | for filename in filenames:
63 | if filename.lower().endswith(".wav"):
64 |                 task = (spk_dir, filename, args)
65 |                 tasks.append(task)
66 |
67 | for _ in tqdm(
68 |         pool.imap_unordered(process, tasks), total=len(tasks),
69 | ):
70 | pass
71 |
72 | pool.close()
73 | pool.join()
74 |
75 |     print("Audio resampling complete!")
76 |
--------------------------------------------------------------------------------
/onnx_modules/V200/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 | from .japanese import text2sep_kata
8 |
9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese"
10 |
11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
17 | sep_text, _, _ = text2sep_kata(text)
18 | sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
19 | sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
20 | sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3]
21 | return get_bert_feature_with_token(sep_ids, word2ph, device)
22 |
23 |
24 | def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device):
25 | if (
26 | sys.platform == "darwin"
27 | and torch.backends.mps.is_available()
28 | and device == "cpu"
29 | ):
30 | device = "mps"
31 | if not device:
32 | device = "cuda"
33 | if device not in models.keys():
34 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
35 | with torch.no_grad():
36 | inputs = torch.tensor(tokens).to(device).unsqueeze(0)
37 | token_type_ids = torch.zeros_like(inputs).to(device)
38 | attention_mask = torch.ones_like(inputs).to(device)
39 | inputs = {
40 | "input_ids": inputs,
41 | "token_type_ids": token_type_ids,
42 | "attention_mask": attention_mask,
43 | }
44 |
45 | # for i in inputs:
46 | # inputs[i] = inputs[i].to(device)
47 | res = models[device](**inputs, output_hidden_states=True)
48 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
49 | assert inputs["input_ids"].shape[-1] == len(word2ph)
50 | word2phone = word2ph
51 | phone_level_feature = []
52 | for i in range(len(word2phone)):
53 | repeat_feature = res[i].repeat(word2phone[i], 1)
54 | phone_level_feature.append(repeat_feature)
55 |
56 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
57 |
58 | return phone_level_feature.T
59 |
--------------------------------------------------------------------------------
/oldVersion/V220/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 | from text.japanese import text2sep_kata
8 |
9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm"
10 |
11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(
17 | text,
18 | word2ph,
19 | device=config.bert_gen_config.device,
20 | style_text=None,
21 | style_weight=0.7,
22 | ):
23 | text = "".join(text2sep_kata(text)[0])
24 | if (
25 | sys.platform == "darwin"
26 | and torch.backends.mps.is_available()
27 | and device == "cpu"
28 | ):
29 | device = "mps"
30 | if not device:
31 | device = "cuda"
32 | if device not in models.keys():
33 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
34 | with torch.no_grad():
35 | inputs = tokenizer(text, return_tensors="pt")
36 | for i in inputs:
37 | inputs[i] = inputs[i].to(device)
38 | res = models[device](**inputs, output_hidden_states=True)
39 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
40 | if style_text:
41 | style_inputs = tokenizer(style_text, return_tensors="pt")
42 | for i in style_inputs:
43 | style_inputs[i] = style_inputs[i].to(device)
44 | style_res = models[device](**style_inputs, output_hidden_states=True)
45 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
46 | style_res_mean = style_res.mean(0)
47 |
48 | assert len(word2ph) == len(text) + 2
49 | word2phone = word2ph
50 | phone_level_feature = []
51 | for i in range(len(word2phone)):
52 | if style_text:
53 | repeat_feature = (
54 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
55 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
56 | )
57 | else:
58 | repeat_feature = res[i].repeat(word2phone[i], 1)
59 | phone_level_feature.append(repeat_feature)
60 |
61 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
62 |
63 | return phone_level_feature.T
64 |
--------------------------------------------------------------------------------
/oldVersion/V220/clap_gen.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from multiprocessing import Pool, cpu_count
3 |
4 | import torch
5 | import torch.multiprocessing as mp
6 | from tqdm import tqdm
7 |
8 | import utils
9 | from config import config
10 | from .clap_wrapper import get_clap_audio_feature
11 | import librosa
12 | import os
13 |
14 | os.environ["OMP_NUM_THREADS"] = "1"
15 | os.environ["MKL_NUM_THREADS"] = "1"
16 |
17 |
18 | def process_line(line):
19 | device = config.emo_gen_config.device
20 | if config.emo_gen_config.use_multi_device:
21 | rank = mp.current_process()._identity
22 | rank = rank[0] if len(rank) > 0 else 0
23 | if torch.cuda.is_available():
24 | gpu_id = rank % torch.cuda.device_count()
25 | device = torch.device(f"cuda:{gpu_id}")
26 | else:
27 | device = torch.device("cpu")
28 | wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")
29 |
30 | clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.npy")
31 | if os.path.isfile(clap_path):
32 | return
33 |
34 |     audio = librosa.load(wav_path, sr=48000)[0]
35 | # audio = librosa.resample(audio, 44100, 48000)
36 |
37 | clap = get_clap_audio_feature(audio, device)
38 | torch.save(clap, clap_path)
39 |
40 |
41 | if __name__ == "__main__":
42 | parser = argparse.ArgumentParser()
43 | parser.add_argument(
44 | "-c", "--config", type=str, default=config.emo_gen_config.config_path
45 | )
46 | parser.add_argument(
47 | "--num_processes", type=int, default=config.emo_gen_config.num_processes
48 | )
49 | args, _ = parser.parse_known_args()
50 | config_path = args.config
51 | hps = utils.get_hparams_from_file(config_path)
52 | lines = []
53 | with open(hps.data.training_files, encoding="utf-8") as f:
54 | lines.extend(f.readlines())
55 |
56 | with open(hps.data.validation_files, encoding="utf-8") as f:
57 | lines.extend(f.readlines())
58 | if len(lines) != 0:
59 | num_processes = min(args.num_processes, cpu_count())
60 | with Pool(processes=num_processes) as pool:
61 | for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
62 | pass
63 |
64 |     print(f"CLAP generation finished! {len(lines)} .emo.npy files were generated!")
65 |
--------------------------------------------------------------------------------
/bert/chinese-roberta-wwm-ext-large/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | language:
3 | - zh
4 | tags:
5 | - bert
6 | license: "apache-2.0"
7 | ---
8 |
9 | # Please use 'Bert'-related functions to load this model!
10 |
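For example, a minimal loading sketch; this repo loads the same weights from its local `./bert/chinese-roberta-wwm-ext-large` copy, while the hub ID below is the upstream release:

```python
from transformers import BertTokenizer, BertModel

# Load with Bert* classes rather than AutoModel, as requested above.
tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large")
model = BertModel.from_pretrained("hfl/chinese-roberta-wwm-ext-large")
```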
11 | ## Chinese BERT with Whole Word Masking
12 | To further accelerate Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**.
13 |
14 | **[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)**
15 | Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, Ziqing Yang, Shijin Wang, Guoping Hu
16 |
17 | This repository is developed based on: https://github.com/google-research/bert
18 |
19 | You may also be interested in:
20 | - Chinese BERT series: https://github.com/ymcui/Chinese-BERT-wwm
21 | - Chinese MacBERT: https://github.com/ymcui/MacBERT
22 | - Chinese ELECTRA: https://github.com/ymcui/Chinese-ELECTRA
23 | - Chinese XLNet: https://github.com/ymcui/Chinese-XLNet
24 | - Knowledge Distillation Toolkit - TextBrewer: https://github.com/airaria/TextBrewer
25 |
26 | More resources by HFL: https://github.com/ymcui/HFL-Anthology
27 |
28 | ## Citation
29 | If you find the technical reports or resources useful, please cite the following technical reports in your paper.
30 | - Primary: https://arxiv.org/abs/2004.13922
31 | ```
32 | @inproceedings{cui-etal-2020-revisiting,
33 | title = "Revisiting Pre-Trained Models for {C}hinese Natural Language Processing",
34 | author = "Cui, Yiming and
35 | Che, Wanxiang and
36 | Liu, Ting and
37 | Qin, Bing and
38 | Wang, Shijin and
39 | Hu, Guoping",
40 | booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Findings",
41 | month = nov,
42 | year = "2020",
43 | address = "Online",
44 | publisher = "Association for Computational Linguistics",
45 | url = "https://www.aclweb.org/anthology/2020.findings-emnlp.58",
46 | pages = "657--668",
47 | }
48 | ```
49 | - Secondary: https://arxiv.org/abs/1906.08101
50 | ```
51 | @article{chinese-bert-wwm,
52 | title={Pre-Training with Whole Word Masking for Chinese BERT},
53 | author={Cui, Yiming and Che, Wanxiang and Liu, Ting and Qin, Bing and Yang, Ziqing and Wang, Shijin and Hu, Guoping},
54 | journal={arXiv preprint arXiv:1906.08101},
55 | year={2019}
56 | }
57 | ```
58 |
--------------------------------------------------------------------------------
/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 | from text.japanese import text2sep_kata
8 |
9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm"
10 |
11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(
17 | text,
18 | word2ph,
19 | device=config.bert_gen_config.device,
20 | style_text=None,
21 | style_weight=0.7,
22 | ):
23 | text = "".join(text2sep_kata(text)[0])
24 | if style_text:
25 | style_text = "".join(text2sep_kata(style_text)[0])
26 | if (
27 | sys.platform == "darwin"
28 | and torch.backends.mps.is_available()
29 | and device == "cpu"
30 | ):
31 | device = "mps"
32 | if not device:
33 | device = "cuda"
34 | if device not in models.keys():
35 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
36 | with torch.no_grad():
37 | inputs = tokenizer(text, return_tensors="pt")
38 | for i in inputs:
39 | inputs[i] = inputs[i].to(device)
40 | res = models[device](**inputs, output_hidden_states=True)
41 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
42 | if style_text:
43 | style_inputs = tokenizer(style_text, return_tensors="pt")
44 | for i in style_inputs:
45 | style_inputs[i] = style_inputs[i].to(device)
46 | style_res = models[device](**style_inputs, output_hidden_states=True)
47 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
48 | style_res_mean = style_res.mean(0)
49 |
50 | assert len(word2ph) == len(text) + 2
51 | word2phone = word2ph
52 | phone_level_feature = []
53 | for i in range(len(word2phone)):
54 | if style_text:
55 | repeat_feature = (
56 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
57 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
58 | )
59 | else:
60 | repeat_feature = res[i].repeat(word2phone[i], 1)
61 | phone_level_feature.append(repeat_feature)
62 |
63 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
64 |
65 | return phone_level_feature.T
66 |
--------------------------------------------------------------------------------
/oldVersion/V210/text/japanese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 | from .japanese import text2sep_kata
8 |
9 | LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm"
10 |
11 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12 |
13 | models = dict()
14 |
15 |
16 | def get_bert_feature(
17 | text,
18 | word2ph,
19 | device=config.bert_gen_config.device,
20 | style_text=None,
21 | style_weight=0.7,
22 | ):
23 | text = "".join(text2sep_kata(text)[0])
24 | if style_text:
25 | style_text = "".join(text2sep_kata(style_text)[0])
26 | if (
27 | sys.platform == "darwin"
28 | and torch.backends.mps.is_available()
29 | and device == "cpu"
30 | ):
31 | device = "mps"
32 | if not device:
33 | device = "cuda"
34 | if device not in models.keys():
35 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
36 | with torch.no_grad():
37 | inputs = tokenizer(text, return_tensors="pt")
38 | for i in inputs:
39 | inputs[i] = inputs[i].to(device)
40 | res = models[device](**inputs, output_hidden_states=True)
41 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
42 | if style_text:
43 | style_inputs = tokenizer(style_text, return_tensors="pt")
44 | for i in style_inputs:
45 | style_inputs[i] = style_inputs[i].to(device)
46 | style_res = models[device](**style_inputs, output_hidden_states=True)
47 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
48 | style_res_mean = style_res.mean(0)
49 |
50 | assert len(word2ph) == len(text) + 2
51 | word2phone = word2ph
52 | phone_level_feature = []
53 | for i in range(len(word2phone)):
54 | if style_text:
55 | repeat_feature = (
56 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
57 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
58 | )
59 | else:
60 | repeat_feature = res[i].repeat(word2phone[i], 1)
61 | phone_level_feature.append(repeat_feature)
62 |
63 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
64 |
65 | return phone_level_feature.T
66 |
--------------------------------------------------------------------------------
/oldVersion/V101/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Compatibility with version 1.0.1
3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.0.1
4 | """
5 | import torch
6 | import commons
7 | from .text.cleaner import clean_text
8 | from .text import cleaned_text_to_sequence
9 | from oldVersion.V111.text import get_bert
10 |
11 |
12 | def get_text(text, language_str, hps, device):
13 | norm_text, phone, tone, word2ph = clean_text(text, language_str)
14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
15 |
16 | if hps.data.add_blank:
17 | phone = commons.intersperse(phone, 0)
18 | tone = commons.intersperse(tone, 0)
19 | language = commons.intersperse(language, 0)
20 | for i in range(len(word2ph)):
21 | word2ph[i] = word2ph[i] * 2
22 | word2ph[0] += 1
23 | bert = get_bert(norm_text, word2ph, language_str, device)
24 | del word2ph
25 |
26 | assert bert.shape[-1] == len(phone)
27 |
28 | phone = torch.LongTensor(phone)
29 | tone = torch.LongTensor(tone)
30 | language = torch.LongTensor(language)
31 |
32 | return bert, phone, tone, language
33 |
34 |
35 | def infer(
36 | text,
37 | sdp_ratio,
38 | noise_scale,
39 | noise_scale_w,
40 | length_scale,
41 | sid,
42 | hps,
43 | net_g,
44 | device,
45 | ):
46 | bert, phones, tones, lang_ids = get_text(text, "ZH", hps, device)
47 | with torch.no_grad():
48 | x_tst = phones.to(device).unsqueeze(0)
49 | tones = tones.to(device).unsqueeze(0)
50 | lang_ids = lang_ids.to(device).unsqueeze(0)
51 | bert = bert.to(device).unsqueeze(0)
52 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
53 | del phones
54 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
55 | audio = (
56 | net_g.infer(
57 | x_tst,
58 | x_tst_lengths,
59 | speakers,
60 | tones,
61 | lang_ids,
62 | bert,
63 | sdp_ratio=sdp_ratio,
64 | noise_scale=noise_scale,
65 | noise_scale_w=noise_scale_w,
66 | length_scale=length_scale,
67 | )[0][0, 0]
68 | .data.cpu()
69 | .float()
70 | .numpy()
71 | )
72 | del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers
73 | if torch.cuda.is_available():
74 | torch.cuda.empty_cache()
75 | return audio
76 |
--------------------------------------------------------------------------------
/oldVersion/V110/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | from transformers import AutoTokenizer, AutoModelForMaskedLM
4 |
5 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large")
6 |
7 |
8 | def get_bert_feature(text, word2ph, device=None):
9 | if (
10 | sys.platform == "darwin"
11 | and torch.backends.mps.is_available()
12 | and device == "cpu"
13 | ):
14 | device = "mps"
15 | if not device:
16 | device = "cuda"
17 | model = AutoModelForMaskedLM.from_pretrained(
18 | "./bert/chinese-roberta-wwm-ext-large"
19 | ).to(device)
20 | with torch.no_grad():
21 | inputs = tokenizer(text, return_tensors="pt")
22 | for i in inputs:
23 | inputs[i] = inputs[i].to(device)
24 | res = model(**inputs, output_hidden_states=True)
25 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
26 |
27 | assert len(word2ph) == len(text) + 2
28 | word2phone = word2ph
29 | phone_level_feature = []
30 | for i in range(len(word2phone)):
31 | repeat_feature = res[i].repeat(word2phone[i], 1)
32 | phone_level_feature.append(repeat_feature)
33 |
34 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
35 |
36 | return phone_level_feature.T
37 |
38 |
39 | if __name__ == "__main__":
40 | import torch
41 |
42 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each a 1024-dim feature
43 | word2phone = [
44 | 1,
45 | 2,
46 | 1,
47 | 2,
48 | 2,
49 | 1,
50 | 2,
51 | 2,
52 | 1,
53 | 2,
54 | 2,
55 | 1,
56 | 2,
57 | 2,
58 | 2,
59 | 2,
60 | 2,
61 | 1,
62 | 1,
63 | 2,
64 | 2,
65 | 1,
66 | 2,
67 | 2,
68 | 2,
69 | 2,
70 | 1,
71 | 2,
72 | 2,
73 | 2,
74 | 2,
75 | 2,
76 | 1,
77 | 2,
78 | 2,
79 | 2,
80 | 2,
81 | 1,
82 | ]
83 |
84 |     # total number of frames
85 | total_frames = sum(word2phone)
86 | print(word_level_feature.shape)
87 | print(word2phone)
88 | phone_level_feature = []
89 | for i in range(len(word2phone)):
90 | print(word_level_feature[i].shape)
91 |
92 |         # repeat each word's feature word2phone[i] times
93 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
94 | phone_level_feature.append(repeat_feature)
95 |
96 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
97 |     print(phone_level_feature.shape)  # torch.Size([65, 1024])
98 |
--------------------------------------------------------------------------------
/whisper_transcribe.py:
--------------------------------------------------------------------------------
1 | import whisper
2 | import os
3 | import argparse
4 | import torch
5 |
6 |
7 | def transcribe_one(audio_path):
8 | # load audio and pad/trim it to fit 30 seconds
9 | audio = whisper.load_audio(audio_path)
10 | audio = whisper.pad_or_trim(audio)
11 |
12 | # make log-Mel spectrogram and move to the same device as the model
13 | mel = whisper.log_mel_spectrogram(audio).to(model.device)
14 |
15 | # detect the spoken language
16 | _, probs = model.detect_language(mel)
17 | print(f"Detected language: {max(probs, key=probs.get)}")
18 | lang = max(probs, key=probs.get)
19 | # decode the audio
20 | options = whisper.DecodingOptions(beam_size=5)
21 | result = whisper.decode(model, mel, options)
22 |
23 | # print the recognized text
24 | print(result.text)
25 | return lang, result.text
26 |
27 |
28 | if __name__ == "__main__":
29 | parser = argparse.ArgumentParser()
30 | parser.add_argument("--languages", default="CJ")
31 | parser.add_argument("--whisper_size", default="medium")
32 | parser.add_argument("--speaker")
33 | parser.add_argument("--input_dir")
34 | parser.add_argument("--output")
35 | args = parser.parse_args()
36 |
37 | model = whisper.load_model(args.whisper_size)
38 | speaker = args.speaker
39 | input_dir = args.input_dir
40 | output = args.output
41 |
42 | if args.languages == "CJE":
43 | lang2token = {
44 | "zh": "ZH|",
45 | "ja": "JP|",
46 | "en": "EN|",
47 | }
48 | elif args.languages == "CJ":
49 | lang2token = {
50 | "zh": "ZH|",
51 | "ja": "JP|",
52 | }
53 | elif args.languages == "C":
54 | lang2token = {
55 | "zh": "ZH|",
56 | }
57 |
58 |     assert torch.cuda.is_available(), "Please enable GPU in order to run Whisper!"
59 |
60 | speaker_annos = []
61 | total_files = sum([len(files) for _, _, files in os.walk(input_dir)])
62 |
63 | for i, wavfile in enumerate(list(os.walk(input_dir))[0][2]):
64 | try:
65 | lang, text = transcribe_one(f"./data/{speaker}/raw/{wavfile}")
66 | if lang not in list(lang2token.keys()):
67 | print(f"{lang} not supported, ignoring\n")
68 | continue
69 | speaker_annos.append(f"{wavfile}|{speaker}|{lang2token[lang]}{text}")
70 | print(f"Processed: {i + 1}/{total_files}")
71 | except Exception as e:
72 | print(e)
73 | continue
74 |
75 | with open(output, "w", encoding="utf-8") as f:
76 | f.write("\n".join(speaker_annos))
--------------------------------------------------------------------------------
/re_matching.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | def extract_language_and_text_updated(speaker, dialogue):
5 |     # use a regex to match <language> tags and the text following them
6 | pattern_language_text = r"<(\S+?)>([^<]+)"
7 | matches = re.findall(pattern_language_text, dialogue, re.DOTALL)
8 | speaker = speaker[1:-1]
9 |     # clean up the text: strip surrounding whitespace
10 | matches_cleaned = [(lang.upper(), text.strip()) for lang, text in matches]
11 | matches_cleaned.append(speaker)
12 | return matches_cleaned
13 |
14 |
15 | def validate_text(input_text):
16 |     # regex for validating speaker segments
17 | pattern_speaker = r"(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)"
18 |
19 |     # re.DOTALL lets . match any character, including newlines
20 | matches = re.findall(pattern_speaker, input_text, re.DOTALL)
21 |
22 |     # further validate each matched speaker's dialogue
23 | for _, dialogue in matches:
24 | language_text_matches = extract_language_and_text_updated(_, dialogue)
25 | if not language_text_matches:
26 | return (
27 | False,
28 | "Error: Invalid format detected in dialogue content. Please check your input.",
29 | )
30 |
31 |     # no speaker segments found in the input text
32 | if not matches:
33 | return (
34 | False,
35 | "Error: No valid speaker format detected. Please check your input.",
36 | )
37 |
38 | return True, "Input is valid."
39 |
40 |
41 | def text_matching(text: str) -> list:
42 | speaker_pattern = r"(\[\S+?\])(.+?)(?=\[\S+?\]|$)"
43 | matches = re.findall(speaker_pattern, text, re.DOTALL)
44 | result = []
45 | for speaker, dialogue in matches:
46 | result.append(extract_language_and_text_updated(speaker, dialogue))
47 | return result
48 |
49 |
50 | def cut_para(text):
51 |     splitted_para = re.split("[\n]", text)  # split into paragraphs
52 | splitted_para = [
53 | sentence.strip() for sentence in splitted_para if sentence.strip()
54 |     ]  # drop empty strings
55 | return splitted_para
56 |
57 |
58 | def cut_sent(para):
59 |     para = re.sub("([。!;?\?])([^”’])", r"\1\n\2", para)  # single-character sentence terminators
60 |     para = re.sub("(\.{6})([^”’])", r"\1\n\2", para)  # English ellipsis
61 |     para = re.sub("(\…{2})([^”’])", r"\1\n\2", para)  # Chinese ellipsis
62 |     para = re.sub("([。!?\?][”’])([^,。!?\?])", r"\1\n\2", para)
63 |     para = para.rstrip()  # strip any trailing newlines at the paragraph end
64 | return para.split("\n")
65 |
66 |
67 | if __name__ == "__main__":
68 | text = """
69 | [说话人1]
70 |     [说话人2]你好吗?元気ですか?こんにちは,世界。你好吗?
71 | [说话人3]谢谢。どういたしまして。
72 | """
73 | text_matching(text)
74 |     # test the functions
75 | test_text = """
76 | [说话人1]你好,こんにちは!こんにちは,世界。
77 | [说话人2]你好吗?
78 | """
79 | text_matching(test_text)
80 | res = validate_text(test_text)
81 | print(res)
82 |
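For reference, a hedged sketch of the markup these helpers expect; the speaker names and languages are placeholders:

```python
from re_matching import text_matching, validate_text

sample = "[alice]<zh>你好!<jp>こんにちは。\n[bob]<en>Hello there."
print(validate_text(sample))  # (True, 'Input is valid.')
print(text_matching(sample))
# [[('ZH', '你好!'), ('JP', 'こんにちは。'), 'alice'], [('EN', 'Hello there.'), 'bob']]
```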
--------------------------------------------------------------------------------
/oldVersion/V101/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | from transformers import AutoTokenizer, AutoModelForMaskedLM
4 |
5 | device = torch.device(
6 | "cuda"
7 | if torch.cuda.is_available()
8 | else (
9 | "mps"
10 | if sys.platform == "darwin" and torch.backends.mps.is_available()
11 | else "cpu"
12 | )
13 | )
14 |
15 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large")
16 | model = AutoModelForMaskedLM.from_pretrained("./bert/chinese-roberta-wwm-ext-large").to(
17 | device
18 | )
19 |
20 |
21 | def get_bert_feature(text, word2ph):
22 | with torch.no_grad():
23 | inputs = tokenizer(text, return_tensors="pt")
24 | for i in inputs:
25 | inputs[i] = inputs[i].to(device)
26 | res = model(**inputs, output_hidden_states=True)
27 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
28 |
29 | assert len(word2ph) == len(text) + 2
30 | word2phone = word2ph
31 | phone_level_feature = []
32 | for i in range(len(word2phone)):
33 | repeat_feature = res[i].repeat(word2phone[i], 1)
34 | phone_level_feature.append(repeat_feature)
35 |
36 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
37 |
38 | return phone_level_feature.T
39 |
40 |
41 | if __name__ == "__main__":
42 | # feature = get_bert_feature('你好,我是说的道理。')
43 | import torch
44 |
45 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each a 1024-dim feature
46 | word2phone = [
47 | 1,
48 | 2,
49 | 1,
50 | 2,
51 | 2,
52 | 1,
53 | 2,
54 | 2,
55 | 1,
56 | 2,
57 | 2,
58 | 1,
59 | 2,
60 | 2,
61 | 2,
62 | 2,
63 | 2,
64 | 1,
65 | 1,
66 | 2,
67 | 2,
68 | 1,
69 | 2,
70 | 2,
71 | 2,
72 | 2,
73 | 1,
74 | 2,
75 | 2,
76 | 2,
77 | 2,
78 | 2,
79 | 1,
80 | 2,
81 | 2,
82 | 2,
83 | 2,
84 | 1,
85 | ]
86 |
88 |     # total number of frames
88 | total_frames = sum(word2phone)
89 | print(word_level_feature.shape)
90 | print(word2phone)
91 | phone_level_feature = []
92 | for i in range(len(word2phone)):
93 | print(word_level_feature[i].shape)
94 |
95 |         # repeat each word's feature word2phone[i] times
96 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
97 | phone_level_feature.append(repeat_feature)
98 |
99 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
100 |     print(phone_level_feature.shape)  # torch.Size([65, 1024])
101 |
--------------------------------------------------------------------------------
/compress_model.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from text.symbols import symbols
3 | import torch
4 |
5 | from tools.log import logger
6 | import utils
7 | from models import SynthesizerTrn
8 | import os
9 |
10 |
11 | def copyStateDict(state_dict):
12 | if list(state_dict.keys())[0].startswith("module"):
13 | start_idx = 1
14 | else:
15 | start_idx = 0
16 | new_state_dict = OrderedDict()
17 | for k, v in state_dict.items():
18 |         name = ".".join(k.split(".")[start_idx:])  # rejoin with ".", not ","
19 | new_state_dict[name] = v
20 | return new_state_dict
21 |
22 |
23 | def removeOptimizer(config: str, input_model: str, ishalf: bool, output_model: str):
24 | hps = utils.get_hparams_from_file(config)
25 |
26 | net_g = SynthesizerTrn(
27 | len(symbols),
28 | hps.data.filter_length // 2 + 1,
29 | hps.train.segment_size // hps.data.hop_length,
30 | n_speakers=hps.data.n_speakers,
31 | **hps.model,
32 | )
33 |
34 | optim_g = torch.optim.AdamW(
35 | net_g.parameters(),
36 | hps.train.learning_rate,
37 | betas=hps.train.betas,
38 | eps=hps.train.eps,
39 | )
40 |
41 | state_dict_g = torch.load(input_model, map_location="cpu")
42 | new_dict_g = copyStateDict(state_dict_g)
43 | keys = []
44 | for k, v in new_dict_g["model"].items():
45 | if "enc_q" in k:
46 | continue # noqa: E701
47 | keys.append(k)
48 |
49 | new_dict_g = (
50 | {k: new_dict_g["model"][k].half() for k in keys}
51 | if ishalf
52 | else {k: new_dict_g["model"][k] for k in keys}
53 | )
54 |
55 | torch.save(
56 | {
57 | "model": new_dict_g,
58 | "iteration": 0,
59 | "optimizer": optim_g.state_dict(),
60 | "learning_rate": 0.0001,
61 | },
62 | output_model,
63 | )
64 |
65 |
66 | if __name__ == "__main__":
67 | import argparse
68 |
69 | parser = argparse.ArgumentParser()
70 | parser.add_argument("-c", "--config", type=str, default="configs/config.json")
71 | parser.add_argument("-i", "--input", type=str)
72 | parser.add_argument("-o", "--output", type=str, default=None)
73 | parser.add_argument(
74 | "-hf", "--half", action="store_true", default=False, help="Save as FP16"
75 | )
76 |
77 | args = parser.parse_args()
78 |
79 | output = args.output
80 |
81 | if output is None:
82 | import os.path
83 |
84 | filename, ext = os.path.splitext(args.input)
85 | half = "_half" if args.half else ""
86 | output = filename + "_release" + half + ext
87 |
88 | removeOptimizer(args.config, args.input, args.half, output)
89 |     logger.info(f"Model compressed successfully; output model: {os.path.abspath(output)}")
90 |
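A hedged usage sketch; the checkpoint paths are placeholders:

```python
from compress_model import removeOptimizer

# Drop optimizer state and enc_q weights from a checkpoint, saving FP16 weights.
removeOptimizer(
    config="configs/config.json",
    input_model="logs/example/G_10000.pth",
    ishalf=True,
    output_model="logs/example/G_10000_release_half.pth",
)
```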
--------------------------------------------------------------------------------
/slm/wavlm-base-plus/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": "wavlm-base-plus",
3 | "activation_dropout": 0.0,
4 | "adapter_kernel_size": 3,
5 | "adapter_stride": 2,
6 | "add_adapter": false,
7 | "apply_spec_augment": true,
8 | "architectures": [
9 | "WavLMModel"
10 | ],
11 | "attention_dropout": 0.1,
12 | "bos_token_id": 1,
13 | "classifier_proj_size": 256,
14 | "codevector_dim": 256,
15 | "contrastive_logits_temperature": 0.1,
16 | "conv_bias": false,
17 | "conv_dim": [
18 | 512,
19 | 512,
20 | 512,
21 | 512,
22 | 512,
23 | 512,
24 | 512
25 | ],
26 | "conv_kernel": [
27 | 10,
28 | 3,
29 | 3,
30 | 3,
31 | 3,
32 | 2,
33 | 2
34 | ],
35 | "conv_stride": [
36 | 5,
37 | 2,
38 | 2,
39 | 2,
40 | 2,
41 | 2,
42 | 2
43 | ],
44 | "ctc_loss_reduction": "sum",
45 | "ctc_zero_infinity": false,
46 | "diversity_loss_weight": 0.1,
47 | "do_stable_layer_norm": false,
48 | "eos_token_id": 2,
49 | "feat_extract_activation": "gelu",
50 | "feat_extract_norm": "group",
51 | "feat_proj_dropout": 0.1,
52 | "feat_quantizer_dropout": 0.0,
53 | "final_dropout": 0.0,
54 | "freeze_feat_extract_train": true,
55 | "hidden_act": "gelu",
56 | "hidden_dropout": 0.1,
57 | "hidden_size": 768,
58 | "initializer_range": 0.02,
59 | "intermediate_size": 3072,
60 | "layer_norm_eps": 1e-05,
61 | "layerdrop": 0.05,
62 | "mask_channel_length": 10,
63 | "mask_channel_min_space": 1,
64 | "mask_channel_other": 0.0,
65 | "mask_channel_prob": 0.0,
66 | "mask_channel_selection": "static",
67 | "mask_feature_length": 10,
68 | "mask_feature_min_masks": 0,
69 | "mask_feature_prob": 0.0,
70 | "mask_time_length": 10,
71 | "mask_time_min_masks": 2,
72 | "mask_time_min_space": 1,
73 | "mask_time_other": 0.0,
74 | "mask_time_prob": 0.05,
75 | "mask_time_selection": "static",
76 | "model_type": "wavlm",
77 | "no_mask_channel_overlap": false,
78 | "no_mask_time_overlap": false,
79 | "num_adapter_layers": 3,
80 | "num_attention_heads": 12,
81 | "num_buckets": 320,
82 | "num_codevector_groups": 2,
83 | "num_codevectors_per_group": 320,
84 | "num_conv_pos_embedding_groups": 16,
85 | "num_conv_pos_embeddings": 128,
86 | "num_ctc_classes": 80,
87 | "num_feat_extract_layers": 7,
88 | "num_hidden_layers": 12,
89 | "num_negatives": 100,
90 | "output_hidden_size": 768,
91 | "pad_token_id": 0,
92 | "proj_codevector_dim": 256,
93 | "replace_prob": 0.5,
94 | "torch_dtype": "float32",
95 | "transformers_version": "4.13.0.dev0",
96 | "use_weighted_layer_sum": false,
97 | "vocab_size": 32,
98 | "tokenizer_class": "Wav2Vec2CTCTokenizer"
99 | }
100 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | from transformers import AutoTokenizer, AutoModelForMaskedLM
4 |
5 | tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large")
6 |
7 | models = dict()
8 |
9 |
10 | def get_bert_feature(text, word2ph, device=None):
11 | if (
12 | sys.platform == "darwin"
13 | and torch.backends.mps.is_available()
14 | and device == "cpu"
15 | ):
16 | device = "mps"
17 | if not device:
18 | device = "cuda"
19 | if device not in models.keys():
20 | models[device] = AutoModelForMaskedLM.from_pretrained(
21 | "./bert/chinese-roberta-wwm-ext-large"
22 | ).to(device)
23 | with torch.no_grad():
24 | inputs = tokenizer(text, return_tensors="pt")
25 | for i in inputs:
26 | inputs[i] = inputs[i].to(device)
27 | res = models[device](**inputs, output_hidden_states=True)
28 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
29 |
30 | assert len(word2ph) == len(text) + 2
31 | word2phone = word2ph
32 | phone_level_feature = []
33 | for i in range(len(word2phone)):
34 | repeat_feature = res[i].repeat(word2phone[i], 1)
35 | phone_level_feature.append(repeat_feature)
36 |
37 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
38 |
39 | return phone_level_feature.T
40 |
41 |
42 | if __name__ == "__main__":
43 | import torch
44 |
45 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each a 1024-dim feature
46 | word2phone = [
47 | 1,
48 | 2,
49 | 1,
50 | 2,
51 | 2,
52 | 1,
53 | 2,
54 | 2,
55 | 1,
56 | 2,
57 | 2,
58 | 1,
59 | 2,
60 | 2,
61 | 2,
62 | 2,
63 | 2,
64 | 1,
65 | 1,
66 | 2,
67 | 2,
68 | 1,
69 | 2,
70 | 2,
71 | 2,
72 | 2,
73 | 1,
74 | 2,
75 | 2,
76 | 2,
77 | 2,
78 | 2,
79 | 1,
80 | 2,
81 | 2,
82 | 2,
83 | 2,
84 | 1,
85 | ]
86 |
88 |     # total number of frames
88 | total_frames = sum(word2phone)
89 | print(word_level_feature.shape)
90 | print(word2phone)
91 | phone_level_feature = []
92 | for i in range(len(word2phone)):
93 | print(word_level_feature[i].shape)
94 |
95 |         # repeat each word's feature word2phone[i] times
96 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
97 | phone_level_feature.append(repeat_feature)
98 |
99 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
100 |     print(phone_level_feature.shape)  # torch.Size([65, 1024])
101 |
--------------------------------------------------------------------------------
/oldVersion/V200/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 |
8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9 |
10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11 |
12 | models = dict()
13 |
14 |
15 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
16 | if (
17 | sys.platform == "darwin"
18 | and torch.backends.mps.is_available()
19 | and device == "cpu"
20 | ):
21 | device = "mps"
22 | if not device:
23 | device = "cuda"
24 | if device not in models.keys():
25 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
26 | with torch.no_grad():
27 | inputs = tokenizer(text, return_tensors="pt")
28 | for i in inputs:
29 | inputs[i] = inputs[i].to(device)
30 | res = models[device](**inputs, output_hidden_states=True)
31 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
32 |
33 | assert len(word2ph) == len(text) + 2
34 | word2phone = word2ph
35 | phone_level_feature = []
36 | for i in range(len(word2phone)):
37 | repeat_feature = res[i].repeat(word2phone[i], 1)
38 | phone_level_feature.append(repeat_feature)
39 |
40 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
41 |
42 | return phone_level_feature.T
43 |
44 |
45 | if __name__ == "__main__":
46 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each a 1024-dim feature
47 | word2phone = [
48 | 1,
49 | 2,
50 | 1,
51 | 2,
52 | 2,
53 | 1,
54 | 2,
55 | 2,
56 | 1,
57 | 2,
58 | 2,
59 | 1,
60 | 2,
61 | 2,
62 | 2,
63 | 2,
64 | 2,
65 | 1,
66 | 1,
67 | 2,
68 | 2,
69 | 1,
70 | 2,
71 | 2,
72 | 2,
73 | 2,
74 | 1,
75 | 2,
76 | 2,
77 | 2,
78 | 2,
79 | 2,
80 | 1,
81 | 2,
82 | 2,
83 | 2,
84 | 2,
85 | 1,
86 | ]
87 |
88 |     # total number of frames
89 | total_frames = sum(word2phone)
90 | print(word_level_feature.shape)
91 | print(word2phone)
92 | phone_level_feature = []
93 | for i in range(len(word2phone)):
94 | print(word_level_feature[i].shape)
95 |
96 |         # repeat each word's feature word2phone[i] times
97 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
98 | phone_level_feature.append(repeat_feature)
99 |
100 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
101 |     print(phone_level_feature.shape)  # torch.Size([65, 1024])
102 |
--------------------------------------------------------------------------------
/bert/bert-base-japanese-v3/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | license: apache-2.0
3 | datasets:
4 | - cc100
5 | - wikipedia
6 | language:
7 | - ja
8 | widget:
9 | - text: 東北大学で[MASK]の研究をしています。
10 | ---
11 |
12 | # BERT base Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102)
13 |
14 | This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language.
15 |
16 | This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization.
17 | Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective.
18 |
19 | The code for pretraining is available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/).
20 |
21 | ## Model architecture
22 |
23 | The model architecture is the same as the original BERT base model; 12 layers, 768 dimensions of hidden states, and 12 attention heads.
24 |
25 | ## Training Data
26 |
27 | The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia.
28 | For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023.
29 | The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively.
30 |
31 | For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7).
32 |
33 | ## Tokenization
34 |
35 | The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm.
36 | The vocabulary size is 32768.
37 |
38 | We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization.
39 |
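For example, loading the tokenizer (the fugashi and unidic-lite packages above must be installed; the sentence is illustrative):

```python
from transformers import AutoTokenizer

# MeCab (Unidic) word segmentation followed by WordPiece subword splitting.
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-v3")
print(tokenizer.tokenize("東北大学で自然言語処理の研究をしています。"))
```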
40 | ## Training
41 |
42 | We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps.
43 | For training of the MLM (masked language modeling) objective, we introduced whole word masking in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once.
44 |
45 | For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/).
46 |
47 | ## Licenses
48 |
49 | The pretrained models are distributed under the Apache License 2.0.
50 |
51 | ## Acknowledgments
52 |
53 | This model is trained with Cloud TPUs provided by the [TPU Research Cloud](https://sites.research.google/trc/about/) program.
54 |
--------------------------------------------------------------------------------
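
As a hedged sketch of the two-stage tokenization described above (MeCab/Unidic word split, then WordPiece), assuming `transformers`, `fugashi`, and `unidic-lite` are installed and the Hub id `cl-tohoku/bert-base-japanese-v3` resolves to this model:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-v3")
# words are split by MeCab first, then broken into WordPiece subwords
print(tokenizer.tokenize("東北大学で自然言語処理の研究をしています。"))
```
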
/bert/bert-large-japanese-v2/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | license: apache-2.0
3 | datasets:
4 | - cc100
5 | - wikipedia
6 | language:
7 | - ja
8 | widget:
9 | - text: 東北大学で[MASK]の研究をしています。
10 | ---
11 |
12 | # BERT large Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102)
13 |
14 | This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language.
15 |
16 | This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization.
17 | Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective.
18 |
19 | The code for the pretraining is available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/).
20 |
21 | ## Model architecture
22 |
23 | The model architecture is the same as the original BERT large model: 24 layers, 1024 dimensions of hidden states, and 16 attention heads.
24 |
25 | ## Training Data
26 |
27 | The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia.
28 | For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023.
29 | The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively.
30 |
31 | For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7).
32 |
33 | ## Tokenization
34 |
35 | The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm.
36 | The vocabulary size is 32768.
37 |
38 | We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization.
39 |
40 | ## Training
41 |
42 | We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps.
43 | For training of the MLM (masked language modeling) objective, we introduced whole word masking in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once.
44 |
45 | For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/).
46 |
47 | ## Licenses
48 |
49 | The pretrained models are distributed under the Apache License 2.0.
50 |
51 | ## Acknowledgments
52 |
53 | This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.
54 |
--------------------------------------------------------------------------------
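
The widget example in the front matter above can be reproduced with a fill-mask pipeline; a minimal sketch, assuming the Hub id `cl-tohoku/bert-large-japanese-v2` resolves to this model:

```python
from transformers import pipeline

fill = pipeline("fill-mask", model="cl-tohoku/bert-large-japanese-v2")
# print the top three candidates for the masked word
for pred in fill("東北大学で[MASK]の研究をしています。")[:3]:
    print(pred["token_str"], pred["score"])
```
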
/onnx_modules/V200/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 |
8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9 |
10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11 |
12 | models = dict()
13 |
14 |
15 | def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
16 | if (
17 | sys.platform == "darwin"
18 | and torch.backends.mps.is_available()
19 | and device == "cpu"
20 | ):
21 | device = "mps"
22 | if not device:
23 | device = "cuda"
24 | if device not in models.keys():
25 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
26 | with torch.no_grad():
27 | inputs = tokenizer(text, return_tensors="pt")
28 | for i in inputs:
29 | inputs[i] = inputs[i].to(device)
30 | res = models[device](**inputs, output_hidden_states=True)
31 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
32 |
33 | assert len(word2ph) == len(text) + 2
34 | word2phone = word2ph
35 | phone_level_feature = []
36 | for i in range(len(word2phone)):
37 | repeat_feature = res[i].repeat(word2phone[i], 1)
38 | phone_level_feature.append(repeat_feature)
39 |
40 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
41 |
42 | return phone_level_feature.T
43 |
44 |
45 | if __name__ == "__main__":
46 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each with a 1024-dim feature
47 | word2phone = [
48 | 1,
49 | 2,
50 | 1,
51 | 2,
52 | 2,
53 | 1,
54 | 2,
55 | 2,
56 | 1,
57 | 2,
58 | 2,
59 | 1,
60 | 2,
61 | 2,
62 | 2,
63 | 2,
64 | 2,
65 | 1,
66 | 1,
67 | 2,
68 | 2,
69 | 1,
70 | 2,
71 | 2,
72 | 2,
73 | 2,
74 | 1,
75 | 2,
76 | 2,
77 | 2,
78 | 2,
79 | 2,
80 | 1,
81 | 2,
82 | 2,
83 | 2,
84 | 2,
85 | 1,
86 | ]
87 |
88 |     # total number of phone-level frames
89 | total_frames = sum(word2phone)
90 | print(word_level_feature.shape)
91 | print(word2phone)
92 | phone_level_feature = []
93 | for i in range(len(word2phone)):
94 | print(word_level_feature[i].shape)
95 |
96 |         # repeat each word's feature word2phone[i] times
97 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
98 | phone_level_feature.append(repeat_feature)
99 |
100 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
101 |     print(phone_level_feature.shape)  # torch.Size([65, 1024]) == (sum(word2phone), 1024)
102 |
--------------------------------------------------------------------------------
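
A hedged usage sketch for `get_bert_feature` above: `word2ph` must cover `[CLS]`, each character, and `[SEP]` (the assert checks `len(word2ph) == len(text) + 2`), and the output stacks one 1024-dim column per phone. The phone counts below are hypothetical, and the local model under `./bert/chinese-roberta-wwm-ext-large` must be present:

```python
text = "你好"
word2ph = [1, 2, 2, 1]  # hypothetical: [CLS], 你, 好, [SEP]
feat = get_bert_feature(text, word2ph, device="cpu")
print(feat.shape)  # torch.Size([1024, 6]) == (hidden_size, sum(word2ph))
```
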
/emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": "torch",
3 | "activation_dropout": 0.1,
4 | "adapter_kernel_size": 3,
5 | "adapter_stride": 2,
6 | "add_adapter": false,
7 | "apply_spec_augment": true,
8 | "architectures": [
9 | "Wav2Vec2ForSpeechClassification"
10 | ],
11 | "attention_dropout": 0.1,
12 | "bos_token_id": 1,
13 | "classifier_proj_size": 256,
14 | "codevector_dim": 768,
15 | "contrastive_logits_temperature": 0.1,
16 | "conv_bias": true,
17 | "conv_dim": [
18 | 512,
19 | 512,
20 | 512,
21 | 512,
22 | 512,
23 | 512,
24 | 512
25 | ],
26 | "conv_kernel": [
27 | 10,
28 | 3,
29 | 3,
30 | 3,
31 | 3,
32 | 2,
33 | 2
34 | ],
35 | "conv_stride": [
36 | 5,
37 | 2,
38 | 2,
39 | 2,
40 | 2,
41 | 2,
42 | 2
43 | ],
44 | "ctc_loss_reduction": "sum",
45 | "ctc_zero_infinity": false,
46 | "diversity_loss_weight": 0.1,
47 | "do_stable_layer_norm": true,
48 | "eos_token_id": 2,
49 | "feat_extract_activation": "gelu",
50 | "feat_extract_dropout": 0.0,
51 | "feat_extract_norm": "layer",
52 | "feat_proj_dropout": 0.1,
53 | "feat_quantizer_dropout": 0.0,
54 | "final_dropout": 0.1,
55 | "finetuning_task": "wav2vec2_reg",
56 | "gradient_checkpointing": false,
57 | "hidden_act": "gelu",
58 | "hidden_dropout": 0.1,
59 | "hidden_dropout_prob": 0.1,
60 | "hidden_size": 1024,
61 | "id2label": {
62 | "0": "arousal",
63 | "1": "dominance",
64 | "2": "valence"
65 | },
66 | "initializer_range": 0.02,
67 | "intermediate_size": 4096,
68 | "label2id": {
69 | "arousal": 0,
70 | "dominance": 1,
71 | "valence": 2
72 | },
73 | "layer_norm_eps": 1e-05,
74 | "layerdrop": 0.1,
75 | "mask_feature_length": 10,
76 | "mask_feature_min_masks": 0,
77 | "mask_feature_prob": 0.0,
78 | "mask_time_length": 10,
79 | "mask_time_min_masks": 2,
80 | "mask_time_prob": 0.05,
81 | "model_type": "wav2vec2",
82 | "num_adapter_layers": 3,
83 | "num_attention_heads": 16,
84 | "num_codevector_groups": 2,
85 | "num_codevectors_per_group": 320,
86 | "num_conv_pos_embedding_groups": 16,
87 | "num_conv_pos_embeddings": 128,
88 | "num_feat_extract_layers": 7,
89 | "num_hidden_layers": 12,
90 | "num_negatives": 100,
91 | "output_hidden_size": 1024,
92 | "pad_token_id": 0,
93 | "pooling_mode": "mean",
94 | "problem_type": "regression",
95 | "proj_codevector_dim": 768,
96 | "tdnn_dilation": [
97 | 1,
98 | 2,
99 | 3,
100 | 1,
101 | 1
102 | ],
103 | "tdnn_dim": [
104 | 512,
105 | 512,
106 | 512,
107 | 512,
108 | 1500
109 | ],
110 | "tdnn_kernel": [
111 | 5,
112 | 3,
113 | 3,
114 | 1,
115 | 1
116 | ],
117 | "torch_dtype": "float32",
118 | "transformers_version": "4.17.0.dev0",
119 | "use_weighted_layer_sum": false,
120 | "vocab_size": null,
121 | "xvector_output_dim": 512
122 | }
123 |
--------------------------------------------------------------------------------
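
The config above describes a wav2vec2 regressor with three continuous outputs (arousal, dominance, valence) rather than discrete classes (`"problem_type": "regression"`). A minimal sketch that reads those fields back with `transformers`, assuming the directory is present locally:

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim")
print(cfg.id2label)      # {0: 'arousal', 1: 'dominance', 2: 'valence'}
print(cfg.problem_type)  # regression
```
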
/spec_gen.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from tqdm import tqdm
3 | from multiprocessing import Pool
4 | from mel_processing import spectrogram_torch, mel_spectrogram_torch
5 | from utils import load_wav_to_torch
6 |
7 |
8 | class AudioProcessor:
9 | def __init__(
10 | self,
11 | max_wav_value,
12 | use_mel_spec_posterior,
13 | filter_length,
14 | n_mel_channels,
15 | sampling_rate,
16 | hop_length,
17 | win_length,
18 | mel_fmin,
19 | mel_fmax,
20 | ):
21 | self.max_wav_value = max_wav_value
22 | self.use_mel_spec_posterior = use_mel_spec_posterior
23 | self.filter_length = filter_length
24 | self.n_mel_channels = n_mel_channels
25 | self.sampling_rate = sampling_rate
26 | self.hop_length = hop_length
27 | self.win_length = win_length
28 | self.mel_fmin = mel_fmin
29 | self.mel_fmax = mel_fmax
30 |
31 | def process_audio(self, filename):
32 | audio, sampling_rate = load_wav_to_torch(filename)
33 | audio_norm = audio / self.max_wav_value
34 | audio_norm = audio_norm.unsqueeze(0)
35 | spec_filename = filename.replace(".wav", ".spec.pt")
36 | if self.use_mel_spec_posterior:
37 | spec_filename = spec_filename.replace(".spec.pt", ".mel.pt")
38 | try:
39 | spec = torch.load(spec_filename)
40 |         except Exception:
41 | if self.use_mel_spec_posterior:
42 | spec = mel_spectrogram_torch(
43 | audio_norm,
44 | self.filter_length,
45 | self.n_mel_channels,
46 | self.sampling_rate,
47 | self.hop_length,
48 | self.win_length,
49 | self.mel_fmin,
50 | self.mel_fmax,
51 | center=False,
52 | )
53 | else:
54 | spec = spectrogram_torch(
55 | audio_norm,
56 | self.filter_length,
57 | self.sampling_rate,
58 | self.hop_length,
59 | self.win_length,
60 | center=False,
61 | )
62 | spec = torch.squeeze(spec, 0)
63 | torch.save(spec, spec_filename)
64 | return spec, audio_norm
65 |
66 |
67 | # usage example
68 | processor = AudioProcessor(
69 | max_wav_value=32768.0,
70 | use_mel_spec_posterior=False,
71 | filter_length=2048,
72 | n_mel_channels=128,
73 | sampling_rate=44100,
74 | hop_length=512,
75 | win_length=2048,
76 | mel_fmin=0.0,
77 |     mel_fmax=None,  # unused here since use_mel_spec_posterior is False
78 | )
79 |
80 | with open("filelists/train.list", "r") as f:
81 |     filepaths = [line.split("|")[0] for line in f]  # first |-separated field is the audio path
82 |
83 | # 使用多进程处理
84 | with Pool(processes=32) as pool:  # spawn 32 worker processes
85 | with tqdm(total=len(filepaths)) as pbar:
86 | for i, _ in enumerate(pool.imap_unordered(processor.process_audio, filepaths)):
87 | pbar.update()
88 |
--------------------------------------------------------------------------------
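
For reference, `spectrogram_torch` with the settings above yields `filter_length // 2 + 1 = 1025` frequency bins and roughly `num_samples / hop_length` frames; a quick shape check on synthetic audio, so no files are needed:

```python
import torch
from mel_processing import spectrogram_torch

audio = torch.rand(1, 44100) * 2 - 1  # one second of synthetic audio in [-1, 1] at 44.1 kHz
spec = spectrogram_torch(audio, 2048, 44100, 512, 2048, center=False)
print(spec.shape)  # (1, 1025, ~86): n_fft//2 + 1 bins, about len/hop frames
```
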
/bert_gen.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from multiprocessing import Pool
3 | import commons
4 | import utils
5 | from tqdm import tqdm
6 | from text import check_bert_models, cleaned_text_to_sequence, get_bert
7 | import argparse
8 | import torch.multiprocessing as mp
9 | from config import config
10 |
11 |
12 | def process_line(x):
13 | line, add_blank = x
14 | device = config.bert_gen_config.device
15 | if config.bert_gen_config.use_multi_device:
16 | rank = mp.current_process()._identity
17 | rank = rank[0] if len(rank) > 0 else 0
18 | if torch.cuda.is_available():
19 | gpu_id = rank % torch.cuda.device_count()
20 | device = torch.device(f"cuda:{gpu_id}")
21 | else:
22 | device = torch.device("cpu")
23 | wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")
24 | phone = phones.split(" ")
25 | tone = [int(i) for i in tone.split(" ")]
26 | word2ph = [int(i) for i in word2ph.split(" ")]
27 | word2ph = [i for i in word2ph]
28 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
29 |
30 | if add_blank:
31 | phone = commons.intersperse(phone, 0)
32 | tone = commons.intersperse(tone, 0)
33 | language = commons.intersperse(language, 0)
34 | for i in range(len(word2ph)):
35 | word2ph[i] = word2ph[i] * 2
36 | word2ph[0] += 1
37 |
38 | bert_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".bert.pt")
39 |
40 | try:
41 | bert = torch.load(bert_path)
42 | assert bert.shape[0] == 2048
43 | except Exception:
44 | bert = get_bert(text, word2ph, language_str, device)
45 | assert bert.shape[-1] == len(phone)
46 | torch.save(bert, bert_path)
47 |
48 |
49 | preprocess_text_config = config.preprocess_text_config
50 |
51 | if __name__ == "__main__":
52 | parser = argparse.ArgumentParser()
53 | parser.add_argument(
54 | "-c", "--config", type=str, default=config.bert_gen_config.config_path
55 | )
56 | parser.add_argument(
57 | "--num_processes", type=int, default=config.bert_gen_config.num_processes
58 | )
59 | args, _ = parser.parse_known_args()
60 | config_path = args.config
61 | hps = utils.get_hparams_from_file(config_path)
62 | check_bert_models()
63 | lines = []
64 | with open(hps.data.training_files, encoding="utf-8") as f:
65 | lines.extend(f.readlines())
66 |
67 | with open(hps.data.validation_files, encoding="utf-8") as f:
68 | lines.extend(f.readlines())
69 | add_blank = [hps.data.add_blank] * len(lines)
70 |
71 | if len(lines) != 0:
72 | num_processes = args.num_processes
73 | with Pool(processes=num_processes) as pool:
74 | for _ in tqdm(
75 | pool.imap_unordered(process_line, zip(lines, add_blank)),
76 | total=len(lines),
77 | ):
78 |                 # the loop body only drives the progress bar
79 |                 pass  # placeholder
80 |
81 | print(f"bert生成完毕!, 共有{len(lines)}个bert.pt生成!")
82 |
--------------------------------------------------------------------------------
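
`process_line` above unpacks seven `|`-separated fields from each filelist line. A hypothetical ZH example of the expected layout (phones are wrapped in `_` padding and `word2ph` covers `[CLS]` + each character + `[SEP]`, matching the asserts in `get_bert`):

```
# field layout (not a real header): wav_path|speaker|language|text|phones|tones|word2ph
raw/demo_zh/clip_001.wav|demo|ZH|你好|_ n i h ao _|0 2 2 3 3 0|1 2 2 1
```
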
/oldVersion/V110/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Compatibility with version 1.1
3 | https://github.com/fishaudio/Bert-VITS2/releases/tag/1.1
4 | """
5 | import torch
6 | import commons
7 | from .text.cleaner import clean_text
8 | from .text import cleaned_text_to_sequence
9 | from oldVersion.V111.text import get_bert
10 |
11 |
12 | def get_text(text, language_str, hps, device):
13 | norm_text, phone, tone, word2ph = clean_text(text, language_str)
14 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
15 |
16 | if hps.data.add_blank:
17 | phone = commons.intersperse(phone, 0)
18 | tone = commons.intersperse(tone, 0)
19 | language = commons.intersperse(language, 0)
20 | for i in range(len(word2ph)):
21 | word2ph[i] = word2ph[i] * 2
22 | word2ph[0] += 1
23 | bert = get_bert(norm_text, word2ph, language_str, device)
24 | del word2ph
25 | assert bert.shape[-1] == len(phone), phone
26 |
27 | if language_str == "ZH":
28 |         # bert already holds the ZH features
29 | ja_bert = torch.zeros(768, len(phone))
30 | elif language_str == "JP":
31 | ja_bert = bert
32 | bert = torch.zeros(1024, len(phone))
33 | else:
34 | bert = torch.zeros(1024, len(phone))
35 | ja_bert = torch.zeros(768, len(phone))
36 |
37 | assert bert.shape[-1] == len(
38 | phone
39 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"
40 |
41 | phone = torch.LongTensor(phone)
42 | tone = torch.LongTensor(tone)
43 | language = torch.LongTensor(language)
44 | return bert, ja_bert, phone, tone, language
45 |
46 |
47 | def infer(
48 | text,
49 | sdp_ratio,
50 | noise_scale,
51 | noise_scale_w,
52 | length_scale,
53 | sid,
54 | language,
55 | hps,
56 | net_g,
57 | device,
58 | ):
59 | bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps, device)
60 | with torch.no_grad():
61 | x_tst = phones.to(device).unsqueeze(0)
62 | tones = tones.to(device).unsqueeze(0)
63 | lang_ids = lang_ids.to(device).unsqueeze(0)
64 | bert = bert.to(device).unsqueeze(0)
65 | ja_bert = ja_bert.to(device).unsqueeze(0)
66 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
67 | del phones
68 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
69 | audio = (
70 | net_g.infer(
71 | x_tst,
72 | x_tst_lengths,
73 | speakers,
74 | tones,
75 | lang_ids,
76 | bert,
77 | ja_bert,
78 | sdp_ratio=sdp_ratio,
79 | noise_scale=noise_scale,
80 | noise_scale_w=noise_scale_w,
81 | length_scale=length_scale,
82 | )[0][0, 0]
83 | .data.cpu()
84 | .float()
85 | .numpy()
86 | )
87 | del x_tst, x_tst_lengths, speakers, tones, lang_ids, bert, ja_bert
88 | if torch.cuda.is_available():
89 | torch.cuda.empty_cache()
90 | return audio
91 |
--------------------------------------------------------------------------------
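
`get_text` above doubles `word2ph` when `add_blank` is set so that it stays aligned with the interspersed phone sequence: `commons.intersperse` turns N symbols into 2N+1, and the extra `word2ph[0] += 1` charges the leading blank to the first word. A toy check of that bookkeeping (values hypothetical):

```python
phone = [5, 9, 7]                # 3 phone ids
word2ph = [1, 2]                 # phones per word, sums to len(phone)
inter = [0, 5, 0, 9, 0, 7, 0]    # commons.intersperse(phone, 0): 2*3 + 1 symbols
word2ph = [n * 2 for n in word2ph]
word2ph[0] += 1                  # leading blank goes to the first word
assert sum(word2ph) == len(inter) == 7
```
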
/oldVersion/V101/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "I",
78 | "N",
79 | "U",
80 | "a",
81 | "b",
82 | "by",
83 | "ch",
84 | "cl",
85 | "d",
86 | "dy",
87 | "e",
88 | "f",
89 | "g",
90 | "gy",
91 | "h",
92 | "hy",
93 | "i",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "p",
103 | "py",
104 | "r",
105 | "ry",
106 | "s",
107 | "sh",
108 | "t",
109 | "ts",
110 | "u",
111 | "V",
112 | "w",
113 | "y",
114 | "z",
115 | ]
116 | num_ja_tones = 1
117 |
118 | # English
119 | en_symbols = [
120 | "aa",
121 | "ae",
122 | "ah",
123 | "ao",
124 | "aw",
125 | "ay",
126 | "b",
127 | "ch",
128 | "d",
129 | "dh",
130 | "eh",
131 | "er",
132 | "ey",
133 | "f",
134 | "g",
135 | "hh",
136 | "ih",
137 | "iy",
138 | "jh",
139 | "k",
140 | "l",
141 | "m",
142 | "n",
143 | "ng",
144 | "ow",
145 | "oy",
146 | "p",
147 | "r",
148 | "s",
149 | "sh",
150 | "t",
151 | "th",
152 | "uh",
153 | "uw",
154 | "V",
155 | "w",
156 | "y",
157 | "z",
158 | "zh",
159 | ]
160 | num_en_tones = 4
161 |
162 | # combine all symbols
163 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
164 | symbols = [pad] + normal_symbols + pu_symbols
165 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
166 |
167 | # combine all tones
168 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
169 |
170 | # language maps
171 | language_id_map = {"ZH": 0, "JA": 1, "EN": 2}
172 | num_languages = len(language_id_map.keys())
173 |
174 | language_tone_start_map = {
175 | "ZH": 0,
176 | "JA": num_zh_tones,
177 | "EN": num_zh_tones + num_ja_tones,
178 | }
179 |
180 | if __name__ == "__main__":
181 | a = set(zh_symbols)
182 | b = set(en_symbols)
183 | print(sorted(a & b))
184 |
--------------------------------------------------------------------------------
/update_status.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gradio as gr
3 |
4 | lang_dict = {"EN(英文)": "_en", "ZH(中文)": "_zh", "JP(日语)": "_jp"}
5 |
6 |
7 | def raw_dir_convert_to_path(target_dir: str, lang):
8 | res = target_dir.rstrip("/").rstrip("\\")
9 | if (not target_dir.startswith("raw")) and (not target_dir.startswith("./raw")):
10 | res = os.path.join("./raw", res)
11 | if (
12 | (not res.endswith("_zh"))
13 | and (not res.endswith("_jp"))
14 | and (not res.endswith("_en"))
15 | ):
16 | res += lang_dict[lang]
17 | return res
18 |
19 |
20 | def update_g_files():
21 | g_files = []
22 | cnt = 0
23 | for root, dirs, files in os.walk(os.path.abspath("./logs")):
24 | for file in files:
25 | if file.startswith("G_") and file.endswith(".pth"):
26 | g_files.append(os.path.join(root, file))
27 | cnt += 1
28 | print(g_files)
29 | return f"更新模型列表完成, 共找到{cnt}个模型", gr.Dropdown.update(choices=g_files)
30 |
31 |
32 | def update_c_files():
33 | c_files = []
34 | cnt = 0
35 | for root, dirs, files in os.walk(os.path.abspath("./logs")):
36 | for file in files:
37 | if file.startswith("config.json"):
38 | c_files.append(os.path.join(root, file))
39 | cnt += 1
40 | print(c_files)
41 | return f"更新模型列表完成, 共找到{cnt}个配置文件", gr.Dropdown.update(choices=c_files)
42 |
43 |
44 | def update_model_folders():
45 | subdirs = []
46 | cnt = 0
47 | for root, dirs, files in os.walk(os.path.abspath("./logs")):
48 | for dir_name in dirs:
49 | if os.path.basename(dir_name) != "eval":
50 | subdirs.append(os.path.join(root, dir_name))
51 | cnt += 1
52 | print(subdirs)
53 | return f"更新模型文件夹列表完成, 共找到{cnt}个文件夹", gr.Dropdown.update(choices=subdirs)
54 |
55 |
56 | def update_wav_lab_pairs():
57 | wav_count = tot_count = 0
58 | for root, _, files in os.walk("./raw"):
59 | for file in files:
60 | # print(file)
61 | file_path = os.path.join(root, file)
62 | if file.lower().endswith(".wav"):
63 | lab_file = os.path.splitext(file_path)[0] + ".lab"
64 | if os.path.exists(lab_file):
65 | wav_count += 1
66 | tot_count += 1
67 | return f"{wav_count} / {tot_count}"
68 |
69 |
70 | def update_raw_folders():
71 | subdirs = []
72 | cnt = 0
73 |     script_path = os.path.dirname(os.path.abspath(__file__))  # absolute path of the script's directory
74 | raw_path = os.path.join(script_path, "raw")
75 | print(raw_path)
76 | os.makedirs(raw_path, exist_ok=True)
77 | for root, dirs, files in os.walk(raw_path):
78 | for dir_name in dirs:
79 | relative_path = os.path.relpath(
80 | os.path.join(root, dir_name), script_path
81 |             )  # path relative to the script directory
82 | subdirs.append(relative_path)
83 | cnt += 1
84 | print(subdirs)
85 | return (
86 | f"更新raw音频文件夹列表完成, 共找到{cnt}个文件夹",
87 | gr.Dropdown.update(choices=subdirs),
88 | gr.Textbox.update(value=update_wav_lab_pairs()),
89 | )
90 |
--------------------------------------------------------------------------------
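
A hedged sketch of how these helpers are typically wired into a Gradio UI, assuming the Gradio 3.x API that `gr.Dropdown.update` implies (component names here are illustrative, not taken from the project's webui):

```python
import gradio as gr
from update_status import update_g_files

with gr.Blocks() as app:
    status = gr.Textbox(label="status")
    model_dd = gr.Dropdown(choices=[], label="model checkpoint")
    refresh = gr.Button("refresh model list")
    # update_g_files returns (message, Dropdown.update), matching these outputs
    refresh.click(update_g_files, outputs=[status, model_dd])

app.launch()
```
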
/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
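
Because every language's tones are packed into one global range, the offsets in `language_tone_start_map` above compose additively; a quick check of the arithmetic, using this file's values:

```python
from text.symbols import (
    num_tones, num_zh_tones, num_ja_tones, num_en_tones, language_tone_start_map,
)

# ZH occupies tones 0..5, JP 6..7, EN 8..11
assert num_tones == num_zh_tones + num_ja_tones + num_en_tones == 12
assert language_tone_start_map["JP"] == num_zh_tones == 6
assert language_tone_start_map["EN"] + num_en_tones == num_tones
```
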
/oldVersion/V110/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 1
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V111/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 1
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V200/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V210/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V220/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/onnx_modules/V220/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/onnx_modules/V230/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/onnx_modules/V220_novq_dev/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V200/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | @Desc: Compatibility with version 2.0 (covers 2.0.1 and 2.0.2-fix)
3 | """
4 | import torch
5 | import commons
6 | from .text import cleaned_text_to_sequence, get_bert
7 | from .text.cleaner import clean_text
8 |
9 |
10 | def get_text(text, language_str, hps, device):
11 |     # this version's get_text implementation
12 | norm_text, phone, tone, word2ph = clean_text(text, language_str)
13 | phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
14 |
15 | if hps.data.add_blank:
16 | phone = commons.intersperse(phone, 0)
17 | tone = commons.intersperse(tone, 0)
18 | language = commons.intersperse(language, 0)
19 | for i in range(len(word2ph)):
20 | word2ph[i] = word2ph[i] * 2
21 | word2ph[0] += 1
22 | bert_ori = get_bert(norm_text, word2ph, language_str, device)
23 | del word2ph
24 | assert bert_ori.shape[-1] == len(phone), phone
25 |
26 | if language_str == "ZH":
27 | bert = bert_ori
28 | ja_bert = torch.zeros(1024, len(phone))
29 | en_bert = torch.zeros(1024, len(phone))
30 | elif language_str == "JP":
31 | bert = torch.zeros(1024, len(phone))
32 | ja_bert = bert_ori
33 | en_bert = torch.zeros(1024, len(phone))
34 | elif language_str == "EN":
35 | bert = torch.zeros(1024, len(phone))
36 | ja_bert = torch.zeros(1024, len(phone))
37 | en_bert = bert_ori
38 | else:
39 | raise ValueError("language_str should be ZH, JP or EN")
40 |
41 | assert bert.shape[-1] == len(
42 | phone
43 | ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"
44 |
45 | phone = torch.LongTensor(phone)
46 | tone = torch.LongTensor(tone)
47 | language = torch.LongTensor(language)
48 | return bert, ja_bert, en_bert, phone, tone, language
49 |
50 |
51 | def infer(
52 | text,
53 | sdp_ratio,
54 | noise_scale,
55 | noise_scale_w,
56 | length_scale,
57 | sid,
58 | language,
59 | hps,
60 | net_g,
61 | device,
62 | ):
63 | bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
64 | text, language, hps, device
65 | )
66 | with torch.no_grad():
67 | x_tst = phones.to(device).unsqueeze(0)
68 | tones = tones.to(device).unsqueeze(0)
69 | lang_ids = lang_ids.to(device).unsqueeze(0)
70 | bert = bert.to(device).unsqueeze(0)
71 | ja_bert = ja_bert.to(device).unsqueeze(0)
72 | en_bert = en_bert.to(device).unsqueeze(0)
73 | x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
74 | del phones
75 | speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
76 | audio = (
77 | net_g.infer(
78 | x_tst,
79 | x_tst_lengths,
80 | speakers,
81 | tones,
82 | lang_ids,
83 | bert,
84 | ja_bert,
85 | en_bert,
86 | sdp_ratio=sdp_ratio,
87 | noise_scale=noise_scale,
88 | noise_scale_w=noise_scale_w,
89 | length_scale=length_scale,
90 | )[0][0, 0]
91 | .data.cpu()
92 | .float()
93 | .numpy()
94 | )
95 | del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert
96 | if torch.cuda.is_available():
97 | torch.cuda.empty_cache()
98 | return audio
99 |
--------------------------------------------------------------------------------
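
A hedged usage sketch for the V2.0-compatible `infer` above, assuming `hps` and `net_g` were loaded elsewhere (e.g. `utils.get_hparams_from_file` plus a restored checkpoint) and that `"speaker_name"` is a hypothetical key in `hps.data.spk2id`:

```python
audio = infer(
    "Hello, world.",
    sdp_ratio=0.2,
    noise_scale=0.6,
    noise_scale_w=0.8,
    length_scale=1.0,
    sid="speaker_name",  # hypothetical speaker key
    language="EN",
    hps=hps,
    net_g=net_g,
    device="cuda",
)
# audio is a float numpy array at hps.data.sampling_rate
```
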
/onnx_modules/V200/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/onnx_modules/V210/text/symbols.py:
--------------------------------------------------------------------------------
1 | punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2 | pu_symbols = punctuation + ["SP", "UNK"]
3 | pad = "_"
4 |
5 | # chinese
6 | zh_symbols = [
7 | "E",
8 | "En",
9 | "a",
10 | "ai",
11 | "an",
12 | "ang",
13 | "ao",
14 | "b",
15 | "c",
16 | "ch",
17 | "d",
18 | "e",
19 | "ei",
20 | "en",
21 | "eng",
22 | "er",
23 | "f",
24 | "g",
25 | "h",
26 | "i",
27 | "i0",
28 | "ia",
29 | "ian",
30 | "iang",
31 | "iao",
32 | "ie",
33 | "in",
34 | "ing",
35 | "iong",
36 | "ir",
37 | "iu",
38 | "j",
39 | "k",
40 | "l",
41 | "m",
42 | "n",
43 | "o",
44 | "ong",
45 | "ou",
46 | "p",
47 | "q",
48 | "r",
49 | "s",
50 | "sh",
51 | "t",
52 | "u",
53 | "ua",
54 | "uai",
55 | "uan",
56 | "uang",
57 | "ui",
58 | "un",
59 | "uo",
60 | "v",
61 | "van",
62 | "ve",
63 | "vn",
64 | "w",
65 | "x",
66 | "y",
67 | "z",
68 | "zh",
69 | "AA",
70 | "EE",
71 | "OO",
72 | ]
73 | num_zh_tones = 6
74 |
75 | # japanese
76 | ja_symbols = [
77 | "N",
78 | "a",
79 | "a:",
80 | "b",
81 | "by",
82 | "ch",
83 | "d",
84 | "dy",
85 | "e",
86 | "e:",
87 | "f",
88 | "g",
89 | "gy",
90 | "h",
91 | "hy",
92 | "i",
93 | "i:",
94 | "j",
95 | "k",
96 | "ky",
97 | "m",
98 | "my",
99 | "n",
100 | "ny",
101 | "o",
102 | "o:",
103 | "p",
104 | "py",
105 | "q",
106 | "r",
107 | "ry",
108 | "s",
109 | "sh",
110 | "t",
111 | "ts",
112 | "ty",
113 | "u",
114 | "u:",
115 | "w",
116 | "y",
117 | "z",
118 | "zy",
119 | ]
120 | num_ja_tones = 2
121 |
122 | # English
123 | en_symbols = [
124 | "aa",
125 | "ae",
126 | "ah",
127 | "ao",
128 | "aw",
129 | "ay",
130 | "b",
131 | "ch",
132 | "d",
133 | "dh",
134 | "eh",
135 | "er",
136 | "ey",
137 | "f",
138 | "g",
139 | "hh",
140 | "ih",
141 | "iy",
142 | "jh",
143 | "k",
144 | "l",
145 | "m",
146 | "n",
147 | "ng",
148 | "ow",
149 | "oy",
150 | "p",
151 | "r",
152 | "s",
153 | "sh",
154 | "t",
155 | "th",
156 | "uh",
157 | "uw",
158 | "V",
159 | "w",
160 | "y",
161 | "z",
162 | "zh",
163 | ]
164 | num_en_tones = 4
165 |
166 | # combine all symbols
167 | normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168 | symbols = [pad] + normal_symbols + pu_symbols
169 | sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170 |
171 | # combine all tones
172 | num_tones = num_zh_tones + num_ja_tones + num_en_tones
173 |
174 | # language maps
175 | language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176 | num_languages = len(language_id_map.keys())
177 |
178 | language_tone_start_map = {
179 | "ZH": 0,
180 | "JP": num_zh_tones,
181 | "EN": num_zh_tones + num_ja_tones,
182 | }
183 |
184 | if __name__ == "__main__":
185 | a = set(zh_symbols)
186 | b = set(en_symbols)
187 | print(sorted(a & b))
188 |
--------------------------------------------------------------------------------
/oldVersion/V101/text/japanese.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
2 | import re
3 | import sys
4 |
5 | import pyopenjtalk
6 |
7 | from .symbols import symbols
8 |
9 | # Regular expression matching Japanese without punctuation marks:
10 | _japanese_characters = re.compile(
11 | r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
12 | )
13 |
14 | # Regular expression matching non-Japanese characters or punctuation marks:
15 | _japanese_marks = re.compile(
16 | r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
17 | )
18 |
19 | # List of (symbol, Japanese) pairs for marks:
20 | _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]]
21 |
22 |
23 | # List of (consonant, sokuon) pairs:
24 | _real_sokuon = [
25 | (re.compile("%s" % x[0]), x[1])
26 | for x in [
27 | (r"Q([↑↓]*[kg])", r"k#\1"),
28 | (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
29 | (r"Q([↑↓]*[sʃ])", r"s\1"),
30 | (r"Q([↑↓]*[pb])", r"p#\1"),
31 | ]
32 | ]
33 |
34 | # List of (consonant, hatsuon) pairs:
35 | _real_hatsuon = [
36 | (re.compile("%s" % x[0]), x[1])
37 | for x in [
38 | (r"N([↑↓]*[pbm])", r"m\1"),
39 | (r"N([↑↓]*[ʧʥj])", r"n^\1"),
40 | (r"N([↑↓]*[tdn])", r"n\1"),
41 | (r"N([↑↓]*[kg])", r"ŋ\1"),
42 | ]
43 | ]
44 |
45 |
46 | def post_replace_ph(ph):
47 | rep_map = {
48 | ":": ",",
49 | ";": ",",
50 | ",": ",",
51 | "。": ".",
52 | "!": "!",
53 | "?": "?",
54 | "\n": ".",
55 | "·": ",",
56 | "、": ",",
57 | "...": "…",
58 | "v": "V",
59 | }
60 | if ph in rep_map.keys():
61 | ph = rep_map[ph]
62 | if ph in symbols:
63 | return ph
64 | if ph not in symbols:
65 | ph = "UNK"
66 | return ph
67 |
68 |
69 | def symbols_to_japanese(text):
70 | for regex, replacement in _symbols_to_japanese:
71 | text = re.sub(regex, replacement, text)
72 | return text
73 |
74 |
75 | def preprocess_jap(text):
76 | """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
77 | text = symbols_to_japanese(text)
78 | sentences = re.split(_japanese_marks, text)
79 | marks = re.findall(_japanese_marks, text)
80 | text = []
81 | for i, sentence in enumerate(sentences):
82 | if re.match(_japanese_characters, sentence):
83 | p = pyopenjtalk.g2p(sentence)
84 | text += p.split(" ")
85 |
86 | if i < len(marks):
87 | text += [marks[i].replace(" ", "")]
88 | return text
89 |
90 |
91 | def text_normalize(text):
92 |     # TODO: Japanese text normalization is not yet implemented
93 | return text
94 |
95 |
96 | def g2p(norm_text):
97 | phones = preprocess_jap(norm_text)
98 | phones = [post_replace_ph(i) for i in phones]
99 |     # TODO: implement tones and word2ph
100 | tones = [0 for i in phones]
101 | word2ph = [1 for i in phones]
102 | return phones, tones, word2ph
103 |
104 |
105 | if __name__ == "__main__":
106 | for line in open("../../../Downloads/transcript_utf8.txt").readlines():
107 | text = line.split(":")[1]
108 | phones, tones, word2ph = g2p(text)
109 | for p in phones:
110 | if p == "z":
111 | print(text, phones)
112 | sys.exit(0)
113 |
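For reference, a minimal usage sketch of the `g2p` pipeline above. This is hedged: it assumes `pyopenjtalk` and its dictionary are installed and that this module is importable as part of its package; the import path and the phoneme output shown are indicative only and may vary across pyopenjtalk versions.

```python
# Hypothetical import path; adjust to wherever this package sits on sys.path.
from oldVersion.V101.text.japanese import g2p, text_normalize

text = text_normalize("こんにちは、世界。")
phones, tones, word2ph = g2p(text)
print(phones)   # e.g. ['k', 'o', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', ',', ...]
print(tones)    # all zeros: tone prediction is not implemented in this V101 version
print(word2ph)  # all ones: word2ph is a per-phone placeholder here
```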
--------------------------------------------------------------------------------
/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 |
8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9 |
10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11 |
12 | models = dict()
13 |
14 |
15 | def get_bert_feature(
16 | text,
17 | word2ph,
18 | device=config.bert_gen_config.device,
19 | style_text=None,
20 | style_weight=0.7,
21 | ):
22 | if (
23 | sys.platform == "darwin"
24 | and torch.backends.mps.is_available()
25 | and device == "cpu"
26 | ):
27 | device = "mps"
28 | if not device:
29 | device = "cuda"
30 | if device not in models.keys():
31 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
32 | with torch.no_grad():
33 | inputs = tokenizer(text, return_tensors="pt")
34 | for i in inputs:
35 | inputs[i] = inputs[i].to(device)
36 | res = models[device](**inputs, output_hidden_states=True)
37 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
38 | if style_text:
39 | style_inputs = tokenizer(style_text, return_tensors="pt")
40 | for i in style_inputs:
41 | style_inputs[i] = style_inputs[i].to(device)
42 | style_res = models[device](**style_inputs, output_hidden_states=True)
43 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
44 | style_res_mean = style_res.mean(0)
45 | assert len(word2ph) == len(text) + 2
46 | word2phone = word2ph
47 | phone_level_feature = []
48 | for i in range(len(word2phone)):
49 | if style_text:
50 | repeat_feature = (
51 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
52 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
53 | )
54 | else:
55 | repeat_feature = res[i].repeat(word2phone[i], 1)
56 | phone_level_feature.append(repeat_feature)
57 |
58 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
59 |
60 | return phone_level_feature.T
61 |
62 |
63 | if __name__ == "__main__":
64 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each with a 1024-dim feature
65 | word2phone = [
66 | 1,
67 | 2,
68 | 1,
69 | 2,
70 | 2,
71 | 1,
72 | 2,
73 | 2,
74 | 1,
75 | 2,
76 | 2,
77 | 1,
78 | 2,
79 | 2,
80 | 2,
81 | 2,
82 | 2,
83 | 1,
84 | 1,
85 | 2,
86 | 2,
87 | 1,
88 | 2,
89 | 2,
90 | 2,
91 | 2,
92 | 1,
93 | 2,
94 | 2,
95 | 2,
96 | 2,
97 | 2,
98 | 1,
99 | 2,
100 | 2,
101 | 2,
102 | 2,
103 | 1,
104 | ]
105 |
106 |     # compute the total number of frames
107 | total_frames = sum(word2phone)
108 | print(word_level_feature.shape)
109 | print(word2phone)
110 | phone_level_feature = []
111 | for i in range(len(word2phone)):
112 | print(word_level_feature[i].shape)
113 |
114 |         # repeat each word's feature word2phone[i] times
115 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
116 | phone_level_feature.append(repeat_feature)
117 |
118 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
119 |     print(phone_level_feature.shape)  # torch.Size([65, 1024]), i.e. (sum(word2phone), 1024)
120 |
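The `style_text` branch above mixes each character's BERT feature with the mean feature of a reference text, weighted by `style_weight`. Below is a self-contained sketch of that blend, with dummy tensors standing in for BERT outputs:

```python
import torch

style_weight = 0.7
res = torch.rand(5, 1024)          # per-character features of the input text
style_res_mean = torch.rand(1024)  # mean feature of the style reference text
word2phone = [1, 2, 1, 3, 1]       # phones per character

# Each character's feature is repeated once per phone, then linearly
# interpolated toward the style mean by style_weight.
phone_level = torch.cat(
    [
        res[i].repeat(word2phone[i], 1) * (1 - style_weight)
        + style_res_mean.repeat(word2phone[i], 1) * style_weight
        for i in range(len(word2phone))
    ],
    dim=0,
)
print(phone_level.shape)  # torch.Size([8, 1024]) == (sum(word2phone), 1024)
```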
--------------------------------------------------------------------------------
/oldVersion/V210/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 |
8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9 |
10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11 |
12 | models = dict()
13 |
14 |
15 | def get_bert_feature(
16 | text,
17 | word2ph,
18 | device=config.bert_gen_config.device,
19 | style_text=None,
20 | style_weight=0.7,
21 | ):
22 | if (
23 | sys.platform == "darwin"
24 | and torch.backends.mps.is_available()
25 | and device == "cpu"
26 | ):
27 | device = "mps"
28 | if not device:
29 | device = "cuda"
30 | if device not in models.keys():
31 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
32 | with torch.no_grad():
33 | inputs = tokenizer(text, return_tensors="pt")
34 | for i in inputs:
35 | inputs[i] = inputs[i].to(device)
36 | res = models[device](**inputs, output_hidden_states=True)
37 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
38 | if style_text:
39 | style_inputs = tokenizer(style_text, return_tensors="pt")
40 | for i in style_inputs:
41 | style_inputs[i] = style_inputs[i].to(device)
42 | style_res = models[device](**style_inputs, output_hidden_states=True)
43 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
44 | style_res_mean = style_res.mean(0)
45 |
46 | assert len(word2ph) == len(text) + 2
47 | word2phone = word2ph
48 | phone_level_feature = []
49 | for i in range(len(word2phone)):
50 | if style_text:
51 | repeat_feature = (
52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
54 | )
55 | else:
56 | repeat_feature = res[i].repeat(word2phone[i], 1)
57 | phone_level_feature.append(repeat_feature)
58 |
59 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
60 |
61 | return phone_level_feature.T
62 |
63 |
64 | if __name__ == "__main__":
65 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each with a 1024-dim feature
66 | word2phone = [
67 | 1,
68 | 2,
69 | 1,
70 | 2,
71 | 2,
72 | 1,
73 | 2,
74 | 2,
75 | 1,
76 | 2,
77 | 2,
78 | 1,
79 | 2,
80 | 2,
81 | 2,
82 | 2,
83 | 2,
84 | 1,
85 | 1,
86 | 2,
87 | 2,
88 | 1,
89 | 2,
90 | 2,
91 | 2,
92 | 2,
93 | 1,
94 | 2,
95 | 2,
96 | 2,
97 | 2,
98 | 2,
99 | 1,
100 | 2,
101 | 2,
102 | 2,
103 | 2,
104 | 1,
105 | ]
106 |
107 |     # compute the total number of frames
108 | total_frames = sum(word2phone)
109 | print(word_level_feature.shape)
110 | print(word2phone)
111 | phone_level_feature = []
112 | for i in range(len(word2phone)):
113 | print(word_level_feature[i].shape)
114 |
115 |         # repeat each word's feature word2phone[i] times
116 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
117 | phone_level_feature.append(repeat_feature)
118 |
119 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
120 |     print(phone_level_feature.shape)  # torch.Size([65, 1024]), i.e. (sum(word2phone), 1024)
121 |
--------------------------------------------------------------------------------
/oldVersion/V220/text/chinese_bert.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from transformers import AutoModelForMaskedLM, AutoTokenizer
5 |
6 | from config import config
7 |
8 | LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9 |
10 | tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11 |
12 | models = dict()
13 |
14 |
15 | def get_bert_feature(
16 | text,
17 | word2ph,
18 | device=config.bert_gen_config.device,
19 | style_text=None,
20 | style_weight=0.7,
21 | ):
22 | if (
23 | sys.platform == "darwin"
24 | and torch.backends.mps.is_available()
25 | and device == "cpu"
26 | ):
27 | device = "mps"
28 | if not device:
29 | device = "cuda"
30 | if device not in models.keys():
31 | models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
32 | with torch.no_grad():
33 | inputs = tokenizer(text, return_tensors="pt")
34 | for i in inputs:
35 | inputs[i] = inputs[i].to(device)
36 | res = models[device](**inputs, output_hidden_states=True)
37 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
38 | if style_text:
39 | style_inputs = tokenizer(style_text, return_tensors="pt")
40 | for i in style_inputs:
41 | style_inputs[i] = style_inputs[i].to(device)
42 | style_res = models[device](**style_inputs, output_hidden_states=True)
43 | style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
44 | style_res_mean = style_res.mean(0)
45 |
46 | assert len(word2ph) == len(text) + 2
47 | word2phone = word2ph
48 | phone_level_feature = []
49 | for i in range(len(word2phone)):
50 | if style_text:
51 | repeat_feature = (
52 | res[i].repeat(word2phone[i], 1) * (1 - style_weight)
53 | + style_res_mean.repeat(word2phone[i], 1) * style_weight
54 | )
55 | else:
56 | repeat_feature = res[i].repeat(word2phone[i], 1)
57 | phone_level_feature.append(repeat_feature)
58 |
59 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
60 |
61 | return phone_level_feature.T
62 |
63 |
64 | if __name__ == "__main__":
65 |     word_level_feature = torch.rand(38, 1024)  # 38 words, each with a 1024-dim feature
66 | word2phone = [
67 | 1,
68 | 2,
69 | 1,
70 | 2,
71 | 2,
72 | 1,
73 | 2,
74 | 2,
75 | 1,
76 | 2,
77 | 2,
78 | 1,
79 | 2,
80 | 2,
81 | 2,
82 | 2,
83 | 2,
84 | 1,
85 | 1,
86 | 2,
87 | 2,
88 | 1,
89 | 2,
90 | 2,
91 | 2,
92 | 2,
93 | 1,
94 | 2,
95 | 2,
96 | 2,
97 | 2,
98 | 2,
99 | 1,
100 | 2,
101 | 2,
102 | 2,
103 | 2,
104 | 1,
105 | ]
106 |
107 |     # compute the total number of frames
108 | total_frames = sum(word2phone)
109 | print(word_level_feature.shape)
110 | print(word2phone)
111 | phone_level_feature = []
112 | for i in range(len(word2phone)):
113 | print(word_level_feature[i].shape)
114 |
115 |         # repeat each word's feature word2phone[i] times
116 | repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
117 | phone_level_feature.append(repeat_feature)
118 |
119 | phone_level_feature = torch.cat(phone_level_feature, dim=0)
120 |     print(phone_level_feature.shape)  # torch.Size([65, 1024]), i.e. (sum(word2phone), 1024)
121 |
--------------------------------------------------------------------------------
/bert/deberta-v3-large/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | language: en
3 | tags:
4 | - deberta
5 | - deberta-v3
6 | - fill-mask
7 | thumbnail: https://huggingface.co/front/thumbnails/microsoft.png
8 | license: mit
9 | ---
10 |
11 | ## DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing
12 |
13 | [DeBERTa](https://arxiv.org/abs/2006.03654) improves the BERT and RoBERTa models using disentangled attention and an enhanced mask decoder. With those two improvements, DeBERTa outperforms RoBERTa on a majority of NLU tasks given 80GB of training data.
14 |
15 | In [DeBERTa V3](https://arxiv.org/abs/2111.09543), we further improved the efficiency of DeBERTa using ELECTRA-style pre-training with gradient-disentangled embedding sharing. Compared to DeBERTa, our V3 version significantly improves model performance on downstream tasks. You can find more technical details about the new model in our [paper](https://arxiv.org/abs/2111.09543).
16 |
17 | Please check the [official repository](https://github.com/microsoft/DeBERTa) for more implementation details and updates.
18 |
19 | The DeBERTa V3 large model comes with 24 layers and a hidden size of 1024. It has 304M backbone parameters and a vocabulary of 128K tokens, which introduces 131M parameters in the embedding layer. This model was trained on the same 160GB of data as DeBERTa V2.
20 |
21 |
22 | #### Fine-tuning on NLU tasks
23 |
24 | We present the dev results on SQuAD 2.0 and MNLI tasks.
25 |
26 | | Model |Vocabulary(K)|Backbone #Params(M)| SQuAD 2.0(F1/EM) | MNLI-m/mm(ACC)|
27 | |-------------------|----------|-------------------|-----------|----------|
28 | | RoBERTa-large |50 |304 | 89.4/86.5 | 90.2 |
29 | | XLNet-large |32 |- | 90.6/87.9 | 90.8 |
30 | | DeBERTa-large |50 |- | 90.7/88.0 | 91.3 |
31 | | **DeBERTa-v3-large**|128|304 | **91.5/89.0**| **91.8/91.9**|
32 |
33 |
34 | #### Fine-tuning with HF transformers
35 |
36 | ```bash
37 | #!/bin/bash
38 |
39 | cd transformers/examples/pytorch/text-classification/
40 |
41 | pip install datasets
42 | export TASK_NAME=mnli
43 |
44 | output_dir="ds_results"
45 |
46 | num_gpus=8
47 |
48 | batch_size=8
49 |
50 | python -m torch.distributed.launch --nproc_per_node=${num_gpus} \
51 | run_glue.py \
52 | --model_name_or_path microsoft/deberta-v3-large \
53 | --task_name $TASK_NAME \
54 | --do_train \
55 | --do_eval \
56 | --evaluation_strategy steps \
57 | --max_seq_length 256 \
58 | --warmup_steps 50 \
59 | --per_device_train_batch_size ${batch_size} \
60 | --learning_rate 6e-6 \
61 | --num_train_epochs 2 \
62 | --output_dir $output_dir \
63 | --overwrite_output_dir \
64 | --logging_steps 1000 \
65 | --logging_dir $output_dir
66 |
67 | ```
68 |
69 | ### Citation
70 |
71 | If you find DeBERTa useful for your work, please cite the following papers:
72 |
73 | ```latex
74 | @misc{he2021debertav3,
75 | title={DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing},
76 | author={Pengcheng He and Jianfeng Gao and Weizhu Chen},
77 | year={2021},
78 | eprint={2111.09543},
79 | archivePrefix={arXiv},
80 | primaryClass={cs.CL}
81 | }
82 | ```
83 |
84 | ```latex
85 | @inproceedings{
86 | he2021deberta,
87 | title={DEBERTA: DECODING-ENHANCED BERT WITH DISENTANGLED ATTENTION},
88 | author={Pengcheng He and Xiaodong Liu and Jianfeng Gao and Weizhu Chen},
89 | booktitle={International Conference on Learning Representations},
90 | year={2021},
91 | url={https://openreview.net/forum?id=XPZIaotutsD}
92 | }
93 | ```
94 |
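Beyond fine-tuning, the checkpoint can be sanity-checked with masked-token prediction. Below is a minimal inference sketch; `microsoft/deberta-v3-large` is the upstream hub ID for these files, and this repository vendors the same weights under `./bert/deberta-v3-large`, which should also work as a local `model` path:

```python
# Requires the `sentencepiece` package for the DeBERTa-v3 tokenizer.
from transformers import pipeline

fill = pipeline("fill-mask", model="microsoft/deberta-v3-large")
for pred in fill("The goal of life is [MASK]."):
    print(f"{pred['token_str']!r}: {pred['score']:.3f}")
```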
--------------------------------------------------------------------------------
/oldVersion/V210/emo_gen.py:
--------------------------------------------------------------------------------
1 | import librosa
2 | import numpy as np
3 | import torch
4 | import torch.nn as nn
5 | from torch.utils.data import Dataset
6 |
7 | from transformers import Wav2Vec2Processor
8 | from transformers.models.wav2vec2.modeling_wav2vec2 import (
9 | Wav2Vec2Model,
10 | Wav2Vec2PreTrainedModel,
11 | )
12 |
13 | from config import config
14 |
15 |
16 | class RegressionHead(nn.Module):
17 | r"""Classification head."""
18 |
19 | def __init__(self, config):
20 | super().__init__()
21 |
22 | self.dense = nn.Linear(config.hidden_size, config.hidden_size)
23 | self.dropout = nn.Dropout(config.final_dropout)
24 | self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
25 |
26 | def forward(self, features, **kwargs):
27 | x = features
28 | x = self.dropout(x)
29 | x = self.dense(x)
30 | x = torch.tanh(x)
31 | x = self.dropout(x)
32 | x = self.out_proj(x)
33 |
34 | return x
35 |
36 |
37 | class EmotionModel(Wav2Vec2PreTrainedModel):
38 | r"""Speech emotion classifier."""
39 |
40 | def __init__(self, config):
41 | super().__init__(config)
42 |
43 | self.config = config
44 | self.wav2vec2 = Wav2Vec2Model(config)
45 | self.classifier = RegressionHead(config)
46 | self.init_weights()
47 |
48 | def forward(
49 | self,
50 | input_values,
51 | ):
52 | outputs = self.wav2vec2(input_values)
53 | hidden_states = outputs[0]
54 | hidden_states = torch.mean(hidden_states, dim=1)
55 | logits = self.classifier(hidden_states)
56 |
57 | return hidden_states, logits
58 |
59 |
60 | class AudioDataset(Dataset):
61 | def __init__(self, list_of_wav_files, sr, processor):
62 | self.list_of_wav_files = list_of_wav_files
63 | self.processor = processor
64 | self.sr = sr
65 |
66 | def __len__(self):
67 | return len(self.list_of_wav_files)
68 |
69 | def __getitem__(self, idx):
70 | wav_file = self.list_of_wav_files[idx]
71 | audio_data, _ = librosa.load(wav_file, sr=self.sr)
72 | processed_data = self.processor(audio_data, sampling_rate=self.sr)[
73 | "input_values"
74 | ][0]
75 | return torch.from_numpy(processed_data)
76 |
77 |
78 | device = config.emo_gen_config.device
79 | model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim"
80 | processor = Wav2Vec2Processor.from_pretrained(model_name)
81 | model = EmotionModel.from_pretrained(model_name).to(device)
82 |
83 |
84 | def process_func(
85 | x: np.ndarray,
86 | sampling_rate: int,
87 | model: EmotionModel,
88 | processor: Wav2Vec2Processor,
89 | device: str,
90 | embeddings: bool = False,
91 | ) -> np.ndarray:
92 | r"""Predict emotions or extract embeddings from raw audio signal."""
93 | model = model.to(device)
94 | y = processor(x, sampling_rate=sampling_rate)
95 | y = y["input_values"][0]
96 | y = torch.from_numpy(y).unsqueeze(0).to(device)
97 |
98 | # run through model
99 | with torch.no_grad():
100 | y = model(y)[0 if embeddings else 1]
101 |
102 | # convert to numpy
103 | y = y.detach().cpu().numpy()
104 |
105 | return y
106 |
107 |
108 | def get_emo(path):
109 |     wav, sr = librosa.load(path, sr=16000)
110 | return process_func(
111 | np.expand_dims(wav, 0).astype(np.float64),
112 | sr,
113 | model,
114 | processor,
115 | device,
116 | embeddings=True,
117 | ).squeeze(0)
118 |
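For reference, a minimal usage sketch of `get_emo` above. This is hedged: the wav path is a placeholder, and it assumes the module-level `model`/`processor` loaded successfully from `./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim`:

```python
# Run after importing this module; any mono wav works, since librosa
# resamples it to 16 kHz inside get_emo.
emb = get_emo("path/to/sample.wav")  # placeholder path
print(emb.shape)  # (1024,) -- mean-pooled wav2vec2 hidden states used as the emotion embedding
```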
--------------------------------------------------------------------------------