├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── environment.yaml ├── kantts ├── __init__.py ├── bin │ ├── __init__.py │ ├── infer_hifigan.py │ ├── infer_sambert.py │ ├── text_to_wav.py │ ├── train_hifigan.py │ ├── train_sambert.py │ └── train_sybert.py ├── configs │ ├── audio_config_16k.yaml │ ├── audio_config_24k.yaml │ ├── audio_config_48k.yaml │ ├── audio_config_8k.yaml │ ├── audio_config_se_16k.yaml │ ├── hifigan_noncausal_nsf_global_v1_16k.yaml │ ├── hifigan_noncausal_nsf_v1_16k.yaml │ ├── hifigan_noncausal_v1_16k.yaml │ ├── hifigan_v1_16k.yaml │ ├── hifigan_v1_24k.yaml │ ├── hifigan_v1_48k.yaml │ ├── hifigan_v1_8k.yaml │ ├── hifigan_v1_nsf_24k.yaml │ ├── sambert_16k.yaml │ ├── sambert_16k_MAS.yaml │ ├── sambert_16k_MAS_byte.yaml │ ├── sambert_24k.yaml │ ├── sambert_48k.yaml │ ├── sambert_fp_8k.yaml │ ├── sambert_nsf_16k.yaml │ ├── sambert_nsf_24k.yaml │ ├── sambert_se_nsf_global_16k.yaml │ ├── sambert_sichuan_16k.yaml │ └── sybert.yaml ├── datasets │ ├── __init__.py │ ├── data_types.py │ └── dataset.py ├── models │ ├── __init__.py │ ├── hifigan │ │ ├── hifigan.py │ │ └── layers.py │ ├── pqmf.py │ ├── sambert │ │ ├── __init__.py │ │ ├── adaptors.py │ │ ├── alignment.py │ │ ├── attention.py │ │ ├── fsmn.py │ │ ├── kantts_sambert.py │ │ └── positions.py │ └── utils.py ├── preprocess │ ├── __init__.py │ ├── audio_processor │ │ ├── __init__.py │ │ ├── audio_processor.py │ │ └── core │ │ │ ├── __init__.py │ │ │ ├── dsp.py │ │ │ └── utils.py │ ├── data_process.py │ ├── fp_processor.py │ ├── languages │ │ ├── PinYin │ │ │ ├── En2ChPhoneMap.txt │ │ │ ├── PhoneSet.xml │ │ │ ├── PosSet.xml │ │ │ ├── py2phoneMap.txt │ │ │ └── tonelist.txt │ │ ├── Sichuan │ │ │ ├── En2ChPhoneMap.txt │ │ │ ├── PhoneSet.xml │ │ │ ├── PosSet.xml │ │ │ ├── py2phoneMap.txt │ │ │ └── tonelist.txt │ │ ├── WuuShanghai │ │ │ ├── En2ChPhoneMap.txt │ │ │ ├── PhoneSet.xml │ │ │ ├── PosSet.xml │ │ │ ├── py2phoneMap.txt │ │ │ └── tonelist.txt │ │ ├── ZhHK │ │ │ ├── En2ChPhoneMap.txt │ │ │ ├── PhoneSet.xml │ │ │ ├── PosSet.xml │ │ │ ├── py2phoneMap.txt │ │ │ └── tonelist.txt │ │ └── __init__.py │ ├── script_convertor │ │ ├── TextScriptConvertor.py │ │ ├── __init__.py │ │ └── core │ │ │ ├── Phone.py │ │ │ ├── PhoneSet.py │ │ │ ├── Pos.py │ │ │ ├── PosSet.py │ │ │ ├── Script.py │ │ │ ├── ScriptItem.py │ │ │ ├── ScriptSentence.py │ │ │ ├── ScriptWord.py │ │ │ ├── Syllable.py │ │ │ ├── SyllableFormatter.py │ │ │ ├── XmlObj.py │ │ │ ├── __init__.py │ │ │ ├── core_types.py │ │ │ └── utils.py │ ├── se_processor │ │ ├── D_TDNN.py │ │ ├── __init__.py │ │ ├── layers.py │ │ └── se_processor.py │ └── text_process.py ├── train │ ├── __init__.py │ ├── loss.py │ ├── scheduler.py │ └── trainer.py └── utils │ ├── __init__.py │ ├── audio_torch.py │ ├── ling_unit │ ├── __init__.py │ ├── cleaners.py │ ├── emotion_types.py │ ├── lang_symbols.py │ ├── ling_unit.py │ └── numbers.py │ ├── log.py │ └── plot.py ├── notebooks └── README.md ├── requirements.txt ├── setup.py └── test └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually 
these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | # 162 | # MISC 163 | .DS_Store 164 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: 20.8b1 4 | hooks: 5 | - id: black 6 | additional_dependencies: ['click==8.0.4'] 7 | - repo: https://gitlab.com/pycqa/flake8 8 | rev: 3.8.4 9 | hooks: 10 | - id: flake8 11 | args: ['--max-line-length=120', '--extend-ignore=E203'] 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Alibaba Research 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # KAN-TTS 2 | 3 | With KAN-TTS you can train your own TTS model from zero to hero :). 4 | 5 | ## Models 6 | Currently we support SAMBERT and HiFi-GAN; other models are coming soon. 7 | 8 | ## Supported Languages 9 | | Language | Model Links | 10 | | :---: | :---: | 11 | | Mandarin | https://modelscope.cn/models?name=zhcn&page=1&tasks=text-to-speech&type=audio | 12 | | English | https://modelscope.cn/models?name=enus&page=1&tasks=text-to-speech&type=audio | 13 | | British | https://modelscope.cn/models?name=engb&page=1&tasks=text-to-speech&type=audio | 14 | | Shanghainese | https://modelscope.cn/models?name=WuuShanghai&page=1&tasks=text-to-speech&type=audio | 15 | | Sichuanese | https://modelscope.cn/models?name=Sichuan&page=1&tasks=text-to-speech&type=audio | 16 | | Cantonese | https://modelscope.cn/models?name=Cantonese&page=1&tasks=text-to-speech&type=audio | 17 | | Italian | https://modelscope.cn/models?name=itit&page=1&tasks=text-to-speech&type=audio | 18 | | Spanish | https://modelscope.cn/models?name=eses&page=1&tasks=text-to-speech&type=audio | 19 | | Russian | https://modelscope.cn/models?name=ruru&page=1&tasks=text-to-speech&type=audio | 20 | | Korean | https://modelscope.cn/models?name=kokr&page=1&tasks=text-to-speech&type=audio | 21 | More languages are coming soon. 22 | 23 | ## Training Tutorial 24 | You can find the training tutorial on our wiki page [KAN-TTS Wiki](https://github.com/AlibabaResearch/KAN-TTS/wiki).
25 | 26 | ## ModelScope Demo 27 | Try our demo on ModelScope [KAN-TTS Demo](https://modelscope.cn/models?page=1&tasks=text-to-speech). 28 | 29 | ## Contribute to this repo 30 | 31 | ```shell 32 | pip install -r requirements.txt 33 | pre-commit install 34 | ``` 35 | 36 | ## Contact us 37 | If you have any questions, please feel free to contact us. 38 | 39 | Scan the QR code to join our DingTalk group. 40 | 41 | 42 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: maas 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - audioread 8 | - cudatoolkit=10.1 9 | - ffmpeg 10 | - lame 11 | - librosa=0.9.2 12 | - libsndfile 13 | - matplotlib=3.5.1 14 | - matplotlib-base=3.5.1 15 | - numba 16 | - numpy 17 | - unidecode 18 | - inflect 19 | - numpy-base 20 | - pip 21 | - protobuf=3.20.1 22 | - pysocks=1.7.1 23 | - pysoundfile 24 | - python=3.7.13 25 | - python-dateutil=2.8.2 26 | - python_abi=3.7 27 | - pytorch=1.7.0 28 | - pywavelets=1.3.0 29 | - pyyaml=6.0 30 | - readline 31 | - scikit-learn=1.0.2 32 | - scipy=1.7.3 33 | - setuptools=61.2.0 34 | - six=1.16.0 35 | - sqlite=3.38.5 36 | - tensorboardx=2.2 37 | - threadpoolctl=3.1.0 38 | - tk=8.6.12 39 | - torchaudio=0.7.0 40 | - torchvision=0.8.0 41 | - tqdm 42 | - urllib3 43 | - wheel 44 | - yaml=0.2.5 45 | - pip: 46 | - appnope==0.1.3 47 | - backcall==0.2.0 48 | - cython==0.29.30 49 | - dataclasses==0.6 50 | - future==0.18.2 51 | - greenlet==1.1.2 52 | - ipdb 53 | - ipython 54 | - jedi==0.18.1 55 | - matplotlib-inline==0.1.3 56 | - msgpack==1.0.4 57 | - parso==0.8.3 58 | - pexpect==4.8.0 59 | - pickleshare==0.7.5 60 | - prompt-toolkit==3.0.30 61 | - ptyprocess==0.7.0 62 | - pygments==2.12.0 63 | - pysptk 64 | - git+https://github.com/fbcotter/pytorch_wavelets.git 65 | - sox 66 | - toml==0.10.2 67 | - traitlets==5.3.0 68 | - wcwidth==0.2.5 69 | - bitstring==3.1.6 70 | - --find-links https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html 71 | - ttsfrd 72 | -------------------------------------------------------------------------------- /kantts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/__init__.py -------------------------------------------------------------------------------- /kantts/bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/bin/__init__.py -------------------------------------------------------------------------------- /kantts/bin/infer_hifigan.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import torch 5 | import soundfile as sf 6 | import yaml 7 | import logging 8 | import numpy as np 9 | import time 10 | import glob 11 | 12 | ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # NOQA: E402 13 | sys.path.insert(0, os.path.dirname(ROOT_PATH)) # NOQA: E402 14 | 15 | try: 16 | from kantts.utils.log import logging_to_file 17 | except ImportError: 18 | raise ImportError("Please install kantts.") 19 | 20 | logging.basicConfig( 21 | # filename=os.path.join(stage_dir, 'stdout.log'), 22 | format="%(asctime)s, %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s", 
23 | datefmt="%Y-%m-%d:%H:%M:%S", 24 | level=logging.INFO, 25 | ) 26 | 27 | 28 | def count_parameters(model): 29 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 30 | 31 | 32 | def load_model(ckpt, config=None): 33 | # load config if not provided 34 | if config is None: 35 | dirname = os.path.dirname(os.path.dirname(ckpt)) 36 | config = os.path.join(dirname, "config.yaml") 37 | with open(config) as f: 38 | config = yaml.load(f, Loader=yaml.Loader) 39 | 40 | # lazy load for circular error 41 | from kantts.models.hifigan.hifigan import Generator 42 | 43 | model = Generator(**config["Model"]["Generator"]["params"]) 44 | states = torch.load(ckpt, map_location="cpu") 45 | model.load_state_dict(states["model"]["generator"]) 46 | 47 | # add pqmf if needed 48 | if config["Model"]["Generator"]["params"]["out_channels"] > 1: 49 | # lazy load for circular error 50 | from kantts.models.pqmf import PQMF 51 | 52 | model.pqmf = PQMF() 53 | 54 | return model 55 | 56 | 57 | def binarize(mel, threshold=0.6): 58 | # vuv binarize 59 | res_mel = mel.copy() 60 | index = np.where(mel[:, -1] < threshold)[0] 61 | res_mel[:, -1] = 1.0 62 | res_mel[:, -1][index] = 0.0 63 | return res_mel 64 | 65 | 66 | def hifigan_infer(input_mel, ckpt_path, output_dir, config=None): 67 | if not torch.cuda.is_available(): 68 | device = torch.device("cpu") 69 | else: 70 | torch.backends.cudnn.benchmark = True 71 | device = torch.device("cuda", 0) 72 | 73 | if config is not None: 74 | with open(config, "r") as f: 75 | config = yaml.load(f, Loader=yaml.Loader) 76 | else: 77 | config_path = os.path.join( 78 | os.path.dirname(os.path.dirname(ckpt_path)), "config.yaml" 79 | ) 80 | if not os.path.exists(config_path): 81 | raise ValueError("config file not found: {}".format(config_path)) 82 | with open(config_path, "r") as f: 83 | config = yaml.load(f, Loader=yaml.Loader) 84 | 85 | for key, value in config.items(): 86 | logging.info(f"{key} = {value}") 87 | 88 | # check directory existence 89 | if not os.path.exists(output_dir): 90 | os.makedirs(output_dir) 91 | 92 | logging_to_file(os.path.join(output_dir, "stdout.log")) 93 | 94 | if os.path.isfile(input_mel): 95 | mel_lst = [input_mel] 96 | elif os.path.isdir(input_mel): 97 | mel_lst = glob.glob(os.path.join(input_mel, "*.npy")) 98 | else: 99 | raise ValueError("input_mel should be a file or a directory") 100 | 101 | model = load_model(ckpt_path, config) 102 | 103 | logging.info(f"Loaded model parameters from {ckpt_path}.") 104 | model.remove_weight_norm() 105 | model = model.eval().to(device) 106 | 107 | with torch.no_grad(): 108 | start = time.time() 109 | pcm_len = 0 110 | for mel in mel_lst: 111 | utt_id = os.path.splitext(os.path.basename(mel))[0] 112 | mel_data = np.load(mel) 113 | if model.nsf_enable: 114 | mel_data = binarize(mel_data) 115 | # generate 116 | mel_data = torch.tensor(mel_data, dtype=torch.float).to(device) 117 | # (T, C) -> (B, C, T) 118 | mel_data = mel_data.transpose(1, 0).unsqueeze(0) 119 | y = model(mel_data) 120 | if hasattr(model, "pqmf"): 121 | y = model.pqmf.synthesis(y) 122 | y = y.view(-1).cpu().numpy() 123 | pcm_len += len(y) 124 | 125 | # save as PCM 16 bit wav file 126 | sf.write( 127 | os.path.join(output_dir, f"{utt_id}_gen.wav"), 128 | y, 129 | config["audio_config"]["sampling_rate"], 130 | "PCM_16", 131 | ) 132 | rtf = (time.time() - start) / ( 133 | pcm_len / config["audio_config"]["sampling_rate"] 134 | ) 135 | 136 | # report average RTF 137 | logging.info( 138 | f"Finished generation of {len(mel_lst)} utterances (RTF = 
{rtf:.03f})." 139 | ) 140 | 141 | 142 | if __name__ == "__main__": 143 | parser = argparse.ArgumentParser(description="Infer hifigan model") 144 | parser.add_argument( 145 | "--ckpt", type=str, required=True, help="Path to model checkpoint" 146 | ) 147 | parser.add_argument( 148 | "--input_mel", 149 | type=str, 150 | required=True, 151 | help="Path to input mel file or directory containing mel files", 152 | ) 153 | parser.add_argument( 154 | "--output_dir", type=str, required=True, help="Path to output directory" 155 | ) 156 | parser.add_argument("--config", type=str, default=None, help="Path to config file") 157 | args = parser.parse_args() 158 | hifigan_infer( 159 | args.input_mel, 160 | args.ckpt, 161 | args.output_dir, 162 | args.config, 163 | ) 164 | -------------------------------------------------------------------------------- /kantts/bin/text_to_wav.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import yaml 5 | import logging 6 | import zipfile 7 | from glob import glob 8 | import soundfile as sf 9 | import numpy as np 10 | 11 | 12 | ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # NOQA: E402 13 | sys.path.insert(0, os.path.dirname(ROOT_PATH)) # NOQA: E402 14 | 15 | try: 16 | from kantts.bin.infer_sambert import am_infer 17 | from kantts.bin.infer_hifigan import hifigan_infer 18 | from kantts.utils.ling_unit import text_to_mit_symbols as text_to_symbols 19 | except ImportError: 20 | raise ImportError("Please install kantts.") 21 | 22 | logging.basicConfig( 23 | # filename=os.path.join(stage_dir, 'stdout.log'), 24 | format="%(asctime)s, %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s", 25 | datefmt="%Y-%m-%d:%H:%M:%S", 26 | level=logging.INFO, 27 | ) 28 | 29 | 30 | def concat_process(chunked_dir, output_dir): 31 | wav_files = sorted(glob(os.path.join(chunked_dir, "*.wav"))) 32 | print(wav_files) 33 | sentence_sil = 0.28 # seconds 34 | end_sil = 0.05 # seconds 35 | 36 | cnt = 0 37 | wav_concat = None 38 | main_id, sub_id = 0, 0 39 | 40 | while cnt < len(wav_files): 41 | wav_file = os.path.join( 42 | chunked_dir, "{}_{}_mel_gen.wav".format(main_id, sub_id) 43 | ) 44 | if os.path.exists(wav_file): 45 | wav, sr = sf.read(wav_file) 46 | sentence_sil_samples = int(sentence_sil * sr) 47 | end_sil_samples = int(end_sil * sr) 48 | if sub_id == 0: 49 | wav_concat = wav 50 | else: 51 | wav_concat = np.concatenate( 52 | (wav_concat, np.zeros(sentence_sil_samples), wav), axis=0 53 | ) 54 | 55 | sub_id += 1 56 | cnt += 1 57 | else: 58 | if wav_concat is not None: 59 | wav_concat = np.concatenate( 60 | (wav_concat, np.zeros(end_sil_samples)), axis=0 61 | ) 62 | sf.write(os.path.join(output_dir, f"{main_id}.wav"), wav_concat, sr) 63 | 64 | main_id += 1 65 | sub_id = 0 66 | wav_concat = None 67 | 68 | if cnt == len(wav_files): 69 | wav_concat = np.concatenate((wav_concat, np.zeros(end_sil_samples)), axis=0) 70 | sf.write(os.path.join(output_dir, f"{main_id}.wav"), wav_concat, sr) 71 | 72 | 73 | def text_to_wav( 74 | text_file, 75 | output_dir, 76 | resources_zip_file, 77 | am_ckpt, 78 | voc_ckpt, 79 | speaker=None, 80 | se_file=None, 81 | lang="PinYin", 82 | ): 83 | os.makedirs(output_dir, exist_ok=True) 84 | os.makedirs(os.path.join(output_dir, "res_wavs"), exist_ok=True) 85 | 86 | resource_root_dir = os.path.dirname(resources_zip_file) 87 | resource_dir = os.path.join(resource_root_dir, "resource") 88 | 89 | if not os.path.exists(resource_dir): 90 | logging.info("Extracting 
resources...") 91 | with zipfile.ZipFile(resources_zip_file, "r") as zip_ref: 92 | zip_ref.extractall(resource_root_dir) 93 | 94 | with open(text_file, "r") as text_data: 95 | texts = text_data.readlines() 96 | 97 | logging.info("Converting text to symbols...") 98 | am_config = os.path.join(os.path.dirname(os.path.dirname(am_ckpt)), "config.yaml") 99 | with open(am_config, "r") as f: 100 | config = yaml.load(f, Loader=yaml.Loader) 101 | if speaker is None: 102 | speaker = config["linguistic_unit"]["speaker_list"].split(",")[0] 103 | symbols_lst = text_to_symbols(texts, resource_dir, speaker, lang) 104 | symbols_file = os.path.join(output_dir, "symbols.lst") 105 | with open(symbols_file, "w") as symbol_data: 106 | for symbol in symbols_lst: 107 | symbol_data.write(symbol) 108 | 109 | logging.info("AM is inferring...") 110 | am_infer(symbols_file, am_ckpt, output_dir, se_file) 111 | 112 | logging.info("Vocoder is inferring...") 113 | hifigan_infer(os.path.join(output_dir, "feat"), voc_ckpt, output_dir) 114 | 115 | concat_process(output_dir, os.path.join(output_dir, "res_wavs")) 116 | 117 | logging.info("Text to wav finished!") 118 | 119 | 120 | if __name__ == "__main__": 121 | parser = argparse.ArgumentParser(description="Text to wav") 122 | parser.add_argument("--txt", type=str, required=True, help="Path to text file") 123 | parser.add_argument( 124 | "--output_dir", type=str, required=True, help="Path to output directory" 125 | ) 126 | parser.add_argument( 127 | "--res_zip", type=str, required=True, help="Path to resource zip file" 128 | ) 129 | parser.add_argument( 130 | "--am_ckpt", type=str, required=True, help="Path to am ckpt file" 131 | ) 132 | parser.add_argument( 133 | "--voc_ckpt", type=str, required=True, help="Path to voc ckpt file" 134 | ) 135 | parser.add_argument( 136 | "--speaker", 137 | type=str, 138 | required=False, 139 | default=None, 140 | help="The speaker name, default is the first speaker", 141 | ) 142 | parser.add_argument( 143 | "--se_file", 144 | type=str, 145 | required=False, 146 | default=None, 147 | help="The speaker embedding file, default is None", 148 | ) 149 | parser.add_argument( 150 | "--lang", 151 | type=str, 152 | default="PinYin", 153 | help="""The language of the text, default is PinYin, other options are: 154 | English, 155 | British, 156 | ZhHK, 157 | WuuShanghai, 158 | Sichuan, 159 | Indonesian, 160 | Malay, 161 | Filipino, 162 | Vietnamese, 163 | Korean, 164 | Russian 165 | """, 166 | ) 167 | args = parser.parse_args() 168 | text_to_wav( 169 | args.txt, 170 | args.output_dir, 171 | args.res_zip, 172 | args.am_ckpt, 173 | args.voc_ckpt, 174 | args.speaker, 175 | args.se_file, 176 | args.lang, 177 | ) 178 | -------------------------------------------------------------------------------- /kantts/configs/audio_config_16k.yaml: -------------------------------------------------------------------------------- 1 | # Audio processing configs 2 | 3 | audio_config: 4 | # Preprocess 5 | wav_normalize: True 6 | trim_silence: True 7 | trim_silence_threshold_db: 60 8 | preemphasize: False 9 | 10 | # Feature extraction 11 | sampling_rate: 16000 12 | hop_length: 200 13 | win_length: 1000 14 | n_fft: 2048 15 | n_mels: 80 16 | fmin: 0.0 17 | fmax: 8000.0 18 | phone_level_feature: True 19 | 20 | # Normalization 21 | norm_type: "mean_std" # "mean_std" or "global" 22 | max_norm: 1.0 23 | symmetric: False 24 | min_level_db: -100.0 25 | ref_level_db: 20 26 | 27 | num_workers: 16 28 | --------------------------------------------------------------------------------
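As a quick orientation for the audio configs above and below: hop_length and win_length are given in samples, so the 16 kHz config produces one 80-dim mel frame every 12.5 ms. A minimal sketch of loading and sanity-checking it (the path and printout are illustrative, not part of the repo):

```python
import yaml

# Load the 16 kHz audio config; the path is illustrative.
with open("kantts/configs/audio_config_16k.yaml") as f:
    audio_config = yaml.safe_load(f)["audio_config"]

sr = audio_config["sampling_rate"]   # 16000
hop = audio_config["hop_length"]     # 200 samples -> 12.5 ms per mel frame
win = audio_config["win_length"]     # 1000 samples, zero-padded up to n_fft
assert win <= audio_config["n_fft"]  # the analysis window must fit in the FFT
print(f"{sr / hop:.0f} mel frames per second")  # 80
```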
/kantts/configs/audio_config_24k.yaml: -------------------------------------------------------------------------------- 1 | # Audio processing configs 2 | 3 | audio_config: 4 | # Preprocess 5 | wav_normalize: True 6 | trim_silence: True 7 | trim_silence_threshold_db: 60 8 | preemphasize: False 9 | 10 | # Feature extraction 11 | sampling_rate: 24000 12 | hop_length: 240 13 | win_length: 1024 14 | n_fft: 1024 15 | n_mels: 80 16 | fmin: 50.0 17 | fmax: 8000.0 18 | phone_level_feature: True 19 | 20 | # Normalization 21 | norm_type: "mean_std" # "mean_std" or "global" 22 | max_norm: 1.0 23 | symmetric: False 24 | min_level_db: -100.0 25 | ref_level_db: 20 26 | 27 | num_workers: 16 28 | -------------------------------------------------------------------------------- /kantts/configs/audio_config_48k.yaml: -------------------------------------------------------------------------------- 1 | # Audio processing configs 2 | 3 | audio_config: 4 | # Preprocess 5 | wav_normalize: True 6 | trim_silence: True 7 | trim_silence_threshold_db: 60 8 | preemphasize: False 9 | 10 | # Feature extraction 11 | sampling_rate: 48000 12 | hop_length: 600 13 | win_length: 2400 14 | n_fft: 4096 15 | n_mels: 128 16 | fmin: 0.0 17 | fmax: 12000.0 18 | phone_level_feature: True 19 | 20 | # Normalization 21 | norm_type: "mean_std" # "mean_std" or "global" 22 | max_norm: 1.0 23 | symmetric: False 24 | min_level_db: -100.0 25 | ref_level_db: 20 26 | 27 | num_workers: 16 28 | -------------------------------------------------------------------------------- /kantts/configs/audio_config_8k.yaml: -------------------------------------------------------------------------------- 1 | # Audio processing configs 2 | 3 | audio_config: 4 | # Preprocess 5 | wav_normalize: True 6 | trim_silence: True 7 | trim_silence_threshold_db: 60 8 | preemphasize: False 9 | 10 | # Feature extraction 11 | sampling_rate: 8000 12 | hop_length: 100 13 | win_length: 600 14 | n_fft: 2048 15 | n_mels: 80 16 | fmin: 0.0 17 | fmax: 4000.0 18 | phone_level_feature: True 19 | 20 | # Normalization 21 | norm_type: "mean_std" # "mean_std" or "global" 22 | max_norm: 1.0 23 | symmetric: False 24 | min_level_db: -100.0 25 | ref_level_db: 20 26 | 27 | num_workers: 16 28 | 29 | -------------------------------------------------------------------------------- /kantts/configs/audio_config_se_16k.yaml: -------------------------------------------------------------------------------- 1 | # Audio processing configs 2 | 3 | audio_config: 4 | # Preprocess 5 | wav_normalize: True 6 | trim_silence: True 7 | trim_silence_threshold_db: 60 8 | preemphasize: False 9 | 10 | # Feature extraction 11 | sampling_rate: 16000 12 | hop_length: 200 13 | win_length: 1000 14 | n_fft: 2048 15 | n_mels: 80 16 | fmin: 0.0 17 | fmax: 8000.0 18 | phone_level_feature: True 19 | se_feature: True 20 | 21 | # Normalization 22 | norm_type: "mean_std" # "mean_std" or "global" 23 | max_norm: 1.0 24 | symmetric: False 25 | min_level_db: -100.0 26 | ref_level_db: 20 27 | 28 | num_workers: 16 29 | -------------------------------------------------------------------------------- /kantts/configs/hifigan_noncausal_v1_16k.yaml: -------------------------------------------------------------------------------- 1 | model_type: hifigan 2 | Model: 3 | ########################################################### 4 | # GENERATOR NETWORK ARCHITECTURE SETTING # 5 | ########################################################### 6 | Generator: 7 | params: 8 | in_channels: 80 9 | out_channels: 1 10 | channels: 256 11 | kernel_size: 7 
12 | upsample_scales: [10, 5, 2, 2] 13 | upsample_kernal_sizes: [20, 11, 4, 4] 14 | resblock_kernel_sizes: [3, 7, 11] 15 | resblock_dilations: 16 | - [1, 3, 5, 7] 17 | - [1, 3, 5, 7] 18 | - [1, 3, 5, 7] 19 | bias: true 20 | causal: false 21 | nonlinear_activation: "LeakyReLU" 22 | nonlinear_activation_params: 23 | negative_slope: 0.1 24 | use_weight_norm: true 25 | optimizer: 26 | type: Adam 27 | params: 28 | lr: 2.0e-4 29 | betas: [0.5, 0.9] 30 | weight_decay: 0.0 31 | scheduler: 32 | type: MultiStepLR 33 | params: 34 | gamma: 0.5 35 | milestones: 36 | - 200000 37 | - 400000 38 | - 600000 39 | - 800000 40 | 41 | ########################################################### 42 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 43 | ########################################################### 44 | MultiScaleDiscriminator: 45 | params: 46 | scales: 3 47 | downsample_pooling: "DWT" 48 | downsample_pooling_params: 49 | kernel_size: 4 50 | stride: 2 51 | padding: 2 52 | discriminator_params: 53 | in_channels: 1 54 | out_channels: 1 55 | kernel_sizes: [15, 41, 5, 3] 56 | channels: 128 57 | max_downsample_channels: 1024 58 | max_groups: 16 59 | bias: true 60 | downsample_scales: [4, 4, 4, 4, 1] 61 | nonlinear_activation: "LeakyReLU" 62 | nonlinear_activation_params: 63 | negative_slope: 0.1 64 | follow_official_norm: true 65 | optimizer: 66 | type: Adam 67 | params: 68 | lr: 2.0e-4 69 | betas: [0.5, 0.9] 70 | weight_decay: 0.0 71 | scheduler: 72 | type: MultiStepLR 73 | params: 74 | gamma: 0.5 75 | milestones: 76 | - 200000 77 | - 400000 78 | - 600000 79 | - 800000 80 | 81 | MultiPeriodDiscriminator: 82 | params: 83 | periods: [2, 3, 5, 7, 11] 84 | discriminator_params: 85 | in_channels: 1 86 | out_channels: 1 87 | kernel_sizes: [5, 3] 88 | channels: 32 89 | downsample_scales: [3, 3, 3, 3, 1] 90 | max_downsample_channels: 1024 91 | bias: true 92 | nonlinear_activation: "LeakyReLU" 93 | nonlinear_activation_params: 94 | negative_slope: 0.1 95 | use_spectral_norm: false 96 | optimizer: 97 | type: Adam 98 | params: 99 | lr: 2.0e-4 100 | betas: [0.5, 0.9] 101 | weight_decay: 0.0 102 | scheduler: 103 | type: MultiStepLR 104 | params: 105 | gamma: 0.5 106 | milestones: 107 | - 200000 108 | - 400000 109 | - 600000 110 | - 800000 111 | 112 | #################################################### 113 | # LOSS SETTING # 114 | #################################################### 115 | Loss: 116 | generator_adv_loss: 117 | enable: True 118 | params: 119 | average_by_discriminators: False 120 | weights: 1.0 121 | 122 | discriminator_adv_loss: 123 | enable: True 124 | params: 125 | average_by_discriminators: False 126 | weights: 1.0 127 | 128 | stft_loss: 129 | enable: False # Whether to use multi-resolution STFT loss. 130 | 131 | mel_loss: 132 | enable: True 133 | params: 134 | fs: 16000 135 | fft_size: 2048 136 | hop_size: 200 137 | win_length: 1000 138 | window: "hann" 139 | num_mels: 80 140 | fmin: 0 141 | fmax: 8000 142 | log_base: null 143 | weights: 45.0 144 | 145 | subband_stft_loss: 146 | enable: False 147 | params: 148 | fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. 149 | hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss 150 | win_lengths: [150, 300, 60] # List of window length for STFT-based loss. 
151 | window: "hann_window" # Window function for STFT-based loss. 152 | 153 | feat_match_loss: 154 | enable: True 155 | params: 156 | average_by_discriminators: false 157 | average_by_layers: false 158 | weights: 2.0 159 | 160 | 161 | ########################################################### 162 | # DATA LOADER SETTING # 163 | ########################################################### 164 | batch_size: 16 165 | batch_max_steps: 9600 # Length of each audio in batch. Make sure divisible by hop_size. 166 | pin_memory: True 167 | num_workers: 2 # FIXME: set > 0 may get stuck on macOS 168 | remove_short_samples: False 169 | allow_cache: True 170 | 171 | generator_grad_norm: -1 172 | 173 | discriminator_grad_norm: -1 174 | 175 | ########################################################### 176 | # INTERVAL SETTING # 177 | ########################################################### 178 | generator_train_start_steps: 1 # Number of steps to start to train generator. 179 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. 180 | train_max_steps: 2500000 # Number of training steps. 181 | save_interval_steps: 20000 # Interval steps to save checkpoint. 182 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 183 | log_interval_steps: 1000 # Interval steps to record the training log. 184 | 185 | ########################################################### 186 | # OTHER SETTING # 187 | ########################################################### 188 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 189 | -------------------------------------------------------------------------------- /kantts/configs/hifigan_v1_16k.yaml: -------------------------------------------------------------------------------- 1 | model_type: hifigan 2 | Model: 3 | ########################################################### 4 | # GENERATOR NETWORK ARCHITECTURE SETTING # 5 | ########################################################### 6 | Generator: 7 | params: 8 | in_channels: 80 9 | out_channels: 1 10 | channels: 256 11 | kernel_size: 7 12 | upsample_scales: [10, 5, 2, 2] 13 | upsample_kernal_sizes: [20, 10, 4, 4] 14 | resblock_kernel_sizes: [3, 7, 11] 15 | resblock_dilations: 16 | - [1, 3, 5, 7] 17 | - [1, 3, 5, 7] 18 | - [1, 3, 5, 7] 19 | bias: true 20 | causal: true 21 | nonlinear_activation: "LeakyReLU" 22 | nonlinear_activation_params: 23 | negative_slope: 0.1 24 | use_weight_norm: true 25 | optimizer: 26 | type: Adam 27 | params: 28 | lr: 2.0e-4 29 | betas: [0.5, 0.9] 30 | weight_decay: 0.0 31 | scheduler: 32 | type: MultiStepLR 33 | params: 34 | gamma: 0.5 35 | milestones: 36 | - 200000 37 | - 400000 38 | - 600000 39 | - 800000 40 | 41 | ########################################################### 42 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 43 | ########################################################### 44 | MultiScaleDiscriminator: 45 | params: 46 | scales: 3 47 | downsample_pooling: "DWT" 48 | downsample_pooling_params: 49 | kernel_size: 4 50 | stride: 2 51 | padding: 2 52 | discriminator_params: 53 | in_channels: 1 54 | out_channels: 1 55 | kernel_sizes: [15, 41, 5, 3] 56 | channels: 128 57 | max_downsample_channels: 1024 58 | max_groups: 16 59 | bias: true 60 | downsample_scales: [4, 4, 4, 4, 1] 61 | nonlinear_activation: "LeakyReLU" 62 | nonlinear_activation_params: 63 | negative_slope: 0.1 64 | follow_official_norm: true 65 | optimizer: 66 | type: Adam 67 | params: 68 | lr: 2.0e-4 69 | betas: [0.5, 0.9] 70 | weight_decay: 0.0
71 | scheduler: 72 | type: MultiStepLR 73 | params: 74 | gamma: 0.5 75 | milestones: 76 | - 200000 77 | - 400000 78 | - 600000 79 | - 800000 80 | 81 | MultiPeriodDiscriminator: 82 | params: 83 | periods: [2, 3, 5, 7, 11] 84 | discriminator_params: 85 | in_channels: 1 86 | out_channels: 1 87 | kernel_sizes: [5, 3] 88 | channels: 32 89 | downsample_scales: [3, 3, 3, 3, 1] 90 | max_downsample_channels: 1024 91 | bias: true 92 | nonlinear_activation: "LeakyReLU" 93 | nonlinear_activation_params: 94 | negative_slope: 0.1 95 | use_spectral_norm: false 96 | optimizer: 97 | type: Adam 98 | params: 99 | lr: 2.0e-4 100 | betas: [0.5, 0.9] 101 | weight_decay: 0.0 102 | scheduler: 103 | type: MultiStepLR 104 | params: 105 | gamma: 0.5 106 | milestones: 107 | - 200000 108 | - 400000 109 | - 600000 110 | - 800000 111 | 112 | #################################################### 113 | # LOSS SETTING # 114 | #################################################### 115 | Loss: 116 | generator_adv_loss: 117 | enable: True 118 | params: 119 | average_by_discriminators: False 120 | weights: 1.0 121 | 122 | discriminator_adv_loss: 123 | enable: True 124 | params: 125 | average_by_discriminators: False 126 | weights: 1.0 127 | 128 | stft_loss: 129 | enable: False # Whether to use multi-resolution STFT loss. 130 | 131 | mel_loss: 132 | enable: True 133 | params: 134 | fs: 16000 135 | fft_size: 2048 136 | hop_size: 200 137 | win_length: 1000 138 | window: "hann" 139 | num_mels: 80 140 | fmin: 0 141 | fmax: 8000 142 | log_base: null 143 | weights: 45.0 144 | 145 | subband_stft_loss: 146 | enable: False 147 | params: 148 | fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. 149 | hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss. 150 | win_lengths: [150, 300, 60] # List of window length for STFT-based loss. 151 | window: "hann_window" # Window function for STFT-based loss. 152 | 153 | feat_match_loss: 154 | enable: True 155 | params: 156 | average_by_discriminators: false 157 | average_by_layers: false 158 | weights: 2.0 159 | 160 | 161 | ########################################################### 162 | # DATA LOADER SETTING # 163 | ########################################################### 164 | batch_size: 16 165 | batch_max_steps: 9600 # Length of each audio in batch. Make sure divisible by hop_size. 166 | pin_memory: True 167 | num_workers: 2 # FIXME: set > 0 may get stuck on macOS 168 | remove_short_samples: False 169 | allow_cache: True 170 | 171 | generator_grad_norm: -1 172 | 173 | discriminator_grad_norm: -1 174 | 175 | ########################################################### 176 | # INTERVAL SETTING # 177 | ########################################################### 178 | generator_train_start_steps: 1 # Number of steps to start to train generator. 179 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. 180 | train_max_steps: 2500000 # Number of training steps. 181 | save_interval_steps: 20000 # Interval steps to save checkpoint. 182 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 183 | log_interval_steps: 1000 # Interval steps to record the training log. 184 | 185 | ########################################################### 186 | # OTHER SETTING # 187 | ########################################################### 188 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
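A constraint implicit in the generator settings above: the upsample_scales must multiply out to the mel hop size (10 * 5 * 2 * 2 = 200 = hop_length in audio_config_16k.yaml), so that one mel frame expands to exactly one hop of waveform, and batch_max_steps must be a multiple of that hop, as its comment says. A small sketch of the check (functools.reduce keeps it compatible with the Python 3.7 pinned in environment.yaml):

```python
from functools import reduce
from operator import mul

upsample_scales = [10, 5, 2, 2]  # from hifigan_v1_16k.yaml
hop_length = 200                 # from audio_config_16k.yaml
batch_max_steps = 9600           # from hifigan_v1_16k.yaml

# One mel frame must be upsampled to exactly one hop of waveform.
assert reduce(mul, upsample_scales) == hop_length
# Each training clip must cover a whole number of mel frames.
assert batch_max_steps % hop_length == 0
print(batch_max_steps // hop_length, "mel frames per clip")  # 48
```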
189 | -------------------------------------------------------------------------------- /kantts/configs/hifigan_v1_24k.yaml: -------------------------------------------------------------------------------- 1 | model_type: hifigan 2 | Model: 3 | ########################################################### 4 | # GENERATOR NETWORK ARCHITECTURE SETTING # 5 | ########################################################### 6 | Generator: 7 | params: 8 | in_channels: 80 9 | out_channels: 1 10 | channels: 512 11 | kernel_size: 7 12 | upsample_scales: [8, 5, 3, 2] 13 | upsample_kernal_sizes: [16, 10, 6, 4] 14 | resblock_kernel_sizes: [3, 7, 11] 15 | resblock_dilations: 16 | - [1, 3, 5] 17 | - [1, 3, 5] 18 | - [1, 3, 5] 19 | bias: true 20 | causal: true 21 | nonlinear_activation: "LeakyReLU" 22 | nonlinear_activation_params: 23 | negative_slope: 0.1 24 | use_weight_norm: true 25 | optimizer: 26 | type: Adam 27 | params: 28 | lr: 2.0e-4 29 | betas: [0.5, 0.9] 30 | weight_decay: 0.0 31 | scheduler: 32 | type: MultiStepLR 33 | params: 34 | gamma: 0.5 35 | milestones: 36 | - 200000 37 | - 400000 38 | - 600000 39 | - 800000 40 | 41 | ########################################################### 42 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 43 | ########################################################### 44 | MultiScaleDiscriminator: 45 | params: 46 | scales: 3 47 | downsample_pooling: "DWT" 48 | downsample_pooling_params: 49 | kernel_size: 4 50 | stride: 2 51 | padding: 2 52 | discriminator_params: 53 | in_channels: 1 54 | out_channels: 1 55 | kernel_sizes: [15, 41, 5, 3] 56 | channels: 128 57 | max_downsample_channels: 1024 58 | max_groups: 16 59 | bias: true 60 | downsample_scales: [4, 4, 4, 4, 1] 61 | nonlinear_activation: "LeakyReLU" 62 | nonlinear_activation_params: 63 | negative_slope: 0.1 64 | follow_official_norm: true 65 | optimizer: 66 | type: Adam 67 | params: 68 | lr: 2.0e-4 69 | betas: [0.5, 0.9] 70 | weight_decay: 0.0 71 | scheduler: 72 | type: MultiStepLR 73 | params: 74 | gamma: 0.5 75 | milestones: 76 | - 200000 77 | - 400000 78 | - 600000 79 | - 800000 80 | 81 | MultiPeriodDiscriminator: 82 | params: 83 | periods: [2, 3, 5, 7, 11] 84 | discriminator_params: 85 | in_channels: 1 86 | out_channels: 1 87 | kernel_sizes: [5, 3] 88 | channels: 32 89 | downsample_scales: [3, 3, 3, 3, 1] 90 | max_downsample_channels: 1024 91 | bias: true 92 | nonlinear_activation: "LeakyReLU" 93 | nonlinear_activation_params: 94 | negative_slope: 0.1 95 | use_spectral_norm: false 96 | optimizer: 97 | type: Adam 98 | params: 99 | lr: 2.0e-4 100 | betas: [0.5, 0.9] 101 | weight_decay: 0.0 102 | scheduler: 103 | type: MultiStepLR 104 | params: 105 | gamma: 0.5 106 | milestones: 107 | - 200000 108 | - 400000 109 | - 600000 110 | - 800000 111 | 112 | #################################################### 113 | # LOSS SETTING # 114 | #################################################### 115 | Loss: 116 | generator_adv_loss: 117 | enable: True 118 | params: 119 | average_by_discriminators: False 120 | weights: 1.0 121 | 122 | discriminator_adv_loss: 123 | enable: True 124 | params: 125 | average_by_discriminators: False 126 | weights: 1.0 127 | 128 | stft_loss: 129 | enable: False # Whether to use multi-resolution STFT loss. 
130 | 131 | mel_loss: 132 | enable: True 133 | params: 134 | fs: 24000 135 | fft_size: 1024 136 | hop_size: 240 137 | win_length: 1024 138 | window: "hann" 139 | num_mels: 80 140 | fmin: 0 141 | fmax: 8000 142 | log_base: null 143 | weights: 45.0 144 | 145 | subband_stft_loss: 146 | enable: False 147 | params: 148 | fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. 149 | hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss. 150 | win_lengths: [150, 300, 60] # List of window length for STFT-based loss. 151 | window: "hann_window" # Window function for STFT-based loss. 152 | 153 | feat_match_loss: 154 | enable: True 155 | params: 156 | average_by_discriminators: false 157 | average_by_layers: false 158 | weights: 2.0 159 | 160 | 161 | ########################################################### 162 | # DATA LOADER SETTING # 163 | ########################################################### 164 | batch_size: 16 165 | batch_max_steps: 9600 # Length of each audio in batch. Make sure divisible by hop_size. 166 | pin_memory: True 167 | num_workers: 2 # FIXME: set > 0 may get stuck on macOS 168 | remove_short_samples: False 169 | allow_cache: True 170 | 171 | generator_grad_norm: -1 172 | 173 | discriminator_grad_norm: -1 174 | 175 | ########################################################### 176 | # INTERVAL SETTING # 177 | ########################################################### 178 | generator_train_start_steps: 1 # Number of steps to start to train generator. 179 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. 180 | train_max_steps: 2500000 # Number of training steps. 181 | save_interval_steps: 20000 # Interval steps to save checkpoint. 182 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 183 | log_interval_steps: 1000 # Interval steps to record the training log. 184 | 185 | ########################################################### 186 | # OTHER SETTING # 187 | ########################################################### 188 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
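All the HiFi-GAN configs in this directory share the same optimization schedule: Adam at 2.0e-4, halved by MultiStepLR (gamma: 0.5) at each of the four milestones. A sketch of the resulting learning rate over training, assuming standard PyTorch MultiStepLR semantics:

```python
base_lr, gamma = 2.0e-4, 0.5
milestones = [200000, 400000, 600000, 800000]

def lr_at(step):
    # MultiStepLR multiplies the base LR by gamma once per milestone reached.
    return base_lr * gamma ** sum(step >= m for m in milestones)

for step in (0, 200_000, 400_000, 800_000, 2_500_000):
    print(step, lr_at(step))  # 2e-4, 1e-4, 5e-5, 1.25e-5, 1.25e-5
```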
189 | -------------------------------------------------------------------------------- /kantts/configs/hifigan_v1_48k.yaml: -------------------------------------------------------------------------------- 1 | model_type: hifigan 2 | Model: 3 | ########################################################### 4 | # GENERATOR NETWORK ARCHITECTURE SETTING # 5 | ########################################################### 6 | Generator: 7 | params: 8 | in_channels: 128 9 | out_channels: 1 10 | channels: 512 11 | kernel_size: 7 12 | upsample_scales: [10, 5, 3, 2, 2] 13 | upsample_kernal_sizes: [20, 10, 6, 4, 4] 14 | resblock_kernel_sizes: [3, 7, 11] 15 | resblock_dilations: 16 | - [1, 3, 5, 7] 17 | - [1, 3, 5, 7] 18 | - [1, 3, 5, 7] 19 | bias: true 20 | causal: true 21 | nonlinear_activation: "LeakyReLU" 22 | nonlinear_activation_params: 23 | negative_slope: 0.1 24 | use_weight_norm: true 25 | optimizer: 26 | type: Adam 27 | params: 28 | lr: 2.0e-4 29 | betas: [0.5, 0.9] 30 | weight_decay: 0.0 31 | scheduler: 32 | type: MultiStepLR 33 | params: 34 | gamma: 0.5 35 | milestones: 36 | - 200000 37 | - 400000 38 | - 600000 39 | - 800000 40 | 41 | ########################################################### 42 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 43 | ########################################################### 44 | MultiScaleDiscriminator: 45 | params: 46 | scales: 3 47 | downsample_pooling: "DWT" 48 | downsample_pooling_params: 49 | kernel_size: 4 50 | stride: 2 51 | padding: 2 52 | discriminator_params: 53 | in_channels: 1 54 | out_channels: 1 55 | kernel_sizes: [15, 41, 5, 3] 56 | channels: 128 57 | max_downsample_channels: 1024 58 | max_groups: 16 59 | bias: true 60 | downsample_scales: [4, 4, 4, 4, 1] 61 | nonlinear_activation: "LeakyReLU" 62 | nonlinear_activation_params: 63 | negative_slope: 0.1 64 | follow_official_norm: true 65 | optimizer: 66 | type: Adam 67 | params: 68 | lr: 2.0e-4 69 | betas: [0.5, 0.9] 70 | weight_decay: 0.0 71 | scheduler: 72 | type: MultiStepLR 73 | params: 74 | gamma: 0.5 75 | milestones: 76 | - 200000 77 | - 400000 78 | - 600000 79 | - 800000 80 | 81 | MultiPeriodDiscriminator: 82 | params: 83 | periods: [2, 3, 5, 7, 11] 84 | discriminator_params: 85 | in_channels: 1 86 | out_channels: 1 87 | kernel_sizes: [5, 3] 88 | channels: 32 89 | downsample_scales: [3, 3, 3, 3, 1] 90 | max_downsample_channels: 1024 91 | bias: true 92 | nonlinear_activation: "LeakyReLU" 93 | nonlinear_activation_params: 94 | negative_slope: 0.1 95 | use_spectral_norm: false 96 | optimizer: 97 | type: Adam 98 | params: 99 | lr: 2.0e-4 100 | betas: [0.5, 0.9] 101 | weight_decay: 0.0 102 | scheduler: 103 | type: MultiStepLR 104 | params: 105 | gamma: 0.5 106 | milestones: 107 | - 200000 108 | - 400000 109 | - 600000 110 | - 800000 111 | 112 | #################################################### 113 | # LOSS SETTING # 114 | #################################################### 115 | Loss: 116 | generator_adv_loss: 117 | enable: True 118 | params: 119 | average_by_discriminators: False 120 | weights: 1.0 121 | 122 | discriminator_adv_loss: 123 | enable: True 124 | params: 125 | average_by_discriminators: False 126 | weights: 1.0 127 | 128 | stft_loss: 129 | enable: False # Whether to use multi-resolution STFT loss. 
130 | 131 | mel_loss: 132 | enable: True 133 | params: 134 | fs: 48000 135 | fft_size: 4096 136 | hop_size: 600 137 | win_length: 2400 138 | window: "hann" 139 | num_mels: 128 140 | fmin: 0 141 | fmax: 12000 142 | log_base: null 143 | weights: 45.0 144 | 145 | subband_stft_loss: 146 | enable: False 147 | params: 148 | fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. 149 | hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss. 150 | win_lengths: [150, 300, 60] # List of window length for STFT-based loss. 151 | window: "hann_window" # Window function for STFT-based loss. 152 | 153 | feat_match_loss: 154 | enable: True 155 | params: 156 | average_by_discriminators: false 157 | average_by_layers: false 158 | weights: 2.0 159 | 160 | 161 | ########################################################### 162 | # DATA LOADER SETTING # 163 | ########################################################### 164 | batch_size: 16 165 | batch_max_steps: 19200 # Length of each audio in batch. Make sure divisible by hop_size. 166 | pin_memory: True 167 | num_workers: 2 # FIXME: set > 0 may get stuck on macOS 168 | remove_short_samples: False 169 | allow_cache: True 170 | 171 | generator_grad_norm: -1 172 | 173 | discriminator_grad_norm: -1 174 | 175 | ########################################################### 176 | # INTERVAL SETTING # 177 | ########################################################### 178 | generator_train_start_steps: 1 # Number of steps to start to train generator. 179 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. 180 | train_max_steps: 2500000 # Number of training steps. 181 | save_interval_steps: 20000 # Interval steps to save checkpoint. 182 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 183 | log_interval_steps: 1000 # Interval steps to record the training log. 184 | 185 | ########################################################### 186 | # OTHER SETTING # 187 | ########################################################### 188 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
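Note how the mel_loss block mirrors audio_config_48k.yaml (4096-point FFT, hop 600, 128 mels up to 12 kHz), so adversarial training is scored in the same feature space the acoustic model emits. A sketch of the corresponding analysis filterbank with librosa (pinned at 0.9.2 in environment.yaml); the variable names are illustrative:

```python
import librosa

# Mel filterbank matching the 48 kHz mel_loss / audio_config settings.
mel_basis = librosa.filters.mel(
    sr=48000, n_fft=4096, n_mels=128, fmin=0.0, fmax=12000.0
)
print(mel_basis.shape)  # (128, 2049) = (n_mels, n_fft // 2 + 1)
```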
189 | -------------------------------------------------------------------------------- /kantts/configs/hifigan_v1_8k.yaml: -------------------------------------------------------------------------------- 1 | model_type: hifigan 2 | Model: 3 | ########################################################### 4 | # GENERATOR NETWORK ARCHITECTURE SETTING # 5 | ########################################################### 6 | Generator: 7 | params: 8 | in_channels: 80 9 | out_channels: 1 10 | channels: 256 11 | kernel_size: 7 12 | upsample_scales: [5, 5, 2, 2] 13 | upsample_kernal_sizes: [10, 10, 4, 4] 14 | resblock_kernel_sizes: [3, 7, 11] 15 | resblock_dilations: 16 | - [1, 3, 5, 7] 17 | - [1, 3, 5, 7] 18 | - [1, 3, 5, 7] 19 | bias: true 20 | causal: true 21 | nonlinear_activation: "LeakyReLU" 22 | nonlinear_activation_params: 23 | negative_slope: 0.1 24 | use_weight_norm: true 25 | optimizer: 26 | type: Adam 27 | params: 28 | lr: 2.0e-4 29 | betas: [0.5, 0.9] 30 | weight_decay: 0.0 31 | scheduler: 32 | type: MultiStepLR 33 | params: 34 | gamma: 0.5 35 | milestones: 36 | - 200000 37 | - 400000 38 | - 600000 39 | - 800000 40 | 41 | ########################################################### 42 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 43 | ########################################################### 44 | MultiScaleDiscriminator: 45 | params: 46 | scales: 3 47 | downsample_pooling: "DWT" 48 | downsample_pooling_params: 49 | kernel_size: 4 50 | stride: 2 51 | padding: 2 52 | discriminator_params: 53 | in_channels: 1 54 | out_channels: 1 55 | kernel_sizes: [15, 41, 5, 3] 56 | channels: 128 57 | max_downsample_channels: 1024 58 | max_groups: 16 59 | bias: true 60 | downsample_scales: [4, 4, 4, 4, 1] 61 | nonlinear_activation: "LeakyReLU" 62 | nonlinear_activation_params: 63 | negative_slope: 0.1 64 | follow_official_norm: true 65 | optimizer: 66 | type: Adam 67 | params: 68 | lr: 2.0e-4 69 | betas: [0.5, 0.9] 70 | weight_decay: 0.0 71 | scheduler: 72 | type: MultiStepLR 73 | params: 74 | gamma: 0.5 75 | milestones: 76 | - 200000 77 | - 400000 78 | - 600000 79 | - 800000 80 | 81 | MultiPeriodDiscriminator: 82 | params: 83 | periods: [2, 3, 5, 7, 11] 84 | discriminator_params: 85 | in_channels: 1 86 | out_channels: 1 87 | kernel_sizes: [5, 3] 88 | channels: 32 89 | downsample_scales: [3, 3, 3, 3, 1] 90 | max_downsample_channels: 1024 91 | bias: true 92 | nonlinear_activation: "LeakyReLU" 93 | nonlinear_activation_params: 94 | negative_slope: 0.1 95 | use_spectral_norm: false 96 | optimizer: 97 | type: Adam 98 | params: 99 | lr: 2.0e-4 100 | betas: [0.5, 0.9] 101 | weight_decay: 0.0 102 | scheduler: 103 | type: MultiStepLR 104 | params: 105 | gamma: 0.5 106 | milestones: 107 | - 200000 108 | - 400000 109 | - 600000 110 | - 800000 111 | 112 | #################################################### 113 | # LOSS SETTING # 114 | #################################################### 115 | Loss: 116 | generator_adv_loss: 117 | enable: True 118 | params: 119 | average_by_discriminators: False 120 | weights: 1.0 121 | 122 | discriminator_adv_loss: 123 | enable: True 124 | params: 125 | average_by_discriminators: False 126 | weights: 1.0 127 | 128 | stft_loss: 129 | enable: False # Whether to use multi-resolution STFT loss. 
130 | 131 | mel_loss: 132 | enable: True 133 | params: 134 | fs: 8000 135 | fft_size: 2048 136 | hop_size: 100 137 | win_length: 600 138 | window: "hann" 139 | num_mels: 80 140 | fmin: 0 141 | fmax: 4000 142 | log_base: null 143 | weights: 45.0 144 | 145 | subband_stft_loss: 146 | enable: False 147 | params: 148 | fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. 149 | hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss. 150 | win_lengths: [150, 300, 60] # List of window length for STFT-based loss. 151 | window: "hann_window" # Window function for STFT-based loss. 152 | 153 | feat_match_loss: 154 | enable: True 155 | params: 156 | average_by_discriminators: false 157 | average_by_layers: false 158 | weights: 2.0 159 | 160 | 161 | ########################################################### 162 | # DATA LOADER SETTING # 163 | ########################################################### 164 | batch_size: 16 165 | batch_max_steps: 6000 # Length of each audio in batch. Make sure divisible by hop_size. 166 | pin_memory: True 167 | num_workers: 2 # FIXME: set > 0 may get stuck on macOS 168 | remove_short_samples: False 169 | allow_cache: True 170 | 171 | generator_grad_norm: -1 172 | 173 | discriminator_grad_norm: -1 174 | 175 | ########################################################### 176 | # INTERVAL SETTING # 177 | ########################################################### 178 | generator_train_start_steps: 1 # Number of steps to start to train generator. 179 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. 180 | train_max_steps: 2500000 # Number of training steps. 181 | save_interval_steps: 20000 # Interval steps to save checkpoint. 182 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 183 | log_interval_steps: 1000 # Interval steps to record the training log. 184 | 185 | ########################################################### 186 | # OTHER SETTING # 187 | ########################################################### 188 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
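The MultiPeriodDiscriminator block repeated across these configs follows the HiFi-GAN recipe: each of the five sub-discriminators folds the waveform into a 2-D grid whose width is its period (2, 3, 5, 7, or 11), so periodic structure lines up in columns for 2-D convolutions. A minimal sketch of that fold, assuming a (batch, 1, samples) tensor; the helper name is hypothetical:

```python
import torch
import torch.nn.functional as F

def fold_by_period(wav, period):
    """Reshape (B, 1, T) audio to (B, 1, T // period, period), padding T up."""
    b, c, t = wav.shape
    if t % period != 0:
        # Right-pad so the length is a multiple of the period.
        wav = F.pad(wav, (0, period - t % period), mode="reflect")
        t = wav.shape[-1]
    return wav.view(b, c, t // period, period)

clip = torch.randn(1, 1, 6000)  # one 8 kHz training clip (batch_max_steps)
for p in (2, 3, 5, 7, 11):
    print(p, tuple(fold_by_period(clip, p).shape))
```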
189 | -------------------------------------------------------------------------------- /kantts/configs/hifigan_v1_nsf_24k.yaml: -------------------------------------------------------------------------------- 1 | model_type: hifigan 2 | Model: 3 | ########################################################### 4 | # GENERATOR NETWORK ARCHITECTURE SETTING # 5 | ########################################################### 6 | Generator: 7 | params: 8 | in_channels: 80 9 | out_channels: 1 10 | channels: 512 11 | kernel_size: 7 12 | upsample_scales: [8, 5, 3, 2] 13 | upsample_kernal_sizes: [16, 10, 6, 4] 14 | resblock_kernel_sizes: [3, 7, 11] 15 | resblock_dilations: 16 | - [1, 3, 5] 17 | - [1, 3, 5] 18 | - [1, 3, 5] 19 | bias: true 20 | causal: true 21 | nonlinear_activation: "LeakyReLU" 22 | nonlinear_activation_params: 23 | negative_slope: 0.1 24 | use_weight_norm: true 25 | nsf_params: 26 | nb_harmonics: 7 27 | sampling_rate: 24000 28 | optimizer: 29 | type: Adam 30 | params: 31 | lr: 2.0e-4 32 | betas: [0.5, 0.9] 33 | weight_decay: 0.0 34 | scheduler: 35 | type: MultiStepLR 36 | params: 37 | gamma: 0.5 38 | milestones: 39 | - 200000 40 | - 400000 41 | - 600000 42 | - 800000 43 | 44 | ########################################################### 45 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 46 | ########################################################### 47 | MultiScaleDiscriminator: 48 | params: 49 | scales: 3 50 | downsample_pooling: "DWT" 51 | downsample_pooling_params: 52 | kernel_size: 4 53 | stride: 2 54 | padding: 2 55 | discriminator_params: 56 | in_channels: 1 57 | out_channels: 1 58 | kernel_sizes: [15, 41, 5, 3] 59 | channels: 128 60 | max_downsample_channels: 1024 61 | max_groups: 16 62 | bias: true 63 | downsample_scales: [4, 4, 4, 4, 1] 64 | nonlinear_activation: "LeakyReLU" 65 | nonlinear_activation_params: 66 | negative_slope: 0.1 67 | follow_official_norm: true 68 | optimizer: 69 | type: Adam 70 | params: 71 | lr: 2.0e-4 72 | betas: [0.5, 0.9] 73 | weight_decay: 0.0 74 | scheduler: 75 | type: MultiStepLR 76 | params: 77 | gamma: 0.5 78 | milestones: 79 | - 200000 80 | - 400000 81 | - 600000 82 | - 800000 83 | 84 | MultiPeriodDiscriminator: 85 | params: 86 | periods: [2, 3, 5, 7, 11] 87 | discriminator_params: 88 | in_channels: 1 89 | out_channels: 1 90 | kernel_sizes: [5, 3] 91 | channels: 32 92 | downsample_scales: [3, 3, 3, 3, 1] 93 | max_downsample_channels: 1024 94 | bias: true 95 | nonlinear_activation: "LeakyReLU" 96 | nonlinear_activation_params: 97 | negative_slope: 0.1 98 | use_spectral_norm: false 99 | optimizer: 100 | type: Adam 101 | params: 102 | lr: 2.0e-4 103 | betas: [0.5, 0.9] 104 | weight_decay: 0.0 105 | scheduler: 106 | type: MultiStepLR 107 | params: 108 | gamma: 0.5 109 | milestones: 110 | - 200000 111 | - 400000 112 | - 600000 113 | - 800000 114 | 115 | #################################################### 116 | # LOSS SETTING # 117 | #################################################### 118 | Loss: 119 | generator_adv_loss: 120 | enable: True 121 | params: 122 | average_by_discriminators: False 123 | weights: 1.0 124 | 125 | discriminator_adv_loss: 126 | enable: True 127 | params: 128 | average_by_discriminators: False 129 | weights: 1.0 130 | 131 | stft_loss: 132 | enable: False # Whether to use multi-resolution STFT loss. 
133 | 
134 |     mel_loss:
135 |         enable: True
136 |         params:
137 |             fs: 24000
138 |             fft_size: 1024
139 |             hop_size: 240
140 |             win_length: 1024
141 |             window: "hann"
142 |             num_mels: 80
143 |             fmin: 0
144 |             fmax: 8000
145 |             log_base: null
146 |         weights: 45.0
147 | 
148 |     subband_stft_loss:
149 |         enable: False
150 |         params:
151 |             fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
152 |             hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss.
153 |             win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
154 |             window: "hann_window" # Window function for STFT-based loss.
155 | 
156 |     feat_match_loss:
157 |         enable: True
158 |         params:
159 |             average_by_discriminators: false
160 |             average_by_layers: false
161 |         weights: 2.0
162 | 
163 | 
164 | ###########################################################
165 | #                  DATA LOADER SETTING                    #
166 | ###########################################################
167 | batch_size: 16
168 | batch_max_steps: 9600 # Length of each audio in batch. Make sure it is divisible by hop_size.
169 | pin_memory: False
170 | num_workers: 2 # FIXME: setting > 0 may hang on macOS
171 | remove_short_samples: False
172 | allow_cache: True
173 | 
174 | generator_grad_norm: -1
175 | 
176 | discriminator_grad_norm: -1
177 | 
178 | ###########################################################
179 | #                    INTERVAL SETTING                     #
180 | ###########################################################
181 | generator_train_start_steps: 1 # Number of steps to start to train generator.
182 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
183 | train_max_steps: 2500000 # Number of training steps.
184 | save_interval_steps: 20000 # Interval steps to save checkpoint.
185 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
186 | log_interval_steps: 1000 # Interval steps to record the training log.
187 | 
188 | ###########################################################
189 | #                      OTHER SETTING                      #
190 | ###########################################################
191 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
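A related invariant for this 24 kHz NSF config: the Generator's upsample_scales must multiply out to the mel hop_size, so that one input frame expands to exactly one hop of waveform samples. A quick check with the values above:

import math

upsample_scales = [8, 5, 3, 2]  # from the Generator params above
hop_size = 240                  # from the mel_loss params above
assert math.prod(upsample_scales) == hop_size  # 8 * 5 * 3 * 2 == 240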
192 | -------------------------------------------------------------------------------- /kantts/configs/sambert_16k.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 80 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | MAS: False 53 | 54 | optimizer: 55 | type: Adam 56 | params: 57 | lr: 0.001 58 | betas: [0.9, 0.98] 59 | eps: 1.0e-9 60 | weight_decay: 0.0 61 | scheduler: 62 | type: NoamLR 63 | params: 64 | warmup_steps: 4000 65 | 66 | linguistic_unit: 67 | cleaners: english_cleaners 68 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 69 | speaker_list: F7 70 | #################################################### 71 | # LOSS SETTING # 72 | #################################################### 73 | Loss: 74 | MelReconLoss: 75 | enable: True 76 | params: 77 | loss_type: mae 78 | 79 | ProsodyReconLoss: 80 | enable: True 81 | params: 82 | loss_type: mae 83 | 84 | ########################################################### 85 | # DATA LOADER SETTING # 86 | ########################################################### 87 | batch_size: 32 88 | pin_memory: False 89 | num_workers: 4 # FIXME: set > 0 may stuck on macos 90 | remove_short_samples: False 91 | allow_cache: True 92 | grad_norm: 1.0 93 | 94 | ########################################################### 95 | # INTERVAL SETTING # 96 | ########################################################### 97 | train_max_steps: 1000000 # Number of training steps. 98 | save_interval_steps: 20000 # Interval steps to save checkpoint. 99 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 100 | log_interval_steps: 1000 # Interval steps to record the training log. 101 | 102 | ########################################################### 103 | # OTHER SETTING # 104 | ########################################################### 105 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
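The NoamLR scheduler named above lives in kantts/train/scheduler.py; the sketch below is the standard Noam warmup rule from "Attention Is All You Need" (up to a constant factor), shown only to illustrate what warmup_steps: 4000 controls. The repo's exact scaling may differ.

def noam_scale(step: int, warmup_steps: int = 4000) -> float:
    # The learning rate grows linearly for warmup_steps, then decays
    # proportionally to step ** -0.5; the peak is at step == warmup_steps.
    step = max(step, 1)
    return min(step ** -0.5, step * warmup_steps ** -1.5)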
106 | -------------------------------------------------------------------------------- /kantts/configs/sambert_16k_MAS.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 80 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | 53 | MAS: True 54 | 55 | 56 | optimizer: 57 | type: Adam 58 | params: 59 | lr: 0.001 60 | betas: [0.9, 0.98] 61 | eps: 1.0e-9 62 | weight_decay: 0.0 63 | scheduler: 64 | type: NoamLR 65 | params: 66 | warmup_steps: 4000 67 | 68 | linguistic_unit: 69 | cleaners: english_cleaners 70 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 71 | speaker_list: F7 72 | #################################################### 73 | # LOSS SETTING # 74 | #################################################### 75 | Loss: 76 | MelReconLoss: 77 | enable: True 78 | params: 79 | loss_type: mae 80 | 81 | ProsodyReconLoss: 82 | enable: True 83 | params: 84 | loss_type: mae 85 | 86 | AttentionCTCLoss: 87 | enable: True 88 | 89 | AttentionBinarizationLoss: 90 | enable: True 91 | params: 92 | start_epoch: 0 93 | warmup_epoch: 100 94 | 95 | 96 | ########################################################### 97 | # DATA LOADER SETTING # 98 | ########################################################### 99 | batch_size: 32 100 | pin_memory: False 101 | num_workers: 4 # FIXME: set > 0 may stuck on macos 102 | remove_short_samples: False 103 | allow_cache: True 104 | 105 | grad_norm: 1.0 106 | 107 | ########################################################### 108 | # INTERVAL SETTING # 109 | ########################################################### 110 | train_max_steps: 1000000 # Number of training steps. 111 | save_interval_steps: 20000 # Interval steps to save checkpoint. 112 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 113 | log_interval_steps: 1000 # Interval steps to record the training log. 114 | 115 | ########################################################### 116 | # OTHER SETTING # 117 | ########################################################### 118 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
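Compared with sambert_16k.yaml, this MAS variant adds AttentionCTCLoss and AttentionBinarizationLoss for learned alignments. The start_epoch/warmup_epoch pair suggests the binarization term is ramped in gradually; one plausible ramp is sketched below (the authoritative schedule is in kantts/train/loss.py and may differ):

def binarization_weight(epoch: int, start_epoch: int = 0,
                        warmup_epoch: int = 100, max_weight: float = 1.0) -> float:
    # Ramp the penalty that pulls soft attention toward the hard MAS
    # alignment from 0 up to max_weight over warmup_epoch epochs.
    if epoch < start_epoch:
        return 0.0
    return max_weight * min(1.0, (epoch - start_epoch) / warmup_epoch)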
119 | -------------------------------------------------------------------------------- /kantts/configs/sambert_16k_MAS_byte.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 80 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | 53 | MAS: True 54 | using_byte: True 55 | 56 | 57 | optimizer: 58 | type: Adam 59 | params: 60 | lr: 0.001 61 | betas: [0.9, 0.98] 62 | eps: 1.0e-9 63 | weight_decay: 0.0 64 | scheduler: 65 | type: NoamLR 66 | params: 67 | warmup_steps: 4000 68 | 69 | linguistic_unit: 70 | cleaners: english_cleaners 71 | lfeat_type_list: byte_index,emo_category,speaker_category 72 | speaker_list: F7 73 | #################################################### 74 | # LOSS SETTING # 75 | #################################################### 76 | Loss: 77 | MelReconLoss: 78 | enable: True 79 | params: 80 | loss_type: mae 81 | 82 | ProsodyReconLoss: 83 | enable: True 84 | params: 85 | loss_type: mae 86 | 87 | AttentionCTCLoss: 88 | enable: True 89 | 90 | AttentionBinarizationLoss: 91 | enable: True 92 | params: 93 | start_epoch: 0 94 | warmup_epoch: 100 95 | 96 | 97 | ########################################################### 98 | # DATA LOADER SETTING # 99 | ########################################################### 100 | batch_size: 8 101 | pin_memory: False 102 | num_workers: 4 # FIXME: set > 0 may stuck on macos 103 | remove_short_samples: False 104 | allow_cache: True 105 | 106 | grad_norm: 1.0 107 | 108 | ########################################################### 109 | # INTERVAL SETTING # 110 | ########################################################### 111 | train_max_steps: 1000000 # Number of training steps. 112 | save_interval_steps: 20000 # Interval steps to save checkpoint. 113 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 114 | log_interval_steps: 1000 # Interval steps to record the training log. 115 | 116 | ########################################################### 117 | # OTHER SETTING # 118 | ########################################################### 119 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
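With using_byte: True the lfeat_type_list collapses to byte_index,emo_category,speaker_category: the encoder consumes raw text bytes instead of phoneme/tone/syllable symbols, which plausibly also explains the smaller batch_size of 8 (byte sequences run longer than phoneme sequences). A minimal sketch of byte tokenization; the repo's actual mapping, including any reserved pad/eos ids, lives under kantts/utils/ling_unit and may differ:

def text_to_byte_ids(text: str, offset: int = 0) -> list:
    # UTF-8 bytes give a fixed 256-symbol vocabulary for any language;
    # `offset` is a placeholder for reserved special ids, if any.
    return [b + offset for b in text.encode("utf-8")]

print(text_to_byte_ids("hi"))  # -> [104, 105]; CJK characters yield 3 ids each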
120 | -------------------------------------------------------------------------------- /kantts/configs/sambert_24k.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 80 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | MAS: False 53 | 54 | optimizer: 55 | type: Adam 56 | params: 57 | lr: 0.001 58 | betas: [0.9, 0.98] 59 | eps: 1.0e-9 60 | weight_decay: 0.0 61 | scheduler: 62 | type: NoamLR 63 | params: 64 | warmup_steps: 4000 65 | 66 | linguistic_unit: 67 | cleaners: english_cleaners 68 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 69 | speaker_list: F7 70 | #################################################### 71 | # LOSS SETTING # 72 | #################################################### 73 | Loss: 74 | MelReconLoss: 75 | enable: True 76 | params: 77 | loss_type: mae 78 | 79 | ProsodyReconLoss: 80 | enable: True 81 | params: 82 | loss_type: mae 83 | 84 | 85 | ########################################################### 86 | # DATA LOADER SETTING # 87 | ########################################################### 88 | batch_size: 32 89 | pin_memory: False 90 | num_workers: 4 # FIXME: set > 0 may stuck on macos 91 | remove_short_samples: False 92 | allow_cache: True 93 | 94 | grad_norm: 1.0 95 | 96 | ########################################################### 97 | # INTERVAL SETTING # 98 | ########################################################### 99 | train_max_steps: 1000000 # Number of training steps. 100 | save_interval_steps: 20000 # Interval steps to save checkpoint. 101 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 102 | log_interval_steps: 1000 # Interval steps to record the training log. 103 | 104 | ########################################################### 105 | # OTHER SETTING # 106 | ########################################################### 107 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
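outputs_per_step: 3, shared by all of these sambert configs, means the decoder emits mel frames in groups of three per step, so target frame counts get padded up to a multiple of 3 (presumably what the LengthRegulator in kantts/models/sambert/adaptors.py does via its r parameter). The padding arithmetic in one line:

def pad_to_multiple(num_frames: int, r: int = 3) -> int:
    # Round the frame count up to the next multiple of r (outputs_per_step).
    return num_frames + (-num_frames) % r

print(pad_to_multiple(100))  # -> 102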
108 | -------------------------------------------------------------------------------- /kantts/configs/sambert_48k.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 900 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 128 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | MAS: False 53 | 54 | optimizer: 55 | type: Adam 56 | params: 57 | lr: 0.001 58 | betas: [0.9, 0.98] 59 | eps: 1.0e-9 60 | weight_decay: 0.0 61 | scheduler: 62 | type: NoamLR 63 | params: 64 | warmup_steps: 4000 65 | 66 | linguistic_unit: 67 | cleaners: english_cleaners 68 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 69 | speaker_list: F7 70 | #################################################### 71 | # LOSS SETTING # 72 | #################################################### 73 | Loss: 74 | MelReconLoss: 75 | enable: True 76 | params: 77 | loss_type: mae 78 | 79 | ProsodyReconLoss: 80 | enable: True 81 | params: 82 | loss_type: mae 83 | 84 | 85 | ########################################################### 86 | # DATA LOADER SETTING # 87 | ########################################################### 88 | batch_size: 32 89 | pin_memory: False 90 | num_workers: 4 # FIXME: set > 0 may stuck on macos 91 | remove_short_samples: False 92 | allow_cache: True 93 | 94 | grad_norm: 1.0 95 | 96 | ########################################################### 97 | # INTERVAL SETTING # 98 | ########################################################### 99 | train_max_steps: 1000000 # Number of training steps. 100 | save_interval_steps: 20000 # Interval steps to save checkpoint. 101 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 102 | log_interval_steps: 1000 # Interval steps to record the training log. 103 | 104 | ########################################################### 105 | # OTHER SETTING # 106 | ########################################################### 107 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
108 | -------------------------------------------------------------------------------- /kantts/configs/sambert_fp_8k.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 80 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | MAS: False 53 | FP: True 54 | 55 | optimizer: 56 | type: Adam 57 | params: 58 | lr: 0.001 59 | betas: [0.9, 0.98] 60 | eps: 1.0e-9 61 | weight_decay: 0.0 62 | scheduler: 63 | type: NoamLR 64 | params: 65 | warmup_steps: 4000 66 | 67 | linguistic_unit: 68 | cleaners: english_cleaners 69 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 70 | speaker_list: F7,F74,M7,FBYN,FRXL,xiaoyu 71 | #################################################### 72 | # LOSS SETTING # 73 | #################################################### 74 | Loss: 75 | MelReconLoss: 76 | enable: True 77 | params: 78 | loss_type: mae 79 | 80 | ProsodyReconLoss: 81 | enable: True 82 | params: 83 | loss_type: mae 84 | 85 | FpCELoss: 86 | enable: True 87 | params: 88 | loss_type: ce 89 | weight: [1,4,4,8] 90 | 91 | ########################################################### 92 | # DATA LOADER SETTING # 93 | ########################################################### 94 | batch_size: 16 95 | pin_memory: False 96 | num_workers: 4 # FIXME: set > 0 may stuck on macos 97 | remove_short_samples: False 98 | allow_cache: True 99 | 100 | grad_norm: 1.0 101 | 102 | ########################################################### 103 | # INTERVAL SETTING # 104 | ########################################################### 105 | train_max_steps: 1000000 # Number of training steps. 106 | save_interval_steps: 20000 # Interval steps to save checkpoint. 107 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 108 | log_interval_steps: 1000 # Interval steps to record the training log. 109 | 110 | ########################################################### 111 | # OTHER SETTING # 112 | ########################################################### 113 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
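FpCELoss above reads as a class-weighted cross entropy over four filled-pause classes, with weight: [1,4,4,8] up-weighting the rarer classes; the number and meaning of the classes is inferred from the weight vector, not stated here. A sketch of applying such weights:

import torch

fp_weights = torch.tensor([1.0, 4.0, 4.0, 8.0])  # from the config above
criterion = torch.nn.CrossEntropyLoss(weight=fp_weights)

logits = torch.randn(16, 4)            # (batch, num_fp_classes) -- toy values
targets = torch.randint(0, 4, (16,))   # gold FP class per position
loss = criterion(logits, targets)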
114 | -------------------------------------------------------------------------------- /kantts/configs/sambert_nsf_16k.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 82 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | MAS: False 53 | NSF: True 54 | 55 | 56 | optimizer: 57 | type: Adam 58 | params: 59 | lr: 0.001 60 | betas: [0.9, 0.98] 61 | eps: 1.0e-9 62 | weight_decay: 0.0 63 | scheduler: 64 | type: NoamLR 65 | params: 66 | warmup_steps: 4000 67 | 68 | linguistic_unit: 69 | cleaners: english_cleaners 70 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 71 | speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu 72 | #################################################### 73 | # LOSS SETTING # 74 | #################################################### 75 | Loss: 76 | MelReconLoss: 77 | enable: True 78 | params: 79 | loss_type: mae 80 | 81 | ProsodyReconLoss: 82 | enable: True 83 | params: 84 | loss_type: mae 85 | 86 | ########################################################### 87 | # DATA LOADER SETTING # 88 | ########################################################### 89 | batch_size: 32 90 | pin_memory: False 91 | num_workers: 4 # FIXME: set > 0 may stuck on macos 92 | remove_short_samples: False 93 | allow_cache: True 94 | grad_norm: 1.0 95 | 96 | ########################################################### 97 | # INTERVAL SETTING # 98 | ########################################################### 99 | train_max_steps: 10000000 # Number of training steps. 100 | save_interval_steps: 20000 # Interval steps to save checkpoint. 101 | eval_interval_steps: 2300500 # Interval steps to evaluate the network. 102 | log_interval_steps: 1000 # Interval steps to record the training log. 103 | 104 | ########################################################### 105 | # OTHER SETTING # 106 | ########################################################### 107 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
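Note that num_mels grows from 80 to 82 exactly in the NSF variants: a natural reading is that the acoustic target appends two extra per-frame streams (e.g. F0 and a voiced/unvoiced flag) for the neural source-filter vocoder, but that layout is an assumption; the authoritative packing is in the preprocessing code. Under that assumption:

import numpy as np

feats = np.zeros((100, 82), dtype=np.float32)  # (frames, num_mels) per this config
mel, f0, uv = feats[:, :80], feats[:, 80], feats[:, 81]  # assumed 80 + 1 + 1 layout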
108 | -------------------------------------------------------------------------------- /kantts/configs/sambert_nsf_24k.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 82 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | MAS: False 53 | NSF: True 54 | 55 | optimizer: 56 | type: Adam 57 | params: 58 | lr: 0.001 59 | betas: [0.9, 0.98] 60 | eps: 1.0e-9 61 | weight_decay: 0.0 62 | scheduler: 63 | type: NoamLR 64 | params: 65 | warmup_steps: 4000 66 | 67 | linguistic_unit: 68 | cleaners: english_cleaners 69 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 70 | speaker_list: F7 71 | #################################################### 72 | # LOSS SETTING # 73 | #################################################### 74 | Loss: 75 | MelReconLoss: 76 | enable: True 77 | params: 78 | loss_type: mae 79 | 80 | ProsodyReconLoss: 81 | enable: True 82 | params: 83 | loss_type: mae 84 | 85 | 86 | ########################################################### 87 | # DATA LOADER SETTING # 88 | ########################################################### 89 | batch_size: 32 90 | pin_memory: False 91 | num_workers: 4 # FIXME: set > 0 may stuck on macos 92 | remove_short_samples: False 93 | allow_cache: True 94 | 95 | grad_norm: 1.0 96 | 97 | ########################################################### 98 | # INTERVAL SETTING # 99 | ########################################################### 100 | train_max_steps: 1000000 # Number of training steps. 101 | save_interval_steps: 20000 # Interval steps to save checkpoint. 102 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 103 | log_interval_steps: 1000 # Interval steps to record the training log. 104 | 105 | ########################################################### 106 | # OTHER SETTING # 107 | ########################################################### 108 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
109 | -------------------------------------------------------------------------------- /kantts/configs/sambert_se_nsf_global_16k.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 192 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 82 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | MAS: False 53 | NSF: True 54 | nsf_norm_type: global 55 | nsf_f0_global_minimum: 30.0 56 | nsf_f0_global_maximum: 730.0 57 | SE: True 58 | 59 | 60 | optimizer: 61 | type: Adam 62 | params: 63 | lr: 0.001 64 | betas: [0.9, 0.98] 65 | eps: 1.0e-9 66 | weight_decay: 0.0 67 | scheduler: 68 | type: NoamLR 69 | params: 70 | warmup_steps: 4000 71 | 72 | linguistic_unit: 73 | cleaners: english_cleaners 74 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 75 | speaker_list: F7 76 | #################################################### 77 | # LOSS SETTING # 78 | #################################################### 79 | Loss: 80 | MelReconLoss: 81 | enable: True 82 | params: 83 | loss_type: mae 84 | 85 | ProsodyReconLoss: 86 | enable: True 87 | params: 88 | loss_type: mae 89 | 90 | ########################################################### 91 | # DATA LOADER SETTING # 92 | ########################################################### 93 | batch_size: 32 94 | pin_memory: False 95 | num_workers: 4 # FIXME: set > 0 may stuck on macos 96 | remove_short_samples: False 97 | allow_cache: False 98 | grad_norm: 1.0 99 | 100 | ########################################################### 101 | # INTERVAL SETTING # 102 | ########################################################### 103 | train_max_steps: 1760101 # Number of training steps. 104 | save_interval_steps: 100 # Interval steps to save checkpoint. 105 | eval_interval_steps: 1000000000000 # Interval steps to evaluate the network. 106 | log_interval_steps: 10 # Interval steps to record the training log. 107 | 108 | ########################################################### 109 | # OTHER SETTING # 110 | ########################################################### 111 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
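nsf_norm_type: global with the fixed bounds above points to speaker-independent F0 normalization against a global range rather than per-utterance statistics. One plausible min-max transform under that reading (the repo's exact transform, e.g. linear versus log-domain, may differ):

import numpy as np

def normalize_f0_global(f0, f0_min=30.0, f0_max=730.0):
    # Clip voiced F0 to the global range and scale it to [0, 1];
    # unvoiced frames (f0 == 0) are kept at 0.
    f0 = np.asarray(f0, dtype=np.float32)
    out = np.zeros_like(f0)
    voiced = f0 > 0
    out[voiced] = (np.clip(f0[voiced], f0_min, f0_max) - f0_min) / (f0_max - f0_min)
    return out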
112 | -------------------------------------------------------------------------------- /kantts/configs/sambert_sichuan_16k.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 80 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | MAS: False 53 | 54 | optimizer: 55 | type: Adam 56 | params: 57 | lr: 0.001 58 | betas: [0.9, 0.98] 59 | eps: 1.0e-9 60 | weight_decay: 0.0 61 | scheduler: 62 | type: NoamLR 63 | params: 64 | warmup_steps: 4000 65 | 66 | linguistic_unit: 67 | cleaners: english_cleaners 68 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 69 | speaker_list: xiaoyue 70 | language: Sichuan 71 | #################################################### 72 | # LOSS SETTING # 73 | #################################################### 74 | Loss: 75 | MelReconLoss: 76 | enable: True 77 | params: 78 | loss_type: mae 79 | 80 | ProsodyReconLoss: 81 | enable: True 82 | params: 83 | loss_type: mae 84 | 85 | ########################################################### 86 | # DATA LOADER SETTING # 87 | ########################################################### 88 | batch_size: 32 89 | pin_memory: False 90 | num_workers: 4 # FIXME: set > 0 may stuck on macos 91 | remove_short_samples: False 92 | allow_cache: True 93 | grad_norm: 1.0 94 | 95 | ########################################################### 96 | # INTERVAL SETTING # 97 | ########################################################### 98 | train_max_steps: 1000000 # Number of training steps. 99 | save_interval_steps: 20000 # Interval steps to save checkpoint. 100 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 101 | log_interval_steps: 1000 # Interval steps to record the training log. 102 | 103 | ########################################################### 104 | # OTHER SETTING # 105 | ########################################################### 106 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
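This is the only sambert config here that sets a language key; Sichuan matches the resource directory kantts/preprocess/languages/Sichuan in the tree (PhoneSet.xml, PosSet.xml, py2phoneMap.txt, tonelist.txt). A sketch of how such a key can select frontend resources; the helper is illustrative, not the repo's actual loader:

import os

def frontend_resources(language: str = "PinYin") -> dict:
    # Map the config's `language` field onto the per-language resources
    # shipped under kantts/preprocess/languages/.
    base = os.path.join("kantts", "preprocess", "languages", language)
    return {
        "phoneset": os.path.join(base, "PhoneSet.xml"),
        "posset": os.path.join(base, "PosSet.xml"),
        "py2phone": os.path.join(base, "py2phoneMap.txt"),
        "tonelist": os.path.join(base, "tonelist.txt"),
    }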
107 | -------------------------------------------------------------------------------- /kantts/configs/sybert.yaml: -------------------------------------------------------------------------------- 1 | model_type: sybert 2 | Model: 3 | ######################################################### 4 | # TextsyBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsTextsyBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | mask_ratio: 0.3 21 | 22 | optimizer: 23 | type: Adam 24 | params: 25 | lr: 0.0001 26 | betas: [0.9, 0.98] 27 | eps: 1.0e-9 28 | weight_decay: 0.0 29 | scheduler: 30 | type: NoamLR 31 | params: 32 | warmup_steps: 10000 33 | 34 | linguistic_unit: 35 | cleaners: english_cleaners 36 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 37 | speaker_list: F7 38 | #################################################### 39 | # LOSS SETTING # 40 | #################################################### 41 | Loss: 42 | SeqCELoss: 43 | enable: True 44 | params: 45 | loss_type: ce 46 | 47 | ########################################################### 48 | # DATA LOADER SETTING # 49 | ########################################################### 50 | batch_size: 32 51 | pin_memory: False 52 | num_workers: 4 # FIXME: set > 0 may stuck on macos 53 | remove_short_samples: False 54 | allow_cache: True 55 | 56 | grad_norm: 1.0 57 | 58 | ########################################################### 59 | # INTERVAL SETTING # 60 | ########################################################### 61 | train_max_steps: 1000000 # Number of training steps. 62 | save_interval_steps: 20000 # Interval steps to save checkpoint. 63 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 64 | log_interval_steps: 1000 # Interval steps to record the training log. 65 | 66 | ########################################################### 67 | # OTHER SETTING # 68 | ########################################################### 69 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 70 | -------------------------------------------------------------------------------- /kantts/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/datasets/__init__.py -------------------------------------------------------------------------------- /kantts/datasets/data_types.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.io import wavfile 3 | 4 | 5 | # TODO: add your own data type here as you need. 
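# For example, a uint8 PCM entry would follow the same shape as the
# entries below ("bin_u8" is a hypothetical illustration, not part of
# the shipped dict):
#
#     "bin_u8": {
#         "load_func": lambda x: np.fromfile(x, dtype=np.uint8),
#         "desc": "binary file with uint8 format",
#     },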
6 | DATA_TYPE_DICT = { 7 | "txt": { 8 | "load_func": np.loadtxt, 9 | "desc": "plain txt file or readable by np.loadtxt", 10 | }, 11 | "wav": { 12 | "load_func": lambda x: wavfile.read(x)[1], 13 | "desc": "wav file or readable by soundfile.read", 14 | }, 15 | "npy": { 16 | "load_func": np.load, 17 | "desc": "any .npy format file", 18 | }, 19 | # PCM data type can be loaded by binary format 20 | "bin_f32": { 21 | "load_func": lambda x: np.fromfile(x, dtype=np.float32), 22 | "desc": "binary file with float32 format", 23 | }, 24 | "bin_f64": { 25 | "load_func": lambda x: np.fromfile(x, dtype=np.float64), 26 | "desc": "binary file with float64 format", 27 | }, 28 | "bin_i32": { 29 | "load_func": lambda x: np.fromfile(x, dtype=np.int32), 30 | "desc": "binary file with int32 format", 31 | }, 32 | "bin_i16": { 33 | "load_func": lambda x: np.fromfile(x, dtype=np.int16), 34 | "desc": "binary file with int16 format", 35 | }, 36 | } 37 | -------------------------------------------------------------------------------- /kantts/models/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn.parallel import DistributedDataParallel 3 | from kantts.models.hifigan.hifigan import ( # NOQA 4 | Generator, # NOQA 5 | MultiScaleDiscriminator, # NOQA 6 | MultiPeriodDiscriminator, # NOQA 7 | MultiSpecDiscriminator, # NOQA 8 | ) 9 | import kantts 10 | import kantts.train.scheduler 11 | from kantts.models.sambert.kantts_sambert import KanTtsSAMBERT, KanTtsTextsyBERT # NOQA 12 | from kantts.utils.ling_unit.ling_unit import get_fpdict 13 | from .pqmf import PQMF 14 | 15 | 16 | def optimizer_builder(model_params, opt_name, opt_params): 17 | opt_cls = getattr(torch.optim, opt_name) 18 | optimizer = opt_cls(model_params, **opt_params) 19 | return optimizer 20 | 21 | 22 | def scheduler_builder(optimizer, sche_name, sche_params): 23 | scheduler_cls = getattr(kantts.train.scheduler, sche_name) 24 | scheduler = scheduler_cls(optimizer, **sche_params) 25 | return scheduler 26 | 27 | 28 | def hifigan_model_builder(config, device, rank, distributed): 29 | model = {} 30 | optimizer = {} 31 | scheduler = {} 32 | model["discriminator"] = {} 33 | optimizer["discriminator"] = {} 34 | scheduler["discriminator"] = {} 35 | for model_name in config["Model"].keys(): 36 | if model_name == "Generator": 37 | params = config["Model"][model_name]["params"] 38 | model["generator"] = Generator(**params).to(device) 39 | optimizer["generator"] = optimizer_builder( 40 | model["generator"].parameters(), 41 | config["Model"][model_name]["optimizer"].get("type", "Adam"), 42 | config["Model"][model_name]["optimizer"].get("params", {}), 43 | ) 44 | scheduler["generator"] = scheduler_builder( 45 | optimizer["generator"], 46 | config["Model"][model_name]["scheduler"].get("type", "StepLR"), 47 | config["Model"][model_name]["scheduler"].get("params", {}), 48 | ) 49 | else: 50 | params = config["Model"][model_name]["params"] 51 | model["discriminator"][model_name] = globals()[model_name](**params).to( 52 | device 53 | ) 54 | optimizer["discriminator"][model_name] = optimizer_builder( 55 | model["discriminator"][model_name].parameters(), 56 | config["Model"][model_name]["optimizer"].get("type", "Adam"), 57 | config["Model"][model_name]["optimizer"].get("params", {}), 58 | ) 59 | scheduler["discriminator"][model_name] = scheduler_builder( 60 | optimizer["discriminator"][model_name], 61 | config["Model"][model_name]["scheduler"].get("type", "StepLR"), 62 | 
config["Model"][model_name]["scheduler"].get("params", {}), 63 | ) 64 | 65 | out_channels = config["Model"]["Generator"]["params"]["out_channels"] 66 | if out_channels > 1: 67 | model["pqmf"] = PQMF(subbands=out_channels, **config.get("pqmf", {})).to(device) 68 | 69 | # FIXME: pywavelets buffer leads to gradient error in DDP training 70 | # Solution: https://github.com/pytorch/pytorch/issues/22095 71 | if distributed: 72 | model["generator"] = DistributedDataParallel( 73 | model["generator"], 74 | device_ids=[rank], 75 | output_device=rank, 76 | broadcast_buffers=False, 77 | ) 78 | for model_name in model["discriminator"].keys(): 79 | model["discriminator"][model_name] = DistributedDataParallel( 80 | model["discriminator"][model_name], 81 | device_ids=[rank], 82 | output_device=rank, 83 | broadcast_buffers=False, 84 | ) 85 | 86 | return model, optimizer, scheduler 87 | 88 | 89 | # TODO: some parsing 90 | def sambert_model_builder(config, device, rank, distributed): 91 | model = {} 92 | optimizer = {} 93 | scheduler = {} 94 | 95 | model["KanTtsSAMBERT"] = KanTtsSAMBERT( 96 | config["Model"]["KanTtsSAMBERT"]["params"] 97 | ).to(device) 98 | 99 | fp_enable = config["Model"]["KanTtsSAMBERT"]["params"].get("FP", False) 100 | if fp_enable: 101 | fp_dict = { 102 | k: torch.from_numpy(v).long().unsqueeze(0).to(device) 103 | for k, v in get_fpdict(config).items() 104 | } 105 | model["KanTtsSAMBERT"].fp_dict = fp_dict 106 | 107 | optimizer["KanTtsSAMBERT"] = optimizer_builder( 108 | model["KanTtsSAMBERT"].parameters(), 109 | config["Model"]["KanTtsSAMBERT"]["optimizer"].get("type", "Adam"), 110 | config["Model"]["KanTtsSAMBERT"]["optimizer"].get("params", {}), 111 | ) 112 | scheduler["KanTtsSAMBERT"] = scheduler_builder( 113 | optimizer["KanTtsSAMBERT"], 114 | config["Model"]["KanTtsSAMBERT"]["scheduler"].get("type", "StepLR"), 115 | config["Model"]["KanTtsSAMBERT"]["scheduler"].get("params", {}), 116 | ) 117 | 118 | if distributed: 119 | model["KanTtsSAMBERT"] = DistributedDataParallel( 120 | model["KanTtsSAMBERT"], device_ids=[rank], output_device=rank 121 | ) 122 | 123 | return model, optimizer, scheduler 124 | 125 | 126 | def sybert_model_builder(config, device, rank, distributed): 127 | model = {} 128 | optimizer = {} 129 | scheduler = {} 130 | 131 | model["KanTtsTextsyBERT"] = KanTtsTextsyBERT( 132 | config["Model"]["KanTtsTextsyBERT"]["params"] 133 | ).to(device) 134 | optimizer["KanTtsTextsyBERT"] = optimizer_builder( 135 | model["KanTtsTextsyBERT"].parameters(), 136 | config["Model"]["KanTtsTextsyBERT"]["optimizer"].get("type", "Adam"), 137 | config["Model"]["KanTtsTextsyBERT"]["optimizer"].get("params", {}), 138 | ) 139 | scheduler["KanTtsTextsyBERT"] = scheduler_builder( 140 | optimizer["KanTtsTextsyBERT"], 141 | config["Model"]["KanTtsTextsyBERT"]["scheduler"].get("type", "StepLR"), 142 | config["Model"]["KanTtsTextsyBERT"]["scheduler"].get("params", {}), 143 | ) 144 | 145 | if distributed: 146 | model["KanTtsTextsyBERT"] = DistributedDataParallel( 147 | model["KanTtsTextsyBERT"], device_ids=[rank], output_device=rank 148 | ) 149 | 150 | return model, optimizer, scheduler 151 | 152 | 153 | # TODO: implement a builder for specific model 154 | model_dict = { 155 | "hifigan": hifigan_model_builder, 156 | "sambert": sambert_model_builder, 157 | "sybert": sybert_model_builder, 158 | } 159 | 160 | 161 | def model_builder(config, device="cpu", rank=0, distributed=False): 162 | builder_func = model_dict[config["model_type"]] 163 | model, optimizer, scheduler = builder_func(config, device, rank, 
distributed)
164 |     return model, optimizer, scheduler
165 | 
--------------------------------------------------------------------------------
/kantts/models/pqmf.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Tomoki Hayashi
2 | #  MIT License (https://opensource.org/licenses/MIT)
3 | 
4 | """Pseudo QMF modules."""
5 | 
6 | import numpy as np
7 | import torch
8 | import torch.nn.functional as F
9 | 
10 | from scipy.signal import kaiser
11 | 
12 | 
13 | def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0):
14 |     """Design prototype filter for PQMF.
15 | 
16 |     This method is based on `A Kaiser window approach for the design of prototype
17 |     filters of cosine modulated filterbanks`_.
18 | 
19 |     Args:
20 |         taps (int): The number of filter taps.
21 |         cutoff_ratio (float): Cut-off frequency ratio.
22 |         beta (float): Beta coefficient for kaiser window.
23 | 
24 |     Returns:
25 |         ndarray: Impulse response of prototype filter (taps + 1,).
26 | 
27 |     .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
28 |         https://ieeexplore.ieee.org/abstract/document/681427
29 | 
30 |     """
31 |     # check the arguments are valid
32 |     assert taps % 2 == 0, "The number of taps must be an even number."
33 |     assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0."
34 | 
35 |     # make initial filter
36 |     omega_c = np.pi * cutoff_ratio
37 |     with np.errstate(invalid="ignore"):
38 |         h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) / (
39 |             np.pi * (np.arange(taps + 1) - 0.5 * taps)
40 |         )
41 |     h_i[taps // 2] = np.cos(0) * cutoff_ratio  # fix nan due to indeterminate form
42 | 
43 |     # apply kaiser window
44 |     w = kaiser(taps + 1, beta)
45 |     h = h_i * w
46 | 
47 |     return h
48 | 
49 | 
50 | class PQMF(torch.nn.Module):
51 |     """PQMF module.
52 | 
53 |     This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_.
54 | 
55 |     .. _`Near-perfect-reconstruction pseudo-QMF banks`:
56 |         https://ieeexplore.ieee.org/document/258122
57 | 
58 |     """
59 | 
60 |     def __init__(self, subbands=4, taps=62, cutoff_ratio=0.142, beta=9.0):
61 |         """Initialize PQMF module.
62 | 
63 |         The cutoff_ratio and beta parameters are optimized for #subbands = 4.
64 |         See discussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195.
65 | 
66 |         Args:
67 |             subbands (int): The number of subbands.
68 |             taps (int): The number of filter taps.
69 |             cutoff_ratio (float): Cut-off frequency ratio.
70 |             beta (float): Beta coefficient for kaiser window.
71 | 
72 |         """
73 |         super(PQMF, self).__init__()
74 | 
75 |         # build analysis & synthesis filter coefficients
76 |         h_proto = design_prototype_filter(taps, cutoff_ratio, beta)
77 |         h_analysis = np.zeros((subbands, len(h_proto)))
78 |         h_synthesis = np.zeros((subbands, len(h_proto)))
79 |         for k in range(subbands):
80 |             h_analysis[k] = (
81 |                 2
82 |                 * h_proto
83 |                 * np.cos(
84 |                     (2 * k + 1)
85 |                     * (np.pi / (2 * subbands))
86 |                     * (np.arange(taps + 1) - (taps / 2))
87 |                     + (-1) ** k * np.pi / 4
88 |                 )
89 |             )
90 |             h_synthesis[k] = (
91 |                 2
92 |                 * h_proto
93 |                 * np.cos(
94 |                     (2 * k + 1)
95 |                     * (np.pi / (2 * subbands))
96 |                     * (np.arange(taps + 1) - (taps / 2))
97 |                     - (-1) ** k * np.pi / 4
98 |                 )
99 |             )
100 | 
101 |         # convert to tensor
102 |         analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1)
103 |         synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0)
104 | 
105 |         # register coefficients as buffer
106 |         self.register_buffer("analysis_filter", analysis_filter)
107 |         self.register_buffer("synthesis_filter", synthesis_filter)
108 | 
109 |         # filter for downsampling & upsampling
110 |         updown_filter = torch.zeros((subbands, subbands, subbands)).float()
111 |         for k in range(subbands):
112 |             updown_filter[k, k, 0] = 1.0
113 |         self.register_buffer("updown_filter", updown_filter)
114 |         self.subbands = subbands
115 | 
116 |         # keep padding info
117 |         self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)
118 | 
119 |     def analysis(self, x):
120 |         """Analysis with PQMF.
121 | 
122 |         Args:
123 |             x (Tensor): Input tensor (B, 1, T).
124 | 
125 |         Returns:
126 |             Tensor: Output tensor (B, subbands, T // subbands).
127 | 
128 |         """
129 |         x = F.conv1d(self.pad_fn(x), self.analysis_filter)
130 |         return F.conv1d(x, self.updown_filter, stride=self.subbands)
131 | 
132 |     def synthesis(self, x):
133 |         """Synthesis with PQMF.
134 | 
135 |         Args:
136 |             x (Tensor): Input tensor (B, subbands, T // subbands).
137 | 
138 |         Returns:
139 |             Tensor: Output tensor (B, 1, T).
140 | 
141 |         """
142 |         # NOTE(kan-bayashi): Power will be decreased so here multiply by # subbands.
143 |         # Not sure this is the correct way, it is better to check again.
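        # Multiplying the filter by `subbands` compensates for the
        # zero-insertion upsampling below: conv_transpose1d with
        # stride=subbands leaves only one nonzero sample in every
        # `subbands` output positions, which lowers the average power.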
144 | # TODO(kan-bayashi): Understand the reconstruction procedure 145 | x = F.conv_transpose1d( 146 | x, self.updown_filter * self.subbands, stride=self.subbands 147 | ) 148 | return F.conv1d(self.pad_fn(x), self.synthesis_filter) 149 | -------------------------------------------------------------------------------- /kantts/models/sambert/adaptors.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from kantts.models.sambert.fsmn import FsmnEncoderV2 6 | from kantts.models.sambert import Prenet 7 | 8 | 9 | class LengthRegulator(nn.Module): 10 | def __init__(self, r=1): 11 | super(LengthRegulator, self).__init__() 12 | 13 | self.r = r 14 | 15 | def forward(self, inputs, durations, masks=None): 16 | reps = (durations + 0.5).long() 17 | output_lens = reps.sum(dim=1) 18 | max_len = output_lens.max() 19 | reps_cumsum = torch.cumsum(F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[ 20 | :, None, : 21 | ] 22 | range_ = torch.arange(max_len).to(inputs.device)[None, :, None] 23 | mult = (reps_cumsum[:, :, :-1] <= range_) & (reps_cumsum[:, :, 1:] > range_) 24 | mult = mult.float() 25 | out = torch.matmul(mult, inputs) 26 | 27 | if masks is not None: 28 | out = out.masked_fill(masks.unsqueeze(-1), 0.0) 29 | 30 | seq_len = out.size(1) 31 | padding = self.r - int(seq_len) % self.r 32 | if padding < self.r: 33 | out = F.pad(out.transpose(1, 2), (0, padding, 0, 0, 0, 0), value=0.0) 34 | out = out.transpose(1, 2) 35 | 36 | return out, output_lens 37 | 38 | 39 | class VarRnnARPredictor(nn.Module): 40 | def __init__(self, cond_units, prenet_units, rnn_units): 41 | super(VarRnnARPredictor, self).__init__() 42 | 43 | self.prenet = Prenet(1, prenet_units) 44 | self.lstm = nn.LSTM( 45 | prenet_units[-1] + cond_units, 46 | rnn_units, 47 | num_layers=2, 48 | batch_first=True, 49 | bidirectional=False, 50 | ) 51 | self.fc = nn.Linear(rnn_units, 1) 52 | 53 | def forward(self, inputs, cond, h=None, masks=None): 54 | x = torch.cat([self.prenet(inputs), cond], dim=-1) 55 | # The input can also be a packed variable length sequence, 56 | # here we just omit it for simplicity due to the mask and uni-directional lstm. 
57 | x, h_new = self.lstm(x, h) 58 | 59 | x = self.fc(x).squeeze(-1) 60 | x = F.relu(x) 61 | 62 | if masks is not None: 63 | x = x.masked_fill(masks, 0.0) 64 | 65 | return x, h_new 66 | 67 | def infer(self, cond, masks=None): 68 | batch_size, length = cond.size(0), cond.size(1) 69 | 70 | output = [] 71 | x = torch.zeros((batch_size, 1)).to(cond.device) 72 | h = None 73 | 74 | for i in range(length): 75 | x, h = self.forward(x.unsqueeze(1), cond[:, i : i + 1, :], h=h) 76 | output.append(x) 77 | 78 | output = torch.cat(output, dim=-1) 79 | 80 | if masks is not None: 81 | output = output.masked_fill(masks, 0.0) 82 | 83 | return output 84 | 85 | 86 | class VarFsmnRnnNARPredictor(nn.Module): 87 | def __init__( 88 | self, 89 | in_dim, 90 | filter_size, 91 | fsmn_num_layers, 92 | num_memory_units, 93 | ffn_inner_dim, 94 | dropout, 95 | shift, 96 | lstm_units, 97 | ): 98 | super(VarFsmnRnnNARPredictor, self).__init__() 99 | 100 | self.fsmn = FsmnEncoderV2( 101 | filter_size, 102 | fsmn_num_layers, 103 | in_dim, 104 | num_memory_units, 105 | ffn_inner_dim, 106 | dropout, 107 | shift, 108 | ) 109 | self.blstm = nn.LSTM( 110 | num_memory_units, 111 | lstm_units, 112 | num_layers=1, 113 | batch_first=True, 114 | bidirectional=True, 115 | ) 116 | self.fc = nn.Linear(2 * lstm_units, 1) 117 | 118 | def forward(self, inputs, masks=None): 119 | input_lengths = None 120 | if masks is not None: 121 | input_lengths = torch.sum((~masks).float(), dim=1).long() 122 | 123 | x = self.fsmn(inputs, masks) 124 | 125 | if input_lengths is not None: 126 | x = nn.utils.rnn.pack_padded_sequence( 127 | x, input_lengths.tolist(), batch_first=True, enforce_sorted=False 128 | ) 129 | x, _ = self.blstm(x) 130 | x, _ = nn.utils.rnn.pad_packed_sequence( 131 | x, batch_first=True, total_length=inputs.size(1) 132 | ) 133 | else: 134 | x, _ = self.blstm(x) 135 | 136 | x = self.fc(x).squeeze(-1) 137 | 138 | if masks is not None: 139 | x = x.masked_fill(masks, 0.0) 140 | 141 | return x 142 | -------------------------------------------------------------------------------- /kantts/models/sambert/alignment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba as nb 3 | 4 | 5 | @nb.jit(nopython=True) 6 | def mas(attn_map, width=1): 7 | # assumes mel x text 8 | opt = np.zeros_like(attn_map) 9 | attn_map = np.log(attn_map) 10 | attn_map[0, 1:] = -np.inf 11 | log_p = np.zeros_like(attn_map) 12 | log_p[0, :] = attn_map[0, :] 13 | prev_ind = np.zeros_like(attn_map, dtype=np.int64) 14 | for i in range(1, attn_map.shape[0]): 15 | for j in range(attn_map.shape[1]): # for each text dim 16 | prev_j = np.arange(max(0, j - width), j + 1) 17 | prev_log = np.array([log_p[i - 1, prev_idx] for prev_idx in prev_j]) 18 | 19 | ind = np.argmax(prev_log) 20 | log_p[i, j] = attn_map[i, j] + prev_log[ind] 21 | prev_ind[i, j] = prev_j[ind] 22 | 23 | # now backtrack 24 | curr_text_idx = attn_map.shape[1] - 1 25 | for i in range(attn_map.shape[0] - 1, -1, -1): 26 | opt[i, curr_text_idx] = 1 27 | curr_text_idx = prev_ind[i, curr_text_idx] 28 | opt[0, curr_text_idx] = 1 29 | return opt 30 | 31 | 32 | @nb.jit(nopython=True) 33 | def mas_width1(attn_map): 34 | """mas with hardcoded width=1""" 35 | # assumes mel x text 36 | opt = np.zeros_like(attn_map) 37 | attn_map = np.log(attn_map) 38 | attn_map[0, 1:] = -np.inf 39 | log_p = np.zeros_like(attn_map) 40 | log_p[0, :] = attn_map[0, :] 41 | prev_ind = np.zeros_like(attn_map, dtype=np.int64) 42 | for i in range(1, attn_map.shape[0]): 43 | for j in 
range(attn_map.shape[1]): # for each text dim 44 | prev_log = log_p[i - 1, j] 45 | prev_j = j 46 | 47 | if j - 1 >= 0 and log_p[i - 1, j - 1] >= log_p[i - 1, j]: 48 | prev_log = log_p[i - 1, j - 1] 49 | prev_j = j - 1 50 | 51 | log_p[i, j] = attn_map[i, j] + prev_log 52 | prev_ind[i, j] = prev_j 53 | 54 | # now backtrack 55 | curr_text_idx = attn_map.shape[1] - 1 56 | for i in range(attn_map.shape[0] - 1, -1, -1): 57 | opt[i, curr_text_idx] = 1 58 | curr_text_idx = prev_ind[i, curr_text_idx] 59 | opt[0, curr_text_idx] = 1 60 | return opt 61 | 62 | 63 | @nb.jit(nopython=True, parallel=True) 64 | def b_mas(b_attn_map, in_lens, out_lens, width=1): 65 | assert width == 1 66 | attn_out = np.zeros_like(b_attn_map) 67 | 68 | for b in nb.prange(b_attn_map.shape[0]): 69 | out = mas_width1(b_attn_map[b, 0, : out_lens[b], : in_lens[b]]) 70 | attn_out[b, 0, : out_lens[b], : in_lens[b]] = out 71 | return attn_out 72 | -------------------------------------------------------------------------------- /kantts/models/sambert/attention.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn 4 | 5 | 6 | class ConvNorm(torch.nn.Module): 7 | def __init__( 8 | self, 9 | in_channels, 10 | out_channels, 11 | kernel_size=1, 12 | stride=1, 13 | padding=None, 14 | dilation=1, 15 | bias=True, 16 | w_init_gain="linear", 17 | ): 18 | super(ConvNorm, self).__init__() 19 | if padding is None: 20 | assert kernel_size % 2 == 1 21 | padding = int(dilation * (kernel_size - 1) / 2) 22 | 23 | self.conv = torch.nn.Conv1d( 24 | in_channels, 25 | out_channels, 26 | kernel_size=kernel_size, 27 | stride=stride, 28 | padding=padding, 29 | dilation=dilation, 30 | bias=bias, 31 | ) 32 | 33 | torch.nn.init.xavier_uniform_( 34 | self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain) 35 | ) 36 | 37 | def forward(self, signal): 38 | conv_signal = self.conv(signal) 39 | return conv_signal 40 | 41 | 42 | class ConvAttention(torch.nn.Module): 43 | def __init__( 44 | self, 45 | n_mel_channels=80, 46 | n_text_channels=512, 47 | n_att_channels=80, 48 | temperature=1.0, 49 | use_query_proj=True, 50 | ): 51 | super(ConvAttention, self).__init__() 52 | self.temperature = temperature 53 | self.att_scaling_factor = np.sqrt(n_att_channels) 54 | self.softmax = torch.nn.Softmax(dim=3) 55 | self.log_softmax = torch.nn.LogSoftmax(dim=3) 56 | self.attn_proj = torch.nn.Conv2d(n_att_channels, 1, kernel_size=1) 57 | self.use_query_proj = bool(use_query_proj) 58 | 59 | self.key_proj = nn.Sequential( 60 | ConvNorm( 61 | n_text_channels, 62 | n_text_channels * 2, 63 | kernel_size=3, 64 | bias=True, 65 | w_init_gain="relu", 66 | ), 67 | torch.nn.ReLU(), 68 | ConvNorm(n_text_channels * 2, n_att_channels, kernel_size=1, bias=True), 69 | ) 70 | 71 | self.query_proj = nn.Sequential( 72 | ConvNorm( 73 | n_mel_channels, 74 | n_mel_channels * 2, 75 | kernel_size=3, 76 | bias=True, 77 | w_init_gain="relu", 78 | ), 79 | torch.nn.ReLU(), 80 | ConvNorm(n_mel_channels * 2, n_mel_channels, kernel_size=1, bias=True), 81 | torch.nn.ReLU(), 82 | ConvNorm(n_mel_channels, n_att_channels, kernel_size=1, bias=True), 83 | ) 84 | 85 | def forward(self, queries, keys, mask=None, attn_prior=None): 86 | """Attention mechanism for flowtron parallel 87 | Unlike in Flowtron, we have no restrictions such as causality etc, 88 | since we only need this during training. 
89 | 90 | Args: 91 | queries (torch.tensor): B x C x T1 tensor 92 | (probably going to be mel data) 93 | keys (torch.tensor): B x C2 x T2 tensor (text data) 94 | mask (torch.tensor): uint8 binary mask for variable length entries 95 | (should be in the T2 domain) 96 | Output: 97 | attn (torch.tensor): B x 1 x T1 x T2 attention mask. 98 | Final dim T2 should sum to 1 99 | """ 100 | keys_enc = self.key_proj(keys) # B x n_attn_dims x T2 101 | 102 | # Beware can only do this since query_dim = attn_dim = n_mel_channels 103 | if self.use_query_proj: 104 | queries_enc = self.query_proj(queries) 105 | else: 106 | queries_enc = queries 107 | 108 | # different ways of computing attn, 109 | # one is isotopic gaussians (per phoneme) 110 | # Simplistic Gaussian Isotopic Attention 111 | 112 | # B x n_attn_dims x T1 x T2 113 | attn = (queries_enc[:, :, :, None] - keys_enc[:, :, None]) ** 2 114 | # compute log likelihood from a gaussian 115 | attn = -0.0005 * attn.sum(1, keepdim=True) 116 | if attn_prior is not None: 117 | attn = self.log_softmax(attn) + torch.log(attn_prior[:, None] + 1e-8) 118 | 119 | attn_logprob = attn.clone() 120 | 121 | if mask is not None: 122 | attn.data.masked_fill_(mask.unsqueeze(1).unsqueeze(1), -float("inf")) 123 | 124 | attn = self.softmax(attn) # Softmax along T2 125 | return attn, attn_logprob 126 | -------------------------------------------------------------------------------- /kantts/models/sambert/fsmn.py: -------------------------------------------------------------------------------- 1 | """ 2 | FSMN Pytorch Version 3 | """ 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class FeedForwardNet(nn.Module): 9 | """ A two-feed-forward-layer module """ 10 | 11 | def __init__(self, d_in, d_hid, d_out, kernel_size=[1, 1], dropout=0.1): 12 | super().__init__() 13 | 14 | # Use Conv1D 15 | # position-wise 16 | self.w_1 = nn.Conv1d( 17 | d_in, 18 | d_hid, 19 | kernel_size=kernel_size[0], 20 | padding=(kernel_size[0] - 1) // 2, 21 | ) 22 | # position-wise 23 | self.w_2 = nn.Conv1d( 24 | d_hid, 25 | d_out, 26 | kernel_size=kernel_size[1], 27 | padding=(kernel_size[1] - 1) // 2, 28 | bias=False, 29 | ) 30 | 31 | self.dropout = nn.Dropout(dropout) 32 | 33 | def forward(self, x): 34 | output = x.transpose(1, 2) 35 | output = F.relu(self.w_1(output)) 36 | output = self.dropout(output) 37 | output = self.w_2(output) 38 | output = output.transpose(1, 2) 39 | 40 | return output 41 | 42 | 43 | class MemoryBlockV2(nn.Module): 44 | def __init__(self, d, filter_size, shift, dropout=0.0): 45 | super(MemoryBlockV2, self).__init__() 46 | 47 | left_padding = int(round((filter_size - 1) / 2)) 48 | right_padding = int((filter_size - 1) / 2) 49 | if shift > 0: 50 | left_padding += shift 51 | right_padding -= shift 52 | 53 | self.lp, self.rp = left_padding, right_padding 54 | 55 | self.conv_dw = nn.Conv1d(d, d, filter_size, 1, 0, groups=d, bias=False) 56 | self.dropout = nn.Dropout(dropout) 57 | 58 | def forward(self, input, mask=None): 59 | if mask is not None: 60 | input = input.masked_fill(mask.unsqueeze(-1), 0) 61 | 62 | x = F.pad(input, (0, 0, self.lp, self.rp, 0, 0), mode="constant", value=0.0) 63 | output = ( 64 | self.conv_dw(x.contiguous().transpose(1, 2)).contiguous().transpose(1, 2) 65 | ) 66 | output += input 67 | output = self.dropout(output) 68 | 69 | if mask is not None: 70 | output = output.masked_fill(mask.unsqueeze(-1), 0) 71 | 72 | return output 73 | 74 | 75 | class FsmnEncoderV2(nn.Module): 76 | def __init__( 77 | self, 78 | filter_size, 79 | 
fsmn_num_layers, 80 | input_dim, 81 | num_memory_units, 82 | ffn_inner_dim, 83 | dropout=0.0, 84 | shift=0, 85 | ): 86 | super(FsmnEncoderV2, self).__init__() 87 | 88 | self.filter_size = filter_size 89 | self.fsmn_num_layers = fsmn_num_layers 90 | self.num_memory_units = num_memory_units 91 | self.ffn_inner_dim = ffn_inner_dim 92 | self.dropout = dropout 93 | self.shift = shift 94 | if not isinstance(shift, list): 95 | self.shift = [shift for _ in range(self.fsmn_num_layers)] 96 | 97 | self.ffn_lst = nn.ModuleList() 98 | self.ffn_lst.append( 99 | FeedForwardNet(input_dim, ffn_inner_dim, num_memory_units, dropout=dropout) 100 | ) 101 | for i in range(1, fsmn_num_layers): 102 | self.ffn_lst.append( 103 | FeedForwardNet( 104 | num_memory_units, ffn_inner_dim, num_memory_units, dropout=dropout 105 | ) 106 | ) 107 | 108 | self.memory_block_lst = nn.ModuleList() 109 | for i in range(fsmn_num_layers): 110 | self.memory_block_lst.append( 111 | MemoryBlockV2(num_memory_units, filter_size, self.shift[i], dropout) 112 | ) 113 | 114 | def forward(self, input, mask=None): 115 | x = F.dropout(input, self.dropout, self.training) 116 | for (ffn, memory_block) in zip(self.ffn_lst, self.memory_block_lst): 117 | context = ffn(x) 118 | memory = memory_block(context, mask) 119 | memory = F.dropout(memory, self.dropout, self.training) 120 | if memory.size(-1) == x.size(-1): 121 | memory += x 122 | x = memory 123 | 124 | return x 125 | -------------------------------------------------------------------------------- /kantts/models/sambert/positions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | import numpy as np 6 | 7 | 8 | class SinusoidalPositionEncoder(nn.Module): 9 | def __init__(self, max_len, depth): 10 | super(SinusoidalPositionEncoder, self).__init__() 11 | 12 | self.max_len = max_len 13 | self.depth = depth 14 | self.position_enc = nn.Parameter( 15 | self.get_sinusoid_encoding_table(max_len, depth).unsqueeze(0), 16 | requires_grad=False, 17 | ) 18 | 19 | def forward(self, input): 20 | bz_in, len_in, _ = input.size() 21 | if len_in > self.max_len: 22 | self.max_len = len_in 23 | self.position_enc.data = ( 24 | self.get_sinusoid_encoding_table(self.max_len, self.depth) 25 | .unsqueeze(0) 26 | .to(input.device) 27 | ) 28 | 29 | output = input + self.position_enc[:, :len_in, :].expand(bz_in, -1, -1) 30 | 31 | return output 32 | 33 | @staticmethod 34 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): 35 | """ Sinusoid position encoding table """ 36 | 37 | def cal_angle(position, hid_idx): 38 | return position / np.power(10000, hid_idx / float(d_hid / 2 - 1)) 39 | 40 | def get_posi_angle_vec(position): 41 | return [cal_angle(position, hid_j) for hid_j in range(d_hid // 2)] 42 | 43 | scaled_time_table = np.array( 44 | [get_posi_angle_vec(pos_i + 1) for pos_i in range(n_position)] 45 | ) 46 | 47 | sinusoid_table = np.zeros((n_position, d_hid)) 48 | sinusoid_table[:, : d_hid // 2] = np.sin(scaled_time_table) 49 | sinusoid_table[:, d_hid // 2 :] = np.cos(scaled_time_table) 50 | 51 | if padding_idx is not None: 52 | # zero vector for padding dimension 53 | sinusoid_table[padding_idx] = 0.0 54 | 55 | return torch.FloatTensor(sinusoid_table) 56 | 57 | 58 | class DurSinusoidalPositionEncoder(nn.Module): 59 | def __init__(self, depth, outputs_per_step): 60 | super(DurSinusoidalPositionEncoder, self).__init__() 61 | 62 | self.depth = depth 63 | self.outputs_per_step = 
outputs_per_step 64 | 65 | inv_timescales = [ 66 | np.power(10000, 2 * (hid_idx // 2) / depth) for hid_idx in range(depth) 67 | ] 68 | self.inv_timescales = nn.Parameter( 69 | torch.FloatTensor(inv_timescales), requires_grad=False 70 | ) 71 | 72 | def forward(self, durations, masks=None): 73 | reps = (durations + 0.5).long() 74 | output_lens = reps.sum(dim=1) 75 | max_len = output_lens.max() 76 | reps_cumsum = torch.cumsum(F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[ 77 | :, None, : 78 | ] 79 | range_ = torch.arange(max_len).to(durations.device)[None, :, None] 80 | mult = (reps_cumsum[:, :, :-1] <= range_) & (reps_cumsum[:, :, 1:] > range_) 81 | mult = mult.float() 82 | offsets = torch.matmul(mult, reps_cumsum[:, 0, :-1].unsqueeze(-1)).squeeze(-1) 83 | dur_pos = range_[:, :, 0] - offsets + 1 84 | 85 | if masks is not None: 86 | assert masks.size(1) == dur_pos.size(1) 87 | dur_pos = dur_pos.masked_fill(masks, 0.0) 88 | 89 | seq_len = dur_pos.size(1) 90 | padding = self.outputs_per_step - int(seq_len) % self.outputs_per_step 91 | if padding < self.outputs_per_step: 92 | dur_pos = F.pad(dur_pos, (0, padding, 0, 0), value=0.0) 93 | 94 | position_embedding = dur_pos[:, :, None] / self.inv_timescales[None, None, :] 95 | position_embedding[:, :, 0::2] = torch.sin(position_embedding[:, :, 0::2]) 96 | position_embedding[:, :, 1::2] = torch.cos(position_embedding[:, :, 1::2]) 97 | 98 | return position_embedding 99 | -------------------------------------------------------------------------------- /kantts/models/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from distutils.version import LooseVersion 3 | 4 | is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion("1.7") 5 | 6 | 7 | def init_weights(m, mean=0.0, std=0.01): 8 | classname = m.__class__.__name__ 9 | if classname.find("Conv") != -1: 10 | m.weight.data.normal_(mean, std) 11 | 12 | 13 | def get_mask_from_lengths(lengths, max_len=None): 14 | batch_size = lengths.shape[0] 15 | if max_len is None: 16 | max_len = torch.max(lengths).item() 17 | 18 | ids = ( 19 | torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1).to(lengths.device) 20 | ) 21 | mask = ids >= lengths.unsqueeze(1).expand(-1, max_len) 22 | 23 | return mask 24 | -------------------------------------------------------------------------------- /kantts/preprocess/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/__init__.py -------------------------------------------------------------------------------- /kantts/preprocess/audio_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/audio_processor/__init__.py -------------------------------------------------------------------------------- /kantts/preprocess/audio_processor/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/audio_processor/core/__init__.py -------------------------------------------------------------------------------- /kantts/preprocess/fp_processor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import 
logging 3 | import random 4 | 5 | 6 | def is_fp_line(line): 7 | fp_category_list = ["FP", "I", "N", "Q"] 8 | elements = line.strip().split(" ") 9 | res = True 10 | for ele in elements: 11 | if ele not in fp_category_list: 12 | res = False 13 | break 14 | return res 15 | 16 | 17 | class FpProcessor: 18 | def __init__(self): 19 | # TODO: Add more audio processing methods. 20 | self.res = [] 21 | 22 | def is_fp_line(line): 23 | fp_category_list = ["FP", "I", "N", "Q"] 24 | elements = line.strip().split(" ") 25 | res = True 26 | for ele in elements: 27 | if ele not in fp_category_list: 28 | res = False 29 | break 30 | return res 31 | 32 | # TODO: adjust idx judgment rule 33 | def addfp(self, voice_output_dir, prosody, raw_metafile_lines): 34 | 35 | fp_category_list = ["FP", "I", "N"] 36 | 37 | f = open(prosody) 38 | prosody_lines = f.readlines() 39 | f.close() 40 | 41 | idx = "" 42 | fp = "" 43 | fp_label_dict = {} 44 | i = 0 45 | while i < len(prosody_lines): 46 | if len(prosody_lines[i].strip().split("\t")) == 2: 47 | idx = prosody_lines[i].strip().split("\t")[0] 48 | i += 1 49 | else: 50 | fp_enable = is_fp_line(prosody_lines[i]) 51 | if fp_enable: 52 | fp = prosody_lines[i].strip().split("\t")[0].split(" ") 53 | for label in fp: 54 | if label not in fp_category_list: 55 | logging.warning("fp label not in fp_category_list") 56 | break 57 | i += 4 58 | else: 59 | fp = [ 60 | "N" 61 | for _ in range( 62 | len( 63 | prosody_lines[i] 64 | .strip() 65 | .split("\t")[0] 66 | .replace("/ ", "") 67 | .replace(". ", "") 68 | .split(" ") 69 | ) 70 | ) 71 | ] 72 | i += 1 73 | fp_label_dict[idx] = fp 74 | 75 | fpadd_metafile = os.path.join(voice_output_dir, "fpadd_metafile.txt") 76 | f_out = open(fpadd_metafile, "w") 77 | for line in raw_metafile_lines: 78 | tokens = line.strip().split("\t") 79 | if len(tokens) == 2: 80 | uttname = tokens[0] 81 | symbol_sequences = tokens[1].split(" ") 82 | 83 | error_flag = False 84 | idx = 0 85 | out_str = uttname + "\t" 86 | 87 | for this_symbol_sequence in symbol_sequences: 88 | emotion = this_symbol_sequence.split("$")[4] 89 | this_symbol_sequence = this_symbol_sequence.replace( 90 | emotion, "emotion_neutral" 91 | ) 92 | 93 | if idx < len(fp_label_dict[uttname]): 94 | if fp_label_dict[uttname][idx] == "FP": 95 | if "none" not in this_symbol_sequence: 96 | this_symbol_sequence = this_symbol_sequence.replace( 97 | "emotion_neutral", "emotion_disgust" 98 | ) 99 | syllable_label = this_symbol_sequence.split("$")[2] 100 | if syllable_label == "s_both" or syllable_label == "s_end": 101 | idx += 1 102 | elif idx > len(fp_label_dict[uttname]): 103 | logging.warning(uttname + " not match") 104 | error_flag = True 105 | out_str = out_str + this_symbol_sequence + " " 106 | 107 | # if idx != len(fp_label_dict[uttname]): 108 | # logging.warning( 109 | # "{} length mismatch, length: {} ".format( 110 | # idx, len(fp_label_dict[uttname]) 111 | # ) 112 | # ) 113 | 114 | if not error_flag: 115 | f_out.write(out_str.strip() + "\n") 116 | f_out.close() 117 | return fpadd_metafile 118 | 119 | def removefp(self, voice_output_dir, fpadd_metafile, raw_metafile_lines): 120 | 121 | f = open(fpadd_metafile) 122 | fpadd_metafile_lines = f.readlines() 123 | f.close() 124 | 125 | fprm_metafile = os.path.join(voice_output_dir, "fprm_metafile.txt") 126 | f_out = open(fprm_metafile, "w") 127 | for i in range(len(raw_metafile_lines)): 128 | tokens = raw_metafile_lines[i].strip().split("\t") 129 | symbol_sequences = tokens[1].split(" ") 130 | fpadd_tokens = 
fpadd_metafile_lines[i].strip().split("\t") 131 | fpadd_symbol_sequences = fpadd_tokens[1].split(" ") 132 | 133 | error_flag = False 134 | out_str = tokens[0] + "\t" 135 | idx = 0 136 | length = len(symbol_sequences) 137 | while idx < length: 138 | if "$emotion_disgust" in fpadd_symbol_sequences[idx]: 139 | if idx + 1 < length and "none" in fpadd_symbol_sequences[idx + 1]: 140 | idx = idx + 2 141 | else: 142 | idx = idx + 1 143 | continue 144 | out_str = out_str + symbol_sequences[idx] + " " 145 | idx = idx + 1 146 | 147 | if not error_flag: 148 | f_out.write(out_str.strip() + "\n") 149 | f_out.close() 150 | 151 | def process(self, voice_output_dir, prosody, raw_metafile): 152 | 153 | with open(raw_metafile, "r") as f: 154 | lines = f.readlines() 155 | random.shuffle(lines) 156 | 157 | fpadd_metafile = self.addfp(voice_output_dir, prosody, lines) 158 | self.removefp(voice_output_dir, fpadd_metafile, lines) 159 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/PinYin/En2ChPhoneMap.txt: -------------------------------------------------------------------------------- 1 | wu w 2 | yi y 3 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/PinYin/PosSet.xml: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 1 5 | a 6 | todo 7 | 8 | 9 | 2 10 | b 11 | todo 12 | 13 | 14 | 3 15 | c 16 | todo 17 | 18 | 19 | 4 20 | d 21 | todo 22 | 23 | 24 | 5 25 | e 26 | todo 27 | 28 | 29 | 6 30 | f 31 | todo 32 | 33 | 34 | 7 35 | g 36 | todo 37 | 38 | 39 | 8 40 | gb 41 | todo 42 | 43 | 44 | 45 | 46 | 9 47 | h 48 | todo 49 | 50 | 51 | 10 52 | i 53 | todo 54 | 55 | 56 | 11 57 | j 58 | todo 59 | 60 | 61 | 12 62 | k 63 | todo 64 | 65 | 66 | 13 67 | l 68 | todo 69 | 70 | 71 | 14 72 | m 73 | todo 74 | 75 | 76 | 15 77 | n 78 | todo 79 | 80 | 81 | 16 82 | nz 83 | todo 84 | 85 | 86 | 87 | 88 | 17 89 | o 90 | todo 91 | 92 | 93 | 18 94 | p 95 | todo 96 | 97 | 98 | 19 99 | q 100 | todo 101 | 102 | 103 | 20 104 | r 105 | todo 106 | 107 | 108 | 21 109 | s 110 | todo 111 | 112 | 113 | 22 114 | t 115 | todo 116 | 117 | 118 | 23 119 | u 120 | todo 121 | 122 | 123 | 24 124 | v 125 | todo 126 | 127 | 128 | 25 129 | w 130 | todo 131 | 132 | 133 | 26 134 | x 135 | todo 136 | 137 | 138 | 27 139 | y 140 | todo 141 | 142 | 143 | 28 144 | z 145 | todo 146 | 147 | 148 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/PinYin/tonelist.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 3 | 4 4 | 2 5 | 3 6 | 5 7 | 0 8 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/Sichuan/En2ChPhoneMap.txt: -------------------------------------------------------------------------------- 1 | wu w 2 | yi y 3 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/Sichuan/PosSet.xml: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 1 5 | a 6 | todo 7 | 8 | 9 | 2 10 | b 11 | todo 12 | 13 | 14 | 3 15 | c 16 | todo 17 | 18 | 19 | 4 20 | d 21 | todo 22 | 23 | 24 | 5 25 | e 26 | todo 27 | 28 | 29 | 6 30 | f 31 | todo 32 | 33 | 34 | 7 35 | g 36 | todo 37 | 38 | 39 | 8 40 | gb 41 | todo 42 | 43 | 44 | 45 | 46 | 9 47 | h 48 | todo 49 | 50 | 51 | 10 52 | i 53 | todo 54 | 55 | 56 | 11 57 | j 58 | todo 59 | 60 | 61 | 12 62 | k 63 | todo 
64 | 65 | 66 | 13 67 | l 68 | todo 69 | 70 | 71 | 14 72 | m 73 | todo 74 | 75 | 76 | 15 77 | n 78 | todo 79 | 80 | 81 | 16 82 | nz 83 | todo 84 | 85 | 86 | 87 | 88 | 17 89 | o 90 | todo 91 | 92 | 93 | 18 94 | p 95 | todo 96 | 97 | 98 | 19 99 | q 100 | todo 101 | 102 | 103 | 20 104 | r 105 | todo 106 | 107 | 108 | 21 109 | s 110 | todo 111 | 112 | 113 | 22 114 | t 115 | todo 116 | 117 | 118 | 23 119 | u 120 | todo 121 | 122 | 123 | 24 124 | v 125 | todo 126 | 127 | 128 | 25 129 | w 130 | todo 131 | 132 | 133 | 26 134 | x 135 | todo 136 | 137 | 138 | 27 139 | y 140 | todo 141 | 142 | 143 | 28 144 | z 145 | todo 146 | 147 | 148 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/Sichuan/tonelist.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 3 | 4 4 | 2 5 | 3 6 | 5 7 | 0 8 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/WuuShanghai/En2ChPhoneMap.txt: -------------------------------------------------------------------------------- 1 | wu w 2 | yi y 3 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/WuuShanghai/tonelist.txt: -------------------------------------------------------------------------------- 1 | 6 2 | 0 3 | 3 4 | 4 5 | 2 6 | 5 7 | 1 8 | 7 9 | 8 10 | 11 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/ZhHK/En2ChPhoneMap.txt: -------------------------------------------------------------------------------- 1 | wu w 2 | yi y 3 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/ZhHK/tonelist.txt: -------------------------------------------------------------------------------- 1 | 6 2 | 0 3 | 3 4 | 4 5 | 2 6 | 5 7 | 7 8 | 1 9 | 8 10 | 9 11 | 12 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/__init__.py: -------------------------------------------------------------------------------- 1 | languages = { 2 | "PinYin": { 3 | "phoneset_path": "PhoneSet.xml", 4 | "posset_path": "PosSet.xml", 5 | "f2t_map_path": "En2ChPhoneMap.txt", 6 | "s2p_map_path": "py2phoneMap.txt", 7 | "tonelist_path": "tonelist.txt", 8 | }, 9 | "ZhHK": { 10 | "phoneset_path": "PhoneSet.xml", 11 | "posset_path": "PosSet.xml", 12 | "f2t_map_path": "En2ChPhoneMap.txt", 13 | "s2p_map_path": "py2phoneMap.txt", 14 | "tonelist_path": "tonelist.txt", 15 | }, 16 | "WuuShanghai": { 17 | "phoneset_path": "PhoneSet.xml", 18 | "posset_path": "PosSet.xml", 19 | "f2t_map_path": "En2ChPhoneMap.txt", 20 | "s2p_map_path": "py2phoneMap.txt", 21 | "tonelist_path": "tonelist.txt", 22 | }, 23 | "Sichuan": { 24 | "phoneset_path": "PhoneSet.xml", 25 | "posset_path": "PosSet.xml", 26 | "f2t_map_path": "En2ChPhoneMap.txt", 27 | "s2p_map_path": "py2phoneMap.txt", 28 | "tonelist_path": "tonelist.txt", 29 | }, 30 | } 31 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/script_convertor/__init__.py -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/Phone.py: 
-------------------------------------------------------------------------------- 1 | from .XmlObj import XmlObj 2 | from .core_types import PhoneCVType, PhoneIFType, PhoneUVType, PhoneAPType, PhoneAMType 3 | 4 | 5 | class Phone(XmlObj): 6 | def __init__(self): 7 | self.m_id = None 8 | self.m_name = None 9 | self.m_cv_type = PhoneCVType.NULL 10 | self.m_if_type = PhoneIFType.NULL 11 | self.m_uv_type = PhoneUVType.NULL 12 | self.m_ap_type = PhoneAPType.NULL 13 | self.m_am_type = PhoneAMType.NULL 14 | self.m_bnd = False 15 | 16 | def __str__(self): 17 | return self.m_name 18 | 19 | def Save(self): 20 | pass 21 | 22 | def Load(self, phone_node): 23 | ns = "{http://schemas.alibaba-inc.com/tts}" 24 | 25 | id_node = phone_node.find(ns + "id") 26 | self.m_id = int(id_node.text) 27 | 28 | name_node = phone_node.find(ns + "name") 29 | self.m_name = name_node.text 30 | 31 | cv_node = phone_node.find(ns + "cv") 32 | self.m_cv_type = PhoneCVType.parse(cv_node.text) 33 | 34 | if_node = phone_node.find(ns + "if") 35 | self.m_if_type = PhoneIFType.parse(if_node.text) 36 | 37 | uv_node = phone_node.find(ns + "uv") 38 | self.m_uv_type = PhoneUVType.parse(uv_node.text) 39 | 40 | ap_node = phone_node.find(ns + "ap") 41 | self.m_ap_type = PhoneAPType.parse(ap_node.text) 42 | 43 | am_node = phone_node.find(ns + "am") 44 | self.m_am_type = PhoneAMType.parse(am_node.text) 45 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/PhoneSet.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | import logging 3 | 4 | from .XmlObj import XmlObj 5 | from .Phone import Phone 6 | 7 | 8 | class PhoneSet(XmlObj): 9 | def __init__(self, phoneset_path): 10 | self.m_phone_list = [] 11 | self.m_id_map = {} 12 | self.m_name_map = {} 13 | self.Load(phoneset_path) 14 | 15 | def Load(self, file_path): 16 | # alibaba tts xml namespace 17 | ns = "{http://schemas.alibaba-inc.com/tts}" 18 | 19 | phoneset_root = ET.parse(file_path).getroot() 20 | for phone_node in phoneset_root.findall(ns + "phone"): 21 | phone = Phone() 22 | phone.Load(phone_node) 23 | self.m_phone_list.append(phone) 24 | if phone.m_id in self.m_id_map: 25 | logging.error("PhoneSet.Load: duplicate id: %d", phone.m_id) 26 | self.m_id_map[phone.m_id] = phone 27 | 28 | if phone.m_name in self.m_name_map: 29 | logging.error("PhoneSet.Load duplicate name name: %s", phone.m_name) 30 | self.m_name_map[phone.m_name] = phone 31 | 32 | def Save(self): 33 | pass 34 | 35 | 36 | # if __name__ == "__main__": 37 | # import os 38 | # import sys 39 | # 40 | # phoneset = PhoneSet() 41 | # phoneset.Load(sys.argv[1]) 42 | # 43 | # for phone in phoneset.m_phone_list: 44 | # print(phone) 45 | # print(phone.m_id) 46 | # print(phone.m_name) 47 | # print(phone.m_cv_type) 48 | # print(phone.m_if_type) 49 | # print(phone.m_uv_type) 50 | # print(phone.m_ap_type) 51 | # print(phone.m_am_type) 52 | # print(phone.m_bnd) 53 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/Pos.py: -------------------------------------------------------------------------------- 1 | from .XmlObj import XmlObj 2 | 3 | 4 | class Pos(XmlObj): 5 | def __init__(self): 6 | self.m_id = None 7 | self.m_name = None 8 | self.m_desc = None 9 | self.m_level = 1 10 | self.m_parent = None 11 | self.m_sub_pos_list = [] 12 | 13 | def __str__(self): 14 | return self.m_name 15 | 16 | def Save(self): 17 | pass 18 | 19 | def 
Load(self, pos_node): 20 | ns = "{http://schemas.alibaba-inc.com/tts}" 21 | 22 | id_node = pos_node.find(ns + "id") 23 | self.m_id = int(id_node.text) 24 | 25 | name_node = pos_node.find(ns + "name") 26 | self.m_name = name_node.text 27 | 28 | desc_node = pos_node.find(ns + "desc") 29 | self.m_desc = desc_node.text 30 | 31 | sub_node = pos_node.find(ns + "sub") 32 | if sub_node is not None: 33 | for sub_pos_node in sub_node.findall(ns + "pos"): 34 | sub_pos = Pos() 35 | sub_pos.Load(sub_pos_node) 36 | sub_pos.m_parent = self 37 | sub_pos.m_level = self.m_level + 1 38 | self.m_sub_pos_list.append(sub_pos) 39 | 40 | return 41 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/PosSet.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | import logging 3 | 4 | from .XmlObj import XmlObj 5 | from .Pos import Pos 6 | 7 | 8 | class PosSet(XmlObj): 9 | def __init__(self, posset_path): 10 | self.m_pos_list = [] 11 | self.m_id_map = {} 12 | self.m_name_map = {} 13 | self.Load(posset_path) 14 | 15 | def Load(self, file_path): 16 | # alibaba tts xml namespace 17 | ns = "{http://schemas.alibaba-inc.com/tts}" 18 | 19 | posset_root = ET.parse(file_path).getroot() 20 | for pos_node in posset_root.findall(ns + "pos"): 21 | pos = Pos() 22 | pos.Load(pos_node) 23 | self.m_pos_list.append(pos) 24 | if pos.m_id in self.m_id_map: 25 | logging.error("PosSet.Load: duplicate id: %d", pos.m_id) 26 | self.m_id_map[pos.m_id] = pos 27 | 28 | if pos.m_name in self.m_name_map: 29 | logging.error("PosSet.Load duplicate name name: %s", pos.m_name) 30 | self.m_name_map[pos.m_name] = pos 31 | 32 | if len(pos.m_sub_pos_list) > 0: 33 | for sub_pos in pos.m_sub_pos_list: 34 | self.m_pos_list.append(sub_pos) 35 | if sub_pos.m_id in self.m_id_map: 36 | logging.error("PosSet.Load: duplicate id: %d", sub_pos.m_id) 37 | self.m_id_map[sub_pos.m_id] = sub_pos 38 | 39 | if sub_pos.m_name in self.m_name_map: 40 | logging.error( 41 | "PosSet.Load duplicate name name: %s", sub_pos.m_name 42 | ) 43 | self.m_name_map[sub_pos.m_name] = sub_pos 44 | 45 | def Save(self): 46 | pass 47 | 48 | 49 | # if __name__ == "__main__": 50 | # import os 51 | # import sys 52 | # 53 | # posset = PosSet() 54 | # posset.Load(sys.argv[1]) 55 | # 56 | # for pos in posset.m_pos_list: 57 | # print(pos) 58 | # print(pos.m_id) 59 | # print(pos.m_name) 60 | # print(pos.m_desc) 61 | # print(pos.m_level) 62 | # print(pos.m_parent) 63 | # if pos.m_sub_pos_list: 64 | # print("sub pos list:") 65 | # for sub_pos in pos.m_sub_pos_list: 66 | # print(sub_pos) 67 | # print(sub_pos.m_id) 68 | # print(sub_pos.m_name) 69 | # print(sub_pos.m_desc) 70 | # print(sub_pos.m_level) 71 | # print(sub_pos.m_parent) 72 | # print("sub pos list end") 73 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/Script.py: -------------------------------------------------------------------------------- 1 | from .XmlObj import XmlObj 2 | 3 | import xml.etree.ElementTree as ET 4 | from xml.dom import minidom 5 | 6 | 7 | class Script(XmlObj): 8 | def __init__(self, phoneset, posset): 9 | self.m_phoneset = phoneset 10 | self.m_posset = posset 11 | self.m_items = [] 12 | 13 | def Save(self, outputXMLPath): 14 | root = ET.Element("script") 15 | 16 | root.set("uttcount", str(len(self.m_items))) 17 | root.set("xmlns", "http://schemas.alibaba-inc.com/tts") 18 | for item in self.m_items: 19 | 
item.Save(root) 20 | 21 | xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml( 22 | indent=" ", encoding="utf-8" 23 | ) 24 | with open(outputXMLPath, "wb") as f: 25 | f.write(xmlstr) 26 | 27 | def SaveMetafile(self): 28 | meta_lines = [] 29 | 30 | for item in self.m_items: 31 | meta_lines.append(item.SaveMetafile()) 32 | 33 | return meta_lines 34 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/ScriptItem.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | 3 | from .XmlObj import XmlObj 4 | 5 | 6 | class ScriptItem(XmlObj): 7 | def __init__(self, phoneset, posset): 8 | if phoneset is None or posset is None: 9 | raise Exception("ScriptItem.__init__: phoneset or posset is None") 10 | self.m_phoneset = phoneset 11 | self.m_posset = posset 12 | 13 | self.m_id = None 14 | self.m_text = "" 15 | self.m_scriptSentence_list = [] 16 | self.m_status = None 17 | 18 | def Load(self): 19 | pass 20 | 21 | def Save(self, parent_node): 22 | utterance_node = ET.SubElement(parent_node, "utterance") 23 | utterance_node.set("id", self.m_id) 24 | 25 | text_node = ET.SubElement(utterance_node, "text") 26 | text_node.text = self.m_text 27 | 28 | for sentence in self.m_scriptSentence_list: 29 | sentence.Save(utterance_node) 30 | 31 | def SaveMetafile(self): 32 | meta_line = self.m_id + "\t" 33 | 34 | for sentence in self.m_scriptSentence_list: 35 | meta_line += sentence.SaveMetafile() 36 | 37 | return meta_line 38 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/ScriptSentence.py: -------------------------------------------------------------------------------- 1 | from .XmlObj import XmlObj 2 | 3 | import xml.etree.ElementTree as ET 4 | 5 | 6 | # TODO(jin): Not referenced, temporarily commented 7 | class WrittenSentence(XmlObj): 8 | def __init__(self, posset): 9 | self.m_written_word_list = [] 10 | self.m_written_mark_list = [] 11 | self.m_posset = posset 12 | self.m_align_list = [] 13 | self.m_alignCursor = 0 14 | self.m_accompanyIndex = 0 15 | self.m_sequence = "" 16 | self.m_text = "" 17 | 18 | def AddHost(self, writtenWord): 19 | self.m_written_word_list.append(writtenWord) 20 | self.m_align_list.append(self.m_alignCursor) 21 | 22 | def LoadHost(self): 23 | pass 24 | 25 | def SaveHost(self): 26 | pass 27 | 28 | def AddAccompany(self, writtenMark): 29 | self.m_written_mark_list.append(writtenMark) 30 | self.m_alignCursor += 1 31 | self.m_accompanyIndex += 1 32 | 33 | def SaveAccompany(self): 34 | pass 35 | 36 | def LoadAccompany(self): 37 | pass 38 | 39 | # Get the mark span corresponding to specific spoken word 40 | def GetAccompanySpan(self, host_index): 41 | if host_index == -1: 42 | return (0, self.m_align_list[0]) 43 | 44 | accompany_begin = self.m_align_list[host_index] 45 | accompany_end = ( 46 | self.m_align_list[host_index + 1] 47 | if host_index + 1 < len(self.m_written_word_list) 48 | else len(self.m_written_mark_list) 49 | ) 50 | 51 | return (accompany_begin, accompany_end) 52 | 53 | # TODO: iterable 54 | def GetElements(self): 55 | accompany_begin, accompany_end = self.GetAccompanySpan(-1) 56 | res_lst = [ 57 | self.m_written_mark_list[i] for i in range(accompany_begin, accompany_end) 58 | ] 59 | 60 | for j in range(len(self.m_written_word_list)): 61 | accompany_begin, accompany_end = self.GetAccompanySpan(j) 62 | res_lst.extend([self.m_written_word_list[j]]) 63 | 
res_lst.extend( 64 | [ 65 | self.m_written_mark_list[i] 66 | for i in range(accompany_begin, accompany_end) 67 | ] 68 | ) 69 | 70 | return res_lst 71 | 72 | def BuildSequence(self): 73 | self.m_sequence = " ".join([str(ele) for ele in self.GetElements()]) 74 | 75 | def BuildText(self): 76 | self.m_text = "".join([str(ele) for ele in self.GetElements()]) 77 | 78 | 79 | class SpokenSentence(XmlObj): 80 | def __init__(self, phoneset): 81 | self.m_spoken_word_list = [] 82 | self.m_spoken_mark_list = [] 83 | self.m_phoneset = phoneset 84 | self.m_align_list = [] 85 | self.m_alignCursor = 0 86 | self.m_accompanyIndex = 0 87 | self.m_sequence = "" 88 | self.m_text = "" 89 | 90 | def __len__(self): 91 | return len(self.m_spoken_word_list) 92 | 93 | def AddHost(self, spokenWord): 94 | self.m_spoken_word_list.append(spokenWord) 95 | self.m_align_list.append(self.m_alignCursor) 96 | 97 | def SaveHost(self): 98 | pass 99 | 100 | def LoadHost(self): 101 | pass 102 | 103 | def AddAccompany(self, spokenMark): 104 | self.m_spoken_mark_list.append(spokenMark) 105 | self.m_alignCursor += 1 106 | self.m_accompanyIndex += 1 107 | 108 | def SaveAccompany(self): 109 | pass 110 | 111 | # Get the mark span corresponding to specific spoken word 112 | def GetAccompanySpan(self, host_index): 113 | if host_index == -1: 114 | return (0, self.m_align_list[0]) 115 | 116 | accompany_begin = self.m_align_list[host_index] 117 | accompany_end = ( 118 | self.m_align_list[host_index + 1] 119 | if host_index + 1 < len(self.m_spoken_word_list) 120 | else len(self.m_spoken_mark_list) 121 | ) 122 | 123 | return (accompany_begin, accompany_end) 124 | 125 | # TODO: iterable 126 | def GetElements(self): 127 | accompany_begin, accompany_end = self.GetAccompanySpan(-1) 128 | res_lst = [ 129 | self.m_spoken_mark_list[i] for i in range(accompany_begin, accompany_end) 130 | ] 131 | 132 | for j in range(len(self.m_spoken_word_list)): 133 | accompany_begin, accompany_end = self.GetAccompanySpan(j) 134 | res_lst.extend([self.m_spoken_word_list[j]]) 135 | res_lst.extend( 136 | [ 137 | self.m_spoken_mark_list[i] 138 | for i in range(accompany_begin, accompany_end) 139 | ] 140 | ) 141 | 142 | return res_lst 143 | 144 | def LoadAccompany(self): 145 | pass 146 | 147 | def BuildSequence(self): 148 | self.m_sequence = " ".join([str(ele) for ele in self.GetElements()]) 149 | 150 | def BuildText(self): 151 | self.m_text = "".join([str(ele) for ele in self.GetElements()]) 152 | 153 | def Save(self, parent_node): 154 | spoken_node = ET.SubElement(parent_node, "spoken") 155 | spoken_node.set("wordcount", str(len(self.m_spoken_word_list))) 156 | 157 | text_node = ET.SubElement(spoken_node, "text") 158 | text_node.text = self.m_sequence 159 | 160 | # TODO: spoken mark might be used 161 | for word in self.m_spoken_word_list: 162 | word.Save(spoken_node) 163 | 164 | def SaveMetafile(self): 165 | meta_line_list = [word.SaveMetafile() for word in self.m_spoken_word_list] 166 | 167 | return " ".join(meta_line_list) 168 | 169 | 170 | class ScriptSentence(XmlObj): 171 | def __init__(self, phoneset, posset): 172 | self.m_phoneset = phoneset 173 | self.m_posset = posset 174 | self.m_writtenSentence = WrittenSentence(posset) 175 | self.m_spokenSentence = SpokenSentence(phoneset) 176 | self.m_text = "" 177 | 178 | def Save(self, parent_node): 179 | if len(self.m_spokenSentence) > 0: 180 | self.m_spokenSentence.Save(parent_node) 181 | 182 | def SaveMetafile(self): 183 | if len(self.m_spokenSentence) > 0: 184 | return self.m_spokenSentence.SaveMetafile() 185 | else: 
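            # Annotation (not in the original source): a sentence with no spoken
            # words contributes an empty string here, so ScriptItem.SaveMetafile
            # emits only the utterance id followed by a tab for such items.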
186 | return "" 187 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/ScriptWord.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | 3 | from .XmlObj import XmlObj 4 | from .core_types import Language 5 | from .Syllable import SyllableList 6 | 7 | 8 | # TODO(Jin): Not referenced, temporarily commented 9 | class WrittenWord(XmlObj): 10 | def __init__(self): 11 | self.m_name = None 12 | self.m_POS = None 13 | 14 | def __str__(self): 15 | return self.m_name 16 | 17 | def Load(self): 18 | pass 19 | 20 | def Save(self): 21 | pass 22 | 23 | 24 | class WrittenMark(XmlObj): 25 | def __init__(self): 26 | self.m_punctuation = None 27 | 28 | def __str__(self): 29 | return self.m_punctuation 30 | 31 | def Load(self): 32 | pass 33 | 34 | def Save(self): 35 | pass 36 | 37 | 38 | class SpokenWord(XmlObj): 39 | def __init__(self): 40 | self.m_name = None 41 | self.m_language = None 42 | self.m_syllable_list = [] 43 | self.m_breakText = "1" 44 | self.m_POS = "0" 45 | 46 | def __str__(self): 47 | return self.m_name 48 | 49 | def Load(self): 50 | pass 51 | 52 | def Save(self, parent_node): 53 | 54 | word_node = ET.SubElement(parent_node, "word") 55 | 56 | name_node = ET.SubElement(word_node, "name") 57 | name_node.text = self.m_name 58 | 59 | if ( 60 | len(self.m_syllable_list) > 0 61 | and self.m_syllable_list[0].m_language != Language.Neutral 62 | ): 63 | language_node = ET.SubElement(word_node, "lang") 64 | language_node.text = self.m_syllable_list[0].m_language.name 65 | 66 | SyllableList(self.m_syllable_list).Save(word_node) 67 | 68 | break_node = ET.SubElement(word_node, "break") 69 | break_node.text = self.m_breakText 70 | 71 | POS_node = ET.SubElement(word_node, "POS") 72 | POS_node.text = self.m_POS 73 | 74 | return 75 | 76 | def SaveMetafile(self): 77 | word_phone_cnt = sum( 78 | [syllable.PhoneCount() for syllable in self.m_syllable_list] 79 | ) 80 | word_syllable_cnt = len(self.m_syllable_list) 81 | single_syllable_word = word_syllable_cnt == 1 82 | meta_line_list = [] 83 | 84 | for idx, syll in enumerate(self.m_syllable_list): 85 | if word_phone_cnt == 1: 86 | word_pos = "word_both" 87 | elif idx == 0: 88 | word_pos = "word_begin" 89 | elif idx == len(self.m_syllable_list) - 1: 90 | word_pos = "word_end" 91 | else: 92 | word_pos = "word_middle" 93 | meta_line_list.append( 94 | syll.SaveMetafile(word_pos, single_syllable_word=single_syllable_word) 95 | ) 96 | 97 | if self.m_breakText != "0" and self.m_breakText is not None: 98 | meta_line_list.append( 99 | "{{#{}$tone_none$s_none$word_none}}".format(self.m_breakText) 100 | ) 101 | 102 | return " ".join(meta_line_list) 103 | 104 | 105 | class SpokenMark(XmlObj): 106 | def __init__(self): 107 | self.m_breakLevel = None 108 | 109 | def BreakLevel2Text(self): 110 | return "#" + str(self.m_breakLevel.value) 111 | 112 | def __str__(self): 113 | return self.BreakLevel2Text() 114 | 115 | def Load(self): 116 | pass 117 | 118 | def Save(self): 119 | pass 120 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/Syllable.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | 3 | from .XmlObj import XmlObj 4 | 5 | 6 | class Syllable(XmlObj): 7 | def __init__(self): 8 | self.m_phone_list = [] 9 | self.m_tone = None 10 | self.m_language = None 11 | self.m_breaklevel = None 
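        # Annotation (not in the original source): SaveMetafile below serializes
        # each phone as "{name$tone<T>$<syllable_pos>$<word_pos>}"; e.g. a
        # single-phone first syllable "a" with tone 1 in a multi-syllable word is
        # rendered as "{a$tone1$s_both$word_begin}".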
12 | 13 | def PronunciationText(self): 14 | return " ".join([str(phone) for phone in self.m_phone_list]) 15 | 16 | def PhoneCount(self): 17 | return len(self.m_phone_list) 18 | 19 | def ToneText(self): 20 | return str(self.m_tone.value) 21 | 22 | def Save(self): 23 | pass 24 | 25 | def Load(self): 26 | pass 27 | 28 | def GetPhoneMeta( 29 | self, phone_name, word_pos, syll_pos, tone_text, single_syllable_word=False 30 | ): 31 | # Special case: word with single syllable, the last phone's word_pos should be "word_end" 32 | if word_pos == "word_begin" and syll_pos == "s_end" and single_syllable_word: 33 | word_pos = "word_end" 34 | elif word_pos == "word_begin" and syll_pos not in [ 35 | "s_begin", 36 | "s_both", 37 | ]: # FIXME: keep accord with Engine logic 38 | word_pos = "word_middle" 39 | elif word_pos == "word_end" and syll_pos not in ["s_end", "s_both"]: 40 | word_pos = "word_middle" 41 | else: 42 | pass 43 | 44 | return "{{{}$tone{}${}${}}}".format(phone_name, tone_text, syll_pos, word_pos) 45 | 46 | def SaveMetafile(self, word_pos, single_syllable_word=False): 47 | syllable_phone_cnt = len(self.m_phone_list) 48 | 49 | meta_line_list = [] 50 | 51 | for idx, phone in enumerate(self.m_phone_list): 52 | if syllable_phone_cnt == 1: 53 | syll_pos = "s_both" 54 | elif idx == 0: 55 | syll_pos = "s_begin" 56 | elif idx == len(self.m_phone_list) - 1: 57 | syll_pos = "s_end" 58 | else: 59 | syll_pos = "s_middle" 60 | meta_line_list.append( 61 | self.GetPhoneMeta( 62 | phone, 63 | word_pos, 64 | syll_pos, 65 | self.ToneText(), 66 | single_syllable_word=single_syllable_word, 67 | ) 68 | ) 69 | 70 | return " ".join(meta_line_list) 71 | 72 | 73 | class SyllableList(XmlObj): 74 | def __init__(self, syllables): 75 | self.m_syllable_list = syllables 76 | 77 | def __len__(self): 78 | return len(self.m_syllable_list) 79 | 80 | def __index__(self, index): 81 | return self.m_syllable_list[index] 82 | 83 | def PronunciationText(self): 84 | return " - ".join( 85 | [syllable.PronunciationText() for syllable in self.m_syllable_list] 86 | ) 87 | 88 | def ToneText(self): 89 | return "".join([syllable.ToneText() for syllable in self.m_syllable_list]) 90 | 91 | def Save(self, parent_node): 92 | syllable_node = ET.SubElement(parent_node, "syllable") 93 | syllable_node.set("syllcount", str(len(self.m_syllable_list))) 94 | 95 | phone_node = ET.SubElement(syllable_node, "phone") 96 | phone_node.text = self.PronunciationText() 97 | 98 | tone_node = ET.SubElement(syllable_node, "tone") 99 | tone_node.text = self.ToneText() 100 | 101 | return 102 | 103 | def Load(self): 104 | pass 105 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/XmlObj.py: -------------------------------------------------------------------------------- 1 | class XmlObj: 2 | def __init__(self): 3 | pass 4 | 5 | def Load(self): 6 | pass 7 | 8 | def Save(self): 9 | pass 10 | 11 | def LoadData(self): 12 | pass 13 | 14 | def SaveData(self): 15 | pass 16 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/script_convertor/core/__init__.py -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/utils.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | import codecs 4 | 5 | WordPattern = r"((?P\w+)(\(\w+\))?)" 6 | BreakPattern = r"(?P(\*?#(?P[0-4])))" 7 | MarkPattern = r"(?P[、,。!?:“”《》·])" 8 | POSPattern = r"(?P(\*?\|(?P[1-9])))" 9 | PhraseTonePattern = r"(?P(\*?%([L|H])))" 10 | 11 | NgBreakPattern = r"^ng(?P\d)" 12 | 13 | 14 | RegexWord = re.compile(WordPattern + r"\s*") 15 | RegexBreak = re.compile(BreakPattern + r"\s*") 16 | RegexID = re.compile(r"^(?P.*?)\s") 17 | RegexSentence = re.compile( 18 | r"({}|{}|{}|{}|{})\s*".format( 19 | WordPattern, BreakPattern, MarkPattern, POSPattern, PhraseTonePattern 20 | ) 21 | ) 22 | RegexForeignLang = re.compile(r"[A-Z@]") 23 | RegexSpace = re.compile(r"^\s*") 24 | RegexNeutralTone = re.compile(r"[1-5]5") 25 | 26 | 27 | def do_character_normalization(line): 28 | return unicodedata.normalize("NFKC", line) 29 | 30 | 31 | def do_prosody_text_normalization(line): 32 | tokens = line.split("\t") 33 | text = tokens[1] 34 | # Remove punctuations 35 | text = text.replace(u"。", " ") 36 | text = text.replace(u"、", " ") 37 | text = text.replace(u"“", " ") 38 | text = text.replace(u"”", " ") 39 | text = text.replace(u"‘", " ") 40 | text = text.replace(u"’", " ") 41 | text = text.replace(u"|", " ") 42 | text = text.replace(u"《", " ") 43 | text = text.replace(u"》", " ") 44 | text = text.replace(u"【", " ") 45 | text = text.replace(u"】", " ") 46 | text = text.replace(u"—", " ") 47 | text = text.replace(u"―", " ") 48 | text = text.replace(".", " ") 49 | text = text.replace("!", " ") 50 | text = text.replace("?", " ") 51 | text = text.replace("(", " ") 52 | text = text.replace(")", " ") 53 | text = text.replace("[", " ") 54 | text = text.replace("]", " ") 55 | text = text.replace("{", " ") 56 | text = text.replace("}", " ") 57 | text = text.replace("~", " ") 58 | text = text.replace(":", " ") 59 | text = text.replace(";", " ") 60 | text = text.replace("+", " ") 61 | text = text.replace(",", " ") 62 | # text = text.replace('·', ' ') 63 | text = text.replace('"', " ") 64 | text = text.replace( 65 | "-", "" 66 | ) # don't replace by space because compond word like two-year-old 67 | text = text.replace( 68 | "'", "" 69 | ) # don't replace by space because English word like that's 70 | 71 | # Replace break 72 | text = text.replace("/", "#2") 73 | text = text.replace("%", "#3") 74 | # Remove useless spaces surround #2 #3 #4 75 | text = re.sub(r"(#\d)[ ]+", r"\1", text) 76 | text = re.sub(r"[ ]+(#\d)", r"\1", text) 77 | # Replace space by #1 78 | text = re.sub("[ ]+", "#1", text) 79 | 80 | # Remove break at the end of the text 81 | text = re.sub(r"#\d$", "", text) 82 | 83 | # Add #1 between target language and foreign language 84 | text = re.sub(r"([a-zA-Z])([^a-zA-Z\d\#\s\'\%\/\-])", r"\1#1\2", text) 85 | text = re.sub(r"([^a-zA-Z\d\#\s\'\%\/\-])([a-zA-Z])", r"\1#1\2", text) 86 | 87 | return tokens[0] + "\t" + text 88 | 89 | 90 | def is_fp_line(line): 91 | fp_category_list = ["FP", "I", "N", "Q"] 92 | elements = line.strip().split(" ") 93 | res = True 94 | for ele in elements: 95 | if ele not in fp_category_list: 96 | res = False 97 | break 98 | return res 99 | 100 | 101 | def format_prosody(src_prosody): 102 | formatted_lines = [] 103 | with codecs.open(src_prosody, "r", "utf-8") as f: 104 | lines = f.readlines() 105 | 106 | idx = 0 107 | while idx < len(lines): 108 | line = do_character_normalization(lines[idx]) 109 | 110 | if len(line.strip().split("\t")) == 2: 111 | line = 
do_prosody_text_normalization(line) 112 | else: 113 | fp_enable = is_fp_line(line) 114 | if fp_enable: 115 | idx += 3 116 | continue 117 | formatted_lines.append(line) 118 | idx += 1 119 | # with codecs.open(tgt_prosody, 'w', 'utf-8') as f: 120 | # f.writelines(formatted_lines) 121 | return formatted_lines 122 | -------------------------------------------------------------------------------- /kantts/preprocess/se_processor/D_TDNN.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from torch import nn 5 | import torch.nn.functional as F 6 | 7 | from .layers import (DenseLayer, DenseTDNNBlock, StatsPool, TDNNLayer, SEDenseTDNNBlock, 8 | TransitLayer) 9 | 10 | class BasicBlock(nn.Module): 11 | expansion = 1 12 | 13 | def __init__(self, in_planes, planes, stride=1): 14 | super(BasicBlock, self).__init__() 15 | self.conv1 = nn.Conv2d(in_planes, 16 | planes, 17 | kernel_size=3, 18 | stride=(stride, 1), 19 | padding=1, 20 | bias=False) 21 | self.bn1 = nn.BatchNorm2d(planes) 22 | self.conv2 = nn.Conv2d(planes, 23 | planes, 24 | kernel_size=3, 25 | stride=1, 26 | padding=1, 27 | bias=False) 28 | self.bn2 = nn.BatchNorm2d(planes) 29 | 30 | self.shortcut = nn.Sequential() 31 | if stride != 1 or in_planes != self.expansion * planes: 32 | self.shortcut = nn.Sequential( 33 | nn.Conv2d(in_planes, 34 | self.expansion * planes, 35 | kernel_size=1, 36 | stride=(stride, 1), 37 | bias=False), 38 | nn.BatchNorm2d(self.expansion * planes)) 39 | 40 | def forward(self, x): 41 | out = F.relu(self.bn1(self.conv1(x))) 42 | out = self.bn2(self.conv2(out)) 43 | out += self.shortcut(x) 44 | out = F.relu(out) 45 | return out 46 | 47 | class CNN_Head(nn.Module): 48 | def __init__(self, 49 | block=BasicBlock, 50 | num_blocks=[2, 2], 51 | m_channels=32, 52 | feat_dim=80): 53 | super(CNN_Head, self).__init__() 54 | self.in_planes = m_channels 55 | self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False) 56 | self.bn1 = nn.BatchNorm2d(m_channels) 57 | 58 | self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=2) 59 | self.layer2 = self._make_layer(block, m_channels, num_blocks[0], stride=2) 60 | 61 | self.conv2 = nn.Conv2d(m_channels, m_channels, kernel_size=3, stride=(2, 1), padding=1, bias=False) 62 | self.bn2 = nn.BatchNorm2d(m_channels) 63 | self.out_channels = m_channels * (feat_dim // 8) 64 | 65 | def _make_layer(self, block, planes, num_blocks, stride): 66 | strides = [stride] + [1] * (num_blocks - 1) 67 | layers = [] 68 | for stride in strides: 69 | layers.append(block(self.in_planes, planes, stride)) 70 | self.in_planes = planes * block.expansion 71 | return nn.Sequential(*layers) 72 | 73 | def forward(self, x): 74 | x = x.unsqueeze_(1) 75 | out = F.relu(self.bn1(self.conv1(x))) 76 | out = self.layer1(out) 77 | out = self.layer2(out) 78 | out = F.relu(self.bn2(self.conv2(out))) 79 | 80 | out = out.reshape(out.shape[0], out.shape[1]*out.shape[2], out.shape[3]) 81 | return out 82 | 83 | class DTDNN(nn.Module): 84 | def __init__(self, 85 | feat_dim=80, 86 | embedding_size=192, 87 | growth_rate=32, 88 | bn_size=4, 89 | init_channels=128, 90 | config_str='batchnorm-relu', 91 | memory_efficient=True): 92 | super(DTDNN, self).__init__() 93 | 94 | self.head = CNN_Head() 95 | feat_dim = self.head.out_channels 96 | 97 | self.xvector = nn.Sequential( 98 | OrderedDict([ 99 | ('tdnn', 100 | TDNNLayer(feat_dim, 101 | init_channels, 102 | 5, 103 | stride=2, 104 | dilation=1, 105 | 
padding=-1, 106 | config_str=config_str)), 107 | ])) 108 | channels = init_channels 109 | for i, (num_layers, kernel_size, 110 | dilation) in enumerate(zip((12, 24, 16), (3, 3, 3), (1, 2, 3))): 111 | block = SEDenseTDNNBlock(num_layers=num_layers, 112 | in_channels=channels, 113 | out_channels=growth_rate, 114 | bn_channels=bn_size * growth_rate, 115 | kernel_size=kernel_size, 116 | dilation=dilation, 117 | config_str=config_str, 118 | memory_efficient=memory_efficient) 119 | self.xvector.add_module('block%d' % (i + 1), block) 120 | channels = channels + num_layers * growth_rate 121 | self.xvector.add_module( 122 | 'transit%d' % (i + 1), 123 | TransitLayer(channels, 124 | channels // 2, 125 | bias=False, 126 | config_str=config_str)) 127 | channels //= 2 128 | 129 | self.bn = nn.BatchNorm1d(channels) 130 | self.relu = nn.ReLU(inplace=True) 131 | 132 | self.xvector.add_module('stats', StatsPool()) 133 | self.xvector.add_module( 134 | 'dense', 135 | DenseLayer(channels * 2, embedding_size, config_str='batchnorm_')) 136 | 137 | for m in self.modules(): 138 | if isinstance(m, (nn.Conv1d, nn.Linear)): 139 | nn.init.kaiming_normal_(m.weight.data) 140 | if m.bias is not None: 141 | nn.init.zeros_(m.bias) 142 | 143 | def forward(self, x): 144 | x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T) 145 | x = self.head(x) 146 | x = self.xvector.tdnn(x) 147 | 148 | x = self.xvector.block1(x) 149 | x = self.xvector.transit1(x) 150 | 151 | x = self.xvector.block2(x) 152 | x = self.xvector.transit2(x) 153 | 154 | x = self.xvector.block3(x) 155 | x = self.xvector.transit3(x) 156 | x = self.relu(self.bn(x)) 157 | 158 | x = self.xvector.stats(x) 159 | x = self.xvector.dense(x) 160 | return x 161 | 162 | -------------------------------------------------------------------------------- /kantts/preprocess/se_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/se_processor/__init__.py -------------------------------------------------------------------------------- /kantts/preprocess/se_processor/se_processor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchaudio 3 | import numpy as np 4 | import os 5 | import torchaudio.compliance.kaldi as Kaldi 6 | from .D_TDNN import DTDNN 7 | import logging 8 | import argparse 9 | from glob import glob 10 | 11 | 12 | logging.basicConfig( 13 | format="%(asctime)s %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s", 14 | datefmt="%Y-%m-%d:%H:%M:%S", 15 | level=logging.DEBUG, 16 | ) 17 | 18 | class SpeakerEmbeddingProcessor: 19 | def __init__(self, sample_rate=16000): 20 | self.sample_rate = sample_rate 21 | self.min_wav_length = self.sample_rate * 30 * 10 / 1000 22 | 23 | self.pcm_dict = {} 24 | self.mfcc_dict = {} 25 | self.se_list = [] 26 | 27 | def process(self, src_voice_dir, se_model): 28 | logging.info("[SpeakerEmbeddingProcessor] Speaker embedding extractor started") 29 | 30 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 31 | model = DTDNN() 32 | try: 33 | if os.path.basename(se_model) == "se.model": 34 | model.load_state_dict(torch.load(se_model, map_location=device)) 35 | else: 36 | raise Exception("[SpeakerEmbeddingProcessor] se model loading error!!!") 37 | except Exception as e: 38 | logging.info(e) 39 | if os.path.basename(se_model) == 'se.onnx': 40 | logging.info("[SpeakerEmbeddingProcessor] please update your se 
model to ensure that the version is greater than or equal to 1.0.5") 41 | raise SystemExit(1) # fix: "sys" is never imported in this module, so sys.exit() raised a NameError 42 | model.eval() 43 | model.to(device) 44 | 45 | wav_dir = os.path.join(src_voice_dir, "wav") 46 | se_dir = os.path.join(src_voice_dir, "se") 47 | se_average_file = os.path.join(se_dir, "se.npy") 48 | 49 | os.makedirs(se_dir, exist_ok=True) 50 | 51 | wav_files = glob(os.path.join(wav_dir, '*.wav')) 52 | 53 | 54 | for wav_file in wav_files: 55 | basename = os.path.splitext(os.path.basename(wav_file))[0] 56 | se_file = os.path.join(se_dir, basename + '.npy') 57 | 58 | wav, fs = torchaudio.load(wav_file) 59 | assert wav.shape[0] == 1 60 | assert fs == 16000 61 | 62 | if wav.shape[1] < self.min_wav_length: 63 | continue 64 | 65 | fbank_feat = Kaldi.fbank(wav, num_mel_bins=80) 66 | 67 | feat = fbank_feat - fbank_feat.mean(dim=0, keepdim=True) 68 | feat = feat.unsqueeze(0).to(device) 69 | 70 | speaker_embedding = model(feat) 71 | speaker_embedding = speaker_embedding.squeeze().cpu().detach().numpy() 72 | speaker_embedding = np.expand_dims(speaker_embedding, axis=0) 73 | 74 | 75 | np.save(se_file, speaker_embedding) 76 | self.se_list.append(speaker_embedding) 77 | self.se_average = np.expand_dims( 78 | np.mean( 79 | np.concatenate(self.se_list, axis=0), 80 | axis=0 81 | ), 82 | axis=0 83 | ) 84 | np.save(se_average_file, self.se_average) 85 | 86 | logging.info("[SpeakerEmbeddingProcessor] Speaker embedding extracted successfully!") 87 | 88 | 89 | if __name__ == '__main__': 90 | parser = argparse.ArgumentParser(description="Speaker Embedding Processor") 91 | parser.add_argument("--src_voice_dir", type=str, required=True) 92 | parser.add_argument('--se_model', required=True) 93 | args = parser.parse_args() 94 | 95 | sep = SpeakerEmbeddingProcessor() 96 | sep.process(args.src_voice_dir, args.se_model) # fix: the parser defines --se_model; "args.se_onnx" did not exist -------------------------------------------------------------------------------- /kantts/preprocess/text_process.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import argparse 5 | import yaml 6 | import time 7 | import zipfile 8 | 9 | ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # NOQA: E402 10 | sys.path.insert(0, os.path.dirname(ROOT_PATH)) # NOQA: E402 11 | 12 | try: 13 | from kantts.datasets.dataset import BERT_Text_Dataset 14 | from kantts.utils.log import logging_to_file, get_git_revision_hash 15 | from kantts.utils.ling_unit import text_to_mit_symbols as text_to_symbols 16 | except ImportError: 17 | raise ImportError("Please install kantts.") 18 | 19 | logging.basicConfig( 20 | format="%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", 21 | datefmt="%Y-%m-%d:%H:%M:%S", 22 | level=logging.INFO, 23 | ) 24 | 25 | 26 | def gen_metafile( 27 | output_dir, 28 | split_ratio=0.98, 29 | ): 30 | raw_metafile = os.path.join(output_dir, "raw_metafile.txt") 31 | bert_train_meta = os.path.join(output_dir, "bert_train.lst") 32 | bert_valid_meta = os.path.join(output_dir, "bert_valid.lst") 33 | if not os.path.exists( 34 | bert_train_meta) or not os.path.exists(bert_valid_meta): 35 | BERT_Text_Dataset.gen_metafile(raw_metafile, output_dir, split_ratio) 36 | logging.info("BERT Text metafile generated.") 37 | 38 | # TODO: Zh-CN as default 39 | def process_mit_style_data( 40 | text_file, 41 | resources_zip_file, 42 | output_dir, 43 | ): 44 | os.makedirs(output_dir, exist_ok=True) 45 | logging_to_file(os.path.join(output_dir, "data_process_stdout.log")) 46 | 47 | resource_root_dir =
os.path.dirname(resources_zip_file) 48 | resource_dir = os.path.join(resource_root_dir, "resource") 49 | 50 | if not os.path.exists(resource_dir): 51 | logging.info("Extracting resources...") 52 | with zipfile.ZipFile(resources_zip_file, "r") as zip_ref: 53 | zip_ref.extractall(resource_root_dir) 54 | 55 | with open(text_file, "r") as text_data: 56 | texts = text_data.readlines() 57 | 58 | logging.info("Converting text to symbols...") 59 | symbols_lst = text_to_symbols(texts, resource_dir, "F7") 60 | symbols_file = os.path.join(output_dir, "raw_metafile.txt") 61 | with open(symbols_file, "w") as symbol_data: 62 | for symbol in symbols_lst: 63 | symbol_data.write(symbol) 64 | 65 | logging.info("Processing done.") 66 | 67 | # Generate BERT Text metafile 68 | # TODO: train/valid ratio setting 69 | gen_metafile(output_dir) 70 | 71 | 72 | if __name__ == "__main__": 73 | parser = argparse.ArgumentParser(description="Dataset preprocessor") 74 | parser.add_argument("--text_file", type=str, required=True) 75 | parser.add_argument("--resources_zip_file", type=str, required=True) 76 | parser.add_argument("--output_dir", type=str, required=True) 77 | 78 | args = parser.parse_args() 79 | 80 | process_mit_style_data( 81 | args.text_file, 82 | args.resources_zip_file, 83 | args.output_dir, 84 | ) 85 | 86 | -------------------------------------------------------------------------------- /kantts/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/train/__init__.py -------------------------------------------------------------------------------- /kantts/train/scheduler.py: -------------------------------------------------------------------------------- 1 | from torch.optim.lr_scheduler import * # NOQA 2 | from torch.optim.lr_scheduler import _LRScheduler # NOQA 3 | 4 | """Noam Scheduler.""" 5 | 6 | 7 | class FindLR(_LRScheduler): 8 | """ 9 | inspired by fast.ai @https://sgugger.github.io/how-do-you-find-a-good-learning-rate.html 10 | """ 11 | 12 | def __init__(self, optimizer, max_steps, max_lr=10): 13 | self.max_steps = max_steps 14 | self.max_lr = max_lr 15 | super().__init__(optimizer) 16 | 17 | def get_lr(self): 18 | return [ 19 | base_lr 20 | * ((self.max_lr / base_lr) ** (self.last_epoch / (self.max_steps - 1))) 21 | for base_lr in self.base_lrs 22 | ] 23 | 24 | 25 | class NoamLR(_LRScheduler): 26 | """ 27 | Implements the Noam Learning rate schedule. This corresponds to increasing the learning rate 28 | linearly for the first ``warmup_steps`` training steps, and decreasing it thereafter proportionally 29 | to the inverse square root of the step number, scaled by the inverse square root of the 30 | dimensionality of the model. Time will tell if this is just madness or it's actually important. 31 | Parameters 32 | ---------- 33 | warmup_steps: ``int``, required. 34 | The number of steps to linearly increase the learning rate. 
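    Note (added annotation, matching get_lr below): for step t, clamped to >= 1,
        lr(t) = base_lr * warmup_steps**0.5 * min(t**-0.5, t * warmup_steps**-1.5)
    so lr rises linearly to base_lr at t = warmup_steps, then decays as t**-0.5.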
--------------------------------------------------------------------------------
/kantts/train/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/train/__init__.py
--------------------------------------------------------------------------------
/kantts/train/scheduler.py:
--------------------------------------------------------------------------------
1 | from torch.optim.lr_scheduler import *  # NOQA
2 | from torch.optim.lr_scheduler import _LRScheduler  # NOQA
3 | 
4 | """Noam Scheduler."""
5 | 
6 | 
7 | class FindLR(_LRScheduler):
8 |     """
9 |     inspired by fast.ai @https://sgugger.github.io/how-do-you-find-a-good-learning-rate.html
10 |     """
11 | 
12 |     def __init__(self, optimizer, max_steps, max_lr=10):
13 |         self.max_steps = max_steps
14 |         self.max_lr = max_lr
15 |         super().__init__(optimizer)
16 | 
17 |     def get_lr(self):
18 |         return [
19 |             base_lr
20 |             * ((self.max_lr / base_lr) ** (self.last_epoch / (self.max_steps - 1)))
21 |             for base_lr in self.base_lrs
22 |         ]
23 | 
24 | 
25 | class NoamLR(_LRScheduler):
26 |     """
27 |     Implements the Noam learning rate schedule. This corresponds to increasing the learning rate
28 |     linearly for the first ``warmup_steps`` training steps, and decreasing it thereafter proportionally
29 |     to the inverse square root of the step number, scaled by the inverse square root of the
30 |     dimensionality of the model. Time will tell if this is just madness or it's actually important.
31 |     Parameters
32 |     ----------
33 |     warmup_steps: ``int``, required.
34 |         The number of steps to linearly increase the learning rate.
35 |     """
36 | 
37 |     def __init__(self, optimizer, warmup_steps):
38 |         self.warmup_steps = warmup_steps
39 |         super().__init__(optimizer)
40 | 
41 |     def get_lr(self):
42 |         last_epoch = max(1, self.last_epoch)
43 |         scale = self.warmup_steps ** 0.5 * min(
44 |             last_epoch ** (-0.5), last_epoch * self.warmup_steps ** (-1.5)
45 |         )
46 |         return [base_lr * scale for base_lr in self.base_lrs]
47 | 
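A short worked example of the NoamLR schedule above, with illustrative hyperparameters: the scale factor at step t is `warmup_steps**0.5 * min(t**-0.5, t * warmup_steps**-1.5)`, which rises linearly, equals exactly 1.0 at `t == warmup_steps`, and decays as `t**-0.5` afterwards.

```python
# Minimal sketch of driving NoamLR; model and hyperparameters are illustrative.
import torch
from kantts.train.scheduler import NoamLR

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = NoamLR(optimizer, warmup_steps=4000)

for step in range(1, 8001):
    optimizer.step()   # actual training step elided
    scheduler.step()
    # scheduler.get_last_lr()[0] peaks at 1e-3 exactly at step 4000
```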
--------------------------------------------------------------------------------
/kantts/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/utils/__init__.py
--------------------------------------------------------------------------------
/kantts/utils/audio_torch.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import librosa
3 | from distutils.version import LooseVersion
4 | 
5 | is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion("1.7")
6 | 
7 | 
8 | def stft(x, fft_size, hop_size, win_length, window):
9 |     """Perform STFT and convert to magnitude spectrogram.
10 | 
11 |     Args:
12 |         x (Tensor): Input signal tensor (B, T).
13 |         fft_size (int): FFT size.
14 |         hop_size (int): Hop size.
15 |         win_length (int): Window length.
16 |         window (str): Window function type.
17 | 
18 |     Returns:
19 |         Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
20 | 
21 |     """
22 |     if is_pytorch_17plus:
23 |         x_stft = torch.stft(
24 |             x, fft_size, hop_size, win_length, window, return_complex=False
25 |         )
26 |     else:
27 |         x_stft = torch.stft(x, fft_size, hop_size, win_length, window)
28 |     real = x_stft[..., 0]
29 |     imag = x_stft[..., 1]
30 | 
31 |     return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1)
32 | 
33 | 
34 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
35 |     return 20 * torch.log10(torch.clamp(x, min=clip_val) * C)
36 | 
37 | 
38 | def dynamic_range_decompression_torch(x, C=1):
39 |     return torch.pow(10.0, x * 0.05) / C
40 | 
41 | 
42 | def spectral_normalize_torch(
43 |     magnitudes,
44 |     min_level_db=-100.0,
45 |     ref_level_db=20.0,
46 |     norm_abs_value=4.0,
47 |     symmetric=True,
48 | ):
49 |     output = dynamic_range_compression_torch(magnitudes) - ref_level_db
50 | 
51 |     if symmetric:
52 |         return torch.clamp(
53 |             2 * norm_abs_value * ((output - min_level_db) / (-min_level_db))
54 |             - norm_abs_value,
55 |             min=-norm_abs_value,
56 |             max=norm_abs_value,
57 |         )
58 |     else:
59 |         return torch.clamp(
60 |             norm_abs_value * ((output - min_level_db) / (-min_level_db)),
61 |             min=0.0,
62 |             max=norm_abs_value,
63 |         )
64 | 
65 | 
66 | def spectral_de_normalize_torch(
67 |     magnitudes,
68 |     min_level_db=-100.0,
69 |     ref_level_db=20.0,
70 |     norm_abs_value=4.0,
71 |     symmetric=True,
72 | ):
73 |     if symmetric:
74 |         magnitudes = torch.clamp(magnitudes, min=-norm_abs_value, max=norm_abs_value)
75 |         magnitudes = (magnitudes + norm_abs_value) * (-min_level_db) / (
76 |             2 * norm_abs_value
77 |         ) + min_level_db
78 |     else:
79 |         magnitudes = torch.clamp(magnitudes, min=0.0, max=norm_abs_value)
80 |         magnitudes = (magnitudes) * (-min_level_db) / (norm_abs_value) + min_level_db
81 | 
82 |     output = dynamic_range_decompression_torch(magnitudes + ref_level_db)
83 |     return output
84 | 
85 | 
86 | class MelSpectrogram(torch.nn.Module):
87 |     """Calculate Mel-spectrogram."""
88 | 
89 |     def __init__(
90 |         self,
91 |         fs=22050,
92 |         fft_size=1024,
93 |         hop_size=256,
94 |         win_length=None,
95 |         window="hann",
96 |         num_mels=80,
97 |         fmin=80,
98 |         fmax=7600,
99 |         center=True,
100 |         normalized=False,
101 |         onesided=True,
102 |         eps=1e-10,
103 |         log_base=10.0,
104 |         pad_mode="constant",
105 |     ):
106 |         """Initialize MelSpectrogram module."""
107 |         super().__init__()
108 |         self.fft_size = fft_size
109 |         if win_length is None:
110 |             self.win_length = fft_size
111 |         else:
112 |             self.win_length = win_length
113 |         self.hop_size = hop_size
114 |         self.center = center
115 |         self.normalized = normalized
116 |         self.onesided = onesided
117 |         if window is not None and not hasattr(torch, f"{window}_window"):
118 |             raise ValueError(f"{window} window is not implemented")
119 |         self.window = window
120 |         self.eps = eps
121 |         self.pad_mode = pad_mode
122 | 
123 |         fmin = 0 if fmin is None else fmin
124 |         fmax = fs / 2 if fmax is None else fmax
125 |         melmat = librosa.filters.mel(
126 |             sr=fs,
127 |             n_fft=fft_size,
128 |             n_mels=num_mels,
129 |             fmin=fmin,
130 |             fmax=fmax,
131 |         )
132 |         self.register_buffer("melmat", torch.from_numpy(melmat.T).float())
133 |         self.stft_params = {
134 |             "n_fft": self.fft_size,
135 |             "win_length": self.win_length,
136 |             "hop_length": self.hop_size,
137 |             "center": self.center,
138 |             "normalized": self.normalized,
139 |             "onesided": self.onesided,
140 |             "pad_mode": self.pad_mode,
141 |         }
142 |         if is_pytorch_17plus:
143 |             self.stft_params["return_complex"] = False
144 | 
145 |         self.log_base = log_base
146 |         if self.log_base is None:
147 |             self.log = torch.log
148 |         elif self.log_base == 2.0:
149 |             self.log = torch.log2
150 |         elif self.log_base == 10.0:
151 |             self.log = torch.log10
152 |         else:
153 |             raise ValueError(f"log_base: {log_base} is not supported.")
154 | 
155 |     def forward(self, x):
156 |         """Calculate Mel-spectrogram.
157 | 
158 |         Args:
159 |             x (Tensor): Input waveform tensor (B, T) or (B, 1, T).
160 | 
161 |         Returns:
162 |             Tensor: Mel-spectrogram (B, #mels, #frames).
163 | 
164 |         """
165 |         if x.dim() == 3:
166 |             # (B, C, T) -> (B*C, T)
167 |             x = x.reshape(-1, x.size(2))
168 | 
169 |         if self.window is not None:
170 |             window_func = getattr(torch, f"{self.window}_window")
171 |             window = window_func(self.win_length, dtype=x.dtype, device=x.device)
172 |         else:
173 |             window = None
174 | 
175 |         x_stft = torch.stft(x, window=window, **self.stft_params)
176 |         # (B, #freqs, #frames, 2) -> (B, #frames, #freqs, 2)
177 |         x_stft = x_stft.transpose(1, 2)
178 |         x_power = x_stft[..., 0] ** 2 + x_stft[..., 1] ** 2
179 |         x_amp = torch.sqrt(torch.clamp(x_power, min=self.eps))
180 | 
181 |         x_mel = torch.matmul(x_amp, self.melmat)
182 |         x_mel = torch.clamp(x_mel, min=self.eps)
183 |         x_mel = spectral_normalize_torch(x_mel)
184 | 
185 |         # return self.log(x_mel).transpose(1, 2)
186 |         return x_mel.transpose(1, 2)
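A hedged usage sketch for the MelSpectrogram module above. The 24 kHz settings are illustrative, and the non-complex STFT path assumes the torch version this repo pins; input is (B, T) or (B, 1, T), output is (B, num_mels, #frames).

```python
# Illustrative parameters; run under the torch/librosa versions this repo expects.
import torch
from kantts.utils.audio_torch import MelSpectrogram

mel_fn = MelSpectrogram(fs=24000, fft_size=2048, hop_size=300, num_mels=80)
wav = torch.randn(1, 24000)   # one second of placeholder audio
mel = mel_fn(wav)             # -> (1, 80, #frames), ~81 frames at hop_size=300
print(mel.shape)
```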
--------------------------------------------------------------------------------
/kantts/utils/ling_unit/__init__.py:
--------------------------------------------------------------------------------
1 | import ttsfrd
2 | 
3 | ENG_LANG_MAPPING = {
4 |     "PinYin": "zh-cn",
5 |     "English": "en-us",
6 |     "British": "en-gb",
7 |     "ZhHK": "hk_cantonese",
8 |     "Sichuan": "sichuan",
9 |     "Japanese": "japanese",
10 |     "WuuShangHai": "shanghai",
11 |     "Indonesian": "indonesian",
12 |     "Malay": "malay",
13 |     "Filipino": "filipino",
14 |     "Vietnamese": "vietnamese",
15 |     "Korean": "korean",
16 |     "Russian": "russian",
17 | }
18 | 
19 | 
20 | def text_to_mit_symbols(texts, resources_dir, speaker, lang="PinYin"):
21 |     fe = ttsfrd.TtsFrontendEngine()
22 |     fe.initialize(resources_dir)
23 |     fe.set_lang_type(ENG_LANG_MAPPING[lang])
24 | 
25 |     symbols_lst = []
26 |     for idx, text in enumerate(texts):
27 |         text = text.strip()
28 |         res = fe.gen_tacotron_symbols(text)
29 |         res = res.replace("F7", speaker)
30 |         sentences = res.split("\n")
31 |         for sentence in sentences:
32 |             arr = sentence.split("\t")
33 |             # skip the empty line
34 |             if len(arr) != 2:
35 |                 continue
36 |             sub_index, symbols = arr
37 |             symbol_str = "{}_{}\t{}\n".format(idx, sub_index, symbols)
38 |             symbols_lst.append(symbol_str)
39 | 
40 |     return symbols_lst
41 | 
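A hedged sketch of calling the frontend above. It requires the proprietary ttsfrd wheel and an extracted resource directory (path hypothetical); per the code, each returned entry has the form `"<idx>_<sub_index>\t<symbols>\n"`, with the built-in "F7" speaker tag replaced by the speaker you pass in.

```python
# Assumes the ttsfrd package is installed and resources are extracted;
# the resource path is a hypothetical placeholder.
from kantts.utils.ling_unit import text_to_mit_symbols

symbols = text_to_mit_symbols(["你好，世界。"], "resources/resource", "F7", lang="PinYin")
for line in symbols:
    print(line, end="")   # e.g. "0_0\t<symbol sequence>"
```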
--------------------------------------------------------------------------------
/kantts/utils/ling_unit/cleaners.py:
--------------------------------------------------------------------------------
1 | """
2 | Cleaners are transformations that run over the input text at both training and eval time.
3 | 
4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
6 |     1. "english_cleaners" for English text
7 |     2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
8 |        the Unidecode library (https://pypi.python.org/pypi/Unidecode)
9 |     3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
10 |        the symbols in symbols.py to match your data).
11 | """
12 | 
13 | import re
14 | from unidecode import unidecode
15 | from .numbers import normalize_numbers
16 | 
17 | 
18 | # Regular expression matching whitespace:
19 | _whitespace_re = re.compile(r"\s+")
20 | 
21 | # List of (regular expression, replacement) pairs for abbreviations:
22 | _abbreviations = [
23 |     (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
24 |     for x in [
25 |         ("mrs", "misess"),
26 |         ("mr", "mister"),
27 |         ("dr", "doctor"),
28 |         ("st", "saint"),
29 |         ("co", "company"),
30 |         ("jr", "junior"),
31 |         ("maj", "major"),
32 |         ("gen", "general"),
33 |         ("drs", "doctors"),
34 |         ("rev", "reverend"),
35 |         ("lt", "lieutenant"),
36 |         ("hon", "honorable"),
37 |         ("sgt", "sergeant"),
38 |         ("capt", "captain"),
39 |         ("esq", "esquire"),
40 |         ("ltd", "limited"),
41 |         ("col", "colonel"),
42 |         ("ft", "fort"),
43 |     ]
44 | ]
45 | 
46 | 
47 | def expand_abbreviations(text):
48 |     for regex, replacement in _abbreviations:
49 |         text = re.sub(regex, replacement, text)
50 |     return text
51 | 
52 | 
53 | def expand_numbers(text):
54 |     return normalize_numbers(text)
55 | 
56 | 
57 | def lowercase(text):
58 |     return text.lower()
59 | 
60 | 
61 | def collapse_whitespace(text):
62 |     return re.sub(_whitespace_re, " ", text)
63 | 
64 | 
65 | def convert_to_ascii(text):
66 |     return unidecode(text)
67 | 
68 | 
69 | def basic_cleaners(text):
70 |     """Basic pipeline that lowercases and collapses whitespace without transliteration."""
71 |     text = lowercase(text)
72 |     text = collapse_whitespace(text)
73 |     return text
74 | 
75 | 
76 | def transliteration_cleaners(text):
77 |     """Pipeline for non-English text that transliterates to ASCII."""
78 |     text = convert_to_ascii(text)
79 |     text = lowercase(text)
80 |     text = collapse_whitespace(text)
81 |     return text
82 | 
83 | 
84 | def english_cleaners(text):
85 |     """Pipeline for English text, including number and abbreviation expansion."""
86 |     text = convert_to_ascii(text)
87 |     text = lowercase(text)
88 |     text = expand_numbers(text)
89 |     text = expand_abbreviations(text)
90 |     text = collapse_whitespace(text)
91 |     return text
92 | 
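An illustrative run of the English pipeline defined above (transliterate, lowercase, expand numbers, expand abbreviations, collapse whitespace):

```python
from kantts.utils.ling_unit.cleaners import english_cleaners

print(english_cleaners("Dr. Smith paid $2.50 on Jan 1st, 2022!"))
# -> "doctor smith paid two dollars, fifty cents on jan first, twenty twenty-two!"
```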
--------------------------------------------------------------------------------
/kantts/utils/ling_unit/emotion_types.py:
--------------------------------------------------------------------------------
1 | emotion_types = [
2 |     "emotion_none",
3 |     "emotion_neutral",
4 |     "emotion_angry",
5 |     "emotion_disgust",
6 |     "emotion_fear",
7 |     "emotion_happy",
8 |     "emotion_sad",
9 |     "emotion_surprise",
10 |     "emotion_calm",
11 |     "emotion_gentle",
12 |     "emotion_relax",
13 |     "emotion_lyrical",
14 |     "emotion_serious",
15 |     "emotion_disgruntled",
16 |     "emotion_satisfied",
17 |     "emotion_disappointed",
18 |     "emotion_excited",
19 |     "emotion_anxiety",
20 |     "emotion_jealousy",
21 |     "emotion_hate",
22 |     "emotion_pity",
23 |     "emotion_pleasure",
24 |     "emotion_arousal",
25 |     "emotion_dominance",
26 |     "emotion_placeholder1",
27 |     "emotion_placeholder2",
28 |     "emotion_placeholder3",
29 |     "emotion_placeholder4",
30 |     "emotion_placeholder5",
31 |     "emotion_placeholder6",
32 |     "emotion_placeholder7",
33 |     "emotion_placeholder8",
34 |     "emotion_placeholder9",
35 | ]
--------------------------------------------------------------------------------
/kantts/utils/ling_unit/lang_symbols.py:
--------------------------------------------------------------------------------
1 | import xml.etree.ElementTree as ET
2 | from kantts.preprocess.languages import languages
3 | import logging
4 | import os
5 | 
6 | syllable_flags = [
7 |     "s_begin",
8 |     "s_end",
9 |     "s_none",
10 |     "s_both",
11 |     "s_middle",
12 | ]
13 | 
14 | word_segments = [
15 |     "word_begin",
16 |     "word_end",
17 |     "word_middle",
18 |     "word_both",
19 |     "word_none",
20 | ]
21 | 
22 | LANGUAGES_DIR = os.path.join(
23 |     os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
24 |     "preprocess",
25 |     "languages",
26 | )
27 | 
28 | 
29 | def parse_phoneset(phoneset_file):
30 |     """Parse a phoneset file and return a list of symbols.
31 |     Args:
32 |         phoneset_file (str): Path to the phoneset file.
33 | 
34 |     Returns:
35 |         list: A list of phones.
36 |     """
37 |     ns = "{http://schemas.alibaba-inc.com/tts}"
38 | 
39 |     phone_lst = []
40 |     phoneset_root = ET.parse(phoneset_file).getroot()
41 |     for phone_node in phoneset_root.findall(ns + "phone"):
42 |         phone_lst.append(phone_node.find(ns + "name").text)
43 | 
44 |     for i in range(1, 5):
45 |         phone_lst.append("#{}".format(i))
46 | 
47 |     return phone_lst
48 | 
49 | 
50 | def parse_tonelist(tonelist_file):
51 |     """Parse a tonelist file and return a list of tones.
52 |     Args:
53 |         tonelist_file (str): Path to the tonelist file.
54 | 
55 |     Returns:
56 |         list: A list of tones.
57 |     """
58 |     tone_lst = []
59 |     with open(tonelist_file, "r") as f:
60 |         lines = f.readlines()
61 |     for line in lines:
62 |         tone = line.strip()
63 |         if tone != "":
64 |             tone_lst.append("tone{}".format(tone))
65 |         else:
66 |             tone_lst.append("tone_none")
67 | 
68 |     return tone_lst
69 | 
70 | 
71 | def get_language_symbols(language):
72 |     """Get symbols of a language.
73 |     Args:
74 |         language (str): Language name.
75 |     """
76 |     language_dict = languages.get(language, None)
77 |     if language_dict is None:
78 |         logging.error("Language %s not supported. Using PinYin as default", language)
79 |         language_dict = languages["PinYin"]
80 |         language = "PinYin"
81 | 
82 |     language_dir = os.path.join(LANGUAGES_DIR, language)
83 |     phoneset_file = os.path.join(language_dir, language_dict["phoneset_path"])
84 |     tonelist_file = os.path.join(language_dir, language_dict["tonelist_path"])
85 |     phones = parse_phoneset(phoneset_file)
86 |     tones = parse_tonelist(tonelist_file)
87 | 
88 |     return phones, tones, syllable_flags, word_segments
89 | 
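A small sketch of loading a language's symbol inventory through the helpers above; PinYin ships with the repo under kantts/preprocess/languages/, and unknown languages fall back to PinYin.

```python
from kantts.utils.ling_unit.lang_symbols import get_language_symbols

phones, tones, flags, segs = get_language_symbols("PinYin")
print(len(phones), tones[:2], flags[0], segs[0])
```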
--------------------------------------------------------------------------------
/kantts/utils/ling_unit/numbers.py:
--------------------------------------------------------------------------------
1 | import inflect
2 | import re
3 | 
4 | 
5 | _inflect = inflect.engine()
6 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
7 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
8 | _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
9 | _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
10 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
11 | _number_re = re.compile(r"[0-9]+")
12 | 
13 | 
14 | def _remove_commas(m):
15 |     return m.group(1).replace(",", "")
16 | 
17 | 
18 | def _expand_decimal_point(m):
19 |     return m.group(1).replace(".", " point ")
20 | 
21 | 
22 | def _expand_dollars(m):
23 |     match = m.group(1)
24 |     parts = match.split(".")
25 |     if len(parts) > 2:
26 |         return match + " dollars"  # Unexpected format
27 |     dollars = int(parts[0]) if parts[0] else 0
28 |     cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
29 |     if dollars and cents:
30 |         dollar_unit = "dollar" if dollars == 1 else "dollars"
31 |         cent_unit = "cent" if cents == 1 else "cents"
32 |         return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
33 |     elif dollars:
34 |         dollar_unit = "dollar" if dollars == 1 else "dollars"
35 |         return "%s %s" % (dollars, dollar_unit)
36 |     elif cents:
37 |         cent_unit = "cent" if cents == 1 else "cents"
38 |         return "%s %s" % (cents, cent_unit)
39 |     else:
40 |         return "zero dollars"
41 | 
42 | 
43 | def _expand_ordinal(m):
44 |     return _inflect.number_to_words(m.group(0))
45 | 
46 | 
47 | def _expand_number(m):
48 |     num = int(m.group(0))
49 |     if num > 1000 and num < 3000:
50 |         if num == 2000:
51 |             return "two thousand"
52 |         elif num > 2000 and num < 2010:
53 |             return "two thousand " + _inflect.number_to_words(num % 100)
54 |         elif num % 100 == 0:
55 |             return _inflect.number_to_words(num // 100) + " hundred"
56 |         else:
57 |             return _inflect.number_to_words(
58 |                 num, andword="", zero="oh", group=2
59 |             ).replace(", ", " ")
60 |     else:
61 |         return _inflect.number_to_words(num, andword="")
62 | 
63 | 
64 | def normalize_numbers(text):
65 |     text = re.sub(_comma_number_re, _remove_commas, text)
66 |     text = re.sub(_pounds_re, r"\1 pounds", text)
67 |     text = re.sub(_dollars_re, _expand_dollars, text)
68 |     text = re.sub(_decimal_number_re, _expand_decimal_point, text)
69 |     text = re.sub(_ordinal_re, _expand_ordinal, text)
70 |     text = re.sub(_number_re, _expand_number, text)
71 |     return text
72 | 
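Illustrative inputs and outputs for normalize_numbers as defined above (currency, ordinal, and cardinal expansion):

```python
from kantts.utils.ling_unit.numbers import normalize_numbers

print(normalize_numbers("I owe $15.49, due on the 3rd."))
# -> "I owe fifteen dollars, forty-nine cents, due on the third."
print(normalize_numbers("£100"))
# -> "one hundred pounds"
```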
--------------------------------------------------------------------------------
/kantts/utils/log.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import subprocess
3 | 
4 | 
5 | def logging_to_file(log_file):
6 |     logger = logging.getLogger()
7 |     handler = logging.FileHandler(log_file)
8 |     formatter = logging.Formatter(
9 |         "%(asctime)s %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s",
10 |         datefmt="%Y-%m-%d:%H:%M:%S",
11 |     )
12 |     handler.setFormatter(formatter)
13 |     logger.addHandler(handler)
14 |     logger.setLevel(logging.INFO)
15 | 
16 | 
17 | def get_git_revision_short_hash():
18 |     return (
19 |         subprocess.check_output(["git", "rev-parse", "--short", "HEAD"])
20 |         .decode("ascii")
21 |         .strip()
22 |     )
23 | 
24 | 
25 | def get_git_revision_hash():
26 |     return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("ascii").strip()
27 | 
--------------------------------------------------------------------------------
/kantts/utils/plot.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | 
3 | matplotlib.use("Agg")  # NOQA: E402
4 | try:
5 |     import matplotlib.pyplot as plt
6 | except ImportError:
7 |     raise ImportError("Please install matplotlib.")
8 | 
9 | 
10 | def plot_spectrogram(spectrogram):
11 |     fig, ax = plt.subplots(figsize=(12, 8))
12 |     im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
13 |     plt.colorbar(im, ax=ax)
14 | 
15 |     fig.canvas.draw()
16 |     plt.close()
17 | 
18 |     return fig
19 | 
20 | 
21 | def plot_alignment(alignment, info=None):
22 |     fig, ax = plt.subplots()
23 |     im = ax.imshow(alignment, aspect="auto", origin="lower", interpolation="none")
24 |     fig.colorbar(im, ax=ax)
25 |     xlabel = "Input timestep"
26 |     if info is not None:
27 |         xlabel += "\t" + info
28 |     plt.xlabel(xlabel)
29 |     plt.ylabel("Output timestep")
30 |     fig.canvas.draw()
31 |     plt.close()
32 | 
33 |     return fig
34 | 
--------------------------------------------------------------------------------
/notebooks/README.md:
--------------------------------------------------------------------------------
1 | # How to run notebook examples?
2 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pre-commit
2 | autopep8
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 | 
3 | version = "0.0.1"
4 | 
5 | with open("README.md", "r", encoding="utf-8") as readme_file:
6 |     README = readme_file.read()
7 | 
8 | setup(
9 |     name="kantts",
10 |     version=version,
11 |     url="https://github.com/AlibabaResearch/KAN-TTS",
12 |     author="Jin",
13 |     description="Alibaba DAMO Speech-Lab Text to Speech deep learning toolchain",
14 |     long_description=README,
15 |     long_description_content_type="text/markdown",
16 |     license="MIT",
17 |     # cython
18 |     # include_dirs=numpy.get_include(),
19 |     # ext_modules=find_cython_extensions(),
20 |     # package
21 |     include_package_data=True,
22 |     packages=find_packages(include=["kantts*"]),
23 |     project_urls={
24 |         "Documentation": "https://github.com/AlibabaResearch/KAN-TTS/wiki",
25 |         "Tracker": "",
26 |         "Repository": "https://github.com/AlibabaResearch/KAN-TTS",
27 |         "Discussions": "",
28 |     },
29 |     python_requires=">=3.7.0, <3.9",
30 |     classifiers=[
31 |         "Programming Language :: Python",
32 |         "Programming Language :: Python :: 3",
33 |         "Programming Language :: Python :: 3.7",
34 |         "Programming Language :: Python :: 3.8",
35 |         "Development Status :: 3 - Alpha",
36 |         "Intended Audience :: Science/Research",
37 |         "Intended Audience :: Developers",
38 |         "Operating System :: POSIX :: Linux",
39 |         "License :: OSI Approved :: MIT License",
40 |         "Topic :: Software Development",
41 |         "Topic :: Software Development :: Libraries :: Python Modules",
42 |         "Topic :: Multimedia :: Sound/Audio :: Speech",
43 |         "Topic :: Multimedia :: Sound/Audio",
44 |         "Topic :: Multimedia",
45 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
46 |     ],
47 |     zip_safe=False,
48 | )
49 | 
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/test/__init__.py
--------------------------------------------------------------------------------